DeepSeek-R1-0528-GGUF

ik_llama.cpp GGUF quantizations of DeepSeek-R1-0528 by ubergarm.
Edge AI:
Mobile
Laptop
Server
1181GB+ RAM
Mobile
Laptop
Server
Quick Summary

AI model with specialized capabilities.

Device Compatibility

Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
492GB+ RAM

Code Examples

IQ4_KS_R4 quant recipe
#!/usr/bin/env bash

custom="
# Token embedding and output tensors (GPU)
token_embd\.weight=q8_0
output\.weight=q8_0
output_norm\.weight=q8_0

# First 3 dense layers (0-3) (GPU)
blk\.[0-2]\..*=q8_0

# All attention, norm weights, and bias tensors for MoE layers (3-60) (GPU)
blk\.[3-9]\.attn_.*=q8_0
blk\.[1-5][0-9]\.attn_.*=q8_0
blk\.60\.attn_.*=q8_0

blk\.[3-9]\.ffn_norm\.weight=q8_0
blk\.[1-5][0-9]\.ffn_norm\.weight=q8_0
blk\.60\.ffn_norm\.weight=q8_0

blk\.[3-9]\.exp_probs_b\.bias=q8_0
blk\.[1-5][0-9]\.exp_probs_b\.bias=q8_0
blk\.60\.exp_probs_b\.bias=q8_0

# Shared Experts (3-60) (GPU)
blk\.[3-9]\.ffn_down_shexp\.weight=q8_0
blk\.[1-5][0-9]\.ffn_down_shexp\.weight=q8_0
blk\.60\.ffn_down_shexp\.weight=q8_0

blk\.[3-9]\.ffn_(gate|up)_shexp\.weight=q8_0
blk\.[1-5][0-9]\.ffn_(gate|up)_shexp\.weight=q8_0
blk\.60\.ffn_(gate|up)_shexp\.weight=q8_0

# MoE Experts (3-60) (CPU)
blk\.[3-9]\.ffn_down_exps\.weight=iq5_ks_r4
blk\.[1-5][0-9]\.ffn_down_exps\.weight=iq5_ks_r4
blk\.60\.ffn_down_exps\.weight=iq5_ks_r4

blk\.[3-9]\.ffn_(gate|up)_exps\.weight=iq4_ks_r4
blk\.[1-5][0-9]\.ffn_(gate|up)_exps\.weight=iq4_ks_r4
blk\.60\.ffn_(gate|up)_exps\.weight=iq4_ks_r4
"

custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

./build/bin/llama-quantize \
    --custom-q "$custom" \
    --imatrix /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-256x21B-0528-BF16-00001-of-00030.gguf \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ4_KS_R4.gguf \
    IQ4_KS_R4 \
    24
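
The `grep`/`sed` pipeline above strips the comment lines and joins the remaining `pattern=type` rules into the single comma-separated string that `--custom-q` expects. A minimal sketch of the result (first few entries only, shown for illustration):

# roughly what --custom-q receives after the transformation:
# token_embd\.weight=q8_0,output\.weight=q8_0,output_norm\.weight=q8_0,blk\.[0-2]\..*=q8_0,...
echo "$custom"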
Example multi-GPU offload flags and the resulting buffer allocations:
--n-gpu-layers 63 \
    -ot "blk\.(3|4|5|6|7)\.ffn_.*=CUDA0" \
    -ot "blk\.(8|9|10|11|12)\.ffn_.*=CUDA1" \
    --override-tensor exps=CPU \

llm_load_tensors:        CPU buffer size = 252646.07 MiB
llm_load_tensors:        CPU buffer size =   938.98 MiB
llm_load_tensors:      CUDA0 buffer size = 33753.38 MiB
llm_load_tensors:      CUDA1 buffer size = 33900.64 MiB
...
llama_kv_cache_init:      CUDA0 KV buffer size =   592.89 MiB
llama_kv_cache_init:      CUDA1 KV buffer size =   573.76 MiB
llama_new_context_with_model: KV self size  = 1166.62 MiB, c^KV (q8_0): 1166.62 MiB, kv^T: not used
llama_new_context_with_model:  CUDA_Host  output buffer size =     0.99 MiB
llama_new_context_with_model: pipeline parallelism enabled (n_copies=1)
llama_new_context_with_model:      CUDA0 compute buffer size =  3425.00 MiB
llama_new_context_with_model:      CUDA1 compute buffer size =  3386.00 MiB
llama_new_context_with_model:  CUDA_Host compute buffer size =    78.01 MiB
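
A rough per-GPU VRAM requirement is the sum of the model, KV, and compute buffers in the log above; e.g. for CUDA0 (the arithmetic is illustrative, numbers copied from the log):

# CUDA0: model weights + KV cache + compute buffer, in MiB
echo $(( 33753 + 593 + 3425 ))   # ≈ 37771 MiB, roughly 37 GiB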
IQ3_KS quant recipe
#!/usr/bin/env bash

custom="
# First 3 dense layers (0-3) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[0-2]\.attn_k_b.*=q5_0
blk\.[0-2]\.attn_.*=iq5_ks
blk\.[0-2]\.ffn_down.*=iq5_ks
blk\.[0-2]\.ffn_(gate|up).*=iq4_ks
blk\.[0-2]\..*=iq5_ks

# All attention, norm weights, and bias tensors for MoE layers (3-60) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[3-9]\.attn_k_b.*=q5_0
blk\.[1-5][0-9]\.attn_k_b.*=q5_0
blk\.60\.attn_k_b.*=q5_0

blk\.[3-9]\.attn_.*=iq5_ks
blk\.[1-5][0-9]\.attn_.*=iq5_ks
blk\.60\.attn_.*=iq5_ks

#blk\.[3-9]\.ffn_norm\.weight=iq5_ks
#blk\.[1-5][0-9]\.ffn_norm\.weight=iq5_ks
#blk\.60\.ffn_norm\.weight=iq5_ks

#blk\.[3-9]\.exp_probs_b\.bias=iq5_ks
#blk\.[1-5][0-9]\.exp_probs_b\.bias=iq5_ks
#blk\.60\.exp_probs_b\.bias=iq5_ks

# Shared Experts (3-60) (GPU)
blk\.[3-9]\.ffn_down_shexp\.weight=iq5_ks
blk\.[1-5][0-9]\.ffn_down_shexp\.weight=iq5_ks
blk\.60\.ffn_down_shexp\.weight=iq5_ks

blk\.[3-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.60\.ffn_(gate|up)_shexp\.weight=iq4_ks

# Routed Experts (3-60) (CPU)
blk\.[3-9]\.ffn_down_exps\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_down_exps\.weight=iq4_ks
blk\.60\.ffn_down_exps\.weight=iq4_ks

blk\.[3-9]\.ffn_(gate|up)_exps\.weight=iq3_ks
blk\.[1-5][0-9]\.ffn_(gate|up)_exps\.weight=iq3_ks
blk\.60\.ffn_(gate|up)_exps\.weight=iq3_ks

# put last so output weight doesn't catch all the attn ones
# Token embedding and output tensors (GPU)
# note token_embd cannot be repacked quant type
token_embd\.weight=iq5_k
output\.weight=iq6_k
"

custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

./build/bin/llama-quantize \
    --custom-q "$custom" \
    --imatrix /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-256x21B-0528-BF16-00001-of-00030.gguf \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_KS.gguf \
    IQ3_KS \
    24
IQ2_K_R4 quant recipe
#!/usr/bin/env bash

# Notes:
# https://github.com/ikawrakow/ik_llama.cpp/issues/296#issuecomment-2765210993
# https://github.com/ikawrakow/ik_llama.cpp/issues/296#issuecomment-2768567062
custom="
# Token embedding and output tensors (GPU)
# note token_embd cannot be repacked quant type
token_embd\.weight=iq5_ks
output\.weight=iq5_ks
output_norm\.weight=iq5_ks

# First 3 dense layers (0-3) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[0-2]\.attn_k_b.*=q5_0
blk\.[0-2]\.attn_.*=iq5_ks
blk\.[0-2]\..*=iq5_ks

# All attention, norm weights, and bias tensors for MoE layers (3-60) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[3-9]\.attn_k_b.*=q5_0
blk\.[1-5][0-9]\.attn_k_b.*=q5_0
blk\.60\.attn_k_b.*=q5_0

blk\.[3-9]\.attn_.*=iq5_ks
blk\.[1-5][0-9]\.attn_.*=iq5_ks
blk\.60\.attn_.*=iq5_ks

blk\.[3-9]\.ffn_norm\.weight=iq5_ks
blk\.[1-5][0-9]\.ffn_norm\.weight=iq5_ks
blk\.60\.ffn_norm\.weight=iq5_ks

blk\.[3-9]\.exp_probs_b\.bias=iq5_ks
blk\.[1-5][0-9]\.exp_probs_b\.bias=iq5_ks
blk\.60\.exp_probs_b\.bias=iq5_ks

# Shared Experts (3-60) (GPU)
blk\.[3-9]\.ffn_down_shexp\.weight=iq5_ks
blk\.[1-5][0-9]\.ffn_down_shexp\.weight=iq5_ks
blk\.60\.ffn_down_shexp\.weight=iq5_ks

blk\.[3-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.60\.ffn_(gate|up)_shexp\.weight=iq4_ks

# Routed Experts (3-60) (CPU)
blk\.[3-9]\.ffn_down_exps\.weight=iq3_k_r4
blk\.[1-5][0-9]\.ffn_down_exps\.weight=iq3_k_r4
blk\.60\.ffn_down_exps\.weight=iq3_k_r4

blk\.[3-9]\.ffn_(gate|up)_exps\.weight=iq2_k_r4
blk\.[1-5][0-9]\.ffn_(gate|up)_exps\.weight=iq2_k_r4
blk\.60\.ffn_(gate|up)_exps\.weight=iq2_k_r4
"

custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

./build/bin/llama-quantize \
    --custom-q "$custom" \
    --imatrix /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-256x21B-0528-BF16-00001-of-00030.gguf \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ2_K_R4.gguf \
    IQ2_K_R4 \
    24
Example: IQ1_S_R4 on a single GPU plus CPU
# You can use more CUDA devices; just set them all visible and do *not* use `-ts ...` with this `-ot ...` strategy.
CUDA_VISIBLE_DEVICES="0" \
./build/bin/llama-server \
    --model /mnt/raid/hf/DeepSeek-R1-0528-GGUF/IQ1_S_R4/DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf \
    --alias ubergarm/DeepSeek-R1-0528-IQ1_S_R4 \
    --ctx-size 32768 \
    -ctk q8_0 \
    -mla 3 -fa \
    -amb 256 \
    -fmoe \
    --n-gpu-layers 99 \
    -ot "blk\.(3|4|5|6)\.ffn_.*=CUDA0" \
    --override-tensor exps=CPU \
    -rtr \
    --parallel 1 \
    --threads 24 \
    --host 127.0.0.1 \
    --port 8080

llm_load_tensors:        CPU buffer size = 117936.00 MiB
llm_load_tensors:  CUDA_Host buffer size =   469.99 MiB
llm_load_tensors:      CUDA0 buffer size = 17851.01 MiB
....................................................................................................
llama_kv_cache_init:      CUDA0 KV buffer size =  2196.00 MiB
llama_new_context_with_model: KV self size  = 2196.00 MiB, c^KV (f16): 2196.00 MiB, kv^T: not used
llama_new_context_with_model:  CUDA_Host  output buffer size =     0.99 MiB
llama_new_context_with_model:      CUDA0 compute buffer size =  3041.00 MiB
llama_new_context_with_model:  CUDA_Host compute buffer size =    78.01 MiB
IQ1_S_R4 quant recipe
#!/usr/bin/env bash

custom="
# Token embedding and output tensors (GPU)
# note token_embd cannot be repacked quant type
token_embd\.weight=iq4_ks
output\.weight=iq4_ks
output_norm\.weight=iq4_ks

# First 3 dense layers (0-3) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[0-2]\.attn_k_b.*=q4_0
blk\.[0-2]\.attn_.*=iq4_ks
blk\.[0-2]\..*=iq4_ks

# All attention, norm weights, and bias tensors for MoE layers (3-60) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[3-9]\.attn_k_b.*=q4_0
blk\.[1-5][0-9]\.attn_k_b.*=q4_0
blk\.60\.attn_k_b.*=q4_0

blk\.[3-9]\.attn_.*=iq4_ks
blk\.[1-5][0-9]\.attn_.*=iq4_ks
blk\.60\.attn_.*=iq4_ks

blk\.[3-9]\.ffn_norm\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_norm\.weight=iq4_ks
blk\.60\.ffn_norm\.weight=iq4_ks

blk\.[3-9]\.exp_probs_b\.bias=iq4_ks
blk\.[1-5][0-9]\.exp_probs_b\.bias=iq4_ks
blk\.60\.exp_probs_b\.bias=iq4_ks

# Shared Experts (3-60) (GPU)
blk\.[3-9]\.ffn_down_shexp\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_down_shexp\.weight=iq4_ks
blk\.60\.ffn_down_shexp\.weight=iq4_ks

blk\.[3-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.60\.ffn_(gate|up)_shexp\.weight=iq4_ks

# Routed Experts (3-60) (CPU)
blk\.[3-9]\.ffn_down_exps\.weight=iq1_m_r4
blk\.[1-5][0-9]\.ffn_down_exps\.weight=iq1_m_r4
blk\.60\.ffn_down_exps\.weight=iq1_m_r4

blk\.[3-9]\.ffn_(gate|up)_exps\.weight=iq1_s_r4
blk\.[1-5][0-9]\.ffn_(gate|up)_exps\.weight=iq1_s_r4
blk\.60\.ffn_(gate|up)_exps\.weight=iq1_s_r4
"

custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

./build/bin/llama-quantize \
    --custom-q "$custom" \
    --imatrix /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-256x21B-0528-BF16-00001-of-00030.gguf \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ1_S_R4.gguf \
    IQ1_S_R4 \
    24
Quick Start
# Fits 32k context in under 24GB VRAM
# Optional `-ser 6,1` improves speed at some cost to quality
# Recommended sampling: --temp 0.6 --top-p 0.95
CUDA_VISIBLE_DEVICES="0," \
./build/bin/llama-server \
    --model /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
    --alias ubergarm/DeepSeek-R1-0528-IQ3_K_R4 \
    --ctx-size 32768 \
    -ctk q8_0 \
    -mla 3 -fa \
    -amb 512 \
    -fmoe \
    --n-gpu-layers 63 \
    --override-tensor exps=CPU \
    --parallel 1 \
    --threads 16 \
    --host 127.0.0.1 \
    --port 8080
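
Once the server is running it exposes the usual llama-server OpenAI-compatible HTTP API; assuming the ik_llama.cpp fork keeps the stock /v1/chat/completions route, a quick smoke test with the recommended sampling settings looks like this:

curl http://127.0.0.1:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "ubergarm/DeepSeek-R1-0528-IQ3_K_R4",
          "temperature": 0.6,
          "top_p": 0.95,
          "messages": [{"role": "user", "content": "Hello, which model are you?"}]
        }'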
`ik_llama.cpp` API server for MultiGPU(+CPU)
# Adjust number of routed expert layers for additional VRAM on each GPU
# Compile with -DGGML_SCHED_MAX_COPIES=1 for multi-GPUs
# Compile with -DGGML_CUDA_IQK_FORCE_BF16=1 if putting `_R4` tensors on GPU (for DeepSeek only)
# (might go faster or slower with FORCE_BF16 depending on GPU model)
# If you have extra VRAM go with `-b 4096 -ub 4096` for potential big PP gains!
./build/bin/llama-server \
    --model /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
    --alias ubergarm/DeepSeek-R1-0528-IQ3_K_R4 \
    --ctx-size 32768 \
    -ctk q8_0 \
    -mla 3 -fa \
    -amb 512 \
    -fmoe \
    --n-gpu-layers 63 \
    -ot "blk\.(3|4)\.ffn_.*=CUDA0" \
    -ot "blk\.(5|6)\.ffn_.*=CUDA1" \
    --override-tensor exps=CPU \
    --parallel 1 \
    --threads 16 \
    --host 127.0.0.1 \
    --port 8080
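
The compile-time options mentioned in the comments above are CMake flags set when building ik_llama.cpp; a hedged example configure/build (flag names follow the fork's llama.cpp-style CMake options, adjust to your toolchain):

cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_MAX_COPIES=1 -DGGML_CUDA_IQK_FORCE_BF16=1
cmake --build build --config Release -j $(nproc)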
`ik_llama.cpp` API server for CPU *only*
# The goal for now is as much RAM bandwidth in a single NUMA node e.g.
# Use BIOS `NPS0` on AMD Epyc or single socket of Intel Xeon in BIOS `SNC=Disable` & Snoop Interleave
# Tune your `--threads` for token generation, and `--threads-batch` for prompt processing (prefill)
# Note `--run-time-repack` will pre-allocate enough RAM for model weights instead of mmap()'ing off disk
# Note there are options for both Explicit and Transparent Huge Pages with tuning discussions in [git repo](https://github.com/ikawrakow/ik_llama.cpp/pull/278#issuecomment-2746381515)
numactl -N 0 -m 0 \
./build/bin/llama-server \
    --model /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
    --alias ubergarm/DeepSeek-R1-0528-IQ3_K_R4 \
    --run-time-repack \
    --ctx-size 65536 \
    -ctk q8_0 \
    -mla 3 -fa \
    -amb 512 \
    -fmoe \
    --parallel 1 \
    --threads 88 \
    --threads-batch 128 \
    --numa numactl \
    --host 127.0.0.1 \
    --port 8080
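
Before pinning with `numactl -N 0 -m 0`, it is worth confirming how many NUMA nodes the BIOS actually exposes (e.g. after switching an Epyc to NPS0); a quick check, not part of the original card:

numactl --hardware     # NUMA nodes and the memory attached to each
lscpu | grep -i numa   # NUMA node count and CPU-to-node assignment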
imatrix calibration
cat calibration_data_v5_rc.txt > ubergarm-imatrix-calibration-corpus-v02.txt
cat c4.utf8 >> ubergarm-imatrix-calibration-corpus-v02.txt
cat code.utf8 >> ubergarm-imatrix-calibration-corpus-v02.txt
cat multilingual.utf8 >> ubergarm-imatrix-calibration-corpus-v02.txt
cat technical.utf8 >> ubergarm-imatrix-calibration-corpus-v02.txt
cat tiny.utf8 >> ubergarm-imatrix-calibration-corpus-v02.txt
# Do *not* use the wiki.utf8 to avoid potential over-fitting on wiki.test.raw common test corpus
# 1.7MiB total size of ubergarm-imatrix-calibration-corpus-v02.txt

./build/bin/llama-imatrix \
    -m /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-Q8_0.gguf \
    -f ubergarm-imatrix-calibration-corpus-v02.txt \
    -o /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
    --verbosity 1 \
    --ctx-size 512 \
    --layer-similarity \
    --threads 128
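
A quick sanity check that the assembled corpus matches the ~1.7MiB size noted in the comments (same filename as above):

du -h ubergarm-imatrix-calibration-corpus-v02.txt   # expect roughly 1.7M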
Perplexity
$ ./build/bin/llama-perplexity \
    --model /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
    -f wiki.test.raw \
    --seed 1337 \
    --ctx-size 512 \
    -mla 3 -fa \
    -amb 512 \
    -fmoe \
    --n-gpu-layers 63 \
    -ot "blk\.(3|4|5|6|7|8)\.ffn_.*=CUDA0" \
    -ot "blk\.(9|10|11|12|13)\.ffn_.*=CUDA1" \
    --override-tensor exps=CPU \
    --threads 24

Final estimate: PPL = 3.2730 +/- 0.01738
Split
$ ./build/bin/llama-gguf-split \
    --dry-run \
    --split \
    --split-max-size 50G \
    /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
    /mnt/raid/hf/DeepSeek-R1-0528-GGUF/IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4
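
Drop `--dry-run` to actually write the shards. The split model is then loaded by pointing `--model` at the first shard, and the remaining parts are found automatically (as in the IQ1_S_R4 server example above); a sketch, with NNNNN standing in for whatever shard count the split produces:

./build/bin/llama-server \
    --model /mnt/raid/hf/DeepSeek-R1-0528-GGUF/IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4-00001-of-NNNNN.gguf
    # ...plus the usual -mla/-fa/-ot/--threads flags shown above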
