DeepSeek-R1-0528-GGUF
562
47
528.0B
1 language
Q4
ik_llama.cpp
by
ubergarm
Language Model
OTHER
0528B params
New
562 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
1181GB+ RAM
Mobile
Laptop
Server
Quick Summary
AI model with specialized capabilities.
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
492GB+ RAM
Code Examples
Token embedding and output tensors (GPU)bash
#!/usr/bin/env bash
# Build the per-tensor quantization recipe for the IQ4_KS_R4 quant.
#
# `custom` is first written as a readable, multi-line list of
# `tensor-name-regex=quant-type` rules (with `#` comment lines) and is then
# flattened into one comma-separated string for `llama-quantize --custom-q`.
#
# NOTE(review): another recipe in this file moves `output\.weight` to the end
# so that it cannot also catch the per-layer attention output tensors. Here
# that rule and the attn rules assign the same type (q8_0), so the order looks
# harmless - confirm before changing either type.
custom="
# Token embedding and output tensors (GPU)
token_embd\.weight=q8_0
output\.weight=q8_0
output_norm\.weight=q8_0
# First 3 dense layers (0-2) (GPU)
blk\.[0-2]\..*=q8_0
# All attention, norm weights, and bias tensors for MoE layers (3-60) (GPU)
blk\.[3-9]\.attn_.*=q8_0
blk\.[1-5][0-9]\.attn_.*=q8_0
blk\.60\.attn_.*=q8_0
blk\.[3-9]\.ffn_norm\.weight=q8_0
blk\.[1-5][0-9]\.ffn_norm\.weight=q8_0
blk\.60\.ffn_norm\.weight=q8_0
blk\.[3-9]\.exp_probs_b\.bias=q8_0
blk\.[1-5][0-9]\.exp_probs_b\.bias=q8_0
blk\.60\.exp_probs_b\.bias=q8_0
# Shared Experts (3-60) (GPU)
blk\.[3-9]\.ffn_down_shexp\.weight=q8_0
blk\.[1-5][0-9]\.ffn_down_shexp\.weight=q8_0
blk\.60\.ffn_down_shexp\.weight=q8_0
blk\.[3-9]\.ffn_(gate|up)_shexp\.weight=q8_0
blk\.[1-5][0-9]\.ffn_(gate|up)_shexp\.weight=q8_0
blk\.60\.ffn_(gate|up)_shexp\.weight=q8_0
# MoE Experts (3-60) (CPU)
blk\.[3-9]\.ffn_down_exps\.weight=iq5_ks_r4
blk\.[1-5][0-9]\.ffn_down_exps\.weight=iq5_ks_r4
blk\.60\.ffn_down_exps\.weight=iq5_ks_r4
blk\.[3-9]\.ffn_(gate|up)_exps\.weight=iq4_ks_r4
blk\.[1-5][0-9]\.ffn_(gate|up)_exps\.weight=iq4_ks_r4
blk\.60\.ffn_(gate|up)_exps\.weight=iq4_ks_r4
"
# Flatten the recipe: drop comment lines and blank lines, then join the
# remaining rules with commas (no leading or trailing comma).
# printf is used instead of echo so the backslashes in the regexes are never
# reinterpreted; grep + paste avoid depending on the GNU-only `sed -z`.
custom=$(
  printf '%s\n' "$custom" \
    | grep -v -e '^#' -e '^$' \
    | paste -sd, -
)
./build/bin/llama-quantize \
--custom-q "$custom" \
--imatrix /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
/mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-256x21B-0528-BF16-00001-of-00030.gguf \
/mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ4_KS_R4.gguf \
IQ4_KS_R4 \
24
bash
--n-gpu-layers 63 \
-ot "blk\.(3|4|5|6|7)\.ffn_.*=CUDA0" \
-ot "blk\.(8|9|10|11|12)\.ffn_.*=CUDA1" \
--override-tensor exps=CPU \
llm_load_tensors: CPU buffer size = 252646.07 MiB
llm_load_tensors: CPU buffer size = 938.98 MiB
llm_load_tensors: CUDA0 buffer size = 33753.38 MiB
llm_load_tensors: CUDA1 buffer size = 33900.64 MiB
...
llama_kv_cache_init: CUDA0 KV buffer size = 592.89 MiB
llama_kv_cache_init: CUDA1 KV buffer size = 573.76 MiB
llama_new_context_with_model: KV self size = 1166.62 MiB, c^KV (q8_0): 1166.62 MiB, kv^T: not used
llama_new_context_with_model: CUDA_Host output buffer size = 0.99 MiB
llama_new_context_with_model: pipeline parallelism enabled (n_copies=1)
llama_new_context_with_model: CUDA0 compute buffer size = 3425.00 MiB
llama_new_context_with_model: CUDA1 compute buffer size = 3386.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 78.01 MiB
First 3 dense layers (0-3) (GPU)bash
#!/usr/bin/env bash
# Build the per-tensor quantization recipe for the IQ3_KS quant.
#
# `custom` is first written as a readable, multi-line list of
# `tensor-name-regex=quant-type` rules (with `#` comment lines, including a
# few rules that are deliberately commented out) and is then flattened into
# one comma-separated string for `llama-quantize --custom-q`.
custom="
# First 3 dense layers (0-2) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[0-2]\.attn_k_b.*=q5_0
blk\.[0-2]\.attn_.*=iq5_ks
blk\.[0-2]\.ffn_down.*=iq5_ks
blk\.[0-2]\.ffn_(gate|up).*=iq4_ks
blk\.[0-2]\..*=iq5_ks
# All attention, norm weights, and bias tensors for MoE layers (3-60) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[3-9]\.attn_k_b.*=q5_0
blk\.[1-5][0-9]\.attn_k_b.*=q5_0
blk\.60\.attn_k_b.*=q5_0
blk\.[3-9]\.attn_.*=iq5_ks
blk\.[1-5][0-9]\.attn_.*=iq5_ks
blk\.60\.attn_.*=iq5_ks
#blk\.[3-9]\.ffn_norm\.weight=iq5_ks
#blk\.[1-5][0-9]\.ffn_norm\.weight=iq5_ks
#blk\.60\.ffn_norm\.weight=iq5_ks
#blk\.[3-9]\.exp_probs_b\.bias=iq5_ks
#blk\.[1-5][0-9]\.exp_probs_b\.bias=iq5_ks
#blk\.60\.exp_probs_b\.bias=iq5_ks
# Shared Experts (3-60) (GPU)
blk\.[3-9]\.ffn_down_shexp\.weight=iq5_ks
blk\.[1-5][0-9]\.ffn_down_shexp\.weight=iq5_ks
blk\.60\.ffn_down_shexp\.weight=iq5_ks
blk\.[3-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.60\.ffn_(gate|up)_shexp\.weight=iq4_ks
# Routed Experts (3-60) (CPU)
blk\.[3-9]\.ffn_down_exps\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_down_exps\.weight=iq4_ks
blk\.60\.ffn_down_exps\.weight=iq4_ks
blk\.[3-9]\.ffn_(gate|up)_exps\.weight=iq3_ks
blk\.[1-5][0-9]\.ffn_(gate|up)_exps\.weight=iq3_ks
blk\.60\.ffn_(gate|up)_exps\.weight=iq3_ks
# put last so output weight doesn't catch all the attn ones
# Token embedding and output tensors (GPU)
# note token_embd cannot be repacked quant type
token_embd\.weight=iq5_k
output\.weight=iq6_k
"
# Flatten the recipe: drop comment lines (which also removes the disabled
# rules) and blank lines, then join the remaining rules with commas.
# printf is used instead of echo so the backslashes in the regexes are never
# reinterpreted; grep + paste avoid depending on the GNU-only `sed -z`.
custom=$(
  printf '%s\n' "$custom" \
    | grep -v -e '^#' -e '^$' \
    | paste -sd, -
)
./build/bin/llama-quantize \
--custom-q "$custom" \
--imatrix /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
/mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-256x21B-0528-BF16-00001-of-00030.gguf \
/mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_KS.gguf \
IQ3_KS \
24
Notes:bashllama.cpp
#!/usr/bin/env bash
# Build the per-tensor quantization recipe for the IQ2_K_R4 quant.
#
# `custom` is first written as a readable, multi-line list of
# `tensor-name-regex=quant-type` rules (with `#` comment lines) and is then
# flattened into one comma-separated string for `llama-quantize --custom-q`.
#
# Notes:
# https://github.com/ikawrakow/ik_llama.cpp/issues/296#issuecomment-2765210993
# https://github.com/ikawrakow/ik_llama.cpp/issues/296#issuecomment-2768567062
#
# NOTE(review): another recipe in this file moves `output\.weight` to the end
# so that it cannot also catch the per-layer attention output tensors. Here
# that rule and the generic attn rules assign the same type (iq5_ks), so the
# order looks harmless - confirm before changing either type.
custom="
# Token embedding and output tensors (GPU)
# note token_embd cannot be repacked quant type
token_embd\.weight=iq5_ks
output\.weight=iq5_ks
output_norm\.weight=iq5_ks
# First 3 dense layers (0-2) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[0-2]\.attn_k_b.*=q5_0
blk\.[0-2]\.attn_.*=iq5_ks
blk\.[0-2]\..*=iq5_ks
# All attention, norm weights, and bias tensors for MoE layers (3-60) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[3-9]\.attn_k_b.*=q5_0
blk\.[1-5][0-9]\.attn_k_b.*=q5_0
blk\.60\.attn_k_b.*=q5_0
blk\.[3-9]\.attn_.*=iq5_ks
blk\.[1-5][0-9]\.attn_.*=iq5_ks
blk\.60\.attn_.*=iq5_ks
blk\.[3-9]\.ffn_norm\.weight=iq5_ks
blk\.[1-5][0-9]\.ffn_norm\.weight=iq5_ks
blk\.60\.ffn_norm\.weight=iq5_ks
blk\.[3-9]\.exp_probs_b\.bias=iq5_ks
blk\.[1-5][0-9]\.exp_probs_b\.bias=iq5_ks
blk\.60\.exp_probs_b\.bias=iq5_ks
# Shared Experts (3-60) (GPU)
blk\.[3-9]\.ffn_down_shexp\.weight=iq5_ks
blk\.[1-5][0-9]\.ffn_down_shexp\.weight=iq5_ks
blk\.60\.ffn_down_shexp\.weight=iq5_ks
blk\.[3-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.60\.ffn_(gate|up)_shexp\.weight=iq4_ks
# Routed Experts (3-60) (CPU)
blk\.[3-9]\.ffn_down_exps\.weight=iq3_k_r4
blk\.[1-5][0-9]\.ffn_down_exps\.weight=iq3_k_r4
blk\.60\.ffn_down_exps\.weight=iq3_k_r4
blk\.[3-9]\.ffn_(gate|up)_exps\.weight=iq2_k_r4
blk\.[1-5][0-9]\.ffn_(gate|up)_exps\.weight=iq2_k_r4
blk\.60\.ffn_(gate|up)_exps\.weight=iq2_k_r4
"
# Flatten the recipe: drop comment lines and blank lines, then join the
# remaining rules with commas (no leading or trailing comma).
# printf is used instead of echo so the backslashes in the regexes are never
# reinterpreted; grep + paste avoid depending on the GNU-only `sed -z`.
custom=$(
  printf '%s\n' "$custom" \
    | grep -v -e '^#' -e '^$' \
    | paste -sd, -
)
./build/bin/llama-quantize \
--custom-q "$custom" \
--imatrix /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
/mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-256x21B-0528-BF16-00001-of-00030.gguf \
/mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ2_K_R4.gguf \
IQ2_K_R4 \
24
You can use more CUDA devices just set them all visible and do *not* use `-ts ...` with this `-ot .bash
# Serve the IQ1_S_R4 quant with llama-server, with only CUDA device 0 visible,
# listening on 127.0.0.1:8080.
# You can use more CUDA devices: just set them all visible and do *not* use
# `-ts ...` with this `-ot ...` strategy.
# The `-ot` / `--override-tensor` flags below map tensor-name patterns to a
# target (CUDA0 or CPU).
# NOTE(review): presumably the earlier, more specific `-ot` rule takes
# precedence over `exps=CPU` - confirm against the ik_llama.cpp documentation.
CUDA_VISIBLE_DEVICES="0" \
./build/bin/llama-server \
--model /mnt/raid/hf/DeepSeek-R1-0528-GGUF/IQ1_S_R4/DeepSeek-R1-0528-IQ1_S_R4-00001-of-00003.gguf \
--alias ubergarm/DeepSeek-R1-0528-IQ1_S_R4 \
--ctx-size 32768 \
-ctk q8_0 \
-mla 3 -fa \
-amb 256 \
-fmoe \
--n-gpu-layers 99 \
-ot "blk\.(3|4|5|6)\.ffn_.*=CUDA0" \
--override-tensor exps=CPU \
-rtr \
--parallel 1 \
--threads 24 \
--host 127.0.0.1 \
--port 8080
llm_load_tensors: CPU buffer size = 117936.00 MiB
llm_load_tensors: CUDA_Host buffer size = 469.99 MiB
llm_load_tensors: CUDA0 buffer size = 17851.01 MiB
....................................................................................................
llama_kv_cache_init: CUDA0 KV buffer size = 2196.00 MiB
llama_new_context_with_model: KV self size = 2196.00 MiB, c^KV (f16): 2196.00 MiB, kv^T: not used
llama_new_context_with_model: CUDA_Host output buffer size = 0.99 MiB
llama_new_context_with_model: CUDA0 compute buffer size = 3041.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 78.01 MiB
Token embedding and output tensors (GPU)bash
#!/usr/bin/env bash
# Build the per-tensor quantization recipe for the IQ1_S_R4 quant.
#
# `custom` is first written as a readable, multi-line list of
# `tensor-name-regex=quant-type` rules (with `#` comment lines) and is then
# flattened into one comma-separated string for `llama-quantize --custom-q`.
#
# NOTE(review): another recipe in this file moves `output\.weight` to the end
# so that it cannot also catch the per-layer attention output tensors. Here
# that rule and the generic attn rules assign the same type (iq4_ks), so the
# order looks harmless - confirm before changing either type.
custom="
# Token embedding and output tensors (GPU)
# note token_embd cannot be repacked quant type
token_embd\.weight=iq4_ks
output\.weight=iq4_ks
output_norm\.weight=iq4_ks
# First 3 dense layers (0-2) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[0-2]\.attn_k_b.*=q4_0
blk\.[0-2]\.attn_.*=iq4_ks
blk\.[0-2]\..*=iq4_ks
# All attention, norm weights, and bias tensors for MoE layers (3-60) (GPU)
# Except blk.*.attn_k_b.weight is not divisible by 256 so only supports qN_0
blk\.[3-9]\.attn_k_b.*=q4_0
blk\.[1-5][0-9]\.attn_k_b.*=q4_0
blk\.60\.attn_k_b.*=q4_0
blk\.[3-9]\.attn_.*=iq4_ks
blk\.[1-5][0-9]\.attn_.*=iq4_ks
blk\.60\.attn_.*=iq4_ks
blk\.[3-9]\.ffn_norm\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_norm\.weight=iq4_ks
blk\.60\.ffn_norm\.weight=iq4_ks
blk\.[3-9]\.exp_probs_b\.bias=iq4_ks
blk\.[1-5][0-9]\.exp_probs_b\.bias=iq4_ks
blk\.60\.exp_probs_b\.bias=iq4_ks
# Shared Experts (3-60) (GPU)
blk\.[3-9]\.ffn_down_shexp\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_down_shexp\.weight=iq4_ks
blk\.60\.ffn_down_shexp\.weight=iq4_ks
blk\.[3-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.[1-5][0-9]\.ffn_(gate|up)_shexp\.weight=iq4_ks
blk\.60\.ffn_(gate|up)_shexp\.weight=iq4_ks
# Routed Experts (3-60) (CPU)
blk\.[3-9]\.ffn_down_exps\.weight=iq1_m_r4
blk\.[1-5][0-9]\.ffn_down_exps\.weight=iq1_m_r4
blk\.60\.ffn_down_exps\.weight=iq1_m_r4
blk\.[3-9]\.ffn_(gate|up)_exps\.weight=iq1_s_r4
blk\.[1-5][0-9]\.ffn_(gate|up)_exps\.weight=iq1_s_r4
blk\.60\.ffn_(gate|up)_exps\.weight=iq1_s_r4
"
# Flatten the recipe: drop comment lines and blank lines, then join the
# remaining rules with commas (no leading or trailing comma).
# printf is used instead of echo so the backslashes in the regexes are never
# reinterpreted; grep + paste avoid depending on the GNU-only `sed -z`.
custom=$(
  printf '%s\n' "$custom" \
    | grep -v -e '^#' -e '^$' \
    | paste -sd, -
)
./build/bin/llama-quantize \
--custom-q "$custom" \
--imatrix /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
/mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-256x21B-0528-BF16-00001-of-00030.gguf \
/mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ1_S_R4.gguf \
IQ1_S_R4 \
24
Quick Startbash
# Fits 32k context in under 24GB VRAM
# Optional `-ser 6,1` improves speed at some cost to quality
# Recommended sampling: --temp 0.6 --top-p 0.95
CUDA_VISIBLE_DEVICES="0," \
./build/bin/llama-server \
--model /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
--alias ubergarm/DeepSeek-R1-0528-IQ3_K_R4 \
--ctx-size 32768 \
-ctk q8_0 \
-mla 3 -fa \
-amb 512 \
-fmoe \
--n-gpu-layers 63 \
--override-tensor exps=CPU \
--parallel 1 \
--threads 16 \
--host 127.0.0.1 \
--port 8080
`ik_llama.cpp` API server for MultiGPU(+CPU)bash
# Adjust number of routed expert layers for additional VRAM on each GPU
# Compile with -DGGML_SCHED_MAX_COPIES=1 for multi-GPUs
# Compile with -DGGML_CUDA_IQK_FORCE_BF16=1 if putting `_R4` tensors on GPU (for DeepSeek only)
# (might go faster or slower with FORCE_BF16 depending on GPU model)
# If you have extra VRAM go with `-b 4096 -ub 4096` for potential big PP gains!
./build/bin/llama-server \
--model /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
--alias ubergarm/DeepSeek-R1-0528-IQ3_K_R4 \
--ctx-size 32768 \
-ctk q8_0 \
-mla 3 -fa \
-amb 512 \
-fmoe \
--n-gpu-layers 63 \
-ot "blk\.(3|4)\.ffn_.*=CUDA0" \
-ot "blk\.(5|6)\.ffn_.*=CUDA1" \
--override-tensor exps=CPU \
--parallel 1 \
--threads 16 \
--host 127.0.0.1 \
--port 8080
`ik_llama.cpp` API server for CPU *only*textllama.cpp
# The goal for now is as much RAM bandwidth in a single NUMA node e.g.
# Use BIOS `NPS0` on AMD Epyc or single socket of Intel Xeon in BIOS `SNC=Disable` & Snoop Interleave
# Tune your `--threads` for token generation, and `--threads-batch` for prompt processing (prefill)
# Note `--run-time-repack` will pre-allocate enough RAM for model weights instead of mmap()'ing off disk
# Note there are options for both Explicit and Transparent Huge Pages with tuning discussions in [git repo](https://github.com/ikawrakow/ik_llama.cpp/pull/278#issuecomment-2746381515)
numactl -N 0 -m 0 \
./build/bin/llama-server \
--model /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
--alias ubergarm/DeepSeek-R1-0528-IQ3_K_R4 \
--run-time-repack \
--ctx-size 65536 \
-ctk q8_0 \
-mla 3 -fa \
-amb 512 \
-fmoe \
--parallel 1 \
--threads 88 \
--threads-batch 128 \
--numa numactl \
--host 127.0.0.1 \
--port 8080
Do *not* use the wiki.utf8 to avoid potential over-fitting on wiki.test.raw common test corpusbash
# Assemble the imatrix calibration corpus: concatenate the source texts, in
# this order, into a freshly (re)created output file.
cat \
  calibration_data_v5_rc.txt \
  c4.utf8 \
  code.utf8 \
  multilingual.utf8 \
  technical.utf8 \
  tiny.utf8 \
  > ubergarm-imatrix-calibration-corpus-v02.txt
# Do *not* use the wiki.utf8 to avoid potential over-fitting on wiki.test.raw common test corpus
# 1.7MiB total size of ubergarm-imatrix-calibration-corpus-v02.txt
./build/bin/llama-imatrix \
-m /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-Q8_0.gguf \
-f ubergarm-imatrix-calibration-corpus-v02.txt \
-o /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/imatrix-DeepSeek-R1-0528.dat \
--verbosity 1 \
--ctx-size 512 \
--layer-similarity \
--threads 128
Perplexitybash
$ ./build/bin/llama-perplexity \
--model /mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf \
-f wiki.test.raw \
--seed 1337 \
--ctx-size 512 \
-mla 3 -fa \
-amb 512 \
-fmoe \
--n-gpu-layers 63 \
-ot "blk\.(3|4|5|6|7|8)\.ffn_.*=CUDA0" \
-ot "blk\.(9|10|11|12|13)\.ffn_.*=CUDA1" \
--override-tensor exps=CPU \
--threads 24
Final estimate: PPL = 3.2730 +/- 0.01738
Splitbash
$ ./build/bin/llama-gguf-split \
--dry-run \
--split \
--split-max-size 50G \
/mnt/raid/models/ubergarm/DeepSeek-R1-0528-GGUF/DeepSeek-R1-0528-IQ3_K_R4.gguf
/mnt/raid/hf/DeepSeek-R1-0528-GGUF/IQ3_K_R4/DeepSeek-R1-0528-IQ3_K_R4
Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API
Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now
Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.