GLM-4.7-Flash-GGUF
3.3K
8
ik_llama.cpp
by
ubergarm
Language Model
OTHER
New
3K downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
Unknown
Mobile
Laptop
Server
Quick Summary
Quantized GGUF builds of GLM-4.7-Flash by ubergarm, prepared for ik_llama.cpp.
Code Examples
IQ5_K 21.157 GiB (6.069 BPW)
#!/usr/bin/env bash
# Quantize GLM-4.7-Flash to IQ5_K with ik_llama.cpp's llama-quantize.
# The "custom" heredoc-style string holds one regex=type rule per line;
# comment lines (leading '#') are stripped and the remaining rules are
# joined into the comma-separated list that --custom-q expects.
custom="
## Attention [0-47] (GPU)
blk\..*\.attn_k_b\.weight=q8_0
blk\..*\.attn_v_b\.weight=q8_0
# Balance of attn tensors (GPU)
blk\..*\.attn_kv_a_mqa\.weight=q8_0
blk\..*\.attn_q_a\.weight=q8_0
blk\..*\.attn_q_b\.weight=q8_0
blk\..*\.attn_output\.weight=q8_0
## First Dense Layer [0] (GPU)
blk\..*\.ffn_down\.weight=q8_0
blk\..*\.ffn_(gate|up)\.weight=q8_0
## Shared Expert (1-39) (GPU)
blk\..*\.ffn_down_shexp\.weight=q8_0
blk\..*\.ffn_(gate|up)_shexp\.weight=q8_0
## Routed Experts (1-39) (CPU)
blk\..*\.ffn_down_exps\.weight=iq6_k
blk\..*\.ffn_(gate|up)_exps\.weight=iq5_k
## Token embedding and output tensors (GPU)
token_embd\.weight=q8_0
output\.weight=q8_0
"
# Drop comment lines, then collapse newlines to commas and trim the
# leading/trailing comma (GNU sed: -z reads the stream as one record).
custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)
./build/bin/llama-quantize \
  --custom-q "$custom" \
  --imatrix /mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/imatrix-GLM-4.7-Flash-BF16.dat \
  /mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-64x2.6B-BF16-00001-of-00002.gguf \
  /mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-IQ5_K.gguf \
  IQ5_K \
  24  # worker threads (scrape artifact "bash" removed from this argument)
#!/usr/bin/env bash
# Quantize GLM-4.7-Flash to IQ4_XS using mainline llama-quantize's
# per-tensor --tensor-type overrides (pattern=type pairs) instead of
# ik_llama.cpp's --custom-q syntax.
IMATRIX=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/imatrix-GLM-4.7-Flash-mainline-BF16.dat
SRC_GGUF=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-64x2.6B-BF16-00001-of-00002.gguf
DST_GGUF=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-IQ4_XS.gguf
# Attention and shared-expert tensors stay near-lossless (q8_0); dense FFN
# at q6_K; the bulk routed-expert tensors carry the iq4_xs compression.
TYPE_ATTN=q8_0
TYPE_FFN_UP=q6_K
TYPE_FFN_GATE=q6_K
TYPE_FFN_DOWN=q6_K
TYPE_FFN_UP_SHEXP=q8_0
TYPE_FFN_GATE_SHEXP=q8_0
TYPE_FFN_DOWN_SHEXP=q8_0
TYPE_FFN_UP_EXPS=iq4_xs
TYPE_FFN_GATE_EXPS=iq4_xs
TYPE_FFN_DOWN_EXPS=iq4_xs
TYPE_TOKEN_EMBED=q4_K
TYPE_OUTPUT=q6_K
TYPE_DEFAULT=q8_0
./build/bin/llama-quantize \
  --imatrix "$IMATRIX" \
  --tensor-type "attn_k_b.weight=$TYPE_ATTN" \
  --tensor-type "attn\.*=$TYPE_ATTN" \
  --tensor-type "ffn_up.weight=$TYPE_FFN_UP" \
  --tensor-type "ffn_gate.weight=$TYPE_FFN_GATE" \
  --tensor-type "ffn_down.weight=$TYPE_FFN_DOWN" \
  --tensor-type "ffn_up_shexp.weight=$TYPE_FFN_UP_SHEXP" \
  --tensor-type "ffn_gate_shexp.weight=$TYPE_FFN_GATE_SHEXP" \
  --tensor-type "ffn_down_shexp.weight=$TYPE_FFN_DOWN_SHEXP" \
  --tensor-type "ffn_up_exps.weight=$TYPE_FFN_UP_EXPS" \
  --tensor-type "ffn_gate_exps.weight=$TYPE_FFN_GATE_EXPS" \
  --tensor-type "ffn_down_exps.weight=$TYPE_FFN_DOWN_EXPS" \
  --tensor-type "token_embd.weight=$TYPE_TOKEN_EMBED" \
  --output-tensor-type "$TYPE_OUTPUT" \
  "$SRC_GGUF" \
  "$DST_GGUF" \
  "$TYPE_DEFAULT" \
  "$(nproc)"  # thread count (scrape artifact "Detailsbash" removed)
## convert with ddh0:glm4moelite@a15dbef
# Convert the HF checkpoint to a BF16 GGUF, split into <=50G shards,
# written into the GGUF output directory.
python convert_hf_to_gguf.py \
  --split-max-size 50G \
  --outtype bf16 \
  --outfile /mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/ \
  /mnt/raid/models/zai-org/GLM-4.7-Flash/
## generate imatrix
#!/usr/bin/env bash
# Compute the importance matrix over the calibration corpus; the .dat file
# feeds the --imatrix option of the quantize scripts above.
model=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-64x2.6B-BF16-00001-of-00002.gguf
./build/bin/llama-imatrix \
  --model "$model" \
  -f ubergarm-imatrix-calibration-corpus-v02.txt \
  -o /mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/imatrix-GLM-4.7-Flash-mainline-BF16.dat \
  --ctx-size 512 \
  -fit off \
  -fa off \
  -ub 4096 -b 4096 \
  -ngl 99 \
  -ts 40,48 \
  --threads 1 \
  --no-mmap \
  --output-format dat
# Fix: the original had `--model "$model"\` with no space before the
# continuation backslash, gluing the model path and `-f` into one word.
## test it (probably needs another template for tool use)
#!/usr/bin/env bash
# Serve the quantized model locally for a smoke test.
#model=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-64x2.6B-BF16-00001-of-00002.gguf
model=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-IQ4_XS.gguf
./build/bin/llama-server \
  --model "$model" \
  -c 32768 \
  --alias ubergarm/GLM-4.7-Flash \
  --jinja \
  -fit off \
  -fa on \
  -ngl 99 \
  --threads 1 \
  --host 127.0.0.1 \
  --port 8080  # scrape artifact "bash" removed from the port argument
# Benchmark the quantized model on a single GPU with llama-sweep-bench.
model=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-IQ4_XS.gguf
CUDA_VISIBLE_DEVICES="0" \
./build/bin/llama-sweep-bench \
  --model "$model" \
  -c 69632 \
  -fit off \
  -fa on \
  -ngl 99 \
  -ub 4096 -b 4096 \
  --threads 1  # scrape artifact "Quick Startbashllama.cpp" removed
# Clone and checkout
git clone https://github.com/ikawrakow/ik_llama.cpp
cd ik_llama.cpp
# Build for hybrid CPU+CUDA
cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON
cmake --build build --config Release -j "$(nproc)"
# Full GPU Offload
# NOTE(review): $model is not set in this snippet — point it at one of the
# GGUF files produced above before running llama-server.
./build/bin/llama-server \
  --model "$model" \
  --alias ubergarm/GLM-4.7-Flash \
  -c 32768 \
  -ctk q8_0 \
  -ger \
  --merge-qkv \
  -mla 3 -amb 512 \
  -ngl 99 \
  -ub 4096 -b 4096 \
  --threads 1 \
  --host 127.0.0.1 \
  --port 8080 \
  --jinja \
  --no-mmap  # scrape artifact "Deploy This Model" removed from this line
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API
Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now
Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.