GLM-4.7-Flash-GGUF

3.3K
8
ik_llama.cpp
by
ubergarm
Language Model
OTHER
New
3K downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
Unknown
Mobile
Laptop
Server
Quick Summary

GGUF quantizations of GLM-4.7-Flash by ubergarm, built for use with ik_llama.cpp.

Code Examples

IQ5_K 21.157 GiB (6.069 BPW) — bash
#!/usr/bin/env bash
# Quantize the GLM-4.7-Flash BF16 GGUF to IQ5_K with ik_llama.cpp, using
# per-tensor --custom-q overrides. Comment lines (starting with '#') inside
# $custom are stripped, then the remaining newline-separated rules are joined
# into the single comma-separated string llama-quantize expects.
set -euo pipefail

model_dir=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF

# One "tensor-name-regex=quant-type" rule per line; '#' lines are notes only.
custom="
## Attention [0-47] (GPU)
blk\..*\.attn_k_b\.weight=q8_0
blk\..*\.attn_v_b\.weight=q8_0

# Balance of attn tensors (GPU)
blk\..*\.attn_kv_a_mqa\.weight=q8_0
blk\..*\.attn_q_a\.weight=q8_0
blk\..*\.attn_q_b\.weight=q8_0
blk\..*\.attn_output\.weight=q8_0

## First Dense Layer [0] (GPU)
blk\..*\.ffn_down\.weight=q8_0
blk\..*\.ffn_(gate|up)\.weight=q8_0

## Shared Expert (1-39) (GPU)
blk\..*\.ffn_down_shexp\.weight=q8_0
blk\..*\.ffn_(gate|up)_shexp\.weight=q8_0

## Routed Experts (1-39) (CPU)
blk\..*\.ffn_down_exps\.weight=iq6_k
blk\..*\.ffn_(gate|up)_exps\.weight=iq5_k

## Token embedding and output tensors (GPU)
token_embd\.weight=q8_0
output\.weight=q8_0
"

# Drop comment lines, then collapse runs of newlines into commas and trim
# leading/trailing commas (GNU sed -z reads NUL-delimited so the whole string
# is one pattern space).
custom=$(
  grep -v '^#' <<<"$custom" | sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

./build/bin/llama-quantize \
    --custom-q "$custom" \
    --imatrix "$model_dir/imatrix-GLM-4.7-Flash-BF16.dat" \
    "$model_dir/GLM-4.7-Flash-64x2.6B-BF16-00001-of-00002.gguf" \
    "$model_dir/GLM-4.7-Flash-IQ5_K.gguf" \
    IQ5_K \
    24
bash
#!/usr/bin/env bash
# Quantize the GLM-4.7-Flash BF16 GGUF to IQ4_XS with mainline llama.cpp,
# using per-tensor-class --tensor-type overrides. Each class's quant type is
# held in a variable so the recipe can be tweaked in one place.
set -euo pipefail

IMATRIX=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/imatrix-GLM-4.7-Flash-mainline-BF16.dat
SRC_GGUF=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-64x2.6B-BF16-00001-of-00002.gguf
DST_GGUF=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-IQ4_XS.gguf

# Attention tensors: kept near-lossless.
TYPE_ATTN=q8_0

# Dense FFN layers.
TYPE_FFN_UP=q6_K
TYPE_FFN_GATE=q6_K
TYPE_FFN_DOWN=q6_K

# Shared expert: kept near-lossless.
TYPE_FFN_UP_SHEXP=q8_0
TYPE_FFN_GATE_SHEXP=q8_0
TYPE_FFN_DOWN_SHEXP=q8_0

# Routed experts: the bulk of the weights, quantized the hardest.
TYPE_FFN_UP_EXPS=iq4_xs
TYPE_FFN_GATE_EXPS=iq4_xs
TYPE_FFN_DOWN_EXPS=iq4_xs

TYPE_TOKEN_EMBED=q4_K
TYPE_OUTPUT=q6_K

# Fallback for any tensor not matched by a rule below.
TYPE_DEFAULT=q8_0

# NOTE(review): the pattern 'attn\.*' matches "attn" followed by zero or more
# literal dots, not "attn" followed by anything — 'attn.*' was presumably
# intended; confirm against llama-quantize's pattern matching before changing.
./build/bin/llama-quantize \
  --imatrix "$IMATRIX" \
  --tensor-type "attn_k_b.weight=$TYPE_ATTN" \
  --tensor-type "attn\.*=$TYPE_ATTN" \
  --tensor-type "ffn_up.weight=$TYPE_FFN_UP" \
  --tensor-type "ffn_gate.weight=$TYPE_FFN_GATE" \
  --tensor-type "ffn_down.weight=$TYPE_FFN_DOWN" \
  --tensor-type "ffn_up_shexp.weight=$TYPE_FFN_UP_SHEXP" \
  --tensor-type "ffn_gate_shexp.weight=$TYPE_FFN_GATE_SHEXP" \
  --tensor-type "ffn_down_shexp.weight=$TYPE_FFN_DOWN_SHEXP" \
  --tensor-type "ffn_up_exps.weight=$TYPE_FFN_UP_EXPS" \
  --tensor-type "ffn_gate_exps.weight=$TYPE_FFN_GATE_EXPS" \
  --tensor-type "ffn_down_exps.weight=$TYPE_FFN_DOWN_EXPS" \
  --tensor-type "token_embd.weight=$TYPE_TOKEN_EMBED" \
  --output-tensor-type "$TYPE_OUTPUT" \
  "$SRC_GGUF" \
  "$DST_GGUF" \
  "$TYPE_DEFAULT" \
  $(nproc)
Details — bash
## convert with ddh0:glm4moelite@a15dbef
# Convert the HF safetensors checkpoint to a BF16 GGUF, split into <=50G shards.
src_dir=/mnt/raid/models/zai-org/GLM-4.7-Flash/
out_dir=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/
python convert_hf_to_gguf.py \
    --outtype bf16 \
    --split-max-size 50G \
    --outfile "$out_dir" \
    "$src_dir"

## generate imatrix
#!/usr/bin/env bash
# Compute the importance matrix (imatrix) over the calibration corpus for use
# when quantizing. Runs the full BF16 model with tensors split across devices
# (-ts 40,48).
set -euo pipefail

model=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-64x2.6B-BF16-00001-of-00002.gguf

# Fix: the original had '--model "$model"\' with no space before the
# continuation backslash — it only worked because the next line began with
# whitespace.
./build/bin/llama-imatrix \
    --model "$model" \
    -f ubergarm-imatrix-calibration-corpus-v02.txt \
    -o /mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/imatrix-GLM-4.7-Flash-mainline-BF16.dat \
    --ctx-size 512 \
    -fit off \
    -fa off \
    -ub 4096 -b 4096 \
    -ngl 99 \
    -ts 40,48 \
    --threads 1 \
    --no-mmap \
    --output-format dat

## test it (probably needs another template for tool use)
#!/usr/bin/env bash

# Smoke-test the quantized GGUF by serving it on 127.0.0.1:8080.
# Swap in the commented BF16 path to compare against the unquantized model.
#MODEL=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-64x2.6B-BF16-00001-of-00002.gguf
MODEL=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-IQ4_XS.gguf

./build/bin/llama-server \
  --model "$MODEL" \
  -c 32768 \
  --alias ubergarm/GLM-4.7-Flash \
  -fit off \
  -fa on \
  -ngl 99 \
  --threads 1 \
  --host 127.0.0.1 \
  --port 8080 \
  --jinja
bash
# Throughput sweep of the IQ4_XS quant, pinned to GPU device 0.
MODEL=/mnt/raid/models/ubergarm/GLM-4.7-Flash-GGUF/GLM-4.7-Flash-IQ4_XS.gguf

CUDA_VISIBLE_DEVICES="0" \
./build/bin/llama-sweep-bench \
  --model "$MODEL" \
  -c 69632 \
  -fit off \
  -fa on \
  -ngl 99 \
  -ub 4096 -b 4096 \
  --threads 1
Quick Start — bash (llama.cpp)
# Clone and checkout
$ git clone https://github.com/ikawrakow/ik_llama.cpp
$ cd ik_llama.cpp

# Build for hybrid CPU+CUDA
$ cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON
$ cmake --build build --config Release -j $(nproc)

# Full GPU Offload
# NOTE(review): $model is not set anywhere in this snippet — point it at a
# downloaded GGUF (e.g. the IQ5_K or IQ4_XS file from this repo) before running.
./build/bin/llama-server \
  --model "$model" \
  --alias ubergarm/GLM-4.7-Flash \
  -c 32768 \
  -ctk q8_0 \
  -ger \
  --merge-qkv \
  -mla 3 -amb 512 \
  -ngl 99 \
  -ub 4096 -b 4096 \
  --threads 1 \
  --host 127.0.0.1 \
  --port 8080 \
  --jinja \
  --no-mmap

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.