Kimi-Dev-72B-GGUF

64
2
72.0B
ik_llama.cpp
by
ubergarm
Language Model
OTHER
72B params
New
64 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
161GB+ RAM
Mobile
Laptop
Server
Quick Summary

Kimi-Dev-72B is a 72B-parameter language model distributed as GGUF quantizations by ubergarm for use with ik_llama.cpp.

Device Compatibility

Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
68GB+ RAM

Code Examples

Quickstart (bash — ik_llama.cpp)
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)

# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 8192 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ngl 48 \
    --threads 16 \
    --parallel 1 \
    --host 127.0.0.1 \
    --port 8080
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
```bash
./build/bin/llama-sweep-bench \
    --model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
    --ctx-size 6144 \
    -ctk q8_0 -ctv q8_0 \
    -fa \
    --no-mmap \
    -ub 2048 -b 2048 \
    -ngl 48 \
    --warmup-batch \
    --threads 16
```

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.