Kimi-Dev-72B-GGUF
ik_llama.cpp quants by ubergarm
Language Model · OTHER · 72B params · New · 64 downloads
Early-stage Edge AI: Mobile / Laptop / Server (161GB+ RAM)
Quick Summary
GGUF quantizations of Kimi-Dev-72B, a 72B-parameter language model tuned for software-engineering and coding tasks, packaged to run locally with ik_llama.cpp.
Device Compatibility
Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU
Minimum recommended: 68GB+ RAM
Code Examples
Quickstart (bash, ik_llama.cpp)
# Clone
git clone git@github.com:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set --threads to your number of physical CPU cores; drop --no-mmap if you prefer faster startup via mmap; adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080
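
The run command above points at a local copy of the smol-IQ3_K quant. A minimal sketch for fetching it from Hugging Face with huggingface-cli, assuming the files live in the ubergarm/Kimi-Dev-72B-GGUF repo and match the filename used above (the download pattern and --local-dir are assumptions; adjust them to your own layout):

# Install the CLI, then download only the smol-IQ3_K files
pip install -U "huggingface_hub[cli]"
huggingface-cli download ubergarm/Kimi-Dev-72B-GGUF \
  --include "*smol-IQ3_K*" \
  --local-dir /mnt/models/ubergarm/Kimi-Dev-72B-GGUF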
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080Quickstartbashllama.cpp
# Clone
git clone [email protected]:ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp
# Build (might try adding -DGGML_CUDA_IQK_FORCE_BF16=1 for 3090s and older)
cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_F16=ON -DGGML_SCHED_MAX_COPIES=1
cmake --build build --config Release -j $(nproc)
# Run (set threads to number of CPU physical cores, mmap is fine for faster startup, adjust ctx/ngl as needed)
./build/bin/llama-server \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 8192 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ngl 48 \
--threads 16 \
--parallel 1 \
--host 127.0.0.1 \
--port 8080
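With the server listening on 127.0.0.1:8080, a quick way to verify it is answering is a plain HTTP request. The sketch below assumes this ik_llama.cpp build exposes the same OpenAI-compatible /v1/chat/completions route as mainline llama-server; the "model" field is a free-form label here, not a file path.
# Smoke test (assumes llama-server's OpenAI-compatible API is available)
curl -s http://127.0.0.1:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Kimi-Dev-72B-smol-IQ3_K",
"messages": [{"role": "user", "content": "Say hello in one sentence."}],
"max_tokens": 64
}'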
Benchmark (bash)
# Measure prompt-processing and token-generation speed as the context fills
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16
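The run comments above suggest adjusting ctx/ngl as needed; one way to pick -ngl empirically is to benchmark a few offload depths and compare the reported speeds. A minimal sketch, reusing only the flags shown in this card (the layer counts in the list are placeholders for your VRAM budget):
# Sketch: compare a few -ngl offload depths (placeholder values)
for NGL in 32 40 48; do
echo "=== -ngl $NGL ==="
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl "$NGL" \
--warmup-batch \
--threads 16
done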
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \
--threads 16bash
./build/bin/llama-sweep-bench \
--model /mnt/models/ubergarm/Kimi-Dev-72B-GGUF/Kimi-Dev-72B-smol-IQ3_K.gguf \
--ctx-size 6144 \
-ctk q8_0 -ctv q8_0 \
-fa \
--no-mmap \
-ub 2048 -b 2048 \
-ngl 48 \
--warmup-batch \