Inference Stack#

llama.cpp Server (lithium)#

Installation#

# Clone and build
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
make -j8

# Or download a prebuilt binary from the releases page:
# https://github.com/ggerganov/llama.cpp/releases
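
Current llama.cpp versions have moved to CMake as the primary build system. A rough equivalent of the make build above, with CUDA enabled for GPU offload (flag names and output paths may vary by version):

cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j8
# The server binary is produced at build/bin/llama-server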

Server Configuration#

server.yml:

host: 0.0.0.0
port: 8080
model: /models/qwen3.5-27b-q5_k_m.gguf
n_ctx: 8192
n_batch: 512
n_ubatch: 512
n_threads: 12
n_gpu_layers: 50
flash_attn: true
cache_type_k: q8_0
cache_type_v: f16

Run:

./server -c server.yml
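
Stock llama.cpp server builds take their settings as command-line flags rather than a YAML file; if the wrapper above is not in use, the configuration maps roughly to the following invocation (flag names may differ slightly between versions):

./server \
  --host 0.0.0.0 --port 8080 \
  -m /models/qwen3.5-27b-q5_k_m.gguf \
  -c 8192 -b 512 -ub 512 -t 12 -ngl 50 \
  --flash-attn --cache-type-k q8_0 --cache-type-v f16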

API Endpoints#

Completion (raw prompt, with the chat template applied manually):

curl http://lithium.mrzk.io:8080/completion \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "user: Hello\nassistant:",
    "n_predict": 512,
    "temperature": 0.7,
    "stop": ["user:", "</s>"]
  }'
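
Recent llama.cpp server builds also expose an OpenAI-compatible chat endpoint that applies the model's chat template for you; a sketch assuming such a build:

curl http://lithium.mrzk.io:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [{"role": "user", "content": "Hello"}],
    "max_tokens": 512,
    "temperature": 0.7
  }'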

Embeddings:

curl http://lithium.mrzk.io:8080/embedding \
  -H "Content-Type: application/json" \
  -d '{
    "content": "Hello world"
  }'
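
Note that the instance configured above loads the chat model; embeddings normally come from a second instance running the embedding model with embedding mode enabled. A sketch (port 8082 is an arbitrary choice; newer builds name the binary llama-server):

./server -m /models/nomic-embed-text-v1.5.gguf --embedding --port 8082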

Model Registry#

Model Storage#

Directory structure:

/models/
├── qwen3.5-27b-q5_k_m.gguf    # 27GB, main inference
├── qwen3.5-27b-q4_0.gguf      # 18GB, fallback for large contexts
├── gemma4-26b-a4b-q4_0.gguf   # 19GB, alternative model
├── nomic-embed-text-v1.5.gguf  # 274MB, embeddings
└── models.json                 # Metadata registry

models.json:

{
  "qwen3.5-27b-q5_k_m": {
    "path": "/models/qwen3.5-27b-q5_k_m.gguf",
    "size_gb": 27,
    "quantization": "Q5_K_M",
    "context": 8192,
    "parameters": "27B",
    "purpose": "main-inference"
  },
  "gemma4-26b-a4b-q4_0": {
    "path": "/models/gemma4-26b-a4b-q4_0.gguf",
    "size_gb": 19,
    "quantization": "Q4_0",
    "context": 4096,
    "parameters": "26B",
    "purpose": "alternative"
  }
}
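
A quick sanity check that every registered path actually exists on disk (a hypothetical helper, assuming jq is installed):

jq -r '.[].path' /models/models.json | while read -r p; do
  [ -f "$p" ] || echo "MISSING: $p"
done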

Model Download Script#

download-model.sh:

#!/bin/bash
set -e

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <model-name> <huggingface-repo>" >&2
  exit 1
fi

MODEL_NAME=$1
HUGGINGFACE_REPO=$2

echo "Downloading $MODEL_NAME from $HUGGINGFACE_REPO"

# Prefer huggingface-cli if available
if command -v huggingface-cli &> /dev/null; then
  huggingface-cli download "$HUGGINGFACE_REPO" \
    --include "*.gguf" \
    --local-dir /models
else
  # Fall back to wget
  wget -P /models "https://huggingface.co/$HUGGINGFACE_REPO/resolve/main/$MODEL_NAME.gguf"
fi

# Verify checksum if one was shipped alongside the model
if [ -f "/models/${MODEL_NAME}.sha256" ]; then
  (cd /models && sha256sum -c "${MODEL_NAME}.sha256")
fi

echo "Download complete: /models/${MODEL_NAME}.gguf"

Usage:

./download-model.sh qwen3.5-27b-q5_k_m TheBloke/Qwen-2.5-27B-Instruct-GGUF

Janky Configuration#

~/.config/janky/config.toml:

[llm]
base_url = "http://lithium.mrzk.io:8080"
model = "qwen3.5-27b-q5_k_m"
temperature = 0.7
max_tokens = 4096
context_window = 8192

[discord]
enabled = true
channel_id = "1491598715805372416"

[skills]
path = "~/.janky/skills"
auto_load = true

[wiki]
path = "~/.janky/wiki"
auto_commit = true
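
To confirm that base_url points at a live server before starting Janky, recent llama.cpp server builds expose a health endpoint:

# Should return a small JSON status payload if the server is up
curl -s http://lithium.mrzk.io:8080/health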

Performance Tuning#

GPU Offloading#

Check VRAM:

nvidia-smi --query-gpu=memory.total --format=csv

Adjust n_gpu_layers:

VRAM 16GB → n_gpu_layers: 50-60
VRAM 24GB → n_gpu_layers: 70-80
VRAM 48GB → n_gpu_layers: 100+ (effectively full offload: any value above the model's layer count puts all layers on the GPU)
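
To verify the setting, check free VRAM before startup and confirm in the load log how many layers actually landed on the GPU (assuming logs go to the path used in the Logs section below):

# Free VRAM per GPU
nvidia-smi --query-gpu=memory.free --format=csv

# llama.cpp reports "offloaded X/Y layers to GPU" at model load
grep -i "offloaded" /var/log/llama-server.log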

Context Optimization#

For large contexts (>8k):

model: /models/qwen3.5-27b-q4_0.gguf  # Lower-bit quant frees memory for the larger KV cache
n_ctx: 16384
n_batch: 256
n_ubatch: 256

For speed (small contexts):

model: /models/qwen3.5-27b-q5_k_m.gguf  # Higher quality
n_ctx: 4096
n_batch: 512
n_ubatch: 512
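
The KV cache is what grows with n_ctx, so a back-of-envelope estimate helps when choosing these values. The architecture numbers below (64 layers, 8 KV heads, head size 128) are illustrative assumptions, not the actual model's dimensions:

# bytes ≈ n_ctx * n_layers * 2 (K and V) * n_kv_heads * head_dim * bytes_per_element
echo $(( 16384 * 64 * 2 * 8 * 128 * 2 / 1024 / 1024 )) MiB   # ≈ 4096 MiB at f16
# Quantizing the K cache to q8_0 (as in server.yml) roughly halves the K portion of this.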

Monitoring#

Prometheus Metrics#

Enable in server:

# server.yml
metrics:
  enabled: true
  port: 8081
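
If the build in use does not read a metrics block from YAML, stock llama-server enables Prometheus output with the --metrics flag and serves it from /metrics on the main port:

curl -s http://lithium.mrzk.io:8080/metrics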

Grafana dashboard panels:

  • Requests/sec
  • Token generation speed
  • Context utilization
  • GPU memory usage

Logs#

# Tail server logs
tail -f /var/log/llama-server.log

# Watch GPU usage
watch -n 1 nvidia-smi