---
# llama-swap configuration: global settings first, then one entry per model
# under `models`.

# Seconds allowed for a model to pass its health check. Set to one hour
# (3600 s) so model downloads don't stop halfway through.
healthCheckTimeout: 3600

# 262144 -- reference value kept from the original notes; it matches the
# --fit-ctx used by Qwen3.6-Opus below.
# NOTE(review): presumably the maximum context length -- confirm.

models:
  # GLM 4.7 Flash (MXFP4 MoE quantization), also reachable as "glm-coder".
  GLM47:
    aliases:
      - "glm-coder"
    cmd: >
      llama-server
      --port ${PORT}
      -m /models/GLM-4.7-Flash-MXFP4_MOE.gguf
      --fit-ctx 230000
      --temp 0.7
      --top-p 1.0
      --min-p 0.01

  # Qwen3.6 35B-A3B (MXFP4 MoE quantization) loaded together with an mmproj
  # file; also reachable as "qwen-omni".
  Qwen3.6-35B-A3B:
    aliases:
      - "qwen-omni"
    cmd: >
      llama-server
      --port ${PORT}
      -m /models/Qwen3.6-35B-A3B-MXFP4_MOE.gguf
      --mmproj /models/Qwen-mmproj-F16.gguf
      --fit-ctx 230000
      --fit-target 2048
      --temp 0.6
      --top-p 0.95
      --top-k 20
      --presence-penalty 0.0
      --min-p 0.00
      --no-mmap

  # Qwen3.6-Opus with --spec-type mtp, at most 3 draft tokens
  # (--spec-draft-n-max 3) and -np 1; also reachable as "qwen-opus".
  Qwen3.6-Opus:
    aliases:
      - "qwen-opus"
    cmd: >
      llama-server
      --port ${PORT}
      --fit-ctx 262144
      -m /models/Qwen3.6-Opus.gguf
      --fit-target 2048
      --temp 0.6
      --top-p 0.95
      --top-k 20
      --presence-penalty 0.0
      --min-p 0.00
      --spec-type mtp
      --spec-draft-n-max 3
      -np 1
      --no-mmap

  # Kokoro TTS, run as a Docker container named after the model ID on the
  # "nerd-network" Docker network. Requests are proxied to that container
  # name on port 8880 and the upstream sees the model name "tts-1".
  kokoro-tts:
    proxy: http://${MODEL_ID}:8880
    name: "kokoro TTS"
    useModelName: "tts-1"
    checkEndpoint: /health
    # Literal block scalar: the command is written across several lines.
    cmd: |
      docker run --rm
      --name ${MODEL_ID}
      --network nerd-network
      --gpus 'device=0'
      --env 'API_LOG_LEVEL=INFO'
      ghcr.io/remsky/kokoro-fastapi-gpu:latest
    # Stops the container by the same name that `cmd` assigned via --name.
    cmdStop: docker stop ${MODEL_ID}

  # Qwen-Image served by sd-server on the local port assigned by llama-swap;
  # also reachable as "qwen-image". Health is checked against "/".
  Qwen-Image:
    proxy: http://127.0.0.1:${PORT}
    checkEndpoint: /
    aliases:
      - "qwen-image"
    cmd: >
      sd-server
      --listen-port ${PORT}
      --fa
      --offload-to-cpu
      --diffusion-model /models/sd/unet/qwen-image-2512-Q4_K_M.gguf
      --llm /models/sd/text_encoders/Qwen2.5-VL-7B-Instruct-UD-Q4_K_XL.gguf
      --llm_vision /models/sd/text_encoders/Qwen2.5-VL-7B-Instruct-mmproj-BF16.gguf
      --vae /models/sd/vae/qwen_image_vae.safetensors