# llama-swap configuration starts here.
# Health-check timeout in seconds. Set to one hour so that model downloads
# don't stop halfway through.
healthCheckTimeout: 3600

# NOTE(review): 262144 (256 * 1024) matches the --fit-ctx value used by
# Qwen3.6-Opus below; presumably kept as a reminder of the largest context
# size in use -- confirm or remove.
# Model registry. Each key under `models` is a model ID; `cmd` is the command
# used to start its server and `aliases` are additional names for the same
# entry. ${PORT} and ${MODEL_ID} are substituted into the values.
models:
  # GLM-4.7 Flash (MXFP4 MoE GGUF) served by llama-server.
  GLM47:
    aliases:
      - "glm-coder"
    cmd: >
      llama-server
      --port ${PORT}
      -m /models/GLM-4.7-Flash-MXFP4_MOE.gguf
      --fit-ctx 230000
      --temp 0.7 --top-p 1.0 --min-p 0.01

  # Qwen3.6 35B-A3B (MXFP4 MoE GGUF) loaded together with an mmproj file.
  Qwen3.6-35B-A3B:
    aliases:
      - "qwen-omni"
    cmd: >
      llama-server
      --port ${PORT}
      -m /models/Qwen3.6-35B-A3B-MXFP4_MOE.gguf
      --mmproj /models/Qwen-mmproj-F16.gguf
      --fit-ctx 230000
      --fit-target 2048
      --temp 0.6 --top-p 0.95 --top-k 20 --presence-penalty 0.0 --min-p 0.00 --no-mmap

  # Qwen3.6-Opus GGUF; same sampling settings as Qwen3.6-35B-A3B, plus the
  # --spec-* flags on the last line of the command.
  Qwen3.6-Opus:
    aliases:
      - "qwen-opus"
    cmd: >
      llama-server
      --port ${PORT}
      --fit-ctx 262144
      -m /models/Qwen3.6-Opus.gguf
      --fit-target 2048
      --temp 0.6 --top-p 0.95 --top-k 20 --presence-penalty 0.0 --min-p 0.00
      --spec-type mtp --spec-draft-n-max 3 -np 1 --no-mmap

  # Kokoro TTS runs as a Docker container named after the model ID on the
  # nerd-network network. Requests are proxied to that container on port 8880,
  # and the container is stopped with `docker stop`.
  kokoro-tts:
    proxy: http://${MODEL_ID}:8880
    name: "kokoro TTS"
    useModelName: "tts-1"
    checkEndpoint: /health
    cmd: |
      docker run --rm --name ${MODEL_ID} --network nerd-network
      --gpus 'device=0'
      --env 'API_LOG_LEVEL=INFO'
      ghcr.io/remsky/kokoro-fastapi-gpu:latest
    cmdStop: docker stop ${MODEL_ID}

  # Qwen-Image served by sd-server on localhost; the root path is used as the
  # check endpoint.
  Qwen-Image:
    proxy: http://127.0.0.1:${PORT}
    checkEndpoint: /
    aliases:
      - "qwen-image"
    cmd: >
      sd-server
      --listen-port ${PORT} --fa --offload-to-cpu
      --diffusion-model /models/sd/unet/qwen-image-2512-Q4_K_M.gguf
      --llm /models/sd/text_encoders/Qwen2.5-VL-7B-Instruct-UD-Q4_K_XL.gguf
      --llm_vision /models/sd/text_encoders/Qwen2.5-VL-7B-Instruct-mmproj-BF16.gguf
      --vae /models/sd/vae/qwen_image_vae.safetensors