From 85fde592927702e7c1c90c480eaae1b06eae8c4e Mon Sep 17 00:00:00 2001
From: leetcrypt
Date: Tue, 2 Jun 2026 17:43:02 -0700
Subject: [PATCH] perf(ai): keep the Ollama model warm and honor a real num_ctx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OllamaProvider now sends keep_alive (default 30m) so the model stays
resident in VRAM between /ai calls instead of cold-reloading, and sets
explicit options (num_ctx 8192, num_predict 512) — Ollama otherwise caps
context at 2048, which would silently truncate the larger backfilled
window.

Co-Authored-By: Claude Opus 4.6
---
 cmd_chat/agent/providers.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/cmd_chat/agent/providers.py b/cmd_chat/agent/providers.py
index e4a3b98..7bbedfa 100644
--- a/cmd_chat/agent/providers.py
+++ b/cmd_chat/agent/providers.py
@@ -41,15 +41,24 @@ class OllamaProvider:
 
     name = "ollama"
 
-    def __init__(self, model: str = "llama3", host: str | None = None, timeout: int = 120):
+    def __init__(self, model: str = "llama3", host: str | None = None, timeout: int = 120,
+                 num_ctx: int = 8192, num_predict: int = 512, keep_alive: str = "30m"):
         self.model = model
         self.host = (host or os.environ.get("OLLAMA_HOST", "http://localhost:11434")).rstrip("/")
         self.timeout = timeout
+        # Honor the larger backfilled window (num_ctx) — Ollama defaults to a tiny
+        # 2048 — and bound reply length. keep_alive pins the model in VRAM so the
+        # next /ai doesn't pay a cold reload.
+        self.num_ctx = num_ctx
+        self.num_predict = num_predict
+        self.keep_alive = keep_alive
 
     def complete(self, system: str, messages: list[Msg]) -> str:
         payload = {
             "model": self.model,
             "stream": False,
+            "keep_alive": self.keep_alive,
+            "options": {"num_ctx": self.num_ctx, "num_predict": self.num_predict},
             "messages": [{"role": "system", "content": system}]
             + [{"role": m.role, "content": m.content} for m in messages],
         }