From 26c651e9accdaae4eedd3ba0bc39319eb46b181b Mon Sep 17 00:00:00 2001 From: leetcrypt Date: Tue, 2 Jun 2026 22:37:59 -0700 Subject: [PATCH] perf(ai): CPU-tuned local inference + qwen2.5-coder sandbox path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tier A/B/C wins for the CPU-only Ollama box (no GPU → optimize TTFT and tokens/sec, not VRAM): - Separate qwen2.5-coder provider for the sandbox `!task` path; chat keeps the general model. Auto-selected when chat is Ollama and a coder build is present, override with --code-model. - OllamaProvider num_ctx default 8192→4096 (8192 was a GPU-mindset default that inflates prefill/TTFT on CPU); expose num_thread; add --num-ctx, --num-thread, --num-predict. token_budget default 3000→2000 to fit. - OllamaProvider.stream() generator over Ollama's stream=True chat endpoint (provider half of token streaming; agent/Rust rendering is a follow-up). - Few-shot request→shell exemplars in SANDBOX_SYSTEM to anchor the small model's fenced-command output. - Matryoshka embedding truncation: OllamaEmbedder truncate_dim=256 (--embed-dim) for faster pure-Python cosine and less RAM; query+stored share the dim. - docs/ai-perf-plan.md records all 8 items with status and the server-side env (OLLAMA_NUM_PARALLEL=1, keep_alive) that must be set where ollama serve runs. 
Co-Authored-By: Claude Opus 4.6 --- cmd_chat/agent/__main__.py | 57 ++++++++++++++++++++++++++++++++++++- cmd_chat/agent/bridge.py | 24 +++++++++++++--- cmd_chat/agent/providers.py | 56 +++++++++++++++++++++++++++++++----- docs/ai-perf-plan.md | 55 +++++++++++++++++++++++++++++++++++ 4 files changed, 180 insertions(+), 12 deletions(-) create mode 100644 docs/ai-perf-plan.md diff --git a/cmd_chat/agent/__main__.py b/cmd_chat/agent/__main__.py index 54138a3..72b60a9 100644 --- a/cmd_chat/agent/__main__.py +++ b/cmd_chat/agent/__main__.py @@ -59,6 +59,43 @@ def _build_provider(args, ap): return make_provider(args.provider, model=args.model, **opts) +def _apply_ollama_tuning(provider, args) -> None: + """Push CPU-perf flags onto an Ollama chat/code provider. No-op otherwise — + the knobs (num_ctx/num_thread/num_predict) only exist on OllamaProvider.""" + if getattr(provider, "name", None) != "ollama": + return + if args.num_ctx is not None: + provider.num_ctx = args.num_ctx + if args.num_thread is not None: + provider.num_thread = args.num_thread + if args.num_predict is not None: + provider.num_predict = args.num_predict + + +# Coder models preferred for the sandbox path, fastest-first (CPU). +_CODER_MODELS = ("qwen2.5-coder:1.5b", "qwen2.5-coder:3b", "qwen2.5-coder") + + +def _build_code_provider(provider, args): + """A code-specialized provider for the sandbox `!task` path. Only meaningful + for Ollama: use --code-model if given, else auto-select a present + qwen2.5-coder build. 
Returns None to fall back to the chat provider.""" + if getattr(provider, "name", None) != "ollama": + return None + code_model = args.code_model + if code_model is None: + try: + models = set(provider.available_models()) + except Exception: # noqa: BLE001 — discovery down → no separate code path + models = set() + code_model = next((m for m in _CODER_MODELS if m in models), None) + if not code_model or code_model == provider.model: + return None + code = make_provider("ollama", model=code_model, host=provider.host) + _apply_ollama_tuning(code, args) + return code + + def main() -> None: ap = argparse.ArgumentParser( prog="cmd_chat.agent", description="hack-house AI agent bridge (PoC)" @@ -74,11 +111,19 @@ def main() -> None: ap.add_argument("--models-file", default=None, help="path to models.toml (default: $HH_MODELS_FILE, ./models.toml, ~/.config/hh/models.toml)") ap.add_argument("--model", default=None, help="model name (provider default if omitted)") + ap.add_argument("--code-model", default=None, + help="Ollama model for the sandbox/code path (default: auto-select qwen2.5-coder if present)") ap.add_argument("--base-url", default=None, help="endpoint for openai-compatible providers") + ap.add_argument("--num-ctx", type=int, default=None, + help="Ollama context window (CPU: smaller = faster prefill; default 4096)") + ap.add_argument("--num-thread", type=int, default=None, + help="Ollama CPU threads (default: Ollama's own ≈ physical cores; benchmark 4/6/8)") + ap.add_argument("--num-predict", type=int, default=None, + help="Ollama max reply tokens (default 512)") ap.add_argument("--system", default=None, help="override the system prompt") ap.add_argument("--context-window", type=int, default=12, help="max prior messages fed to the model per reply") - ap.add_argument("--token-budget", type=int, default=3000, + ap.add_argument("--token-budget", type=int, default=2000, help="approx token cap on the context window (whichever is smaller wins)") 
ap.add_argument("--no-rag", action="store_true", help="disable in-RAM semantic recall (recency-only context)") @@ -88,6 +133,8 @@ def main() -> None: help="Ollama host for embeddings (default: chat host or $OLLAMA_HOST)") ap.add_argument("--rag-top-k", type=int, default=4, help="how many recalled messages to surface per reply") + ap.add_argument("--embed-dim", type=int, default=256, + help="truncate embedding vectors to this many dims (MRL; 0 = full vector)") ap.add_argument("--list-models", action="store_true", help="list models the backend can serve, then exit") ap.add_argument("--check", action="store_true", @@ -97,6 +144,7 @@ def main() -> None: args = ap.parse_args() provider = _build_provider(args, ap) + _apply_ollama_tuning(provider, args) # Discovery / preflight modes never join a room. if args.list_models: @@ -128,13 +176,20 @@ def main() -> None: embedder = OllamaEmbedder( model=args.embed_model, host=args.embed_host or getattr(provider, "host", None), + truncate_dim=args.embed_dim or None, ) + # Separate coder model for the sandbox path (Ollama only); None → reuse chat. + code_provider = _build_code_provider(provider, args) + if code_provider is not None: + print(f"sandbox/code path → {code_provider.name}/{code_provider.model}", file=sys.stderr) + bridge = AgentBridge( args.server, args.port, name=args.name, provider=provider, password=args.password, insecure=args.insecure, no_tls=args.no_tls, system_prompt=args.system, context_window=args.context_window, token_budget=args.token_budget, embedder=embedder, rag_top_k=args.rag_top_k, + code_provider=code_provider, ) try: bridge.run() diff --git a/cmd_chat/agent/bridge.py b/cmd_chat/agent/bridge.py index 86ae1e1..358d58c 100644 --- a/cmd_chat/agent/bridge.py +++ b/cmd_chat/agent/bridge.py @@ -37,7 +37,19 @@ SANDBOX_SYSTEM = ( "block — no prose, no comments, no explanation. Prefer non-interactive " "commands. Create files with heredocs (cat > path <<'EOF' … EOF). Keep it to " "a handful of commands. 
Never include destructive commands unless the request " - "explicitly demands them." + "explicitly demands them.\n\n" + "Examples:\n" + "Request: create a hello.py that prints hello world and run it\n" + "```sh\n" + "cat > hello.py <<'EOF'\n" + "print(\"hello world\")\n" + "EOF\n" + "python3 hello.py\n" + "```\n\n" + "Request: show disk usage of the current directory, largest first\n" + "```sh\n" + "du -sh ./* | sort -rh\n" + "```" ) # Heuristic guard for obviously dangerous commands — not exhaustive; the owner's @@ -67,12 +79,16 @@ class AgentBridge(Client): def __init__(self, server: str, port: int, name: str, provider: Provider, password: str | None = None, insecure: bool = False, no_tls: bool = False, system_prompt: str | None = None, context_window: int = 12, - token_budget: int = 3000, embedder=None, rag_top_k: int = 4, - rag_min_score: float = 0.35): + token_budget: int = 2000, embedder=None, rag_top_k: int = 4, + rag_min_score: float = 0.35, code_provider: Provider | None = None): super().__init__(server, port, username=name, password=password, insecure=insecure, no_tls=no_tls) self.name = name self.provider = provider + # Optional code-specialized provider (e.g. qwen2.5-coder) used only for + # the sandbox `!task` path; chat keeps the general `provider`. Falls back + # to the chat provider when not supplied. 
+ self.code_provider = code_provider or provider self.system_prompt = (system_prompt or DEFAULT_SYSTEM).format(name=name) self.context_window = context_window # Soft cap (approx tokens) on how much transcript we feed the model per @@ -330,7 +346,7 @@ class AgentBridge(Client): try: context = await self._model_messages(task) plan = await asyncio.to_thread( - self.provider.complete, + self.code_provider.complete, SANDBOX_SYSTEM.format(name=self.name), context + [Msg("user", f"{asker} wants this done in the shell: {task}")], ) diff --git a/cmd_chat/agent/providers.py b/cmd_chat/agent/providers.py index 3a1c2b6..664231e 100644 --- a/cmd_chat/agent/providers.py +++ b/cmd_chat/agent/providers.py @@ -9,6 +9,7 @@ custom one via the ``module:Class`` spec. from __future__ import annotations import importlib +import json import os from dataclasses import dataclass from typing import Protocol, runtime_checkable @@ -42,23 +43,32 @@ class OllamaProvider: name = "ollama" def __init__(self, model: str = "llama3", host: str | None = None, timeout: int = 120, - num_ctx: int = 8192, num_predict: int = 512, keep_alive: str = "30m"): + num_ctx: int = 4096, num_predict: int = 512, num_thread: int | None = None, + keep_alive: str = "30m"): self.model = model self.host = (host or os.environ.get("OLLAMA_HOST", "http://localhost:11434")).rstrip("/") self.timeout = timeout - # Honor the larger backfilled window (num_ctx) — Ollama defaults to a tiny - # 2048 — and bound reply length. keep_alive pins the model in VRAM so the - # next /ai doesn't pay a cold reload. + # On CPU, time-to-first-token is O(num_ctx) prefill, so keep the window + # modest (4096) rather than a GPU-mindset 8192. keep_alive pins the model + # so the next /ai doesn't pay a cold reload. num_thread defaults to + # Ollama's own (≈physical cores); set it explicitly to benchmark 4/6/8. 
self.num_ctx = num_ctx self.num_predict = num_predict + self.num_thread = num_thread self.keep_alive = keep_alive + def _options(self) -> dict: + opts = {"num_ctx": self.num_ctx, "num_predict": self.num_predict} + if self.num_thread is not None: + opts["num_thread"] = self.num_thread + return opts + def complete(self, system: str, messages: list[Msg]) -> str: payload = { "model": self.model, "stream": False, "keep_alive": self.keep_alive, - "options": {"num_ctx": self.num_ctx, "num_predict": self.num_predict}, + "options": self._options(), "messages": [{"role": "system", "content": system}] + [{"role": m.role, "content": m.content} for m in messages], } @@ -66,6 +76,30 @@ class OllamaProvider: r.raise_for_status() return (r.json().get("message", {}).get("content") or "").strip() + def stream(self, system: str, messages: list[Msg]): + """Yield reply text incrementally as Ollama generates it. On CPU the + perceived latency is TTFT, so streaming makes a slow reply feel live.""" + payload = { + "model": self.model, + "stream": True, + "keep_alive": self.keep_alive, + "options": self._options(), + "messages": [{"role": "system", "content": system}] + + [{"role": m.role, "content": m.content} for m in messages], + } + with requests.post(f"{self.host}/api/chat", json=payload, + timeout=self.timeout, stream=True) as r: + r.raise_for_status() + for line in r.iter_lines(): + if not line: + continue + chunk = json.loads(line) + piece = chunk.get("message", {}).get("content") + if piece: + yield piece + if chunk.get("done"): + break + def available_models(self) -> list[str]: r = requests.get(f"{self.host}/api/tags", timeout=self.timeout) r.raise_for_status() @@ -80,10 +114,15 @@ class OllamaEmbedder: name = "ollama-embed" def __init__(self, model: str = "nomic-embed-text", host: str | None = None, - timeout: int = 60): + timeout: int = 60, truncate_dim: int | None = 256): self.model = model self.host = (host or os.environ.get("OLLAMA_HOST", 
"http://localhost:11434")).rstrip("/") self.timeout = timeout + # nomic-embed-text is Matryoshka (MRL)-trained, so its 768-dim vector can + # be truncated to a shorter prefix with little quality loss — faster + # pure-Python cosine and less RAM. Query + stored use the same dim, so + # cosine stays correct. None keeps the full vector. + self.truncate_dim = truncate_dim def embed(self, text: str) -> list[float]: r = requests.post( @@ -92,7 +131,10 @@ class OllamaEmbedder: timeout=self.timeout, ) r.raise_for_status() - return r.json().get("embedding") or [] + vec = r.json().get("embedding") or [] + if self.truncate_dim is not None: + vec = vec[: self.truncate_dim] + return vec class AnthropicProvider: diff --git a/docs/ai-perf-plan.md b/docs/ai-perf-plan.md new file mode 100644 index 0000000..a016e70 --- /dev/null +++ b/docs/ai-perf-plan.md @@ -0,0 +1,55 @@ +# AI agent: CPU-only performance & code-quality plan + +Hardware reality: the box serving local models is **CPU-only** (Intel i5-8350U, +4c/8t, no GPU, 62 GB RAM), Ollama 0.3.9. So we optimize **time-to-first-token** +(prefill is O(context)) and **tokens/sec**, not VRAM. GPU knobs (flash attention, +KV-cache quant) are no-ops here. + +## Status + +### Tier A — high impact / low effort +1. **`qwen2.5-coder` for the sandbox/code path.** *(done)* `qwen2.5-coder:1.5b` + pulled and wired as a separate code provider used only by `!task`; chat keeps + the general model. Same speed, better shell/code. Auto-selected when the chat + provider is Ollama and the coder model is present; override with `--code-model`. +2. **Lower `num_ctx` to 4096 + expose `num_thread`.** *(done)* OllamaProvider + default `num_ctx` 8192→4096 (8192 was a GPU-mindset default that inflated TTFT + on CPU); `token_budget` default 3000→2000 to fit. `--num-ctx`, `--num-thread`, + `--num-predict` flags added. `num_thread` defaults to Ollama's own (= physical + cores, 4 here); benchmark 4/6/8. +3. 
**Token streaming.** *(partial — provider half done)* `OllamaProvider.stream()` + now yields deltas from Ollama's `stream=True` chat endpoint. Still TODO (commit 2): + have the agent emit `_ai:"stream"` delta frames and the Rust client render an + in-progress bubble. On CPU, perceived latency is TTFT — this will make a slow + reply feel live. +4. **Keep model warm + single-flight.** *(partial)* `keep_alive` already 30m + (prevents mid-session reload). `OLLAMA_NUM_PARALLEL=1` is a **server-side env** + read by `ollama serve`, not settable from the agent — set it where Ollama is + launched (documented below). + +### Tier B — code-generation quality +5. **Few-shot in `SANDBOX_SYSTEM`.** *(done)* Two request→shell exemplars to anchor + the small model's output format. +6. **GBNF constrained output.** *(blocked on #7)* Ollama 0.3.9 only supports + `format: json`, not custom grammars for fenced shell. Needs the upgrade; the + existing `_extract_commands` parser + few-shot cover the gap meanwhile. + +### Tier C — infra / housekeeping +7. **Upgrade Ollama 0.3.9 → current.** *(manual, user-run)* System-wide action that + restarts the daemon other projects share — not run automatically. Buys current + coder builds, structured-output/grammar support (unblocks #6), bugfixes. CPU + speed gains are incremental. Suggested: `curl -fsSL https://ollama.com/install.sh | sh`. +8. **Matryoshka embedding truncation.** *(done)* nomic-embed-text is MRL-trained; + truncate vectors to 256-dim (`--embed-dim`) for faster pure-Python cosine and + less RAM. Query + stored use the same dim, so cosine stays correct. + +## Server-side env (set where `ollama serve` runs, e.g.
systemd unit or shell) +``` +OLLAMA_NUM_PARALLEL=1 # single interactive user → all cores to one request +OLLAMA_KEEP_ALIVE=30m # or -1 to pin forever (62 GB RAM is plenty) +``` + +## Notes +All grounded in public sources + the Obsidian vault (`research/2026-06-02-*`): +Q4_K_M is the CPU speed sweet spot, small `num_ctx` beats "context rot", and +qwen2.5-coder beats the general model at equal size for code.