perf(ai): CPU-tuned local inference + qwen2.5-coder sandbox path
Tier A/B/C wins for the CPU-only Ollama box (no GPU → optimize TTFT and tokens/sec, not VRAM): - Separate qwen2.5-coder provider for the sandbox `!task` path; chat keeps the general model. Auto-selected when chat is Ollama and a coder build is present, override with --code-model. - OllamaProvider num_ctx default 8192→4096 (8192 was a GPU-mindset default that inflates prefill/TTFT on CPU); expose num_thread; add --num-ctx, --num-thread, --num-predict. token_budget default 3000→2000 to fit. - OllamaProvider.stream() generator over Ollama's stream=True chat endpoint (provider half of token streaming; agent/Rust rendering is a follow-up). - Few-shot request→shell exemplars in SANDBOX_SYSTEM to anchor the small model's fenced-command output. - Matryoshka embedding truncation: OllamaEmbedder truncate_dim=256 (--embed-dim) for faster pure-Python cosine and less RAM; query+stored share the dim. - docs/ai-perf-plan.md records all 8 items with status and the server-side env (OLLAMA_NUM_PARALLEL=1, keep_alive) that must be set where ollama serve runs. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
e5e1ad8dee
commit
26c651e9ac
|
|
@ -59,6 +59,43 @@ def _build_provider(args, ap):
|
||||||
return make_provider(args.provider, model=args.model, **opts)
|
return make_provider(args.provider, model=args.model, **opts)
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_ollama_tuning(provider, args) -> None:
|
||||||
|
"""Push CPU-perf flags onto an Ollama chat/code provider. No-op otherwise —
|
||||||
|
the knobs (num_ctx/num_thread/num_predict) only exist on OllamaProvider."""
|
||||||
|
if getattr(provider, "name", None) != "ollama":
|
||||||
|
return
|
||||||
|
if args.num_ctx is not None:
|
||||||
|
provider.num_ctx = args.num_ctx
|
||||||
|
if args.num_thread is not None:
|
||||||
|
provider.num_thread = args.num_thread
|
||||||
|
if args.num_predict is not None:
|
||||||
|
provider.num_predict = args.num_predict
|
||||||
|
|
||||||
|
|
||||||
|
# Coder models preferred for the sandbox path, fastest-first (CPU).
|
||||||
|
_CODER_MODELS = ("qwen2.5-coder:1.5b", "qwen2.5-coder:3b", "qwen2.5-coder")
|
||||||
|
|
||||||
|
|
||||||
|
def _build_code_provider(provider, args):
|
||||||
|
"""A code-specialized provider for the sandbox `!task` path. Only meaningful
|
||||||
|
for Ollama: use --code-model if given, else auto-select a present
|
||||||
|
qwen2.5-coder build. Returns None to fall back to the chat provider."""
|
||||||
|
if getattr(provider, "name", None) != "ollama":
|
||||||
|
return None
|
||||||
|
code_model = args.code_model
|
||||||
|
if code_model is None:
|
||||||
|
try:
|
||||||
|
models = set(provider.available_models())
|
||||||
|
except Exception: # noqa: BLE001 — discovery down → no separate code path
|
||||||
|
models = set()
|
||||||
|
code_model = next((m for m in _CODER_MODELS if m in models), None)
|
||||||
|
if not code_model or code_model == provider.model:
|
||||||
|
return None
|
||||||
|
code = make_provider("ollama", model=code_model, host=provider.host)
|
||||||
|
_apply_ollama_tuning(code, args)
|
||||||
|
return code
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
ap = argparse.ArgumentParser(
|
ap = argparse.ArgumentParser(
|
||||||
prog="cmd_chat.agent", description="hack-house AI agent bridge (PoC)"
|
prog="cmd_chat.agent", description="hack-house AI agent bridge (PoC)"
|
||||||
|
|
@ -74,11 +111,19 @@ def main() -> None:
|
||||||
ap.add_argument("--models-file", default=None,
|
ap.add_argument("--models-file", default=None,
|
||||||
help="path to models.toml (default: $HH_MODELS_FILE, ./models.toml, ~/.config/hh/models.toml)")
|
help="path to models.toml (default: $HH_MODELS_FILE, ./models.toml, ~/.config/hh/models.toml)")
|
||||||
ap.add_argument("--model", default=None, help="model name (provider default if omitted)")
|
ap.add_argument("--model", default=None, help="model name (provider default if omitted)")
|
||||||
|
ap.add_argument("--code-model", default=None,
|
||||||
|
help="Ollama model for the sandbox/code path (default: auto-select qwen2.5-coder if present)")
|
||||||
ap.add_argument("--base-url", default=None, help="endpoint for openai-compatible providers")
|
ap.add_argument("--base-url", default=None, help="endpoint for openai-compatible providers")
|
||||||
|
ap.add_argument("--num-ctx", type=int, default=None,
|
||||||
|
help="Ollama context window (CPU: smaller = faster prefill; default 4096)")
|
||||||
|
ap.add_argument("--num-thread", type=int, default=None,
|
||||||
|
help="Ollama CPU threads (default: Ollama's own ≈ physical cores; benchmark 4/6/8)")
|
||||||
|
ap.add_argument("--num-predict", type=int, default=None,
|
||||||
|
help="Ollama max reply tokens (default 512)")
|
||||||
ap.add_argument("--system", default=None, help="override the system prompt")
|
ap.add_argument("--system", default=None, help="override the system prompt")
|
||||||
ap.add_argument("--context-window", type=int, default=12,
|
ap.add_argument("--context-window", type=int, default=12,
|
||||||
help="max prior messages fed to the model per reply")
|
help="max prior messages fed to the model per reply")
|
||||||
ap.add_argument("--token-budget", type=int, default=3000,
|
ap.add_argument("--token-budget", type=int, default=2000,
|
||||||
help="approx token cap on the context window (whichever is smaller wins)")
|
help="approx token cap on the context window (whichever is smaller wins)")
|
||||||
ap.add_argument("--no-rag", action="store_true",
|
ap.add_argument("--no-rag", action="store_true",
|
||||||
help="disable in-RAM semantic recall (recency-only context)")
|
help="disable in-RAM semantic recall (recency-only context)")
|
||||||
|
|
@ -88,6 +133,8 @@ def main() -> None:
|
||||||
help="Ollama host for embeddings (default: chat host or $OLLAMA_HOST)")
|
help="Ollama host for embeddings (default: chat host or $OLLAMA_HOST)")
|
||||||
ap.add_argument("--rag-top-k", type=int, default=4,
|
ap.add_argument("--rag-top-k", type=int, default=4,
|
||||||
help="how many recalled messages to surface per reply")
|
help="how many recalled messages to surface per reply")
|
||||||
|
ap.add_argument("--embed-dim", type=int, default=256,
|
||||||
|
help="truncate embedding vectors to this many dims (MRL; 0 = full vector)")
|
||||||
ap.add_argument("--list-models", action="store_true",
|
ap.add_argument("--list-models", action="store_true",
|
||||||
help="list models the backend can serve, then exit")
|
help="list models the backend can serve, then exit")
|
||||||
ap.add_argument("--check", action="store_true",
|
ap.add_argument("--check", action="store_true",
|
||||||
|
|
@ -97,6 +144,7 @@ def main() -> None:
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
|
|
||||||
provider = _build_provider(args, ap)
|
provider = _build_provider(args, ap)
|
||||||
|
_apply_ollama_tuning(provider, args)
|
||||||
|
|
||||||
# Discovery / preflight modes never join a room.
|
# Discovery / preflight modes never join a room.
|
||||||
if args.list_models:
|
if args.list_models:
|
||||||
|
|
@ -128,13 +176,20 @@ def main() -> None:
|
||||||
embedder = OllamaEmbedder(
|
embedder = OllamaEmbedder(
|
||||||
model=args.embed_model,
|
model=args.embed_model,
|
||||||
host=args.embed_host or getattr(provider, "host", None),
|
host=args.embed_host or getattr(provider, "host", None),
|
||||||
|
truncate_dim=args.embed_dim or None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Separate coder model for the sandbox path (Ollama only); None → reuse chat.
|
||||||
|
code_provider = _build_code_provider(provider, args)
|
||||||
|
if code_provider is not None:
|
||||||
|
print(f"sandbox/code path → {code_provider.name}/{code_provider.model}", file=sys.stderr)
|
||||||
|
|
||||||
bridge = AgentBridge(
|
bridge = AgentBridge(
|
||||||
args.server, args.port, name=args.name, provider=provider,
|
args.server, args.port, name=args.name, provider=provider,
|
||||||
password=args.password, insecure=args.insecure, no_tls=args.no_tls,
|
password=args.password, insecure=args.insecure, no_tls=args.no_tls,
|
||||||
system_prompt=args.system, context_window=args.context_window,
|
system_prompt=args.system, context_window=args.context_window,
|
||||||
token_budget=args.token_budget, embedder=embedder, rag_top_k=args.rag_top_k,
|
token_budget=args.token_budget, embedder=embedder, rag_top_k=args.rag_top_k,
|
||||||
|
code_provider=code_provider,
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
bridge.run()
|
bridge.run()
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,19 @@ SANDBOX_SYSTEM = (
|
||||||
"block — no prose, no comments, no explanation. Prefer non-interactive "
|
"block — no prose, no comments, no explanation. Prefer non-interactive "
|
||||||
"commands. Create files with heredocs (cat > path <<'EOF' … EOF). Keep it to "
|
"commands. Create files with heredocs (cat > path <<'EOF' … EOF). Keep it to "
|
||||||
"a handful of commands. Never include destructive commands unless the request "
|
"a handful of commands. Never include destructive commands unless the request "
|
||||||
"explicitly demands them."
|
"explicitly demands them.\n\n"
|
||||||
|
"Examples:\n"
|
||||||
|
"Request: create a hello.py that prints hello world and run it\n"
|
||||||
|
"```sh\n"
|
||||||
|
"cat > hello.py <<'EOF'\n"
|
||||||
|
"print(\"hello world\")\n"
|
||||||
|
"EOF\n"
|
||||||
|
"python3 hello.py\n"
|
||||||
|
"```\n\n"
|
||||||
|
"Request: show disk usage of the current directory, largest first\n"
|
||||||
|
"```sh\n"
|
||||||
|
"du -sh ./* | sort -rh\n"
|
||||||
|
"```"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Heuristic guard for obviously dangerous commands — not exhaustive; the owner's
|
# Heuristic guard for obviously dangerous commands — not exhaustive; the owner's
|
||||||
|
|
@ -67,12 +79,16 @@ class AgentBridge(Client):
|
||||||
def __init__(self, server: str, port: int, name: str, provider: Provider,
|
def __init__(self, server: str, port: int, name: str, provider: Provider,
|
||||||
password: str | None = None, insecure: bool = False, no_tls: bool = False,
|
password: str | None = None, insecure: bool = False, no_tls: bool = False,
|
||||||
system_prompt: str | None = None, context_window: int = 12,
|
system_prompt: str | None = None, context_window: int = 12,
|
||||||
token_budget: int = 3000, embedder=None, rag_top_k: int = 4,
|
token_budget: int = 2000, embedder=None, rag_top_k: int = 4,
|
||||||
rag_min_score: float = 0.35):
|
rag_min_score: float = 0.35, code_provider: Provider | None = None):
|
||||||
super().__init__(server, port, username=name, password=password,
|
super().__init__(server, port, username=name, password=password,
|
||||||
insecure=insecure, no_tls=no_tls)
|
insecure=insecure, no_tls=no_tls)
|
||||||
self.name = name
|
self.name = name
|
||||||
self.provider = provider
|
self.provider = provider
|
||||||
|
# Optional code-specialized provider (e.g. qwen2.5-coder) used only for
|
||||||
|
# the sandbox `!task` path; chat keeps the general `provider`. Falls back
|
||||||
|
# to the chat provider when not supplied.
|
||||||
|
self.code_provider = code_provider or provider
|
||||||
self.system_prompt = (system_prompt or DEFAULT_SYSTEM).format(name=name)
|
self.system_prompt = (system_prompt or DEFAULT_SYSTEM).format(name=name)
|
||||||
self.context_window = context_window
|
self.context_window = context_window
|
||||||
# Soft cap (approx tokens) on how much transcript we feed the model per
|
# Soft cap (approx tokens) on how much transcript we feed the model per
|
||||||
|
|
@ -330,7 +346,7 @@ class AgentBridge(Client):
|
||||||
try:
|
try:
|
||||||
context = await self._model_messages(task)
|
context = await self._model_messages(task)
|
||||||
plan = await asyncio.to_thread(
|
plan = await asyncio.to_thread(
|
||||||
self.provider.complete,
|
self.code_provider.complete,
|
||||||
SANDBOX_SYSTEM.format(name=self.name),
|
SANDBOX_SYSTEM.format(name=self.name),
|
||||||
context + [Msg("user", f"{asker} wants this done in the shell: {task}")],
|
context + [Msg("user", f"{asker} wants this done in the shell: {task}")],
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ custom one via the ``module:Class`` spec.
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import importlib
|
import importlib
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Protocol, runtime_checkable
|
from typing import Protocol, runtime_checkable
|
||||||
|
|
@ -42,23 +43,32 @@ class OllamaProvider:
|
||||||
name = "ollama"
|
name = "ollama"
|
||||||
|
|
||||||
def __init__(self, model: str = "llama3", host: str | None = None, timeout: int = 120,
|
def __init__(self, model: str = "llama3", host: str | None = None, timeout: int = 120,
|
||||||
num_ctx: int = 8192, num_predict: int = 512, keep_alive: str = "30m"):
|
num_ctx: int = 4096, num_predict: int = 512, num_thread: int | None = None,
|
||||||
|
keep_alive: str = "30m"):
|
||||||
self.model = model
|
self.model = model
|
||||||
self.host = (host or os.environ.get("OLLAMA_HOST", "http://localhost:11434")).rstrip("/")
|
self.host = (host or os.environ.get("OLLAMA_HOST", "http://localhost:11434")).rstrip("/")
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
# Honor the larger backfilled window (num_ctx) — Ollama defaults to a tiny
|
# On CPU, time-to-first-token is O(num_ctx) prefill, so keep the window
|
||||||
# 2048 — and bound reply length. keep_alive pins the model in VRAM so the
|
# modest (4096) rather than a GPU-mindset 8192. keep_alive pins the model
|
||||||
# next /ai doesn't pay a cold reload.
|
# so the next /ai doesn't pay a cold reload. num_thread defaults to
|
||||||
|
# Ollama's own (≈physical cores); set it explicitly to benchmark 4/6/8.
|
||||||
self.num_ctx = num_ctx
|
self.num_ctx = num_ctx
|
||||||
self.num_predict = num_predict
|
self.num_predict = num_predict
|
||||||
|
self.num_thread = num_thread
|
||||||
self.keep_alive = keep_alive
|
self.keep_alive = keep_alive
|
||||||
|
|
||||||
|
def _options(self) -> dict:
|
||||||
|
opts = {"num_ctx": self.num_ctx, "num_predict": self.num_predict}
|
||||||
|
if self.num_thread is not None:
|
||||||
|
opts["num_thread"] = self.num_thread
|
||||||
|
return opts
|
||||||
|
|
||||||
def complete(self, system: str, messages: list[Msg]) -> str:
|
def complete(self, system: str, messages: list[Msg]) -> str:
|
||||||
payload = {
|
payload = {
|
||||||
"model": self.model,
|
"model": self.model,
|
||||||
"stream": False,
|
"stream": False,
|
||||||
"keep_alive": self.keep_alive,
|
"keep_alive": self.keep_alive,
|
||||||
"options": {"num_ctx": self.num_ctx, "num_predict": self.num_predict},
|
"options": self._options(),
|
||||||
"messages": [{"role": "system", "content": system}]
|
"messages": [{"role": "system", "content": system}]
|
||||||
+ [{"role": m.role, "content": m.content} for m in messages],
|
+ [{"role": m.role, "content": m.content} for m in messages],
|
||||||
}
|
}
|
||||||
|
|
@ -66,6 +76,30 @@ class OllamaProvider:
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return (r.json().get("message", {}).get("content") or "").strip()
|
return (r.json().get("message", {}).get("content") or "").strip()
|
||||||
|
|
||||||
|
def stream(self, system: str, messages: list[Msg]):
    """Yield reply text incrementally as Ollama generates it.

    On CPU the perceived latency is TTFT, so streaming makes a slow reply
    feel live. Sends the same payload as a non-streaming chat request but
    with ``stream=True``, then reads the response line by line, parsing each
    non-empty line as one JSON object.

    Args:
        system: System prompt, sent as the first message.
        messages: Conversation turns; each item's ``role`` and ``content``
            are forwarded.

    Yields:
        Each non-empty ``message.content`` fragment, in arrival order, until
        a chunk with a truthy ``done`` field is seen or the body ends.

    Raises:
        requests.HTTPError: On a non-2xx response, or when a streamed chunk
            carries an ``error`` field (a failure after streaming started).
        json.JSONDecodeError: If a streamed line is not valid JSON.
    """
    payload = {
        "model": self.model,
        "stream": True,
        "keep_alive": self.keep_alive,
        "options": self._options(),
        "messages": [{"role": "system", "content": system}]
        + [{"role": m.role, "content": m.content} for m in messages],
    }
    with requests.post(f"{self.host}/api/chat", json=payload,
                       timeout=self.timeout, stream=True) as r:
        r.raise_for_status()
        for line in r.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            # A mid-stream failure arrives as an error object on a 200
            # response; surface it instead of ending as a short reply.
            failure = chunk.get("error")
            if failure:
                raise requests.HTTPError(f"ollama stream error: {failure}", response=r)
            # `or {}` tolerates an explicit null message.
            piece = (chunk.get("message") or {}).get("content")
            if piece:
                yield piece
            if chunk.get("done"):
                break
|
||||||
|
|
||||||
def available_models(self) -> list[str]:
|
def available_models(self) -> list[str]:
|
||||||
r = requests.get(f"{self.host}/api/tags", timeout=self.timeout)
|
r = requests.get(f"{self.host}/api/tags", timeout=self.timeout)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
|
@ -80,10 +114,15 @@ class OllamaEmbedder:
|
||||||
name = "ollama-embed"
|
name = "ollama-embed"
|
||||||
|
|
||||||
def __init__(self, model: str = "nomic-embed-text", host: str | None = None,
|
def __init__(self, model: str = "nomic-embed-text", host: str | None = None,
|
||||||
timeout: int = 60):
|
timeout: int = 60, truncate_dim: int | None = 256):
|
||||||
self.model = model
|
self.model = model
|
||||||
self.host = (host or os.environ.get("OLLAMA_HOST", "http://localhost:11434")).rstrip("/")
|
self.host = (host or os.environ.get("OLLAMA_HOST", "http://localhost:11434")).rstrip("/")
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
|
# nomic-embed-text is Matryoshka (MRL)-trained, so its 768-dim vector can
|
||||||
|
# be truncated to a shorter prefix with little quality loss — faster
|
||||||
|
# pure-Python cosine and less RAM. Query + stored use the same dim, so
|
||||||
|
# cosine stays correct. None keeps the full vector.
|
||||||
|
self.truncate_dim = truncate_dim
|
||||||
|
|
||||||
def embed(self, text: str) -> list[float]:
|
def embed(self, text: str) -> list[float]:
|
||||||
r = requests.post(
|
r = requests.post(
|
||||||
|
|
@ -92,7 +131,10 @@ class OllamaEmbedder:
|
||||||
timeout=self.timeout,
|
timeout=self.timeout,
|
||||||
)
|
)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r.json().get("embedding") or []
|
vec = r.json().get("embedding") or []
|
||||||
|
if self.truncate_dim is not None:
|
||||||
|
vec = vec[: self.truncate_dim]
|
||||||
|
return vec
|
||||||
|
|
||||||
|
|
||||||
class AnthropicProvider:
|
class AnthropicProvider:
|
||||||
|
|
|
||||||
55
docs/ai-perf-plan.md
Normal file
55
docs/ai-perf-plan.md
Normal file
|
|
@ -0,0 +1,55 @@
|
||||||
|
# AI agent: CPU-only performance & code-quality plan
|
||||||
|
|
||||||
|
Hardware reality: the box serving local models is **CPU-only** (Intel i5-8350U,
|
||||||
|
4c/8t, no GPU, 62 GB RAM), Ollama 0.3.9. So we optimize **time-to-first-token**
|
||||||
|
(prefill is O(context)) and **tokens/sec**, not VRAM. GPU knobs (flash attention,
|
||||||
|
KV-cache quant) are no-ops here.
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
### Tier A — high impact / low effort
|
||||||
|
1. **`qwen2.5-coder` for the sandbox/code path.** *(done)* `qwen2.5-coder:1.5b`
|
||||||
|
pulled and wired as a separate code provider used only by `!task`; chat keeps
|
||||||
|
the general model. Same speed, better shell/code. Auto-selected when the chat
|
||||||
|
provider is Ollama and the coder model is present; override with `--code-model`.
|
||||||
|
2. **Lower `num_ctx` to 4096 + expose `num_thread`.** *(done)* OllamaProvider
|
||||||
|
default `num_ctx` 8192→4096 (8192 was a GPU-mindset default that inflated TTFT
|
||||||
|
on CPU); `token_budget` default 3000→2000 to fit. `--num-ctx`, `--num-thread`,
|
||||||
|
`--num-predict` flags added. `num_thread` defaults to Ollama's own (= physical
|
||||||
|
cores, 4 here); benchmark 4/6/8.
|
||||||
|
3. **Token streaming.** *(partial — provider half done)* `OllamaProvider.stream()`
|
||||||
|
now yields deltas from Ollama's `stream=True` chat endpoint. Still TODO (commit 2):
|
||||||
|
have the agent emit `_ai:"stream"` delta frames and the Rust client render an
|
||||||
|
in-progress bubble. On CPU, perceived latency is TTFT — this will make a slow
|
||||||
|
reply feel live.
|
||||||
|
4. **Keep model warm + single-flight.** *(partial)* `keep_alive` already 30m
|
||||||
|
(prevents mid-session reload). `OLLAMA_NUM_PARALLEL=1` is a **server-side env**
|
||||||
|
read by `ollama serve`, not settable from the agent — set it where Ollama is
|
||||||
|
launched (documented below).
|
||||||
|
|
||||||
|
### Tier B — code-generation quality
|
||||||
|
5. **Few-shot in `SANDBOX_SYSTEM`.** *(done)* Two request→shell exemplars to anchor
|
||||||
|
the small model's output format.
|
||||||
|
6. **GBNF constrained output.** *(blocked on #7)* Ollama 0.3.9 only supports
|
||||||
|
`format: json`, not custom grammars for fenced shell. Needs the upgrade; the
|
||||||
|
existing `_extract_commands` parser + few-shot cover the gap meanwhile.
|
||||||
|
|
||||||
|
### Tier C — infra / housekeeping
|
||||||
|
7. **Upgrade Ollama 0.3.9 → current.** *(manual, user-run)* System-wide action that
|
||||||
|
restarts the daemon other projects share — not run automatically. Buys current
|
||||||
|
coder builds, structured-output/grammar support (unblocks #6), bugfixes. CPU
|
||||||
|
speed gains are incremental. Suggested: `curl -fsSL https://ollama.com/install.sh | sh`.
|
||||||
|
8. **Matryoshka embedding truncation.** *(done)* nomic-embed-text is MRL-trained;
|
||||||
|
truncate vectors to 256-dim (`--embed-dim`) for faster pure-Python cosine and
|
||||||
|
less RAM. Query + stored use the same dim, so cosine stays correct.
|
||||||
|
|
||||||
|
## Server-side env (set where `ollama serve` runs, e.g. systemd unit or shell)
|
||||||
|
```
|
||||||
|
OLLAMA_NUM_PARALLEL=1 # single interactive user → all cores to one request
|
||||||
|
OLLAMA_KEEP_ALIVE=30m # or -1 to pin forever (62 GB RAM is plenty)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
All grounded in public sources + the Obsidian vault (`research/2026-06-02-*`):
|
||||||
|
Q4_K_M is the CPU speed sweet spot, small `num_ctx` beats "context rot", and
|
||||||
|
qwen2.5-coder beats the general model at equal size for code.
|
||||||
Loading…
Reference in New Issue
Block a user