hack-house/cmd_chat/agent/__main__.py
leetcrypt 26c651e9ac perf(ai): CPU-tuned local inference + qwen2.5-coder sandbox path
Tier A/B/C wins for the CPU-only Ollama box (no GPU → optimize TTFT and
tokens/sec, not VRAM):

- Separate qwen2.5-coder provider for the sandbox `!task` path; chat keeps
  the general model. Auto-selected when chat is Ollama and a coder build is
  present, override with --code-model.
- OllamaProvider num_ctx default 8192→4096 (8192 was a GPU-mindset default
  that inflates prefill/TTFT on CPU); expose num_thread; add --num-ctx,
  --num-thread, --num-predict. token_budget default 3000→2000 to fit.
- OllamaProvider.stream() generator over Ollama's stream=True chat endpoint
  (provider half of token streaming; agent/Rust rendering is a follow-up).
- Few-shot request→shell exemplars in SANDBOX_SYSTEM to anchor the small
  model's fenced-command output.
- Matryoshka embedding truncation: OllamaEmbedder truncate_dim=256 (--embed-dim)
  for faster pure-Python cosine and less RAM; query+stored share the dim.
- docs/ai-perf-plan.md records all 8 items with status and the server-side
  env (OLLAMA_NUM_PARALLEL=1, keep_alive) that must be set where ollama serve runs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-02 22:37:59 -07:00

202 lines
9.0 KiB
Python

"""CLI: run an AI agent that joins a hack-house room.
Examples
--------
# local Ollama (default, recommended)
python -m cmd_chat.agent 127.0.0.1 3000 --name oracle \
--password hunter2 --model llama3 --no-tls
# cloud, opt-in
python -m cmd_chat.agent 127.0.0.1 3000 --name claude \
--provider anthropic --model claude-opus-4-6 --password hunter2 --no-tls
# any OpenAI-compatible endpoint (Groq, Together, local vLLM…)
python -m cmd_chat.agent 127.0.0.1 3000 --provider openai \
--base-url https://api.groq.com/openai/v1 --model llama-3.1-70b --password hunter2
# a named profile from models.toml (provider + model + endpoint + key env)
python -m cmd_chat.agent 127.0.0.1 3000 --profile groq-llama --password hunter2
# a custom provider you wrote
python -m cmd_chat.agent 127.0.0.1 3000 --provider mypkg.mod:MyProvider
# discovery / preflight (no room join)
python -m cmd_chat.agent --profile groq-llama --list-models
python -m cmd_chat.agent --profile groq-llama --check
"""
from __future__ import annotations
import argparse
import sys
from .bridge import AgentBridge
from .profiles import load_profiles, provider_from_profile
from .providers import OllamaEmbedder, make_provider, preflight
def _build_provider(args, ap):
    """Turn the parsed CLI arguments into a chat provider.

    Without ``--profile`` the provider is built from ``--provider`` and
    ``--model``; ``--base-url`` is forwarded only for ``openai`` or a
    ``module:Class`` provider spec. With ``--profile`` the named entry is
    looked up in the profiles file and ``--model`` / ``--base-url`` are handed
    to ``provider_from_profile`` alongside it.

    Side effect: a profile may fill in ``args.system`` and
    ``args.context_window`` when the command line left them at their defaults.
    An unknown profile name is reported through ``ap.error``.
    """
    if not args.profile:
        extra: dict = {}
        if args.base_url:
            # Only these two provider kinds receive a base_url keyword.
            if args.provider == "openai" or ":" in args.provider:
                extra["base_url"] = args.base_url
        return make_provider(args.provider, model=args.model, **extra)

    catalog = load_profiles(args.models_file)
    if args.profile not in catalog:
        listing = ", ".join(catalog)
        if not listing:
            listing = "(none — create models.toml)"
        ap.error(f"unknown profile '{args.profile}'. known: {listing}")
    chosen = catalog[args.profile]

    built = provider_from_profile(
        chosen, name=args.profile, model=args.model, base_url=args.base_url
    )

    # Non-provider defaults the profile may carry; explicit flags keep priority.
    if args.system is None and chosen.get("system"):
        args.system = chosen["system"]
    # 12 is the --context-window default declared in main(); an explicit
    # `--context-window 12` is indistinguishable from "not given" here, so the
    # profile value wins in that case too.
    if args.context_window == 12 and chosen.get("context_window"):
        args.context_window = int(chosen["context_window"])
    return built
def _apply_ollama_tuning(provider, args) -> None:
    """Copy the CPU-perf flags from *args* onto an Ollama provider, in place.

    Only acts when ``provider.name`` is ``"ollama"``; any other provider
    (including one with no ``name`` attribute) is left untouched. A flag that
    was not given on the command line (value ``None``) does not overwrite the
    provider's current setting.
    """
    if getattr(provider, "name", None) == "ollama":
        for knob in ("num_ctx", "num_thread", "num_predict"):
            requested = getattr(args, knob)
            if requested is not None:
                setattr(provider, knob, requested)
# Coder models preferred for the sandbox path, fastest-first (CPU).
_CODER_MODELS = ("qwen2.5-coder:1.5b", "qwen2.5-coder:3b", "qwen2.5-coder")


def _with_default_tag(model: str) -> str:
    """Return *model* with Ollama's implicit ``:latest`` tag spelled out.

    Ollama treats an untagged name as ``name:latest``, so both spellings refer
    to the same model. Only the last path segment is inspected, so a
    ``host:port/`` registry prefix is not mistaken for a tag.
    """
    final_segment = model.rsplit("/", 1)[-1]
    return model if ":" in final_segment else f"{model}:latest"


def _build_code_provider(provider, args):
    """Build a code-specialized provider for the sandbox `!task` path.

    Only meaningful when the chat *provider* is Ollama. Uses ``--code-model``
    if given, otherwise the first entry of ``_CODER_MODELS`` that the server
    reports as available.

    Returns ``None`` (caller falls back to the chat provider) when the chat
    provider is not Ollama, when no coder model is requested or found, or when
    the chosen coder model is the chat model itself. Otherwise returns a new
    Ollama provider on the same host with the CPU tuning flags applied.
    """
    if getattr(provider, "name", None) != "ollama":
        return None

    code_model = args.code_model
    if code_model is None:
        try:
            # Compare with the tag made explicit so the bare "qwen2.5-coder"
            # candidate also matches a listed "qwen2.5-coder:latest".
            # NOTE(review): available_models() is defined elsewhere; this is
            # harmless if it already returns untagged names.
            installed = {_with_default_tag(m) for m in provider.available_models()}
        except Exception:  # noqa: BLE001 — discovery down → no separate code path
            installed = set()
        code_model = next(
            (m for m in _CODER_MODELS if _with_default_tag(m) in installed), None
        )

    if not code_model:
        return None

    # "name" and "name:latest" are the same model: do not spin up a second
    # provider (and announce a separate code path) for the chat model itself.
    chat_model = provider.model
    if chat_model and _with_default_tag(code_model) == _with_default_tag(chat_model):
        return None

    code = make_provider("ollama", model=code_model, host=provider.host)
    _apply_ollama_tuning(code, args)
    return code
def _non_negative_int(text: str) -> int:
    """argparse ``type=`` callable: parse *text* as an int and reject negatives."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
    if value < 0:
        raise argparse.ArgumentTypeError(f"must be >= 0, got {value}")
    return value


def _build_arg_parser() -> argparse.ArgumentParser:
    """Declare every command-line flag of the agent bridge and return the parser."""
    ap = argparse.ArgumentParser(
        prog="cmd_chat.agent", description="hack-house AI agent bridge (PoC)"
    )
    # Positionals are optional so --list-models / --check work without a room.
    ap.add_argument("server", nargs="?", help="room host (omit with --list-models/--check)")
    ap.add_argument("port", type=int, nargs="?", help="room port")
    ap.add_argument("--name", default="oracle", help="agent's room display name")
    ap.add_argument("--password", default=None, help="room password")
    ap.add_argument("--provider", default="ollama",
                    help="ollama | anthropic | openai | module:Class")
    ap.add_argument("--profile", default=None,
                    help="named profile from models.toml (overrides --provider/--model)")
    ap.add_argument("--models-file", default=None,
                    help="path to models.toml (default: $HH_MODELS_FILE, ./models.toml, ~/.config/hh/models.toml)")
    ap.add_argument("--model", default=None, help="model name (provider default if omitted)")
    ap.add_argument("--code-model", default=None,
                    help="Ollama model for the sandbox/code path (default: auto-select qwen2.5-coder if present)")
    ap.add_argument("--base-url", default=None, help="endpoint for openai-compatible providers")
    ap.add_argument("--num-ctx", type=int, default=None,
                    help="Ollama context window (CPU: smaller = faster prefill; default 4096)")
    ap.add_argument("--num-thread", type=int, default=None,
                    help="Ollama CPU threads (default: Ollama's own ≈ physical cores; benchmark 4/6/8)")
    ap.add_argument("--num-predict", type=int, default=None,
                    help="Ollama max reply tokens (default 512)")
    ap.add_argument("--system", default=None, help="override the system prompt")
    ap.add_argument("--context-window", type=int, default=12,
                    help="max prior messages fed to the model per reply")
    ap.add_argument("--token-budget", type=int, default=2000,
                    help="approx token cap on the context window (whichever is smaller wins)")
    ap.add_argument("--no-rag", action="store_true",
                    help="disable in-RAM semantic recall (recency-only context)")
    ap.add_argument("--embed-model", default="nomic-embed-text",
                    help="Ollama model used to embed messages for recall")
    ap.add_argument("--embed-host", default=None,
                    help="Ollama host for embeddings (default: chat host or $OLLAMA_HOST)")
    ap.add_argument("--rag-top-k", type=int, default=4,
                    help="how many recalled messages to surface per reply")
    # A negative dimension has no meaning for truncation; fail at parse time
    # instead of handing it to the embedder.
    ap.add_argument("--embed-dim", type=_non_negative_int, default=256,
                    help="truncate embedding vectors to this many dims (MRL; 0 = full vector)")
    ap.add_argument("--list-models", action="store_true",
                    help="list models the backend can serve, then exit")
    ap.add_argument("--check", action="store_true",
                    help="run a reachability/model preflight, then exit (0 ok, 1 fail)")
    ap.add_argument("--insecure", action="store_true", help="skip TLS cert verification")
    ap.add_argument("--no-tls", action="store_true", help="plain ws/http (local/Tailscale)")
    return ap


def main() -> None:
    """Parse the command line, build the providers, and run the agent bridge.

    Flow:
      1. Resolve the chat provider (profile or explicit flags) and apply the
         Ollama tuning flags to it.
      2. ``--list-models`` prints the provider's models and returns;
         ``--check`` runs the preflight and exits 0 on success, 1 on failure.
         Neither joins a room.
      3. Otherwise ``server`` and ``port`` are required. A failed preflight is
         only a warning, the embedder (unless ``--no-rag``) and the optional
         code provider are built, and the bridge runs until it returns or the
         user presses Ctrl-C.

    Usage errors go through ``ArgumentParser.error`` (exit status 2).
    """
    ap = _build_arg_parser()
    args = ap.parse_args()

    provider = _build_provider(args, ap)
    _apply_ollama_tuning(provider, args)

    # Discovery / preflight modes never join a room.
    if args.list_models:
        discover = getattr(provider, "available_models", None)
        if discover is None:
            # Custom providers may not define `name`; fall back to the class
            # name so the real problem is reported, not an AttributeError.
            label = getattr(provider, "name", None) or type(provider).__name__
            ap.error(f"provider '{label}' has no model discovery")
        for m in discover():
            print(m)
        return
    if args.check:
        ok, msg = preflight(provider)
        print(("ok: " if ok else "FAIL: ") + msg, file=sys.stderr if not ok else sys.stdout)
        sys.exit(0 if ok else 1)

    if args.server is None or args.port is None:
        ap.error("server and port are required to join a room")

    # Non-fatal preflight: warn early, but still try (discovery may be blocked
    # while completion works).
    ok, msg = preflight(provider)
    if not ok:
        print(f"⚠ preflight: {msg}", file=sys.stderr)

    # In-RAM semantic recall is on by default and local (Ollama embeddings),
    # independent of which provider answers chat. Reuse the chat provider's
    # `host` attribute when it has one so a single --host/profile covers both.
    embedder = None
    if not args.no_rag:
        embedder = OllamaEmbedder(
            model=args.embed_model,
            host=args.embed_host or getattr(provider, "host", None),
            truncate_dim=args.embed_dim or None,  # 0 → None = keep the full vector
        )

    # Separate coder model for the sandbox path (Ollama only); None → reuse chat.
    code_provider = _build_code_provider(provider, args)
    if code_provider is not None:
        print(f"sandbox/code path → {code_provider.name}/{code_provider.model}", file=sys.stderr)

    bridge = AgentBridge(
        args.server, args.port, name=args.name, provider=provider,
        password=args.password, insecure=args.insecure, no_tls=args.no_tls,
        system_prompt=args.system, context_window=args.context_window,
        token_budget=args.token_budget, embedder=embedder, rag_top_k=args.rag_top_k,
        code_provider=code_provider,
    )
    try:
        bridge.run()
    except KeyboardInterrupt:
        print("\nagent stopped")
# Entry point for `python -m cmd_chat.agent`; importing the module runs nothing.
if __name__ == "__main__":
    main()