Tier A/B/C wins for the CPU-only Ollama box (no GPU → optimize TTFT and tokens/sec, not VRAM):

- Separate qwen2.5-coder provider for the sandbox `!task` path; chat keeps the general model. Auto-selected when chat is Ollama and a coder build is present, override with --code-model.
- OllamaProvider num_ctx default 8192→4096 (8192 was a GPU-mindset default that inflates prefill/TTFT on CPU); expose num_thread; add --num-ctx, --num-thread, --num-predict. token_budget default 3000→2000 to fit.
- OllamaProvider.stream() generator over Ollama's stream=True chat endpoint (provider half of token streaming; agent/Rust rendering is a follow-up).
- Few-shot request→shell exemplars in SANDBOX_SYSTEM to anchor the small model's fenced-command output.
- Matryoshka embedding truncation: OllamaEmbedder truncate_dim=256 (--embed-dim) for faster pure-Python cosine and less RAM; query+stored share the dim.
- docs/ai-perf-plan.md records all 8 items with status and the server-side env (OLLAMA_NUM_PARALLEL=1, keep_alive) that must be set where ollama serve runs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
202 lines
9.0 KiB
Python
202 lines
9.0 KiB
Python
"""CLI: run an AI agent that joins a hack-house room.

Examples
--------
    # local Ollama (default, recommended)
    python -m cmd_chat.agent 127.0.0.1 3000 --name oracle \
        --password hunter2 --model llama3 --no-tls

    # cloud, opt-in
    python -m cmd_chat.agent 127.0.0.1 3000 --name claude \
        --provider anthropic --model claude-opus-4-6 --password hunter2 --no-tls

    # any OpenAI-compatible endpoint (Groq, Together, local vLLM…)
    python -m cmd_chat.agent 127.0.0.1 3000 --provider openai \
        --base-url https://api.groq.com/openai/v1 --model llama-3.1-70b --password hunter2

    # a named profile from models.toml (provider + model + endpoint + key env)
    python -m cmd_chat.agent 127.0.0.1 3000 --profile groq-llama --password hunter2

    # a custom provider you wrote
    python -m cmd_chat.agent 127.0.0.1 3000 --provider mypkg.mod:MyProvider

    # discovery / preflight (no room join)
    python -m cmd_chat.agent --profile groq-llama --list-models
    python -m cmd_chat.agent --profile groq-llama --check
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import sys
|
|
|
|
from .bridge import AgentBridge
|
|
from .profiles import load_profiles, provider_from_profile
|
|
from .providers import OllamaEmbedder, make_provider, preflight
|
|
|
|
|
|
def _build_provider(args, ap):
    """Resolve the chat Provider from either --profile or the explicit flags.

    Args:
        args: Parsed CLI namespace. Reads ``profile``, ``models_file``,
            ``model``, ``base_url``, ``provider``, ``system`` and
            ``context_window``. When a profile is used, ``system`` and
            ``context_window`` may be overwritten in place with the
            profile's values.
        ap: The ``argparse.ArgumentParser`` that produced ``args``. Used to
            report an unknown profile (``ap.error`` exits the process) and
            to look up the ``--context-window`` default.

    Returns:
        The object returned by ``provider_from_profile`` (profile path) or
        ``make_provider`` (explicit-flag path).
    """
    if args.profile:
        profiles = load_profiles(args.models_file)
        if args.profile not in profiles:
            known = ", ".join(profiles) or "(none — create models.toml)"
            ap.error(f"unknown profile '{args.profile}'. known: {known}")
        prof = profiles[args.profile]
        provider = provider_from_profile(
            prof, name=args.profile, model=args.model, base_url=args.base_url
        )

        # Profile may also supply non-provider defaults; an explicit --system
        # on the command line wins over the profile's.
        if args.system is None and prof.get("system"):
            args.system = prof["system"]

        # "Untouched" is detected by comparing against the parser's own
        # default instead of a duplicated literal, so the two cannot drift.
        # An explicit --context-window equal to the default is still
        # indistinguishable from the flag being omitted.
        untouched = args.context_window == ap.get_default("context_window")
        if untouched and prof.get("context_window"):
            args.context_window = int(prof["context_window"])
        return provider

    # Explicit-flag path: --base-url is only forwarded to the "openai"
    # provider or to a custom "module:Class" provider spec.
    opts: dict = {}
    if args.base_url and (args.provider == "openai" or ":" in args.provider):
        opts["base_url"] = args.base_url
    return make_provider(args.provider, model=args.model, **opts)
|
|
|
|
|
|
def _apply_ollama_tuning(provider, args) -> None:
    """Copy the CPU-perf CLI knobs onto an Ollama provider, in place.

    A provider whose ``name`` attribute is not ``"ollama"`` (or that has no
    ``name`` at all) is left untouched. For an Ollama provider, each of
    ``num_ctx`` / ``num_thread`` / ``num_predict`` is overwritten only when
    the matching value on ``args`` is not None.
    """
    is_ollama = getattr(provider, "name", None) == "ollama"
    if is_ollama:
        for knob in ("num_ctx", "num_thread", "num_predict"):
            requested = getattr(args, knob)
            if requested is not None:
                setattr(provider, knob, requested)
|
|
|
|
|
|
# Candidate coder models for the sandbox path, in preference order
# (fastest on CPU first).
_CODER_MODELS = (
    "qwen2.5-coder:1.5b",
    "qwen2.5-coder:3b",
    "qwen2.5-coder",
)
|
|
|
|
|
|
def _ollama_model_key(model):
    """Return a canonical form of an Ollama model name for equality checks.

    Ollama treats a name without a tag as ``<name>:latest``, so the tagged
    and untagged spellings map to the same key. Only the last path segment
    is inspected, so a ``host:port/`` registry prefix is not mistaken for a
    tag. Falsy input is returned unchanged.
    """
    if not model:
        return model
    tail = model.rsplit("/", 1)[-1]
    return model if ":" in tail else f"{model}:latest"


def _build_code_provider(provider, args):
    """Build a code-specialized provider for the sandbox `!task` path.

    Only meaningful for Ollama: use ``--code-model`` if given, else
    auto-select the first entry of ``_CODER_MODELS`` that the chat provider
    reports as available.

    Args:
        provider: The chat provider. Reads ``name``, ``model``, ``host`` and
            calls ``available_models()`` when auto-selecting.
        args: Parsed CLI namespace. Reads ``code_model``, plus the tuning
            flags consumed by ``_apply_ollama_tuning``.

    Returns:
        A new Ollama provider for the chosen model, or None to fall back to
        the chat provider. None is returned when the chat provider is not
        Ollama, no coder model was chosen, or the choice is the chat model
        itself.
    """
    if getattr(provider, "name", None) != "ollama":
        return None

    code_model = args.code_model
    if code_model is None:
        try:
            # Normalize tags on both sides so the untagged "qwen2.5-coder"
            # candidate matches a listing spelled "qwen2.5-coder:latest".
            # NOTE(review): whether available_models() includes the tag is
            # not visible here; normalizing works either way.
            installed = {_ollama_model_key(m) for m in provider.available_models()}
        except Exception:  # noqa: BLE001 — discovery down → no separate code path
            installed = set()
        code_model = next(
            (m for m in _CODER_MODELS if _ollama_model_key(m) in installed), None
        )

    if not code_model:
        return None
    # Same model as chat (modulo an implicit ":latest") → reuse the chat provider.
    if _ollama_model_key(code_model) == _ollama_model_key(provider.model):
        return None

    code = make_provider("ollama", model=code_model, host=provider.host)
    _apply_ollama_tuning(code, args)
    return code
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ``cmd_chat.agent`` CLI.

    ``server`` and ``port`` are optional positionals so the discovery modes
    (``--list-models`` / ``--check``) can run without a room to join.
    """
    ap = argparse.ArgumentParser(
        prog="cmd_chat.agent", description="hack-house AI agent bridge (PoC)"
    )
    ap.add_argument("server", nargs="?", help="room host (omit with --list-models/--check)")
    ap.add_argument("port", type=int, nargs="?", help="room port")
    ap.add_argument("--name", default="oracle", help="agent's room display name")
    ap.add_argument("--password", default=None, help="room password")
    ap.add_argument("--provider", default="ollama",
                    help="ollama | anthropic | openai | module:Class")
    ap.add_argument("--profile", default=None,
                    help="named profile from models.toml (overrides --provider/--model)")
    ap.add_argument("--models-file", default=None,
                    help="path to models.toml (default: $HH_MODELS_FILE, ./models.toml, ~/.config/hh/models.toml)")
    ap.add_argument("--model", default=None, help="model name (provider default if omitted)")
    ap.add_argument("--code-model", default=None,
                    help="Ollama model for the sandbox/code path (default: auto-select qwen2.5-coder if present)")
    ap.add_argument("--base-url", default=None, help="endpoint for openai-compatible providers")
    ap.add_argument("--num-ctx", type=int, default=None,
                    help="Ollama context window (CPU: smaller = faster prefill; default 4096)")
    ap.add_argument("--num-thread", type=int, default=None,
                    help="Ollama CPU threads (default: Ollama's own ≈ physical cores; benchmark 4/6/8)")
    ap.add_argument("--num-predict", type=int, default=None,
                    help="Ollama max reply tokens (default 512)")
    ap.add_argument("--system", default=None, help="override the system prompt")
    ap.add_argument("--context-window", type=int, default=12,
                    help="max prior messages fed to the model per reply")
    ap.add_argument("--token-budget", type=int, default=2000,
                    help="approx token cap on the context window (whichever is smaller wins)")
    ap.add_argument("--no-rag", action="store_true",
                    help="disable in-RAM semantic recall (recency-only context)")
    ap.add_argument("--embed-model", default="nomic-embed-text",
                    help="Ollama model used to embed messages for recall")
    ap.add_argument("--embed-host", default=None,
                    help="Ollama host for embeddings (default: chat host or $OLLAMA_HOST)")
    ap.add_argument("--rag-top-k", type=int, default=4,
                    help="how many recalled messages to surface per reply")
    ap.add_argument("--embed-dim", type=int, default=256,
                    help="truncate embedding vectors to this many dims (MRL; 0 = full vector)")
    ap.add_argument("--list-models", action="store_true",
                    help="list models the backend can serve, then exit")
    ap.add_argument("--check", action="store_true",
                    help="run a reachability/model preflight, then exit (0 ok, 1 fail)")
    ap.add_argument("--insecure", action="store_true", help="skip TLS cert verification")
    ap.add_argument("--no-tls", action="store_true", help="plain ws/http (local/Tailscale)")
    return ap


def main() -> None:
    """CLI entry point: build the provider(s), then list, check, or join.

    Flow:
      1. Parse flags and resolve the chat provider (profile or explicit).
      2. ``--list-models`` prints the provider's models and returns;
         ``--check`` runs a preflight and exits 0 (ok) or 1 (fail).
      3. Otherwise ``server`` and ``port`` are required. A failing preflight
         only warns, then the agent joins the room and runs until
         interrupted with Ctrl-C.
    """
    ap = _build_parser()
    args = ap.parse_args()

    provider = _build_provider(args, ap)
    _apply_ollama_tuning(provider, args)

    # Discovery / preflight modes never join a room.
    if args.list_models:
        discover = getattr(provider, "available_models", None)
        if discover is None:
            # Custom providers may not define `name`; fall back to the class
            # name so the intended error is shown instead of AttributeError.
            label = getattr(provider, "name", None) or type(provider).__name__
            ap.error(f"provider '{label}' has no model discovery")
        for m in discover():
            print(m)
        return
    if args.check:
        ok, msg = preflight(provider)
        stream = sys.stdout if ok else sys.stderr
        print(("ok: " if ok else "FAIL: ") + msg, file=stream)
        sys.exit(0 if ok else 1)

    if args.server is None or args.port is None:
        ap.error("server and port are required to join a room")

    # Non-fatal preflight: warn early, but still try (discovery may be blocked
    # while completion works).
    ok, msg = preflight(provider)
    if not ok:
        print(f"⚠ preflight: {msg}", file=sys.stderr)

    # In-RAM semantic recall is on by default and local (Ollama embeddings),
    # independent of which provider answers chat. Reuse the chat provider's
    # `host` when it has one, so a single host/profile covers both.
    embedder = None
    if not args.no_rag:
        embedder = OllamaEmbedder(
            model=args.embed_model,
            host=args.embed_host or getattr(provider, "host", None),
            # --embed-dim 0 means "full vector" → pass None (no truncation).
            truncate_dim=args.embed_dim or None,
        )

    # Separate coder model for the sandbox path (Ollama only); None → reuse chat.
    code_provider = _build_code_provider(provider, args)
    if code_provider is not None:
        print(f"sandbox/code path → {code_provider.name}/{code_provider.model}", file=sys.stderr)

    bridge = AgentBridge(
        args.server, args.port, name=args.name, provider=provider,
        password=args.password, insecure=args.insecure, no_tls=args.no_tls,
        system_prompt=args.system, context_window=args.context_window,
        token_budget=args.token_budget, embedder=embedder, rag_top_k=args.rag_top_k,
        code_provider=code_provider,
    )
    try:
        bridge.run()
    except KeyboardInterrupt:
        print("\nagent stopped")
|
|
|
|
|
|
# Run the CLI only when this module is executed directly, not when imported.
if __name__ == "__main__":
    main()
|