Bugfixes from test run

This commit is contained in:
K. Hodges 2026-05-17 13:34:19 -07:00
parent 7c1cf29745
commit a32142e9ea
9 changed files with 207 additions and 48 deletions

View File

@ -6,7 +6,9 @@ from dataclasses import dataclass
import json import json
import os import os
from pathlib import Path from pathlib import Path
import re
import subprocess import subprocess
import tempfile
import time import time
from urllib import request from urllib import request
from urllib.error import URLError from urllib.error import URLError
@ -21,6 +23,7 @@ from .tasks import Task
DEFAULT_AGENT_TIMEOUT_SECONDS = 600 DEFAULT_AGENT_TIMEOUT_SECONDS = 600
OLLAMA_HEARTBEAT_SECONDS = 30.0
@dataclass(frozen=True) @dataclass(frozen=True)
@ -224,27 +227,74 @@ class AgentExecutor:
if agent.temperature is not None: if agent.temperature is not None:
prompt_input = f"/set parameter temperature {agent.temperature}\n{prompt}" prompt_input = f"/set parameter temperature {agent.temperature}\n{prompt}"
started = time.monotonic() started = time.monotonic()
self.logger.event(
"ollama.start",
"Starting Ollama model invocation",
agent_id=agent.id,
model=agent.model,
timeout_seconds=self.timeout_seconds,
)
try: try:
completed = subprocess.run( with tempfile.TemporaryFile("w+", encoding="utf-8", errors="replace") as stdout_file:
["ollama", "run", agent.model], with tempfile.TemporaryFile("w+", encoding="utf-8", errors="replace") as stderr_file:
cwd=self.project_root, process = subprocess.Popen(
input=prompt_input, ["ollama", "run", agent.model],
capture_output=True, cwd=self.project_root,
text=True, stdin=subprocess.PIPE,
encoding="utf-8", stdout=stdout_file,
errors="replace", stderr=stderr_file,
timeout=self.timeout_seconds, text=True,
) encoding="utf-8",
duration = time.monotonic() - started errors="replace",
return AgentInvocation( )
agent_id=agent.id, assert process.stdin is not None
command=command, process.stdin.write(prompt_input)
prompt=prompt_input, process.stdin.close()
exit_code=completed.returncode, last_heartbeat = started
stdout=_coerce_output(completed.stdout), timed_out = False
stderr=_coerce_output(completed.stderr), while process.poll() is None:
duration_seconds=duration, now = time.monotonic()
) elapsed = now - started
if elapsed > self.timeout_seconds:
process.kill()
timed_out = True
break
if now - last_heartbeat >= OLLAMA_HEARTBEAT_SECONDS:
self.logger.event(
"ollama.wait",
"Ollama invocation still running",
agent_id=agent.id,
model=agent.model,
elapsed=f"{elapsed:.0f}s",
)
last_heartbeat = now
time.sleep(1.0)
process.wait()
duration = time.monotonic() - started
stdout_file.seek(0)
stderr_file.seek(0)
stdout = stdout_file.read()
stderr = stderr_file.read()
if timed_out:
return AgentInvocation(
agent_id=agent.id,
command=command,
prompt=prompt_input,
exit_code=-1,
stdout=stdout,
stderr=stderr,
duration_seconds=duration,
timed_out=True,
)
return AgentInvocation(
agent_id=agent.id,
command=command,
prompt=prompt_input,
exit_code=process.returncode or 0,
stdout=stdout,
stderr=stderr,
duration_seconds=duration,
)
except FileNotFoundError as exc: except FileNotFoundError as exc:
duration = time.monotonic() - started duration = time.monotonic() - started
return AgentInvocation( return AgentInvocation(
@ -256,17 +306,16 @@ class AgentExecutor:
stderr=str(exc), stderr=str(exc),
duration_seconds=duration, duration_seconds=duration,
) )
except subprocess.TimeoutExpired as exc: except OSError as exc:
duration = time.monotonic() - started duration = time.monotonic() - started
return AgentInvocation( return AgentInvocation(
agent_id=agent.id, agent_id=agent.id,
command=command, command=command,
prompt=prompt_input, prompt=prompt_input,
exit_code=-1, exit_code=1,
stdout=_coerce_output(exc.stdout), stdout="",
stderr=_coerce_output(exc.stderr), stderr=str(exc),
duration_seconds=duration, duration_seconds=duration,
timed_out=True,
) )
def _invoke_openai_compatible(self, agent: AgentConfig, prompt: str) -> AgentInvocation: def _invoke_openai_compatible(self, agent: AgentConfig, prompt: str) -> AgentInvocation:
@ -390,8 +439,12 @@ def _coerce_output(value: str | bytes | None) -> str:
if value is None: if value is None:
return "" return ""
if isinstance(value, bytes): if isinstance(value, bytes):
return value.decode("utf-8", errors="replace") value = value.decode("utf-8", errors="replace")
return value return strip_ansi_escape_sequences(value)
def strip_ansi_escape_sequences(value: str) -> str:
return re.sub(r"\x1b\[[0-?]*[ -/]*[@-~]", "", value)
def _extract_openai_content(raw: str) -> str: def _extract_openai_content(raw: str) -> str:
@ -446,6 +499,11 @@ def output_contract_for(stage: StageConfig) -> str:
" path: <relative path>", " path: <relative path>",
" pattern: <glob for list_files or regex for grep>", " pattern: <glob for list_files or regex for grep>",
"", "",
"Use at most 5 lookup requests.",
"Do not repeat the same lookup request.",
"Prefer read_file for likely-relevant files over many grep variations.",
"Do not search .nightshift, .git, virtualenvs, caches, or artifact directories.",
"",
"NightShift will run these read-only lookup tools, save files-inspected.md, and re-run this planner stage with the retrieved context.", "NightShift will run these read-only lookup tools, save files-inspected.md, and re-run this planner stage with the retrieved context.",
] ]
) )

View File

@ -154,7 +154,7 @@ def default_run_id(now: datetime | None = None) -> str:
"""Return a filesystem-friendly UTC run id.""" """Return a filesystem-friendly UTC run id."""
value = now or datetime.now(timezone.utc) value = now or datetime.now(timezone.utc)
return value.strftime("%Y%m%dT%H%M%SZ") return value.strftime("%Y%m%dT%H%M%S.%fZ")
def _safe_artifact_segment(value: str, context: str) -> str: def _safe_artifact_segment(value: str, context: str) -> str:

View File

@ -16,6 +16,8 @@ from .safety import resolve_inside_root, resolve_project_root, validate_scoped_p
DEFAULT_MAX_BYTES = 20_000 DEFAULT_MAX_BYTES = 20_000
DEFAULT_MAX_MATCHES = 100 DEFAULT_MAX_MATCHES = 100
DEFAULT_MAX_LOOKUP_REQUESTS = 8
SKIPPED_REPO_PARTS = {".git", ".nightshift", "__pycache__", ".venv", "venv"}
@dataclass(frozen=True) @dataclass(frozen=True)
@ -55,7 +57,7 @@ class RepoTools:
relative_files = [ relative_files = [
_relative(item, self.project_root) _relative(item, self.project_root)
for item in sorted(candidates) for item in sorted(candidates)
if fnmatch.fnmatch(item.name, pattern) if fnmatch.fnmatch(item.name, pattern) and not _is_skipped_repo_path(item, self.project_root)
] ]
lines = relative_files[:max_files] lines = relative_files[:max_files]
if len(relative_files) > max_files: if len(relative_files) > max_files:
@ -64,6 +66,8 @@ class RepoTools:
def read_file(self, path: str, max_bytes: int = DEFAULT_MAX_BYTES) -> str: def read_file(self, path: str, max_bytes: int = DEFAULT_MAX_BYTES) -> str:
file_path = self._resolve_scoped(path, "read_file path") file_path = self._resolve_scoped(path, "read_file path")
if _is_skipped_repo_path(file_path, self.project_root):
return f"Path is skipped for repository lookup: {path}"
if not file_path.exists() or not file_path.is_file(): if not file_path.exists() or not file_path.is_file():
return f"File not found: {path}" return f"File not found: {path}"
data = file_path.read_bytes()[:max_bytes + 1] data = file_path.read_bytes()[:max_bytes + 1]
@ -85,6 +89,8 @@ class RepoTools:
files = [root] if root.is_file() else [item for item in root.rglob("*") if item.is_file()] files = [root] if root.is_file() else [item for item in root.rglob("*") if item.is_file()]
matches: list[str] = [] matches: list[str] = []
for file_path in sorted(files): for file_path in sorted(files):
if _is_skipped_repo_path(file_path, self.project_root):
continue
try: try:
text = file_path.read_text(encoding="utf-8", errors="replace") text = file_path.read_text(encoding="utf-8", errors="replace")
except OSError: except OSError:
@ -110,7 +116,8 @@ class RepoTools:
def execute_requests(self, task_id: str, requests: list[ToolCall], filename: str = "repo-tools.md") -> str: def execute_requests(self, task_id: str, requests: list[ToolCall], filename: str = "repo-tools.md") -> str:
completed: list[ToolCall] = [] completed: list[ToolCall] = []
for request in requests: unique_requests = dedupe_tool_calls(requests)[:DEFAULT_MAX_LOOKUP_REQUESTS]
for request in unique_requests:
self.logger.event( self.logger.event(
"tool.call", "tool.call",
"Running repo lookup tool", "Running repo lookup tool",
@ -175,7 +182,7 @@ def format_tool_calls(calls: list[ToolCall]) -> str:
return "\n".join(lines) return "\n".join(lines)
def parse_lookup_requests(text: str) -> list[ToolCall]: def parse_lookup_requests(text: str, max_requests: int = DEFAULT_MAX_LOOKUP_REQUESTS) -> list[ToolCall]:
"""Parse a small YAML-like lookup request list from model output.""" """Parse a small YAML-like lookup request list from model output."""
lines = text.splitlines() lines = text.splitlines()
@ -215,7 +222,19 @@ def parse_lookup_requests(text: str) -> list[ToolCall]:
flush() flush()
current[key] = value current[key] = value
flush() flush()
return requests return dedupe_tool_calls(requests)[:max_requests]
def dedupe_tool_calls(requests: list[ToolCall]) -> list[ToolCall]:
seen: set[tuple[str, tuple[tuple[str, str], ...]]] = set()
unique: list[ToolCall] = []
for request in requests:
key = (request.name, tuple(sorted(request.arguments.items())))
if key in seen:
continue
seen.add(key)
unique.append(request)
return unique
def extract_agent_stdout(artifact_text: str) -> str: def extract_agent_stdout(artifact_text: str) -> str:
@ -249,3 +268,11 @@ def _relative(path: Path, root: Path) -> str:
return path.relative_to(root).as_posix() return path.relative_to(root).as_posix()
except ValueError: except ValueError:
return path.as_posix() return path.as_posix()
def _is_skipped_repo_path(path: Path, root: Path) -> bool:
try:
parts = set(path.relative_to(root).parts)
except ValueError:
parts = set(path.parts)
return bool(parts & SKIPPED_REPO_PARTS)

View File

@ -27,18 +27,23 @@ class RunLogger:
self.console = console self.console = console
self._run_log_path: Path | None = None self._run_log_path: Path | None = None
self._aggregate_log_path: Path | None = None self._aggregate_log_path: Path | None = None
self._initialized_run_logs: set[Path] = set()
def bind(self, artifacts: ArtifactStore) -> None: def bind(self, artifacts: ArtifactStore) -> None:
artifacts.initialize_run() artifacts.initialize_run()
self._run_log_path = artifacts.run_log_path self._run_log_path = artifacts.run_log_path
self._aggregate_log_path = artifacts.aggregate_log_path self._aggregate_log_path = artifacts.aggregate_log_path
if self._run_log_path not in self._initialized_run_logs:
self._run_log_path.parent.mkdir(parents=True, exist_ok=True)
self._run_log_path.write_text("", encoding="utf-8")
self._initialized_run_logs.add(self._run_log_path)
def event(self, event: str, message: str, **fields: object) -> None: def event(self, event: str, message: str, **fields: object) -> None:
safe_fields = _redact_fields(fields) safe_fields = _redact_fields(fields)
line = format_log_line(LogEvent(event=event, message=message, fields=safe_fields)) line = format_log_line(LogEvent(event=event, message=message, fields=safe_fields))
if self.console is not None: if self.console is not None:
self.console(line) self.console(line)
for path in (self._run_log_path, self._aggregate_log_path): for path in (self._run_log_path,):
if path is None: if path is None:
continue continue
path.parent.mkdir(parents=True, exist_ok=True) path.parent.mkdir(parents=True, exist_ok=True)

View File

@ -50,13 +50,18 @@ def read_artifact(run_path: Path, relative_path: str) -> str:
def render_dashboard(artifact_dir: str | Path) -> str: def render_dashboard(artifact_dir: str | Path) -> str:
runs = list_runs(artifact_dir) runs = list_runs(artifact_dir)
body = ["<h1>NightShift Dashboard</h1>", '<meta http-equiv="refresh" content="5">'] body = [
"<h1>NightShift Dashboard</h1>",
'<meta http-equiv="refresh" content="5">',
"<p>Showing artifact files from the newest run first. This page is read-only and refreshes every 5 seconds.</p>",
]
if not runs: if not runs:
body.append("<p>No runs found.</p>") body.append("<p>No runs found.</p>")
for run in runs: for index, run in enumerate(runs):
title = "Latest Run" if index == 0 else "Older Run"
body.extend( body.extend(
[ [
f"<section><h2>{escape(run.name)}</h2>", f"<section><h2>{title}: {escape(run.name)}</h2>",
"<pre>", "<pre>",
escape(run.summary), escape(run.summary),
"</pre>", "</pre>",
@ -84,11 +89,15 @@ def create_app(project_root: str | Path = ".", artifact_dir: str | Path = ".nigh
@app.get("/") @app.get("/")
def index(): def index():
return Response(render_dashboard(artifacts), mimetype="text/html") response = Response(render_dashboard(artifacts), mimetype="text/html")
response.headers["Cache-Control"] = "no-store, max-age=0"
return response
@app.get("/runs/<run_id>/<path:artifact_path>") @app.get("/runs/<run_id>/<path:artifact_path>")
def artifact(run_id: str, artifact_path: str): def artifact(run_id: str, artifact_path: str):
content = read_artifact(artifacts / "runs" / run_id, artifact_path) content = read_artifact(artifacts / "runs" / run_id, artifact_path)
return Response(f"<pre>{escape(content)}</pre>", mimetype="text/html") response = Response(f"<pre>{escape(content)}</pre>", mimetype="text/html")
response.headers["Cache-Control"] = "no-store, max-age=0"
return response
return app return app

View File

@ -1,9 +1,10 @@
from pathlib import Path from pathlib import Path
import io
import tempfile import tempfile
import unittest import unittest
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
from nightshift.agents import AgentExecutor, build_prompt_bundle, parse_review_output from nightshift.agents import AgentExecutor, build_prompt_bundle, parse_review_output, strip_ansi_escape_sequences
from nightshift.agents import AgentInvocation, format_agent_invocation from nightshift.agents import AgentInvocation, format_agent_invocation
from nightshift.artifacts import ArtifactStore from nightshift.artifacts import ArtifactStore
from nightshift.config import AgentConfig, StageConfig from nightshift.config import AgentConfig, StageConfig
@ -118,17 +119,25 @@ class AgentExecutorTests(unittest.TestCase):
task = parse_tasks(TASK_MD)[0] task = parse_tasks(TASK_MD)[0]
stage = StageConfig(id="plan", type="agent", agent="planner", output="plan.md") stage = StageConfig(id="plan", type="agent", agent="planner", output="plan.md")
completed = type( class FakePopen:
"Completed", def __init__(self, args, cwd=None, stdin=None, stdout=None, stderr=None, **kwargs):
(), self.args = args
{"returncode": 0, "stdout": "ollama output", "stderr": ""}, self.stdin = io.StringIO()
)() self.returncode = 0
with patch("nightshift.agents.subprocess.run", return_value=completed) as run: stdout.write("ollama output")
def poll(self):
return self.returncode
def wait(self):
return self.returncode
with patch("nightshift.agents.subprocess.Popen", side_effect=FakePopen) as popen:
result = executor.run_stage(stage, task) result = executor.run_stage(stage, task)
self.assertEqual(result.status, "pass") self.assertEqual(result.status, "pass")
run.assert_called_once() popen.assert_called_once()
self.assertEqual(run.call_args.args[0], ["ollama", "run", "tiny-model"]) self.assertEqual(popen.call_args.args[0], ["ollama", "run", "tiny-model"])
output = (root / result.output_path).read_text(encoding="utf-8") output = (root / result.output_path).read_text(encoding="utf-8")
self.assertIn("ollama run tiny-model", output) self.assertIn("ollama run tiny-model", output)
@ -185,6 +194,9 @@ class AgentExecutorTests(unittest.TestCase):
self.assertIn("Agent: `planner`", output) self.assertIn("Agent: `planner`", output)
self.assertIn("## stderr", output) self.assertIn("## stderr", output)
def test_strip_ansi_escape_sequences(self) -> None:
self.assertEqual(strip_ansi_escape_sequences("\x1b[?25lthinking\x1b[0m"), "thinking")
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View File

@ -2,7 +2,7 @@ from pathlib import Path
import tempfile import tempfile
import unittest import unittest
from nightshift.artifacts import ArtifactStore from nightshift.artifacts import ArtifactStore, default_run_id
from nightshift.errors import ArtifactError from nightshift.errors import ArtifactError
from nightshift.init import init_project from nightshift.init import init_project
from nightshift.tasks import parse_task_file from nightshift.tasks import parse_task_file
@ -62,6 +62,11 @@ class ArtifactStoreTests(unittest.TestCase):
with self.assertRaisesRegex(ArtifactError, "task id contains unsafe"): with self.assertRaisesRegex(ArtifactError, "task id contains unsafe"):
store.create_task_dir("../TASK-001") store.create_task_dir("../TASK-001")
def test_default_run_id_has_subsecond_precision(self) -> None:
run_id = default_run_id()
self.assertRegex(run_id, r"^\d{8}T\d{6}\.\d{6}Z$")
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View File

@ -240,10 +240,13 @@ Acceptance Criteria:
config = make_config(root, stages) config = make_config(root, stages)
runner = PipelineRunner(config, artifacts) runner = PipelineRunner(config, artifacts)
task = parse_tasks(TASK_MD)[0] task = parse_tasks(TASK_MD)[0]
artifacts.initialize_run()
artifacts.run_log_path.write_text("old run log\n", encoding="utf-8")
runner.run_task(task) runner.run_task(task)
log = (root / ".nightshift" / "runs" / "test-run" / "run.log").read_text(encoding="utf-8") log = (root / ".nightshift" / "runs" / "test-run" / "run.log").read_text(encoding="utf-8")
self.assertNotIn("old run log", log)
self.assertIn("task.start", log) self.assertIn("task.start", log)
self.assertIn("stage.start", log) self.assertIn("stage.start", log)
self.assertIn("agent.finish", log) self.assertIn("agent.finish", log)

View File

@ -42,6 +42,46 @@ lookup_requests:
self.assertEqual(requests[0].arguments["path"], "nightshift/pipeline.py") self.assertEqual(requests[0].arguments["path"], "nightshift/pipeline.py")
self.assertEqual(requests[1].arguments["pattern"], "PipelineRunner") self.assertEqual(requests[1].arguments["pattern"], "PipelineRunner")
def test_parse_lookup_requests_dedupes_and_caps(self) -> None:
repeated = "\n".join(
[
"lookup_requests:",
*[
"- tool: grep\n path: .\n pattern: parse.*tokenize"
for _ in range(20)
],
"- tool: read_file\n path: lisp.py",
]
)
requests = parse_lookup_requests(repeated)
self.assertEqual(len(requests), 2)
self.assertEqual([request.name for request in requests], ["grep", "read_file"])
def test_repo_tools_skip_artifact_directories(self) -> None:
with tempfile.TemporaryDirectory() as directory:
root = Path(directory)
(root / ".nightshift" / "runs").mkdir(parents=True)
(root / ".nightshift" / "runs" / "run.log").write_text("needle\n", encoding="utf-8")
(root / "src").mkdir()
(root / "src" / "app.py").write_text("needle\n", encoding="utf-8")
safety = SafetyConfig(
require_clean_worktree=False,
scoped_paths=(".",),
allowed_commands=(),
forbidden_commands=(),
)
tools = RepoTools(root, safety, ArtifactStore(root, ".nightshift", run_id="test-run"))
files = tools.list_files(".")
matches = tools.grep("needle", ".")
self.assertIn("src/app.py", files)
self.assertNotIn(".nightshift", files)
self.assertIn("src/app.py:1", matches)
self.assertNotIn(".nightshift", matches)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()