From d84d580671f51d8468ae47843aba4ee5c0ec7dd4 Mon Sep 17 00:00:00 2001 From: "K. Hodges" Date: Sun, 17 May 2026 00:38:00 -0700 Subject: [PATCH] Implement pipeline execution, context, reports, and MVP docs --- README.md | 711 +++++++++---------------------------- docs/design.md | 189 ++++++++++ docs/devlog/mvp-summary.md | 45 +++ docs/devlog/phase10.md | 21 ++ docs/devlog/phase11.md | 23 ++ docs/devlog/phase7.md | 23 ++ docs/devlog/phase8.md | 25 ++ docs/devlog/phase9.md | 22 ++ nightshift/agents.py | 290 +++++++++++++++ nightshift/cli.py | 22 +- nightshift/context.py | 107 ++++++ nightshift/errors.py | 8 + nightshift/pipeline.py | 206 +++++++++++ nightshift/reports.py | 205 +++++++++++ tests/test_agents.py | 99 ++++++ tests/test_context.py | 57 +++ tests/test_pipeline.py | 138 +++++++ tests/test_reports.py | 58 +++ 18 files changed, 1695 insertions(+), 554 deletions(-) create mode 100644 docs/devlog/mvp-summary.md create mode 100644 docs/devlog/phase10.md create mode 100644 docs/devlog/phase11.md create mode 100644 docs/devlog/phase7.md create mode 100644 docs/devlog/phase8.md create mode 100644 docs/devlog/phase9.md create mode 100644 nightshift/agents.py create mode 100644 nightshift/context.py create mode 100644 nightshift/pipeline.py create mode 100644 nightshift/reports.py create mode 100644 tests/test_agents.py create mode 100644 tests/test_context.py create mode 100644 tests/test_pipeline.py create mode 100644 tests/test_reports.py diff --git a/README.md b/README.md index 78b2f6e..3ad5a37 100644 --- a/README.md +++ b/README.md @@ -1,395 +1,100 @@ # NightShift -> Auditable local-first AI coding pipelines. -> -> Wake up to reviewable work, not chaos. +Auditable local-first AI coding pipelines. -NightShift is a deterministic pipeline runner for long-running AI-assisted coding workflows. +NightShift is a deterministic pipeline runner for long-running AI-assisted coding workflows. 
It runs one markdown task at a time through a declarative YAML pipeline, records the important artifacts, and leaves the user with a reviewable work package. -It is designed for overnight or unattended execution against a scoped project repository using local or external coding agents. +NightShift is not an autonomous software engineer. It is an orchestration layer that treats AI agents as unreliable workers inside bounded, testable, auditable workflows. -NightShift is not an autonomous coding god. +## MVP Status -It is a safety-aware orchestration system that treats LLMs like unreliable distributed systems. +The core MVP is implemented: -Agents are bounded by: +- `nightshift init` creates starter config, task, and agent prompt files. +- `nightshift validate` checks config structure, prompt paths, task parsing, scoped paths, and command safety. +- `nightshift run` executes the next incomplete task. +- `nightshift run --task TASK-001` executes a specific task. +- Command-backed agents receive compact prompt bundles on stdin. +- Command stages run through allowlist and forbidden-fragment checks. +- Runs create `.nightshift/` artifacts, task context, retry context, command output, agent output, final notes, and run summaries. +- Unit tests cover config, safety, tasks, artifacts, commands, agents, pipeline retries, context, and reports. 
-* scoped repository access -* structured pipeline stages -* tests and static analysis -* retry limits -* review stages -* context compaction -* durable artifacts +## What NightShift Is -The output is: +NightShift is built for reviewable automation: -* reviewable code -* plans -* logs -* diffs -* test output -* review notes -* overnight summaries +- local-first execution +- declarative pipeline stages +- markdown task files +- command-backed agent wrappers +- explicit retry limits +- command allowlists +- scoped path checks +- durable markdown/text artifacts +- compact context handoff +- final reports for human review -Not blind autonomous shipping. +The goal is to wake up to useful artifacts and a repository state you can inspect. ---- +## What NightShift Is Not -# Why? +NightShift does not try to autonomously ship code. It does not push branches, deploy software, run arbitrary hooks, execute parallel task swarms, or grant agents unlimited repository access. Human review remains the final authority. -Most "AI coding agents" optimize for: - -* autonomy -* demo magic -* speed -* vibes - -NightShift optimizes for: - -1. Cheapness -2. Correctness -3. Auditability -4. Speed - -NightShift is also intended to serve as an experimentation platform for AI-assisted software engineering workflows. - -The system is intentionally designed to facilitate testing and comparison of: - -* different models -* different agent roles -* prompt structures -* system prompts -* retry strategies -* review strategies -* context compaction techniques -* pipeline structures -* reasoning formats -* constraint-driven workflows - -The pipeline architecture should make these experiments reproducible, auditable, and configurable rather than hidden inside opaque agent behavior. - -The assumption is simple: - -> AI systems are useful but unreliable. - -NightShift embraces this reality by building deterministic orchestration around nondeterministic agents. 
- ---- - -# Features - -## Local-first execution - -Designed primarily for: - -* Ollama -* local models -* Codex CLI -* Claude Code -* command-driven wrappers - -Use cheap local models for most work. -Escalate expensive models only where useful. - ---- - -## Declarative pipelines - -Define workflows in YAML: - -```yaml -pipeline: - stages: - - id: plan - type: agent - agent: planner - - - id: implement - type: agent - agent: implementer - - - id: test - type: command - commands: - - cargo test - - - id: review - type: review - agent: reviewer -``` - -Pipelines are intentionally portable and configurable so users can experiment with: - -* model routing -* review loops -* retry logic -* prompt engineering -* reasoning formats -* planning strategies -* context structures -* cost/performance tradeoffs - -NightShift is designed to make these workflow experiments measurable and repeatable rather than ad-hoc. - -```yaml -pipeline: - stages: - - id: plan - type: agent - agent: planner - - - id: implement - type: agent - agent: implementer - - - id: test - type: command - commands: - - cargo test - - - id: review - type: review - agent: reviewer -``` - ---- - -## Review-first workflows - -NightShift is designed around: - -```text -plan - -> review - -> implement - -> test - -> static analysis - -> review - -> retry or complete -``` - -The goal is: - -> Wake up to a useful review package. - ---- - -## Durable artifacts - -Every run creates a full audit trail. 
- -Example: - -```text -.nightshift/ - runs/ - 2026-05-16-overnight/ - run-summary.md - - tasks/ - TASK-001/ - plan.md - review.md - implementation-log.md - test-output.txt - diff.patch -``` - -This makes: - -* debugging easier -* prompt experimentation possible -* retries understandable -* failures inspectable -* portfolio demos stronger - ---- - -## Scoped repository safety - -NightShift can: - -* restrict writable directories -* allowlist commands -* block dangerous shell operations -* require clean git worktrees - -The system is intentionally conservative. - ---- - -# Philosophy - -NightShift follows a few core principles. - -## Deterministic orchestration - -Agents are probabilistic. - -The pipeline runner should not be. - ---- - -## Context compaction - -Do not dump infinite history into prompts. - -Use: - -* project context -* task context -* retry summaries - -Keep context compact and intentional. - ---- - -## Reviewability over autonomy - -NightShift is optimized to produce: - -* reviewable work -* reviewable reasoning -* reviewable failure - -Not autonomous deployment. 
- ---- - -## Boring reliability beats magical demos - -The system should: - -* fail clearly -* retry explicitly -* preserve artifacts -* avoid spooky hidden behavior - ---- - -# Architecture Overview - -```text -Task Parser - ↓ -Pipeline Runner - ↓ -Stage Executor - ┌────┴────┐ - ↓ ↓ -Agents Commands -``` - -Core components: - -* Task parser -* Pipeline runner -* Stage executor -* Agent wrappers -* Command runner -* Artifact store -* Context manager -* Safety layer - ---- - -# Example Workflow - -Input: - -* repository -* tasks.md -* nightshift.yaml -* agent prompt files - -Execution: - -```text -TASK-001 - ↓ -plan - ↓ -review_plan - ↓ -implement - ↓ -test - ↓ -static analysis - ↓ -review - ↓ -complete or retry -``` - -Output: - -* modified repository -* task artifacts -* overnight report -* review notes - ---- - -# Installation - -## Status - -NightShift is currently an early-stage project. - -The MVP focuses on: - -* local-first execution -* declarative pipelines -* task orchestration -* artifact generation -* safe command execution -* reviewable workflows - ---- - -## Planned Installation - -Python version: - -```bash -pip install nightshift -``` +## Install Development install: ```bash -git clone -cd nightshift pip install -e . ``` ---- +You can also run the CLI module directly from a checkout: -# Quickstart +```bash +python -m nightshift.cli --help +``` -## 1. Initialize a project +NightShift currently uses the Python standard library for runtime behavior. PyYAML is used automatically if installed, but the starter config works with the built-in YAML subset parser. + +## Quickstart + +Create starter files: ```bash nightshift init ``` -Creates: +Validate the project: -```text -nightshift.yaml -tasks.md -agents/ +```bash +nightshift validate ``` ---- +Run the next incomplete task: -## 2. 
Define tasks +```bash +nightshift run +``` -Example: +Run a specific task: + +```bash +nightshift run --task TASK-001 +``` + +Review artifacts: + +```text +.nightshift/runs/<run-id>/ +``` + +## Task File Example + +Tasks live in markdown checklist format: ```markdown +# Tasks + - [ ] TASK-001: Add YAML config loading Description: @@ -398,59 +103,13 @@ Implement config loading for NightShift. Acceptance Criteria: - Loads `nightshift.yaml` - Validates required fields +- Returns typed config objects - Includes tests ``` ---- +NightShift parses task id, title, completion state, description, acceptance criteria, optional dependency bullets, and raw task markdown. -## 3. Configure pipeline - -Example: - -```yaml -project: - root: . - task_file: tasks.md - artifact_dir: .nightshift - -pipeline: - max_task_retries: 3 -``` - ---- - -## 4. Run NightShift - -```bash -nightshift run -``` - -Or: - -```bash -nightshift run --task TASK-001 -``` - ---- - -## 5. Review artifacts - -```text -.nightshift/runs/<run-id>/ -``` - -Contains: - -* plans -* logs -* diffs -* test output -* review notes -* summaries - ---- - -# Example Config +## Config Example ```yaml project: @@ -460,210 +119,160 @@ project: artifact_dir: .nightshift safety: - require_clean_worktree: true - + require_clean_worktree: false scoped_paths: - - src/ - - tests/ - + - . 
allowed_commands: - - cargo test - - cargo fmt --check - + - python -m unittest forbidden_commands: - rm -rf - git push + - curl | bash agents: planner: backend: command - command: codex + command: echo system_prompt: agents/planner.md implementer: backend: command - command: codex + command: echo system_prompt: agents/implementer.md reviewer: backend: command - command: codex + command: echo system_prompt: agents/reviewer.md pipeline: max_task_retries: 3 - stages: - id: plan type: agent agent: planner + output: plan.md - id: implement type: agent agent: implementer + output: implementation-log.md - id: test type: command commands: - - cargo test + - python -m unittest + output: test-output.txt - id: review - type: review + type: agent_review agent: reviewer + on_fail: implement + output: review.md + + - id: summarize + type: summarize + output: final-notes.md ``` ---- +## Agent Backends -# Safety Model +The MVP supports `backend: command`. -NightShift intentionally limits agent freedom. +NightShift builds a prompt bundle containing: -## Repository scope restrictions +- system prompt +- stage id and type +- task markdown +- acceptance criteria +- project context +- task context +- previous stage output +- retry notes +- output contract -Agents should only operate within configured project paths. +The prompt is passed to the configured command on stdin. stdout, stderr, exit code, duration, and the prompt are persisted as artifacts. ---- - -## Command allowlists - -Commands must be explicitly permitted. - -Example: +Review agents should emit: ```yaml -allowed_commands: - - cargo test - - cargo fmt --check +status: pass | fail | retry | escalate +reason: +next_stage: +context_update: ``` ---- +## Safety Model -## Dangerous command blocking +NightShift validates paths and commands before execution. 
-NightShift may block commands such as: +Path safety: + +- project roots are resolved with `pathlib` +- task files and prompt files must stay inside the project root +- artifact paths cannot escape `.nightshift/` +- task artifact writes cannot escape the task directory + +Command safety: + +- command stages must match `allowed_commands` +- forbidden fragments are blocked before allowlist acceptance +- command output and exit codes are recorded +- command stages stop at the first failing or timed-out command + +The MVP does not push, deploy, create branches, or execute arbitrary Python hooks. + +## Artifact Layout + +A run creates human-readable artifacts: ```text -rm -rf -git push -curl | bash +.nightshift/ + project-context.md + runs/ + <run-id>/ + run-summary.md + config.snapshot.yaml + tasks/ + TASK-001/ + task.md + context.md + plan.md + implementation-log.md + test-output.txt + review.md + stage-results.md + context-out.md + final-notes.md ``` ---- +Where possible, artifacts are written even when a stage fails. -## Review-first workflow +## Development -The system assumes: +Run tests: -> Humans remain the final authority. 
+```bash +python -m unittest discover -v +``` ---- +Compile-check modules: -# Roadmap +```bash +python -m compileall nightshift tests +``` -## MVP +## Roadmap -* [ ] YAML config loading -* [ ] Markdown task parsing -* [ ] Pipeline execution -* [ ] Fake command agents -* [ ] Artifact generation -* [ ] Safe command execution -* [ ] Retry handling -* [ ] Overnight reports +Next major work: -## Future +- real local model wrappers +- stronger git safety and diff capture +- task completion updates +- dependency handling +- richer status command +- prompt and model experimentation +- optional branch isolation +- longer-run multi-task reports -* [ ] Ollama integration -* [ ] Claude Code integration -* [ ] Codex integration -* [ ] Parallel execution -* [ ] DAG workflows -* [ ] Prompt A/B testing -* [ ] Cost telemetry -* [ ] Git branch isolation -* [ ] Dashboard UI -* [ ] Constraint-language experimentation - ---- - -# Inspiration - -NightShift is inspired by: - -* CI/CD systems -* build pipelines -* state machines -* agent orchestration research -* distributed systems thinking -* local-first tooling -* practical AI skepticism - ---- - -# Philosophy Statement - -NightShift rejects two extremes: - -## Fully manual engineering - -Too slow. - -## Reckless autonomous agents - -Too unreliable. - -Instead: - -> NightShift treats AI systems as bounded workers inside deterministic workflows. - -The goal is not artificial software gods. - -The goal is trustworthy leverage. - ---- - -# License - -Planned: - -GPLv3 - -Rationale: - -NightShift is licensed under GPLv3 because AI-assisted software engineering is rapidly becoming dependent on opaque, vendor-controlled tooling. As agent systems become part of the actual software production process, users deserve the freedom to inspect, modify, audit, and reproduce the systems operating on their codebases. 
GPLv3 helps ensure that improvements to NightShift and its orchestration layer remain part of a transparent, inspectable ecosystem rather than disappearing into proprietary black boxes. The goal is not just open source for its own sake, but preserving user autonomy, local-first experimentation, and the ability to understand how automated systems are making decisions inside increasingly critical engineering workflows. - -* encourages community contribution -* protects local-first ecosystem -* aligns with hacker/free software ethos - -[Read more here, GPLv3 saves the world.](https://www.gnu.org/licenses/rms-why-gplv3.html) - ---- - -# Contributing - -NightShift is intentionally early and experimental. - -Good contributions: - -* safety improvements -* pipeline reliability -* better artifact systems -* better context compaction -* local model integrations -* tests -* docs - -Bad contributions: - -* adding magical autonomy before reliability exists -* removing safety boundaries -* overcomplicated abstractions before MVP stability - ---- - -# Final Note - -AI coding tools are currently optimized for demos. - -NightShift is optimized for surviving the night. +NightShift remains oriented around reviewable output, not blind autonomy. diff --git a/docs/design.md b/docs/design.md index b87c875..72e2f52 100644 --- a/docs/design.md +++ b/docs/design.md @@ -870,6 +870,195 @@ This MVP is sufficient to: --- +# 17. MVP Implementation Status + +The first MVP pass is implemented across phases 1 through 11. 
+ +Implemented capabilities: + +* Project initialization +* Config validation +* Markdown task parsing +* Path and command safety checks +* Artifact storage +* Command stage execution +* Command-backed agent execution +* Deterministic pipeline execution +* Retry redirection and retry limits +* Context file creation and prompt injection +* Final task notes and run summaries +* README documentation + +Known MVP limitations: + +* Only the `command` agent backend is implemented +* `nightshift status` is still a placeholder +* Clean worktree enforcement is not fully wired +* Diff patch capture is not implemented +* Task completion mutation is not implemented +* Task dependency enforcement is not implemented +* Multi-task overnight batching is not implemented + +--- + +# 18. Next Major Update Plan + +The next major update should turn the single-task MVP into a more practical local runner while preserving the same safety and auditability model. + +## Phase 12: Status Command + +* [ ] Implement `nightshift status` +* [ ] Print config path and project root +* [ ] Print task counts +* [ ] Print next incomplete task +* [ ] Print latest run directory +* [ ] Print validation warnings where useful +* [ ] Add tests + +Acceptance Criteria: + +* User can inspect project state without running a pipeline +* Missing or malformed inputs produce clear errors +* Latest artifacts are discoverable from the CLI + +--- + +## Phase 13: Git Safety and Diff Artifacts + +* [ ] Implement clean-worktree enforcement when configured +* [ ] Capture pre-run git status +* [ ] Capture post-run git status +* [ ] Write `diff.patch` +* [ ] Include changed files in final reports +* [ ] Handle non-git repositories gracefully +* [ ] Add tests with temporary git repositories where practical + +Acceptance Criteria: + +* `require_clean_worktree: true` blocks dirty repositories +* Diffs are persisted after task execution +* Reports identify modified files without requiring users to inspect every artifact + +--- + 
+## Phase 14: Task Completion Updates + +* [ ] Mark completed tasks in `tasks.md` +* [ ] Preserve task file formatting where practical +* [ ] Avoid marking failed tasks complete +* [ ] Record task completion decisions in artifacts +* [ ] Add tests + +Acceptance Criteria: + +* Successful runs can mark `[ ]` tasks as `[x]` +* Failed runs leave tasks incomplete +* Task file updates are reviewable and minimal + +--- + +## Phase 15: Multi-Task Run Mode + +* [ ] Add `nightshift run --all` +* [ ] Process incomplete tasks in file order +* [ ] Stop or continue on failure based on config +* [ ] Create per-task artifact directories under one run +* [ ] Generate aggregate run summary +* [ ] Add tests + +Acceptance Criteria: + +* User can run more than one task unattended +* Each task remains independently reviewable +* Aggregate summary shows completed and failed tasks + +--- + +## Phase 16: Dependency Handling + +* [ ] Parse dependency bullets into structured task dependencies +* [ ] Block tasks whose dependencies are incomplete +* [ ] Detect missing dependency references +* [ ] Detect simple dependency cycles +* [ ] Report blocked tasks in status and run summaries +* [ ] Add tests + +Acceptance Criteria: + +* Tasks do not run before declared dependencies are complete +* Dependency errors are clear and actionable +* Task ordering remains deterministic + +--- + +## Phase 17: Local Model Backend + +* [ ] Add an Ollama-compatible agent backend +* [ ] Keep the existing command backend +* [ ] Reuse prompt bundle construction +* [ ] Persist request/response metadata +* [ ] Handle model errors and timeouts +* [ ] Add fake backend tests without requiring Ollama + +Acceptance Criteria: + +* Users can configure a local model backend for agent stages +* Tests do not require real model calls +* Agent artifacts remain comparable across backends + +--- + +## Phase 18: Prompt and Pipeline Experiments + +* [ ] Add prompt variant identifiers +* [ ] Snapshot prompt files per run +* [ ] Record 
agent backend metadata +* [ ] Add optional experiment labels to config +* [ ] Include experiment metadata in reports +* [ ] Add tests + +Acceptance Criteria: + +* Users can compare prompt/pipeline runs from artifacts +* Reports show which prompts and backend settings produced a result +* Experiment metadata does not change execution semantics + +--- + +## Phase 19: Stronger Command Execution + +* [ ] Replace shell-string execution where possible with parsed argv execution +* [ ] Preserve compatibility with explicit shell command stages when configured +* [ ] Add per-command timeout config +* [ ] Add environment variable allowlists +* [ ] Add working-directory restrictions +* [ ] Add tests + +Acceptance Criteria: + +* Command execution is safer by default +* Shell execution is explicit rather than implicit +* Command behavior remains auditable + +--- + +## Phase 20: Documentation and Examples Refresh + +* [ ] Add complete example project +* [ ] Add example fake-agent pipeline +* [ ] Add example local-model pipeline +* [ ] Document artifact review workflow +* [ ] Document troubleshooting +* [ ] Add config reference + +Acceptance Criteria: + +* New users can run a complete demo from a fresh checkout +* Documentation distinguishes implemented features from planned features +* Examples remain safe to run locally + +--- + # Appendix A: Design Decisions and Rationale ## A.1 Local-first architecture diff --git a/docs/devlog/mvp-summary.md b/docs/devlog/mvp-summary.md new file mode 100644 index 0000000..46267e5 --- /dev/null +++ b/docs/devlog/mvp-summary.md @@ -0,0 +1,45 @@ +# MVP Devlog Summary + +## Scope + +The first MVP pass implemented phases 1 through 11 from `docs/vibe.md`. + +## Completed Stages + +- Phase 1: Python package skeleton, CLI entry point, starter project generation, and init tests. +- Phase 2: typed YAML config loading, structural validation, agent/stage reference checks, and config tests. 
+- Phase 3: project-root path safety, scoped path checks, artifact path safety, command allowlist checks, forbidden command fragments, and safety tests. +- Phase 4: markdown task parser, task selection helpers, useful task errors, and parser tests. +- Phase 5: artifact store, run/task directories, config and task snapshots, stage output writing, and artifact tests. +- Phase 6: command stage executor, stdout/stderr/exit code capture, output persistence, `StageResult`, and command tests. +- Phase 7: command-backed agent executor, prompt bundle construction, review output parsing, and fake-agent tests. +- Phase 8: deterministic pipeline runner, ordered stage execution, retry redirection, retry limit enforcement, CLI `run`, and pipeline tests. +- Phase 9: project/task/retry context files, agent context injection, `context-out.md`, and context tests. +- Phase 10: final task reports, stage summaries, run summaries, modified-file detection when available, and report tests. +- Phase 11: README updated to document the implemented MVP and current safety model. + +## Major Decisions + +- Runtime code stays dependency-light and uses the standard library where practical. +- YAML support uses PyYAML if installed, with a small fallback parser for starter configs. +- Pipelines are state machines, not DAGs. +- v1 executes one task at a time. +- Agents use the `command` backend first. +- Command stages require exact allowlist matches after whitespace normalization. +- Forbidden command fragments are checked before allowlist acceptance. +- Artifacts are markdown/text-first and are treated as product output, not debug leftovers. +- Context is compact and layered into project, task, and retry context. + +## Current MVP State + +NightShift can initialize a project, validate config and tasks, run a fake command-agent pipeline for one markdown task, enforce retry limits, persist artifacts, and produce reviewable summaries. 
+ +## Remaining Product Gaps + +- Real local model backends are not implemented. +- `nightshift status` remains a placeholder. +- Clean-worktree enforcement is configured but not fully implemented. +- Diff patch capture is not implemented. +- Task completion mutation is not implemented. +- Dependency solving is not implemented. +- Multi-task overnight batching is not implemented. diff --git a/docs/devlog/phase10.md b/docs/devlog/phase10.md new file mode 100644 index 0000000..0043134 --- /dev/null +++ b/docs/devlog/phase10.md @@ -0,0 +1,21 @@ +# Phase 10 Devlog: Reports + +## Implemented + +- Added `nightshift/reports.py`. +- Generated final task notes. +- Generated `stage-results.md`. +- Generated run summaries. +- Included task status, retry count, final reason, acceptance criteria, stage results, artifact paths, and modified files when available. +- Wired report generation into the pipeline runner. +- Added report tests. + +## Decisions Made + +- Report generation is separated from the pipeline runner so formatting can evolve without changing orchestration logic. +- Modified file detection uses `git status --short` when available, but report generation succeeds if Git is unavailable or rejects the repository. +- The summarize stage remains a pipeline stage artifact; Phase 10 final reports are always generated at task completion. + +## Notes + +- Reports are intentionally concise markdown. They are meant to be the morning review entry point, not a full replacement for detailed artifacts. diff --git a/docs/devlog/phase11.md b/docs/devlog/phase11.md new file mode 100644 index 0000000..9501162 --- /dev/null +++ b/docs/devlog/phase11.md @@ -0,0 +1,23 @@ +# Phase 11 Devlog: README + +## Implemented + +- Rewrote `README.md` around the implemented MVP rather than the earlier planned MVP. +- Explained what NightShift is and what it is not. +- Added development install and direct module usage. +- Added quickstart commands for `init`, `validate`, `run`, and `run --task`. 
+- Added task file and config examples that match the current command-backed MVP. +- Documented command-backed agent behavior and review output contracts. +- Documented the current safety model. +- Documented the artifact layout created by the runner. +- Added testing commands and a concise roadmap. + +## Decisions Made + +- Kept README focused on user-facing operation and reviewability instead of implementation internals. +- Described PyYAML as optional because the MVP has a small standard-library fallback parser for starter configs. +- Left future backend details in the roadmap rather than implying they already exist. + +## Notes + +- The README now reflects the current MVP state through Phase 10. diff --git a/docs/devlog/phase7.md b/docs/devlog/phase7.md new file mode 100644 index 0000000..7296cf6 --- /dev/null +++ b/docs/devlog/phase7.md @@ -0,0 +1,23 @@ +# Phase 7 Devlog: Agent Executor + +## Implemented + +- Added `nightshift/agents.py`. +- Implemented the v1 `command` backend for agents. +- Loaded system prompt files through project-root-safe path resolution. +- Built compact prompt bundles containing system prompt, task markdown, acceptance criteria, project context, previous stage output, retry notes, and output contract. +- Passed prompt bundles to command agents on stdin. +- Captured stdout, stderr, exit code, duration, and timeout state. +- Persisted agent output and prompt artifacts through the artifact store. +- Parsed structured review-agent output into `StageResult`. +- Added fake-agent tests. + +## Decisions Made + +- Agent commands are command strings and run with `shell=True`, matching the Phase 6 command-string model. Unlike validation/test commands, agent commands are configured agent backends rather than allowlisted project commands. +- Agent stages pass when the command exits successfully. Review stages must emit a valid `status:` field or they fail. 
+- Prompt artifacts include the exact prompt sent to the agent to support auditability and prompt debugging. + +## Notes + +- Only the `command` backend is implemented. Ollama, Codex CLI, Claude Code, and API backends remain future integrations. diff --git a/docs/devlog/phase8.md b/docs/devlog/phase8.md new file mode 100644 index 0000000..34bde05 --- /dev/null +++ b/docs/devlog/phase8.md @@ -0,0 +1,25 @@ +# Phase 8 Devlog: Pipeline Runner + +## Implemented + +- Added `nightshift/pipeline.py`. +- Executed configured stages in order for one task. +- Supported agent, agent-review, command, and summarize stages. +- Stopped on unrecoverable stage failure. +- Supported `on_fail` redirection and review-provided `next_stage` redirection. +- Tracked retry count per task. +- Enforced `pipeline.max_task_retries`. +- Wrote task snapshots, config snapshots, per-stage outputs, stage summaries, final task notes, and run summary. +- Wired `nightshift run --task TASK-001` into the CLI. +- Added tests for happy-path pipeline execution and retry-limit enforcement. + +## Decisions Made + +- `on_fail` takes precedence over review-provided `next_stage` because it is deterministic config controlled by the user. +- Retry count increments when a failing stage redirects to another stage. Once the configured maximum is reached, the task fails. +- The summarize stage writes a simple artifact from known stage outputs and retry notes. Rich report generation remains Phase 10. +- Pipeline execution runs one task at a time, matching the v1 constraint. + +## Notes + +- The runner is now sufficient for fake command-agent pipelines. Context management and fuller reports are still deferred to later phases. diff --git a/docs/devlog/phase9.md b/docs/devlog/phase9.md new file mode 100644 index 0000000..401016a --- /dev/null +++ b/docs/devlog/phase9.md @@ -0,0 +1,22 @@ +# Phase 9 Devlog: Context Manager + +## Implemented + +- Added `nightshift/context.py`. +- Created project context files when absent. 
+- Created per-task `context.md` files. +- Added compact task context with task id, title, description, and acceptance criteria. +- Passed project context, task context, and retry context into agent prompt bundles. +- Persisted `context-out.md` after task execution. +- Included review `context_update` values in retry/context output notes. +- Added context manager tests and prompt coverage for task/retry context. + +## Decisions Made + +- Context files are plain markdown artifacts so they remain readable and easy to edit. +- Retry context is built from compact retry notes rather than full previous transcripts. +- Durable project-context bubbling is implemented as an explicit helper, but the pipeline does not automatically append every task detail into project context. + +## Notes + +- Later phases can decide which completed-task facts are worth promoting into project context. diff --git a/nightshift/agents.py b/nightshift/agents.py new file mode 100644 index 0000000..8e6d2ee --- /dev/null +++ b/nightshift/agents.py @@ -0,0 +1,290 @@ +"""Command-backed agent execution.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +import subprocess +import time + +from .artifacts import ArtifactStore +from .config import AgentConfig, StageConfig +from .errors import AgentError, SafetyError +from .safety import resolve_inside_root, resolve_project_root +from .stages import StageResult, StageStatus +from .tasks import Task + + +DEFAULT_AGENT_TIMEOUT_SECONDS = 600 + + +@dataclass(frozen=True) +class AgentInvocation: + agent_id: str + command: str + prompt: str + exit_code: int + stdout: str + stderr: str + duration_seconds: float + timed_out: bool = False + + +class AgentExecutor: + """Execute configured agents. + + v1 supports the `command` backend only. The command receives the prompt + bundle on stdin and its stdout/stderr are persisted as the stage artifact. 
+ """ + + def __init__( + self, + project_root: str | Path, + agents: dict[str, AgentConfig], + artifacts: ArtifactStore, + timeout_seconds: int = DEFAULT_AGENT_TIMEOUT_SECONDS, + ) -> None: + self.project_root = resolve_project_root(project_root) + self.agents = agents + self.artifacts = artifacts + self.timeout_seconds = timeout_seconds + + def run_stage( + self, + stage: StageConfig, + task: Task, + previous_outputs: dict[str, str] | None = None, + retry_notes: list[str] | None = None, + project_context: str | None = None, + task_context: str | None = None, + retry_context: str | None = None, + ) -> StageResult: + if stage.agent is None: + raise AgentError(f"Agent error: stage '{stage.id}' does not reference an agent.") + agent = self.agents.get(stage.agent) + if agent is None: + raise AgentError(f"Agent error: unknown agent '{stage.agent}' for stage '{stage.id}'.") + if agent.backend != "command": + raise AgentError( + f"Agent error: agent '{agent.id}' uses unsupported backend '{agent.backend}'." + ) + if not agent.command: + raise AgentError(f"Agent error: command backend agent '{agent.id}' has no command.") + + system_prompt = self._read_system_prompt(agent) + prompt = build_prompt_bundle( + system_prompt=system_prompt, + stage=stage, + task=task, + project_context=project_context if project_context is not None else self._read_project_context(), + task_context=task_context or "", + previous_outputs=previous_outputs or {}, + retry_notes=retry_notes or [], + retry_context=retry_context, + ) + invocation = self._invoke(agent, prompt) + output_filename = stage.output or f"{stage.id}.md" + output = format_agent_invocation(stage.id, invocation) + output_path = self.artifacts.write_stage_output(task.id, output_filename, output) + + if invocation.timed_out: + status: StageStatus = "fail" + reason = f"Agent timed out after {self.timeout_seconds}s." 
+ next_stage = None + context_update = None + elif invocation.exit_code != 0: + status = "fail" + reason = f"Agent exited with code {invocation.exit_code}." + next_stage = None + context_update = None + elif stage.type in {"agent_review", "review"}: + status, reason, next_stage, context_update = parse_review_output(invocation.stdout) + else: + status = "pass" + reason = "Agent completed." + next_stage = None + context_update = None + + return StageResult( + stage_id=stage.id, + status=status, + reason=reason, + output_path=str(output_path.relative_to(self.project_root)), + next_stage=next_stage, + context_update=context_update, + ) + + def _read_system_prompt(self, agent: AgentConfig) -> str: + try: + path = resolve_inside_root( + self.project_root, agent.system_prompt, f"agent '{agent.id}' system prompt" + ) + except SafetyError as exc: + raise AgentError(str(exc)) from exc + if not path.exists(): + raise AgentError(f"Agent error: system prompt does not exist: {agent.system_prompt}") + return path.read_text(encoding="utf-8") + + def _read_project_context(self) -> str: + if not self.artifacts.project_context_path.exists(): + return "" + return self.artifacts.project_context_path.read_text(encoding="utf-8") + + def _invoke(self, agent: AgentConfig, prompt: str) -> AgentInvocation: + started = time.monotonic() + try: + completed = subprocess.run( + agent.command, + cwd=self.project_root, + shell=True, + input=prompt, + capture_output=True, + text=True, + timeout=self.timeout_seconds, + ) + duration = time.monotonic() - started + return AgentInvocation( + agent_id=agent.id, + command=agent.command, + prompt=prompt, + exit_code=completed.returncode, + stdout=completed.stdout, + stderr=completed.stderr, + duration_seconds=duration, + ) + except subprocess.TimeoutExpired as exc: + duration = time.monotonic() - started + return AgentInvocation( + agent_id=agent.id, + command=agent.command, + prompt=prompt, + exit_code=-1, + stdout=exc.stdout or "", + stderr=exc.stderr or 
"", + duration_seconds=duration, + timed_out=True, + ) + + +def build_prompt_bundle( + system_prompt: str, + stage: StageConfig, + task: Task, + project_context: str, + previous_outputs: dict[str, str], + retry_notes: list[str], + task_context: str = "", + retry_context: str | None = None, +) -> str: + acceptance = "\n".join(f"- {item}" for item in task.acceptance_criteria) + prior = "\n\n".join(f"## {stage_id}\n\n{content}" for stage_id, content in previous_outputs.items()) + retries = "\n".join(f"- {note}" for note in retry_notes) + + return "\n".join( + [ + "# NightShift Agent Input", + "", + "## System Prompt", + "", + system_prompt.strip(), + "", + "## Stage", + "", + f"- id: {stage.id}", + f"- type: {stage.type}", + "", + "## Task", + "", + task.raw_markdown.strip(), + "", + "## Acceptance Criteria", + "", + acceptance, + "", + "## Project Context", + "", + project_context.strip(), + "", + "## Task Context", + "", + task_context.strip(), + "", + "## Previous Stage Output", + "", + prior.strip(), + "", + "## Retry Notes", + "", + (retry_context if retry_context is not None else retries).strip(), + "", + "## Output Contract", + "", + output_contract_for(stage), + "", + ] + ) + + +def output_contract_for(stage: StageConfig) -> str: + if stage.type in {"agent_review", "review"}: + return "\n".join( + [ + "Output exactly:", + "status: pass | fail | retry | escalate", + "reason: ", + "next_stage: ", + "context_update: ", + ] + ) + return "Write the requested stage output in concise markdown." 
def parse_review_output(output: str) -> tuple[StageStatus, str, str | None, str | None]:
    """Parse the ``key: value`` lines printed by a review agent.

    Every line containing a colon is split at the first colon; keys are
    stripped and lower-cased, values are stripped. When a key appears more
    than once the last occurrence wins. Lines without a colon are ignored.

    Args:
        output: Raw stdout of the review agent.

    Returns:
        ``(status, reason, next_stage, context_update)``. ``status`` is one of
        ``pass``, ``fail``, ``retry`` or ``escalate``. If no valid status is
        present the result is a ``fail`` with an explanatory reason and no
        redirect. ``next_stage`` and ``context_update`` are ``None`` when
        missing or empty.
    """
    values: dict[str, str] = {}
    for line in output.splitlines():
        key, separator, value = line.partition(":")
        if not separator:
            continue
        values[key.strip().lower()] = value.strip()

    # Keys are matched case-insensitively, so treat the status value the same
    # way: "Status: PASS" is a valid verdict, not a malformed review.
    raw_status = values.get("status", "").lower()
    if raw_status not in {"pass", "fail", "retry", "escalate"}:
        return "fail", "Review output did not include a valid status.", None, None

    reason = values.get("reason") or "Review returned no reason."
    next_stage = values.get("next_stage") or None
    context_update = values.get("context_update") or None
    return raw_status, reason, next_stage, context_update  # type: ignore[return-value]
class ContextManager:
    """Maintain the markdown context files that accompany a pipeline run."""

    def __init__(self, artifacts: ArtifactStore) -> None:
        self.artifacts = artifacts

    @staticmethod
    def _bulleted(items: list[str] | None) -> str:
        """Render *items* as a markdown bullet list, or ``- None`` when empty."""
        rendered = "\n".join(f"- {item}" for item in (items or []))
        return rendered or "- None"

    def ensure_project_context(self) -> Path:
        """Initialise the run and create the project context file if missing.

        Returns:
            Path of the project context file.
        """
        self.artifacts.initialize_run()
        context_path = self.artifacts.project_context_path
        if context_path.exists():
            return context_path
        context_path.write_text("# Project Context\n\n", encoding="utf-8")
        return context_path

    def create_task_context(self, task: Task) -> Path:
        """Write the task's ``context.md`` and return the written path."""
        self.ensure_project_context()
        description = task.description if task.description else "_No description provided._"
        criteria = "\n".join(f"- {criterion}" for criterion in task.acceptance_criteria)

        lines = ["# Task Context", "", f"Task: `{task.id}`", f"Title: {task.title}", ""]
        lines += ["## Description", "", description, ""]
        lines += ["## Acceptance Criteria", "", criteria, ""]
        return self.artifacts.write_stage_output(task.id, "context.md", "\n".join(lines))

    def read_context(self, task: Task, retry_notes: list[str] | None = None) -> TaskContext:
        """Load project and task context, creating the task file on demand.

        Retry notes are rendered as a bullet list (``- None`` when there are
        none) rather than read from disk.
        """
        project_path = self.ensure_project_context()
        task_path = self.artifacts.create_task_dir(task.id).directory / "context.md"
        if not task_path.exists():
            task_path = self.create_task_context(task)

        project_text = project_path.read_text(encoding="utf-8")
        task_text = task_path.read_text(encoding="utf-8")
        return TaskContext(
            project_context=project_text,
            task_context=task_text,
            retry_context=self._bulleted(retry_notes),
        )

    def write_context_out(
        self,
        task: Task,
        status: str,
        reason: str,
        retry_notes: list[str],
        durable_notes: list[str] | None = None,
    ) -> Path:
        """Write ``context-out.md`` summarising the task outcome and notes."""
        lines = ["# Context Out", "", f"Task: `{task.id}`", f"Status: {status}", f"Reason: {reason}", ""]
        lines += ["## Retry Notes", "", self._bulleted(retry_notes), ""]
        lines += ["## Durable Notes", "", self._bulleted(durable_notes), ""]
        return self.artifacts.write_stage_output(task.id, "context-out.md", "\n".join(lines))

    def append_project_context(self, task: Task, notes: list[str]) -> None:
        """Append *notes* under a ``## <task id>`` heading in the project context.

        Does nothing, and creates no file, when *notes* is empty.
        """
        if not notes:
            return
        path = self.ensure_project_context()
        heading = [f"## {task.id}", ""]
        bullets = [f"- {note}" for note in notes]
        addition = "\n".join(heading + bullets + [""])
        existing = path.read_text(encoding="utf-8")
        path.write_text(f"{existing.rstrip()}\n\n{addition}", encoding="utf-8")
    def run_task(self, task: Task) -> PipelineResult:
        """Run every configured stage for *task* and write its reports.

        Stages run in configured order. A stage that returns ``pass`` advances
        to the next stage; any other status either redirects to another stage
        (counted as a retry) or ends the task as ``failed``. Context-out and
        report files are written once the stage loop ends, whether the task
        completed or failed.

        Returns:
            A ``PipelineResult`` with the final status (``complete`` or
            ``failed``), the retry count, every stage result in execution
            order, the task artifact directory relative to the project root,
            and the final reason.
        """
        # Prepare run artifacts and context files before any stage executes.
        self.artifacts.initialize_run()
        self.artifacts.write_config_snapshot(self.config.path)
        self.artifacts.write_task_snapshot(task)
        self.context.ensure_project_context()
        self.context.create_task_context(task)

        stages = list(self.config.pipeline.stages)
        # Stage id -> position, used to jump back to a stage on redirect.
        stage_indexes = {stage.id: index for index, stage in enumerate(stages)}
        stage_results: list[StageResult] = []
        previous_outputs: dict[str, str] = {}
        retry_notes: list[str] = []
        retry_count = 0
        index = 0
        final_status = "complete"
        final_reason = "Pipeline completed."

        while index < len(stages):
            stage = stages[index]
            result = self._run_stage(stage, task, previous_outputs, retry_notes)
            stage_results.append(result)
            # Keyed by stage id, so a re-run stage replaces its earlier output.
            previous_outputs[stage.id] = self._read_output(result.output_path)
            if result.context_update:
                retry_notes.append(f"Context update from '{stage.id}': {result.context_update}")

            if result.status == "pass":
                index += 1
                continue

            # The configured on_fail target wins over the stage-provided next_stage.
            # NOTE(review): every non-pass status takes this path, so a review
            # returning "escalate" is redirected like "retry" when a target
            # exists - confirm that is intended.
            target_stage = stage.on_fail or result.next_stage
            if target_stage:
                # NOTE(review): the retry limit is checked before the target is
                # validated, so an unknown target at the limit is reported as
                # "Retry limit reached".
                if retry_count >= self.config.pipeline.max_task_retries:
                    final_status = "failed"
                    final_reason = (
                        f"Retry limit reached after stage '{stage.id}': {result.reason}"
                    )
                    break
                if target_stage not in stage_indexes:
                    final_status = "failed"
                    final_reason = (
                        f"Stage '{stage.id}' requested unknown next stage '{target_stage}'."
                    )
                    break
                retry_count += 1
                retry_notes.append(
                    f"Retry {retry_count}: stage '{stage.id}' returned "
                    f"{result.status} ({result.reason}); redirecting to '{target_stage}'."
                )
                index = stage_indexes[target_stage]
                continue

            # Non-pass result with nowhere to redirect: the task fails here.
            final_status = "failed"
            final_reason = f"Stage '{stage.id}' returned {result.status}: {result.reason}"
            break

        # Persist the outcome; durable notes are the non-empty context updates
        # from every stage result, in execution order.
        context_out_path = self.context.write_context_out(
            task,
            final_status,
            final_reason,
            retry_notes,
            durable_notes=[
                result.context_update
                for result in stage_results
                if result.context_update
            ],
        )
        self.reports.write_reports(
            task,
            final_status,
            final_reason,
            retry_count,
            stage_results,
            context_out_path=context_out_path,
        )

        return PipelineResult(
            task_id=task.id,
            status=final_status,
            retry_count=retry_count,
            stage_results=tuple(stage_results),
            artifact_dir=str(self.artifacts.create_task_dir(task.id).directory.relative_to(self.config.project.root)),
            reason=final_reason,
        )
def format_summary_stage(
    task: Task,
    previous_outputs: dict[str, str],
    retry_notes: list[str],
) -> str:
    """Build the markdown body written by a ``summarize`` stage.

    The summary lists the task id and title, the ids of the stages that have
    produced output so far, and the accumulated retry notes. Empty sections
    are rendered as ``- None``.
    """

    def bullets(items) -> str:
        rendered = [f"- {item}" for item in items]
        if not rendered:
            return "- None"
        return "\n".join(rendered)

    lines = ["# Final Notes", "", f"Task: `{task.id}`", f"Title: {task.title}", ""]
    # Iterating the mapping yields stage ids only; output bodies are not embedded.
    lines += ["## Stage Outputs", "", bullets(previous_outputs), ""]
    lines += ["## Retry Notes", "", bullets(retry_notes), ""]
    return "\n".join(lines)
class ReportGenerator:
    """Render and persist the markdown reports for one finished task."""

    def __init__(self, project_root: Path, artifacts: ArtifactStore) -> None:
        self.project_root = project_root
        self.artifacts = artifacts

    def write_reports(
        self,
        task: Task,
        status: str,
        reason: str,
        retry_count: int,
        stage_results: list[StageResult],
        context_out_path: Path | None = None,
    ) -> TaskReport:
        """Write stage results, final task notes, and the run summary.

        Returns:
            A ``TaskReport`` with the paths of the three written files.
        """
        store = self.artifacts
        modified_files = collect_modified_files(self.project_root)

        # Stage results go first: the other two reports link to this file.
        stage_results_text = format_stage_results(task, status, reason, retry_count, stage_results)
        stage_results_path = store.write_stage_output(task.id, "stage-results.md", stage_results_text)

        final_notes_text = format_task_report(
            task=task,
            status=status,
            reason=reason,
            retry_count=retry_count,
            stage_results=stage_results,
            modified_files=modified_files,
            stage_results_path=stage_results_path,
            context_out_path=context_out_path,
        )
        final_notes_path = store.write_final_task_notes(task.id, final_notes_text)

        summary_text = format_run_summary(
            task=task,
            status=status,
            reason=reason,
            retry_count=retry_count,
            modified_files=modified_files,
            final_notes_path=final_notes_path,
            stage_results_path=stage_results_path,
        )
        store.run_summary_path.write_text(summary_text, encoding="utf-8")

        return TaskReport(final_notes_path, stage_results_path, store.run_summary_path)
def collect_modified_files(project_root: Path) -> list[str]:
    """Return the paths that ``git status --short`` reports under *project_root*.

    This is best-effort reporting for the run summary: if git cannot be
    started, times out, or exits non-zero (for example outside a repository),
    an empty list is returned instead of raising.

    Rename and copy entries are printed by git as ``ORIG_PATH -> PATH``; only
    the new path is reported for them. Paths that git quotes (for example
    names containing whitespace) are returned as git printed them.
    """
    try:
        completed = subprocess.run(
            # A fixed git command needs no shell, so pass an argument list.
            ["git", "status", "--short"],
            cwd=project_root,
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (OSError, subprocess.TimeoutExpired):
        return []
    if completed.returncode != 0:
        return []

    files: list[str] = []
    for line in completed.stdout.splitlines():
        if not line.strip():
            continue
        # Short format is "XY PATH": two status columns, one space, the path.
        state, path = line[:2], line[3:].strip()
        if ("R" in state or "C" in state) and " -> " in path:
            path = path.split(" -> ", 1)[1]
        files.append(path)
    return files
retry_context="- No retries", + ) + + self.assertIn("## Task Context", prompt) + self.assertIn("Task context body", prompt) + self.assertIn("- No retries", prompt) + + def test_command_agent_writes_output_and_returns_pass(self) -> None: + with tempfile.TemporaryDirectory() as directory: + root = Path(directory) + prompt_path = root / "planner.md" + prompt_path.write_text("Plan carefully.", encoding="utf-8") + artifacts = ArtifactStore(root, ".nightshift", run_id="test-run") + executor = AgentExecutor( + root, + { + "planner": AgentConfig( + id="planner", + backend="command", + command='python -c "import sys; print(sys.stdin.read())"', + system_prompt=Path("planner.md"), + ) + }, + artifacts, + ) + task = parse_tasks(TASK_MD)[0] + stage = StageConfig(id="plan", type="agent", agent="planner", output="plan.md") + + result = executor.run_stage(stage, task) + + self.assertEqual(result.status, "pass") + output = (root / result.output_path).read_text(encoding="utf-8") + self.assertIn("TASK-001", output) + self.assertIn("Plan carefully.", output) + + def test_review_output_parser_accepts_structured_status(self) -> None: + status, reason, next_stage, context_update = parse_review_output( + "status: retry\nreason: Needs changes\nnext_stage: implement\ncontext_update: Fix tests\n" + ) + + self.assertEqual(status, "retry") + self.assertEqual(reason, "Needs changes") + self.assertEqual(next_stage, "implement") + self.assertEqual(context_update, "Fix tests") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_context.py b/tests/test_context.py new file mode 100644 index 0000000..6d639f5 --- /dev/null +++ b/tests/test_context.py @@ -0,0 +1,57 @@ +from pathlib import Path +import tempfile +import unittest + +from nightshift.artifacts import ArtifactStore +from nightshift.context import ContextManager +from nightshift.tasks import parse_tasks + + +TASK_MD = """# Tasks + +- [ ] TASK-001: Build context + +Description: +Create compact task context. 
class ContextManagerTests(unittest.TestCase):
    """Check that ContextManager creates, reads, and extends context files."""

    def test_creates_project_task_and_context_out_files(self) -> None:
        with tempfile.TemporaryDirectory() as workspace:
            store = ArtifactStore(Path(workspace), ".nightshift", run_id="test-run")
            manager = ContextManager(store)
            task = parse_tasks(TASK_MD)[0]

            created_paths = [
                manager.ensure_project_context(),
                manager.create_task_context(task),
            ]
            context = manager.read_context(task, ["retry once"])
            out_path = manager.write_context_out(task, "complete", "done", ["retry once"], ["useful fact"])
            created_paths.append(out_path)

            for created in created_paths:
                self.assertTrue(created.exists())
            self.assertIn("TASK-001", context.task_context)
            self.assertIn("retry once", context.retry_context)
            self.assertIn("useful fact", out_path.read_text(encoding="utf-8"))

    def test_append_project_context_adds_durable_notes(self) -> None:
        with tempfile.TemporaryDirectory() as workspace:
            root = Path(workspace)
            store = ArtifactStore(root, ".nightshift", run_id="test-run")
            task = parse_tasks(TASK_MD)[0]

            ContextManager(store).append_project_context(task, ["Remember this"])

            project_context = root / ".nightshift" / "project-context.md"
            content = project_context.read_text(encoding="utf-8")
            for expected in ("TASK-001", "Remember this"):
                self.assertIn(expected, content)
class PipelineRunnerTests(unittest.TestCase):
    """End-to-end PipelineRunner runs against fake command-backed agents."""

    def _run_pipeline(self, root: Path, stages: tuple[StageConfig, ...], **config_options):
        """Write fixture files, run the sample task, and return (task, result)."""
        _write_common_files(root)
        config = make_config(root, stages, **config_options)
        runner = PipelineRunner(config, ArtifactStore(root, ".nightshift", run_id="test-run"))
        task = parse_tasks(TASK_MD)[0]
        return task, runner.run_task(task)

    def test_happy_path_pipeline_completes_and_writes_artifacts(self) -> None:
        stages = (
            StageConfig(id="plan", type="agent", agent="planner", output="plan.md"),
            StageConfig(
                id="test",
                type="command",
                commands=('python -c "print(\'tests ok\')"',),
                output="test-output.txt",
            ),
            StageConfig(id="review", type="agent_review", agent="reviewer", output="review.md"),
            StageConfig(id="summarize", type="summarize", output="final-notes.md"),
        )
        with tempfile.TemporaryDirectory() as workspace:
            root = Path(workspace)
            task, result = self._run_pipeline(root, stages)

            run_dir = root / ".nightshift" / "runs" / "test-run"
            task_dir = run_dir / "tasks" / task.id

            self.assertEqual(result.status, "complete")
            self.assertEqual(result.retry_count, 0)
            for artifact in ("plan.md", "stage-results.md", "context.md", "context-out.md"):
                self.assertTrue((task_dir / artifact).exists())
            self.assertIn("## Task Context", (task_dir / "plan.md").read_text(encoding="utf-8"))
            self.assertIn("Modified Files", (run_dir / "run-summary.md").read_text(encoding="utf-8"))

    def test_review_can_retry_implementation_until_limit(self) -> None:
        stages = (
            StageConfig(id="implement", type="agent", agent="planner", output="implementation-log.md"),
            StageConfig(
                id="review",
                type="agent_review",
                agent="retry_reviewer",
                on_fail="implement",
                output="review.md",
            ),
        )
        with tempfile.TemporaryDirectory() as workspace:
            _, result = self._run_pipeline(Path(workspace), stages, max_retries=2)

            executed = [item.stage_id for item in result.stage_results]
            self.assertEqual(result.status, "failed")
            self.assertEqual(result.retry_count, 2)
            self.assertIn("Retry limit reached", result.reason)
            # Two retries mean the implement/review pair runs three times.
            self.assertEqual(
                executed,
                ["implement", "review", "implement", "review", "implement", "review"],
            )
class ReportGeneratorTests(unittest.TestCase):
    """Check that ReportGenerator writes all three report files."""

    def test_writes_final_notes_stage_results_and_run_summary(self) -> None:
        with tempfile.TemporaryDirectory() as workspace:
            root = Path(workspace)
            store = ArtifactStore(root, ".nightshift", run_id="test-run")
            task = parse_tasks(TASK_MD)[0]
            context_out = store.write_stage_output(task.id, "context-out.md", "# Context Out\n")
            passing_stage = StageResult(
                stage_id="test",
                status="pass",
                reason="ok",
                output_path=".nightshift/runs/test-run/tasks/TASK-001/test-output.txt",
            )

            report = ReportGenerator(root, store).write_reports(
                task,
                "complete",
                "done",
                1,
                [passing_stage],
                context_out_path=context_out,
            )

            # Each written report paired with a fragment it must contain.
            expectations = (
                (report.final_notes_path, "Retry count: 1"),
                (report.stage_results_path, "test"),
                (report.run_summary_path, "Final notes"),
            )
            for written, _ in expectations:
                self.assertTrue(written.exists())
            for written, fragment in expectations:
                self.assertIn(fragment, written.read_text(encoding="utf-8"))