Project runner tool and writer tooling fixes

I wanted the animated status bar...
2026-06-14 10:08:37 +00:00 · 2026-05-22 04:17:37 -07:00 · 2026-05-22 04:17:37 -07:00 · a0ad9b2ac0
commit a0ad9b2ac0
parent d928a52fb3
9 changed files with 614 additions and 5 deletions
--- a/docs/bugs.md
+++ b/docs/bugs.md
@ -0,0 +1,3 @@
+# Log descriptions shown in the status line are slightly off. "Starting ollama HTTP model invocation" implies the run is stuck starting when it is not.
+
+# We've stopped updating the version.
--- a/nightshift/cli.py
+++ b/nightshift/cli.py
@ -15,6 +15,7 @@ from .integ_setup import format_setup_result, setup_python_project
 from .integ_test import format_integration_test_result, run_integration_test
 from .pipeline import PipelineRunner
 from .runlog import RunLogger
+from .sandbox_run import format_sandbox_run_result, run_sandbox_project
 from .status import build_status, format_status
 from .task_tests import check_task_test_files, format_task_test_checks, missing_task_test_paths
 from .terminal import HOTDOG_ANIMATIONS, TerminalAnimation, format_banner, style_text
@ -131,6 +132,38 @@ def build_parser() -> argparse.ArgumentParser:
    integ_test_parser.add_argument("--setup-skip-validate", action="store_true", help="Skip validation during setup.")
    integ_test_parser.add_argument("--dry-run", action="store_true", help="Print commands without running setup or tasks.")

+    sandbox_parser = subparsers.add_parser(
+        "sandbox-run",
+        help="Copy an existing NightShift project into a sandbox, set it up, and run it.",
+    )
+    sandbox_parser.add_argument("--project", required=True, help="Existing NightShift project directory to copy.")
+    sandbox_output = sandbox_parser.add_mutually_exclusive_group(required=True)
+    sandbox_output.add_argument("--output", help="Sandbox output directory. The project is copied to OUTPUT/project.")
+    sandbox_output.add_argument(
+        "--timestamped",
+        action="store_true",
+        help="Create a timestamped sandbox under ROOT/integ_runs, like integ-test.",
+    )
+    sandbox_parser.add_argument("--root", default=".", help="Root used with --timestamped. Defaults to current directory.")
+    sandbox_parser.add_argument("--task", help="Specific task id to run.")
+    sandbox_parser.add_argument("--all", action="store_true", help="Run all runnable incomplete tasks.")
+    sandbox_parser.add_argument("--force", action="store_true", help="Overwrite an existing OUTPUT/project copy.")
+    sandbox_parser.add_argument(
+        "--setup-extra",
+        action="append",
+        default=["pytest"],
+        help="Extra package to install during setup. May be repeated. Defaults to pytest.",
+    )
+    sandbox_parser.add_argument("--setup-skip-validate", action="store_true", help="Skip validation during setup.")
+    sandbox_parser.add_argument("--dry-run", action="store_true", help="Create the sandbox copy and print commands without running setup or tasks.")
+    sandbox_parser.add_argument(
+        "--animation",
+        default="status_dots",
+        choices=tuple(sorted(HOTDOG_ANIMATIONS)),
+        help="Terminal animation to show while the sandboxed run is active.",
+    )
+    sandbox_parser.add_argument("--no-animation", action="store_true", help="Disable terminal animation.")
+
    integ_report_parser = subparsers.add_parser("integ-report", help="Summarize the latest integration run.")
    integ_report_parser.add_argument("--root", default=".", help="Repository root where integ_runs/ is located.")
    integ_report_parser.add_argument("--latest", action="store_true", help="Report the latest integration run.")
@ -309,6 +342,24 @@ def main(argv: list[str] | None = None) -> int:
            print(format_integration_test_result(result))
            return result.exit_code

+        if args.command == "sandbox-run":
+            result = run_sandbox_project(
+                args.project,
+                output=args.output,
+                timestamped=args.timestamped,
+                root=args.root,
+                task=args.task,
+                all_tasks=args.all,
+                setup_extras=tuple(args.setup_extra or ()),
+                skip_setup_validate=args.setup_skip_validate,
+                dry_run=args.dry_run,
+                animation=args.animation,
+                no_animation=args.no_animation,
+                force=args.force,
+            )
+            print(format_sandbox_run_result(result))
+            return result.exit_code
+
        if args.command == "integ-report":
            report = build_integration_report(args.root, latest=True)
            print(format_integration_report(report))
--- a/nightshift/pipeline.py
+++ b/nightshift/pipeline.py
@ -229,7 +229,11 @@ class PipelineRunner:
                index += 1
                continue

-            target_stage = stage.on_fail or result.next_stage
+            target_stage = result.next_stage or (
+                stage.on_fail
+                if not (stage.type in {"agent_review", "review"} and _is_malformed_review_result(result))
+                else None
+            )
            analysis_note = self._write_failure_diagnostics(stage, task, result, retry_count)
            if analysis_note:
                retry_notes.append(analysis_note)
@ -481,7 +485,7 @@ class PipelineRunner:
            result = self.agent_executor.run_stage(
                self._stage_for_retry_agent(stage, retry_count),
                task,
-                previous_outputs,
+                _review_previous_outputs(previous_outputs) if stage.type in {"agent_review", "review"} else previous_outputs,
                retry_notes,
                project_context=context.project_context,
                task_context=context.task_context,
@ -501,6 +505,17 @@ class PipelineRunner:
                    context.task_context,
                    context.retry_context,
                )
+            if stage.type in {"agent_review", "review"} and _is_malformed_review_result(result):
+                return self._rerun_malformed_review(
+                    stage,
+                    task,
+                    result,
+                    previous_outputs,
+                    retry_notes,
+                    retry_count,
+                    context.project_context,
+                    context.task_context,
+                )
            return result
        if stage.type in COMMAND_STAGE_TYPES:
            return self.command_executor.run_stage(_stage_with_attempt_output(stage, retry_count), task.id)
@ -1217,6 +1232,59 @@ class PipelineRunner:
        )
        return f"Debugger output: {debug_result.output_path or 'none'}."

+    def _rerun_malformed_review(
+        self,
+        stage: StageConfig,
+        task: Task,
+        malformed_result: StageResult,
+        previous_outputs: dict[str, str],
+        retry_notes: list[str],
+        retry_count: int,
+        project_context: str,
+        task_context: str,
+    ) -> StageResult:
+        """Re-run a review stage once with strict output-format instructions.
+
+        The re-run writes to a separate attempt file (derived from the stage
+        output name and ``retry_count + 1``), receives the compacted previous
+        outputs plus a compacted copy of the malformed review text, and gets an
+        extra retry note demanding the four-line review format.
+
+        Returns:
+            The re-run's ``StageResult`` when it is no longer malformed;
+            otherwise a ``"fail"`` result whose reason states that the review
+            remained malformed after the strict retry.
+        """
+        # Write the strict attempt to its own file so the malformed artifact is kept.
+        output_name = _attempt_filename(stage.output or f"{stage.id}.md", retry_count + 1)
+        strict_stage = replace(
+            self._stage_for_retry_agent(stage, retry_count),
+            output=output_name,
+        )
+        self.logger.event(
+            "agent.rerun",
+            "Re-running review after malformed output",
+            stage_id=stage.id,
+            task_id=task.id,
+        )
+        # Keep the caller's notes and append the formatting demand last.
+        strict_notes = [
+            *retry_notes,
+            "Previous review output was malformed. Return exactly four lines: status, reason, next_stage, context_update. Do not return prose, headings, or analysis.",
+        ]
+        strict_outputs = _review_previous_outputs(previous_outputs)
+        # Show the reviewer a trimmed copy of what it produced the first time.
+        strict_outputs["malformed_review_output"] = _compact_previous_output(
+            self._read_output(malformed_result.output_path),
+            max_chars=800,
+        )
+        result = self.agent_executor.run_stage(
+            strict_stage,
+            task,
+            strict_outputs,
+            strict_notes,
+            project_context=project_context,
+            task_context=task_context,
+            retry_context="\n".join(f"- {note}" for note in strict_notes),
+        )
+        if _is_malformed_review_result(result):
+            # Second malformed answer: report a failure whose reason is itself
+            # recognized by _is_malformed_review_result.
+            return StageResult(
+                result.stage_id,
+                "fail",
+                (
+                    "Review output remained malformed after a strict formatting retry. "
+                    "Stopping without redrafting; inspect the applied draft and review artifact."
+                ),
+                output_path=result.output_path,
+                context_update=result.context_update,
+            )
+        return result
+
    def _modified_files(self) -> tuple[str, ...]:
        completed = subprocess.run(
            ["git", "status", "--short"],
@ -1608,6 +1676,36 @@ def _invalid_file_writer_output_summary(output: str, reason: str, max_chars: int
    return "\n".join(lines)


+def _is_malformed_review_result(result: StageResult) -> bool:
+    """Return True when ``result`` failed because the review output was malformed.
+
+    A result counts as malformed when its status is ``"fail"`` and its reason
+    contains one of the two malformed-review messages checked below.
+    """
+    if result.status != "fail":
+        return False
+    malformed_markers = (
+        "Review output did not include a valid status",
+        "Review output remained malformed",
+    )
+    return any(marker in result.reason for marker in malformed_markers)
+
+
+def _review_previous_outputs(previous_outputs: dict[str, str], max_chars: int = 1600) -> dict[str, str]:
+    """Return a copy of ``previous_outputs`` with every value compacted for review.
+
+    Draft/patch/apply/test/review outputs get the ``max_chars`` budget, the
+    plan and context outputs get 500 characters, and everything else gets 800.
+    Key order follows ``previous_outputs``.
+    """
+    draft_artifacts = frozenset(
+        {
+            "applied.patch",
+            "normalized-draft.patch",
+            "scene-draft.patch",
+            "draft_scene",
+            "apply_draft",
+            "validate_draft",
+            "test",
+            "review",
+        }
+    )
+    background = frozenset({"plan", "semantic_context", "context"})
+
+    def _budget(name: str) -> int:
+        # Draft-related artifacts are checked first, so they win over the smaller budgets.
+        if name in draft_artifacts or name.endswith(".patch") or "draft" in name or "apply" in name:
+            return max_chars
+        return 500 if name in background else 800
+
+    return {
+        name: _compact_previous_output(output, max_chars=_budget(name))
+        for name, output in previous_outputs.items()
+    }
+
+
 def _file_writer_error_reason(stage: StageConfig, reason: str) -> str:
    guidance = _file_writer_stage_guidance(stage)
    if not guidance or "not allowed for this stage" not in reason:
--- a/nightshift/project_templates/tutorial-novel/.nightshift/tasks.md
+++ b/nightshift/project_templates/tutorial-novel/.nightshift/tasks.md
@ -109,11 +109,101 @@ Acceptance Criteria:
 - Updates durable state

 ---
+- [ ] SCENE-031: Rollerblade courier run
+
+Dependencies:
+- SCENE-003
+
+Description:
+Proxy and Cricket rollerblade through late-night Seattle delivering encrypted NightShift inference keys, salvaged hardware, and cached datasets between squatters, artists, and underground operators.
+
+The scene should establish:
+- movement through the city
+- underground mutual aid systems
+- degraded urban infrastructure
+- physical geography of Seattle
+- emotional intimacy through transit
+
+Environmental details should emphasize:
+- wet pavement reflecting neon transit signage
+- abandoned autonomous delivery vehicles
+- late-night teriyaki shops
+- extension cords hanging between apartments
+- cracked sidewalks
+- rooftop antennas
+- stale vape clouds in freight elevators
+
+A subtle anomaly appears when an unrelated ad display briefly shows imagery identical to visuals seen elsewhere in the story.
+
+Nobody reacts strongly.
+
+Acceptance Criteria:
+- Strong Seattle atmosphere
+- Deepens Proxy and Cricket relationship naturally
+- Includes rollerblading materially throughout the scene
+- Introduces subtle recurring anomaly
+- Avoids exposition-heavy dialogue
+- Scene length between 1400-2400 words
+- Writes:
+  - `story/chapters/chapter-001/scene-003a.md`
+- Updates durable state
+
+---
+- [ ] SCENE-032: Kremwerk furry rave
+
+Dependencies:
+- SCENE-031
+
+Description:
+Proxy and DJ BLOODMONEY attend a crowded underground furry rave at Kremwerk following one of BLOODMONEY's pirate jungle sets.
+
+The scene should establish:
+- queer underground culture
+- synthetic identity experimentation
+- emotional sincerity beneath irony
+- anti-corporate creative spaces
+- generated aesthetics used communally rather than commercially
+- shape of the romance between Proxy and BLOODMONEY
+- makeout scene between Proxy and BLOODMONEY
+
+The rave should feel:
+- affectionate
+- overheated
+- crowded
+- emotionally necessary
+
+Environmental details should include:
+- soaked Capitol Hill sidewalks
+- damp faux fur
+- dangling extension cords powering chargers
+- jungle edits mixed with bassline and hyperpop
+- generated visuals projected onto concrete pillars
+- patched jackets with dead startup logos
+- kandi bracelets
+- old server racks repurposed into lighting rigs
+- rollerbladers moving through industrial hallways
+- people discussing models like music genres
+
+Proxy gradually realizes many attendees rely emotionally on systems like NightShift.
+
+Acceptance Criteria:
+- Avoids mocking underground/furry culture
+- Strong sensory environmental detail
+- Reinforces themes of synthetic companionship and community
+- Includes subtle emotional unease beneath warmth
+- Maintains grounded tone
+- Scene length between 1800-3000 words
+- Writes:
+  - `story/chapters/chapter-001/scene-003b.md`
+- Updates durable state
+
+---
+

 - [ ] SCENE-004: Rich district delivery

 Dependencies:
- SCENE-003
+- SCENE-032

 Description:
 Proxy delivers salvaged compute hardware to a wealthy private social club operating in a quiet offline district.
@ -233,19 +323,59 @@ Proxy becomes uncomfortable with:

 Acceptance Criteria:
 - Shows expanding underground compute economy
- Deepens Proxy’s internal conflict
+- Deepens Proxy's internal conflict
 - Introduces operational stress
 - Maintains grounded tone
 - Writes:
  - `story/chapters/chapter-002/scene-002.md`
 - Updates durable state

+---
+- [ ] SCENE-081: Free inference night
+
+Dependencies:
+- SCENE-008
+
+Description:
+Following a successful scavenging run, NightShift temporarily opens free public inference access for one evening.
+
+Artists, musicians, lonely users, and exhausted workers flood the squat looking for compute access.
+
+The scene should establish:
+- NightShift as emotional infrastructure
+- positive social uses of synthetic systems
+- underground mutual aid culture
+- growing operational stress
+
+Examples should include:
+- collaborative generated visuals
+- musicians creating samples
+- users generating outfit concepts before events
+- emotionally vulnerable conversations with companion systems
+- translation of old documents and messages
+- communal experimentation with weird model outputs
+
+Proxy slowly realizes NightShift has become psychologically essential for many people.
+
+This realization unsettles her.
+
+Acceptance Criteria:
+- Avoids simplistic "AI bad" framing
+- Balances warmth with discomfort
+- Strong environmental detail
+- Shows growing scale of NightShift operations
+- Reinforces emotional dependency themes
+- Scene length between 1800-3000 words
+- Writes:
+  - `story/chapters/chapter-002/scene-002a.md`
+- Updates durable state
+
 ---

 - [ ] SCENE-009: Sister Circuit

 Dependencies:
- SCENE-008
+- SCENE-081

 Description:
 Proxy meets Sister Circuit in a server monastery outside Tacoma.
--- a/nightshift/sandbox_run.py
+++ b/nightshift/sandbox_run.py
@ -0,0 +1,143 @@
+"""General-purpose setup-and-run sandbox command."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+import shutil
+import subprocess
+import venv
+
+from .errors import NightShiftError
+from .integ import _initialize_project_git_repo
+from .integ_setup import IntegrationSetupResult, setup_python_project
+
+
+@dataclass(frozen=True)
+class SandboxRunResult:
+    """Immutable summary of one sandbox run, as returned by ``run_sandbox_project``."""
+
+    # Resolved path of the project that was copied.
+    source_project: Path
+    # Sandbox directory that holds the copy and the virtual environment.
+    directory: Path
+    # Location of the copied project (``directory / "project"``).
+    project_dir: Path
+    # Location of the sandbox virtual environment (``directory / ".venv"``).
+    venv_dir: Path
+    # Value returned by ``setup_python_project`` for the copied project.
+    setup: IntegrationSetupResult
+    # Argument vector of the NightShift run command.
+    command: tuple[str, ...]
+    # Return code of the run command; 0 when nothing was executed (dry run).
+    exit_code: int
+    # Whether the run was a dry run.
+    dry_run: bool
+
+
+def run_sandbox_project(
+    project: str | Path,
+    *,
+    output: str | Path | None = None,
+    timestamped: bool = False,
+    root: str | Path = ".",
+    task: str | None = None,
+    all_tasks: bool = False,
+    setup_extras: tuple[str, ...] = ("pytest",),
+    skip_setup_validate: bool = False,
+    dry_run: bool = False,
+    animation: str = "status_dots",
+    no_animation: bool = False,
+    force: bool = False,
+) -> SandboxRunResult:
+    """Copy a NightShift project into a sandbox, set it up, and run it.
+
+    The project is copied to ``<sandbox>/project``, where ``<sandbox>`` is
+    ``output`` or a timestamped directory under ``root/integ_runs``. Unless
+    ``dry_run`` is set, a virtual environment is created at ``<sandbox>/.venv``,
+    ``_initialize_project_git_repo`` is called on the copy, and
+    ``nightshift.cli run`` is executed with the copy as working directory.
+
+    Args:
+        project: Existing project directory; must contain ``nightshift.yaml``.
+        output: Sandbox directory (mutually exclusive with ``timestamped``).
+        timestamped: Create the sandbox under ``root/integ_runs`` instead.
+        root: Base directory used with ``timestamped``.
+        task: Id of the single task to run (mutually exclusive with ``all_tasks``).
+        all_tasks: Pass ``--all`` to the run command.
+        setup_extras: Extras forwarded to ``setup_python_project``.
+        skip_setup_validate: Disable validation in ``setup_python_project``.
+        dry_run: Create the copy and build the command without executing it.
+        animation: Animation name passed to the run command.
+        no_animation: Pass ``--no-animation`` instead of an animation name.
+        force: Replace an existing, non-empty ``<sandbox>/project``.
+
+    Returns:
+        A ``SandboxRunResult`` describing the paths, setup result, command and
+        exit code (``0`` for a dry run).
+
+    Raises:
+        NightShiftError: If the options are inconsistent, the source is not a
+            NightShift project directory, the copy target already has content
+            and ``force`` is not set, or the copy target is the source project
+            or one of its ancestors.
+    """
+
+    if task and all_tasks:
+        raise NightShiftError("Sandbox run error: use either --task or --all, not both.")
+    if not task and not all_tasks:
+        raise NightShiftError("Sandbox run error: provide --task or --all.")
+    if output and timestamped:
+        raise NightShiftError("Sandbox run error: use either --output or --timestamped, not both.")
+    if not output and not timestamped:
+        raise NightShiftError("Sandbox run error: provide --output or --timestamped.")
+
+    source = Path(project).resolve()
+    if not source.exists() or not source.is_dir():
+        raise NightShiftError(f"Sandbox run error: project directory does not exist: {source}")
+    if not (source / "nightshift.yaml").exists():
+        raise NightShiftError(f"Sandbox run error: project does not contain nightshift.yaml: {source}")
+
+    sandbox_dir = _sandbox_directory(output, root=root, timestamped=timestamped)
+    project_dir = sandbox_dir / "project"
+    venv_dir = sandbox_dir / ".venv"
+    # The copy target is removed before copying; refuse targets that are the
+    # source itself or contain it, otherwise --force would delete the source.
+    if project_dir == source or project_dir in source.parents:
+        raise NightShiftError(
+            f"Sandbox run error: output project would overwrite the source project: {project_dir}"
+        )
+    if project_dir.exists() and any(project_dir.iterdir()) and not force:
+        raise NightShiftError(f"Sandbox run error: output project already exists: {project_dir}")
+
+    def _ignore_sandbox(directory: str, names: list[str]) -> set[str]:
+        # When the sandbox lives inside the source tree (for example
+        # --timestamped with the default root), skip it so copytree does not
+        # recurse into its own output.
+        ignored = _copy_ignore(directory, names)
+        parent = Path(directory)
+        ignored.update(name for name in names if parent / name in (sandbox_dir, project_dir))
+        return ignored
+
+    sandbox_dir.mkdir(parents=True, exist_ok=True)
+    if project_dir.exists():
+        shutil.rmtree(project_dir)
+    shutil.copytree(source, project_dir, ignore=_ignore_sandbox)
+    if not dry_run:
+        if not venv_dir.exists():
+            venv.EnvBuilder(with_pip=True).create(venv_dir)
+        _initialize_project_git_repo(project_dir)
+
+    setup = setup_python_project(
+        project_dir,
+        extras=setup_extras,
+        validate=not skip_setup_validate,
+        dry_run=dry_run,
+    )
+    command = [str(setup.python), "-m", "nightshift.cli", "run"]
+    if no_animation:
+        command.append("--no-animation")
+    elif animation:
+        command.extend(["--animation", animation])
+    if all_tasks:
+        command.append("--all")
+    else:
+        command.extend(["--task", task or ""])
+
+    exit_code = 0
+    if not dry_run:
+        completed = subprocess.run(command, cwd=project_dir, text=True, encoding="utf-8", errors="replace")
+        exit_code = completed.returncode
+
+    return SandboxRunResult(
+        source_project=source,
+        directory=sandbox_dir,
+        project_dir=project_dir,
+        venv_dir=venv_dir,
+        setup=setup,
+        command=tuple(command),
+        exit_code=exit_code,
+        dry_run=dry_run,
+    )
+
+
+def format_sandbox_run_result(result: SandboxRunResult) -> str:
+    """Render ``result`` as a newline-separated summary for the terminal.
+
+    A ``Dry run: true`` line is placed first when the result is a dry run.
+    """
+    run_command = " ".join(result.command)
+    artifacts_dir = result.project_dir / ".nightshift"
+    header = ["Dry run: true"] if result.dry_run else []
+    summary = [
+        f"Source project: {result.source_project}",
+        f"Sandbox: {result.directory}",
+        f"Project: {result.project_dir}",
+        f"Venv: {result.venv_dir}",
+        f"Run command: {run_command}",
+        f"Exit code: {result.exit_code}",
+        f"Artifacts: {artifacts_dir}",
+    ]
+    return "\n".join(header + summary)
+
+
+def _sandbox_directory(output: str | Path | None, *, root: str | Path, timestamped: bool) -> Path:
+    """Return the resolved sandbox directory.
+
+    A truthy ``output`` is used as-is (resolved). Otherwise a directory named
+    after the current UTC time is placed under ``root/integ_runs``. The
+    ``timestamped`` flag is accepted but not consulted here.
+    """
+    if not output:
+        runs_root = Path(root).resolve() / "integ_runs"
+        stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
+        return runs_root / stamp
+    return Path(output).resolve()
+
+
+def _copy_ignore(directory: str, names: list[str]) -> set[str]:
+    """Ignore callback for ``shutil.copytree``.
+
+    Skips VCS data, tool caches, virtual environments and ``*.egg-info``
+    entries everywhere, plus run artifacts when ``directory`` is a
+    ``.nightshift`` directory.
+    """
+    skip = {".git", ".pytest_cache", ".ruff_cache", "__pycache__", ".venv", "venv"}
+    if Path(directory).name == ".nightshift":
+        skip |= {"runs", "run-summary.md", "run.log", "project-context.md", "project-context-chart.md"}
+    return {entry for entry in names if entry.endswith(".egg-info") or entry in skip}
--- a/nightshift/terminal.py
+++ b/nightshift/terminal.py
@ -168,6 +168,7 @@ class TerminalAnimation:
        self._width = 0
        self._lock = threading.Lock()
        self._last_rendered = ""
+        self._last_status_line = ""

    def __enter__(self) -> "TerminalAnimation":
        self.start()
@ -194,6 +195,7 @@ class TerminalAnimation:
    def update_message(self, message: str) -> None:
        with self._lock:
            self.message = message
+        self._emit_status_line(message)

    def emit(self, line: str) -> None:
        if not self.enabled:
@ -238,6 +240,18 @@ class TerminalAnimation:
        self.stream.write("\r" + (" " * self._width) + "\r")
        self.stream.flush()

+    def _emit_status_line(self, message: str) -> None:
+        """Print the formatted status line for ``message`` unless it repeats the last one printed."""
+        status_line = format_status_bar_message(message, stream=self.stream)
+        if status_line == self._last_status_line:
+            return
+        self._last_status_line = status_line
+        if not self.enabled:
+            print(status_line)
+            return
+        # Clear the animation line, print the status, then render frame 0 again.
+        self._clear()
+        print(status_line)
+        self._render_frame(0)
+

 def animation_frames(name: str) -> tuple[str, ...]:
    frames = HOTDOG_ANIMATIONS.get(name)
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@ -153,6 +153,93 @@ class PipelineRunnerTests(unittest.TestCase):
            self.assertIn("Retry limit reached", result.reason)
            self.assertEqual([item.stage_id for item in result.stage_results], ["implement", "review", "implement", "review", "implement", "review"])

+    def test_malformed_review_gets_strict_retry_without_redrafting(self) -> None:
+        """A malformed review is re-run once with a strict prompt instead of redrafting.
+
+        The fake reviewer prints ``files`` unless the prompt contains the strict
+        retry note, in which case it prints a valid passing review. The task
+        must complete with no retries, keeping both review artifacts.
+        """
+        with tempfile.TemporaryDirectory() as directory:
+            root = Path(directory)
+            _write_common_files(root)
+            (root / "fake_reviewer.py").write_text(
+                "\n".join(
+                    [
+                        "import sys",
+                        "prompt = sys.stdin.read()",
+                        "if 'Previous review output was malformed' in prompt:",
+                        "    print('status: pass')",
+                        "    print('reason: strict retry ok')",
+                        "    print('next_stage: none')",
+                        "    print('context_update: none')",
+                        "else:",
+                        "    print('files')",
+                    ]
+                ),
+                encoding="utf-8",
+            )
+            stages = (
+                StageConfig(id="implement", type="agent", agent="planner", output="implementation-log.md"),
+                StageConfig(
+                    id="review",
+                    type="agent_review",
+                    agent="reviewer",
+                    on_fail="implement",
+                    output="review.md",
+                ),
+                StageConfig(id="summarize", type="summarize", output="final-notes.md"),
+            )
+            config = make_config(root, stages, max_retries=2)
+            config.agents["reviewer"] = AgentConfig(
+                id="reviewer",
+                backend="command",
+                command="python fake_reviewer.py",
+                system_prompt=Path("reviewer.md"),
+            )
+            runner = PipelineRunner(config, ArtifactStore(root, ".nightshift", run_id="test-run"))
+            task = parse_tasks(TASK_MD)[0]
+
+            result = runner.run_task(task)
+
+            task_dir = root / ".nightshift" / "runs" / "test-run" / "tasks" / task.id
+            self.assertEqual(result.status, "complete")
+            self.assertEqual(result.retry_count, 0)
+            # "implement" appears once: the on_fail redraft was not taken.
+            self.assertEqual([item.stage_id for item in result.stage_results], ["implement", "review", "summarize"])
+            # review.md holds the malformed output, review-1.md the strict re-run.
+            self.assertTrue((task_dir / "review.md").exists())
+            self.assertTrue((task_dir / "review-1.md").exists())
+            self.assertIn("files", (task_dir / "review.md").read_text(encoding="utf-8"))
+            self.assertIn("strict retry ok", (task_dir / "review-1.md").read_text(encoding="utf-8"))
+
+    def test_malformed_review_stops_without_on_fail_redraft(self) -> None:
+        """A review that stays malformed after the strict re-run fails the task.
+
+        The fake reviewer always prints ``files``; the run must fail with the
+        "remained malformed" reason and must not follow ``on_fail`` back to
+        ``implement``.
+        """
+        with tempfile.TemporaryDirectory() as directory:
+            root = Path(directory)
+            _write_common_files(root)
+            (root / "fake_reviewer.py").write_text("print('files')\n", encoding="utf-8")
+            stages = (
+                StageConfig(id="implement", type="agent", agent="planner", output="implementation-log.md"),
+                StageConfig(
+                    id="review",
+                    type="agent_review",
+                    agent="reviewer",
+                    on_fail="implement",
+                    output="review.md",
+                ),
+            )
+            config = make_config(root, stages, max_retries=2)
+            config.agents["reviewer"] = AgentConfig(
+                id="reviewer",
+                backend="command",
+                command="python fake_reviewer.py",
+                system_prompt=Path("reviewer.md"),
+            )
+            runner = PipelineRunner(config, ArtifactStore(root, ".nightshift", run_id="test-run"))
+            task = parse_tasks(TASK_MD)[0]
+
+            result = runner.run_task(task)
+
+            task_dir = root / ".nightshift" / "runs" / "test-run" / "tasks" / task.id
+            self.assertEqual(result.status, "failed")
+            self.assertEqual(result.retry_count, 0)
+            self.assertIn("remained malformed", result.reason)
+            # Only one implement/review pair ran: no redraft was triggered.
+            self.assertEqual([item.stage_id for item in result.stage_results], ["implement", "review"])
+            self.assertTrue((task_dir / "review.md").exists())
+            self.assertTrue((task_dir / "review-1.md").exists())
+
    def test_passing_review_next_stage_is_ignored(self) -> None:
        with tempfile.TemporaryDirectory() as directory:
            root = Path(directory)
--- a/tests/test_sandbox_run.py
+++ b/tests/test_sandbox_run.py
@ -0,0 +1,70 @@
+from pathlib import Path
+import tempfile
+import unittest
+
+from nightshift.errors import NightShiftError
+from nightshift.sandbox_run import format_sandbox_run_result, run_sandbox_project
+
+
+class SandboxRunTests(unittest.TestCase):
+    """Dry-run tests for ``run_sandbox_project`` and ``format_sandbox_run_result``."""
+
+    def test_sandbox_run_dry_run_copies_existing_project_and_keeps_animation(self) -> None:
+        """A dry run copies the project, drops old run artifacts, and keeps ``--animation``."""
+        with tempfile.TemporaryDirectory() as directory:
+            root = Path(directory)
+            source = root / "source"
+            source.mkdir()
+            (source / "nightshift.yaml").write_text("project:\n  name: demo\n", encoding="utf-8")
+            (source / "pyproject.toml").write_text("[project]\nname = 'demo'\nversion = '0.1.0'\n", encoding="utf-8")
+            (source / ".nightshift").mkdir()
+            (source / ".nightshift" / "tasks.md").write_text("- [ ] TASK-001: Demo\n\nAcceptance Criteria:\n- done\n", encoding="utf-8")
+            # Stale run artifacts that the sandbox copy is expected to leave behind.
+            (source / ".nightshift" / "runs").mkdir()
+            (source / ".nightshift" / "runs" / "old.txt").write_text("old artifact", encoding="utf-8")
+            output = root / "sandbox"
+
+            result = run_sandbox_project(
+                source,
+                output=output,
+                task="TASK-001",
+                dry_run=True,
+            )
+
+            rendered = format_sandbox_run_result(result)
+            self.assertIn("Dry run: true", rendered)
+            self.assertEqual(result.project_dir, output / "project")
+            self.assertTrue((output / "project" / "nightshift.yaml").exists())
+            self.assertTrue((output / "project" / ".nightshift" / "tasks.md").exists())
+            self.assertFalse((output / "project" / ".nightshift" / "runs").exists())
+            self.assertIn("--animation", result.command)
+            self.assertNotIn("--no-animation", result.command)
+            self.assertIn("TASK-001", result.command)
+
+    def test_sandbox_run_timestamped_uses_integ_runs_directory(self) -> None:
+        """With ``timestamped=True`` the sandbox is created under ``root/integ_runs``."""
+        with tempfile.TemporaryDirectory() as directory:
+            root = Path(directory)
+            source = root / "source"
+            source.mkdir()
+            (source / "nightshift.yaml").write_text("project:\n  name: demo\n", encoding="utf-8")
+            (source / "pyproject.toml").write_text("[project]\nname = 'demo'\nversion = '0.1.0'\n", encoding="utf-8")
+
+            result = run_sandbox_project(
+                source,
+                root=root,
+                timestamped=True,
+                all_tasks=True,
+                dry_run=True,
+            )
+
+            self.assertEqual(result.directory.parent, root / "integ_runs")
+            self.assertIn("--all", result.command)
+
+    def test_sandbox_run_requires_output_or_timestamped(self) -> None:
+        """Omitting both ``output`` and ``timestamped`` raises ``NightShiftError``."""
+        with tempfile.TemporaryDirectory() as directory:
+            source = Path(directory) / "source"
+            source.mkdir()
+            (source / "nightshift.yaml").write_text("project:\n  name: demo\n", encoding="utf-8")
+
+            with self.assertRaisesRegex(NightShiftError, "provide --output or --timestamped"):
+                run_sandbox_project(source, task="TASK-001", dry_run=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_terminal.py
+++ b/tests/test_terminal.py
@ -57,6 +57,17 @@ class TerminalStylingTests(unittest.TestCase):

        self.assertEqual(output.getvalue().strip(), "plain log")

+    def test_terminal_animation_status_update_prints_for_non_tty(self) -> None:
+        """``update_message`` prints a status line to stdout when the stream is a plain ``StringIO``."""
+        stream = StringIO()
+        output = StringIO()
+        animation = TerminalAnimation(stream=stream)
+
+        # Capture stdout: the status line is written with print(), not to ``stream``.
+        with patch("sys.stdout", output):
+            animation.update_message("Task: TASK-001 | >> Stage: plan")
+
+        self.assertIn("[NightShift]", output.getvalue())
+        self.assertIn("Stage: plan", output.getvalue())
+
    def test_terminal_animation_renders_immediately_when_started(self) -> None:
        stream = FakeTTY()
        animation = TerminalAnimation(
@ -85,10 +96,12 @@ class TerminalStylingTests(unittest.TestCase):
        with patch("sys.stdout", output):
            animation.start()
            animation.emit("log line")
+            animation.update_message("Stage: write")
            stream_output = stream.getvalue()
            animation.stop()

        self.assertIn("log line", output.getvalue())
+        self.assertIn("Stage: write", output.getvalue())
        self.assertGreaterEqual(stream_output.count("Stage: plan"), 2)

    def test_format_status_bar_message_uses_status_color(self) -> None: