fix guard and make stop repeat configurable

2026-06-14 10:08:37 +00:00 · 2026-05-20 05:02:32 -07:00 · 2026-05-20 05:02:32 -07:00 · c12493a248
commit c12493a248
parent 2f2146f47d
8 changed files with 193 additions and 9 deletions
--- a/docs/images/7dea0c14-54fc-459c-a739-835de945fa72.png
+++ b/docs/images/7dea0c14-54fc-459c-a739-835de945fa72.png
--- a/nightshift/config.py
+++ b/nightshift/config.py
@ -78,6 +78,7 @@ class PipelineConfig:
    max_task_retries: int
    stages: tuple[StageConfig, ...]
    continue_on_task_failure: bool = False
    stop_on_repeated_failure_signature_after: int | None = None
@dataclass(frozen=True)
@ -265,6 +266,14 @@ def parse_config(raw: dict[str, Any], config_path: Path) -> NightShiftConfig:
        pipeline_raw.get("continue_on_task_failure", False),
        "pipeline.continue_on_task_failure",
    )
    stop_on_repeated_failure_signature_after = _optional_int_or_none(
        pipeline_raw.get("stop_on_repeated_failure_signature_after"),
        "pipeline.stop_on_repeated_failure_signature_after",
    )
    if stop_on_repeated_failure_signature_after is not None and stop_on_repeated_failure_signature_after < 2:
        raise ConfigError(
            "Config error: pipeline.stop_on_repeated_failure_signature_after must be two or greater."
        )
    stages_raw = pipeline_raw.get("stages")
    if not isinstance(stages_raw, list) or not stages_raw:
@ -396,6 +405,7 @@ def parse_config(raw: dict[str, Any], config_path: Path) -> NightShiftConfig:
            max_task_retries=max_task_retries,
            stages=tuple(stages),
            continue_on_task_failure=continue_on_task_failure,
            stop_on_repeated_failure_signature_after=stop_on_repeated_failure_signature_after,
        ),
        experiment=experiment,
    )
--- a/nightshift/escalation.py
+++ b/nightshift/escalation.py
@ -14,14 +14,32 @@ class EscalationDecision:
    reason: str
-def evaluate_retry_churn(entries: tuple[RetryMemoryEntry, ...], *, retry_budget: int) -> EscalationDecision:
+def evaluate_retry_churn(
    entries: tuple[RetryMemoryEntry, ...],
    *,
    retry_budget: int,
    repeated_signature_after: int | None = None,
 ) -> EscalationDecision:
    if len(entries) < 2:
        return EscalationDecision(False, "continue", "Not enough retry history for churn detection.")
    recent = entries[-3:]
    same_stage = len({entry.stage_id for entry in recent}) == 1
    same_cause = len({entry.cause for entry in recent}) == 1
    recent_signatures = [entry.failure_signature for entry in entries[-2:] if entry.failure_signature]
    same_signature = len(recent_signatures) == 2 and len(set(recent_signatures)) == 1
    if len(entries) >= retry_budget and retry_budget > 0:
        return EscalationDecision(True, "human review", "Configured retry budget is exhausted.")
    if (
        repeated_signature_after is not None
        and repeated_signature_after > 0
        and len(entries) >= repeated_signature_after
        and same_signature
    ):
        return EscalationDecision(
            True,
            "debugger review or larger model",
            "The same failure signature repeated on consecutive retries.",
        )
    if len(recent) == 3 and same_stage and same_cause:
        return EscalationDecision(True, "debugger review or larger model", "The same stage is failing with the same reason repeatedly.")
    return EscalationDecision(False, "continue", "No retry churn detected.")
--- a/nightshift/failures.py
+++ b/nightshift/failures.py
@ -34,6 +34,8 @@ def classify_failure(output: str, exit_code: int | None = None, modified_files:
    text = output or ""
    lowered = text.lower()
    failing_tests = extract_failing_tests(text)
    exception_name = _extract_exception_name(text)
    source_path, _ = _extract_traceback_location(text)
    missing = re.search(r"No module named ['\"]([^'\"]+)['\"]", text, re.IGNORECASE)
    if not missing:
@ -48,6 +50,25 @@ def classify_failure(output: str, exit_code: int | None = None, modified_files:
            "do not retry implementation until dependency is resolved",
            failing_tests,
        )
    if exception_name and source_path and _looks_like_project_source(source_path):
        if exception_name in {"TypeError", "AttributeError"}:
            return FailureClassification(
                "API misuse",
                f"The implementation is calling an API with an incompatible shape near `{source_path}`.",
                0.82,
                "Retry implementation with the exception and relevant call site.",
                "retry implementation",
                failing_tests,
            )
        if exception_name in {"NameError", "OperationalError", "KeyError", "ValueError", "IndexError"}:
            return FailureClassification(
                "logic bug",
                f"The failure originates in project code near `{source_path}`.",
                0.8,
                "Send the traceback and touched files back to the implementer.",
                "retry implementation",
                failing_tests,
            )
    if re.search(r"\b(syntaxerror|indentationerror|importerror)\b", text, re.IGNORECASE):
        return FailureClassification(
            "syntax/import error",
@ -113,6 +134,15 @@ def classify_failure(output: str, exit_code: int | None = None, modified_files:
    )
 def build_failure_signature(output: str, reason: str = "") -> str:
    """Condense a failure into a short, comparable signature string.

    The non-empty values among *reason* and *output* are joined (reason
    first) and scanned for an exception name, a traceback location and a
    command. The parts that were found are joined with ``" | "`` in the
    order exception, path, line, command.

    Returns ``"unknown-failure"`` when none of the parts could be extracted.
    """
    combined = "\n".join(filter(None, (reason, output)))
    command = _extract_command(combined)
    exception_name = _extract_exception_name(combined)
    source_path, source_line = _extract_traceback_location(combined)
    fields = (exception_name, source_path, source_line, command)
    signature = " | ".join(field for field in fields if field)
    return signature or "unknown-failure"
 def extract_failing_tests(output: str) -> tuple[str, ...]:
    tests: list[str] = []
    patterns = (
@ -128,6 +158,56 @@ def extract_failing_tests(output: str) -> tuple[str, ...]:
    return tuple(tests)
 def _extract_exception_name(text: str) -> str:
    candidates = []
    for match in re.finditer(r"(?m)^(?:E\s+)?([A-Za-z0-9_.]+(?:Error|Exception|Warning|NameError|TypeError|AttributeError|KeyError|ValueError|IndexError)):\s*(.*)$", text):
        candidates.append(match.group(1))
    return candidates[-1] if candidates else ""
 def _extract_traceback_location(text: str) -> tuple[str, str]:
    """Return the ``(path, line)`` of the best-ranked source location in *text*.

    Two location shapes are recognized: ``File "<path>", line <n>, in <name>``
    lines, and ``<path>.py:<n>:`` references whose path starts with a drive
    letter. Every hit is ranked with ``_traceback_score``; the highest score
    wins and ties go to the earliest collected hit.

    Returns ``("", "")`` when *text* contains no recognizable location.
    """
    location_patterns = (
        r'(?m)^\s*File "([^"]+)", line (\d+), in .+$',
        r"(?m)^.*?([A-Za-z]:[\\/][^:\n]+?\.py):(\d+):",
    )
    scored: list[tuple[int, str, str]] = []
    for pattern in location_patterns:
        for found in re.finditer(pattern, text):
            found_path, found_line = found.group(1), found.group(2)
            scored.append((_traceback_score(found_path), found_path, found_line))
    if not scored:
        return "", ""
    # max() keeps the first of several equally scored hits.
    _, best_path, best_line = max(scored, key=lambda item: item[0])
    return best_path, best_line
 def _extract_command(text: str) -> str:
    candidates = []
    for match in re.finditer(r"Command:\s*`([^`]+)`", text):
        candidates.append(match.group(1))
    return candidates[-1] if candidates else ""
 def _looks_like_project_source(path: str) -> bool:
    normalized = path.replace("\\", "/").lower()
    return "/src/" in normalized or "/tests/" in normalized
 def _traceback_score(path: str) -> int:
    normalized = path.replace("\\", "/").lower()
    score = 0
    if normalized.endswith(".py"):
        score += 1
    if "/src/" in normalized:
        score += 10
    if "/tests/" in normalized:
        score += 8
    if "/site-packages/" in normalized or "/_pytest/" in normalized:
        score -= 20
    return score
 def format_failure_classification(result: FailureClassification, *, exit_code: int | None, modified_files: tuple[str, ...]) -> str:
    files = "\n".join(f"- `{path}`" for path in modified_files) or "- None"
    tests = "\n".join(f"- `{name}`" for name in result.failing_tests) or "- None"
--- a/nightshift/pipeline.py
+++ b/nightshift/pipeline.py
@ -16,7 +16,7 @@ from .dependencies import diagnose_python_dependencies, format_dependency_diagno
 from .escalation import evaluate_retry_churn, format_escalation_decision
 from .errors import PipelineError
 from .errors import NightShiftError
-from .failures import classify_failure, format_failure_classification
+from .failures import build_failure_signature, classify_failure, format_failure_classification
 from .git import ensure_clean_worktree, write_diff_artifact, write_git_artifacts
 from .patches import (
    DEFAULT_FORBIDDEN_PATHS,
@ -232,7 +232,16 @@ class PipelineRunner:
                    )
                    break
                retry_count += 1
-                memory_entry = entry_from_stage(retry_count, result, target_stage)
+                output = self._read_output(result.output_path)
                failure_signature = ""
                if stage.type in COMMAND_STAGE_TYPES:
                    failure_signature = build_failure_signature(output, result.reason)
                memory_entry = entry_from_stage(
                    retry_count,
                    result,
                    target_stage,
                    failure_signature=failure_signature,
                )
                retry_memory.append(memory_entry)
                self.artifacts.write_stage_output(
                    task.id,
@ -242,6 +251,8 @@ class PipelineRunner:
                decision = evaluate_retry_churn(
                    tuple(retry_memory),
                    retry_budget=self.config.pipeline.max_task_retries + 1,
                    repeated_signature_after=self.config.pipeline.stop_on_repeated_failure_signature_after
                    or self.config.pipeline.max_task_retries,
                )
                self.artifacts.write_stage_output(
                    task.id,
--- a/nightshift/retry_memory.py
+++ b/nightshift/retry_memory.py
@ -14,6 +14,7 @@ class RetryMemoryEntry:
    status: str
    cause: str
    next_stage: str
    failure_signature: str
 def summarize_retry_memory(entries: tuple[RetryMemoryEntry, ...]) -> str:
@ -23,17 +24,24 @@ def summarize_retry_memory(entries: tuple[RetryMemoryEntry, ...]) -> str:
    for entry in entries[-8:]:
        lines.append(
            f"- Attempt {entry.attempt}: `{entry.stage_id}` returned {entry.status}; "
-            f"cause: {entry.cause}; next: `{entry.next_stage}`"
+            f"cause: {entry.cause}; signature: `{entry.failure_signature}`; next: `{entry.next_stage}`"
        )
    lines.append("")
    return "\n".join(lines)
-def entry_from_stage(attempt: int, result: StageResult, next_stage: str) -> RetryMemoryEntry:
+def entry_from_stage(
    attempt: int,
    result: StageResult,
    next_stage: str,
    *,
    failure_signature: str,
 ) -> RetryMemoryEntry:
    return RetryMemoryEntry(
        attempt=attempt,
        stage_id=result.stage_id,
        status=result.status,
        cause=result.reason,
        next_stage=next_stage,
        failure_signature=failure_signature,
    )
--- a/nightshift/version.py
+++ b/nightshift/version.py
@ -3,10 +3,10 @@
 from __future__ import annotations
-PACKAGE_VERSION = "0.2.4"
+PACKAGE_VERSION = "0.2.5"
 RELEASE_CHANNEL = "alpha"
-hotdog_version = "bratwurst"
+hotdog_version = "chicago"
-topping_version = "relish"
+topping_version = "onions"
 HOTDOG_VERSIONS = (
    "bratwurst",
--- a/tests/test_reliability_features.py
+++ b/tests/test_reliability_features.py
@ -5,10 +5,12 @@ import unittest
 from nightshift.artifacts import ArtifactStore
 from nightshift.config import parse_config, StageConfig
-from nightshift.failures import classify_failure
+from nightshift.escalation import evaluate_retry_churn
 from nightshift.failures import build_failure_signature, classify_failure
 from nightshift.integ import cleanup_integration_runs, create_integration_run
 from nightshift.patches import validate_patch
 from nightshift.pipeline import PipelineRunner
 from nightshift.retry_memory import RetryMemoryEntry
 from nightshift.tasks import parse_tasks
 from tests.test_pipeline import TASK_MD, make_config, _write_common_files
@ -36,6 +38,61 @@ class ReliabilityFeatureTests(unittest.TestCase):
        self.assertEqual(result.category, "missing dependency")
        self.assertIn("pastebin_app", result.probable_root_cause)
    def test_failure_classifier_treats_traceback_into_source_as_logic_bug(self) -> None:
        """A NameError traced into a ``src`` path is classified as a logic bug."""
        traceback_lines = [
            '  File "C:\\repo\\project\\src\\pastebin_app\\app.py", line 31, in get_db',
            "    if 'db' not in g:",
            "NameError: name 'g' is not defined",
        ]

        classification = classify_failure("\n".join(traceback_lines), exit_code=1)

        self.assertEqual(classification.category, "logic bug")
        self.assertIn("src\\pastebin_app\\app.py", classification.probable_root_cause)
    def test_retry_churn_stops_on_repeated_failure_signature(self) -> None:
        """Two consecutive retries with one signature produce a stop decision."""
        shared_signature = "NameError | src/pastebin_app/app.py | 31 | python -m pytest -q"
        entries = tuple(
            RetryMemoryEntry(
                attempt=attempt_number,
                stage_id="test",
                status="fail",
                cause="Command exited with code 1: python -m pytest -q",
                next_stage="implement",
                failure_signature=shared_signature,
            )
            for attempt_number in (1, 2)
        )

        decision = evaluate_retry_churn(entries, retry_budget=4, repeated_signature_after=2)

        self.assertTrue(decision.should_stop)
        self.assertIn("same failure signature", decision.reason)
    def test_build_failure_signature_prefers_project_traceback_over_pytest_cache(self) -> None:
        """The signature names the project frame, not the pytest cache frame."""
        output_lines = [
            '  File "C:\\repo\\project\\src\\pastebin_app\\app.py", line 31, in get_db',
            "NameError: name 'g' is not defined",
            '  File "C:\\Users\\metis\\...\\site-packages\\_pytest\\cacheprovider.py", line 429, in set',
        ]

        signature = build_failure_signature(
            "\n".join(output_lines),
            reason="Command exited with code 1: python -m pytest -q",
        )

        self.assertIn("src\\pastebin_app\\app.py", signature)
        self.assertNotIn("_pytest\\cacheprovider.py", signature)
    def test_command_failure_writes_diagnostics_and_retry_memory(self) -> None:
        with tempfile.TemporaryDirectory() as directory:
            root = Path(directory)