diff --git a/docs/images/7dea0c14-54fc-459c-a739-835de945fa72.png b/docs/images/7dea0c14-54fc-459c-a739-835de945fa72.png new file mode 100644 index 0000000..c66e4d2 Binary files /dev/null and b/docs/images/7dea0c14-54fc-459c-a739-835de945fa72.png differ diff --git a/nightshift/config.py b/nightshift/config.py index a722993..84a3b19 100644 --- a/nightshift/config.py +++ b/nightshift/config.py @@ -78,6 +78,7 @@ class PipelineConfig: max_task_retries: int stages: tuple[StageConfig, ...] continue_on_task_failure: bool = False + stop_on_repeated_failure_signature_after: int | None = None @dataclass(frozen=True) @@ -265,6 +266,14 @@ def parse_config(raw: dict[str, Any], config_path: Path) -> NightShiftConfig: pipeline_raw.get("continue_on_task_failure", False), "pipeline.continue_on_task_failure", ) + stop_on_repeated_failure_signature_after = _optional_int_or_none( + pipeline_raw.get("stop_on_repeated_failure_signature_after"), + "pipeline.stop_on_repeated_failure_signature_after", + ) + if stop_on_repeated_failure_signature_after is not None and stop_on_repeated_failure_signature_after < 2: + raise ConfigError( + "Config error: pipeline.stop_on_repeated_failure_signature_after must be two or greater." + ) stages_raw = pipeline_raw.get("stages") if not isinstance(stages_raw, list) or not stages_raw: @@ -396,6 +405,7 @@ def parse_config(raw: dict[str, Any], config_path: Path) -> NightShiftConfig: max_task_retries=max_task_retries, stages=tuple(stages), continue_on_task_failure=continue_on_task_failure, + stop_on_repeated_failure_signature_after=stop_on_repeated_failure_signature_after, ), experiment=experiment, ) diff --git a/nightshift/escalation.py b/nightshift/escalation.py index 872ccc0..23d5157 100644 --- a/nightshift/escalation.py +++ b/nightshift/escalation.py @@ -14,14 +14,32 @@ class EscalationDecision: reason: str -def evaluate_retry_churn(entries: tuple[RetryMemoryEntry, ...], *, retry_budget: int) -> EscalationDecision: +def evaluate_retry_churn( + entries: tuple[RetryMemoryEntry, ...], + *, + retry_budget: int, + repeated_signature_after: int | None = None, +) -> EscalationDecision: if len(entries) < 2: return EscalationDecision(False, "continue", "Not enough retry history for churn detection.") recent = entries[-3:] same_stage = len({entry.stage_id for entry in recent}) == 1 same_cause = len({entry.cause for entry in recent}) == 1 + recent_signatures = [entry.failure_signature for entry in entries[-2:] if entry.failure_signature] + same_signature = len(recent_signatures) == 2 and len(set(recent_signatures)) == 1 if len(entries) >= retry_budget and retry_budget > 0: return EscalationDecision(True, "human review", "Configured retry budget is exhausted.") + if ( + repeated_signature_after is not None + and repeated_signature_after > 0 + and len(entries) >= repeated_signature_after + and same_signature + ): + return EscalationDecision( + True, + "debugger review or larger model", + "The same failure signature repeated on consecutive retries.", + ) if len(recent) == 3 and same_stage and same_cause: return EscalationDecision(True, "debugger review or larger model", "The same stage is failing with the same reason repeatedly.") return EscalationDecision(False, "continue", "No retry churn detected.") diff --git a/nightshift/failures.py b/nightshift/failures.py index a5fa1c5..634bd33 100644 --- a/nightshift/failures.py +++ b/nightshift/failures.py @@ -34,6 +34,8 @@ def classify_failure(output: str, exit_code: int | None = None, modified_files: text = output or "" lowered = text.lower() failing_tests = extract_failing_tests(text) + exception_name = _extract_exception_name(text) + source_path, _ = _extract_traceback_location(text) missing = re.search(r"No module named ['\"]([^'\"]+)['\"]", text, re.IGNORECASE) if not missing: @@ -48,6 +50,25 @@ def classify_failure(output: str, exit_code: int | None = None, modified_files: "do not retry implementation until dependency is resolved", failing_tests, ) + if exception_name and source_path and _looks_like_project_source(source_path): + if exception_name in {"TypeError", "AttributeError"}: + return FailureClassification( + "API misuse", + f"The implementation is calling an API with an incompatible shape near `{source_path}`.", + 0.82, + "Retry implementation with the exception and relevant call site.", + "retry implementation", + failing_tests, + ) + if exception_name in {"NameError", "OperationalError", "KeyError", "ValueError", "IndexError"}: + return FailureClassification( + "logic bug", + f"The failure originates in project code near `{source_path}`.", + 0.8, + "Send the traceback and touched files back to the implementer.", + "retry implementation", + failing_tests, + ) if re.search(r"\b(syntaxerror|indentationerror|importerror)\b", text, re.IGNORECASE): return FailureClassification( "syntax/import error", @@ -113,6 +134,15 @@ def classify_failure(output: str, exit_code: int | None = None, modified_files: ) +def build_failure_signature(output: str, reason: str = "") -> str: + text = "\n".join(part for part in (reason, output) if part) + command = _extract_command(text) + exception_name = _extract_exception_name(text) + source_path, source_line = _extract_traceback_location(text) + parts = [part for part in (exception_name, source_path, source_line, command) if part] + return " | ".join(parts) if parts else "unknown-failure" + + def extract_failing_tests(output: str) -> tuple[str, ...]: tests: list[str] = [] patterns = ( @@ -128,6 +158,56 @@ def extract_failing_tests(output: str) -> tuple[str, ...]: return tuple(tests) +def _extract_exception_name(text: str) -> str: + candidates = [] + for match in re.finditer(r"(?m)^(?:E\s+)?([A-Za-z0-9_.]+(?:Error|Exception|Warning|NameError|TypeError|AttributeError|KeyError|ValueError|IndexError)):\s*(.*)$", text): + candidates.append(match.group(1)) + return candidates[-1] if candidates else "" + + +def _extract_traceback_location(text: str) -> tuple[str, str]: + candidates: list[tuple[int, str, str]] = [] + for match in re.finditer(r'(?m)^\s*File "([^"]+)", line (\d+), in .+$', text): + path = match.group(1) + line = match.group(2) + candidates.append((_traceback_score(path), path, line)) + for match in re.finditer(r"(?m)^.*?([A-Za-z]:[\\/][^:\n]+?\.py):(\d+):", text): + path = match.group(1) + line = match.group(2) + candidates.append((_traceback_score(path), path, line)) + if not candidates: + return "", "" + candidates.sort(key=lambda item: item[0], reverse=True) + _, path, line = candidates[0] + return path, line + + +def _extract_command(text: str) -> str: + candidates = [] + for match in re.finditer(r"Command:\s*`([^`]+)`", text): + candidates.append(match.group(1)) + return candidates[-1] if candidates else "" + + +def _looks_like_project_source(path: str) -> bool: + normalized = path.replace("\\", "/").lower() + return "/src/" in normalized or "/tests/" in normalized + + +def _traceback_score(path: str) -> int: + normalized = path.replace("\\", "/").lower() + score = 0 + if normalized.endswith(".py"): + score += 1 + if "/src/" in normalized: + score += 10 + if "/tests/" in normalized: + score += 8 + if "/site-packages/" in normalized or "/_pytest/" in normalized: + score -= 20 + return score + + def format_failure_classification(result: FailureClassification, *, exit_code: int | None, modified_files: tuple[str, ...]) -> str: files = "\n".join(f"- `{path}`" for path in modified_files) or "- None" tests = "\n".join(f"- `{name}`" for name in result.failing_tests) or "- None" diff --git a/nightshift/pipeline.py b/nightshift/pipeline.py index 7e23da0..863d213 100644 --- a/nightshift/pipeline.py +++ b/nightshift/pipeline.py @@ -16,7 +16,7 @@ from .dependencies import diagnose_python_dependencies, format_dependency_diagno from .escalation import evaluate_retry_churn, format_escalation_decision from .errors import PipelineError from .errors import NightShiftError -from .failures import classify_failure, format_failure_classification +from .failures import build_failure_signature, classify_failure, format_failure_classification from .git import ensure_clean_worktree, write_diff_artifact, write_git_artifacts from .patches import ( DEFAULT_FORBIDDEN_PATHS, @@ -232,7 +232,16 @@ class PipelineRunner: ) break retry_count += 1 - memory_entry = entry_from_stage(retry_count, result, target_stage) + output = self._read_output(result.output_path) + failure_signature = "" + if stage.type in COMMAND_STAGE_TYPES: + failure_signature = build_failure_signature(output, result.reason) + memory_entry = entry_from_stage( + retry_count, + result, + target_stage, + failure_signature=failure_signature, + ) retry_memory.append(memory_entry) self.artifacts.write_stage_output( task.id, @@ -242,6 +251,8 @@ class PipelineRunner: decision = evaluate_retry_churn( tuple(retry_memory), retry_budget=self.config.pipeline.max_task_retries + 1, + repeated_signature_after=self.config.pipeline.stop_on_repeated_failure_signature_after + or self.config.pipeline.max_task_retries, ) self.artifacts.write_stage_output( task.id, diff --git a/nightshift/retry_memory.py b/nightshift/retry_memory.py index a76127e..fbd1856 100644 --- a/nightshift/retry_memory.py +++ b/nightshift/retry_memory.py @@ -14,6 +14,7 @@ class RetryMemoryEntry: status: str cause: str next_stage: str + failure_signature: str def summarize_retry_memory(entries: tuple[RetryMemoryEntry, ...]) -> str: @@ -23,17 +24,24 @@ def summarize_retry_memory(entries: tuple[RetryMemoryEntry, ...]) -> str: for entry in entries[-8:]: lines.append( f"- Attempt {entry.attempt}: `{entry.stage_id}` returned {entry.status}; " - f"cause: {entry.cause}; next: `{entry.next_stage}`" + f"cause: {entry.cause}; signature: `{entry.failure_signature}`; next: `{entry.next_stage}`" ) lines.append("") return "\n".join(lines) -def entry_from_stage(attempt: int, result: StageResult, next_stage: str) -> RetryMemoryEntry: +def entry_from_stage( + attempt: int, + result: StageResult, + next_stage: str, + *, + failure_signature: str, +) -> RetryMemoryEntry: return RetryMemoryEntry( attempt=attempt, stage_id=result.stage_id, status=result.status, cause=result.reason, next_stage=next_stage, + failure_signature=failure_signature, ) diff --git a/nightshift/version.py b/nightshift/version.py index 2b8b621..e8f3857 100644 --- a/nightshift/version.py +++ b/nightshift/version.py @@ -3,10 +3,10 @@ from __future__ import annotations -PACKAGE_VERSION = "0.2.4" +PACKAGE_VERSION = "0.2.5" RELEASE_CHANNEL = "alpha" -hotdog_version = "bratwurst" -topping_version = "relish" +hotdog_version = "chicago" +topping_version = "onions" HOTDOG_VERSIONS = ( "bratwurst", diff --git a/tests/test_reliability_features.py b/tests/test_reliability_features.py index 903dd98..d676391 100644 --- a/tests/test_reliability_features.py +++ b/tests/test_reliability_features.py @@ -5,10 +5,12 @@ import unittest from nightshift.artifacts import ArtifactStore from nightshift.config import parse_config, StageConfig -from nightshift.failures import classify_failure +from nightshift.escalation import evaluate_retry_churn +from nightshift.failures import build_failure_signature, classify_failure from nightshift.integ import cleanup_integration_runs, create_integration_run from nightshift.patches import validate_patch from nightshift.pipeline import PipelineRunner +from nightshift.retry_memory import RetryMemoryEntry from nightshift.tasks import parse_tasks from tests.test_pipeline import TASK_MD, make_config, _write_common_files @@ -36,6 +38,61 @@ class ReliabilityFeatureTests(unittest.TestCase): self.assertEqual(result.category, "missing dependency") self.assertIn("pastebin_app", result.probable_root_cause) + def test_failure_classifier_treats_traceback_into_source_as_logic_bug(self) -> None: + result = classify_failure( + "\n".join( + [ + ' File "C:\\repo\\project\\src\\pastebin_app\\app.py", line 31, in get_db', + " if 'db' not in g:", + "NameError: name 'g' is not defined", + ] + ), + exit_code=1, + ) + + self.assertEqual(result.category, "logic bug") + self.assertIn("src\\pastebin_app\\app.py", result.probable_root_cause) + + def test_retry_churn_stops_on_repeated_failure_signature(self) -> None: + entries = ( + RetryMemoryEntry( + attempt=1, + stage_id="test", + status="fail", + cause="Command exited with code 1: python -m pytest -q", + next_stage="implement", + failure_signature="NameError | src/pastebin_app/app.py | 31 | python -m pytest -q", + ), + RetryMemoryEntry( + attempt=2, + stage_id="test", + status="fail", + cause="Command exited with code 1: python -m pytest -q", + next_stage="implement", + failure_signature="NameError | src/pastebin_app/app.py | 31 | python -m pytest -q", + ), + ) + + decision = evaluate_retry_churn(entries, retry_budget=4, repeated_signature_after=2) + + self.assertTrue(decision.should_stop) + self.assertIn("same failure signature", decision.reason) + + def test_build_failure_signature_prefers_project_traceback_over_pytest_cache(self) -> None: + signature = build_failure_signature( + "\n".join( + [ + ' File "C:\\repo\\project\\src\\pastebin_app\\app.py", line 31, in get_db', + "NameError: name 'g' is not defined", + ' File "C:\\Users\\metis\\...\\site-packages\\_pytest\\cacheprovider.py", line 429, in set', + ] + ), + reason="Command exited with code 1: python -m pytest -q", + ) + + self.assertIn("src\\pastebin_app\\app.py", signature) + self.assertNotIn("_pytest\\cacheprovider.py", signature) + def test_command_failure_writes_diagnostics_and_retry_memory(self) -> None: with tempfile.TemporaryDirectory() as directory: root = Path(directory)