From 6fdf4ee281b7ce060a7a21ab919331236f948aa9 Mon Sep 17 00:00:00 2001 From: Dmitry Mikushin Date: Fri, 10 Apr 2026 07:57:24 -0400 Subject: [PATCH] fix(generation): stabilize prompt hashes across re-runs Graph edge ordering and community IDs were non-deterministic because files are parsed in parallel (ProcessPoolExecutor + as_completed), causing NetworkX node insertion order to vary between runs. Changes: - context_assembler: sort predecessors/successors before including them in FilePageContext so dependents/dependencies lists are identical across runs regardless of graph construction order - graph: rebuild a sorted copy of the undirected graph before passing it to louvain_communities so adjacency traversal order is reproducible; also sort the returned community list by each community's smallest member before assigning integer IDs via enumerate() Adds scripts/diagnose_hash_mismatch.py to verify the fix and identify any remaining sources of hash instability (dep_summaries, betweenness sampling, etc.). Co-Authored-By: Claude Sonnet 4.6 --- .../core/generation/context_assembler.py | 7 +- .../core/src/repowise/core/ingestion/graph.py | 17 +- scripts/diagnose_hash_mismatch.py | 226 ++++++++++++++++++ 3 files changed, 245 insertions(+), 5 deletions(-) create mode 100644 scripts/diagnose_hash_mismatch.py diff --git a/packages/core/src/repowise/core/generation/context_assembler.py b/packages/core/src/repowise/core/generation/context_assembler.py index a06bdfb3..e75b21c6 100644 --- a/packages/core/src/repowise/core/generation/context_assembler.py +++ b/packages/core/src/repowise/core/generation/context_assembler.py @@ -277,9 +277,10 @@ def assemble_file_page( else: import_list = [] - # Graph edges - in_edges = list(graph.predecessors(path)) if path in graph else [] - out_edges = list(graph.successors(path)) if path in graph else [] + # Graph edges — sorted for deterministic prompt hashes across runs + # (graph node insertion order is non-deterministic due to parallel parsing) + in_edges = sorted(graph.predecessors(path)) if path in graph else [] + out_edges = sorted(graph.successors(path)) if path in graph else [] # Filter out external nodes in_edges = [e for e in in_edges if not e.startswith("external:")] out_edges = [e for e in out_edges if not e.startswith("external:")] diff --git a/packages/core/src/repowise/core/ingestion/graph.py b/packages/core/src/repowise/core/ingestion/graph.py index b0e9baf3..94fc7599 100644 --- a/packages/core/src/repowise/core/ingestion/graph.py +++ b/packages/core/src/repowise/core/ingestion/graph.py @@ -214,9 +214,22 @@ def community_detection(self) -> dict[str, int]: if g.number_of_nodes() == 0: return {} try: - communities = nx.community.louvain_communities(g.to_undirected(), seed=42) + # Build an undirected graph with nodes AND edges in deterministic + # (sorted) order so that Louvain's adjacency traversal is reproducible + # across runs regardless of the order files were parsed (parallel I/O + # via ProcessPoolExecutor + as_completed → non-deterministic insertion + # order in the main graph). + g_und = g.to_undirected() + g_stable = nx.Graph() + g_stable.add_nodes_from(sorted(g_und.nodes())) + for u, v in sorted((min(a, b), max(a, b)) for a, b in g_und.edges()): + g_stable.add_edge(u, v) + communities = nx.community.louvain_communities(g_stable, seed=42) + # Also sort the returned community list by each community's smallest + # member so that the integer IDs assigned via enumerate() are stable. + sorted_communities = sorted(communities, key=lambda c: min(c, default="")) result: dict[str, int] = {} - for community_id, members in enumerate(communities): + for community_id, members in enumerate(sorted_communities): for node in members: result[node] = community_id return result diff --git a/scripts/diagnose_hash_mismatch.py b/scripts/diagnose_hash_mismatch.py new file mode 100644 index 00000000..821e5e20 --- /dev/null +++ b/scripts/diagnose_hash_mismatch.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +"""Diagnose source_hash mismatches between wiki.db and fresh renders. + +Usage: + cd ~/forge/free-code + python3.11 ~/forge/repowise/scripts/diagnose_hash_mismatch.py [--max-pages N] + +What it checks: + A) dep_summaries (completed_page_summaries) — empty on re-run (level 0/1 skipped) + B) graph edge ordering — non-deterministic due to ProcessPoolExecutor + as_completed + C) betweenness_centrality — random sampling when n > 30000 nodes + D) community_id — Louvain seed=42, should be stable + E) git history via git_meta — NOT passed to assemble_file_page (won't affect hash) +""" +from __future__ import annotations + +import argparse +import asyncio +import difflib +import hashlib +import sqlite3 +import sys +from pathlib import Path + +REPOWISE_ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(REPOWISE_ROOT / "packages" / "core" / "src")) +sys.path.insert(0, str(REPOWISE_ROOT / "packages" / "cli" / "src")) + + +def _sha256(text: str) -> str: + return hashlib.sha256(text.encode()).hexdigest() + + +async def _build_pipeline(repo_path: Path): + """Run ingestion + graph, return (parsed_files, source_map, graph_builder).""" + from repowise.core.pipeline.orchestrator import _run_ingestion + parsed_files, file_infos, repo_structure, source_map, graph_builder = \ + await _run_ingestion(repo_path, exclude_patterns=None, skip_tests=False, + skip_infra=False, progress=None) + return parsed_files, source_map, graph_builder + + +def _render_file_page_prompt(pf, graph, pagerank, betweenness, community, + source_map, page_summaries, assembler, jinja_env): + ctx = assembler.assemble_file_page( + pf, graph, pagerank, betweenness, community, + source_map.get(pf.file_info.path, b""), + page_summaries=page_summaries, + ) + return jinja_env.get_template("file_page.j2").render(ctx=ctx), ctx + + +async def main(repo_path: Path, max_pages: int, verbose: bool) -> None: + # --- Load cached pages from wiki.db --- + db_path = repo_path / ".repowise" / "wiki.db" + if not db_path.exists(): + print(f"ERROR: wiki.db not found at {db_path}") + sys.exit(1) + + conn = sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + rows = conn.execute( + "SELECT id, target_path, source_hash FROM wiki_pages " + "WHERE page_type = 'file_page' ORDER BY RANDOM() LIMIT ?", + (max_pages,), + ).fetchall() + conn.close() + print(f"Loaded {len(rows)} random file_page(s) from wiki.db.\n") + + # --- Run ingestion TWICE to detect non-determinism --- + print("Run 1: ingestion pipeline...") + p1, sm1, gb1 = await _build_pipeline(repo_path) + print(f" {len(p1)} files parsed.") + + print("Run 2: ingestion pipeline (repeat to check stability)...") + p2, sm2, gb2 = await _build_pipeline(repo_path) + print(f" {len(p2)} files parsed.\n") + + # --- Compare graph properties between runs --- + g1, g2 = gb1.graph(), gb2.graph() + pr1, pr2 = gb1.pagerank(), gb2.pagerank() + bc1, bc2 = gb1.betweenness_centrality(), gb2.betweenness_centrality() + cm1, cm2 = gb1.community_detection(), gb2.community_detection() + + # Check edge ordering stability + edge_order_unstable: list[str] = [] + for node in list(g1.nodes())[:200]: + succ1 = list(g1.successors(node)) + succ2 = list(g2.successors(node)) + if succ1 != succ2: + edge_order_unstable.append(node) + + bc_diff = {k for k in bc1 if abs(bc1[k] - bc2.get(k, 0)) > 1e-9} + cm_diff = {k for k in cm1 if cm1[k] != cm2.get(k)} + pr_diff = {k for k in pr1 if abs(pr1[k] - pr2.get(k, 0)) > 1e-9} + + print("=== Stability check (Run 1 vs Run 2) ===") + print(f" Graph nodes: {g1.number_of_nodes()} vs {g2.number_of_nodes()}") + print(f" Graph edges: {g1.number_of_edges()} vs {g2.number_of_edges()}") + _ok = lambda n: "[ok]" if n == 0 else f"[!!] {n} differ" + print(f" Edge ordering: {_ok(len(edge_order_unstable))}" + + (f" e.g. {edge_order_unstable[:2]}" if edge_order_unstable else "")) + print(f" PageRank: {_ok(len(pr_diff))}") + print(f" BetweennessCentral: {_ok(len(bc_diff))}") + print(f" Community detect: {_ok(len(cm_diff))}") + print() + + # --- Render prompts and compare with stored hashes --- + from repowise.core.generation import ContextAssembler, GenerationConfig + import jinja2 + + config = GenerationConfig() + assembler = ContextAssembler(config) + templates_dir = REPOWISE_ROOT / "packages" / "core" / "src" / \ + "repowise" / "core" / "generation" / "templates" + jinja_env = jinja2.Environment( + loader=jinja2.FileSystemLoader(str(templates_dir)), + undefined=jinja2.StrictUndefined, autoescape=False, + ) + + path_to_pf = {pf.file_info.path: pf for pf in p1} + graph, pagerank, betweenness, community = g1, pr1, bc1, cm1 + + print("=== Hash comparison (wiki.db vs fresh render) ===") + matches = mismatches_dep = mismatches_other = 0 + + for row in rows: + page_id = row["id"] + tpath = row["target_path"] + stored_hash = row["source_hash"] + pf = path_to_pf.get(tpath) + if pf is None: + print(f" [skip] {tpath}: not found in parsed files") + continue + + # Render without dep_summaries (re-run scenario, level 0/1 skipped) + prompt_nodep, ctx = _render_file_page_prompt( + pf, graph, pagerank, betweenness, community, sm1, + page_summaries=None, assembler=assembler, jinja_env=jinja_env, + ) + hash_nodep = _sha256(prompt_nodep) + + if hash_nodep == stored_hash: + matches += 1 + print(f" [MATCH] {tpath}") + continue + + # Check if edge ordering is the issue: render with run 2's graph + prompt_run2, _ = _render_file_page_prompt( + path_to_pf.get(tpath) or pf, + g2, pr2, bc2, cm2, sm2, + page_summaries=None, assembler=assembler, jinja_env=jinja_env, + ) + hash_run2 = _sha256(prompt_run2) + edge_order_issue = (hash_nodep != hash_run2) + + # Check if dep_summaries explain the mismatch: + # inject dummy summaries for all out-edges + out_edges = list(graph.successors(tpath)) if tpath in graph else [] + out_edges = [e for e in out_edges if not e.startswith("external:")] + fake_summaries = {dep: f"[summary of {dep}]" for dep in out_edges} + prompt_fakedep, _ = _render_file_page_prompt( + pf, graph, pagerank, betweenness, community, sm1, + page_summaries=fake_summaries, assembler=assembler, jinja_env=jinja_env, + ) + hash_fakedep = _sha256(prompt_fakedep) + dep_affects = (prompt_nodep != prompt_fakedep) + + if dep_affects: + mismatches_dep += 1 + cause = "dep_summaries differ" + else: + mismatches_other += 1 + cause = "unknown — dep_summaries do NOT affect prompt" + + if edge_order_issue: + cause += " + edge-ordering non-deterministic" + + print(f" [MISMATCH] {tpath}") + print(f" cause: {cause}") + print(f" stored: {stored_hash[:20]}...") + print(f" fresh(nodep): {hash_nodep[:20]}...") + print(f" fresh(run2): {hash_run2[:20]}...") + print(f" out_edges: {len(out_edges)} dep_affects_prompt: {dep_affects}") + + if verbose: + # Show first real diff between stored prompt and fresh prompt + # We can't reconstruct the exact stored prompt, but we can diff run1 vs run2 + diff = list(difflib.unified_diff( + prompt_nodep.splitlines(), + prompt_run2.splitlines(), + fromfile="run1", tofile="run2", lineterm="", n=1, + )) + if diff: + print(" -- prompt diff run1 vs run2 (first 20 lines) --") + for line in diff[:20]: + print(f" {line}") + else: + print(" -- prompts are identical across runs (edge order stable) --") + + print() + print("=== Summary ===") + print(f" Match (empty dep_summaries = stored): {matches}") + print(f" Mismatch caused by dep_summaries: {mismatches_dep}") + print(f" Mismatch with unknown cause: {mismatches_other}") + total = matches + mismatches_dep + mismatches_other + print(f" Total checked: {total}") + + if mismatches_dep and not mismatches_other: + print("\nCONCLUSION: dep_summaries (completed_page_summaries from level 0/1)") + print(" is the sole cause. Fix: pre-populate from wiki.db before level 2.") + elif mismatches_other: + print("\nCONCLUSION: at least one other factor causes hash instability.") + print(" Run with --verbose to see prompt diffs.") + elif matches == total: + print("\nCONCLUSION: all hashes match on empty dep_summaries — no other instability.") + + +if __name__ == "__main__": + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument("repo_path", nargs="?", default=".", help="Repo path (default: cwd)") + ap.add_argument("--max-pages", type=int, default=10) + ap.add_argument("--verbose", action="store_true", help="Show prompt diffs") + args = ap.parse_args() + asyncio.run(main(Path(args.repo_path).resolve(), args.max_pages, args.verbose))