|
| 1 | +import logging |
| 2 | +import subprocess |
| 3 | +from pathlib import Path |
| 4 | +from tempfile import TemporaryDirectory |
| 5 | +from typing import Optional, List |
| 6 | + |
| 7 | +from packages.parser.core.repository_analyzer import RepositoryAnalyzer |
| 8 | +from packages.parser.core.graph_ingestion import graph_ingestion_service |
| 9 | +from packages.ingest.commit_decisions import process_repository |
| 10 | +from packages.ingest.add_embeddings import add_embeddings_to_all_nodes |
| 11 | +from packages.memory.embeddings import EmbeddingProvider |
| 12 | +from packages.config.settings import Settings |
| 13 | +from packages.parser.utils import extract_repo_info |
| 14 | +from packages.database.graph.graph import neo4j_client |
| 15 | +from packages.ingest.full_ingest import full_ingest |
| 16 | + |
# Module-level application settings and a module-scoped logger
# (standard `logging.getLogger(__name__)` convention, PEP 282).
settings = Settings()
logger = logging.getLogger(__name__)
| 19 | + |
def get_changed_files(repo_path: Path, base_sha: str, head_sha: str) -> Optional[List[str]]:
    """Return the list of file paths changed between two commits.

    Runs ``git diff --name-only <base_sha> <head_sha>`` inside *repo_path*.

    Args:
        repo_path: Path to a local git working copy.
        base_sha: Older commit SHA (the last ingested state).
        head_sha: Newer commit SHA (usually HEAD).

    Returns:
        A list of repo-relative file paths (possibly empty), or ``None``
        when the diff could not be computed for any reason — callers treat
        ``None`` as "fall back to a full re-ingest".
    """
    try:
        result = subprocess.run(
            ["git", "diff", "--name-only", base_sha, head_sha],
            cwd=repo_path,
            capture_output=True,
            text=True,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing git binary or an invalid repo_path; the
        # contract promises None on *any* error, so don't let those propagate.
        logger.error("Error getting changed files: %s", e)
        return None
    # Drop blank lines from git's output (there is usually a trailing one).
    return [f.strip() for f in result.stdout.splitlines() if f.strip()]
| 35 | + |
def incremental_ingest(repo_url: str, branch: Optional[str] = None):
    """Run the incremental ingestion pipeline for *repo_url*.

    Clones the repository into a temporary directory, diffs HEAD against the
    last ingested SHA recorded on the ``:Repo`` node in Neo4j, re-parses only
    the changed files, refreshes git history and embeddings, and finally
    records the new SHA. Falls back to :func:`full_ingest` when there is no
    previous SHA, or when the diff cannot be computed (e.g. after a
    force-push removed the old SHA from history).

    Args:
        repo_url: Clone URL of the repository.
        branch: Optional branch to check out; defaults to the remote HEAD.
    """
    with TemporaryDirectory() as temp_dir:
        repo_path = Path(temp_dir)

        # 1. Clone the repository into the temp dir.
        try:
            subprocess.run(["git", "clone", repo_url, "."], cwd=repo_path, check=True, capture_output=True)
        except subprocess.CalledProcessError as e:
            logger.error("Failed to clone repository: %s", e)
            return

        if branch:
            try:
                subprocess.run(["git", "checkout", branch], cwd=repo_path, check=True, capture_output=True)
            except subprocess.CalledProcessError as e:
                logger.error("Failed to checkout branch %s: %s", branch, e)
                return

        # Resolve the commit we are ingesting up to.
        try:
            head_sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=repo_path, text=True).strip()
        except subprocess.CalledProcessError:
            logger.error("Failed to get HEAD SHA")
            return

        repo_info = extract_repo_info(repo_url)
        repo_name = repo_info.get("name", "unknown")

        # The last ingested SHA lives on the :Repo node in Neo4j.
        query = "MATCH (r:Repo {name: $name}) RETURN r.repo_sha as sha"
        result = neo4j_client.run_query(query, {"name": repo_name})
        last_sha = result[0]["sha"] if result and result[0]["sha"] else None

        if not last_sha:
            # Never ingested before — nothing to diff against.
            logger.info("No previous SHA recorded for %s; running full ingest.", repo_name)
            full_ingest(repo_url, branch=branch)
            return

        if last_sha == head_sha:
            logger.info("Repository %s already up to date at %s.", repo_name, head_sha[:8])
            return

        logger.info("Found changes: %s -> %s", last_sha[:8], head_sha[:8])

        # 2. Identify changed files. None means the diff itself failed
        # (force-push, shallow history, ...) — safest recovery is a full
        # re-ingest rather than risking a stale graph.
        changed_files = get_changed_files(repo_path, last_sha, head_sha)
        if changed_files is None:
            full_ingest(repo_url, branch=branch)
            return

        if not changed_files:
            logger.info("No files changed (maybe only merge commits or non-code files).")
        else:
            # 3. Re-parse the changed files: purge the stale graph data first
            # so symbols deleted from a file do not linger, then ingest the
            # fresh parse results.
            analyzer = RepositoryAnalyzer()
            for file_path in changed_files:
                graph_ingestion_service.delete_file_data(repo_name, file_path)
            graph_data = analyzer.analyze_files(repo_path, changed_files)
            graph_ingestion_service.ingest_graph_data(graph_data)

        # 4. Ingest git history (new commits).
        logger.info("Ingesting new commits...")
        # We limit to 50 commits to avoid re-ingesting too much history if the
        # gap is large. Ideally we'd stop at last_sha, but process_repository
        # doesn't support that yet.
        process_repository(str(repo_path), branch=branch, max_commits=50)

        # 5. Refresh embeddings across every node type the parser produces.
        logger.info("Updating embeddings...")
        add_embeddings_to_all_nodes(
            node_types=["Function", "Class", "File", "Doc", "Module", "Commit"],
            provider=EmbeddingProvider(settings.EMBEDDING_PROVIDER),
        )

        # 6. Record the new SHA only after every step above succeeded, so a
        # failed run will be retried from the old SHA next time.
        logger.info("Updating Repo SHA to %s...", head_sha)
        update_query = "MATCH (r:Repo {name: $name}) SET r.repo_sha = $sha"
        neo4j_client.run_query(update_query, {"name": repo_name, "sha": head_sha})

        logger.info("Incremental ingestion complete for %s.", repo_name)
0 commit comments