|
| 1 | +import logging |
| 2 | +import subprocess |
| 3 | +from pathlib import Path |
| 4 | +from tempfile import TemporaryDirectory |
| 5 | +from typing import Optional, List |
| 6 | + |
| 7 | +from packages.parser.core.repository_analyzer import RepositoryAnalyzer |
| 8 | +from packages.parser.core.graph_ingestion import graph_ingestion_service |
| 9 | +from packages.ingest.commit_decisions import process_repository |
| 10 | +from packages.ingest.add_embeddings import add_embeddings_to_all_nodes |
| 11 | +from packages.memory.embeddings import EmbeddingProvider |
| 12 | +from packages.config.settings import Settings |
| 13 | +from packages.parser.utils import extract_repo_info |
| 14 | +from packages.database.graph.graph import neo4j_client |
| 15 | +from packages.ingest.full_ingest import full_ingest |
| 16 | + |
# Module-level application settings and a module-scoped logger
# (standard `logging.getLogger(__name__)` convention, PEP 282).
settings = Settings()
logger = logging.getLogger(__name__)
| 19 | + |
def get_changed_files(repo_path: Path, base_sha: str, head_sha: str) -> Optional[List[str]]:
    """Return the list of file paths changed between two commits.

    Runs ``git diff --name-only <base_sha> <head_sha>`` inside *repo_path*.

    Args:
        repo_path: Path to a local git working copy.
        base_sha: Older commit SHA (the last ingested state).
        head_sha: Newer commit SHA (usually HEAD).

    Returns:
        A list of repo-relative file paths (possibly empty), or ``None``
        when the diff could not be computed for any reason — callers treat
        ``None`` as "fall back to a full re-ingest".
    """
    try:
        result = subprocess.run(
            ["git", "diff", "--name-only", base_sha, head_sha],
            cwd=repo_path,
            capture_output=True,
            text=True,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing git binary or an invalid repo_path; the
        # contract promises None on *any* error, so don't let those propagate.
        logger.error("Error getting changed files: %s", e)
        return None
    # Drop blank lines from git's output (there is usually a trailing one).
    return [f.strip() for f in result.stdout.splitlines() if f.strip()]
| 35 | + |
def incremental_ingest(repo_url: str, branch: Optional[str] = None):
    """Run the incremental ingestion pipeline for *repo_url*.

    Clones the repository into a temporary directory, diffs HEAD against the
    last ingested SHA recorded on the ``:Repo`` node in Neo4j, re-parses only
    the changed files, refreshes git history and embeddings, and finally
    records the new SHA. Falls back to :func:`full_ingest` when there is no
    previous SHA, or when the diff cannot be computed (e.g. after a
    force-push removed the old SHA from history).

    Args:
        repo_url: Clone URL of the repository.
        branch: Optional branch to check out; defaults to the remote HEAD.
    """
    with TemporaryDirectory() as temp_dir:
        repo_path = Path(temp_dir)

        # 1. Clone the repository into the temp dir.
        try:
            subprocess.run(["git", "clone", repo_url, "."], cwd=repo_path, check=True, capture_output=True)
        except subprocess.CalledProcessError as e:
            logger.error("Failed to clone repository: %s", e)
            return

        if branch:
            try:
                subprocess.run(["git", "checkout", branch], cwd=repo_path, check=True, capture_output=True)
            except subprocess.CalledProcessError as e:
                logger.error("Failed to checkout branch %s: %s", branch, e)
                return

        # Resolve the commit we are ingesting up to.
        try:
            head_sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=repo_path, text=True).strip()
        except subprocess.CalledProcessError:
            logger.error("Failed to get HEAD SHA")
            return

        repo_info = extract_repo_info(repo_url)
        repo_name = repo_info.get("name", "unknown")

        # The last ingested SHA lives on the :Repo node in Neo4j.
        query = "MATCH (r:Repo {name: $name}) RETURN r.repo_sha as sha"
        result = neo4j_client.run_query(query, {"name": repo_name})
        last_sha = result[0]["sha"] if result and result[0]["sha"] else None

        if not last_sha:
            # Never ingested before — nothing to diff against.
            logger.info("No previous SHA recorded for %s; running full ingest.", repo_name)
            full_ingest(repo_url, branch=branch)
            return

        if last_sha == head_sha:
            logger.info("Repository %s already up to date at %s.", repo_name, head_sha[:8])
            return

        logger.info("Found changes: %s -> %s", last_sha[:8], head_sha[:8])

        # 2. Identify changed files. None means the diff itself failed
        # (force-push, shallow history, ...) — safest recovery is a full
        # re-ingest rather than risking a stale graph.
        changed_files = get_changed_files(repo_path, last_sha, head_sha)
        if changed_files is None:
            full_ingest(repo_url, branch=branch)
            return

        if not changed_files:
            logger.info("No files changed (maybe only merge commits or non-code files).")
        else:
            # 3. Re-parse the changed files: purge the stale graph data first
            # so symbols deleted from a file do not linger, then ingest the
            # fresh parse results.
            analyzer = RepositoryAnalyzer()
            for file_path in changed_files:
                graph_ingestion_service.delete_file_data(repo_name, file_path)
            graph_data = analyzer.analyze_files(repo_path, changed_files)
            graph_ingestion_service.ingest_graph_data(graph_data)

        # 4. Ingest git history (new commits).
        logger.info("Ingesting new commits...")
        # We limit to 50 commits to avoid re-ingesting too much history if the
        # gap is large. Ideally we'd stop at last_sha, but process_repository
        # doesn't support that yet.
        process_repository(str(repo_path), branch=branch, max_commits=50)

        # 5. Refresh embeddings across every node type the parser produces.
        logger.info("Updating embeddings...")
        add_embeddings_to_all_nodes(
            node_types=["Function", "Class", "File", "Doc", "Module", "Commit"],
            provider=EmbeddingProvider(settings.EMBEDDING_PROVIDER),
        )

        # 6. Record the new SHA only after every step above succeeded, so a
        # failed run will be retried from the old SHA next time.
        logger.info("Updating Repo SHA to %s...", head_sha)
        update_query = "MATCH (r:Repo {name: $name}) SET r.repo_sha = $sha"
        neo4j_client.run_query(update_query, {"name": repo_name, "sha": head_sha})

        logger.info("Incremental ingestion complete for %s.", repo_name)
0 commit comments