Commit bcfaa7c

Refactor GitHub webhook signature verification and enhance incremental ingestion process
1 parent cbfeafa commit bcfaa7c

7 files changed: 241 additions & 14 deletions


apps/api/routes/v1/integrations.py

Lines changed: 12 additions & 6 deletions
@@ -22,12 +22,18 @@ async def github_webhook(
     if not x_github_event:
         raise BadRequestException(message="Missing X-GitHub-Event header")
 
-    # Always verify signature in production
-    try:
-        await verify_github_signature(request, settings.GITHUB_WEBHOOK_SECRET)
-    except SignatureVerificationError as e:
-        logger.warning(f"GitHub signature verification failed: {str(e)}")
-        raise UnauthorizedException(message=str(e))
+    # Verify signature (skip in development if secret not configured)
+    if settings.GITHUB_WEBHOOK_SECRET:
+        try:
+            await verify_github_signature(request, settings.GITHUB_WEBHOOK_SECRET)
+        except SignatureVerificationError as e:
+            logger.warning(f"GitHub signature verification failed: {str(e)}")
+            raise UnauthorizedException(message=str(e))
+    elif settings.is_production():
+        logger.error("GITHUB_WEBHOOK_SECRET not configured in production!")
+        raise UnauthorizedException(message="Webhook secret not configured")
+    else:
+        logger.warning("Skipping GitHub signature verification (no secret configured in development)")
 
     payload = await request.json()
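
For context, GitHub signs each delivery with an HMAC-SHA256 of the raw request body, delivered in the X-Hub-Signature-256 header. The body of verify_github_signature is not part of this diff, so the following is only a sketch of the typical check, assuming a FastAPI Request and a stand-in exception type:

import hashlib
import hmac

from fastapi import Request


class SignatureVerificationError(Exception):
    """Stand-in for the project's real exception type."""


# Sketch only: the project's verify_github_signature may differ in detail.
async def verify_github_signature_sketch(request: Request, secret: str) -> None:
    body = await request.body()
    expected = "sha256=" + hmac.new(secret.encode(), body, hashlib.sha256).hexdigest()
    received = request.headers.get("X-Hub-Signature-256", "")
    # Constant-time comparison to avoid leaking the digest via timing
    if not hmac.compare_digest(expected, received):
        raise SignatureVerificationError("Signature mismatch")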

packages/app_store/github/webhook.py

Lines changed: 4 additions & 6 deletions
@@ -1,6 +1,6 @@
 import logging
 from typing import Dict, Any, Optional
-from packages.ingest.full_ingest import full_ingest
+from packages.ingest.incremental_ingest import incremental_ingest
 
 logger = logging.getLogger(__name__)
 
@@ -40,9 +40,7 @@ def handle_pull_request(payload: Dict[str, Any]):
     if repo_url and base_branch:
         logger.info(f"Triggering ingestion for merged PR in {repo_url} on branch {base_branch}")
 
-        # TODO: Trigger ingestion
-
-        # Let's define a function that CAN be run in background.
+        # Trigger ingestion via background task
         return {
             "status": "triggered",
             "task": "ingest",
@@ -76,11 +74,11 @@ def handle_push(payload: Dict[str, Any]):
 
 def run_ingestion(repo_url: str, branch: Optional[str]):
     """
-    Wrapper to run full_ingest, suitable for BackgroundTasks.
+    Wrapper to run incremental_ingest, suitable for BackgroundTasks.
     """
     try:
         logger.info(f"Starting background ingestion for {repo_url} branch {branch}")
-        full_ingest(repo_url, branch=branch)
+        incremental_ingest(repo_url, branch=branch)
         logger.info(f"Finished background ingestion for {repo_url} branch {branch}")
     except Exception as e:
         logger.error(f"Error during background ingestion: {e}", exc_info=True)

packages/ingest/commit_decisions.py

Lines changed: 2 additions & 1 deletion
@@ -112,7 +112,8 @@ def _upsert_commit(memory: Memory, repo_name: str, repo_url: str, info: CommitIn
 def _upsert_file(memory: Memory, repo_name: str, file_path: str) -> str:
     # ID format: {repo_name}:{file_path}:file
     fid = f"{repo_name}:{file_path}:file"
-    match = {"path": file_path, "repo_name": repo_name}
+    # Match on id directly to avoid constraint conflicts
+    match = {"id": fid}
     props = {
         "id": fid,
         "path": file_path,

packages/ingest/full_ingest.py

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 import argparse
 from typing import Optional
 from packages.parser.core.repository_analyzer import RepositoryAnalyzer
+from packages.parser.core.graph_ingestion import graph_ingestion_service
 from packages.ingest.commit_decisions import process_repository
 from packages.ingest.add_embeddings import add_embeddings_to_all_nodes
 from packages.memory.embeddings import EmbeddingProvider
packages/ingest/incremental_ingest.py

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
+import logging
+import subprocess
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Optional, List
+
+from packages.parser.core.repository_analyzer import RepositoryAnalyzer
+from packages.parser.core.graph_ingestion import graph_ingestion_service
+from packages.ingest.commit_decisions import process_repository
+from packages.ingest.add_embeddings import add_embeddings_to_all_nodes
+from packages.memory.embeddings import EmbeddingProvider
+from packages.config.settings import Settings
+from packages.parser.utils import extract_repo_info
+from packages.database.graph.graph import neo4j_client
+from packages.ingest.full_ingest import full_ingest
+
+settings = Settings()
+logger = logging.getLogger(__name__)
+
+def get_changed_files(repo_path: Path, base_sha: str, head_sha: str) -> Optional[List[str]]:
+    """Get list of changed files between two commits. Returns None on error."""
+    try:
+        result = subprocess.run(
+            ["git", "diff", "--name-only", base_sha, head_sha],
+            cwd=repo_path,
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        files = [f.strip() for f in result.stdout.splitlines() if f.strip()]
+        return files
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Error getting changed files: {e}")
+        return None
+
+def incremental_ingest(repo_url: str, branch: Optional[str] = None):
+    """
+    Run incremental ingestion pipeline.
+    """
+    with TemporaryDirectory() as temp_dir:
+        repo_path = Path(temp_dir)
+
+        # Clone the repository
+        try:
+            subprocess.run(["git", "clone", repo_url, "."], cwd=repo_path, check=True, capture_output=True)
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Failed to clone repository: {e}")
+            return
+
+        if branch:
+            try:
+                subprocess.run(["git", "checkout", branch], cwd=repo_path, check=True, capture_output=True)
+            except subprocess.CalledProcessError as e:
+                logger.error(f"Failed to checkout branch {branch}: {e}")
+                return
+
+        # Get current HEAD SHA
+        try:
+            head_sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=repo_path, text=True).strip()
+        except subprocess.CalledProcessError:
+            logger.error("Failed to get HEAD SHA")
+            return
+
+        # Get Repo Info
+        repo_info = extract_repo_info(repo_url)
+        repo_name = repo_info.get("name", "unknown")
+
+        # Get last ingested SHA from Neo4j
+        query = "MATCH (r:Repo {name: $name}) RETURN r.repo_sha as sha"
+        result = neo4j_client.run_query(query, {"name": repo_name})
+
+        last_sha = result[0]["sha"] if result and result[0]["sha"] else None
+
+        if not last_sha:
+            full_ingest(repo_url, branch=branch)
+            return
+
+        if last_sha == head_sha:
+            return
+
+        print(f"Found changes: {last_sha[:8]} -> {head_sha[:8]}")
+
+        # 2. Identify Changed Files
+        changed_files = get_changed_files(repo_path, last_sha, head_sha)
+
+        if changed_files is None:
+            full_ingest(repo_url, branch=branch)
+            return
+
+        if not changed_files:
+            logger.info("No files changed (maybe only merge commits or non-code files).")
+        else:
+            # 3. Parse Changed Files
+            analyzer = RepositoryAnalyzer()
+
+            # First, delete old data for these files
+            for file_path in changed_files:
+                graph_ingestion_service.delete_file_data(repo_name, file_path)
+
+            # Then parse and ingest new data
+            graph_data = analyzer.analyze_files(repo_path, changed_files)
+            graph_ingestion_service.ingest_graph_data(graph_data)
+
+        # 4. Ingest Git History (New Commits)
+        print("Ingesting new commits...")
+        # We limit to 50 commits to avoid re-ingesting too much history if the gap is large.
+        # Ideally we'd stop at last_sha, but process_repository doesn't support that yet.
+        process_repository(str(repo_path), branch=branch, max_commits=50)
+
+        # 5. Update Embeddings
+        print("Updating embeddings...")
+        add_embeddings_to_all_nodes(
+            node_types=["Function", "Class", "File", "Doc", "Module", "Commit"],
+            provider=EmbeddingProvider(settings.EMBEDDING_PROVIDER)
+        )
+
+        # 6. Update Repo SHA
+        print(f"Updating Repo SHA to {head_sha}...")
+        update_query = "MATCH (r:Repo {name: $name}) SET r.repo_sha = $sha"
+        neo4j_client.run_query(update_query, {"name": repo_name, "sha": head_sha})
+
+        print("\n" + "="*50)
+        print("✅ Incremental Ingestion Complete!")
+        print("="*50)

packages/parser/core/graph_ingestion.py

Lines changed: 22 additions & 1 deletion
@@ -227,8 +227,29 @@ def clear_repository_data(self, repo_name: str):
         """
 
         self.client.run_query(query, {"repo_name": repo_name})
-        print(f"Cleared data for repository: {repo_name}")
 
+    def delete_file_data(self, repo_name: str, file_path: str):
+        """
+        Delete all data associated with a specific file
+
+        Args:
+            repo_name: Name of the repository
+            file_path: Relative path of the file
+        """
+        query = """
+        MATCH (f:File {repo_name: $repo_name, path: $file_path})
+        OPTIONAL MATCH (f)-[:CONTAINS_CLASS]->(c:Class)
+        OPTIONAL MATCH (f)-[:CONTAINS_FUNCTION]->(fn:Function)
+        OPTIONAL MATCH (f)-[:HAS_TEST]->(t:Test)
+        OPTIONAL MATCH (f)-[:HAS_DOC]->(d:Doc)
+        DETACH DELETE f, c, fn, t, d
+        """
+
+        self.client.run_query(query, {
+            "repo_name": repo_name,
+            "file_path": file_path
+        })
+
     def get_repository_stats(self, repo_name: str) -> Dict[str, int]:
         """
         Get comprehensive statistics for a repository in the graph
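
delete_file_data is called once per changed file by the incremental pipeline before re-ingestion. Note the Cypher only deletes nodes reachable through the four listed relationship types; anything linked to the File differently would survive. A short usage sketch (repo name and path are illustrative):

from packages.parser.core.graph_ingestion import graph_ingestion_service

# Drop the stale File subgraph (the File plus its Class, Function, Test and
# Doc children) so the re-parsed version can be ingested cleanly.
graph_ingestion_service.delete_file_data("example-repo", "src/app.py")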

packages/parser/core/repository_analyzer.py

Lines changed: 76 additions & 0 deletions
@@ -50,6 +50,82 @@ def _initialize_parsers(self):
         # Add more parsers as they become available
         # Java, Go, etc.
 
+    def analyze_files(self, repo_path: Path, file_paths: list[str], repo_context: Optional[dict] = None) -> GraphData:
+        """
+        Analyze specific files in the repository.
+
+        Args:
+            repo_path: Path to the repository root
+            file_paths: List of relative file paths to analyze
+            repo_context: Optional repository context (if already computed)
+
+        Returns:
+            GraphData containing nodes and relationships for the specified files
+        """
+        repo_path = Path(repo_path).resolve()
+
+        if repo_context is None:
+            repo_context = self._get_repo_context(repo_path)
+
+        graph_data = GraphData()
+
+        # Create Repo node (needed for linking)
+        repo_node = self._create_repo_node(repo_path, repo_context)
+        graph_data.add_node(repo_node)
+
+        files_parsed = 0
+
+        for rel_path in file_paths:
+            file_path = repo_path / rel_path
+
+            if not file_path.exists() or not is_code_file(file_path):
+                continue
+
+            try:
+                # Detect language
+                language = detect_language(file_path)
+
+                if language is None:
+                    continue
+
+                # Check if we have a parser for this language
+                parser = self.parsers.get(language)
+
+                if parser is None:
+                    continue
+
+                # Read file content
+                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                    content = f.read()
+
+                # Parse the file
+                file_graph_data = parser.parse_file(Path(rel_path), content, repo_context)
+
+                # Merge into main graph data
+                for node in file_graph_data.nodes:
+                    graph_data.add_node(node)
+
+                for rel in file_graph_data.relationships:
+                    graph_data.add_relationship(rel)
+
+                # Add Repo -> File relationship
+                file_nodes = [n for n in file_graph_data.nodes if hasattr(n, 'path')]
+                for file_node in file_nodes:
+                    from packages.parser.models import Relationship, RelationshipType
+                    graph_data.add_relationship(Relationship(
+                        source_id=repo_node.id,
+                        target_id=file_node.id,
+                        type=RelationshipType.HAS_FILE
+                    ))
+
+                files_parsed += 1
+
+            except Exception as e:
+                print(f"Error parsing {file_path}: {e}")
+                continue
+
+        return graph_data
+
     def analyze_repository(self, repo_path: str | Path, cleanup_after: bool = True) -> GraphData:
         """
         Analyze an entire repository and extract graph data
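
A usage sketch for the new method, assuming a local checkout (paths are illustrative):

from pathlib import Path

from packages.parser.core.repository_analyzer import RepositoryAnalyzer

analyzer = RepositoryAnalyzer()

# Re-parse two files; the returned GraphData carries the Repo node, the
# per-file nodes, and the Repo-[:HAS_FILE]->(File) links added above.
graph_data = analyzer.analyze_files(
    Path("/tmp/checkout"),
    ["packages/ingest/incremental_ingest.py", "packages/app_store/github/webhook.py"],
)
print(f"nodes={len(graph_data.nodes)}, relationships={len(graph_data.relationships)}")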
