Skip to content

Commit 3117d55

Browse files
authored
Merge pull request #13 from SecrinLabs/feat/store-pr-metadata-as-knowledge-node
Add pull request metadata ingestion and update related models and edges
2 parents ec084d5 + 75f693b commit 3117d55

9 files changed

Lines changed: 175 additions & 6 deletions

File tree

packages/app_store/github/webhook.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
22
from typing import Dict, Any, Optional
33
from packages.ingest.incremental_ingest import incremental_ingest
4+
from packages.ingest.pr_metadata import ingest_pull_request_metadata
45

56
logger = logging.getLogger(__name__)
67

@@ -29,6 +30,15 @@ def handle_pull_request(payload: Dict[str, Any]):
2930

3031
logger.info(f"Processing PR action: {action}, merged: {merged}")
3132

33+
# First, ingest PR metadata for any PR action
34+
repo_info = payload.get("repository", {})
35+
try:
36+
pr_data = ingest_pull_request_metadata(pull_request, repo_info)
37+
logger.info(f"Ingested PR metadata: {pr_data}")
38+
except Exception as e:
39+
logger.error(f"Failed to ingest PR metadata: {e}", exc_info=True)
40+
41+
# Then handle merged PR actions
3242
if action == "closed" and merged:
3343
# PR was merged
3444
repo_info = payload.get("repository", {})
@@ -48,7 +58,7 @@ def handle_pull_request(payload: Dict[str, Any]):
4858
"branch": base_branch
4959
}
5060

51-
return {"status": "ignored", "reason": "Not a merged PR"}
61+
return {"status": "processed", "reason": "PR metadata ingested"}
5262

5363
def handle_push(payload: Dict[str, Any]):
5464
"""

packages/ingest/add_embeddings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def add_embeddings_to_all_nodes(
102102
provider: Embedding provider to use
103103
"""
104104
if node_types is None:
105-
node_types = ["Function", "Class", "File", "Doc", "Module", "Commit"]
105+
node_types = ["Function", "Class", "File", "Doc", "Module", "Commit", "PullRequest"]
106106

107107
print(f"Using {provider.value} embeddings")
108108
print(f"Embedding dimension: {settings.EMBEDDING_DIMENSION}")
@@ -158,7 +158,7 @@ def main():
158158
parser.add_argument(
159159
"--node-types",
160160
nargs="+",
161-
choices=["Function", "Class", "File", "Doc", "Module", "Commit"],
161+
choices=["Function", "Class", "File", "Doc", "Module", "Commit", "PullRequest"],
162162
help="Node types to process (default: all)"
163163
)
164164
parser.add_argument(

packages/ingest/edges.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,6 @@ class Edge(Enum):
2020
# Issues / PRs / context
2121
REFERENCES = "REFERENCES"
2222
RELATED_TO = "RELATED_TO"
23+
CREATED_BY = "CREATED_BY"
24+
MERGED_TO = "MERGED_TO"
25+
INCLUDES = "INCLUDES"

packages/ingest/full_ingest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def full_ingest(repo_path: str, branch: Optional[str] = None, max_commits: int =
4444
# We need to ensure vector indexes exist first (usually handled by migrations, but let's assume they exist)
4545
# add_embeddings_to_all_nodes handles all node types
4646
add_embeddings_to_all_nodes(
47-
node_types=["Function", "Class", "File", "Doc", "Module", "Commit"],
47+
node_types=["Function", "Class", "File", "Doc", "Module", "Commit", "PullRequest"],
4848
provider=EmbeddingProvider(settings.EMBEDDING_PROVIDER)
4949
)
5050

packages/ingest/incremental_ingest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def incremental_ingest(repo_url: str, branch: Optional[str] = None):
110110
# 5. Update Embeddings
111111
print("Updating embeddings...")
112112
add_embeddings_to_all_nodes(
113-
node_types=["Function", "Class", "File", "Doc", "Module", "Commit"],
113+
node_types=["Function", "Class", "File", "Doc", "Module", "Commit", "PullRequest"],
114114
provider=EmbeddingProvider(settings.EMBEDDING_PROVIDER)
115115
)
116116

packages/ingest/pr_metadata.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
from datetime import datetime
2+
from typing import Dict, Any, Optional
3+
from packages.memory.memory import Memory
4+
from packages.ingest.edges import Edge
5+
from packages.parser.utils import extract_repo_info
6+
7+
def ingest_pull_request_metadata(
    pr_payload: Dict[str, Any],
    repo_payload: Dict[str, Any],
    memory: Optional[Memory] = None
) -> Dict[str, Any]:
    """
    Ingest pull request metadata into the knowledge graph.

    Upserts a ``PullRequest`` node, the ``Repository`` node it belongs to and a
    ``Person`` node for its author, then links the pull request to both.

    Webhook payloads may carry explicit ``null`` values (for example a pull
    request without a description), so every optional field is normalised
    with ``or`` rather than relying on the ``dict.get`` default, which only
    applies when the key is absent.

    Args:
        pr_payload: The pull_request object from GitHub webhook
        repo_payload: The repository object from GitHub webhook
        memory: Optional Memory instance (creates new if not provided)

    Returns:
        Dictionary with the PR node id returned by ``memory.upsert_node`` and
        the PR number, title, author, state and merged flag.

    Raises:
        ValueError: If the repository URL or the pull request number is
            missing from the payloads.
    """
    if memory is None:
        memory = Memory()

    # Extract repository info
    repo_url = repo_payload.get("clone_url") or repo_payload.get("html_url") or ""
    if not repo_url:
        raise ValueError("Repository URL not found in payload")

    repo_info = extract_repo_info(repo_url)
    repo_name = repo_info.get("name", "unknown")

    # Extract PR metadata. Without a number the node id would degrade to
    # "<repo>:pr:None", so fail before anything is written to the graph.
    pr_number = pr_payload.get("number")
    if pr_number is None:
        raise ValueError("Pull request number not found in payload")

    pr_title = pr_payload.get("title") or ""
    pr_body = pr_payload.get("body") or ""
    pr_state = pr_payload.get("state") or "open"
    pr_merged = bool(pr_payload.get("merged"))
    pr_merged_at = pr_payload.get("merged_at")

    # Extract author info; fall back to a synthetic address when the payload
    # carries no (or a null) email so the Person node always has a match key.
    pr_author_data = pr_payload.get("user") or {}
    pr_author = pr_author_data.get("login") or "unknown"
    pr_author_email = pr_author_data.get("email") or f"{pr_author}@github.com"

    # Extract branch info
    base_branch = (pr_payload.get("base") or {}).get("ref")
    head_branch = (pr_payload.get("head") or {}).get("ref")

    # Create stable PR ID
    pr_id = f"{repo_name}:pr:{pr_number}"

    # Build PR content for embedding
    pr_content = "\n".join(
        [
            f"Pull Request #{pr_number}: {pr_title}",
            "",
            pr_body,
            "",
            f"Author: {pr_author}",
            f"State: {pr_state}",
            f"Merged: {pr_merged}",
            f"Base Branch: {base_branch}",
            f"Head Branch: {head_branch}",
            "",
        ]
    )

    # Upsert PR node
    pr_node_id = memory.upsert_node(
        "PullRequest",
        match_props={"pr_number": pr_number, "repo_url": repo_url},
        set_props={
            "id": pr_id,
            "pr_number": pr_number,
            "title": pr_title,
            "body": pr_body,
            "content": pr_content,
            "author": pr_author,
            "repo_url": repo_url,
            "state": pr_state,
            "merged": pr_merged,
            "merged_at": pr_merged_at,
            "base_branch": base_branch,
            "head_branch": head_branch,
        },
    )

    # Upsert repository node
    repo_full_name = repo_info.get("full_name", "unknown")
    repo_node_id = f"repo:{repo_name}"
    memory.upsert_node(
        "Repository",
        match_props={"url": repo_url},
        set_props={
            "id": repo_node_id,
            "url": repo_url,
            "name": repo_name,
            "owner": repo_info.get("owner", "unknown"),
            "full_name": repo_full_name,
            "content": repo_full_name,
        },
    )

    # Upsert author as Person node
    author_id = f"person:{pr_author}"
    memory.upsert_node(
        "Person",
        match_props={"email": pr_author_email},
        set_props={
            "id": author_id,
            "name": pr_author,
            "email": pr_author_email,
            "content": f"{pr_author} ({pr_author_email})",
        },
    )

    # Create relationships
    # NOTE(review): the source end is whatever upsert_node returned while the
    # target ends are the "id" property strings; presumably both identify
    # nodes the same way -- confirm against Memory.link.
    memory.link(pr_node_id, Edge.BELONGS_TO, repo_node_id)
    memory.link(pr_node_id, Edge.CREATED_BY, author_id)

    # If PR is merged and includes commits, link to commits
    # This would require additional commit SHAs from the payload
    # For now, we'll just store the PR metadata

    return {
        "pr_node_id": pr_node_id,
        "pr_number": pr_number,
        "title": pr_title,
        "author": pr_author,
        "state": pr_state,
        "merged": pr_merged,
    }
130+
131+
132+
__all__ = ["ingest_pull_request_metadata"]

packages/parser/core/graph_ingestion.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
ModuleNode,
1717
TestNode,
1818
IssueNode,
19+
PullRequestNode,
1920
Relationship,
2021
)
2122

@@ -64,6 +65,7 @@ def _create_constraints(self):
6465
"CREATE CONSTRAINT IF NOT EXISTS FOR (m:Module) REQUIRE m.id IS UNIQUE",
6566
"CREATE CONSTRAINT IF NOT EXISTS FOR (t:Test) REQUIRE t.id IS UNIQUE",
6667
"CREATE CONSTRAINT IF NOT EXISTS FOR (i:Issue) REQUIRE i.id IS UNIQUE",
68+
"CREATE CONSTRAINT IF NOT EXISTS FOR (pr:PullRequest) REQUIRE pr.id IS UNIQUE",
6769
]
6870

6971
for constraint in constraints:
@@ -209,6 +211,8 @@ def _infer_label_from_id(self, node_id: str) -> str:
209211
return "Test"
210212
elif ":issue:" in node_id:
211213
return "Issue"
214+
elif ":pr:" in node_id:
215+
return "PullRequest"
212216
else:
213217
# Default fallback
214218
return "Node"
@@ -269,14 +273,16 @@ def get_repository_stats(self, repo_name: str) -> Dict[str, int]:
269273
OPTIONAL MATCH (f)-[:HAS_DOC]->(d:Doc)
270274
OPTIONAL MATCH (commit:Commit)-[:TOUCHED]->(f)
271275
OPTIONAL MATCH (f)-[:IMPORTS]->(p:Package)
276+
OPTIONAL MATCH (pr:PullRequest)-[:BELONGS_TO]->(r)
272277
RETURN
273278
count(DISTINCT f) as files,
274279
count(DISTINCT c) as classes,
275280
count(DISTINCT fn) as functions,
276281
count(DISTINCT t) as tests,
277282
count(DISTINCT d) as docs,
278283
count(DISTINCT commit) as commits,
279-
count(DISTINCT p) as packages
284+
count(DISTINCT p) as packages,
285+
count(DISTINCT pr) as pull_requests
280286
"""
281287

282288
result = self.client.run_query(query, {"repo_name": repo_name})
@@ -291,6 +297,7 @@ def get_repository_stats(self, repo_name: str) -> Dict[str, int]:
291297
"docs": record["docs"],
292298
"commits": record["commits"],
293299
"packages": record["packages"],
300+
"pull_requests": record["pull_requests"],
294301
}
295302

296303
return {
@@ -301,6 +308,7 @@ def get_repository_stats(self, repo_name: str) -> Dict[str, int]:
301308
"docs": 0,
302309
"commits": 0,
303310
"packages": 0,
311+
"pull_requests": 0,
304312
}
305313

306314

packages/parser/models/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
TestNode,
1010
DocNode,
1111
IssueNode,
12+
PullRequestNode,
1213
PackageNode,
1314
)
1415
from .relationships import (
@@ -28,6 +29,7 @@
2829
"TestNode",
2930
"DocNode",
3031
"IssueNode",
32+
"PullRequestNode",
3133
"PackageNode",
3234
"Relationship",
3335
"RelationshipType",

packages/parser/models/nodes.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ class NodeType(str, Enum):
1515
TEST = "Test"
1616
DOC = "Doc"
1717
ISSUE = "Issue"
18+
PULL_REQUEST = "PullRequest"
1819
PACKAGE = "Package"
1920

2021

@@ -102,6 +103,19 @@ class IssueNode(BaseNode):
102103
state: str # e.g., 'open', 'closed'
103104

104105

106+
class PullRequestNode(BaseNode):
    """Node model holding the metadata of a single pull request."""

    pr_number: int
    title: str
    # A pull request may have no description, in which case the hosting API
    # reports the body as null; accept None instead of rejecting the record.
    body: Optional[str]
    author: str
    repo_url: str
    # Raw state string from the payload, e.g. 'open' or 'closed'; whether the
    # PR was merged is carried by the separate `merged` flag below.
    state: str
    merged: bool = False
    merged_at: Optional[datetime] = None
    base_branch: Optional[str] = None
    head_branch: Optional[str] = None
117+
118+
105119
class PackageNode(BaseNode):
106120
name: str
107121
version: str

0 commit comments

Comments
 (0)