Skip to content

Commit 7497a89

Browse files
committed
feat: fallback recursive chunker
1 parent d6181f0 commit 7497a89

10 files changed

Lines changed: 253 additions & 168 deletions

File tree

dev/script.ts

Lines changed: 0 additions & 39 deletions
This file was deleted.

packages/core/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@hpbyte/h-codex-core",
3-
"version": "0.1.6",
3+
"version": "0.1.7",
44
"description": "Core indexing and search functionality for h-codex",
55
"author": "Htoo Pyae Lwin",
66
"license": "MIT",
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
import type { SyntaxNode } from 'tree-sitter'
2+
import crypto from 'crypto'
3+
4+
import type { CodeChunkInsert, ChunkParams, SupportedLanguage } from '../../types'
5+
import { maxChunkSize, coalesce, countLengthWithoutWhitespace } from '../../utils'
6+
import { treeSitterParser } from './tree-sitter-parser'
7+
import { ChunkRange } from './chunk-range'
8+
9+
/**
 * Splits a source file into chunks by walking its tree-sitter concrete syntax tree (CST).
 *
 * The private fields hold the file currently being processed. They are reassigned on
 * every `chunk` call, after parsing has resolved; the rest of the work is synchronous.
 */
export class CSTChunker {
  private filePath = ''
  private fileContent = ''
  private projectId = ''
  private language: SupportedLanguage | null = null

  /**
   * Parse the file and return its chunks formatted for insertion.
   *
   * @param params - file path, owning project id, and the language to parse with
   * @returns the formatted chunks, after small neighbours have been coalesced
   */
  async chunk({ filePath, projectId, language }: ChunkParams): Promise<CodeChunkInsert[]> {
    const cstRootNode = await treeSitterParser.parseFile(filePath, language)

    this.filePath = filePath
    this.language = language
    this.fileContent = cstRootNode.text
    this.projectId = projectId

    const chunks = this.chunkNode(cstRootNode)
    const processedChunks = this.processChunks(chunks)

    return processedChunks
  }

  /**
   * CST-based chunking
   * - each child node is traversed and greedily bundled with its siblings
   * - if a node is too large, its children are chunked recursively
   * - if a too-large node is a leaf, it is emitted on its own rather than dropped
   * - otherwise the node is appended to the current chunk
   */
  private chunkNode(node: SyntaxNode): ChunkRange[] {
    const chunks: ChunkRange[] = []
    let currentChunk: ChunkRange | null = null

    // flush the chunk accumulated so far (if it has any content)
    const addCurrentChunk = () => {
      if (currentChunk && !currentChunk.isEmpty) {
        chunks.push(currentChunk)
        currentChunk = null
      }
    }

    for (const child of node.children) {
      const childChunkRange = new ChunkRange(
        child.startPosition.row,
        child.endPosition.row,
        child.type
      )
      const childText = child.text

      if (childText.length > maxChunkSize) {
        addCurrentChunk()

        if (child.children.length === 0) {
          // An oversized leaf (e.g. a long comment or string literal) has no children
          // to split on; recursing would return nothing and silently lose its text,
          // so keep it as a single chunk even though it exceeds the size limit.
          chunks.push(childChunkRange)
        } else {
          chunks.push(...this.chunkNode(child))
        }
        continue
      }

      const currentChunkText = currentChunk?.extract(this.fileContent) ?? ''
      const wouldExceedSize = currentChunkText.length + childText.length > maxChunkSize

      if (wouldExceedSize) {
        addCurrentChunk()
        currentChunk = childChunkRange
      } else {
        currentChunk = currentChunk ? currentChunk.add(childChunkRange) : childChunkRange
      }
    }

    addCurrentChunk()

    return chunks
  }

  /**
   * Try to combine very small chunks (under the coalesce threshold, measured without
   * whitespace) with the chunk that follows them. If combining would exceed the max
   * chunk size, or there is no following chunk, the small chunk is kept as is.
   */
  private processChunks(chunks: ChunkRange[]): CodeChunkInsert[] {
    if (chunks.length === 0) return []

    const processedChunks: CodeChunkInsert[] = []

    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i]!
      const chunkText = chunk.extract(this.fileContent)

      // big enough on its own
      if (countLengthWithoutWhitespace(chunkText) >= coalesce) {
        processedChunks.push(this.formatAsCodeChunk(chunk, chunkText))
        continue
      }

      if (i < chunks.length - 1) {
        const nextChunk = chunks[i + 1]!
        const combinedChunk = chunk.add(nextChunk)
        const combinedText = combinedChunk.extract(this.fileContent)

        if (combinedText.length <= maxChunkSize) {
          processedChunks.push(this.formatAsCodeChunk(combinedChunk, combinedText))
          // skip the next chunk as it's combined with the current one
          i++
          continue
        }
      }

      // can't coalesce, keep as is
      processedChunks.push(this.formatAsCodeChunk(chunk, chunkText))
    }

    return processedChunks
  }

  /**
   * Build the insert record for one chunk from its range, its extracted text,
   * and the per-file state set by `chunk`.
   */
  private formatAsCodeChunk(chunk: ChunkRange, content: string): CodeChunkInsert {
    return {
      content,
      filePath: this.filePath,
      projectId: this.projectId,
      startLine: chunk.start + 1, // tree-sitter rows are 0-based; stored lines are 1-based
      endLine: chunk.end + 1,
      nodeType: chunk.nodeType,
      language: this.language!,
      hash: crypto.createHash('sha256').update(content).digest('hex'),
      size: content.length,
    }
  }
}
128+
129+
// Module-level CSTChunker instance, exported so importers share a single object.
export const cstChunker = new CSTChunker()
130+
Lines changed: 11 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -1,144 +1,26 @@
11
import * as path from 'path'
2-
import type { SyntaxNode } from 'tree-sitter'
3-
import crypto from 'crypto'
42

5-
import type { CodeChunkInsert, SupportedLanguage } from '../../types'
6-
import { treeSitterParser } from './tree-sitter-parser'
7-
import { ChunkRange } from './chunk-range'
3+
import { cstChunker } from './cst-chunker'
4+
import { recursiveChunker } from './recursive-chunker'
85

9-
export class CSTChunker {
10-
private maxChunkSize = 512 * 3
11-
private coalesce = 50
12-
private filePath = ''
13-
private fileContent = ''
14-
private projectId = ''
15-
private language: SupportedLanguage | null = null
6+
import type { CodeChunkInsert, SupportedLanguage } from '../../types'
167

8+
export class Chunker {
179
async processFile(filePath: string, projectId: string): Promise<CodeChunkInsert[]> {
1810
try {
1911
const language = this.detectLanguage(filePath)
20-
const cstRootNode = await treeSitterParser.parseFile(filePath, language)
21-
22-
this.filePath = filePath
23-
this.language = language
24-
this.fileContent = cstRootNode.text
25-
this.projectId = projectId
2612

27-
const chunks = this.chunkNode(cstRootNode)
28-
const processedChunks = this.processChunks(chunks)
29-
30-
return processedChunks
13+
return await cstChunker.chunk({ filePath, projectId, language })
3114
} catch (error) {
3215
if (error instanceof Error && error.message.includes('Unsupported language')) {
33-
return []
16+
return await recursiveChunker.chunk({ filePath, projectId })
3417
}
3518

3619
console.error(`Error processing file ${filePath}:`, error)
3720
return []
3821
}
3922
}
4023

41-
/**
42-
* AST-based chunking
43-
* - each nodes are traversed and greedily bundled together
44-
* - if a node is too large, recursively chunk its children
45-
* - else, concatenate the current chunk to the node
46-
*/
47-
private chunkNode(node: SyntaxNode): ChunkRange[] {
48-
const chunks: ChunkRange[] = []
49-
let currentChunk: ChunkRange | null = null
50-
51-
const addCurrentChunk = () => {
52-
if (currentChunk && !currentChunk.isEmpty) {
53-
chunks.push(currentChunk)
54-
currentChunk = null
55-
}
56-
}
57-
58-
for (const child of node.children) {
59-
const childChunkRange = new ChunkRange(
60-
child.startPosition.row,
61-
child.endPosition.row,
62-
child.type
63-
)
64-
const childText = child.text
65-
66-
if (childText.length > this.maxChunkSize) {
67-
addCurrentChunk()
68-
chunks.push(...this.chunkNode(child))
69-
continue
70-
}
71-
72-
const currentChunkText = currentChunk?.extract(this.fileContent) ?? ''
73-
const wouldExceedSize = currentChunkText.length + childText.length > this.maxChunkSize
74-
75-
if (wouldExceedSize) {
76-
addCurrentChunk()
77-
currentChunk = childChunkRange
78-
} else {
79-
currentChunk = currentChunk ? currentChunk.add(childChunkRange) : childChunkRange
80-
}
81-
}
82-
83-
addCurrentChunk()
84-
85-
return chunks
86-
}
87-
88-
/**
89-
* try to combine very small chunks (under the coalesce threshold) with adjacent chunks
90-
* but if combing will exceed the max chunk size, keep as is
91-
*/
92-
private processChunks(chunks: ChunkRange[]): CodeChunkInsert[] {
93-
if (chunks.length === 0) return []
94-
95-
const processedChunks: CodeChunkInsert[] = []
96-
97-
const countLengthWithoutWhitespace = (s: string) => s.replace(/\s/g, '').length
98-
99-
for (let i = 0; i < chunks.length; i++) {
100-
const chunk = chunks[i]!
101-
const chunkText = chunk.extract(this.fileContent)
102-
103-
if (countLengthWithoutWhitespace(chunkText) >= this.coalesce) {
104-
processedChunks.push(this.formatAsCodeChunk(chunk, chunkText))
105-
continue
106-
}
107-
108-
if (i < chunks.length - 1) {
109-
const nextChunk = chunks[i + 1]!
110-
const combinedChunk = chunk.add(nextChunk)
111-
const combinedText = combinedChunk.extract(this.fileContent)
112-
113-
if (combinedText.length <= this.maxChunkSize) {
114-
processedChunks.push(this.formatAsCodeChunk(combinedChunk, combinedText))
115-
// skip the next chunk as it's combined with the current one
116-
i++
117-
continue
118-
}
119-
}
120-
121-
// can't coalesce, keep as is
122-
processedChunks.push(this.formatAsCodeChunk(chunk, chunkText))
123-
}
124-
125-
return processedChunks
126-
}
127-
128-
private formatAsCodeChunk(chunk: ChunkRange, content: string): CodeChunkInsert {
129-
return {
130-
content,
131-
filePath: this.filePath,
132-
projectId: this.projectId,
133-
startLine: chunk.start + 1, // tree-sitter index starts from 0
134-
endLine: chunk.end + 1,
135-
nodeType: chunk.nodeType,
136-
language: this.language!,
137-
hash: crypto.createHash('sha256').update(content).digest('hex'),
138-
size: content.length,
139-
}
140-
}
141-
14224
private detectLanguage(filePath: string): SupportedLanguage {
14325
const extension = path.extname(filePath).toLowerCase()
14426

@@ -148,15 +30,18 @@ export class CSTChunker {
14830
case '.tsx':
14931
return 'typescript-react'
15032
case '.js':
33+
case '.mjs':
34+
case '.cjs':
15135
return 'javascript'
15236
case '.jsx':
15337
return 'javascript-react'
15438
case '.py':
15539
return 'python'
15640
default:
157-
throw new Error(`Unsupported language for file: ${filePath}`)
41+
throw new Error(`Unsupported language for filepath ${filePath}`)
15842
}
15943
}
16044
}
16145

162-
export const chunker = new CSTChunker()
46+
// Module-level Chunker instance, exported so importers share a single object.
export const chunker = new Chunker()
47+

0 commit comments

Comments
 (0)