11import * as path from 'path'
2- import type { SyntaxNode } from 'tree-sitter'
3- import crypto from 'crypto'
42
5- import type { CodeChunkInsert , SupportedLanguage } from '../../types'
6- import { treeSitterParser } from './tree-sitter-parser'
7- import { ChunkRange } from './chunk-range'
3+ import { cstChunker } from './cst-chunker'
4+ import { recursiveChunker } from './recursive-chunker'
85
9- export class CSTChunker {
10- private maxChunkSize = 512 * 3
11- private coalesce = 50
12- private filePath = ''
13- private fileContent = ''
14- private projectId = ''
15- private language : SupportedLanguage | null = null
6+ import type { CodeChunkInsert , SupportedLanguage } from '../../types'
167
8+ export class Chunker {
/**
 * Chunks one source file into CodeChunkInsert rows.
 *
 * Lines marked `-` are the previous in-class implementation (parse with
 * treeSitterParser, then chunkNode + processChunks); lines marked `+` replace it.
 * In the new version the language is detected from filePath and the work is
 * delegated to cstChunker.chunk. If an Error whose message contains
 * 'Unsupported language' is thrown, the file is handed to recursiveChunker.chunk
 * instead (previously this case returned an empty array). Any other error is
 * logged with console.error and an empty array is returned.
 *
 * @param filePath - path of the file to chunk; passed to detectLanguage and to the chunkers
 * @param projectId - forwarded unchanged to whichever chunker handles the file
 * @returns the chunks produced by the selected chunker, or [] after a logged failure
 */
179 async processFile ( filePath : string , projectId : string ) : Promise < CodeChunkInsert [ ] > {
1810 try {
1911 const language = this . detectLanguage ( filePath ) // throws for extensions it does not map
20- const cstRootNode = await treeSitterParser . parseFile ( filePath , language )
21-
22- this . filePath = filePath
23- this . language = language
24- this . fileContent = cstRootNode . text
25- this . projectId = projectId
2612
27- const chunks = this . chunkNode ( cstRootNode )
28- const processedChunks = this . processChunks ( chunks )
29-
30- return processedChunks
13+ return await cstChunker . chunk ( { filePath, projectId, language } ) // awaited so a rejection is handled by the catch below
3114 } catch ( error ) {
3215 if ( error instanceof Error && error . message . includes ( 'Unsupported language' ) ) { // matched on message text, so a matching error raised inside cstChunker.chunk takes this branch too
33- return [ ]
16+ return await recursiveChunker . chunk ( { filePath , projectId } ) // NOTE(review): this runs inside the catch block, so a rejection here is neither logged nor turned into [] - it propagates to the caller
3417 }
3518
3619 console . error ( `Error processing file ${ filePath } :` , error )
3720 return [ ]
3821 }
3922 }
4023
41- /**
42- * AST-based chunking: turns the direct children of `node` into a list of ChunkRange row ranges
43- * - each child node is traversed in order and greedily bundled into the current chunk
44- * - if a child's text is longer than maxChunkSize, the current chunk is flushed and the child's own children are chunked recursively
45- * - otherwise the child is added to the current chunk, or starts a new chunk when adding it would exceed maxChunkSize
46- */
47- private chunkNode ( node : SyntaxNode ) : ChunkRange [ ] {
48- const chunks : ChunkRange [ ] = [ ]
49- let currentChunk : ChunkRange | null = null
50-
51- const addCurrentChunk = ( ) => { // flushes the pending chunk into `chunks`; empty or missing chunks are not pushed
52- if ( currentChunk && ! currentChunk . isEmpty ) {
53- chunks . push ( currentChunk )
54- currentChunk = null
55- }
56- }
57-
58- for ( const child of node . children ) {
59- const childChunkRange = new ChunkRange ( // row range of this child, tagged with the child's node type
60- child . startPosition . row ,
61- child . endPosition . row ,
62- child . type
63- )
64- const childText = child . text
65-
66- if ( childText . length > this . maxChunkSize ) { // child alone is over the limit: flush what we have and descend into it
67- addCurrentChunk ( )
68- chunks . push ( ...this . chunkNode ( child ) ) // NOTE(review): if this oversized child has no children, the recursive call returns [] and the child yields no chunk
69- continue
70- }
71-
72- const currentChunkText = currentChunk ?. extract ( this . fileContent ) ?? '' // text currently covered by the pending chunk, '' when there is none
73- const wouldExceedSize = currentChunkText . length + childText . length > this . maxChunkSize // compares string lengths, not line counts
74-
75- if ( wouldExceedSize ) {
76- addCurrentChunk ( )
77- currentChunk = childChunkRange
78- } else {
79- currentChunk = currentChunk ? currentChunk . add ( childChunkRange ) : childChunkRange // presumably add() returns a range covering both - confirm in ./chunk-range
80- }
81- }
82-
83- addCurrentChunk ( ) // flush whatever is still pending after the last child
84-
85- return chunks
86- }
87-
88- /**
89- * try to combine very small chunks (non-whitespace length under the coalesce threshold) with the chunk that follows them
90- * but if combining would exceed the max chunk size, or there is no following chunk, keep the small chunk as is
91- */
92- private processChunks ( chunks : ChunkRange [ ] ) : CodeChunkInsert [ ] {
93- if ( chunks . length === 0 ) return [ ]
94-
95- const processedChunks : CodeChunkInsert [ ] = [ ]
96-
97- const countLengthWithoutWhitespace = ( s : string ) => s . replace ( / \s / g, '' ) . length
98-
99- for ( let i = 0 ; i < chunks . length ; i ++ ) {
100- const chunk = chunks [ i ] !
101- const chunkText = chunk . extract ( this . fileContent )
102-
103- if ( countLengthWithoutWhitespace ( chunkText ) >= this . coalesce ) { // big enough on its own: emit unchanged
104- processedChunks . push ( this . formatAsCodeChunk ( chunk , chunkText ) )
105- continue
106- }
107-
108- if ( i < chunks . length - 1 ) { // only a chunk that has a successor can be merged; merging is forward-only
109- const nextChunk = chunks [ i + 1 ] !
110- const combinedChunk = chunk . add ( nextChunk )
111- const combinedText = combinedChunk . extract ( this . fileContent )
112-
113- if ( combinedText . length <= this . maxChunkSize ) { // limit is checked on the raw merged text, whitespace included
114- processedChunks . push ( this . formatAsCodeChunk ( combinedChunk , combinedText ) )
115- // skip the next chunk, since it has just been emitted as part of the combined chunk
116- i ++
117- continue
118- }
119- }
120-
121- // can't coalesce (no next chunk, or the merged text would exceed maxChunkSize): keep as is
122- processedChunks . push ( this . formatAsCodeChunk ( chunk , chunkText ) )
123- }
124-
125- return processedChunks
126- }
127-
/**
 * Builds the CodeChunkInsert record for one chunk.
 * Combines the chunk's range and node type with the file path, project id and
 * language held on the instance, plus a SHA-256 hash and the length of the text.
 *
 * @param chunk - range whose start, end and nodeType are copied into the record
 * @param content - the chunk's text; stored as-is and used for hash and size
 * @returns the record describing this chunk
 */
128- private formatAsCodeChunk ( chunk : ChunkRange , content : string ) : CodeChunkInsert {
129- return {
130- content,
131- filePath : this . filePath ,
132- projectId : this . projectId ,
133- startLine : chunk . start + 1 , // tree-sitter rows are 0-based; +1 converts to 1-based line numbers
134- endLine : chunk . end + 1 ,
135- nodeType : chunk . nodeType ,
136- language : this . language ! , // non-null assertion: relies on language having been assigned before this runs (processFile sets it before chunking)
137- hash : crypto . createHash ( 'sha256' ) . update ( content ) . digest ( 'hex' ) , // hex-encoded SHA-256 digest of the chunk text
138- size : content . length , // JavaScript string length of the text, not a byte count
139- }
140- }
141-
14224 private detectLanguage ( filePath : string ) : SupportedLanguage {
14325 const extension = path . extname ( filePath ) . toLowerCase ( )
14426
@@ -148,15 +30,18 @@ export class CSTChunker {
14830 case '.tsx' :
14931 return 'typescript-react'
15032 case '.js' :
33+ case '.mjs' :
34+ case '.cjs' :
15135 return 'javascript'
15236 case '.jsx' :
15337 return 'javascript-react'
15438 case '.py' :
15539 return 'python'
15640 default :
157- throw new Error ( `Unsupported language for file: ${ filePath } ` )
41+ throw new Error ( `Unsupported language for filepath ${ filePath } ` )
15842 }
15943 }
16044}
16145
162- export const chunker = new CSTChunker ( )
46+ export const chunker = new Chunker ( ) // module-level instance; the exported name `chunker` is unchanged, only the class behind it differs
47+
0 commit comments