hpbyte
diff --git a/‎dev/script.ts‎
Lines changed: 0 additions & 39 deletions b/‎dev/script.ts‎
Lines changed: 0 additions & 39 deletions
diff --git a/‎packages/core/package.json‎
Lines changed: 1 addition & 1 deletion b/‎packages/core/package.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎packages/core/src/ingestion/chunker/cst-chunker.ts‎
Lines changed: 130 additions & 0 deletions b/‎packages/core/src/ingestion/chunker/cst-chunker.ts‎
Lines changed: 130 additions & 0 deletions
diff --git a/‎packages/core/src/ingestion/chunker/index.ts‎
Lines changed: 11 additions & 126 deletions b/‎packages/core/src/ingestion/chunker/index.ts‎
Lines changed: 11 additions & 126 deletions
@@ -1,6 +1,6 @@
 {
   "name": "@hpbyte/h-codex-core",
-  "version": "0.1.6",
+  "version": "0.1.7",
   "description": "Core indexing and search functionality for h-codex",
   "author": "Htoo Pyae Lwin",
   "license": "MIT",
 
@@ -0,0 +1,130 @@
+import type { SyntaxNode } from 'tree-sitter'
+import crypto from 'crypto'
+
+import type { CodeChunkInsert, ChunkParams, SupportedLanguage } from '../../types'
+import { maxChunkSize, coalesce, countLengthWithoutWhitespace } from '../../utils'
+import { treeSitterParser } from './tree-sitter-parser'
+import { ChunkRange } from './chunk-range'
+
+export class CSTChunker {
+  private filePath = ''
+  private fileContent = ''
+  private projectId = ''
+  private language: SupportedLanguage | null = null
+
+  async chunk({ filePath, projectId, language }: ChunkParams) {
+    const cstRootNode = await treeSitterParser.parseFile(filePath, language)
+
+    this.filePath = filePath
+    this.language = language
+    this.fileContent = cstRootNode.text
+    this.projectId = projectId
+
+    const chunks = this.chunkNode(cstRootNode)
+    const processedChunks = this.processChunks(chunks)
+
+    return processedChunks
+  }
+
+  /**
+   * CST-based chunking of one node's direct children into line ranges
+   * - children are visited in order and greedily merged into the current range until adding one would exceed maxChunkSize
+   * - a child whose own text exceeds maxChunkSize flushes the current range and is chunked recursively via its children
+   * - NOTE(review): an oversized child that has no children yields no ranges, so its text appears to be dropped; confirm
+   */
+  private chunkNode(node: SyntaxNode): ChunkRange[] {
+    const chunks: ChunkRange[] = []
+    let currentChunk: ChunkRange | null = null
+
+    const addCurrentChunk = () => {
+      if (currentChunk && !currentChunk.isEmpty) {
+        chunks.push(currentChunk)
+        currentChunk = null
+      }
+    }
+
+    for (const child of node.children) {
+      const childChunkRange = new ChunkRange(
+        child.startPosition.row,
+        child.endPosition.row,
+        child.type
+      )
+      const childText = child.text
+
+      if (childText.length > maxChunkSize) {
+        addCurrentChunk()
+        chunks.push(...this.chunkNode(child))
+        continue
+      }
+
+      const currentChunkText = currentChunk?.extract(this.fileContent) ?? ''
+      const wouldExceedSize = currentChunkText.length + childText.length > maxChunkSize
+
+      if (wouldExceedSize) {
+        addCurrentChunk()
+        currentChunk = childChunkRange
+      } else {
+        currentChunk = currentChunk ? currentChunk.add(childChunkRange) : childChunkRange
+      }
+    }
+
+    addCurrentChunk()
+
+    return chunks
+  }
+
+  /**
+   * Merge each chunk that countLengthWithoutWhitespace measures below the `coalesce` threshold with the chunk after it,
+   * but if combining would exceed the max chunk size (or there is no following chunk), keep the chunk as is
+   */
+  private processChunks(chunks: ChunkRange[]): CodeChunkInsert[] {
+    if (chunks.length === 0) return []
+
+    const processedChunks: CodeChunkInsert[] = []
+
+    for (let i = 0; i < chunks.length; i++) {
+      const chunk = chunks[i]!
+      const chunkText = chunk.extract(this.fileContent)
+
+      if (countLengthWithoutWhitespace(chunkText) >= coalesce) {
+        processedChunks.push(this.formatAsCodeChunk(chunk, chunkText))
+        continue
+      }
+
+      if (i < chunks.length - 1) {
+        const nextChunk = chunks[i + 1]!
+        const combinedChunk = chunk.add(nextChunk)
+        const combinedText = combinedChunk.extract(this.fileContent)
+
+        if (combinedText.length <= maxChunkSize) {
+          processedChunks.push(this.formatAsCodeChunk(combinedChunk, combinedText))
+          // advance past the next chunk: it is already part of the combined chunk just pushed
+          i++
+          continue
+        }
+      }
+
+      // last chunk, or merging would exceed maxChunkSize: emit the small chunk unchanged
+      processedChunks.push(this.formatAsCodeChunk(chunk, chunkText))
+    }
+
+    return processedChunks
+  }
+
+  private formatAsCodeChunk(chunk: ChunkRange, content: string): CodeChunkInsert {
+    return {
+      content,
+      filePath: this.filePath,
+      projectId: this.projectId,
+      startLine: chunk.start + 1, // tree-sitter rows are 0-based, so shift to 1-based line numbers
+      endLine: chunk.end + 1,
+      nodeType: chunk.nodeType,
+      language: this.language!,
+      hash: crypto.createHash('sha256').update(content).digest('hex'),
+      size: content.length,
+    }
+  }
+}
+
+export const cstChunker = new CSTChunker()
+
@@ -1,144 +1,26 @@
 import * as path from 'path'
-import type { SyntaxNode } from 'tree-sitter'
-import crypto from 'crypto'
 
-import type { CodeChunkInsert, SupportedLanguage } from '../../types'
-import { treeSitterParser } from './tree-sitter-parser'
-import { ChunkRange } from './chunk-range'
+import { cstChunker } from './cst-chunker'
+import { recursiveChunker } from './recursive-chunker'
 
-export class CSTChunker {
-  private maxChunkSize = 512 * 3
-  private coalesce = 50
-  private filePath = ''
-  private fileContent = ''
-  private projectId = ''
-  private language: SupportedLanguage | null = null
+import type { CodeChunkInsert, SupportedLanguage } from '../../types'
 
+export class Chunker {
   async processFile(filePath: string, projectId: string): Promise<CodeChunkInsert[]> {
     try {
       const language = this.detectLanguage(filePath)
-      const cstRootNode = await treeSitterParser.parseFile(filePath, language)
-
-      this.filePath = filePath
-      this.language = language
-      this.fileContent = cstRootNode.text
-      this.projectId = projectId
 
-      const chunks = this.chunkNode(cstRootNode)
-      const processedChunks = this.processChunks(chunks)
-
-      return processedChunks
+      return await cstChunker.chunk({ filePath, projectId, language })
     } catch (error) {
       if (error instanceof Error && error.message.includes('Unsupported language')) {
-        return []
+        return await recursiveChunker.chunk({ filePath, projectId })
       }
 
       console.error(`Error processing file ${filePath}:`, error)
       return []
     }
   }
 
-  /**
-   * AST-based chunking
-   * - each nodes are traversed and greedily bundled together
-   * - if a node is too large, recursively chunk its children
-   * - else, concatenate the current chunk to the node
-   */
-  private chunkNode(node: SyntaxNode): ChunkRange[] {
-    const chunks: ChunkRange[] = []
-    let currentChunk: ChunkRange | null = null
-
-    const addCurrentChunk = () => {
-      if (currentChunk && !currentChunk.isEmpty) {
-        chunks.push(currentChunk)
-        currentChunk = null
-      }
-    }
-
-    for (const child of node.children) {
-      const childChunkRange = new ChunkRange(
-        child.startPosition.row,
-        child.endPosition.row,
-        child.type
-      )
-      const childText = child.text
-
-      if (childText.length > this.maxChunkSize) {
-        addCurrentChunk()
-        chunks.push(...this.chunkNode(child))
-        continue
-      }
-
-      const currentChunkText = currentChunk?.extract(this.fileContent) ?? ''
-      const wouldExceedSize = currentChunkText.length + childText.length > this.maxChunkSize
-
-      if (wouldExceedSize) {
-        addCurrentChunk()
-        currentChunk = childChunkRange
-      } else {
-        currentChunk = currentChunk ? currentChunk.add(childChunkRange) : childChunkRange
-      }
-    }
-
-    addCurrentChunk()
-
-    return chunks
-  }
-
-  /**
-   * try to combine very small chunks (under the coalesce threshold) with adjacent chunks
-   * but if combing will exceed the max chunk size, keep as is
-   */
-  private processChunks(chunks: ChunkRange[]): CodeChunkInsert[] {
-    if (chunks.length === 0) return []
-
-    const processedChunks: CodeChunkInsert[] = []
-
-    const countLengthWithoutWhitespace = (s: string) => s.replace(/\s/g, '').length
-
-    for (let i = 0; i < chunks.length; i++) {
-      const chunk = chunks[i]!
-      const chunkText = chunk.extract(this.fileContent)
-
-      if (countLengthWithoutWhitespace(chunkText) >= this.coalesce) {
-        processedChunks.push(this.formatAsCodeChunk(chunk, chunkText))
-        continue
-      }
-
-      if (i < chunks.length - 1) {
-        const nextChunk = chunks[i + 1]!
-        const combinedChunk = chunk.add(nextChunk)
-        const combinedText = combinedChunk.extract(this.fileContent)
-
-        if (combinedText.length <= this.maxChunkSize) {
-          processedChunks.push(this.formatAsCodeChunk(combinedChunk, combinedText))
-          // skip the next chunk as it's combined with the current one
-          i++
-          continue
-        }
-      }
-
-      // can't coalesce, keep as is
-      processedChunks.push(this.formatAsCodeChunk(chunk, chunkText))
-    }
-
-    return processedChunks
-  }
-
-  private formatAsCodeChunk(chunk: ChunkRange, content: string): CodeChunkInsert {
-    return {
-      content,
-      filePath: this.filePath,
-      projectId: this.projectId,
-      startLine: chunk.start + 1, // tree-sitter index starts from 0
-      endLine: chunk.end + 1,
-      nodeType: chunk.nodeType,
-      language: this.language!,
-      hash: crypto.createHash('sha256').update(content).digest('hex'),
-      size: content.length,
-    }
-  }
-
   private detectLanguage(filePath: string): SupportedLanguage {
     const extension = path.extname(filePath).toLowerCase()
 
@@ -148,15 +30,18 @@ export class CSTChunker {
       case '.tsx':
         return 'typescript-react'
       case '.js':
+      case '.mjs':
+      case '.cjs':
         return 'javascript'
       case '.jsx':
         return 'javascript-react'
       case '.py':
         return 'python'
       default:
-        throw new Error(`Unsupported language for file: ${filePath}`)
+        throw new Error(`Unsupported language for filepath ${filePath}`)
     }
   }
 }
 
-export const chunker = new CSTChunker()
+export const chunker = new Chunker()
+
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "@hpbyte/h-codex-core",`
`3`		`- "version": "0.1.6",`
	`3`	`+ "version": "0.1.7",`
`4`	`4`	`"description": "Core indexing and search functionality for h-codex",`
`5`	`5`	`"author": "Htoo Pyae Lwin",`
`6`	`6`	`"license": "MIT",`