Skip to content

Commit b5b01c6

Browse files
committed
fix: recursive chunker chunk size
1 parent 7497a89 commit b5b01c6

12 files changed

Lines changed: 329 additions & 21 deletions

File tree

package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "h-codex",
3-
"version": "0.1.6",
3+
"version": "0.1.8",
44
"description": "A semantic code search tool for intelligent, cross-repo context retrieval.",
55
"author": "Htoo Pyae Lwin",
66
"license": "MIT",

packages/core/package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@hpbyte/h-codex-core",
3-
"version": "0.1.7",
3+
"version": "0.1.8",
44
"description": "Core indexing and search functionality for h-codex",
55
"author": "Htoo Pyae Lwin",
66
"license": "MIT",

packages/core/src/ingestion/chunker/cst-chunker.ts

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -127,4 +127,3 @@ export class CSTChunker {
127127
}
128128

129129
export const cstChunker = new CSTChunker()
130-

packages/core/src/ingestion/chunker/index.ts

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -12,12 +12,7 @@ export class Chunker {
1212

1313
return await cstChunker.chunk({ filePath, projectId, language })
1414
} catch (error) {
15-
if (error instanceof Error && error.message.includes('Unsupported language')) {
16-
return await recursiveChunker.chunk({ filePath, projectId })
17-
}
18-
19-
console.error(`Error processing file ${filePath}:`, error)
20-
return []
15+
return await recursiveChunker.chunk({ filePath, projectId })
2116
}
2217
}
2318

@@ -44,4 +39,3 @@ export class Chunker {
4439
}
4540

4641
export const chunker = new Chunker()
47-

packages/core/src/ingestion/chunker/recursive-chunker.ts

Lines changed: 11 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import * as fs from 'node:fs/promises'
2+
import * as path from 'path'
23
import crypto from 'crypto'
34

45
import type { ChunkParams, CodeChunkInsert } from '../../types'
@@ -40,10 +41,9 @@ export class RecursiveChunker {
4041
}
4142
}
4243

43-
const chunkSize = Math.floor(maxChunkSize / 2)
4444
const chunks: string[] = []
45-
for (let i = 0; i < content.length; i += chunkSize) {
46-
chunks.push(content.slice(i, i + chunkSize))
45+
for (let i = 0; i < content.length; i += maxChunkSize) {
46+
chunks.push(content.slice(i, i + maxChunkSize))
4747
}
4848

4949
return chunks
@@ -58,13 +58,15 @@ export class RecursiveChunker {
5858
for (let i = 0; i < chunks.length; i++) {
5959
let chunk = chunks[i]!
6060

61-
if (countLengthWithoutWhitespace(chunk) < coalesce && i < chunks.length - 1) {
61+
while (countLengthWithoutWhitespace(chunk) < coalesce && i < chunks.length - 1) {
6262
const nextChunk = chunks[i + 1]!
6363
const combinedChunk = chunk + '\n' + nextChunk
6464

6565
if (combinedChunk.length <= maxChunkSize) {
6666
chunk = combinedChunk
6767
i++
68+
} else {
69+
break
6870
}
6971
}
7072

@@ -78,7 +80,7 @@ export class RecursiveChunker {
7880
startLine: currentLineNumber,
7981
endLine,
8082
nodeType: 'text',
81-
language: null,
83+
language: this.getLanguage(filePath),
8284
hash: crypto.createHash('sha256').update(chunk).digest('hex'),
8385
size: chunk.length,
8486
})
@@ -88,7 +90,10 @@ export class RecursiveChunker {
8890

8991
return processedChunks
9092
}
93+
94+
private getLanguage(filePath: string) {
95+
return path.extname(filePath).toLowerCase().replace('.', '')
96+
}
9197
}
9298

9399
export const recursiveChunker = new RecursiveChunker()
94-

packages/core/src/script.ts

Lines changed: 47 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,47 @@
1+
#!/usr/bin/env node
2+
3+
import { indexer } from './ingestion'
4+
import { semanticSearch } from './search'
5+
6+
const helpMesg = `
7+
Usage: h-codexx <command> <path>
8+
9+
Commands:
10+
index <path> Index a directory of code
11+
search <query> Search for a query
12+
`
13+
14+
async function main() {
15+
const args = process.argv.slice(2)
16+
17+
if (args.length === 0) {
18+
console.log('No arguments provided')
19+
return
20+
}
21+
22+
const [command, arg] = args
23+
24+
if (!command || !arg) {
25+
console.log(helpMesg)
26+
return
27+
}
28+
29+
if (!['index', 'search', 'clear'].includes(command)) {
30+
console.log('Invalid command')
31+
return
32+
}
33+
34+
if (command === 'index') {
35+
return indexer.index(arg)
36+
}
37+
38+
if (command === 'search') {
39+
return semanticSearch.search(arg)
40+
}
41+
42+
if (command === 'clear') {
43+
return indexer.clear(arg)
44+
}
45+
}
46+
47+
main()
Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
ALTER TABLE "code_chunks" ALTER COLUMN "language" DROP NOT NULL;

0 commit comments

Comments
 (0)