Skip to content

Commit ae95b74

Browse files
committed
ack PR comments
1 parent 94aa52e commit ae95b74

File tree

8 files changed

+23
-21
lines changed

8 files changed

+23
-21
lines changed

apps/sim/app/api/knowledge/route.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ const logger = createLogger('KnowledgeBaseAPI')
1313
* Chunking config units:
1414
* - maxSize: tokens (1 token ≈ 4 characters)
1515
* - minSize: characters
16-
* - overlap: characters
16+
* - overlap: tokens (1 token ≈ 4 characters)
1717
*/
1818
const CreateKnowledgeBaseSchema = z.object({
1919
name: z.string().min(1, 'Name is required'),
@@ -27,8 +27,8 @@ const CreateKnowledgeBaseSchema = z.object({
2727
maxSize: z.number().min(100).max(4000).default(1024),
2828
/** Minimum chunk size in characters */
2929
minSize: z.number().min(1).max(2000).default(100),
30-
/** Overlap between chunks in tokens */
31-
overlap: z.number().min(0).max(1000).default(200),
30+
/** Overlap between chunks in tokens (1 token ≈ 4 characters) */
31+
overlap: z.number().min(0).max(500).default(200),
3232
})
3333
.default({
3434
maxSize: 1024,

apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,11 @@ const FormSchema = z
5454
.number()
5555
.min(100, 'Max chunk size must be at least 100 tokens')
5656
.max(4000, 'Max chunk size must be less than 4000 tokens'),
57-
/** Overlap between chunks in tokens (aligned with Chonkie) */
57+
/** Overlap between chunks in tokens */
5858
overlapSize: z
5959
.number()
6060
.min(0, 'Overlap must be non-negative')
61-
.max(1000, 'Overlap must be less than 1000 tokens'),
61+
.max(500, 'Overlap must be less than 500 tokens'),
6262
})
6363
.refine(
6464
(data) => {

apps/sim/lib/chunkers/json-yaml-chunker.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ function getTokenCount(text: string): number {
1717
}
1818

1919
/**
20-
* Configuration for JSON/YAML chunking (aligned with Chonkie standards)
20+
* Configuration for JSON/YAML chunking
2121
* Reduced limits to ensure we stay well under OpenAI's 8,191 token limit per embedding request
2222
*/
2323
const JSON_YAML_CHUNKING_CONFIG = {
24-
TARGET_CHUNK_SIZE: 1024, // Target tokens per chunk (aligned with Chonkie)
24+
TARGET_CHUNK_SIZE: 1024, // Target tokens per chunk
2525
MIN_CHARACTERS_PER_CHUNK: 100, // Minimum characters per chunk to filter tiny fragments
2626
MAX_CHUNK_SIZE: 1500, // Maximum tokens per chunk
2727
MAX_DEPTH_FOR_SPLITTING: 5, // Maximum depth to traverse for splitting
@@ -100,7 +100,7 @@ export class JsonYamlChunker {
100100
const content = JSON.stringify(data, null, 2)
101101
const tokenCount = getTokenCount(content)
102102

103-
// Filter tiny fragments using character count (Chonkie standard)
103+
// Filter tiny fragments using character count
104104
if (content.length >= this.minCharactersPerChunk) {
105105
chunks.push({
106106
text: content,
@@ -320,7 +320,7 @@ export class JsonYamlChunker {
320320
}
321321
}
322322

323-
// Filter tiny fragments using character count (Chonkie standard)
323+
// Filter tiny fragments using character count
324324
if (currentChunk && currentChunk.length >= this.minCharactersPerChunk) {
325325
chunks.push({
326326
text: currentChunk,

apps/sim/lib/chunkers/structured-data-chunker.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ const logger = createLogger('StructuredDataChunker')
88
* These are used when user doesn't provide preferences
99
*/
1010
const DEFAULT_CONFIG = {
11-
// Target chunk size in tokens (aligned with Chonkie)
11+
// Target chunk size in tokens
1212
TARGET_CHUNK_SIZE: 1024,
1313
MIN_CHUNK_SIZE: 100,
1414
MAX_CHUNK_SIZE: 4000,
@@ -25,7 +25,7 @@ const DEFAULT_CONFIG = {
2525

2626
/**
2727
* Smart chunker for structured data (CSV, XLSX) that preserves semantic meaning
28-
* Aligned with Chonkie TableChunker patterns - accepts user chunk size preferences
28+
* Preserves headers in each chunk for better semantic context
2929
*/
3030
export class StructuredDataChunker {
3131
/**

apps/sim/lib/chunkers/text-chunker.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import type { Chunk, ChunkerOptions } from '@/lib/chunkers/types'
44
* Lightweight text chunker optimized for RAG applications
55
* Uses hierarchical splitting with simple character-based token estimation
66
*
7-
* Aligned with Chonkie standards:
7+
* Parameters:
88
* - chunkSize: Maximum chunk size in TOKENS (default: 1024)
99
* - chunkOverlap: Overlap between chunks in TOKENS (default: 0)
1010
* - minCharactersPerChunk: Minimum characters to keep a chunk (default: 100)
@@ -38,7 +38,9 @@ export class TextChunker {
3838

3939
constructor(options: ChunkerOptions = {}) {
4040
this.chunkSize = options.chunkSize ?? 1024
41-
this.chunkOverlap = options.chunkOverlap ?? 0
41+
// Clamp overlap to prevent exceeding chunk size (max 50% of chunk size)
42+
const maxOverlap = Math.floor(this.chunkSize * 0.5)
43+
this.chunkOverlap = Math.min(options.chunkOverlap ?? 0, maxOverlap)
4244
this.minCharactersPerChunk = options.minCharactersPerChunk ?? 100
4345
}
4446

apps/sim/lib/chunkers/types.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/**
2-
* Options for configuring text chunkers (aligned with Chonkie standards)
2+
* Options for configuring text chunkers
33
*
4-
* Units (all token-based like Chonkie):
4+
* Units:
55
* - chunkSize: Maximum chunk size in TOKENS (1 token ≈ 4 characters)
66
* - chunkOverlap: Overlap between chunks in TOKENS
77
* - minCharactersPerChunk: Minimum chunk size in CHARACTERS (filters tiny fragments)

apps/sim/lib/knowledge/types.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@
44
* Units:
55
* - maxSize: Maximum chunk size in TOKENS (1 token ≈ 4 characters)
66
* - minSize: Minimum chunk size in CHARACTERS (floor to avoid tiny fragments)
7-
* - overlap: Overlap between chunks in CHARACTERS
7+
* - overlap: Overlap between chunks in TOKENS (1 token ≈ 4 characters)
88
*/
99
export interface ChunkingConfig {
1010
/** Maximum chunk size in tokens (default: 1024, range: 100-4000) */
1111
maxSize: number
12-
/** Minimum chunk size in characters (default: 1, range: 1-2000) */
12+
/** Minimum chunk size in characters (default: 100, range: 1-2000) */
1313
minSize: number
14-
/** Overlap between chunks in characters (default: 200, range: 0-500) */
14+
/** Overlap between chunks in tokens (default: 200, range: 0-500) */
1515
overlap: number
1616
}
1717

apps/sim/stores/knowledge/store.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@ const logger = createLogger('KnowledgeStore')
99
* Units:
1010
* - maxSize: Maximum chunk size in TOKENS (1 token ≈ 4 characters)
1111
* - minSize: Minimum chunk size in CHARACTERS (floor to avoid tiny fragments)
12-
* - overlap: Overlap between chunks in CHARACTERS
12+
* - overlap: Overlap between chunks in TOKENS (1 token ≈ 4 characters)
1313
*/
1414
export interface ChunkingConfig {
1515
/** Maximum chunk size in tokens (default: 1024, range: 100-4000) */
1616
maxSize: number
17-
/** Minimum chunk size in characters (default: 1, range: 1-2000) */
17+
/** Minimum chunk size in characters (default: 100, range: 1-2000) */
1818
minSize: number
19-
/** Overlap between chunks in characters (default: 200, range: 0-500) */
19+
/** Overlap between chunks in tokens (default: 200, range: 0-500) */
2020
overlap: number
2121
chunkSize?: number // Legacy support
2222
minCharactersPerChunk?: number // Legacy support

0 commit comments

Comments
 (0)