ack PR comments

waleedlatif1 · waleedlatif1 · commit ae95b74e55e7 · 2025-12-22T20:39:30.000-08:00
diff --git a/apps/sim/app/api/knowledge/route.ts b/apps/sim/app/api/knowledge/route.ts
@@ -13,7 +13,7 @@ const logger = createLogger('KnowledgeBaseAPI')
  * Chunking config units:
  * - maxSize: tokens (1 token ≈ 4 characters)
  * - minSize: characters
- * - overlap: characters
+ * - overlap: tokens (1 token ≈ 4 characters)
  */
 const CreateKnowledgeBaseSchema = z.object({
   name: z.string().min(1, 'Name is required'),
@@ -27,8 +27,8 @@ const CreateKnowledgeBaseSchema = z.object({
       maxSize: z.number().min(100).max(4000).default(1024),
       /** Minimum chunk size in characters */
       minSize: z.number().min(1).max(2000).default(100),
-      /** Overlap between chunks in tokens */
-      overlap: z.number().min(0).max(1000).default(200),
+      /** Overlap between chunks in tokens (1 token ≈ 4 characters) */
+      overlap: z.number().min(0).max(500).default(200),
     })
     .default({
       maxSize: 1024,
diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx
@@ -54,11 +54,11 @@ const FormSchema = z
       .number()
       .min(100, 'Max chunk size must be at least 100 tokens')
       .max(4000, 'Max chunk size must be less than 4000 tokens'),
-    /** Overlap between chunks in tokens (aligned with Chonkie) */
+    /** Overlap between chunks in tokens */
     overlapSize: z
       .number()
       .min(0, 'Overlap must be non-negative')
-      .max(1000, 'Overlap must be less than 1000 tokens'),
+      .max(500, 'Overlap must be less than 500 tokens'),
   })
   .refine(
     (data) => {
diff --git a/apps/sim/lib/chunkers/json-yaml-chunker.ts b/apps/sim/lib/chunkers/json-yaml-chunker.ts
@@ -17,11 +17,11 @@ function getTokenCount(text: string): number {
 }
 
 /**
- * Configuration for JSON/YAML chunking (aligned with Chonkie standards)
+ * Configuration for JSON/YAML chunking
  * Reduced limits to ensure we stay well under OpenAI's 8,191 token limit per embedding request
  */
 const JSON_YAML_CHUNKING_CONFIG = {
-  TARGET_CHUNK_SIZE: 1024, // Target tokens per chunk (aligned with Chonkie)
+  TARGET_CHUNK_SIZE: 1024, // Target tokens per chunk
   MIN_CHARACTERS_PER_CHUNK: 100, // Minimum characters per chunk to filter tiny fragments
   MAX_CHUNK_SIZE: 1500, // Maximum tokens per chunk
   MAX_DEPTH_FOR_SPLITTING: 5, // Maximum depth to traverse for splitting
@@ -100,7 +100,7 @@ export class JsonYamlChunker {
     const content = JSON.stringify(data, null, 2)
     const tokenCount = getTokenCount(content)
 
-    // Filter tiny fragments using character count (Chonkie standard)
+    // Filter tiny fragments using character count
     if (content.length >= this.minCharactersPerChunk) {
       chunks.push({
         text: content,
@@ -320,7 +320,7 @@ export class JsonYamlChunker {
       }
     }
 
-    // Filter tiny fragments using character count (Chonkie standard)
+    // Filter tiny fragments using character count
     if (currentChunk && currentChunk.length >= this.minCharactersPerChunk) {
       chunks.push({
         text: currentChunk,
diff --git a/apps/sim/lib/chunkers/structured-data-chunker.ts b/apps/sim/lib/chunkers/structured-data-chunker.ts
@@ -8,7 +8,7 @@ const logger = createLogger('StructuredDataChunker')
  * These are used when user doesn't provide preferences
  */
 const DEFAULT_CONFIG = {
-  // Target chunk size in tokens (aligned with Chonkie)
+  // Target chunk size in tokens
   TARGET_CHUNK_SIZE: 1024,
   MIN_CHUNK_SIZE: 100,
   MAX_CHUNK_SIZE: 4000,
@@ -25,7 +25,7 @@ const DEFAULT_CONFIG = {
 
 /**
  * Smart chunker for structured data (CSV, XLSX) that preserves semantic meaning
- * Aligned with Chonkie TableChunker patterns - accepts user chunk size preferences
+ * Preserves headers in each chunk for better semantic context
  */
 export class StructuredDataChunker {
   /**
diff --git a/apps/sim/lib/chunkers/text-chunker.ts b/apps/sim/lib/chunkers/text-chunker.ts
@@ -4,7 +4,7 @@ import type { Chunk, ChunkerOptions } from '@/lib/chunkers/types'
  * Lightweight text chunker optimized for RAG applications
  * Uses hierarchical splitting with simple character-based token estimation
  *
- * Aligned with Chonkie standards:
+ * Parameters:
  * - chunkSize: Maximum chunk size in TOKENS (default: 1024)
  * - chunkOverlap: Overlap between chunks in TOKENS (default: 0)
  * - minCharactersPerChunk: Minimum characters to keep a chunk (default: 100)
@@ -38,7 +38,9 @@ export class TextChunker {
 
   constructor(options: ChunkerOptions = {}) {
     this.chunkSize = options.chunkSize ?? 1024
-    this.chunkOverlap = options.chunkOverlap ?? 0
+    // Clamp overlap to prevent exceeding chunk size (max 50% of chunk size)
+    const maxOverlap = Math.floor(this.chunkSize * 0.5)
+    this.chunkOverlap = Math.min(options.chunkOverlap ?? 0, maxOverlap)
     this.minCharactersPerChunk = options.minCharactersPerChunk ?? 100
   }
 
diff --git a/apps/sim/lib/chunkers/types.ts b/apps/sim/lib/chunkers/types.ts
@@ -1,7 +1,7 @@
 /**
- * Options for configuring text chunkers (aligned with Chonkie standards)
+ * Options for configuring text chunkers
  *
- * Units (all token-based like Chonkie):
+ * Units:
  * - chunkSize: Maximum chunk size in TOKENS (1 token ≈ 4 characters)
  * - chunkOverlap: Overlap between chunks in TOKENS
  * - minCharactersPerChunk: Minimum chunk size in CHARACTERS (filters tiny fragments)
diff --git a/apps/sim/lib/knowledge/types.ts b/apps/sim/lib/knowledge/types.ts
@@ -4,14 +4,14 @@
  * Units:
  * - maxSize: Maximum chunk size in TOKENS (1 token ≈ 4 characters)
  * - minSize: Minimum chunk size in CHARACTERS (floor to avoid tiny fragments)
- * - overlap: Overlap between chunks in CHARACTERS
+ * - overlap: Overlap between chunks in TOKENS (1 token ≈ 4 characters)
  */
 export interface ChunkingConfig {
   /** Maximum chunk size in tokens (default: 1024, range: 100-4000) */
   maxSize: number
-  /** Minimum chunk size in characters (default: 1, range: 1-2000) */
+  /** Minimum chunk size in characters (default: 100, range: 1-2000) */
   minSize: number
-  /** Overlap between chunks in characters (default: 200, range: 0-500) */
+  /** Overlap between chunks in tokens (default: 200, range: 0-500) */
   overlap: number
 }
 
diff --git a/apps/sim/stores/knowledge/store.ts b/apps/sim/stores/knowledge/store.ts
@@ -9,14 +9,14 @@ const logger = createLogger('KnowledgeStore')
  * Units:
  * - maxSize: Maximum chunk size in TOKENS (1 token ≈ 4 characters)
  * - minSize: Minimum chunk size in CHARACTERS (floor to avoid tiny fragments)
- * - overlap: Overlap between chunks in CHARACTERS
+ * - overlap: Overlap between chunks in TOKENS (1 token ≈ 4 characters)
  */
 export interface ChunkingConfig {
   /** Maximum chunk size in tokens (default: 1024, range: 100-4000) */
   maxSize: number
-  /** Minimum chunk size in characters (default: 1, range: 1-2000) */
+  /** Minimum chunk size in characters (default: 100, range: 1-2000) */
   minSize: number
-  /** Overlap between chunks in characters (default: 200, range: 0-500) */
+  /** Overlap between chunks in tokens (default: 200, range: 0-500) */
   overlap: number
   chunkSize?: number // Legacy support
   minCharactersPerChunk?: number // Legacy support