From a7c3a48617ac18710d86aa96c19886bfb95763ef Mon Sep 17 00:00:00 2001 From: Esity Date: Wed, 6 May 2026 12:25:34 -0500 Subject: [PATCH 1/4] Fix Knowledge Apollo model compatibility with schema cleanup --- CHANGELOG.md | 5 + lib/legion/extensions/knowledge.rb | 1 + .../knowledge/helpers/apollo_models.rb | 45 +++++++ .../extensions/knowledge/runners/ingest.rb | 10 +- .../knowledge/runners/maintenance.rb | 124 +++++++++--------- lib/legion/extensions/knowledge/version.rb | 2 +- .../knowledge/runners/maintenance_spec.rb | 22 ++++ 7 files changed, 143 insertions(+), 66 deletions(-) create mode 100644 lib/legion/extensions/knowledge/helpers/apollo_models.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 9154b3f..2c7902b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## [0.6.11] - 2026-05-06 + +### Fixed +- Knowledge ingest and maintenance now resolve Apollo data models through the namespaced `Legion::Data::Model::Apollo::*` classes introduced by the legion-data schema cleanup, with fallback support for legacy Apollo model constants. + ## [0.6.10] - 2026-04-28 ### Fixed diff --git a/lib/legion/extensions/knowledge.rb b/lib/legion/extensions/knowledge.rb index 4fd7e79..5e3f93c 100644 --- a/lib/legion/extensions/knowledge.rb +++ b/lib/legion/extensions/knowledge.rb @@ -5,6 +5,7 @@ require_relative 'knowledge/helpers/manifest_store' require_relative 'knowledge/helpers/parser' require_relative 'knowledge/helpers/chunker' +require_relative 'knowledge/helpers/apollo_models' require_relative 'knowledge/runners/ingest' require_relative 'knowledge/runners/query' require_relative 'knowledge/runners/corpus' diff --git a/lib/legion/extensions/knowledge/helpers/apollo_models.rb b/lib/legion/extensions/knowledge/helpers/apollo_models.rb new file mode 100644 index 0000000..15ee972 --- /dev/null +++ b/lib/legion/extensions/knowledge/helpers/apollo_models.rb @@ -0,0 +1,45 @@ +# frozen_string_literal: true + +module Legion + module Extensions + module Knowledge + module Helpers + module ApolloModels + class << self + def entry + namespaced_apollo_model(:Entry) || legacy_model(:ApolloEntry) + end + + def access_log + namespaced_apollo_model(:AccessLog) || legacy_model(:ApolloAccessLog) + end + + def entry_available? + !entry.nil? + end + + def access_log_available? + !access_log.nil? + end + + private + + def namespaced_apollo_model(name) + return nil unless defined?(Legion::Data::Model::Apollo) + return nil unless Legion::Data::Model::Apollo.const_defined?(name, false) + + Legion::Data::Model::Apollo.const_get(name, false) + end + + def legacy_model(name) + return nil unless defined?(Legion::Data::Model) + return nil unless Legion::Data::Model.const_defined?(name, false) + + Legion::Data::Model.const_get(name, false) + end + end + end + end + end + end +end diff --git a/lib/legion/extensions/knowledge/runners/ingest.rb b/lib/legion/extensions/knowledge/runners/ingest.rb index c50f12e..ff0629e 100644 --- a/lib/legion/extensions/knowledge/runners/ingest.rb +++ b/lib/legion/extensions/knowledge/runners/ingest.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require_relative '../helpers/apollo_models' + require 'securerandom' module Legion @@ -234,11 +236,11 @@ def upsert_chunk_with_embedding(chunk, embedding, dry_run: false, force: false, private_class_method :upsert_chunk_with_embedding def chunk_exists?(content_hash) - return false unless defined?(Legion::Data::Model::ApolloEntry) + return false unless Helpers::ApolloModels.entry_available? - Legion::Data::Model::ApolloEntry - .where(content_hash: content_hash) - .any? + Helpers::ApolloModels.entry + .where(content_hash: content_hash) + .any? rescue StandardError => e log.warn(e.message) false diff --git a/lib/legion/extensions/knowledge/runners/maintenance.rb b/lib/legion/extensions/knowledge/runners/maintenance.rb index 49fcc50..448084d 100644 --- a/lib/legion/extensions/knowledge/runners/maintenance.rb +++ b/lib/legion/extensions/knowledge/runners/maintenance.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require_relative '../helpers/apollo_models' + module Legion module Extensions module Knowledge @@ -92,11 +94,11 @@ def build_local_stats(path, scan_entries, manifest_file, last_ingest) private_class_method :build_local_stats def build_apollo_stats - return apollo_defaults unless defined?(Legion::Data::Model::ApolloEntry) + return apollo_defaults unless Helpers::ApolloModels.entry_available? - base = Legion::Data::Model::ApolloEntry - .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) - .exclude(status: 'archived') + base = Helpers::ApolloModels.entry + .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) + .exclude(status: 'archived') total = base.count return apollo_defaults if total.zero? @@ -163,94 +165,94 @@ def load_manifest_files(path) private_class_method :load_manifest_files def load_apollo_source_files - return [] unless defined?(Legion::Data::Model::ApolloEntry) - - Legion::Data::Model::ApolloEntry - .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) - .exclude(status: 'archived') - .select_map(Sequel.lit("source_context->>'source_file'")) - .compact - .uniq + return [] unless Helpers::ApolloModels.entry_available? + + Helpers::ApolloModels.entry + .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) + .exclude(status: 'archived') + .select_map(Sequel.lit("source_context->>'source_file'")) + .compact + .uniq rescue StandardError => _e [] end private_class_method :load_apollo_source_files def count_apollo_chunks - return 0 unless defined?(Legion::Data::Model::ApolloEntry) + return 0 unless Helpers::ApolloModels.entry_available? - Legion::Data::Model::ApolloEntry - .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) - .exclude(status: 'archived') - .count + Helpers::ApolloModels.entry + .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) + .exclude(status: 'archived') + .count rescue StandardError => _e 0 end private_class_method :count_apollo_chunks def archive_orphan_entries(orphan_files) - return 0 unless defined?(Legion::Data::Model::ApolloEntry) + return 0 unless Helpers::ApolloModels.entry_available? - Legion::Data::Model::ApolloEntry - .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) - .where(Sequel.lit("source_context->>'source_file' IN ?", orphan_files)) - .exclude(status: 'archived') - .update(status: 'archived', updated_at: Time.now) + Helpers::ApolloModels.entry + .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) + .where(Sequel.lit("source_context->>'source_file' IN ?", orphan_files)) + .exclude(status: 'archived') + .update(status: 'archived', updated_at: Time.now) end private_class_method :archive_orphan_entries def hot_chunks(limit) - return [] unless defined?(Legion::Data::Model::ApolloEntry) - - Legion::Data::Model::ApolloEntry - .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) - .exclude(status: 'archived') - .where { access_count.positive? } - .order(Sequel.desc(:access_count)) - .limit(limit) - .select_map([:id, :access_count, :confidence, - Sequel.lit("source_context->>'source_file' AS source_file")]) - .map { |r| { id: r[0], access_count: r[1], confidence: r[2], source_file: r[3] } } + return [] unless Helpers::ApolloModels.entry_available? + + Helpers::ApolloModels.entry + .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) + .exclude(status: 'archived') + .where { access_count.positive? } + .order(Sequel.desc(:access_count)) + .limit(limit) + .select_map([:id, :access_count, :confidence, + Sequel.lit("source_context->>'source_file' AS source_file")]) + .map { |r| { id: r[0], access_count: r[1], confidence: r[2], source_file: r[3] } } rescue StandardError => _e [] end private_class_method :hot_chunks def cold_chunks(limit) - return [] unless defined?(Legion::Data::Model::ApolloEntry) + return [] unless Helpers::ApolloModels.entry_available? days = settings_cold_chunk_days cutoff = Time.now - (days * 86_400) - Legion::Data::Model::ApolloEntry - .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) - .exclude(status: 'archived') - .where(access_count: 0) - .where { created_at < cutoff } - .order(:created_at) - .limit(limit) - .select_map([:id, :confidence, :created_at, - Sequel.lit("source_context->>'source_file' AS source_file")]) - .map { |r| { id: r[0], confidence: r[1], created_at: r[2]&.iso8601, source_file: r[3] } } + Helpers::ApolloModels.entry + .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) + .exclude(status: 'archived') + .where(access_count: 0) + .where { created_at < cutoff } + .order(:created_at) + .limit(limit) + .select_map([:id, :confidence, :created_at, + Sequel.lit("source_context->>'source_file' AS source_file")]) + .map { |r| { id: r[0], confidence: r[1], created_at: r[2]&.iso8601, source_file: r[3] } } rescue StandardError => _e [] end private_class_method :cold_chunks def low_confidence_chunks(limit) - return [] unless defined?(Legion::Data::Model::ApolloEntry) + return [] unless Helpers::ApolloModels.entry_available? threshold = settings_stale_threshold - Legion::Data::Model::ApolloEntry - .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) - .exclude(status: 'archived') - .where { confidence < threshold } - .order(:confidence) - .limit(limit) - .select_map([:id, :confidence, :access_count, - Sequel.lit("source_context->>'source_file' AS source_file")]) - .map { |r| { id: r[0], confidence: r[1], access_count: r[2], source_file: r[3] } } + Helpers::ApolloModels.entry + .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) + .exclude(status: 'archived') + .where { confidence < threshold } + .order(:confidence) + .limit(limit) + .select_map([:id, :confidence, :access_count, + Sequel.lit("source_context->>'source_file' AS source_file")]) + .map { |r| { id: r[0], confidence: r[1], access_count: r[2], source_file: r[3] } } rescue StandardError => _e [] end @@ -259,11 +261,11 @@ def low_confidence_chunks(limit) def quality_summary defaults = { total_queries: 0, avg_retrieval_score: nil, chunks_never_accessed: 0, chunks_below_threshold: 0 } - return defaults unless defined?(Legion::Data::Model::ApolloEntry) + return defaults unless Helpers::ApolloModels.entry_available? - base = Legion::Data::Model::ApolloEntry - .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) - .exclude(status: 'archived') + base = Helpers::ApolloModels.entry + .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) + .exclude(status: 'archived') { total_queries: query_count, @@ -277,9 +279,9 @@ def quality_summary private_class_method :quality_summary def query_count - return 0 unless defined?(Legion::Data::Model::ApolloAccessLog) + return 0 unless Helpers::ApolloModels.access_log_available? - Legion::Data::Model::ApolloAccessLog.where(action: 'knowledge_query').count + Helpers::ApolloModels.access_log.where(action: 'knowledge_query').count rescue StandardError => _e 0 end diff --git a/lib/legion/extensions/knowledge/version.rb b/lib/legion/extensions/knowledge/version.rb index f6663c1..427bb2d 100644 --- a/lib/legion/extensions/knowledge/version.rb +++ b/lib/legion/extensions/knowledge/version.rb @@ -3,7 +3,7 @@ module Legion module Extensions module Knowledge - VERSION = '0.6.10' + VERSION = '0.6.11' end end end diff --git a/spec/legion/extensions/knowledge/runners/maintenance_spec.rb b/spec/legion/extensions/knowledge/runners/maintenance_spec.rb index 373a6bf..ae6347c 100644 --- a/spec/legion/extensions/knowledge/runners/maintenance_spec.rb +++ b/spec/legion/extensions/knowledge/runners/maintenance_spec.rb @@ -72,6 +72,28 @@ end end + describe '.count_apollo_chunks' do + let(:entry_model) { class_double('Legion::Data::Model::Apollo::Entry') } + let(:dataset) { instance_double('ApolloEntryDataset') } + let(:tags_op) { instance_double('Sequel::Postgres::PGArrayOp') } + + before do + hide_const('Legion::Data::Model::ApolloEntry') if defined?(Legion::Data::Model::ApolloEntry) + stub_const('Legion::Data::Model::Apollo::Entry', entry_model) + stub_const('Sequel', Module.new) unless defined?(Sequel) + allow(Sequel).to receive(:pg_array).with(['document_chunk']).and_return(['document_chunk']) + allow(Sequel).to receive(:pg_array_op).with(:tags).and_return(tags_op) + allow(tags_op).to receive(:contains).with(['document_chunk']).and_return(:document_chunk_filter) + allow(entry_model).to receive(:where).with(:document_chunk_filter).and_return(dataset) + allow(dataset).to receive(:exclude).with(status: 'archived').and_return(dataset) + allow(dataset).to receive(:count).and_return(7) + end + + it 'counts chunks through the namespaced Apollo entry model' do + expect(described_class.send(:count_apollo_chunks)).to eq(7) + end + end + describe '.cleanup_orphans' do before { Legion::Extensions::Knowledge::Runners::Ingest.ingest_corpus(path: tmp_dir) } From 16a92a0992519636f671b607a3a51b14bc8a59d9 Mon Sep 17 00:00:00 2001 From: Esity Date: Wed, 6 May 2026 12:36:53 -0500 Subject: [PATCH 2/4] Add Knowledge neighbor retrieval expansion Refs #10 --- CHANGELOG.md | 8 + .../extensions/knowledge/runners/ingest.rb | 16 +- .../extensions/knowledge/runners/query.rb | 159 +++++++++++++++++- lib/legion/extensions/knowledge/version.rb | 2 +- .../knowledge/runners/ingest_spec.rb | 35 ++++ .../knowledge/runners/query_spec.rb | 73 ++++++++ 6 files changed, 277 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c7902b..da6b791 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## [0.6.12] - 2026-05-06 + +### Added +- Query and retrieve runners now support optional neighbor expansion (`expand_neighbors: true`, `neighbor_radius:`) to include adjacent document chunks around Apollo retrieval hits. + +### Fixed +- Knowledge ingest now sends chunk source metadata as Apollo `context` so `source_file` and `chunk_index` are available for neighbor retrieval. + ## [0.6.11] - 2026-05-06 ### Fixed diff --git a/lib/legion/extensions/knowledge/runners/ingest.rb b/lib/legion/extensions/knowledge/runners/ingest.rb index ff0629e..c87e6ad 100644 --- a/lib/legion/extensions/knowledge/runners/ingest.rb +++ b/lib/legion/extensions/knowledge/runners/ingest.rb @@ -250,18 +250,20 @@ def chunk_exists?(content_hash) def ingest_to_apollo(chunk, embedding) return unless defined?(Legion::Extensions::Apollo) + context = { + source_file: chunk[:source_file], + heading: chunk[:heading], + section_path: chunk[:section_path], + chunk_index: chunk[:chunk_index], + token_count: chunk[:token_count] + } payload = { content: chunk[:content], content_type: 'document_chunk', content_hash: chunk[:content_hash], tags: [chunk[:source_file], chunk[:heading], 'document_chunk'].compact.uniq, - metadata: { - source_file: chunk[:source_file], - heading: chunk[:heading], - section_path: chunk[:section_path], - chunk_index: chunk[:chunk_index], - token_count: chunk[:token_count] - } + context: context, + metadata: context } payload[:embedding] = embedding if embedding diff --git a/lib/legion/extensions/knowledge/runners/query.rb b/lib/legion/extensions/knowledge/runners/query.rb index 0c1269f..873dcdc 100644 --- a/lib/legion/extensions/knowledge/runners/query.rb +++ b/lib/legion/extensions/knowledge/runners/query.rb @@ -1,6 +1,9 @@ # frozen_string_literal: true +require_relative '../helpers/apollo_models' + require 'digest' +require 'json' module Legion module Extensions @@ -9,11 +12,17 @@ module Runners module Query # rubocop:disable Legion/Extension/RunnerIncludeHelpers module_function - def query(question:, top_k: nil, synthesize: true) + def query(question:, top_k: nil, synthesize: true, expand_neighbors: false, neighbor_radius: nil) started = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC) - resolved_k = top_k || settings_top_k || 5 + resolved_k = top_k || settings_top_k || 5 + resolved_radius = resolve_neighbor_radius(neighbor_radius) - chunks = retrieve_chunks(question, resolved_k) + chunks = retrieve_chunks( + question, + resolved_k, + expand_neighbors: expand_neighbors, + neighbor_radius: resolved_radius + ) answer = (synthesize_answer(question, chunks) if synthesize && llm_available?) @@ -39,9 +48,15 @@ def query(question:, top_k: nil, synthesize: true) { success: false, error: e.message } end - def retrieve(question:, top_k: nil) - resolved_k = top_k || settings_top_k || 5 - chunks = retrieve_chunks(question, resolved_k) + def retrieve(question:, top_k: nil, expand_neighbors: false, neighbor_radius: nil) + resolved_k = top_k || settings_top_k || 5 + resolved_radius = resolve_neighbor_radius(neighbor_radius) + chunks = retrieve_chunks( + question, + resolved_k, + expand_neighbors: expand_neighbors, + neighbor_radius: resolved_radius + ) { success: true, @@ -66,7 +81,7 @@ def record_feedback(question:, chunk_ids:, retrieval_score:, synthesized: true, { success: false, error: e.message } end - def retrieve_chunks(question, top_k) + def retrieve_chunks(question, top_k, expand_neighbors: false, neighbor_radius: 1) return [] unless defined?(Legion::Extensions::Apollo) result = Legion::Extensions::Apollo::Runners::Knowledge.retrieve_relevant( @@ -74,12 +89,78 @@ def retrieve_chunks(question, top_k) limit: top_k, tags: ['document_chunk'] ) - result.is_a?(Hash) && result[:success] ? Array(result[:entries]) : [] + chunks = result.is_a?(Hash) && result[:success] ? Array(result[:entries]) : [] + expand_neighbors ? expand_neighbor_chunks(chunks, neighbor_radius) : chunks rescue StandardError => _e [] end private_class_method :retrieve_chunks + def expand_neighbor_chunks(chunks, neighbor_radius) + return chunks if chunks.empty? + + radius = neighbor_radius.to_i + return chunks unless radius.positive? && Helpers::ApolloModels.entry_available? + + merge_neighbor_chunks(chunks.flat_map { |chunk| neighbor_window_for(chunk, radius) }) + rescue StandardError => _e + chunks + end + private_class_method :expand_neighbor_chunks + + def neighbor_window_for(chunk, radius) + context = chunk_context(chunk) + return [chunk] unless context[:source_file] && !context[:chunk_index].nil? + + source_file = context[:source_file] + chunk_index = context[:chunk_index].to_i + lower = chunk_index - radius + upper = chunk_index + radius + + rows = neighbor_dataset(source_file, lower, upper).all.map { |entry| chunk_from_entry(entry) } + rows << chunk unless rows.any? { |row| chunk_dedupe_key(row) == chunk_dedupe_key(chunk) } + rows.sort_by { |row| chunk_context(row)[:chunk_index].to_i } + rescue StandardError => _e + [chunk] + end + private_class_method :neighbor_window_for + + def neighbor_dataset(source_file, lower, upper) + Helpers::ApolloModels.entry + .where(content_type: 'document_chunk') + .where(Sequel.lit("source_context->>'source_file' = ?", source_file)) + .where(Sequel.lit("(source_context->>'chunk_index')::integer BETWEEN ? AND ?", lower, upper)) + .order(Sequel.lit("(source_context->>'chunk_index')::integer ASC")) + end + private_class_method :neighbor_dataset + + def chunk_from_entry(entry) + values = entry.respond_to?(:values) ? entry.values : entry + context = normalize_context(values[:source_context] || values[:metadata] || values[:context]) + + { + id: values[:id], + content: values[:content], + content_type: values[:content_type], + confidence: values[:confidence], + tags: values[:tags], + source_agent: values[:source_agent], + knowledge_domain: values[:knowledge_domain], + status: values[:status], + content_hash: values[:content_hash], + metadata: context + }.compact + end + private_class_method :chunk_from_entry + + def merge_neighbor_chunks(chunks) + chunks.each_with_object({}) do |chunk, merged| + key = chunk_dedupe_key(chunk) + merged[key] ||= chunk + end.values + end + private_class_method :merge_neighbor_chunks + def synthesize_answer(question, chunks) return nil unless llm_available? @@ -103,11 +184,59 @@ def format_source(chunk) content: chunk[:content], source_file: chunk.dig(:metadata, :source_file) || chunk[:source_file], heading: chunk.dig(:metadata, :heading) || chunk[:heading], + chunk_index: chunk.dig(:metadata, :chunk_index) || chunk[:chunk_index], distance: chunk[:distance] || chunk[:score] } end private_class_method :format_source + def chunk_context(chunk) + context = normalize_context(chunk[:metadata] || chunk[:source_context] || chunk[:context]) + if (context[:source_file].nil? || context[:chunk_index].nil?) && chunk[:id] && Helpers::ApolloModels.entry_available? + row = Helpers::ApolloModels.entry.where(id: chunk[:id]).first + context = context.merge(normalize_context(row_context(row))) if row + end + + context[:source_file] ||= chunk[:source_file] + context[:chunk_index] ||= chunk[:chunk_index] + context[:heading] ||= chunk[:heading] + context + rescue StandardError => _e + {} + end + private_class_method :chunk_context + + def row_context(row) + values = row.respond_to?(:values) ? row.values : row + values[:source_context] || values[:metadata] || values[:context] + end + private_class_method :row_context + + def normalize_context(context) + normalized = case context + when String + context.strip.empty? ? {} : ::JSON.parse(context, symbolize_names: true) + when Hash + context + else + {} + end + + normalized.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key } + rescue StandardError => _e + {} + end + private_class_method :normalize_context + + def chunk_dedupe_key(chunk) + chunk[:id] || chunk[:content_hash] || [ + chunk_context(chunk)[:source_file], + chunk_context(chunk)[:chunk_index], + chunk[:content] + ] + end + private_class_method :chunk_dedupe_key + def average_score(chunks) return nil if chunks.empty? @@ -178,6 +307,20 @@ def settings_top_k nil end private_class_method :settings_top_k + + def resolve_neighbor_radius(neighbor_radius) + (neighbor_radius || settings_neighbor_radius || 1).to_i + end + private_class_method :resolve_neighbor_radius + + def settings_neighbor_radius + return nil unless defined?(Legion::Settings) + + Legion::Settings.dig(:knowledge, :query, :neighbor_radius) + rescue StandardError => _e + nil + end + private_class_method :settings_neighbor_radius end end end diff --git a/lib/legion/extensions/knowledge/version.rb b/lib/legion/extensions/knowledge/version.rb index 427bb2d..83bf756 100644 --- a/lib/legion/extensions/knowledge/version.rb +++ b/lib/legion/extensions/knowledge/version.rb @@ -3,7 +3,7 @@ module Legion module Extensions module Knowledge - VERSION = '0.6.11' + VERSION = '0.6.12' end end end diff --git a/spec/legion/extensions/knowledge/runners/ingest_spec.rb b/spec/legion/extensions/knowledge/runners/ingest_spec.rb index b55f165..b188998 100644 --- a/spec/legion/extensions/knowledge/runners/ingest_spec.rb +++ b/spec/legion/extensions/knowledge/runners/ingest_spec.rb @@ -246,6 +246,41 @@ end end + describe '.ingest_to_apollo' do + let(:chunk) do + { + content: 'Chunk contents for Apollo context.', + content_hash: 'abcdef0123456789abcdef0123456789', + source_file: '/tmp/context.md', + heading: 'Context', + section_path: ['Context'], + chunk_index: 4, + token_count: 7 + } + end + + before do + stub_const('Legion::Extensions::Apollo', Module.new) + runners_mod = Module.new + knowledge_mod = Module.new + knowledge_mod.define_singleton_method(:handle_ingest) { |**| { success: true } } + runners_mod.const_set(:Knowledge, knowledge_mod) + stub_const('Legion::Extensions::Apollo::Runners', runners_mod) + end + + it 'passes chunk metadata as Apollo context for neighbor retrieval' do + expect(Legion::Extensions::Apollo::Runners::Knowledge).to receive(:handle_ingest).with( + hash_including( + content: chunk[:content], + context: hash_including(source_file: '/tmp/context.md', chunk_index: 4), + metadata: hash_including(source_file: '/tmp/context.md', chunk_index: 4) + ) + ).and_return({ success: true }) + + described_class.send(:ingest_to_apollo, chunk, nil) + end + end + describe '.ingest_corpus — delta behavior' do let(:tmp_dir) { Dir.mktmpdir } let(:store_dir) { Dir.mktmpdir } diff --git a/spec/legion/extensions/knowledge/runners/query_spec.rb b/spec/legion/extensions/knowledge/runners/query_spec.rb index a06d2e1..9bc60cc 100644 --- a/spec/legion/extensions/knowledge/runners/query_spec.rb +++ b/spec/legion/extensions/knowledge/runners/query_spec.rb @@ -80,6 +80,79 @@ end end + describe 'neighbor expansion' do + before do + stub_const('Legion::Extensions::Apollo', Module.new) + runners_mod = Module.new + knowledge_mod = Module.new + knowledge_mod.define_singleton_method(:retrieve_relevant) do |**| + { success: true, entries: [ + { id: 2, content: 'middle chunk', + metadata: { source_file: '/docs/a.md', chunk_index: 1 }, + distance: 0.1 } + ] } + end + runners_mod.const_set(:Knowledge, knowledge_mod) + stub_const('Legion::Extensions::Apollo::Runners', runners_mod) + allow(Legion::Extensions::Knowledge::Helpers::ApolloModels).to receive(:entry_available?).and_return(true) + end + + it 'leaves retrieval results unchanged by default' do + expect(described_class).not_to receive(:neighbor_window_for) + + result = described_class.retrieve(question: 'legion') + + expect(result[:success]).to be true + expect(result[:sources].map { |s| s[:content] }).to eq(['middle chunk']) + end + + it 'expands each hit with adjacent chunks when requested' do + neighbor_window = [ + { + id: 1, content: 'previous chunk', + metadata: { source_file: '/docs/a.md', chunk_index: 0 } + }, + { + id: 2, content: 'middle chunk', + metadata: { source_file: '/docs/a.md', chunk_index: 1 }, distance: 0.1 + }, + { + id: 3, content: 'next chunk', + metadata: { source_file: '/docs/a.md', chunk_index: 2 } + } + ] + allow(described_class).to receive(:neighbor_window_for).and_return(neighbor_window) + + result = described_class.retrieve(question: 'legion', expand_neighbors: true, neighbor_radius: 1) + + expect(result[:success]).to be true + expect(result[:sources].map { |s| s[:content] }).to eq(['previous chunk', 'middle chunk', 'next chunk']) + expect(result[:metadata][:chunk_count]).to eq(3) + end + + it 'deduplicates overlapping neighbor windows' do + allow(described_class).to receive(:neighbor_window_for).and_return( + [ + { id: 1, content: 'previous chunk', metadata: { source_file: '/docs/a.md', chunk_index: 0 } }, + { id: 2, content: 'middle chunk', metadata: { source_file: '/docs/a.md', chunk_index: 1 } } + ], + [ + { id: 2, content: 'middle chunk', metadata: { source_file: '/docs/a.md', chunk_index: 1 } }, + { id: 3, content: 'next chunk', metadata: { source_file: '/docs/a.md', chunk_index: 2 } } + ] + ) + + expanded = described_class.send(:merge_neighbor_chunks, [ + { id: 1, content: 'previous chunk' }, + { id: 2, content: 'middle chunk' }, + { id: 2, content: 'middle chunk duplicate' }, + { id: 3, content: 'next chunk' } + ]) + + expect(expanded.map { |chunk| chunk[:id] }).to eq([1, 2, 3]) + end + end + describe '.record_feedback' do it 'returns success with question_hash' do result = described_class.record_feedback( From 6c8bbc2db8d26d8e8ab8a3ab84052dc9656f836b Mon Sep 17 00:00:00 2001 From: Esity Date: Wed, 6 May 2026 12:41:07 -0500 Subject: [PATCH 3/4] Add Knowledge ingest filter prompt Refs #7 --- CHANGELOG.md | 5 + .../extensions/knowledge/runners/ingest.rb | 102 +++++++++++++++--- lib/legion/extensions/knowledge/version.rb | 2 +- .../knowledge/runners/ingest_spec.rb | 75 +++++++++++++ 4 files changed, 168 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da6b791..a0af5bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## [0.6.13] - 2026-05-06 + +### Added +- Knowledge ingest now supports optional LLM-based chunk filtering through `knowledge.ingest.filter_prompt`, with confidence thresholding, content-hash caching, and a runner-level `filter: false` bypass for no-filter ingest flows. + ## [0.6.12] - 2026-05-06 ### Added diff --git a/lib/legion/extensions/knowledge/runners/ingest.rb b/lib/legion/extensions/knowledge/runners/ingest.rb index c87e6ad..c185834 100644 --- a/lib/legion/extensions/knowledge/runners/ingest.rb +++ b/lib/legion/extensions/knowledge/runners/ingest.rb @@ -31,17 +31,27 @@ def scan_corpus(path:, extensions: nil) } end - def ingest_corpus(path: nil, monitors: nil, dry_run: false, force: false) - return ingest_monitors(monitors: monitors, dry_run: dry_run, force: force) if monitors&.any? + FILTER_SCHEMA = { + type: 'object', + properties: { + relevant: { type: 'boolean' }, + confidence: { type: 'number' }, + reason: { type: 'string' } + }, + required: %w[relevant confidence] + }.freeze + + def ingest_corpus(path: nil, monitors: nil, dry_run: false, force: false, filter: true) + return ingest_monitors(monitors: monitors, dry_run: dry_run, force: force, filter: filter) if monitors&.any? raise ArgumentError, 'path is required when monitors is not provided' if path.nil? - ingest_corpus_path(path: path, dry_run: dry_run, force: force) + ingest_corpus_path(path: path, dry_run: dry_run, force: force, filter: filter) rescue ArgumentError => e log.warn(e.message) { success: false, error: e.message } end - def ingest_corpus_path(path:, dry_run: false, force: false) + def ingest_corpus_path(path:, dry_run: false, force: false, filter: true) current = Helpers::Manifest.scan(path: path) previous = force ? [] : Helpers::ManifestStore.load(corpus_path: path) delta = Helpers::Manifest.diff(current: current, previous: previous) @@ -52,7 +62,7 @@ def ingest_corpus_path(path:, dry_run: false, force: false) chunks_updated = 0 to_process.each do |file_path| - result = process_file(file_path, dry_run: dry_run, force: force) + result = process_file(file_path, dry_run: dry_run, force: force, filter: filter) chunks_created += result[:created] chunks_skipped += result[:skipped] chunks_updated += result[:updated] @@ -78,9 +88,9 @@ def ingest_corpus_path(path:, dry_run: false, force: false) end private_class_method :ingest_corpus_path - def ingest_monitors(monitors:, dry_run: false, force: false) + def ingest_monitors(monitors:, dry_run: false, force: false, filter: true) results = monitors.map do |monitor| - ingest_corpus(path: monitor[:path], dry_run: dry_run, force: force) + ingest_corpus(path: monitor[:path], dry_run: dry_run, force: force, filter: filter) rescue StandardError => e log.warn(e.message) { success: false, path: monitor[:path], error: e.message } @@ -116,7 +126,7 @@ def ingest_content(content:, source_type: :text, metadata: {}) section_path: [source_type.to_s], source_file: source_path } - chunks = Helpers::Chunker.chunk(sections: [section]) + chunks = filter_chunks(Helpers::Chunker.chunk(sections: [section]), filter: true) paired = batch_embed_chunks(chunks, force: false) paired.each { |p| upsert_chunk_with_embedding(p[:chunk], p[:embedding], force: false, exists: p[:exists] || false) } { status: :ingested, chunks: chunks.size, source_type: source_type, metadata: metadata } @@ -125,8 +135,8 @@ def ingest_content(content:, source_type: :text, metadata: {}) { status: :failed, error: e.message, source_type: source_type, metadata: metadata } end - def ingest_file(file_path:, force: false) - result = process_file(file_path, dry_run: false, force: force) + def ingest_file(file_path:, force: false, filter: true) + result = process_file(file_path, dry_run: false, force: force, filter: filter) { success: true, @@ -140,19 +150,20 @@ def ingest_file(file_path:, force: false) { success: false, error: e.message } end - def process_file(file_path, dry_run: false, force: false) + def process_file(file_path, dry_run: false, force: false, filter: true) sections = Helpers::Parser.parse(file_path: file_path) return { created: 0, skipped: 0, updated: 0 } if sections.first&.key?(:error) - chunks = Helpers::Chunker.chunk(sections: sections) + chunks = Helpers::Chunker.chunk(sections: sections) + filtered_chunks = filter_chunks(chunks, filter: filter) paired = if dry_run - chunks.map { |c| { chunk: c, embedding: nil } } + filtered_chunks.map { |c| { chunk: c, embedding: nil } } else - batch_embed_chunks(chunks, force: force) + batch_embed_chunks(filtered_chunks, force: force) end created = 0 - skipped = 0 + skipped = chunks.size - filtered_chunks.size updated = 0 paired.each do |p| @@ -168,6 +179,67 @@ def process_file(file_path, dry_run: false, force: false) end private_class_method :process_file + def filter_chunks(chunks, filter:) + return chunks unless filter + + prompt = settings_filter_prompt + return chunks if prompt.to_s.strip.empty? || !llm_structured_available? + + chunks.select { |chunk| chunk_allowed_by_filter?(chunk, prompt: prompt, threshold: settings_filter_threshold) } + rescue StandardError => e + log.warn(e.message) + chunks + end + private_class_method :filter_chunks + + def chunk_allowed_by_filter?(chunk, prompt:, threshold:) + hash = chunk[:content_hash] || Helpers::Chunker.send(:apollo_compatible_content_hash, chunk[:content].to_s) + return filter_cache[hash] if filter_cache.key?(hash) + + result = Legion::LLM.structured( # rubocop:disable Legion/HelperMigration/DirectLlm + messages: [ + { role: 'system', content: prompt }, + { role: 'user', content: chunk[:content].to_s } + ], + schema: FILTER_SCHEMA, + caller: { extension: 'lex-knowledge', runner: 'ingest', operation: 'filter_chunk' } + ) + data = result.is_a?(Hash) ? (result[:data] || result) : {} + filter_cache[hash] = data[:relevant] == true && data[:confidence].to_f >= threshold.to_f + rescue StandardError => e + log.warn(e.message) + filter_cache[hash] = true + end + private_class_method :chunk_allowed_by_filter? + + def filter_cache + Thread.current[:lex_knowledge_filter_cache] ||= {} + end + private_class_method :filter_cache + + def llm_structured_available? + defined?(Legion::LLM) && Legion::LLM.respond_to?(:structured) + end + private_class_method :llm_structured_available? + + def settings_filter_prompt + return nil unless defined?(Legion::Settings) + + Legion::Settings.dig(:knowledge, :ingest, :filter_prompt) || Legion::Settings.dig(:knowledge, :filter_prompt) + rescue StandardError => _e + nil + end + private_class_method :settings_filter_prompt + + def settings_filter_threshold + return 0.5 unless defined?(Legion::Settings) + + Legion::Settings.dig(:knowledge, :ingest, :filter_threshold) || Legion::Settings.dig(:knowledge, :filter_threshold) || 0.5 + rescue StandardError => _e + 0.5 + end + private_class_method :settings_filter_threshold + def batch_embed_chunks(chunks, force:) exists_map = force ? {} : build_exists_map(chunks) return paired_without_embed(chunks, exists_map) unless llm_embed_available? diff --git a/lib/legion/extensions/knowledge/version.rb b/lib/legion/extensions/knowledge/version.rb index 83bf756..594c58f 100644 --- a/lib/legion/extensions/knowledge/version.rb +++ b/lib/legion/extensions/knowledge/version.rb @@ -3,7 +3,7 @@ module Legion module Extensions module Knowledge - VERSION = '0.6.12' + VERSION = '0.6.13' end end end diff --git a/spec/legion/extensions/knowledge/runners/ingest_spec.rb b/spec/legion/extensions/knowledge/runners/ingest_spec.rb index b188998..cb7f8e1 100644 --- a/spec/legion/extensions/knowledge/runners/ingest_spec.rb +++ b/spec/legion/extensions/knowledge/runners/ingest_spec.rb @@ -145,6 +145,81 @@ end end + describe 'LLM ingest filtering' do + let(:chunk_a) do + { content: 'Useful architecture notes', content_hash: 'hash_a', source_file: '/tmp/a.md', chunk_index: 0 } + end + let(:chunk_b) do + { content: 'Boilerplate navigation text', content_hash: 'hash_b', source_file: '/tmp/a.md', chunk_index: 1 } + end + let(:llm_double) { double('Legion::LLM') } + + before do + described_class.send(:filter_cache).clear + allow(described_class).to receive(:settings_filter_prompt).and_return('Keep useful architecture notes only.') + allow(described_class).to receive(:settings_filter_threshold).and_return(0.7) + stub_const('Legion::LLM', llm_double) + allow(llm_double).to receive(:respond_to?).with(:structured).and_return(true) + end + + it 'drops chunks whose structured filter result is not relevant enough' do + allow(llm_double).to receive(:structured).and_return( + { data: { relevant: true, confidence: 0.9, reason: 'useful' } }, + { data: { relevant: false, confidence: 0.2, reason: 'noise' } } + ) + + filtered = described_class.send(:filter_chunks, [chunk_a, chunk_b], filter: true) + + expect(filtered).to eq([chunk_a]) + end + + it 'caches filter results by content hash' do + duplicate = chunk_a.merge(content: 'Useful architecture notes repeated') + + allow(llm_double).to receive(:structured).and_return( + { data: { relevant: true, confidence: 0.9, reason: 'useful' } }, + { data: { relevant: false, confidence: 0.2, reason: 'noise' } } + ) + + filtered = described_class.send(:filter_chunks, [chunk_a, duplicate, chunk_b], filter: true) + + expect(filtered).to eq([chunk_a, duplicate]) + expect(llm_double).to have_received(:structured).twice + end + + it 'bypasses the LLM when filter is disabled' do + expect(llm_double).not_to receive(:structured) + + filtered = described_class.send(:filter_chunks, [chunk_a, chunk_b], filter: false) + + expect(filtered).to eq([chunk_a, chunk_b]) + end + + it 'counts filtered chunks as skipped before embedding' do + allow(Legion::Extensions::Knowledge::Helpers::Parser).to receive(:parse).and_return([{ content: 'parsed' }]) + allow(Legion::Extensions::Knowledge::Helpers::Chunker).to receive(:chunk).and_return([chunk_a, chunk_b]) + allow(described_class).to receive(:filter_chunks).and_return([chunk_a]) + allow(described_class).to receive(:batch_embed_chunks).with([chunk_a], force: false) + .and_return([{ chunk: chunk_a, embedding: nil, exists: false }]) + allow(described_class).to receive(:upsert_chunk_with_embedding).and_return(:created) + + result = described_class.send(:process_file, '/tmp/a.md', dry_run: false, force: false, filter: true) + + expect(result).to eq({ created: 1, skipped: 1, updated: 0 }) + end + + it 'accepts filter false on the public single-file ingest runner' do + allow(described_class).to receive(:process_file) + .with('/tmp/a.md', dry_run: false, force: false, filter: false) + .and_return({ created: 1, skipped: 0, updated: 0 }) + + result = described_class.ingest_file(file_path: '/tmp/a.md', filter: false) + + expect(result[:success]).to be true + expect(result[:chunks_created]).to eq(1) + end + end + describe '#ingest_content' do it 'accepts string content directly' do result = described_class.ingest_content( From 9aec6f910d659d9b7846f4c75f420cc7e4fadc2a Mon Sep 17 00:00:00 2001 From: Esity Date: Wed, 6 May 2026 13:41:44 -0500 Subject: [PATCH 4/4] Uplift Knowledge helpers and defaults Refs #7 Refs #10 --- CHANGELOG.md | 5 ++ lib/legion/extensions/knowledge.rb | 33 +++++++++ .../knowledge/actors/corpus_ingest.rb | 6 +- .../knowledge/actors/corpus_watcher.rb | 19 ++--- .../knowledge/actors/maintenance_runner.rb | 23 +++--- .../extensions/knowledge/helpers/chunker.rb | 25 ++----- .../extensions/knowledge/helpers/manifest.rb | 9 +-- .../knowledge/helpers/manifest_store.rb | 15 ++-- .../extensions/knowledge/helpers/parser.rb | 3 + .../extensions/knowledge/runners/corpus.rb | 3 + .../extensions/knowledge/runners/ingest.rb | 56 +++++---------- .../knowledge/runners/maintenance.rb | 71 ++++++++----------- .../extensions/knowledge/runners/monitor.rb | 37 +++++----- .../extensions/knowledge/runners/query.rb | 53 +++++++------- lib/legion/extensions/knowledge/version.rb | 2 +- .../knowledge/helpers/manifest_store_spec.rb | 4 +- .../knowledge/runners/ingest_spec.rb | 16 +++-- .../knowledge/runners/maintenance_spec.rb | 14 ++-- spec/spec_helper.rb | 4 ++ 19 files changed, 195 insertions(+), 203 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0af5bb..9ca6a79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## [0.6.14] - 2026-05-06 + +### Changed +- Knowledge defaults are now declared directly in `Knowledge.default_settings`, and helpers, runners, actors, and JSON sidecar persistence use Legion logging, settings, and JSON helpers end to end. + ## [0.6.13] - 2026-05-06 ### Added diff --git a/lib/legion/extensions/knowledge.rb b/lib/legion/extensions/knowledge.rb index 5e3f93c..e90a0d8 100644 --- a/lib/legion/extensions/knowledge.rb +++ b/lib/legion/extensions/knowledge.rb @@ -1,5 +1,8 @@ # frozen_string_literal: true +require 'legion/logging' +require 'legion/settings' +require 'legion/json' require_relative 'knowledge/version' require_relative 'knowledge/helpers/manifest' require_relative 'knowledge/helpers/manifest_store' @@ -28,11 +31,41 @@ module Legion module Extensions module Knowledge + extend Legion::Logging::Helper + extend Legion::Settings::Helper extend Legion::Extensions::Core if defined?(Legion::Extensions::Core) def self.remote_invocable? false end + + def self.default_settings + { + corpus_path: nil, + monitors: [], + chunker: { + max_tokens: 512, + overlap_tokens: 128 + }, + query: { + top_k: 5, + neighbor_radius: 1 + }, + ingest: { + filter_prompt: nil, + filter_threshold: 0.5 + }, + maintenance: { + stale_threshold: 0.3, + cold_chunk_days: 7, + quality_report_limit: 10 + }, + actors: { + watcher_interval: 300, + maintenance_interval: 21_600 + } + } + end end end end diff --git a/lib/legion/extensions/knowledge/actors/corpus_ingest.rb b/lib/legion/extensions/knowledge/actors/corpus_ingest.rb index 53d4ed9..a3918e5 100644 --- a/lib/legion/extensions/knowledge/actors/corpus_ingest.rb +++ b/lib/legion/extensions/knowledge/actors/corpus_ingest.rb @@ -5,6 +5,9 @@ module Extensions module Knowledge module Actor class CorpusIngest < Legion::Extensions::Actors::Subscription + include Legion::Logging::Helper + include Legion::Settings::Helper + def runner_class = 'Legion::Extensions::Knowledge::Runners::Ingest' def runner_function = 'ingest_file' def check_subtask? = false @@ -13,7 +16,8 @@ def generate_task? = false def enabled? # rubocop:disable Legion/Extension/ActorEnabledSideEffects Legion.const_defined?(:Transport, false) && defined?(Legion::Extensions::Knowledge::Runners::Ingest) - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.corpus_ingest.enabled') false end end diff --git a/lib/legion/extensions/knowledge/actors/corpus_watcher.rb b/lib/legion/extensions/knowledge/actors/corpus_watcher.rb index 546bdb3..547d50b 100644 --- a/lib/legion/extensions/knowledge/actors/corpus_watcher.rb +++ b/lib/legion/extensions/knowledge/actors/corpus_watcher.rb @@ -5,26 +5,25 @@ module Extensions module Knowledge module Actor class CorpusWatcher < Legion::Extensions::Actors::Every # rubocop:disable Legion/Extension/EveryActorRequiresTime + include Legion::Logging::Helper + include Legion::Settings::Helper + def runner_class = 'Legion::Extensions::Knowledge::Runners::Ingest' def runner_function = 'ingest_corpus' def check_subtask? = false def generate_task? = false def time - if defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil? - Legion::Settings.dig(:knowledge, :actors, :watcher_interval) || 300 - else - 300 - end + settings[:actors][:watcher_interval] rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.corpus_watcher.time') 300 end def enabled? # rubocop:disable Legion/Extension/ActorEnabledSideEffects resolve_monitors.any? rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.corpus_watcher.enabled') false end @@ -34,14 +33,10 @@ def args private - def log - Legion::Logging - end - def resolve_monitors Runners::Monitor.resolve_monitors rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.corpus_watcher.resolve_monitors') [] end end diff --git a/lib/legion/extensions/knowledge/actors/maintenance_runner.rb b/lib/legion/extensions/knowledge/actors/maintenance_runner.rb index 8646b71..4c598e2 100644 --- a/lib/legion/extensions/knowledge/actors/maintenance_runner.rb +++ b/lib/legion/extensions/knowledge/actors/maintenance_runner.rb @@ -5,19 +5,18 @@ module Extensions module Knowledge module Actor class MaintenanceRunner < Legion::Extensions::Actors::Every # rubocop:disable Legion/Extension/EveryActorRequiresTime + include Legion::Logging::Helper + include Legion::Settings::Helper + def runner_class = 'Legion::Extensions::Knowledge::Runners::Maintenance' def runner_function = 'health' def check_subtask? = false def generate_task? = false def time - if defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil? - Legion::Settings.dig(:knowledge, :actors, :maintenance_interval) || 21_600 - else - 21_600 - end + settings[:actors][:maintenance_interval] rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.maintenance_runner.time') 21_600 end @@ -26,7 +25,7 @@ def enabled? # rubocop:disable Legion/Extension/ActorEnabledSideEffects true rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.maintenance_runner.enabled') false end @@ -36,16 +35,10 @@ def args private - def log - Legion::Logging - end - def corpus_path - return nil unless defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil? - - Legion::Settings.dig(:knowledge, :corpus_path) + settings[:corpus_path] rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.maintenance_runner.corpus_path') nil end end diff --git a/lib/legion/extensions/knowledge/helpers/chunker.rb b/lib/legion/extensions/knowledge/helpers/chunker.rb index 435e669..099d7da 100644 --- a/lib/legion/extensions/knowledge/helpers/chunker.rb +++ b/lib/legion/extensions/knowledge/helpers/chunker.rb @@ -7,13 +7,16 @@ module Extensions module Knowledge module Helpers module Chunker + extend Legion::Logging::Helper + extend Legion::Settings::Helper + CHARS_PER_TOKEN = 4 module_function def chunk(sections:, max_tokens: nil, overlap_tokens: nil) - resolved_max = max_tokens || settings_max_tokens || 512 - resolved_overlap = overlap_tokens || settings_overlap_tokens || 128 + resolved_max = max_tokens || settings[:chunker][:max_tokens] + resolved_overlap = overlap_tokens || settings[:chunker][:overlap_tokens] max_chars = resolved_max * CHARS_PER_TOKEN overlap_chars = resolved_overlap * CHARS_PER_TOKEN @@ -89,24 +92,6 @@ def apollo_compatible_content_hash(content) end end private_class_method :apollo_compatible_content_hash - - def settings_max_tokens - return nil unless defined?(Legion::Settings) - - Legion::Settings.dig(:knowledge, :chunker, :max_tokens) - rescue StandardError => _e - nil - end - private_class_method :settings_max_tokens - - def settings_overlap_tokens - return nil unless defined?(Legion::Settings) - - Legion::Settings.dig(:knowledge, :chunker, :overlap_tokens) - rescue StandardError => _e - nil - end - private_class_method :settings_overlap_tokens end end end diff --git a/lib/legion/extensions/knowledge/helpers/manifest.rb b/lib/legion/extensions/knowledge/helpers/manifest.rb index 859d935..30eb686 100644 --- a/lib/legion/extensions/knowledge/helpers/manifest.rb +++ b/lib/legion/extensions/knowledge/helpers/manifest.rb @@ -7,6 +7,8 @@ module Extensions module Knowledge module Helpers module Manifest + extend Legion::Logging::Helper + module_function def scan(path:, extensions: %w[.md .txt .docx .pdf]) @@ -25,15 +27,10 @@ def walk(entry, extensions, results) results << build_entry(entry) end rescue Errno::EPERM, Errno::EACCES, Errno::ELOOP, Errno::ENOENT => e - log.debug("[manifest] skipping unreadable #{entry}: #{e.class}: #{e.message}") + handle_exception(e, level: :warn, operation: 'knowledge.manifest.walk', entry: entry) end private_class_method :walk - def log - Legion::Logging - end - private_class_method :log - def diff(current:, previous:) current_map = current.to_h { |e| [e[:path], e[:sha256]] } previous_map = previous.to_h { |e| [e[:path], e[:sha256]] } diff --git a/lib/legion/extensions/knowledge/helpers/manifest_store.rb b/lib/legion/extensions/knowledge/helpers/manifest_store.rb index 1029fb9..b5c52a1 100644 --- a/lib/legion/extensions/knowledge/helpers/manifest_store.rb +++ b/lib/legion/extensions/knowledge/helpers/manifest_store.rb @@ -2,7 +2,7 @@ require 'digest' require 'fileutils' -require 'json' +require 'legion/json' require 'tempfile' module Legion @@ -10,6 +10,9 @@ module Extensions module Knowledge module Helpers module ManifestStore + extend Legion::Logging::Helper + extend Legion::JSON::Helper + module_function STORE_DIR = ::File.expand_path('~/.legionio/knowledge').freeze @@ -19,8 +22,9 @@ def load(corpus_path:) return [] unless ::File.exist?(path) raw = ::File.read(path, encoding: 'utf-8') - ::JSON.parse(raw, symbolize_names: true) - rescue StandardError => _e + json_parse(raw) + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.manifest_store.load', corpus_path: corpus_path) [] end @@ -28,10 +32,11 @@ def save(corpus_path:, manifest:) ::FileUtils.mkdir_p(STORE_DIR) path = store_path(corpus_path: corpus_path) tmp = "#{path}.tmp" - ::File.write(tmp, ::JSON.generate(manifest.map { |e| serialize_entry(e) })) + ::File.write(tmp, json_generate(manifest.map { |e| serialize_entry(e) })) ::File.rename(tmp, path) true - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.manifest_store.save', corpus_path: corpus_path) false end diff --git a/lib/legion/extensions/knowledge/helpers/parser.rb b/lib/legion/extensions/knowledge/helpers/parser.rb index 0083529..b43a2b2 100644 --- a/lib/legion/extensions/knowledge/helpers/parser.rb +++ b/lib/legion/extensions/knowledge/helpers/parser.rb @@ -5,6 +5,8 @@ module Extensions module Knowledge module Helpers module Parser + extend Legion::Logging::Helper + module_function def parse(file_path:) @@ -57,6 +59,7 @@ def extract_via_data(file_path:) heading = ::File.basename(file_path, '.*') [{ heading: heading, section_path: [], content: result[:text].strip, source_file: file_path }] rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.parser.extract_via_data', file_path: file_path) [{ error: 'extraction_failed', source_file: file_path, detail: e.message }] end diff --git a/lib/legion/extensions/knowledge/runners/corpus.rb b/lib/legion/extensions/knowledge/runners/corpus.rb index 1817f1b..d5ff007 100644 --- a/lib/legion/extensions/knowledge/runners/corpus.rb +++ b/lib/legion/extensions/knowledge/runners/corpus.rb @@ -5,6 +5,8 @@ module Extensions module Knowledge module Runners module Corpus # rubocop:disable Legion/Extension/RunnerIncludeHelpers + extend Legion::Logging::Helper + module_function def manifest_path(path:) @@ -32,6 +34,7 @@ def corpus_stats(path:, extensions: nil) total_bytes: entries.sum { |e| e[:size] } } rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.corpus.corpus_stats', path: path) { success: false, error: e.message } end end diff --git a/lib/legion/extensions/knowledge/runners/ingest.rb b/lib/legion/extensions/knowledge/runners/ingest.rb index c185834..c4777d8 100644 --- a/lib/legion/extensions/knowledge/runners/ingest.rb +++ b/lib/legion/extensions/knowledge/runners/ingest.rb @@ -9,12 +9,10 @@ module Extensions module Knowledge module Runners module Ingest # rubocop:disable Legion/Extension/RunnerIncludeHelpers - module_function + extend Legion::Logging::Helper + extend Legion::Settings::Helper - def log - Legion::Logging - end - private_class_method :log + module_function def scan_corpus(path:, extensions: nil) opts = { path: path } @@ -47,7 +45,7 @@ def ingest_corpus(path: nil, monitors: nil, dry_run: false, force: false, filter ingest_corpus_path(path: path, dry_run: dry_run, force: force, filter: filter) rescue ArgumentError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.ingest.ingest_corpus') { success: false, error: e.message } end @@ -83,7 +81,7 @@ def ingest_corpus_path(path:, dry_run: false, force: false, filter: true) chunks_updated: chunks_updated } rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.ingest.ingest_corpus_path', path: path) { success: false, error: e.message } end private_class_method :ingest_corpus_path @@ -92,7 +90,7 @@ def ingest_monitors(monitors:, dry_run: false, force: false, filter: true) results = monitors.map do |monitor| ingest_corpus(path: monitor[:path], dry_run: dry_run, force: force, filter: filter) rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.ingest.ingest_monitor', path: monitor[:path]) { success: false, path: monitor[:path], error: e.message } end @@ -113,7 +111,7 @@ def ingest_monitors(monitors:, dry_run: false, force: false, filter: true) { success: true, monitors_processed: results.size, **total } rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.ingest.ingest_monitors') { success: false, error: e.message } end private_class_method :ingest_monitors @@ -131,7 +129,7 @@ def ingest_content(content:, source_type: :text, metadata: {}) paired.each { |p| upsert_chunk_with_embedding(p[:chunk], p[:embedding], force: false, exists: p[:exists] || false) } { status: :ingested, chunks: chunks.size, source_type: source_type, metadata: metadata } rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.ingest.ingest_content', source_type: source_type) { status: :failed, error: e.message, source_type: source_type, metadata: metadata } end @@ -146,7 +144,7 @@ def ingest_file(file_path:, force: false, filter: true) chunks_updated: result[:updated] } rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.ingest.ingest_file', file_path: file_path) { success: false, error: e.message } end @@ -182,12 +180,12 @@ def process_file(file_path, dry_run: false, force: false, filter: true) def filter_chunks(chunks, filter:) return chunks unless filter - prompt = settings_filter_prompt + prompt = settings[:ingest][:filter_prompt] return chunks if prompt.to_s.strip.empty? || !llm_structured_available? - chunks.select { |chunk| chunk_allowed_by_filter?(chunk, prompt: prompt, threshold: settings_filter_threshold) } + chunks.select { |chunk| chunk_allowed_by_filter?(chunk, prompt: prompt, threshold: settings[:ingest][:filter_threshold]) } rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.ingest.filter_chunks') chunks end private_class_method :filter_chunks @@ -207,7 +205,7 @@ def chunk_allowed_by_filter?(chunk, prompt:, threshold:) data = result.is_a?(Hash) ? (result[:data] || result) : {} filter_cache[hash] = data[:relevant] == true && data[:confidence].to_f >= threshold.to_f rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.ingest.filter_chunk', content_hash: hash) filter_cache[hash] = true end private_class_method :chunk_allowed_by_filter? @@ -222,24 +220,6 @@ def llm_structured_available? end private_class_method :llm_structured_available? - def settings_filter_prompt - return nil unless defined?(Legion::Settings) - - Legion::Settings.dig(:knowledge, :ingest, :filter_prompt) || Legion::Settings.dig(:knowledge, :filter_prompt) - rescue StandardError => _e - nil - end - private_class_method :settings_filter_prompt - - def settings_filter_threshold - return 0.5 unless defined?(Legion::Settings) - - Legion::Settings.dig(:knowledge, :ingest, :filter_threshold) || Legion::Settings.dig(:knowledge, :filter_threshold) || 0.5 - rescue StandardError => _e - 0.5 - end - private_class_method :settings_filter_threshold - def batch_embed_chunks(chunks, force:) exists_map = force ? {} : build_exists_map(chunks) return paired_without_embed(chunks, exists_map) unless llm_embed_available? @@ -249,7 +229,7 @@ def batch_embed_chunks(chunks, force:) chunks.map { |c| { chunk: c, embedding: embed_map[c[:content_hash]], exists: exists_map.fetch(c[:content_hash], false) } } rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.ingest.batch_embed_chunks') paired_without_embed(chunks, {}) end private_class_method :batch_embed_chunks @@ -275,7 +255,7 @@ def build_embed_map(needs_embed) h[needs_embed[r[:index]][:content_hash]] = r[:vector] unless r[:error] end rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.ingest.build_embed_map') {} end private_class_method :build_embed_map @@ -302,7 +282,7 @@ def upsert_chunk_with_embedding(chunk, embedding, dry_run: false, force: false, end force ? :updated : :created rescue StandardError => e - log.warn("[knowledge][upsert_chunk] unexpected error class=#{e.class} message=#{e.message} chunk_hash=#{chunk[:content_hash]&.slice(0, 12)}") + handle_exception(e, level: :warn, operation: 'knowledge.ingest.upsert_chunk', content_hash: chunk[:content_hash]&.slice(0, 12)) :skipped end private_class_method :upsert_chunk_with_embedding @@ -314,7 +294,7 @@ def chunk_exists?(content_hash) .where(content_hash: content_hash) .any? rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.ingest.chunk_exists', content_hash: content_hash) false end private_class_method :chunk_exists? @@ -354,7 +334,7 @@ def retire_file(file_path:) metadata: { source_file: file_path, retired: true } ) rescue StandardError => e - log.warn(e.message) + handle_exception(e, level: :warn, operation: 'knowledge.ingest.retire_file', file_path: file_path) nil end private_class_method :retire_file diff --git a/lib/legion/extensions/knowledge/runners/maintenance.rb b/lib/legion/extensions/knowledge/runners/maintenance.rb index 448084d..4c8a154 100644 --- a/lib/legion/extensions/knowledge/runners/maintenance.rb +++ b/lib/legion/extensions/knowledge/runners/maintenance.rb @@ -7,6 +7,9 @@ module Extensions module Knowledge module Runners module Maintenance # rubocop:disable Legion/Extension/RunnerIncludeHelpers + extend Legion::Logging::Helper + extend Legion::Settings::Helper + module_function def detect_orphans(path:) @@ -23,6 +26,7 @@ def detect_orphans(path:) total_manifest_files: manifest_files.size } rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.maintenance.detect_orphans', path: path) { success: false, error: e.message } end @@ -36,6 +40,7 @@ def cleanup_orphans(path:, dry_run: true) { success: true, archived: archived, files_cleaned: detection[:orphan_files].size, dry_run: false } rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.maintenance.cleanup_orphans', path: path) { success: false, error: e.message } end @@ -45,11 +50,12 @@ def reindex(path:) Runners::Ingest.ingest_corpus(path: path, force: true) rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.maintenance.reindex', path: path) { success: false, error: e.message } end def health(path:) - resolved = path || (Legion::Settings.dig(:knowledge, :corpus_path) if defined?(Legion::Settings)) + resolved = path || settings[:corpus_path] return { success: false, error: 'corpus_path is required' } if resolved.nil? || resolved.to_s.empty? scan_entries = Helpers::Manifest.scan(path: resolved) @@ -64,11 +70,12 @@ def health(path:) sync: build_sync_stats(resolved, scan_entries) } rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.maintenance.health', path: path) { success: false, error: e.message } end def quality_report(limit: nil) - resolved_limit = limit || settings_quality_limit + resolved_limit = limit || settings[:maintenance][:quality_report_limit] { success: true, @@ -79,6 +86,7 @@ def quality_report(limit: nil) summary: quality_summary } rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.maintenance.quality_report') { success: false, error: e.message } end @@ -104,7 +112,8 @@ def build_apollo_stats rows = base.select(:confidence, :status, :access_count, :embedding, :created_at).all apollo_stats_from_rows(base, rows, total) - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.maintenance.build_apollo_stats') apollo_defaults end private_class_method :build_apollo_stats @@ -112,7 +121,7 @@ def build_apollo_stats def apollo_stats_from_rows(base, rows, total) confidences = rows.map { |r| r[:confidence].to_f } with_embeddings = rows.count { |r| !r[:embedding].nil? } - stale_threshold = settings_stale_threshold + stale_threshold = settings[:maintenance][:stale_threshold] timestamps = rows.map { |r| r[:created_at] } { @@ -173,7 +182,8 @@ def load_apollo_source_files .select_map(Sequel.lit("source_context->>'source_file'")) .compact .uniq - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.maintenance.load_apollo_source_files') [] end private_class_method :load_apollo_source_files @@ -185,7 +195,8 @@ def count_apollo_chunks .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) .exclude(status: 'archived') .count - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.maintenance.count_apollo_chunks') 0 end private_class_method :count_apollo_chunks @@ -213,7 +224,8 @@ def hot_chunks(limit) .select_map([:id, :access_count, :confidence, Sequel.lit("source_context->>'source_file' AS source_file")]) .map { |r| { id: r[0], access_count: r[1], confidence: r[2], source_file: r[3] } } - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.maintenance.hot_chunks') [] end private_class_method :hot_chunks @@ -221,7 +233,7 @@ def hot_chunks(limit) def cold_chunks(limit) return [] unless Helpers::ApolloModels.entry_available? - days = settings_cold_chunk_days + days = settings[:maintenance][:cold_chunk_days] cutoff = Time.now - (days * 86_400) Helpers::ApolloModels.entry @@ -234,7 +246,8 @@ def cold_chunks(limit) .select_map([:id, :confidence, :created_at, Sequel.lit("source_context->>'source_file' AS source_file")]) .map { |r| { id: r[0], confidence: r[1], created_at: r[2]&.iso8601, source_file: r[3] } } - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.maintenance.cold_chunks') [] end private_class_method :cold_chunks @@ -242,7 +255,7 @@ def cold_chunks(limit) def low_confidence_chunks(limit) return [] unless Helpers::ApolloModels.entry_available? - threshold = settings_stale_threshold + threshold = settings[:maintenance][:stale_threshold] Helpers::ApolloModels.entry .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk']))) @@ -253,7 +266,8 @@ def low_confidence_chunks(limit) .select_map([:id, :confidence, :access_count, Sequel.lit("source_context->>'source_file' AS source_file")]) .map { |r| { id: r[0], confidence: r[1], access_count: r[2], source_file: r[3] } } - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.maintenance.low_confidence_chunks') [] end private_class_method :low_confidence_chunks @@ -271,9 +285,10 @@ def quality_summary total_queries: query_count, avg_retrieval_score: nil, chunks_never_accessed: base.where(access_count: 0).count, - chunks_below_threshold: base.where { confidence < settings_stale_threshold }.count + chunks_below_threshold: base.where { confidence < settings[:maintenance][:stale_threshold] }.count } - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.maintenance.quality_summary') defaults end private_class_method :quality_summary @@ -282,37 +297,11 @@ def query_count return 0 unless Helpers::ApolloModels.access_log_available? Helpers::ApolloModels.access_log.where(action: 'knowledge_query').count - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.maintenance.query_count') 0 end private_class_method :query_count - - def settings_stale_threshold - return 0.3 unless defined?(Legion::Settings) - - Legion::Settings.dig(:knowledge, :maintenance, :stale_threshold) || 0.3 - rescue StandardError => _e - 0.3 - end - private_class_method :settings_stale_threshold - - def settings_cold_chunk_days - return 7 unless defined?(Legion::Settings) - - Legion::Settings.dig(:knowledge, :maintenance, :cold_chunk_days) || 7 - rescue StandardError => _e - 7 - end - private_class_method :settings_cold_chunk_days - - def settings_quality_limit - return 10 unless defined?(Legion::Settings) - - Legion::Settings.dig(:knowledge, :maintenance, :quality_report_limit) || 10 - rescue StandardError => _e - 10 - end - private_class_method :settings_quality_limit end end end diff --git a/lib/legion/extensions/knowledge/runners/monitor.rb b/lib/legion/extensions/knowledge/runners/monitor.rb index 7f940a2..d0b55e6 100644 --- a/lib/legion/extensions/knowledge/runners/monitor.rb +++ b/lib/legion/extensions/knowledge/runners/monitor.rb @@ -5,6 +5,9 @@ module Extensions module Knowledge module Runners module Monitor # rubocop:disable Legion/Extension/RunnerIncludeHelpers + extend Legion::Logging::Helper + extend Legion::Settings::Helper + module_function DEFAULT_EXTENSIONS = %w[.md .txt].freeze @@ -18,7 +21,8 @@ def resolve_monitors end monitors - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.monitor.resolve_monitors') [] end @@ -41,6 +45,7 @@ def add_monitor(path:, extensions: nil, label: nil) { success: true, monitor: entry } rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.monitor.add_monitor', path: path) { success: false, error: e.message } end @@ -54,12 +59,14 @@ def remove_monitor(identifier:) { success: true, removed: found } rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.monitor.remove_monitor', identifier: identifier) { success: false, error: e.message } end def list_monitors { success: true, monitors: resolve_monitors } rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.monitor.list_monitors') { success: false, error: e.message } end @@ -70,44 +77,40 @@ def monitor_status monitors.each do |m| scan = Helpers::Manifest.scan(path: m[:path], extensions: m[:extensions]) total_files += scan.size - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.monitor.scan_monitor', path: m[:path]) next end { success: true, total_monitors: monitors.size, total_files: total_files } rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.monitor.monitor_status') { success: false, error: e.message } end # --- private helpers --- def read_monitors_setting - return nil unless defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil? - - Legion::Settings.dig(:knowledge, :monitors) - rescue StandardError => _e + settings[:monitors] + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.monitor.read_monitors_setting') nil end private_class_method :read_monitors_setting def read_legacy_corpus_path - return nil unless defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil? - - Legion::Settings.dig(:knowledge, :corpus_path) - rescue StandardError => _e + settings[:corpus_path] + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.monitor.read_legacy_corpus_path') nil end private_class_method :read_legacy_corpus_path def persist_monitors(monitors) - return false unless defined?(Legion::Settings) - - loader = Legion::Settings.loader - knowledge = loader.settings[:knowledge] || {} - knowledge[:monitors] = monitors - loader.settings[:knowledge] = knowledge + settings[:monitors] = monitors true - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.monitor.persist_monitors') false end private_class_method :persist_monitors diff --git a/lib/legion/extensions/knowledge/runners/query.rb b/lib/legion/extensions/knowledge/runners/query.rb index 873dcdc..3df5626 100644 --- a/lib/legion/extensions/knowledge/runners/query.rb +++ b/lib/legion/extensions/knowledge/runners/query.rb @@ -3,18 +3,21 @@ require_relative '../helpers/apollo_models' require 'digest' -require 'json' module Legion module Extensions module Knowledge module Runners module Query # rubocop:disable Legion/Extension/RunnerIncludeHelpers + extend Legion::Logging::Helper + extend Legion::JSON::Helper + extend Legion::Settings::Helper + module_function def query(question:, top_k: nil, synthesize: true, expand_neighbors: false, neighbor_radius: nil) started = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC) - resolved_k = top_k || settings_top_k || 5 + resolved_k = top_k || settings[:query][:top_k] resolved_radius = resolve_neighbor_radius(neighbor_radius) chunks = retrieve_chunks( @@ -45,11 +48,12 @@ def query(question:, top_k: nil, synthesize: true, expand_neighbors: false, neig metadata: build_metadata(chunks, score, latency_ms) } rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.query.query') { success: false, error: e.message } end def retrieve(question:, top_k: nil, expand_neighbors: false, neighbor_radius: nil) - resolved_k = top_k || settings_top_k || 5 + resolved_k = top_k || settings[:query][:top_k] resolved_radius = resolve_neighbor_radius(neighbor_radius) chunks = retrieve_chunks( question, @@ -64,6 +68,7 @@ def retrieve(question:, top_k: nil, expand_neighbors: false, neighbor_radius: ni metadata: build_metadata(chunks, average_score(chunks)) } rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.query.retrieve') { success: false, error: e.message } end @@ -78,6 +83,7 @@ def record_feedback(question:, chunk_ids:, retrieval_score:, synthesized: true, ) { success: true, question_hash: question_hash, rating: rating } rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.query.record_feedback') { success: false, error: e.message } end @@ -91,7 +97,8 @@ def retrieve_chunks(question, top_k, expand_neighbors: false, neighbor_radius: 1 ) chunks = result.is_a?(Hash) && result[:success] ? Array(result[:entries]) : [] expand_neighbors ? expand_neighbor_chunks(chunks, neighbor_radius) : chunks - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.query.retrieve_chunks') [] end private_class_method :retrieve_chunks @@ -103,7 +110,8 @@ def expand_neighbor_chunks(chunks, neighbor_radius) return chunks unless radius.positive? && Helpers::ApolloModels.entry_available? merge_neighbor_chunks(chunks.flat_map { |chunk| neighbor_window_for(chunk, radius) }) - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.query.expand_neighbor_chunks') chunks end private_class_method :expand_neighbor_chunks @@ -120,7 +128,8 @@ def neighbor_window_for(chunk, radius) rows = neighbor_dataset(source_file, lower, upper).all.map { |entry| chunk_from_entry(entry) } rows << chunk unless rows.any? { |row| chunk_dedupe_key(row) == chunk_dedupe_key(chunk) } rows.sort_by { |row| chunk_context(row)[:chunk_index].to_i } - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.query.neighbor_window') [chunk] end private_class_method :neighbor_window_for @@ -175,6 +184,7 @@ def synthesize_answer(question, chunks) result = llm_chat(message: prompt, caller: { extension: 'lex-knowledge' }) result.is_a?(Hash) ? result[:content] : result rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.query.synthesize_answer') "Error generating answer: #{e.message}" end private_class_method :synthesize_answer @@ -201,7 +211,8 @@ def chunk_context(chunk) context[:chunk_index] ||= chunk[:chunk_index] context[:heading] ||= chunk[:heading] context - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.query.chunk_context') {} end private_class_method :chunk_context @@ -215,7 +226,7 @@ def row_context(row) def normalize_context(context) normalized = case context when String - context.strip.empty? ? {} : ::JSON.parse(context, symbolize_names: true) + context.strip.empty? ? {} : json_parse(context) when Hash context else @@ -223,7 +234,8 @@ def normalize_context(context) end normalized.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key } - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.query.normalize_context') {} end private_class_method :normalize_context @@ -289,7 +301,8 @@ def emit_feedback_event(question_hash:, chunk_ids:, retrieval_score:, synthesize synthesized: synthesized, rating: rating }) - rescue StandardError => _e + rescue StandardError => e + handle_exception(e, level: :warn, operation: 'knowledge.query.emit_feedback_event') nil end private_class_method :emit_feedback_event @@ -299,28 +312,10 @@ def llm_available? end private_class_method :llm_available? - def settings_top_k - return nil unless defined?(Legion::Settings) - - Legion::Settings.dig(:knowledge, :query, :top_k) - rescue StandardError => _e - nil - end - private_class_method :settings_top_k - def resolve_neighbor_radius(neighbor_radius) - (neighbor_radius || settings_neighbor_radius || 1).to_i + (neighbor_radius || settings[:query][:neighbor_radius]).to_i end private_class_method :resolve_neighbor_radius - - def settings_neighbor_radius - return nil unless defined?(Legion::Settings) - - Legion::Settings.dig(:knowledge, :query, :neighbor_radius) - rescue StandardError => _e - nil - end - private_class_method :settings_neighbor_radius end end end diff --git a/lib/legion/extensions/knowledge/version.rb b/lib/legion/extensions/knowledge/version.rb index 594c58f..6c55127 100644 --- a/lib/legion/extensions/knowledge/version.rb +++ b/lib/legion/extensions/knowledge/version.rb @@ -3,7 +3,7 @@ module Legion module Extensions module Knowledge - VERSION = '0.6.13' + VERSION = '0.6.14' end end end diff --git a/spec/legion/extensions/knowledge/helpers/manifest_store_spec.rb b/spec/legion/extensions/knowledge/helpers/manifest_store_spec.rb index 26610f1..ae334b8 100644 --- a/spec/legion/extensions/knowledge/helpers/manifest_store_spec.rb +++ b/spec/legion/extensions/knowledge/helpers/manifest_store_spec.rb @@ -60,8 +60,8 @@ t = Time.now manifest = [{ path: '/b.md', size: 20, mtime: t, sha256: 'zzz' }] store.save(corpus_path: corpus_path, manifest: manifest) - raw = JSON.parse(File.read(store.store_path(corpus_path: corpus_path))) - expect(raw.first['mtime']).to be_a(String) + raw = Legion::JSON.load(File.read(store.store_path(corpus_path: corpus_path))) + expect(raw.first[:mtime]).to be_a(String) end it 'survives roundtrip with Time mtime (mtime comes back as string)' do diff --git a/spec/legion/extensions/knowledge/runners/ingest_spec.rb b/spec/legion/extensions/knowledge/runners/ingest_spec.rb index cb7f8e1..fe9995d 100644 --- a/spec/legion/extensions/knowledge/runners/ingest_spec.rb +++ b/spec/legion/extensions/knowledge/runners/ingest_spec.rb @@ -156,8 +156,9 @@ before do described_class.send(:filter_cache).clear - allow(described_class).to receive(:settings_filter_prompt).and_return('Keep useful architecture notes only.') - allow(described_class).to receive(:settings_filter_threshold).and_return(0.7) + allow(described_class).to receive(:settings).and_return( + ingest: { filter_prompt: 'Keep useful architecture notes only.', filter_threshold: 0.7 } + ) stub_const('Legion::LLM', llm_double) allow(llm_double).to receive(:respond_to?).with(:structured).and_return(true) end @@ -293,28 +294,31 @@ it 'returns :skipped and emits a warn log when handle_ingest returns success: false' do allow(described_class).to receive(:ingest_to_apollo) .and_return({ success: false, error: 'PG::StringDataRightTruncation' }) - expect(Legion::Logging).to receive(:warn).with(/apollo persistence not confirmed/) + expect(described_class.log).to receive(:warn).with(/apollo persistence not confirmed/) outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) expect(outcome).to eq(:skipped) end it 'returns :skipped when handle_ingest returns a non-Hash result' do allow(described_class).to receive(:ingest_to_apollo).and_return(nil) - expect(Legion::Logging).to receive(:warn).with(/non-hash result class=NilClass/) + expect(described_class.log).to receive(:warn).with(/non-hash result class=NilClass/) outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) expect(outcome).to eq(:skipped) end it 'returns :skipped when handle_ingest returns a hash without success' do allow(described_class).to receive(:ingest_to_apollo).and_return({ entry_id: 42 }) - expect(Legion::Logging).to receive(:warn).with(/apollo persistence not confirmed/) + expect(described_class.log).to receive(:warn).with(/apollo persistence not confirmed/) outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) expect(outcome).to eq(:skipped) end it 'returns :skipped and logs when ingest_to_apollo raises' do allow(described_class).to receive(:ingest_to_apollo).and_raise(StandardError, 'boom') - expect(Legion::Logging).to receive(:warn).with(/unexpected error/) + expect(described_class).to receive(:handle_exception).with( + instance_of(StandardError), + hash_including(level: :warn, operation: 'knowledge.ingest.upsert_chunk') + ) outcome = described_class.send(:upsert_chunk_with_embedding, chunk, embedding) expect(outcome).to eq(:skipped) end diff --git a/spec/legion/extensions/knowledge/runners/maintenance_spec.rb b/spec/legion/extensions/knowledge/runners/maintenance_spec.rb index ae6347c..7043da0 100644 --- a/spec/legion/extensions/knowledge/runners/maintenance_spec.rb +++ b/spec/legion/extensions/knowledge/runners/maintenance_spec.rb @@ -295,22 +295,16 @@ end it 'uses settings corpus_path when path is nil and settings are present' do - stub_const('Legion::Settings', Module.new do - def self.dig(*keys) - { knowledge: { corpus_path: nil } }.dig(*keys) - end - end) + allow(described_class).to receive(:settings).and_return(Legion::Extensions::Knowledge.default_settings) result = described_class.health(path: nil) expect(result[:success]).to be false expect(result[:error]).to eq('corpus_path is required') end it 'falls back to settings corpus_path when set' do - stub_const('Legion::Settings', Module.new do - def self.dig(*_keys) - Dir.pwd - end - end) + allow(described_class).to receive(:settings).and_return( + Legion::Extensions::Knowledge.default_settings.merge(corpus_path: Dir.pwd) + ) result = described_class.health(path: nil) expect(result[:success]).to be true expect(result).to include(:local, :apollo, :sync) diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 94b8f34..00d251d 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,6 +1,8 @@ # frozen_string_literal: true require 'bundler/setup' +require 'legion/json' +require 'legion/settings' require 'tmpdir' require 'fileutils' @@ -37,6 +39,8 @@ class Message; end require 'legion/extensions/knowledge' +Legion::Settings[:extensions][:knowledge] = Legion::Extensions::Knowledge.default_settings + RSpec.configure do |config| config.example_status_persistence_file_path = '.rspec_status' config.disable_monkey_patching!