From 0e705d88a5387701f054d11fd7e86e8e1a1e746c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 11 May 2026 04:02:32 +0000 Subject: [PATCH 1/2] Add sheet filtering support to encoder API and CLI Agent-Logs-Url: https://github.com/kingkillery/Spreadsheet_LLM_Encoder/sessions/d65e2e40-c663-464e-ab77-abc9bb5da70f Co-authored-by: kingkillery <200727508+kingkillery@users.noreply.github.com> --- README.md | 15 ++- Spreadsheet_LLM_Encoder.py | 244 +++++++++++++++++++++++++++++++++++- run_qa_evaluation.py | 5 + test_spreadsheet_encoder.py | 63 ++++++++++ 4 files changed, 320 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index a345625..d28cfa1 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,12 @@ Parameters: - `--max-cols-per-sheet`: Bounded mode column cap for very large sheets (optional) - `--max-cells-per-sheet`: Bounded mode cell cap per sheet after row/column caps are applied (optional) - `--sheet-limit-action`: Behavior for sheets over configured caps: `truncate`, `skip`, or `error` (default=`truncate`) +- `--include-sheet`: Include only this exact sheet name (repeatable) +- `--exclude-sheet`: Exclude this exact sheet name (repeatable) +- `--include-sheet-glob`: Include sheets matching this glob pattern (repeatable) +- `--exclude-sheet-glob`: Exclude sheets matching this glob pattern (repeatable) +- `--include-sheet-regex`: Include sheets matching this regex pattern (repeatable) +- `--exclude-sheet-regex`: Exclude sheets matching this regex pattern (repeatable) The CLI prints compression ratios for each sheet and overall to stdout. These metrics are also stored in the output JSON under `compression_metrics` and emitted via the logger at INFO level. @@ -95,6 +101,13 @@ encoding = spreadsheet_llm_encode( max_cells_per_sheet=50000, sheet_limit_action="truncate", ) + +# Encode only specific sheets (exact name, glob, and regex filters supported) +encoding = spreadsheet_llm_encode( + excel_path="path/to/workbook.xlsx", + include_sheets=["Summary"], + exclude_sheet_globs=["Archive*"], +) ``` @@ -226,7 +239,7 @@ formula errors, and repeated-formula summaries. The `sheet_processing` field records whether the encoder ran in full or bounded mode. When row, column, or cell caps are configured, each sheet records its original dimensions, encoded dimensions, encoded range, truncation status, -and skip reason when `--sheet-limit-action skip` is used. +and skip reason when a sheet is omitted or skipped. ### Compression Metrics diff --git a/Spreadsheet_LLM_Encoder.py b/Spreadsheet_LLM_Encoder.py index 4236af1..c857855 100644 --- a/Spreadsheet_LLM_Encoder.py +++ b/Spreadsheet_LLM_Encoder.py @@ -3,6 +3,7 @@ import json import logging import re +from fnmatch import fnmatch from copy import copy from temp_helpers import ( infer_cell_data_type, @@ -207,6 +208,62 @@ def _bounded_dimensions( return max(1, effective_rows), max(1, effective_cols) +def _normalize_sheet_filter_values(values, label): + if values is None: + return [] + if isinstance(values, str): + values = [values] + normalized = [] + for value in values: + text = str(value).strip() + if not text: + raise ValueError(f"{label} entries must be non-empty") + normalized.append(text) + return normalized + + +def _compile_sheet_regexes(patterns, label): + compiled = [] + for pattern in patterns: + try: + compiled.append((pattern, re.compile(pattern))) + except re.error as exc: + raise ValueError(f"Invalid {label} pattern '{pattern}': {exc}") from exc + return compiled + + +def _sheet_selection_decision( + sheet_name, + include_names, + include_globs, + include_regexes, + exclude_names, + exclude_globs, + exclude_regexes, +): + include_filters_active = bool(include_names or include_globs or include_regexes) + include_matches = ( + sheet_name in include_names + or any(fnmatch(sheet_name, pattern) for pattern in include_globs) + or any(regex.search(sheet_name) for _, regex in include_regexes) + ) + if include_filters_active and not include_matches: + return False, "sheet not matched by include filters" + + if sheet_name in exclude_names: + return False, "sheet excluded by name filter" + + for pattern in exclude_globs: + if fnmatch(sheet_name, pattern): + return False, f"sheet excluded by glob filter '{pattern}'" + + for pattern, regex in exclude_regexes: + if regex.search(sheet_name): + return False, f"sheet excluded by regex filter '{pattern}'" + + return True, None + + def _copy_bounded_sheet(source_sheet, max_row, max_col): """Copy a bounded top-left worksheet region into a normal worksheet.""" wb = openpyxl.Workbook() @@ -290,6 +347,12 @@ def spreadsheet_llm_encode( max_cols_per_sheet=None, max_cells_per_sheet=None, sheet_limit_action="truncate", + include_sheets=None, + exclude_sheets=None, + include_sheet_globs=None, + exclude_sheet_globs=None, + include_sheet_regexes=None, + exclude_sheet_regexes=None, ): """ Convert an Excel file to SpreadsheetLLM format or a vanilla markdown-like format. @@ -321,17 +384,52 @@ def spreadsheet_llm_encode( sheet_limit_action (str, optional): What to do when a sheet exceeds the configured caps: ``"truncate"`` (default), ``"skip"``, or ``"error"``. + include_sheets (Iterable[str] | str, optional): Exact sheet names to + include. When provided, only matching sheets are encoded. + exclude_sheets (Iterable[str] | str, optional): Exact sheet names to + exclude from encoding. + include_sheet_globs (Iterable[str] | str, optional): Glob patterns + for sheets to include. + exclude_sheet_globs (Iterable[str] | str, optional): Glob patterns + for sheets to exclude. + include_sheet_regexes (Iterable[str] | str, optional): Regex patterns + for sheets to include. + exclude_sheet_regexes (Iterable[str] | str, optional): Regex patterns + for sheets to exclude. Returns: dict: The SpreadsheetLLM encoding of the Excel file. """ if vanilla: - return vanilla_encode(excel_path, output_path) + return vanilla_encode( + excel_path, + output_path, + include_sheets=include_sheets, + exclude_sheets=exclude_sheets, + include_sheet_globs=include_sheet_globs, + exclude_sheet_globs=exclude_sheet_globs, + include_sheet_regexes=include_sheet_regexes, + exclude_sheet_regexes=exclude_sheet_regexes, + ) if paper_strict: compress_homogeneous = False max_rows_per_sheet = _limit_to_positive_int(max_rows_per_sheet, "max_rows_per_sheet") max_cols_per_sheet = _limit_to_positive_int(max_cols_per_sheet, "max_cols_per_sheet") max_cells_per_sheet = _limit_to_positive_int(max_cells_per_sheet, "max_cells_per_sheet") + include_sheets = _normalize_sheet_filter_values(include_sheets, "include_sheets") + exclude_sheets = _normalize_sheet_filter_values(exclude_sheets, "exclude_sheets") + include_sheet_globs = _normalize_sheet_filter_values(include_sheet_globs, "include_sheet_globs") + exclude_sheet_globs = _normalize_sheet_filter_values(exclude_sheet_globs, "exclude_sheet_globs") + include_sheet_regexes = _normalize_sheet_filter_values(include_sheet_regexes, "include_sheet_regexes") + exclude_sheet_regexes = _normalize_sheet_filter_values(exclude_sheet_regexes, "exclude_sheet_regexes") + include_sheet_regexes_compiled = _compile_sheet_regexes( + include_sheet_regexes, + "include_sheet_regexes", + ) + exclude_sheet_regexes_compiled = _compile_sheet_regexes( + exclude_sheet_regexes, + "exclude_sheet_regexes", + ) if sheet_limit_action not in {"truncate", "skip", "error"}: raise ValueError("sheet_limit_action must be 'truncate', 'skip', or 'error'") logger.info(f"Processing Excel file: {excel_path}") @@ -369,6 +467,16 @@ def spreadsheet_llm_encode( "max_cells_per_sheet": max_cells_per_sheet, "sheet_limit_action": sheet_limit_action, }, + "selection": { + "include_sheets": include_sheets, + "exclude_sheets": exclude_sheets, + "include_sheet_globs": include_sheet_globs, + "exclude_sheet_globs": exclude_sheet_globs, + "include_sheet_regexes": include_sheet_regexes, + "exclude_sheet_regexes": exclude_sheet_regexes, + "included_sheets": [], + "skipped_sheets": [], + }, "sheets": {}, } overall_orig = overall_anchor = overall_index = overall_format = overall_final = 0 @@ -376,9 +484,53 @@ def spreadsheet_llm_encode( for sheet_name in workbook.sheetnames: logger.info(f"\\nProcessing sheet: {sheet_name}") original_sheet = workbook[sheet_name] + include_sheet, selection_reason = _sheet_selection_decision( + sheet_name, + include_sheets, + include_sheet_globs, + include_sheet_regexes_compiled, + exclude_sheets, + exclude_sheet_globs, + exclude_sheet_regexes_compiled, + ) + if not include_sheet: + sheet_processing["sheets"][sheet_name] = { + "status": "skipped", + "reason": selection_reason, + "limit_action": sheet_limit_action, + "truncated": False, + "original_rows": original_sheet.max_row or 1, + "original_cols": original_sheet.max_column or 1, + "original_cells": (original_sheet.max_row or 1) * (original_sheet.max_column or 1), + "effective_rows": 0, + "effective_cols": 0, + "effective_cells": 0, + "encoded_range": None, + } + sheet_processing["selection"]["skipped_sheets"].append( + {"sheet_name": sheet_name, "reason": selection_reason} + ) + logger.info("Skipping sheet '%s': %s", sheet_name, selection_reason) + continue if original_sheet.max_row <= 1 and original_sheet.max_column <= 1: logger.info(f"Sheet '{sheet_name}' appears to be empty. Skipping.") + sheet_processing["sheets"][sheet_name] = { + "status": "skipped", + "reason": "sheet appears empty", + "limit_action": sheet_limit_action, + "truncated": False, + "original_rows": original_sheet.max_row or 1, + "original_cols": original_sheet.max_column or 1, + "original_cells": (original_sheet.max_row or 1) * (original_sheet.max_column or 1), + "effective_rows": 0, + "effective_cols": 0, + "effective_cells": 0, + "encoded_range": None, + } + sheet_processing["selection"]["skipped_sheets"].append( + {"sheet_name": sheet_name, "reason": "sheet appears empty"} + ) continue effective_rows, effective_cols, processing_meta = _sheet_processing_plan( @@ -390,6 +542,9 @@ def spreadsheet_llm_encode( ) sheet_processing["sheets"][sheet_name] = processing_meta if processing_meta["status"] == "skipped": + sheet_processing["selection"]["skipped_sheets"].append( + {"sheet_name": sheet_name, "reason": processing_meta.get("reason", "sheet skipped")} + ) logger.info( "Skipping sheet '%s' because it exceeds configured limits: %s rows x %s cols", sheet_name, @@ -567,6 +722,7 @@ def spreadsheet_llm_encode( ) sheets_encoding[sheet_name] = sheet_encoding + sheet_processing["selection"]["included_sheets"].append(sheet_name) overall_orig += original_tokens overall_anchor += anchor_tokens @@ -1757,6 +1913,42 @@ def main(): "truncate, skip, or error (default: truncate)." ), ) + parser.add_argument( + "--include-sheet", + action="append", + default=[], + help="Include only this exact sheet name. Repeat flag for multiple sheets.", + ) + parser.add_argument( + "--exclude-sheet", + action="append", + default=[], + help="Exclude this exact sheet name. Repeat flag for multiple sheets.", + ) + parser.add_argument( + "--include-sheet-glob", + action="append", + default=[], + help="Include sheets matching this glob pattern. Repeatable.", + ) + parser.add_argument( + "--exclude-sheet-glob", + action="append", + default=[], + help="Exclude sheets matching this glob pattern. Repeatable.", + ) + parser.add_argument( + "--include-sheet-regex", + action="append", + default=[], + help="Include sheets whose names match this regex. Repeatable.", + ) + parser.add_argument( + "--exclude-sheet-regex", + action="append", + default=[], + help="Exclude sheets whose names match this regex. Repeatable.", + ) args = parser.parse_args() @@ -1778,6 +1970,12 @@ def main(): max_cols_per_sheet=args.max_cols_per_sheet, max_cells_per_sheet=args.max_cells_per_sheet, sheet_limit_action=args.sheet_limit_action, + include_sheets=args.include_sheet, + exclude_sheets=args.exclude_sheet, + include_sheet_globs=args.include_sheet_glob, + exclude_sheet_globs=args.exclude_sheet_glob, + include_sheet_regexes=args.include_sheet_regex, + exclude_sheet_regexes=args.exclude_sheet_regex, ) if result is not None and not args.vanilla: @@ -1794,7 +1992,16 @@ def main(): print(f"Overall: {overall.get('overall_ratio', 0.0):.2f}x compression") -def vanilla_encode(excel_path, output_path=None): +def vanilla_encode( + excel_path, + output_path=None, + include_sheets=None, + exclude_sheets=None, + include_sheet_globs=None, + exclude_sheet_globs=None, + include_sheet_regexes=None, + exclude_sheet_regexes=None, +): """Vanilla markdown-like encoding (paper Section 3.1). Produces a ``{sheet_name: pair_string}`` dict where each sheet is the @@ -1803,16 +2010,41 @@ def vanilla_encode(excel_path, output_path=None): multi-sheet workbooks aren't silently truncated. """ logger.info(f"Producing vanilla encoding for {excel_path}") + include_sheets = _normalize_sheet_filter_values(include_sheets, "include_sheets") + exclude_sheets = _normalize_sheet_filter_values(exclude_sheets, "exclude_sheets") + include_sheet_globs = _normalize_sheet_filter_values(include_sheet_globs, "include_sheet_globs") + exclude_sheet_globs = _normalize_sheet_filter_values(exclude_sheet_globs, "exclude_sheet_globs") + include_sheet_regexes = _normalize_sheet_filter_values(include_sheet_regexes, "include_sheet_regexes") + exclude_sheet_regexes = _normalize_sheet_filter_values(exclude_sheet_regexes, "exclude_sheet_regexes") + include_sheet_regexes_compiled = _compile_sheet_regexes( + include_sheet_regexes, + "include_sheet_regexes", + ) + exclude_sheet_regexes_compiled = _compile_sheet_regexes( + exclude_sheet_regexes, + "exclude_sheet_regexes", + ) try: workbook = openpyxl.load_workbook(excel_path, data_only=True) except Exception as e: logger.error(f"Error loading Excel file for vanilla encoding: {e}") return None - vanilla_content = { - sheet_name: paper_serializers.to_paper_vanilla_prompt(workbook[sheet_name]) - for sheet_name in workbook.sheetnames - } + vanilla_content = {} + for sheet_name in workbook.sheetnames: + include_sheet, _ = _sheet_selection_decision( + sheet_name, + include_sheets, + include_sheet_globs, + include_sheet_regexes_compiled, + exclude_sheets, + exclude_sheet_globs, + exclude_sheet_regexes_compiled, + ) + if include_sheet: + vanilla_content[sheet_name] = paper_serializers.to_paper_vanilla_prompt( + workbook[sheet_name] + ) if output_path: with open(output_path, 'w', encoding='utf-8') as f: diff --git a/run_qa_evaluation.py b/run_qa_evaluation.py index cd581f8..c0b0859 100644 --- a/run_qa_evaluation.py +++ b/run_qa_evaluation.py @@ -134,6 +134,11 @@ def main( encoding, query ) if sheet_name is None: + if not encoding.get("sheets"): + logger.warning( + " - SpreadsheetLLM identified a table range but no sheets are encoded." + ) + continue sheet_name = next(iter(encoding["sheets"])) sheet_data = encoding["sheets"][sheet_name] diff --git a/test_spreadsheet_encoder.py b/test_spreadsheet_encoder.py index 5a60998..80da989 100644 --- a/test_spreadsheet_encoder.py +++ b/test_spreadsheet_encoder.py @@ -1,8 +1,10 @@ import unittest import os import json +import sys import openpyxl from openpyxl.styles import Font, PatternFill +from unittest.mock import patch from Spreadsheet_LLM_Encoder import ( spreadsheet_llm_encode, create_inverted_index, @@ -11,6 +13,7 @@ find_boundary_candidates, aggregate_regions_dfs, vanilla_encode, + main, is_header_row, filter_unreasonable_candidates, filter_overlapping_candidates, @@ -298,6 +301,66 @@ def test_spreadsheet_llm_encode_runs(self): self.assertIsNotNone(result) self.assertIn("Sheet1", result["sheets"]) + def test_sheet_filters_include_single_sheet_and_record_skips(self): + result = spreadsheet_llm_encode( + self.test_file, + include_sheets=["Sheet2"], + ) + + self.assertEqual(list(result["sheets"].keys()), ["Sheet2"]) + selection = result["sheet_processing"]["selection"] + self.assertEqual(selection["included_sheets"], ["Sheet2"]) + self.assertIn( + {"sheet_name": "Sheet1", "reason": "sheet not matched by include filters"}, + selection["skipped_sheets"], + ) + self.assertEqual( + result["sheet_processing"]["sheets"]["Sheet1"]["reason"], + "sheet not matched by include filters", + ) + + def test_sheet_filters_can_exclude_sheet_and_keep_rest(self): + result = spreadsheet_llm_encode( + self.test_file, + exclude_sheets=["Sheet2"], + ) + + self.assertIn("Sheet1", result["sheets"]) + self.assertNotIn("Sheet2", result["sheets"]) + self.assertEqual( + result["sheet_processing"]["sheets"]["Sheet2"]["reason"], + "sheet excluded by name filter", + ) + + def test_cli_sheet_filters_write_metadata(self): + out_path = "cli_sheet_filter.json" + argv = [ + "Spreadsheet_LLM_Encoder.py", + self.test_file, + "--output", + out_path, + "--include-sheet", + "Sheet1", + "--exclude-sheet", + "Sheet2", + ] + + try: + with patch.object(sys, "argv", argv): + main() + with open(out_path, encoding="utf-8") as fh: + encoded = json.load(fh) + finally: + if os.path.exists(out_path): + os.remove(out_path) + + self.assertIn("Sheet1", encoded["sheets"]) + self.assertNotIn("Sheet2", encoded["sheets"]) + self.assertEqual( + encoded["sheet_processing"]["sheets"]["Sheet2"]["reason"], + "sheet not matched by include filters", + ) + def test_extract_formula_references_normalizes_local_and_cross_sheet_refs(self): refs = extract_formula_references("=SUM(B2:C3)+'Data Sheet'!D4+Aux!E5", "Sheet1") From 5a11ff99024c0a5635e2cd316dfbfc46918f15f1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 11 May 2026 04:06:31 +0000 Subject: [PATCH 2/2] Implement sheet include/exclude filters with metadata and tests Agent-Logs-Url: https://github.com/kingkillery/Spreadsheet_LLM_Encoder/sessions/d65e2e40-c663-464e-ab77-abc9bb5da70f Co-authored-by: kingkillery <200727508+kingkillery@users.noreply.github.com> --- Spreadsheet_LLM_Encoder.py | 51 +++++++++++-------- test_spreadsheet_encoder.py | 18 +++++-- .../date_currency_percent.encoding.json | 12 +++++ tests/golden/formula_cells.encoding.json | 30 +++++++++++ tests/golden/hidden_rows_cols.encoding.json | 12 +++++ tests/golden/merged_headers.encoding.json | 12 +++++ tests/golden/multi_table_sheet.encoding.json | 12 +++++ tests/golden/simple_table.encoding.json | 12 +++++ tests/golden/sparse_sheet.encoding.json | 12 +++++ tests/golden/wide_sparse_sheet.encoding.json | 12 +++++ 10 files changed, 156 insertions(+), 27 deletions(-) diff --git a/Spreadsheet_LLM_Encoder.py b/Spreadsheet_LLM_Encoder.py index c857855..eb86d7e 100644 --- a/Spreadsheet_LLM_Encoder.py +++ b/Spreadsheet_LLM_Encoder.py @@ -208,7 +208,7 @@ def _bounded_dimensions( return max(1, effective_rows), max(1, effective_cols) -def _normalize_sheet_filter_values(values, label): +def _validate_and_normalize_filter_list(values, parameter_name): if values is None: return [] if isinstance(values, str): @@ -217,18 +217,19 @@ def _normalize_sheet_filter_values(values, label): for value in values: text = str(value).strip() if not text: - raise ValueError(f"{label} entries must be non-empty") + raise ValueError(f"{parameter_name} entries must be non-empty") normalized.append(text) return normalized -def _compile_sheet_regexes(patterns, label): +def _compile_sheet_regexes(patterns, parameter_name): + """Compile sheet-name regex filters as ``(pattern, compiled_regex)`` tuples.""" compiled = [] for pattern in patterns: try: compiled.append((pattern, re.compile(pattern))) except re.error as exc: - raise ValueError(f"Invalid {label} pattern '{pattern}': {exc}") from exc + raise ValueError(f"Invalid {parameter_name} pattern '{pattern}': {exc}") from exc return compiled @@ -400,6 +401,17 @@ def spreadsheet_llm_encode( Returns: dict: The SpreadsheetLLM encoding of the Excel file. """ + if paper_strict: + compress_homogeneous = False + max_rows_per_sheet = _limit_to_positive_int(max_rows_per_sheet, "max_rows_per_sheet") + max_cols_per_sheet = _limit_to_positive_int(max_cols_per_sheet, "max_cols_per_sheet") + max_cells_per_sheet = _limit_to_positive_int(max_cells_per_sheet, "max_cells_per_sheet") + include_sheets = _validate_and_normalize_filter_list(include_sheets, "include_sheets") + exclude_sheets = _validate_and_normalize_filter_list(exclude_sheets, "exclude_sheets") + include_sheet_globs = _validate_and_normalize_filter_list(include_sheet_globs, "include_sheet_globs") + exclude_sheet_globs = _validate_and_normalize_filter_list(exclude_sheet_globs, "exclude_sheet_globs") + include_sheet_regexes = _validate_and_normalize_filter_list(include_sheet_regexes, "include_sheet_regexes") + exclude_sheet_regexes = _validate_and_normalize_filter_list(exclude_sheet_regexes, "exclude_sheet_regexes") if vanilla: return vanilla_encode( excel_path, @@ -411,17 +423,6 @@ def spreadsheet_llm_encode( include_sheet_regexes=include_sheet_regexes, exclude_sheet_regexes=exclude_sheet_regexes, ) - if paper_strict: - compress_homogeneous = False - max_rows_per_sheet = _limit_to_positive_int(max_rows_per_sheet, "max_rows_per_sheet") - max_cols_per_sheet = _limit_to_positive_int(max_cols_per_sheet, "max_cols_per_sheet") - max_cells_per_sheet = _limit_to_positive_int(max_cells_per_sheet, "max_cells_per_sheet") - include_sheets = _normalize_sheet_filter_values(include_sheets, "include_sheets") - exclude_sheets = _normalize_sheet_filter_values(exclude_sheets, "exclude_sheets") - include_sheet_globs = _normalize_sheet_filter_values(include_sheet_globs, "include_sheet_globs") - exclude_sheet_globs = _normalize_sheet_filter_values(exclude_sheet_globs, "exclude_sheet_globs") - include_sheet_regexes = _normalize_sheet_filter_values(include_sheet_regexes, "include_sheet_regexes") - exclude_sheet_regexes = _normalize_sheet_filter_values(exclude_sheet_regexes, "exclude_sheet_regexes") include_sheet_regexes_compiled = _compile_sheet_regexes( include_sheet_regexes, "include_sheet_regexes", @@ -543,7 +544,13 @@ def spreadsheet_llm_encode( sheet_processing["sheets"][sheet_name] = processing_meta if processing_meta["status"] == "skipped": sheet_processing["selection"]["skipped_sheets"].append( - {"sheet_name": sheet_name, "reason": processing_meta.get("reason", "sheet skipped")} + { + "sheet_name": sheet_name, + "reason": processing_meta.get( + "reason", + "sheet skipped (reason not recorded)", + ), + } ) logger.info( "Skipping sheet '%s' because it exceeds configured limits: %s rows x %s cols", @@ -2010,12 +2017,12 @@ def vanilla_encode( multi-sheet workbooks aren't silently truncated. """ logger.info(f"Producing vanilla encoding for {excel_path}") - include_sheets = _normalize_sheet_filter_values(include_sheets, "include_sheets") - exclude_sheets = _normalize_sheet_filter_values(exclude_sheets, "exclude_sheets") - include_sheet_globs = _normalize_sheet_filter_values(include_sheet_globs, "include_sheet_globs") - exclude_sheet_globs = _normalize_sheet_filter_values(exclude_sheet_globs, "exclude_sheet_globs") - include_sheet_regexes = _normalize_sheet_filter_values(include_sheet_regexes, "include_sheet_regexes") - exclude_sheet_regexes = _normalize_sheet_filter_values(exclude_sheet_regexes, "exclude_sheet_regexes") + include_sheets = _validate_and_normalize_filter_list(include_sheets, "include_sheets") + exclude_sheets = _validate_and_normalize_filter_list(exclude_sheets, "exclude_sheets") + include_sheet_globs = _validate_and_normalize_filter_list(include_sheet_globs, "include_sheet_globs") + exclude_sheet_globs = _validate_and_normalize_filter_list(exclude_sheet_globs, "exclude_sheet_globs") + include_sheet_regexes = _validate_and_normalize_filter_list(include_sheet_regexes, "include_sheet_regexes") + exclude_sheet_regexes = _validate_and_normalize_filter_list(exclude_sheet_regexes, "exclude_sheet_regexes") include_sheet_regexes_compiled = _compile_sheet_regexes( include_sheet_regexes, "include_sheet_regexes", diff --git a/test_spreadsheet_encoder.py b/test_spreadsheet_encoder.py index 80da989..12eb06e 100644 --- a/test_spreadsheet_encoder.py +++ b/test_spreadsheet_encoder.py @@ -2,6 +2,7 @@ import os import json import sys +import tempfile import openpyxl from openpyxl.styles import Font, PatternFill from unittest.mock import patch @@ -332,21 +333,28 @@ def test_sheet_filters_can_exclude_sheet_and_keep_rest(self): "sheet excluded by name filter", ) - def test_cli_sheet_filters_write_metadata(self): - out_path = "cli_sheet_filter.json" + def test_cli_include_and_exclude_filters_record_skip_reason(self): argv = [ "Spreadsheet_LLM_Encoder.py", self.test_file, - "--output", - out_path, "--include-sheet", "Sheet1", "--exclude-sheet", "Sheet2", ] + with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp: + out_path = tmp.name + argv_with_output = [ + argv[0], + argv[1], + "--output", + out_path, + *argv[2:], + ] + try: - with patch.object(sys, "argv", argv): + with patch.object(sys, "argv", argv_with_output): main() with open(out_path, encoding="utf-8") as fh: encoded = json.load(fh) diff --git a/tests/golden/date_currency_percent.encoding.json b/tests/golden/date_currency_percent.encoding.json index 9848c85..8f69e55 100644 --- a/tests/golden/date_currency_percent.encoding.json +++ b/tests/golden/date_currency_percent.encoding.json @@ -8,6 +8,18 @@ "sheet_limit_action": "truncate" }, "mode": "full", + "selection": { + "exclude_sheet_globs": [], + "exclude_sheet_regexes": [], + "exclude_sheets": [], + "include_sheet_globs": [], + "include_sheet_regexes": [], + "include_sheets": [], + "included_sheets": [ + "Sheet1" + ], + "skipped_sheets": [] + }, "sheets": { "Sheet1": { "effective_cells": 15, diff --git a/tests/golden/formula_cells.encoding.json b/tests/golden/formula_cells.encoding.json index 50dbd64..a45969a 100644 --- a/tests/golden/formula_cells.encoding.json +++ b/tests/golden/formula_cells.encoding.json @@ -8,7 +8,37 @@ "sheet_limit_action": "truncate" }, "mode": "full", + "selection": { + "exclude_sheet_globs": [], + "exclude_sheet_regexes": [], + "exclude_sheets": [], + "include_sheet_globs": [], + "include_sheet_regexes": [], + "include_sheets": [], + "included_sheets": [ + "Sheet1" + ], + "skipped_sheets": [ + { + "reason": "sheet appears empty", + "sheet_name": "Aux" + } + ] + }, "sheets": { + "Aux": { + "effective_cells": 0, + "effective_cols": 0, + "effective_rows": 0, + "encoded_range": null, + "limit_action": "truncate", + "original_cells": 1, + "original_cols": 1, + "original_rows": 1, + "reason": "sheet appears empty", + "status": "skipped", + "truncated": false + }, "Sheet1": { "effective_cells": 20, "effective_cols": 5, diff --git a/tests/golden/hidden_rows_cols.encoding.json b/tests/golden/hidden_rows_cols.encoding.json index 281562a..8575778 100644 --- a/tests/golden/hidden_rows_cols.encoding.json +++ b/tests/golden/hidden_rows_cols.encoding.json @@ -8,6 +8,18 @@ "sheet_limit_action": "truncate" }, "mode": "full", + "selection": { + "exclude_sheet_globs": [], + "exclude_sheet_regexes": [], + "exclude_sheets": [], + "include_sheet_globs": [], + "include_sheet_regexes": [], + "include_sheets": [], + "included_sheets": [ + "Sheet1" + ], + "skipped_sheets": [] + }, "sheets": { "Sheet1": { "effective_cells": 9, diff --git a/tests/golden/merged_headers.encoding.json b/tests/golden/merged_headers.encoding.json index 2fb76ea..9cd78ab 100644 --- a/tests/golden/merged_headers.encoding.json +++ b/tests/golden/merged_headers.encoding.json @@ -8,6 +8,18 @@ "sheet_limit_action": "truncate" }, "mode": "full", + "selection": { + "exclude_sheet_globs": [], + "exclude_sheet_regexes": [], + "exclude_sheets": [], + "include_sheet_globs": [], + "include_sheet_regexes": [], + "include_sheets": [], + "included_sheets": [ + "Sheet1" + ], + "skipped_sheets": [] + }, "sheets": { "Sheet1": { "effective_cells": 16, diff --git a/tests/golden/multi_table_sheet.encoding.json b/tests/golden/multi_table_sheet.encoding.json index add7072..ba586a5 100644 --- a/tests/golden/multi_table_sheet.encoding.json +++ b/tests/golden/multi_table_sheet.encoding.json @@ -8,6 +8,18 @@ "sheet_limit_action": "truncate" }, "mode": "full", + "selection": { + "exclude_sheet_globs": [], + "exclude_sheet_regexes": [], + "exclude_sheets": [], + "include_sheet_globs": [], + "include_sheet_regexes": [], + "include_sheets": [], + "included_sheets": [ + "Sheet1" + ], + "skipped_sheets": [] + }, "sheets": { "Sheet1": { "effective_cells": 40, diff --git a/tests/golden/simple_table.encoding.json b/tests/golden/simple_table.encoding.json index 7996475..57bd114 100644 --- a/tests/golden/simple_table.encoding.json +++ b/tests/golden/simple_table.encoding.json @@ -8,6 +8,18 @@ "sheet_limit_action": "truncate" }, "mode": "full", + "selection": { + "exclude_sheet_globs": [], + "exclude_sheet_regexes": [], + "exclude_sheets": [], + "include_sheet_globs": [], + "include_sheet_regexes": [], + "include_sheets": [], + "included_sheets": [ + "Sheet1" + ], + "skipped_sheets": [] + }, "sheets": { "Sheet1": { "effective_cells": 9, diff --git a/tests/golden/sparse_sheet.encoding.json b/tests/golden/sparse_sheet.encoding.json index e295c08..10e0943 100644 --- a/tests/golden/sparse_sheet.encoding.json +++ b/tests/golden/sparse_sheet.encoding.json @@ -8,6 +8,18 @@ "sheet_limit_action": "truncate" }, "mode": "full", + "selection": { + "exclude_sheet_globs": [], + "exclude_sheet_regexes": [], + "exclude_sheets": [], + "include_sheet_globs": [], + "include_sheet_regexes": [], + "include_sheets": [], + "included_sheets": [ + "Sheet1" + ], + "skipped_sheets": [] + }, "sheets": { "Sheet1": { "effective_cells": 40, diff --git a/tests/golden/wide_sparse_sheet.encoding.json b/tests/golden/wide_sparse_sheet.encoding.json index 7ec5d13..59fc11f 100644 --- a/tests/golden/wide_sparse_sheet.encoding.json +++ b/tests/golden/wide_sparse_sheet.encoding.json @@ -8,6 +8,18 @@ "sheet_limit_action": "truncate" }, "mode": "full", + "selection": { + "exclude_sheet_globs": [], + "exclude_sheet_regexes": [], + "exclude_sheets": [], + "include_sheet_globs": [], + "include_sheet_regexes": [], + "include_sheets": [], + "included_sheets": [ + "Sheet1" + ], + "skipped_sheets": [] + }, "sheets": { "Sheet1": { "effective_cells": 520,