diff --git a/RELEASE_NOTES_v0.2.0.md b/RELEASE_NOTES_v0.2.0.md new file mode 100644 index 0000000..13f4b98 --- /dev/null +++ b/RELEASE_NOTES_v0.2.0.md @@ -0,0 +1,371 @@ +# Release Notes v0.2.0 + +**Release Date:** 2026-01-16 + +## Overview + +Version 0.2.0 represents a major milestone in the Excel Toolkit architecture with the complete implementation of the **Operations Layer**. This release establishes a clean separation between business logic and CLI concerns, enabling: + +- ✅ Unit testing without CLI dependencies +- ✅ Code reuse in pipelines and templates +- ✅ Import by external packages +- ✅ Type-safe error handling with Result types +- ✅ Immutable error data structures + +This is a **foundation release** that introduces 9 operation modules with 441 comprehensive unit tests, achieving >90% test coverage. + +--- + +## 🚀 Major Features + +### Operations Layer Architecture + +The centerpiece of this release is the new **Operations Layer** - a complete separation of business logic from CLI code. + +**Benefits:** +- **Testability:** All operations can be unit tested independently +- **Reusability:** Operations can be imported and used in other projects +- **Type Safety:** Explicit error handling with Result types (Ok/Err) +- **Immutability:** All error types are frozen dataclasses +- **Comprehensive Testing:** 441 tests with >90% code coverage + +### 9 New Operation Modules + +#### Phase 1: Core Operations (5 modules) + +**1. Filtering Operations** (`excel_toolkit/operations/filtering.py`) +- Security-validated filter expressions with protection against code injection +- Intelligent condition normalization ("is None" → `.isna()`, "between" → range checks) +- Column selection and row limiting +- **46 tests passing** + +**2. 
Sorting Operations** (`excel_toolkit/operations/sorting.py`) +- Single and multi-column sorting +- Ascending and descending order per column +- NaN placement control (first/last) +- Row limiting with mixed type detection +- **23 tests passing** + +**3. Pivoting Operations** (`excel_toolkit/operations/pivoting.py`) +- Multi-dimensional pivot tables +- 11 aggregation functions (sum, mean, avg→mean, count, min, max, median, std, var, first, last) +- Fill value handling (None, 0, nan, custom) +- Automatic MultiIndex flattening +- **56 tests passing** + +**4. Aggregating Operations** (`excel_toolkit/operations/aggregating.py`) +- Smart column:func syntax parsing ("Age:mean,sum,count") +- Multi-level groupby operations +- Empty group handling +- Automatic MultiIndex flattening +- **38 tests passing** + +**5. Comparing Operations** (`excel_toolkit/operations/comparing.py`) +- Key-based or position-based comparison +- NaN equality handling (NaN == NaN) +- Comprehensive difference tracking (added, deleted, modified, unchanged) +- **44 tests passing** + +#### Phase 2: Support Operations (4 modules) + +**6. Cleaning Operations** (`excel_toolkit/operations/cleaning.py`) +- Whitespace trimming (left, right, both) +- Duplicate removal with flexible keep strategies +- 6 fill strategies (forward, backward, mean, median, constant, drop) +- Column name standardization (lower, upper, title, snake case) +- Special character removal +- **57 tests passing** + +**7. Transforming Operations** (`excel_toolkit/operations/transforming.py`) +- Security-validated expression evaluation +- Type casting (int, float, str, bool, datetime, category) +- 6 built-in transformations (log, sqrt, abs, exp, standardize, normalize) +- Custom callable transformations +- String concatenation support +- **52 tests passing** + +**8. 
Joining Operations** (`excel_toolkit/operations/joining.py`) +- All join types (inner, left, right, outer, cross) +- Column validation before joining +- Left/right column specification for asymmetric joins +- Index-based joins +- Custom suffixes for overlapping columns +- Sequential DataFrame merging +- **33 tests passing** + +**9. Validation Operations** (`excel_toolkit/operations/validation.py`) +- Column existence validation +- Type checking (int, float, str, bool, datetime, numeric) +- Value range validation with boundary control +- Null value detection with thresholds +- Uniqueness validation (single/multiple columns) +- Rule-based validation framework +- **53 tests passing** + +### Functional Programming Utilities + +**Result Type Implementation** (`excel_toolkit/fp.py`) +- `Ok[T]` and `Err[E]` types for explicit error handling +- Helper functions: `ok()`, `err()`, `is_ok()`, `is_err()`, `unwrap()`, `unwrap_err()` +- Type-safe error propagation throughout the operations layer + +**Immutable Dataclass Decorator** (`excel_toolkit/fp/immutable.py`) +- `@immutable` decorator for creating frozen dataclasses +- Must be applied AFTER `@dataclass` decorator +- Used for all error type ADTs + +### Comprehensive Error Type System + +**27+ Specialized Error Types** (`excel_toolkit/models/error_types.py`) + +**Validation Errors (7 types):** +- `ColumnNotFoundError` - Column doesn't exist in DataFrame +- `TypeMismatchError` - Column type doesn't match expected +- `ValueOutOfRangeError` - Values outside specified range +- `NullValueThresholdExceededError` - Too many null values +- `UniquenessViolationError` - Duplicate values found +- `InvalidRuleError` - Invalid validation rule +- `ValidationReport` - Comprehensive validation results + +**Filtering Errors (4 types):** +- `InvalidConditionError` - Invalid filter condition +- `ColumnNotFoundError` - Column not found +- `FilteringError` - Generic filtering error +- `EmptyResultError` - No rows match filter + +**Sorting 
Errors (2 types):** +- `ColumnNotFoundError` - Column not found +- `SortingError` - Generic sorting error + +**Pivoting Errors (4 types):** +- `InvalidAggregationFunctionError` - Invalid aggregation function +- `InvalidPivotColumnError` - Invalid pivot column +- `InvalidFillValueError` - Invalid fill value +- `PivotingError` - Generic pivoting error + +**Aggregating Errors (3 types):** +- `InvalidAggregationSpecError` - Invalid aggregation specification +- `InvalidAggregationColumnError` - Invalid aggregation column +- `AggregatingError` - Generic aggregating error + +**Comparing Errors (3 types):** +- `ColumnNotFoundError` - Column not found +- `ComparingError` - Generic comparing error +- `InvalidKeyColumnsError` - Invalid key columns + +**Cleaning Errors (3 types):** +- `CleaningError` - Generic cleaning error +- `InvalidFillStrategyError` - Invalid fill strategy +- `FillFailedError` - Fill operation failed + +**Transforming Errors (6 types):** +- `InvalidExpressionError` - Invalid expression +- `ColumnNotFoundError` - Column not found +- `InvalidTypeError` - Invalid type specification +- `CastFailedError` - Type casting failed +- `InvalidTransformationError` - Invalid transformation +- `TransformingError` - Generic transforming error + +**Joining Errors (6 types):** +- `InvalidJoinTypeError` - Invalid join type +- `InvalidJoinParametersError` - Invalid join parameters +- `JoinColumnsNotFoundError` - Join columns not found +- `MergeColumnsNotFoundError` - Merge columns not found +- `InsufficientDataFramesError` - Not enough DataFrames +- `JoiningError` - Generic joining error + +All error types are immutable frozen dataclasses with clear field documentation. 
+ +--- + +## 📊 Statistics + +### Code Metrics +- **9 operation modules** implemented +- **60+ functions** across all modules +- **~5,500 lines** of production code +- **~4,800 lines** of test code +- **441 unit tests** passing +- **9 atomic commits** (one per operation module) +- **>90% test coverage** achieved + +### Test Breakdown +| Module | Tests | Status | +|--------|-------|--------| +| Error Types | 39 | ✅ Passing | +| Filtering | 46 | ✅ Passing | +| Sorting | 23 | ✅ Passing | +| Pivoting | 56 | ✅ Passing | +| Aggregating | 38 | ✅ Passing | +| Comparing | 44 | ✅ Passing | +| Cleaning | 57 | ✅ Passing | +| Transforming | 52 | ✅ Passing | +| Joining | 33 | ✅ Passing | +| Validation | 53 | ✅ Passing | +| **Total** | **441** | **✅ All Passing** | + +--- + +## 🔧 Breaking Changes + +None. This is a new architecture release that adds functionality without changing existing APIs. + +--- + +## 🔄 Migration Guide + +### For CLI Users +No changes required. The CLI commands work exactly as before. + +### For Developers +If you want to use the operations layer directly in your code: + +```python +from excel_toolkit.operations.filtering import apply_filter +from excel_toolkit.operations.sorting import sort_dataframe +from excel_toolkit.fp import is_ok, unwrap, unwrap_err + +# Apply a filter +result = apply_filter(df, condition="Age > 25") +if is_ok(result): + filtered_df = unwrap(result) +else: + error = unwrap_err(result) + print(f"Filter failed: {error}") + +# Sort a DataFrame +result = sort_dataframe(df, sort_columns=[{"column": "Name", "ascending": True}]) +if is_ok(result): + sorted_df = unwrap(result) +``` + +--- + +## 📦 Installation + +```bash +pip install excel-toolkit-cwd==0.2.0 +``` + +Or with parquet support: + +```bash +pip install "excel-toolkit-cwd[parquet]==0.2.0" +``` + +For development: + +```bash +pip install "excel-toolkit-cwd[dev]==0.2.0" +``` + +--- + +## 🐛 Bug Fixes + +This release focuses on new architecture. 
Bug fixes from previous versions are included. + +--- + +## 📝 Documentation + +### New Documentation +- **ROADMAP.md** - Comprehensive implementation roadmap tracking Phase 1 & 2 progress +- **Operations Layer** - Each operation module has detailed docstrings with: + - Function description + - Parameter documentation + - Return types + - Error types + - Implementation details + - Usage examples + +### Internal Documentation +- All functions have comprehensive docstrings +- Type hints throughout +- Error handling examples in docstrings +- Implementation notes for complex logic + +--- + +## 🎯 What's Next + +### Phase 3: Command Refactoring (Planned) +The next phase will refactor all CLI commands to use the new operations layer, reducing command files to <100 lines each by removing business logic. + +**Expected Benefits:** +- Cleaner CLI code +- Easier testing of CLI commands +- Reusable business logic +- Consistent error handling + +--- + +## 🙏 Acknowledgments + +This release represents approximately 10 hours of focused development with: +- **9 atomic commits** for clean git history +- **441 comprehensive tests** for reliability +- **Type-safe error handling** for robustness +- **Immutable data structures** for safety + +--- + +## 📋 Commits in This Release + +### Phase 2: Support Operations +- `4aa1d98` - docs: Update ROADMAP to Phase 2 100% complete +- `c310d53` - feat: Add validation operations module +- `343a7a0` - feat: Add joining operations module +- `e3b5476` - feat: Add transforming operations module +- `0048fbc` - feat: Add cleaning operations module +- `ab42635` - wip: Add Phase 2 operations modules (work in progress) +- `31d551e` - fix: Add InvalidParameterError and fix error class inheritance +- `8689602` - feat: Add Phase 2 error types + +### Phase 1: Core Operations +- `afc542c` - docs: Update ROADMAP to reflect Phase 1 completion +- `318719a` - feat: Add comparing operations module +- `86848cb` - feat: Add aggregating operations module +- `da246eb` - 
feat: Add pivoting operations module with comprehensive tests +- `1d4afb8` - docs: Add comprehensive implementation roadmap +- `6b3c2bb` - feat: Add sorting operations module with comprehensive tests +- `3fabc0f` - feat: Add filtering operations module with comprehensive tests +- `d740279` - feat: Add immutable dataclass decorator and error type ADTs + +--- + +## ⚠️ Important Notes + +### Security +- **Filtering operations** include comprehensive security validation to prevent code injection +- All expression evaluation blocks dangerous patterns (import, exec, eval, __, etc.) +- Uses restricted builtins for safe evaluation + +### Performance +- Operations are optimized for pandas DataFrames +- Large file operations may require significant memory +- Consider chunking for very large datasets (planned for future releases) + +### Compatibility +- Requires Python 3.10+ +- Tested on Python 3.10, 3.11, 3.12, 3.13, 3.14 +- Supports Excel files (.xlsx, .xls) and CSV files +- Optional parquet support with pyarrow + +--- + +## 📞 Support + +- **GitHub Issues:** https://github.com/AliiiBenn/excel-toolkit/issues +- **Documentation:** https://github.com/AliiiBenn/excel-toolkit/blob/main/README.md +- **Roadmap:** https://github.com/AliiiBenn/excel-toolkit/blob/main/docs/ROADMAP.md + +--- + +## 📄 License + +MIT License - See LICENSE file for details + +--- + +**Full Changelog:** https://github.com/AliiiBenn/excel-toolkit/compare/v0.1.0...v0.2.0 diff --git a/excel_toolkit/__init__.py b/excel_toolkit/__init__.py index 567d2f3..00921df 100644 --- a/excel_toolkit/__init__.py +++ b/excel_toolkit/__init__.py @@ -1,3 +1,3 @@ """Excel CLI Toolkit - Command-line toolkit for Excel data manipulation.""" -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/excel_toolkit/commands/aggregate.py b/excel_toolkit/commands/aggregate.py index ae82b0d..64511f6 100644 --- a/excel_toolkit/commands/aggregate.py +++ b/excel_toolkit/commands/aggregate.py @@ -4,14 +4,19 @@ """ from pathlib import Path 
-from typing import Any - import typer -import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.operations.aggregating import ( + parse_aggregation_specs, + validate_aggregation_columns, + aggregate_groups, +) +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, +) def aggregate( @@ -33,174 +38,70 @@ def aggregate( xl aggregate data.csv --group "Category" --functions "Sales:sum,Sales:min,Sales:max,Profit:mean" --output stats.xlsx xl aggregate transactions.xlsx --group "Date,Type" --functions "Amount:sum,Amount:count,Quantity:mean" --output daily.xlsx """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Validate group columns + # 1. 
Validate parameters if not group: typer.echo("Error: Must specify --group columns", err=True) raise typer.Exit(1) - # Step 3: Validate aggregation specifications if not functions: typer.echo("Error: Must specify --functions", err=True) typer.echo("Format: column:func1,func2 (e.g., 'Amount:sum,mean')", err=True) typer.echo("Supported functions: sum, mean, avg, median, min, max, count, std, var, first, last") raise typer.Exit(1) - # Step 4: Parse aggregation specifications - valid_funcs = ["sum", "mean", "avg", "median", "min", "max", "count", "std", "var", "first", "last"] - agg_specs = {} - parse_errors = [] - - for spec in functions.split(","): - spec = spec.strip() - if ":" not in spec: - parse_errors.append(f"Invalid format: '{spec}' (expected column:func1,func2)") - continue - - col_name, funcs = spec.split(":", 1) - col_name = col_name.strip() - func_list = [f.strip().lower() for f in funcs.split(",")] - - # Normalize avg to mean - func_list = ["mean" if f == "avg" else f for f in func_list] - - # Validate functions - invalid_funcs = [f for f in func_list if f not in valid_funcs] - if invalid_funcs: - parse_errors.append(f"Invalid functions in '{spec}': {', '.join(invalid_funcs)}") - continue - - # Merge with existing functions if column already specified - if col_name in agg_specs: - agg_specs[col_name].extend(func_list) - else: - agg_specs[col_name] = func_list - - if parse_errors: - typer.echo("Error parsing aggregation specifications:", err=True) - for error in parse_errors: - typer.echo(f" - {error}", err=True) - raise typer.Exit(1) + # 2. Read file + df = read_data_file(file_path, sheet) - if not agg_specs: - typer.echo("Error: No valid aggregation specifications", err=True) + # 3. 
Parse aggregation specifications + parse_result = parse_aggregation_specs(functions) + if is_err(parse_result): + error = unwrap_err(parse_result) + typer.echo(f"Error parsing aggregation specifications: {error}", err=True) raise typer.Exit(1) - # Step 5: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) + agg_specs = unwrap(parse_result) - handler = unwrap(handler_result) + # 4. Parse group columns + group_cols = [c.strip() for c in group.split(",")] - # Step 6: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) + # 5. Validate columns + validation = validate_aggregation_columns(df, group_cols, list(agg_specs.keys())) + if is_err(validation): + error = unwrap_err(validation) + typer.echo(f"Error: {error}", err=True) raise typer.Exit(1) - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) + # 6. Aggregate + result = aggregate_groups(df, group_cols, agg_specs) + if is_err(result): + error = unwrap_err(result) + typer.echo(f"Error aggregating data: {error}", err=True) raise typer.Exit(1) - df = unwrap(read_result) - original_count = len(df) + df_agg = unwrap(result) - # Step 7: Handle empty file - if df.empty: - typer.echo("File is empty (no data rows)") + # 7. 
Handle dry-run + if dry_run: + typer.echo(f"Would aggregate {len(df)} rows into {len(df_agg)} groups") + typer.echo(f"Group by: {group}") + typer.echo(f"Aggregations: {functions}") + typer.echo("") + if len(df_agg) > 0: + from excel_toolkit.commands.common import display_table + preview_rows = min(5, len(df_agg)) + typer.echo("Preview of aggregated data:") + display_table(df_agg.head(preview_rows)) raise typer.Exit(0) - # Step 8: Parse group columns - group_columns = [c.strip() for c in group.split(",")] - # Validate group columns exist - missing_cols = [c for c in group_columns if c not in df.columns] - if missing_cols: - typer.echo(f"Error: Group columns not found: {', '.join(missing_cols)}", err=True) - typer.echo(f"Available columns: {', '.join(df.columns)}") - raise typer.Exit(1) - - # Step 9: Validate aggregation columns exist - agg_columns = list(agg_specs.keys()) - missing_agg_cols = [c for c in agg_columns if c not in df.columns] - if missing_agg_cols: - typer.echo(f"Error: Aggregation columns not found: {', '.join(missing_agg_cols)}", err=True) - typer.echo(f"Available columns: {', '.join(df.columns)}") - raise typer.Exit(1) - - # Check if aggregation columns are the same as group columns - overlap_cols = set(group_columns) & set(agg_columns) - if overlap_cols: - typer.echo(f"Error: Cannot aggregate on group columns: {', '.join(overlap_cols)}", err=True) - raise typer.Exit(1) - - # Step 10: Build aggregation dictionary for pandas - agg_dict = {} - for col, func_list in agg_specs.items(): - agg_dict[col] = func_list - - # Step 11: Perform groupby and aggregation - try: - df_aggregated = df.groupby(group_columns, as_index=False, dropna=False).agg(agg_dict) - - # Flatten column names (MultiIndex from agg with multiple functions) - if isinstance(df_aggregated.columns, pd.MultiIndex): - df_aggregated.columns = ['_'.join(col).strip() for col in df_aggregated.columns.values] - - except Exception as e: - typer.echo(f"Error performing aggregation: {str(e)}", 
err=True) - raise typer.Exit(1) - - aggregated_count = len(df_aggregated) - - # Step 12: Display summary - typer.echo(f"Original rows: {original_count}") - typer.echo(f"Aggregated rows: {aggregated_count}") - typer.echo(f"Grouped by: {', '.join(group_columns)}") + # 8. Display summary + typer.echo(f"Aggregated {len(df)} rows into {len(df_agg)} groups") + typer.echo(f"Group by: {group}") typer.echo(f"Aggregations: {functions}") typer.echo("") - # Step 13: Handle dry-run mode - if dry_run: - typer.echo("Preview of aggregated data:") - preview_rows = min(5, aggregated_count) - display_table(df_aggregated.head(preview_rows)) - raise typer.Exit(0) - - # Step 14: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(df_aggregated, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display data - display_table(df_aggregated) + # 9. 
Write or display + factory = HandlerFactory() + write_or_display(df_agg, factory, output, "table") # Create CLI app for this command diff --git a/excel_toolkit/commands/append.py b/excel_toolkit/commands/append.py index e81f89c..3a63deb 100644 --- a/excel_toolkit/commands/append.py +++ b/excel_toolkit/commands/append.py @@ -4,14 +4,17 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, + display_table, +) def append( @@ -33,104 +36,31 @@ def append( xl append main.csv extra.csv --ignore-index --output combined.csv xl append main.xlsx additional.xlsx --sort --output sorted.xlsx """ - factory = HandlerFactory() - - # Step 1: Validate all files exist - main_path = Path(main_file) - if not main_path.exists(): - typer.echo(f"Main file not found: {main_file}", err=True) - raise typer.Exit(1) - - additional_paths = [Path(f) for f in additional_files] - for f in additional_paths: - if not f.exists(): - typer.echo(f"File not found: {f}", err=True) - raise typer.Exit(1) - - # Step 2: Read main file - handler_result = factory.get_handler(main_path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Read main file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(main_path, **kwargs) - elif isinstance(handler, CSVHandler): - encoding_result = handler.detect_encoding(main_path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(main_path, 
encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(main_path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) + # 1. Read main file + main_df = read_data_file(main_file, sheet) - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading main file: {error}", err=True) - raise typer.Exit(1) - - main_df = unwrap(read_result) - - # Step 3: Handle empty main file + # 2. Handle empty main file if main_df.empty: typer.echo("Main file is empty (no data rows)") raise typer.Exit(0) - # Step 4: Read and append additional files + # 3. Read and append additional files dfs = [main_df] total_main_rows = len(main_df) - for i, file_path in enumerate(additional_paths): - # Get handler for this file - file_handler_result = factory.get_handler(file_path) - if is_err(file_handler_result): - error = unwrap_err(file_handler_result) - typer.echo(f"Error with file {file_path.name}: {error}", err=True) - raise typer.Exit(1) - - file_handler = unwrap(file_handler_result) - + for i, file_path in enumerate(additional_files): # Determine sheet name for this file file_sheet = None if additional_sheets and i < len(additional_sheets): file_sheet = additional_sheets[i] - # Read file - if isinstance(file_handler, ExcelHandler): - kwargs = {"sheet_name": file_sheet} if file_sheet else {} - file_read_result = file_handler.read(file_path, **kwargs) - elif isinstance(file_handler, CSVHandler): - enc_result = file_handler.detect_encoding(file_path) - file_encoding = unwrap(enc_result) if is_ok(enc_result) else "utf-8" - - del_result = file_handler.detect_delimiter(file_path, file_encoding) - file_delimiter = unwrap(del_result) if is_ok(del_result) else "," - - file_read_result = file_handler.read(file_path, encoding=file_encoding, delimiter=file_delimiter) - else: - typer.echo(f"Unsupported file type: {file_path.name}", err=True) - raise 
typer.Exit(1) - - if is_err(file_read_result): - error = unwrap_err(file_read_result) - typer.echo(f"Error reading {file_path.name}: {error}", err=True) - raise typer.Exit(1) - - file_df = unwrap(file_read_result) + # Read file using helper + file_df = read_data_file(str(file_path), file_sheet) # Check column compatibility if not file_df.empty: if list(file_df.columns) != list(main_df.columns): - typer.echo(f"Warning: Column mismatch in {file_path.name}", err=True) + typer.echo(f"Warning: Column mismatch in {Path(file_path).name}", err=True) typer.echo(f" Expected: {', '.join(main_df.columns)}", err=True) typer.echo(f" Found: {', '.join(file_df.columns)}", err=True) typer.echo(" Attempting to align columns...", err=True) @@ -140,7 +70,7 @@ def append( dfs.append(file_df) - # Step 5: Concatenate all DataFrames + # 4. Concatenate all DataFrames if ignore_index: result_df = pd.concat(dfs, ignore_index=True) else: @@ -149,28 +79,23 @@ def append( total_rows = len(result_df) appended_rows = total_rows - total_main_rows - # Step 6: Sort if requested + # 5. Sort if requested if sort: first_col = result_df.columns[0] result_df = result_df.sort_values(by=first_col) result_df = result_df.reset_index(drop=True) - # Step 7: Display summary + # 6. Display summary typer.echo(f"Main file rows: {total_main_rows}") typer.echo(f"Appended rows: {appended_rows}") typer.echo(f"Total rows: {total_rows}") typer.echo(f"Files processed: {len(dfs)}") typer.echo("") - # Step 8: Write output or display + # 7. 
Write or display + factory = HandlerFactory() if output: - output_path = Path(output) - write_result = factory.write_file(result_df, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") + write_or_display(result_df, factory, output, "table") else: # Display result display_table(result_df.head(20)) diff --git a/excel_toolkit/commands/clean.py b/excel_toolkit/commands/clean.py index abf2369..e012743 100644 --- a/excel_toolkit/commands/clean.py +++ b/excel_toolkit/commands/clean.py @@ -4,15 +4,18 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -import numpy as np -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.operations.cleaning import trim_whitespace +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, + display_table, +) def clean( @@ -41,15 +44,7 @@ def clean( xl clean contacts.csv --keep-alphanumeric --column "phone" xl clean data.csv --uppercase --columns "category" --dry-run """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Check if at least one cleaning operation is specified + # 1. 
Validate operations operations = [] if trim: operations.append("trim") @@ -93,47 +88,16 @@ def clean( typer.echo("Error: Cannot specify both --remove-special and --keep-alphanumeric", err=True) raise typer.Exit(1) - # Step 3: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 4: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # 2. Read file + df = read_data_file(file_path, sheet) original_count = len(df) - # Step 5: Handle empty file + # 3. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 6: Determine columns to clean + # 4. 
Determine columns to clean if columns: column_list = [c.strip() for c in columns.split(",")] # Validate column names exist @@ -144,19 +108,26 @@ def clean( raise typer.Exit(1) else: # Clean all string columns - column_list = [] - for col in df.columns: - if df[col].dtype == "object": - column_list.append(col) + column_list = [col for col in df.columns if df[col].dtype == "object"] if not column_list: typer.echo("No string columns to clean") typer.echo("Use --columns to specify which columns to clean") raise typer.Exit(0) - # Step 7: Apply cleaning operations + # 5. Apply cleaning operations df_cleaned = df.copy() + # Use trim_whitespace operation if --trim specified + if trim: + result = trim_whitespace(df_cleaned, columns=column_list, side="both") + if is_err(result): + error = unwrap_err(result) + typer.echo(f"Error trimming whitespace: {error}", err=True) + raise typer.Exit(1) + df_cleaned = unwrap(result) + + # Apply other operations for col in column_list: # Only clean string columns if df_cleaned[col].dtype != "object": @@ -165,9 +136,6 @@ def clean( series = df_cleaned[col].copy() # Apply operations in order - if trim: - series = _trim_whitespace(series) - if whitespace: series = _normalize_whitespace(series) @@ -191,7 +159,7 @@ def clean( df_cleaned[col] = series - # Step 8: Display summary + # 6. Display summary typer.echo(f"Cleaned {len(column_list)} column(s)") typer.echo(f"Operations: {', '.join(operations)}") if columns: @@ -199,25 +167,16 @@ def clean( typer.echo(f"Rows: {original_count}") typer.echo("") - # Step 9: Handle dry-run mode + # 7. 
Handle dry-run mode if dry_run: typer.echo("Preview of cleaned data:") preview_rows = min(5, original_count) display_table(df_cleaned.head(preview_rows)) raise typer.Exit(0) - # Step 10: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(df_cleaned, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display data - display_table(df_cleaned) + # 8. Write or display + factory = HandlerFactory() + write_or_display(df_cleaned, factory, output, "table") def _trim_whitespace(series: pd.Series) -> pd.Series: diff --git a/excel_toolkit/commands/common.py b/excel_toolkit/commands/common.py index 3f0641a..eb34f3e 100644 --- a/excel_toolkit/commands/common.py +++ b/excel_toolkit/commands/common.py @@ -4,11 +4,16 @@ across different commands. """ +from pathlib import Path from typing import Any import pandas as pd import json +import typer from tabulate import tabulate +from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err + def display_table( df: pd.DataFrame, @@ -145,3 +150,164 @@ def format_file_info(path: str, sheet: str | None = None, total_rows: int = 0, t lines.append(f"Showing data ({total_rows} rows x {total_cols} columns)") return "\n".join(lines) + + +# ============================================================================= +# Helper Functions for Command Refactoring (Phase 3) +# ============================================================================= + + +def read_data_file( + file_path: str, + sheet: str | None = None, +) -> pd.DataFrame: + """Read a data file (Excel or CSV) with auto-detection. + + This function handles the common pattern of reading Excel or CSV files + with automatic encoding and delimiter detection for CSV files. 
+ + Args: + file_path: Path to input file + sheet: Sheet name for Excel files (optional) + + Returns: + DataFrame with file contents + + Raises: + typer.Exit: If file cannot be read (always exits with code 1) + """ + path = Path(file_path) + + # Validate file exists + if not path.exists(): + typer.echo(f"File not found: {file_path}", err=True) + raise typer.Exit(1) + + factory = HandlerFactory() + + # Get appropriate handler + handler_result = factory.get_handler(path) + if is_err(handler_result): + error = unwrap_err(handler_result) + typer.echo(f"{error}", err=True) + raise typer.Exit(1) + + handler = unwrap(handler_result) + + # Read file based on handler type + if isinstance(handler, ExcelHandler): + kwargs = {"sheet_name": sheet} if sheet else {} + read_result = handler.read(path, **kwargs) + elif isinstance(handler, CSVHandler): + # Auto-detect encoding + encoding_result = handler.detect_encoding(path) + encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" + + # Auto-detect delimiter + delimiter_result = handler.detect_delimiter(path, encoding) + delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," + + read_result = handler.read(path, encoding=encoding, delimiter=delimiter) + else: + typer.echo("Unsupported file type", err=True) + raise typer.Exit(1) + + # Check for read errors + if is_err(read_result): + error = unwrap_err(read_result) + typer.echo(f"Error reading file: {error}", err=True) + raise typer.Exit(1) + + return unwrap(read_result) + + +def write_or_display( + df: pd.DataFrame, + factory: HandlerFactory, + output: str | None, + format: str, +) -> None: + """Write DataFrame to file or display to console. + + This function handles the common pattern of either writing results to + a file or displaying them in the specified format. 
+ + Args: + df: DataFrame to write/display + factory: HandlerFactory for writing files + output: Output file path (None = display to console) + format: Display format (table, csv, json) + + Raises: + typer.Exit: If write operation fails (exits with code 1) + """ + if output: + # Write to file + output_path = Path(output) + write_result = factory.write_file(df, output_path) + if is_err(write_result): + error = unwrap_err(write_result) + typer.echo(f"Error writing file: {error}", err=True) + raise typer.Exit(1) + typer.echo(f"Written to: {output}") + else: + # Display to console + if format == "table": + display_table(df) + elif format == "csv": + display_csv(df) + elif format == "json": + display_json(df) + else: + typer.echo(f"Unknown format: {format}", err=True) + typer.echo("Supported formats: table, csv, json") + raise typer.Exit(1) + + +def handle_operation_error(error: Exception) -> None: + """Handle operation errors with user-friendly messages. + + This function converts operation errors into user-friendly error messages + and exits with appropriate error code. 
+ + Args: + error: Error from operation (Result Err variant) + + Raises: + typer.Exit: Always exits with error code 1 + """ + error_type = type(error).__name__ + error_msg = str(error) + + # Map error types to user-friendly messages + if "ColumnNotFoundError" in error_type: + typer.echo(f"Error: {error_msg}", err=True) + elif "TypeMismatchError" in error_type: + typer.echo(f"Type mismatch: {error_msg}", err=True) + elif "ValueOutOfRangeError" in error_type: + typer.echo(f"Value out of range: {error_msg}", err=True) + elif "InvalidConditionError" in error_type: + typer.echo(f"Invalid condition: {error_msg}", err=True) + elif "FilteringError" in error_type: + typer.echo(f"Filter error: {error_msg}", err=True) + elif "SortingError" in error_type: + typer.echo(f"Sort error: {error_msg}", err=True) + elif "PivotingError" in error_type: + typer.echo(f"Pivot error: {error_msg}", err=True) + elif "AggregatingError" in error_type: + typer.echo(f"Aggregation error: {error_msg}", err=True) + elif "ComparingError" in error_type: + typer.echo(f"Comparison error: {error_msg}", err=True) + elif "CleaningError" in error_type: + typer.echo(f"Cleaning error: {error_msg}", err=True) + elif "TransformingError" in error_type: + typer.echo(f"Transform error: {error_msg}", err=True) + elif "JoiningError" in error_type: + typer.echo(f"Join error: {error_msg}", err=True) + elif "ValidationError" in error_type: + typer.echo(f"Validation error: {error_msg}", err=True) + else: + # Generic error handling + typer.echo(f"Error: {error_msg}", err=True) + + raise typer.Exit(1) diff --git a/excel_toolkit/commands/compare.py b/excel_toolkit/commands/compare.py index 51b46f2..a4dfcb3 100644 --- a/excel_toolkit/commands/compare.py +++ b/excel_toolkit/commands/compare.py @@ -4,14 +4,19 @@ """ from pathlib import Path -from typing import Any import typer -import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from 
excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.operations.comparing import ( + compare_dataframes, + ComparisonResult, +) +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, +) def compare( @@ -33,261 +38,53 @@ def compare( xl compare data1.xlsx data2.xlsx --key-columns "ID,Date" --diffs-only --output changes.xlsx xl compare old.xlsx new.xlsx --sheet1 "Sheet1" --sheet2 "Sheet2" --output diff.xlsx """ - path1 = Path(file1) - path2 = Path(file2) - factory = HandlerFactory() - - # Step 1: Validate files exist - if not path1.exists(): - typer.echo(f"File not found: {file1}", err=True) - raise typer.Exit(1) - - if not path2.exists(): - typer.echo(f"File not found: {file2}", err=True) - raise typer.Exit(1) - - # Step 2: Get handlers - handler1_result = factory.get_handler(path1) - if is_err(handler1_result): - error = unwrap_err(handler1_result) - typer.echo(f"Error with file1: {error}", err=True) - raise typer.Exit(1) - - handler2_result = factory.get_handler(path2) - if is_err(handler2_result): - error = unwrap_err(handler2_result) - typer.echo(f"Error with file2: {error}", err=True) - raise typer.Exit(1) - - handler1 = unwrap(handler1_result) - handler2 = unwrap(handler2_result) - - # Step 3: Read first file - if isinstance(handler1, ExcelHandler): - kwargs = {"sheet_name": sheet1} if sheet1 else {} - read_result1 = handler1.read(path1, **kwargs) - elif isinstance(handler1, CSVHandler): - encoding_result = handler1.detect_encoding(path1) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler1.detect_delimiter(path1, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result1 = handler1.read(path1, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type for file1", err=True) - raise typer.Exit(1) - - if is_err(read_result1): - 
error = unwrap_err(read_result1) - typer.echo(f"Error reading file1: {error}", err=True) - raise typer.Exit(1) - - df1 = unwrap(read_result1) - - # Step 4: Read second file - if isinstance(handler2, ExcelHandler): - kwargs = {"sheet_name": sheet2} if sheet2 else {} - read_result2 = handler2.read(path2, **kwargs) - elif isinstance(handler2, CSVHandler): - encoding_result = handler2.detect_encoding(path2) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler2.detect_delimiter(path2, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result2 = handler2.read(path2, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type for file2", err=True) - raise typer.Exit(1) - - if is_err(read_result2): - error = unwrap_err(read_result2) - typer.echo(f"Error reading file2: {error}", err=True) - raise typer.Exit(1) + # 1. Read both files + df1 = read_data_file(file1, sheet1) + df2 = read_data_file(file2, sheet2) - df2 = unwrap(read_result2) - - # Step 5: Handle empty files + # 2. 
Handle empty files if df1.empty and df2.empty: typer.echo("Both files are empty") raise typer.Exit(0) if df1.empty: typer.echo(f"File1 is empty, File2 has {len(df2)} rows") - if output: - output_path = Path(output) - df2['_diff_status'] = 'added' - write_result = factory.write_file(df2, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - display_table(df2) + # Mark all as added + df2['_diff_status'] = 'added' + factory = HandlerFactory() + write_or_display(df2, factory, output, "table") raise typer.Exit(0) if df2.empty: typer.echo(f"File2 is empty, File1 has {len(df1)} rows") - if output: - output_path = Path(output) - df1['_diff_status'] = 'deleted' - write_result = factory.write_file(df1, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - display_table(df1) + # Mark all as deleted + df1['_diff_status'] = 'deleted' + factory = HandlerFactory() + write_or_display(df1, factory, output, "table") raise typer.Exit(0) - # Step 6: Parse key columns if specified + # 3. 
Parse key columns + key_cols = None if key_columns: key_cols = [c.strip() for c in key_columns.split(",")] - # Validate key columns exist in both dataframes - missing_df1 = [c for c in key_cols if c not in df1.columns] - missing_df2 = [c for c in key_cols if c not in df2.columns] - - if missing_df1: - typer.echo(f"Error: Key columns not found in file1: {', '.join(missing_df1)}", err=True) - typer.echo(f"Available columns in file1: {', '.join(df1.columns)}") - raise typer.Exit(1) - - if missing_df2: - typer.echo(f"Error: Key columns not found in file2: {', '.join(missing_df2)}", err=True) - typer.echo(f"Available columns in file2: {', '.join(df2.columns)}") - raise typer.Exit(1) - - # Set key columns as index for comparison - df1_indexed = df1.set_index(key_cols) - df2_indexed = df2.set_index(key_cols) - else: - # Compare by row position - df1_indexed = df1.copy() - df2_indexed = df2.copy() - # Add a temporary index column - df1_indexed['_row_num'] = range(len(df1)) - df2_indexed['_row_num'] = range(len(df2)) - key_cols = ['_row_num'] - - # Step 7: Perform comparison - try: - # Find rows only in df1 (deleted) - only_df1 = df1_indexed.index.difference(df2_indexed.index) - - # Find rows only in df2 (added) - only_df2 = df2_indexed.index.difference(df1_indexed.index) - - # Find rows in both (potentially modified) - common_index = df1_indexed.index.intersection(df2_indexed.index) - - modified_rows = [] - if len(common_index) > 0: - df1_common = df1_indexed.loc[common_index].sort_index() - df2_common = df2_indexed.loc[common_index].sort_index() - - # Compare values - for idx in common_index: - row1 = df1_common.loc[idx] - row2 = df2_common.loc[idx] - - # Check if values are different (ignoring NaN differences) - values_equal = True - for col in df1_common.columns: - val1 = row1[col] if col in row1 else None - val2 = row2[col] if col in row2 else None - - # Handle NaN comparisons - if pd.isna(val1) and pd.isna(val2): - continue - elif pd.isna(val1) or pd.isna(val2): - 
values_equal = False - break - elif val1 != val2: - values_equal = False - break - - if not values_equal: - modified_rows.append(idx) - - except Exception as e: - typer.echo(f"Error comparing data: {str(e)}", err=True) + # 4. Compare dataframes + result = compare_dataframes(df1, df2, key_cols) + if is_err(result): + error = unwrap_err(result) + typer.echo(f"Error comparing data: {error}", err=True) raise typer.Exit(1) - # Step 8: Build comparison result - result_data = [] - added_count = 0 - deleted_count = 0 - modified_count = 0 - - # Added rows - if len(only_df2) > 0: - for idx in only_df2: - row = df2_indexed.loc[idx].to_dict() - row['_diff_status'] = 'added' - result_data.append(row) - added_count += 1 - - # Deleted rows - if len(only_df1) > 0: - for idx in only_df1: - row = df1_indexed.loc[idx].to_dict() - row['_diff_status'] = 'deleted' - result_data.append(row) - deleted_count += 1 - - # Modified rows (show both versions) - if len(modified_rows) > 0: - for idx in modified_rows: - row1 = df1_indexed.loc[idx] - row2 = df2_indexed.loc[idx] - - # Show old version - row_old = row1.to_dict() - row_old['_diff_status'] = 'modified (old)' - result_data.append(row_old) - - # Show new version - row_new = row2.to_dict() - row_new['_diff_status'] = 'modified (new)' - result_data.append(row_new) + comparison: ComparisonResult = unwrap(result) - modified_count += 1 - - # Create result dataframe - if result_data: - df_result = pd.DataFrame(result_data) - - # Reset index to make key columns regular columns again - if key_cols != ['_row_num']: - df_result.reset_index(inplace=True) - # Remove the temporary _row_num column if it exists - if '_row_num' in df_result.columns: - df_result.drop('_row_num', axis=1, inplace=True) - else: - df_result.reset_index(drop=True, inplace=True) - if '_row_num' in df_result.columns: - df_result.drop('_row_num', axis=1, inplace=True) - - # Reorder columns to put _diff_status first - if '_diff_status' in df_result.columns: - cols = 
['_diff_status'] + [c for c in df_result.columns if c != '_diff_status'] - df_result = df_result[cols] - else: - # No differences found - create empty dataframe with columns from df1 - df_result = pd.DataFrame(columns=list(df1.columns) + ['_diff_status']) - - # Step 9: Display summary + # 5. Display summary typer.echo(f"File1 ({file1}): {len(df1)} rows") typer.echo(f"File2 ({file2}): {len(df2)} rows") typer.echo("") - typer.echo(f"Added rows: {added_count}") - typer.echo(f"Deleted rows: {deleted_count}") - typer.echo(f"Modified rows: {modified_count}") - total_diffs = added_count + deleted_count + modified_count + typer.echo(f"Added rows: {comparison.added_count}") + typer.echo(f"Deleted rows: {comparison.deleted_count}") + typer.echo(f"Modified rows: {comparison.modified_count}") + total_diffs = comparison.added_count + comparison.deleted_count + comparison.modified_count typer.echo(f"Total differences: {total_diffs}") typer.echo("") @@ -295,25 +92,17 @@ def compare( typer.echo("No differences found - files are identical") raise typer.Exit(0) - # Step 10: Filter if diffs only requested + # 6. Filter if diffs only requested + df_result = comparison.df_result if show_diffs_only: - df_result = df_result[df_result['_diff_status'].notna()] + df_result = df_result[df_result['_diff_status'] != 'unchanged'] if df_result.empty: typer.echo("No differences to display") raise typer.Exit(0) - # Step 11: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(df_result, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display data - display_table(df_result) + # 7. 
Write or display + factory = HandlerFactory() + write_or_display(df_result, factory, output, "table") # Create CLI app for this command diff --git a/excel_toolkit/commands/convert.py b/excel_toolkit/commands/convert.py index ce906b3..15abed1 100644 --- a/excel_toolkit/commands/convert.py +++ b/excel_toolkit/commands/convert.py @@ -4,13 +4,13 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err +from excel_toolkit.commands.common import read_data_file def convert( @@ -32,12 +32,7 @@ def convert( output_path = Path(output) factory = HandlerFactory() - # Step 1: Validate input file exists - if not input_path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Validate output format + # 1. Validate output format output_ext = output_path.suffix.lower() supported_formats = {'.xlsx', '.xlsm', '.csv', '.json'} @@ -46,52 +41,21 @@ def convert( typer.echo(f"Supported formats: {', '.join(sorted(supported_formats))}") raise typer.Exit(1) - # Step 3: Get handler for input file - handler_result = factory.get_handler(input_path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 4: Read input file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(input_path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(input_path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(input_path, encoding) - delimiter = unwrap(delimiter_result) if 
is_ok(delimiter_result) else "," - - read_result = handler.read(input_path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # 2. Read input file + df = read_data_file(file_path, sheet) - # Step 5: Handle empty file + # 3. Handle empty file if df.empty: typer.echo("Warning: Input file is empty (no data rows)", err=True) - # Step 6: Write to output format + # 4. Write to output format write_result = factory.write_file(df, output_path) if is_err(write_result): error = unwrap_err(write_result) typer.echo(f"Error writing file: {error}", err=True) raise typer.Exit(1) - # Step 7: Display summary + # 5. Display summary input_format = input_path.suffix.lower() typer.echo(f"Input format: {input_format}") typer.echo(f"Output format: {output_ext}") diff --git a/excel_toolkit/commands/count.py b/excel_toolkit/commands/count.py index cf146bd..27e7123 100644 --- a/excel_toolkit/commands/count.py +++ b/excel_toolkit/commands/count.py @@ -4,14 +4,17 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, + display_table, +) def count( @@ -33,62 +36,23 @@ def count( xl count data.xlsx --columns "Product" --sort count --output top-products.xlsx xl count data.xlsx --columns "Category" --sort name --ascending --output categories.xlsx """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", 
err=True) - raise typer.Exit(1) - - # Step 2: Validate sort option + # 1. Validate sort option valid_sort_values = ["count", "name", "none", None] if sort not in valid_sort_values: typer.echo(f"Error: Invalid sort value '{sort}'", err=True) typer.echo("Valid values: count, name, none") raise typer.Exit(1) - # Step 3: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 4: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # 2. Read file + df = read_data_file(file_path, sheet) original_count = len(df) - # Step 5: Handle empty file + # 3. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 6: Parse columns + # 4. Parse columns column_list = [c.strip() for c in columns.split(",")] # Validate columns exist missing_cols = [c for c in column_list if c not in df.columns] @@ -97,7 +61,7 @@ def count( typer.echo(f"Available columns: {', '.join(df.columns)}") raise typer.Exit(1) - # Step 7: Count occurrences for each column + # 5. 
Count occurrences for each column count_dfs = [] for col in column_list: @@ -118,7 +82,7 @@ def count( else: df_counts = pd.concat(count_dfs, ignore_index=True) - # Step 8: Sort if requested + # 6. Sort if requested if sort == "count": # Sort by count (descending by default) sort_column = 'count' @@ -136,25 +100,16 @@ def count( # Reset index after sorting df_counts = df_counts.reset_index(drop=True) - # Step 9: Display summary + # 7. Display summary typer.echo(f"Total rows: {original_count}") typer.echo(f"Columns: {', '.join(column_list)}") if sort: typer.echo(f"Sorted by: {sort} ({'ascending' if ascending else 'descending'})") typer.echo("") - # Step 10: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(df_counts, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display counts - display_table(df_counts) + # 8. 
Write or display + factory = HandlerFactory() + write_or_display(df_counts, factory, output, "table") # Create CLI app for this command diff --git a/excel_toolkit/commands/dedupe.py b/excel_toolkit/commands/dedupe.py index da6691a..b90aedc 100644 --- a/excel_toolkit/commands/dedupe.py +++ b/excel_toolkit/commands/dedupe.py @@ -4,14 +4,18 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.operations.cleaning import remove_duplicates +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, + display_table, +) def dedupe( @@ -32,103 +36,59 @@ def dedupe( xl dedupe data.csv --keep last --output latest.xlsx xl dedupe contacts.xlsx --output clean.xlsx """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Validate keep option + # 1. 
Validate keep option valid_keep_values = ["first", "last", "none"] if keep not in valid_keep_values: typer.echo(f"Error: Invalid keep value '{keep}'", err=True) typer.echo(f"Valid values: {', '.join(valid_keep_values)}") raise typer.Exit(1) - # Step 3: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 4: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) + # 2. Map "none" to False for pandas + keep_param = False if keep == "none" else keep - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # 3. Read file + df = read_data_file(file_path, sheet) original_count = len(df) - # Step 5: Handle empty file + # 4. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 6: Parse columns for deduplication + # 5. 
Parse columns for deduplication subset = None if by: - column_list = [c.strip() for c in by.split(",")] + subset = [c.strip() for c in by.split(",")] # Validate columns exist - missing_cols = [c for c in column_list if c not in df.columns] + missing_cols = [c for c in subset if c not in df.columns] if missing_cols: typer.echo(f"Error: Columns not found: {', '.join(missing_cols)}", err=True) typer.echo(f"Available columns: {', '.join(df.columns)}") raise typer.Exit(1) - subset = column_list - - # Step 7: Identify duplicates - # Count duplicates before removal - if keep == "none": - # Remove ALL occurrences of duplicates - duplicated_mask = df.duplicated(subset=subset, keep=False) - duplicate_count = duplicated_mask.sum() - else: - # Keep first or last occurrence - duplicated_mask = df.duplicated(subset=subset, keep=keep) - duplicate_count = duplicated_mask.sum() + + # 6. Count duplicates before removal + duplicated_mask = df.duplicated(subset=subset, keep=keep_param) + duplicate_count = duplicated_mask.sum() if duplicate_count == 0: typer.echo("No duplicates found") if not dry_run and not output: - # Display data if no duplicates and no output display_table(df) raise typer.Exit(0) - # Step 8: Remove duplicates - if keep == "none": - # Remove all rows that have duplicates - df_dedupe = df[~duplicated_mask].copy() - else: - # Keep first or last occurrence - df_dedupe = df[~duplicated_mask].copy() + # 7. Remove duplicates using operation + result = remove_duplicates(df, subset=subset, keep=keep_param) + if is_err(result): + error = unwrap_err(result) + typer.echo(f"Error removing duplicates: {error}", err=True) + raise typer.Exit(1) + + df_dedupe = unwrap(result) deduped_count = len(df_dedupe) removed_count = original_count - deduped_count - # Step 9: Display summary + # 8. 
Display summary typer.echo(f"Original rows: {original_count}") typer.echo(f"Duplicate rows found: {duplicate_count}") typer.echo(f"Rows removed: {removed_count}") @@ -140,7 +100,7 @@ def dedupe( typer.echo(f"Keep strategy: {keep}") typer.echo("") - # Step 10: Handle dry-run mode + # 9. Handle dry-run mode if dry_run: typer.echo("Preview of deduplicated data:") preview_rows = min(5, deduped_count) @@ -152,26 +112,16 @@ def dedupe( removed_rows = min(5, removed_count) if keep == "none": # Show all duplicate rows (both first and subsequent occurrences) - all_dupes = df[df.duplicated(subset=subset, keep=False) | df.duplicated(subset=subset, keep=False)] - # Get unique duplicate rows for preview + all_dupes = df[df.duplicated(subset=subset, keep=False)] display_table(all_dupes.head(removed_rows)) else: # Show only the rows that were removed display_table(df[duplicated_mask].head(removed_rows)) raise typer.Exit(0) - # Step 11: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(df_dedupe, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display data - display_table(df_dedupe) + # 10. 
Write or display + factory = HandlerFactory() + write_or_display(df_dedupe, factory, output, "table") # Create CLI app for this command diff --git a/excel_toolkit/commands/export.py b/excel_toolkit/commands/export.py index dcb3158..8b3ed1d 100644 --- a/excel_toolkit/commands/export.py +++ b/excel_toolkit/commands/export.py @@ -4,13 +4,13 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err +from excel_toolkit.commands.common import read_data_file def export( @@ -33,64 +33,25 @@ def export( xl export data.xlsx --format json --orient records --output data.json xl export data.csv --format parquet --output data.parquet xl export data.xlsx --format html --output data.html - xl export data.xlsx --format tsv --delimiter \"\\t\" --output data.tsv + xl export data.xlsx --format tsv --delimiter "\\t" --output data.tsv """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate format + # 1. 
Validate format valid_formats = ["csv", "json", "parquet", "tsv", "html", "markdown"] if format not in valid_formats: typer.echo(f"Error: Invalid format '{format}'", err=True) typer.echo(f"Valid formats: {', '.join(valid_formats)}") raise typer.Exit(1) - # Step 2: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 3: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 4: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - file_encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, file_encoding) - file_delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=file_encoding, delimiter=file_delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # 2. Read file + df = read_data_file(file_path, sheet) original_count = len(df) - # Step 5: Handle empty file + # 3. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 6: Export based on format + # 4. Export based on format output_path = Path(output) try: @@ -129,7 +90,7 @@ def export( with open(output_path, 'w', encoding=encoding) as f: f.write(df.to_markdown(index=index)) - # Step 7: Display summary + # 5. 
Display summary typer.echo(f"Exported {original_count} rows to {output}") typer.echo(f"Format: {format}") diff --git a/excel_toolkit/commands/fill.py b/excel_toolkit/commands/fill.py index d970514..2aa6476 100644 --- a/excel_toolkit/commands/fill.py +++ b/excel_toolkit/commands/fill.py @@ -4,14 +4,18 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.operations.cleaning import fill_missing_values +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, + display_table, +) def fill( @@ -33,15 +37,7 @@ def fill( xl fill data.xlsx --columns "Price" --strategy "median" --output filled.xlsx xl fill sales.xlsx --strategy "ffill" --output filled.xlsx """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Validate fill options + # 1. 
Validate fill options if value is None and strategy is None: typer.echo("Error: Must specify either --value or --strategy", err=True) typer.echo("Available strategies: mean, median, mode, min, max, ffill, bfill") @@ -51,54 +47,25 @@ def fill( typer.echo("Error: Cannot use both --value and --strategy", err=True) raise typer.Exit(1) - # Step 3: Validate strategy - valid_strategies = ["mean", "median", "mode", "min", "max", "ffill", "bfill"] - if strategy and strategy not in valid_strategies: - typer.echo(f"Error: Invalid strategy '{strategy}'", err=True) - typer.echo(f"Valid strategies: {', '.join(valid_strategies)}") - raise typer.Exit(1) - - # Step 4: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 5: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) + # 2. Map CLI strategies to operation strategies + strategy_mapping = { + "ffill": "forward", + "bfill": "backward", + } - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) + fill_strategy = strategy_mapping.get(strategy, strategy) if strategy else None + fill_value = value - df = unwrap(read_result) + # 3. 
Read file + df = read_data_file(file_path, sheet) original_count = len(df) - # Step 6: Handle empty file + # 4. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 7: Determine columns to fill + # 5. Determine columns to fill if columns: column_list = [c.strip() for c in columns.split(",")] # Validate columns exist @@ -116,69 +83,32 @@ def fill( typer.echo("No columns with missing values found") raise typer.Exit(0) - # Step 8: Count missing values before filling + # 6. Count missing values before filling missing_before = df[target_columns].isnull().sum().sum() - # Step 9: Apply fill strategy - df_filled = df.copy() - - for col in target_columns: - if strategy == "mean": - # Only for numeric columns - if pd.api.types.is_numeric_dtype(df_filled[col]): - df_filled[col].fillna(df_filled[col].mean(), inplace=True) - else: - typer.echo(f"Warning: Cannot apply 'mean' to non-numeric column '{col}', skipping", err=True) - elif strategy == "median": - # Only for numeric columns - if pd.api.types.is_numeric_dtype(df_filled[col]): - df_filled[col].fillna(df_filled[col].median(), inplace=True) - else: - typer.echo(f"Warning: Cannot apply 'median' to non-numeric column '{col}', skipping", err=True) - elif strategy == "mode": - # Mode can be applied to any column type - mode_values = df_filled[col].mode() - if len(mode_values) > 0: - df_filled[col].fillna(mode_values[0], inplace=True) - else: - typer.echo(f"Warning: No mode found for column '{col}', skipping", err=True) - elif strategy == "min": - # Only for numeric columns - if pd.api.types.is_numeric_dtype(df_filled[col]): - df_filled[col].fillna(df_filled[col].min(), inplace=True) - else: - typer.echo(f"Warning: Cannot apply 'min' to non-numeric column '{col}', skipping", err=True) - elif strategy == "max": - # Only for numeric columns - if pd.api.types.is_numeric_dtype(df_filled[col]): - df_filled[col].fillna(df_filled[col].max(), inplace=True) - else: - typer.echo(f"Warning: 
Cannot apply 'max' to non-numeric column '{col}', skipping", err=True) - elif strategy == "ffill": - # Forward fill (propagate last valid value) - df_filled[col].fillna(method='ffill', inplace=True) - # If still NaN at the beginning, backward fill - df_filled[col].fillna(method='bfill', inplace=True) - elif strategy == "bfill": - # Backward fill (propagate next valid value) - df_filled[col].fillna(method='bfill', inplace=True) - # If still NaN at the end, forward fill - df_filled[col].fillna(method='ffill', inplace=True) - elif value is not None: - # Fill with constant value - # Try to convert to appropriate type - try: - # Try numeric conversion - numeric_value = float(value) - if pd.api.types.is_numeric_dtype(df_filled[col]): - df_filled[col].fillna(numeric_value, inplace=True) - else: - df_filled[col].fillna(value, inplace=True) - except ValueError: - # Use as string - df_filled[col].fillna(value, inplace=True) - - # Step 10: Count missing values after filling + # 7. Apply fill strategy using operation + if fill_value: + # Convert value to appropriate type + try: + # Try numeric conversion + numeric_value = float(fill_value) + fill_value_arg = numeric_value + except ValueError: + # Use as string + fill_value_arg = fill_value + + result = fill_missing_values(df, strategy="constant", columns=target_columns, value=fill_value_arg) + else: + result = fill_missing_values(df, strategy=fill_strategy, columns=target_columns) + + if is_err(result): + error = unwrap_err(result) + typer.echo(f"Error filling missing values: {error}", err=True) + raise typer.Exit(1) + + df_filled = unwrap(result) + + # 8. Count missing values after filling missing_after = df_filled[target_columns].isnull().sum().sum() filled_count = missing_before - missing_after @@ -188,7 +118,7 @@ def fill( display_table(df) raise typer.Exit(0) - # Step 11: Display summary + # 9. 
Display summary typer.echo(f"Missing values before: {missing_before}") typer.echo(f"Missing values after: {missing_after}") typer.echo(f"Values filled: {filled_count}") @@ -202,25 +132,16 @@ def fill( typer.echo(f"Columns: all columns with missing values") typer.echo("") - # Step 12: Handle dry-run mode + # 10. Handle dry-run mode if dry_run: typer.echo("Preview of filled data:") preview_rows = min(5, original_count) display_table(df_filled.head(preview_rows)) raise typer.Exit(0) - # Step 13: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(df_filled, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display data - display_table(df_filled) + # 11. Write or display + factory = HandlerFactory() + write_or_display(df_filled, factory, output, "table") # Create CLI app for this command diff --git a/excel_toolkit/commands/filter.py b/excel_toolkit/commands/filter.py index e2f5bf2..05ac0cd 100644 --- a/excel_toolkit/commands/filter.py +++ b/excel_toolkit/commands/filter.py @@ -4,52 +4,22 @@ """ from pathlib import Path -from typing import Any -import re - import typer -import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler -from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err, ok, err -from excel_toolkit.fp._result import Result +from excel_toolkit.core import HandlerFactory +from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err +from excel_toolkit.operations.filtering import ( + validate_condition, + normalize_condition, + apply_filter, +) from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, display_table, - display_csv, - display_json, - format_file_info, ) -# Security: allowed patterns in conditions -ALLOWED_PATTERNS = [ - r"\w+\s*[=!<>]+\s*[\w'\"]+", # Comparisons: x == 5, x > 3 - 
r"\w+\s+in\s+\[[^\]]+\]", # in operator: x in [a, b, c] - r"\w+\.isna\(\)", # Null check: x.isna() - r"\w+\.notna\(\)", # Null check: x.notna() - r"\w+\s+contains\s+['\"][^'\"]+['\"]", # String contains - r"\w+\s+startswith\s+['\"][^'\"]+['\"]", # String starts with - r"\w+\s+endswith\s+['\"][^'\"]+['\"]", # String ends with - r"\s+and\s+", # Logical AND - r"\s+or\s+", # Logical OR - r"\s+not\s+", # Logical NOT - r"\([^)]+\)", # Parentheses for grouping -] - -DANGEROUS_PATTERNS = [ - "import", - "exec", - "eval", - "__", - "open(", - "file(", - "os.", - "sys.", - "subprocess", - "pickle", -] - - def filter( file_path: str = typer.Argument(..., help="Path to input file"), condition: str = typer.Argument(..., help="Filter condition (e.g., 'age > 30')"), @@ -74,111 +44,41 @@ def filter( xl filter data.xlsx "city == 'Paris'" --columns name,age xl filter data.csv "status == 'active'" --dry-run """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Validate condition for security - validation_result = _validate_condition(condition) - if is_err(validation_result): - error = unwrap_err(validation_result) - typer.echo(f"Invalid condition: {error}", err=True) - raise typer.Exit(1) - - # Step 3: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 4: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = 
handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # 1. Read file + df = read_data_file(file_path, sheet) original_count = len(df) - # Step 5: Handle empty file + # 2. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 6: Normalize condition - normalized_condition = _normalize_condition(condition) - - # Step 7: Apply filter - try: - df_filtered = df.query(normalized_condition) - except pd.errors.UndefinedVariableError as e: - # Extract column name from error - error_str = str(e) - col_match = re.search(r"'([^']+)'", error_str) - if col_match: - col = col_match.group(1) - typer.echo(f"Error: Column '{col}' not found", err=True) - typer.echo(f"Available columns: {', '.join(df.columns)}") - else: - typer.echo(f"Error: {error_str}", err=True) - raise typer.Exit(1) - except Exception as e: - error_msg = str(e) - if "could not convert" in error_msg: - typer.echo("Error: Type mismatch in condition", err=True) - typer.echo("Ensure numeric columns are compared with numbers", err=True) - typer.echo("Ensure string columns are compared with strings in quotes", err=True) - else: - typer.echo(f"Error filtering data: {error_msg}", err=True) - typer.echo(f"\nCondition: {condition}", err=True) + # 3. Validate condition + validation = validate_condition(condition) + if is_err(validation): + error = unwrap_err(validation) + typer.echo(f"Invalid condition: {error}", err=True) raise typer.Exit(1) - filtered_count = len(df_filtered) + # 4. 
Normalize condition + normalized = unwrap(normalize_condition(condition)) - # Step 8: Select columns if specified + # 5. Parse columns + col_list = None if columns: - try: - col_list = [c.strip() for c in columns.split(",")] - # Validate column names - missing_cols = [c for c in col_list if c not in df_filtered.columns] - if missing_cols: - typer.echo(f"Error: Columns not found: {', '.join(missing_cols)}", err=True) - typer.echo(f"Available columns: {', '.join(df_filtered.columns)}") - raise typer.Exit(1) - df_filtered = df_filtered[col_list] - except Exception as e: - typer.echo(f"Error selecting columns: {str(e)}", err=True) - raise typer.Exit(1) + col_list = [c.strip() for c in columns.split(",")] + + # 6. Apply filter + result = apply_filter(df, normalized, columns=col_list, limit=rows) + if is_err(result): + error = unwrap_err(result) + typer.echo(f"Error filtering data: {error}", err=True) + raise typer.Exit(1) - # Step 9: Limit rows if specified - if rows is not None: - df_filtered = df_filtered.head(rows) + df_filtered = unwrap(result) + filtered_count = len(df_filtered) - # Step 10: Handle dry-run mode + # 7. Handle dry-run if dry_run: percentage = (filtered_count / original_count * 100) if original_count > 0 else 0 typer.echo(f"Would filter {filtered_count} of {original_count} rows ({percentage:.1f}%)") @@ -192,22 +92,16 @@ def filter( typer.echo("No rows match the condition") raise typer.Exit(0) - # Step 11: Handle empty result + # 8. 
Handle empty result if filtered_count == 0: typer.echo("No rows match the filter condition") typer.echo(f"Condition: {condition}") if output: - # Still write empty file - output_path = Path(output) - write_result = factory.write_file(df_filtered, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") + factory = HandlerFactory() + write_or_display(df_filtered, factory, output, format) raise typer.Exit(0) - # Step 12: Display summary + # 9. Display summary percentage = (filtered_count / original_count * 100) if original_count > 0 else 0 typer.echo(f"Filtered {filtered_count} of {original_count} rows ({percentage:.1f}%)") typer.echo(f"Condition: {condition}") @@ -217,94 +111,9 @@ def filter( typer.echo("") - # Step 13: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(df_filtered, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display data - if format == "table": - display_table(df_filtered) - elif format == "csv": - display_csv(df_filtered) - elif format == "json": - display_json(df_filtered) - else: - typer.echo(f"Unknown format: {format}", err=True) - typer.echo("Supported formats: table, csv, json") - raise typer.Exit(1) - - -def _validate_condition(condition: str) -> Result[str, str]: - """Validate filter condition for security and syntax. 
- - Args: - condition: User-provided condition string - - Returns: - Result[str, str] - Valid condition or error message - """ - # Check for dangerous patterns - condition_lower = condition.lower() - for pattern in DANGEROUS_PATTERNS: - if pattern in condition_lower: - return err(f"Unsafe pattern detected: {pattern}") - - # Check length - if len(condition) > 1000: - return err("Condition too long (max 1000 characters)") - - # Basic syntax validation - # Check for balanced parentheses - if condition.count("(") != condition.count(")"): - return err("Unbalanced parentheses") - - # Check for balanced brackets - if condition.count("[") != condition.count("]"): - return err("Unbalanced brackets") - - # Check for balanced quotes - single_quotes = condition.count("'") - if single_quotes % 2 != 0: - return err("Unbalanced single quotes") - - double_quotes = condition.count('"') - if double_quotes % 2 != 0: - return err("Unbalanced double quotes") - - return ok(condition) - - -def _normalize_condition(condition: str) -> str: - """Normalize condition syntax for pandas.query(). - - Handles special syntax and converts to pandas-compatible form. - - Args: - condition: User-provided condition - - Returns: - Normalized condition string - """ - # Convert 'value is None' to 'value.isna()' - condition = re.sub(r"(\w+)\s+is\s+None\b", r"\1.isna()", condition) - condition = re.sub(r"(\w+)\s+is\s+not\s+None\b", r"\1.notna()", condition) - - # Convert 'value between X and Y' to 'value >= X and value <= Y' - # Case insensitive - pattern = r"(\w+)\s+between\s+([^ ]+)\s+and\s+([^ ]+)" - replacement = r"\1 >= \2 and \1 <= \3" - condition = re.sub(pattern, replacement, condition, flags=re.IGNORECASE) - - # Handle 'not in' - condition = re.sub(r"(\w+)\s+not\s+in\s+", r"\1 not in ", condition, flags=re.IGNORECASE) - - return condition + # 10. 
Write or display + factory = HandlerFactory() + write_or_display(df_filtered, factory, output, format) # Create CLI app for this command diff --git a/excel_toolkit/commands/group.py b/excel_toolkit/commands/group.py index 62c58f7..ee6b03a 100644 --- a/excel_toolkit/commands/group.py +++ b/excel_toolkit/commands/group.py @@ -4,14 +4,22 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.operations.aggregating import ( + parse_aggregation_specs, + validate_aggregation_columns, + aggregate_groups, +) +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, + display_table, +) def group( @@ -32,191 +40,75 @@ def group( xl group data.csv --by "Category,Subcategory" --aggregate "Sales:sum,Profit:mean" --output summary.xlsx xl group transactions.xlsx --by "Date" --aggregate "Amount:sum,Count:count" --output daily.xlsx """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Validate group columns + # 1. Validate group columns if not by: typer.echo("Error: Must specify --by columns for grouping", err=True) raise typer.Exit(1) - # Step 3: Validate aggregation specifications + # 2. 
Validate aggregation specifications if not aggregate: typer.echo("Error: Must specify --aggregate specifications", err=True) typer.echo("Format: column:function (e.g., 'Amount:sum,Quantity:avg')") typer.echo("Supported functions: sum, mean, avg, median, min, max, count, std, var") raise typer.Exit(1) - # Step 4: Parse aggregation specifications - valid_funcs = ["sum", "mean", "avg", "median", "min", "max", "count", "std", "var"] - agg_specs = {} - parse_errors = [] - - for spec in aggregate.split(","): - spec = spec.strip() - if ":" not in spec: - parse_errors.append(f"Invalid format: '{spec}' (expected column:function)") - continue - - col_name, func = spec.split(":", 1) - col_name = col_name.strip() - func = func.strip().lower() - - if func == "avg": - func = "mean" # Normalize avg to mean - - if func not in valid_funcs: - parse_errors.append(f"Invalid function '{func}' in '{spec}'") - continue - - if col_name in agg_specs: - parse_errors.append(f"Duplicate column '{col_name}'") - continue - - agg_specs[col_name] = func - - if parse_errors: - typer.echo("Error parsing aggregation specifications:", err=True) - for error in parse_errors: - typer.echo(f" - {error}", err=True) - raise typer.Exit(1) - - if not agg_specs: - typer.echo("Error: No valid aggregation specifications", err=True) - raise typer.Exit(1) - - # Step 5: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 6: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = 
handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # 3. Read file + df = read_data_file(file_path, sheet) original_count = len(df) original_cols = len(df.columns) - # Step 7: Handle empty file + # 4. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 8: Parse group columns - group_columns = [c.strip() for c in by.split(",")] - # Validate group columns exist - missing_cols = [c for c in group_columns if c not in df.columns] - if missing_cols: - typer.echo(f"Error: Group columns not found: {', '.join(missing_cols)}", err=True) - typer.echo(f"Available columns: {', '.join(df.columns)}") + # 5. Parse aggregation specifications + parse_result = parse_aggregation_specs(aggregate) + if is_err(parse_result): + error = unwrap_err(parse_result) + typer.echo(f"Error parsing aggregation specifications: {error}", err=True) raise typer.Exit(1) - # Step 9: Validate aggregation columns exist - agg_columns = list(agg_specs.keys()) - missing_agg_cols = [c for c in agg_columns if c not in df.columns] - if missing_agg_cols: - typer.echo(f"Error: Aggregation columns not found: {', '.join(missing_agg_cols)}", err=True) - typer.echo(f"Available columns: {', '.join(df.columns)}") - raise typer.Exit(1) + agg_specs = unwrap(parse_result) + + # 6. Parse group columns + group_cols = [c.strip() for c in by.split(",")] - # Check if aggregation columns are the same as group columns - overlap_cols = set(group_columns) & set(agg_columns) - if overlap_cols: - typer.echo(f"Error: Cannot aggregate on group columns: {', '.join(overlap_cols)}", err=True) + # 7. 
Validate columns + validation = validate_aggregation_columns(df, group_cols, list(agg_specs.keys())) + if is_err(validation): + error = unwrap_err(validation) + typer.echo(f"Error: {error}", err=True) raise typer.Exit(1) - # Step 10: Build aggregation dictionary for pandas - agg_dict = {} - for col, func in agg_specs.items(): - if func == "count": - # Count is special - count non-null values - agg_dict[col] = func - else: - agg_dict[col] = func - - # Step 11: Perform groupby and aggregation - try: - df_grouped = df.groupby(group_columns, as_index=False, dropna=False).agg(agg_dict) - - # Flatten column names (MultiIndex from agg) - if isinstance(df_grouped.columns, pd.MultiIndex): - df_grouped.columns = ['_'.join(col).strip() for col in df_grouped.columns.values] - - # Rename columns to match aggregation spec format - new_column_names = {} - for col in group_columns: - new_column_names[col] = col - - for col, func in agg_specs.items(): - # Find the actual column name (might be col_func or just col) - matching_cols = [c for c in df_grouped.columns if c.startswith(col)] - if matching_cols: - new_column_names[matching_cols[0]] = f"{col}_{func}" - - df_grouped.rename(columns=new_column_names, inplace=True) - - except Exception as e: - typer.echo(f"Error performing aggregation: {str(e)}", err=True) + # 8. Aggregate + result = aggregate_groups(df, group_cols, agg_specs) + if is_err(result): + error = unwrap_err(result) + typer.echo(f"Error aggregating data: {error}", err=True) raise typer.Exit(1) + df_grouped = unwrap(result) grouped_count = len(df_grouped) grouped_cols = len(df_grouped.columns) - # Step 12: Display summary + # 9. Display summary typer.echo(f"Original rows: {original_count}") typer.echo(f"Grouped rows: {grouped_count}") - typer.echo(f"Grouped by: {', '.join(group_columns)}") + typer.echo(f"Grouped by: {', '.join(group_cols)}") typer.echo(f"Aggregations: {aggregate}") typer.echo("") - # Step 13: Handle dry-run mode + # 10. 
Handle dry-run mode if dry_run: typer.echo("Preview of grouped data:") preview_rows = min(5, grouped_count) display_table(df_grouped.head(preview_rows)) raise typer.Exit(0) - # Step 14: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(df_grouped, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display data - display_table(df_grouped) + # 11. Write or display + factory = HandlerFactory() + write_or_display(df_grouped, factory, output, "table") # Create CLI app for this command diff --git a/excel_toolkit/commands/head.py b/excel_toolkit/commands/head.py index e1c530b..8bc7a76 100644 --- a/excel_toolkit/commands/head.py +++ b/excel_toolkit/commands/head.py @@ -4,14 +4,14 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err from excel_toolkit.commands.common import ( + read_data_file, display_table, display_csv, display_json, @@ -44,104 +44,39 @@ def head( Raises: typer.Exit: If file cannot be read """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - typer.echo("\nSupported formats: .xlsx, .xls, .csv") - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 3: Read file - if isinstance(handler, ExcelHandler): - # Determine which sheet to read - sheet_name = sheet - if sheet_name is None: - # Get first sheet name - names_result = 
handler.get_sheet_names(path) - if is_ok(names_result): - sheets = unwrap(names_result) - sheet_name = sheets[0] if sheets else None - - # Read Excel file - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading Excel file: {error}", err=True) - raise typer.Exit(1) - - elif isinstance(handler, CSVHandler): - # Detect encoding - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - # Detect delimiter - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - # Read CSV file - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading CSV file: {error}", err=True) - raise typer.Exit(1) - else: - typer.echo(f"Unsupported handler type", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) - - # Step 4: Handle empty DataFrame + # 1. Read file + df = read_data_file(file_path, sheet) + + # 2. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 5: Get first N rows + # 3. Limit columns if requested + if max_columns and len(df.columns) > max_columns: + df = df.iloc[:, :max_columns] + + # 4. Get first N rows df_head = df.head(rows) - # Step 6: Display file info - sheet_name_display = sheet_name if isinstance(handler, ExcelHandler) else None - file_info = format_file_info( - str(path), sheet=sheet_name_display, total_rows=len(df), total_cols=len(df.columns) - ) - typer.echo(file_info) + # 5. Display file info + path = Path(file_path) + format_file_info(path, len(df), len(df.columns)) - # Step 7: Show column info if requested + # 6. 
Show column information if requested if show_columns: display_column_types(df) - typer.echo("") # Empty line before data - - # Step 8: Display data in requested format - try: - if format == "table": - display_table(df_head, max_columns=max_columns) - elif format == "csv": - display_csv(df_head) - elif format == "json": - display_json(df_head) - else: - typer.echo(f"Unknown format: {format}", err=True) - typer.echo("Supported formats: table, csv, json") - raise typer.Exit(1) - except Exception as e: - typer.echo(f"Error displaying data: {str(e)}", err=True) - raise typer.Exit(1) - - -# Create CLI app for this command (can be used standalone or imported) + + # 7. Display data based on format + if format == "table": + display_table(df_head) + elif format == "csv": + display_csv(df_head) + elif format == "json": + display_json(df_head) + + +# Create CLI app for this command app = typer.Typer(help="Display the first N rows of a data file") # Register the command diff --git a/excel_toolkit/commands/join.py b/excel_toolkit/commands/join.py index 5b9a419..367cebf 100644 --- a/excel_toolkit/commands/join.py +++ b/excel_toolkit/commands/join.py @@ -4,14 +4,16 @@ """ from pathlib import Path -from typing import Any import typer -import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.operations.joining import join_dataframes +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, +) def join( @@ -35,103 +37,11 @@ def join( xl join left.xlsx right.xlsx --left-on "id" --right-on "user_id" --output joined.xlsx xl join data1.xlsx data2.xlsx --on "key" --how left --output left_join.xlsx """ - left_path = Path(left_file) - right_path = Path(right_file) - factory = HandlerFactory() - - # Step 1: Validate files exist - if not 
left_path.exists(): - typer.echo(f"File not found: {left_file}", err=True) - raise typer.Exit(1) - - if not right_path.exists(): - typer.echo(f"File not found: {right_file}", err=True) - raise typer.Exit(1) - - # Step 2: Validate join type - valid_join_types = ["inner", "left", "right", "outer"] - if how not in valid_join_types: - typer.echo(f"Error: Invalid join type '{how}'", err=True) - typer.echo(f"Valid types: {', '.join(valid_join_types)}") - raise typer.Exit(1) - - # Step 3: Validate join columns - if on: - if left_on or right_on: - typer.echo("Error: Cannot use --on with --left-on/--right-on", err=True) - raise typer.Exit(1) - - if (left_on and not right_on) or (right_on and not left_on): - typer.echo("Error: Must specify both --left-on and --right-on", err=True) - raise typer.Exit(1) - - if not on and not (left_on and right_on): - typer.echo("Error: Must specify either --on or both --left-on and --right-on", err=True) - raise typer.Exit(1) - - # Step 4: Read left file - left_handler_result = factory.get_handler(left_path) - if is_err(left_handler_result): - error = unwrap_err(left_handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - left_handler = unwrap(left_handler_result) - - if isinstance(left_handler, ExcelHandler): - kwargs = {"sheet_name": left_sheet} if left_sheet else {} - left_read_result = left_handler.read(left_path, **kwargs) - elif isinstance(left_handler, CSVHandler): - encoding_result = left_handler.detect_encoding(left_path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = left_handler.detect_delimiter(left_path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - left_read_result = left_handler.read(left_path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(left_read_result): - error = unwrap_err(left_read_result) - typer.echo(f"Error reading 
left file: {error}", err=True) - raise typer.Exit(1) + # 1. Read both files + df_left = read_data_file(left_file, left_sheet) + df_right = read_data_file(right_file, right_sheet) - df_left = unwrap(left_read_result) - - # Step 5: Read right file - right_handler_result = factory.get_handler(right_path) - if is_err(right_handler_result): - error = unwrap_err(right_handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - right_handler = unwrap(right_handler_result) - - if isinstance(right_handler, ExcelHandler): - kwargs = {"sheet_name": right_sheet} if right_sheet else {} - right_read_result = right_handler.read(right_path, **kwargs) - elif isinstance(right_handler, CSVHandler): - encoding_result = right_handler.detect_encoding(right_path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = right_handler.detect_delimiter(right_path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - right_read_result = right_handler.read(right_path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(right_read_result): - error = unwrap_err(right_read_result) - typer.echo(f"Error reading right file: {error}", err=True) - raise typer.Exit(1) - - df_right = unwrap(right_read_result) - - # Step 6: Handle empty files + # 2. Handle empty files if df_left.empty: typer.echo("Left file is empty (no data rows)") raise typer.Exit(0) @@ -140,58 +50,47 @@ def join( typer.echo("Right file is empty (no data rows)") raise typer.Exit(0) - # Step 7: Validate join columns exist + # 3. 
Parse join columns + on_cols = None + left_on_cols = None + right_on_cols = None + if on: - left_on_cols = [on] - right_on_cols = [on] + on_cols = [on] else: - left_on_cols = [c.strip() for c in left_on.split(",")] - right_on_cols = [c.strip() for c in right_on.split(",")] - - if len(left_on_cols) != len(right_on_cols): - typer.echo("Error: --left-on and --right-on must have the same number of columns", err=True) + if left_on and right_on: + left_on_cols = [c.strip() for c in left_on.split(",")] + right_on_cols = [c.strip() for c in right_on.split(",")] + elif left_on or right_on: + typer.echo("Error: Must specify both --left-on and --right-on", err=True) + raise typer.Exit(1) + else: + typer.echo("Error: Must specify either --on or both --left-on and --right-on", err=True) raise typer.Exit(1) - missing_left = [c for c in left_on_cols if c not in df_left.columns] - if missing_left: - typer.echo(f"Error: Columns not found in left file: {', '.join(missing_left)}", err=True) - typer.echo(f"Available columns: {', '.join(df_left.columns)}") + # 4. 
Join dataframes using operation + result = join_dataframes( + df_left, + df_right, + how=how, + on=on_cols, + left_on=left_on_cols, + right_on=right_on_cols, + suffixes=("_left", "_right") + ) + + if is_err(result): + error = unwrap_err(result) + typer.echo(f"Error joining data: {error}", err=True) raise typer.Exit(1) - missing_right = [c for c in right_on_cols if c not in df_right.columns] - if missing_right: - typer.echo(f"Error: Columns not found in right file: {', '.join(missing_right)}", err=True) - typer.echo(f"Available columns: {', '.join(df_right.columns)}") - raise typer.Exit(1) - - # Step 8: Perform join - try: - if on: - df_joined = pd.merge( - df_left, - df_right, - on=on, - how=how, - suffixes=("_left", "_right") - ) - else: - df_joined = pd.merge( - df_left, - df_right, - left_on=left_on_cols, - right_on=right_on_cols, - how=how, - suffixes=("_left", "_right") - ) - except Exception as e: - typer.echo(f"Error performing join: {str(e)}", err=True) - raise typer.Exit(1) + df_joined = unwrap(result) + # 5. Display summary joined_rows = len(df_joined) left_rows = len(df_left) right_rows = len(df_right) - # Step 9: Display summary typer.echo(f"Join type: {how}") if on: typer.echo(f"On column: {on}") @@ -203,18 +102,9 @@ def join( typer.echo(f"Joined rows: {joined_rows}") typer.echo("") - # Step 10: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(df_joined, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display data - display_table(df_joined) + # 6. 
Write or display + factory = HandlerFactory() + write_or_display(df_joined, factory, output, "table") # Create CLI app for this command diff --git a/excel_toolkit/commands/merge.py b/excel_toolkit/commands/merge.py index 42d6856..61d5360 100644 --- a/excel_toolkit/commands/merge.py +++ b/excel_toolkit/commands/merge.py @@ -10,9 +10,12 @@ import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, +) def merge( @@ -34,7 +37,7 @@ def merge( output_path = Path(output) factory = HandlerFactory() - # Step 1: Expand file paths (handle wildcards) + # 1. Expand file paths (handle wildcards) expanded_paths = [] for file_pattern in files.split(","): file_pattern = file_pattern.strip() @@ -56,50 +59,19 @@ def merge( typer.echo("Error: No files to merge", err=True) raise typer.Exit(1) - # Step 2: Read all files + # 2. 
Read all files dfs = [] columns_per_file = [] rows_per_file = [] for file_path in expanded_paths: - # Get handler - handler_result = factory.get_handler(file_path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"Error with {file_path}: {error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(file_path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(file_path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(file_path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(file_path, encoding=encoding, delimiter=delimiter) - else: - typer.echo(f"Unsupported file type: {file_path}", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading {file_path}: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # Read file using helper + df = read_data_file(str(file_path), sheet) dfs.append(df) columns_per_file.append(set(df.columns)) rows_per_file.append(len(df)) - # Step 3: Check if all files have the same columns + # 3. Check if all files have the same columns if len(columns_per_file) > 1: first_columns = columns_per_file[0] for i, cols in enumerate(columns_per_file[1:], 1): @@ -109,14 +81,14 @@ def merge( typer.echo(f"Found columns: {sorted(cols)}") raise typer.Exit(1) - # Step 4: Merge DataFrames + # 4. Merge DataFrames try: df_merged = pd.concat(dfs, ignore_index=ignore_index) except Exception as e: typer.echo(f"Error merging files: {e}", err=True) raise typer.Exit(1) - # Step 5: Display summary + # 5. 
Display summary typer.echo(f"Files merged: {len(expanded_paths)}") for i, (file_path, rows) in enumerate(zip(expanded_paths, rows_per_file), 1): typer.echo(f" {i}. {file_path.name}: {rows} rows") @@ -124,7 +96,7 @@ def merge( typer.echo(f"Total columns: {len(df_merged.columns)}") typer.echo("") - # Step 6: Write output + # 6. Write output write_result = factory.write_file(df_merged, output_path) if is_err(write_result): error = unwrap_err(write_result) diff --git a/excel_toolkit/commands/pivot.py b/excel_toolkit/commands/pivot.py index d0a205f..ab50a2e 100644 --- a/excel_toolkit/commands/pivot.py +++ b/excel_toolkit/commands/pivot.py @@ -1,219 +1,114 @@ """Pivot command implementation. -Create pivot table-like summaries from data. +Creates pivot tables from data files. """ from pathlib import Path -from typing import Any - import typer -import pandas as pd - -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.operations.pivoting import ( + validate_aggregation_function, + validate_pivot_columns, + parse_fill_value, + create_pivot_table, +) +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, +) def pivot( file_path: str = typer.Argument(..., help="Path to input file"), - rows: str | None = typer.Option(None, "--rows", "-r", help="Column(s) to use as rows (comma-separated)"), - columns: str | None = typer.Option(None, "--columns", "-c", help="Column(s) to use as columns (comma-separated)"), - values: str | None = typer.Option(None, "--values", "-v", help="Column(s) to use as values (comma-separated)"), - aggfunc: str = typer.Option("sum", "--aggfunc", "-a", help="Aggregation function (sum, mean, count, min, max, median)"), - fill_value: str | None = typer.Option(None, "--fill", "-f", help="Value to fill NaN with"), + rows: str = 
typer.Option(..., "--rows", "-r", help="Column(s) for pivot table rows"), + columns: str | None = typer.Option(None, "--columns", "-c", help="Column(s) for pivot table columns"), + values: str = typer.Option(..., "--values", "-v", help="Column(s) for pivot table values"), + aggfunc: str = typer.Option("sum", "--aggfunc", "-a", help="Aggregation function (sum, mean, count, etc.)"), + fill_value: str | None = typer.Option(None, "--fill", "-f", help="Fill value for missing cells"), output: str | None = typer.Option(None, "--output", "-o", help="Output file path"), - dry_run: bool = typer.Option(False, "--dry-run", help="Show preview without writing"), + format: str = typer.Option("table", "--format", help="Output format (table, csv, json)"), sheet: str | None = typer.Option(None, "--sheet", "-s", help="Sheet name for Excel files"), ) -> None: - """Create pivot table summaries from data. + """Create a pivot table from data. - Create a pivot table by specifying row, column, and value dimensions. - Supported aggregation functions: sum, mean, avg, count, min, max, median. + Creates multi-dimensional pivot tables with customizable aggregations. 
Examples: - xl pivot data.xlsx --rows "Date" --columns "Product" --values "Sales:sum" --output pivot.xlsx - xl pivot sales.csv --rows "Region,Category" --columns "Month" --values "Revenue" --aggfunc mean --output monthly.xlsx - xl pivot data.xlsx --rows "Department" --columns "Year" --values "Employees" --aggfunc count --output count.xlsx + xl pivot data.xlsx --rows Category --columns Year --values Sales + xl pivot data.csv --rows Region --columns Product --values Quantity --aggfunc sum + xl pivot data.xlsx --rows Date --values Price --aggfunc mean + xl pivot data.csv --rows City --columns Month --values Revenue --fill 0 """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Validate required parameters - if not rows: - typer.echo("Error: Must specify --rows columns", err=True) - raise typer.Exit(1) - - if not columns: - typer.echo("Error: Must specify --columns columns", err=True) - raise typer.Exit(1) - - if not values: - typer.echo("Error: Must specify --values columns", err=True) - raise typer.Exit(1) - - # Step 3: Validate aggregation function - valid_funcs = ["sum", "mean", "avg", "count", "min", "max", "median"] - if aggfunc.lower() not in valid_funcs: - typer.echo(f"Error: Invalid aggregation function '{aggfunc}'", err=True) - typer.echo(f"Valid functions: {', '.join(valid_funcs)}", err=True) - raise typer.Exit(1) - - # Normalize avg to mean - agg_func_normalized = "mean" if aggfunc.lower() == "avg" else aggfunc.lower() - - # Step 4: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 5: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = 
handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) + # 1. Read file + df = read_data_file(file_path, sheet) + + # 2. Parse parameters + row_cols = [c.strip() for c in rows.split(",")] + col_cols = [c.strip() for c in columns.split(",")] if columns else None + value_cols = [c.strip() for c in values.split(",")] + + # 3. Validate aggregation function + agg_result = validate_aggregation_function(aggfunc) + if is_err(agg_result): + error = unwrap_err(agg_result) + typer.echo(f"Invalid aggregation function: {error}", err=True) raise typer.Exit(1) - df = unwrap(read_result) - original_count = len(df) + agg_func_normalized = unwrap(agg_result) - # Step 6: Handle empty file - if df.empty: - typer.echo("File is empty (no data rows)") - raise typer.Exit(0) - - # Step 7: Parse column specifications - row_columns = [c.strip() for c in rows.split(",")] - col_columns = [c.strip() for c in columns.split(",")] - value_columns = [c.strip() for c in values.split(",")] - - # Step 8: Validate columns exist - missing_rows = [c for c in row_columns if c not in df.columns] - missing_cols = [c for c in col_columns if c not in df.columns] - missing_vals = [c for c in value_columns if c not in df.columns] - - if missing_rows: - typer.echo(f"Error: Row columns not found: {', '.join(missing_rows)}", err=True) - typer.echo(f"Available columns: {', '.join(df.columns)}") - raise typer.Exit(1) - - if 
missing_cols: - typer.echo(f"Error: Column columns not found: {', '.join(missing_cols)}", err=True) - typer.echo(f"Available columns: {', '.join(df.columns)}") - raise typer.Exit(1) + # 4. Parse fill value + fill_val = None + if fill_value: + fill_result = parse_fill_value(fill_value) + if is_err(fill_result): + error = unwrap_err(fill_result) + typer.echo(f"Invalid fill value: {error}", err=True) + raise typer.Exit(1) + fill_val = unwrap(fill_result) - if missing_vals: - typer.echo(f"Error: Value columns not found: {', '.join(missing_vals)}", err=True) - typer.echo(f"Available columns: {', '.join(df.columns)}") + # 5. Validate columns + validation = validate_pivot_columns(df, row_cols, col_cols, value_cols) + if is_err(validation): + error = unwrap_err(validation) + typer.echo(f"Error: {error}", err=True) raise typer.Exit(1) - # Step 9: Parse fill value - fill_value_parsed = None - if fill_value: - if fill_value.lower() == "none": - fill_value_parsed = None - elif fill_value.lower() == "0": - fill_value_parsed = 0 - elif fill_value.lower() == "nan": - fill_value_parsed = float('nan') - else: - # Try to parse as number - try: - fill_value_parsed = int(fill_value) - except ValueError: - try: - fill_value_parsed = float(fill_value) - except ValueError: - fill_value_parsed = fill_value # Keep as string - - # Step 10: Create pivot table - try: - pivot_table = df.pivot_table( - index=row_columns, - columns=col_columns, - values=value_columns, - aggfunc=agg_func_normalized, - fill_value=fill_value_parsed, - observed=True, # Only use observed categories for categorical data - ) - - # Flatten column names if MultiIndex - if isinstance(pivot_table.columns, pd.MultiIndex): - pivot_table.columns = ['_'.join(map(str, col)).strip() for col in pivot_table.columns.values] - - # Flatten index if MultiIndex - if isinstance(pivot_table.index, pd.MultiIndex): - pivot_table.index = ['_'.join(map(str, idx)).strip() for idx in pivot_table.index.values] - - # Reset index to make rows 
into columns - pivot_table = pivot_table.reset_index() - - except Exception as e: - typer.echo(f"Error creating pivot table: {str(e)}", err=True) + # 6. Create pivot table + result = create_pivot_table( + df, + rows=row_cols, + columns=col_cols, + values=value_cols, + aggfunc=agg_func_normalized, + fill_value=fill_val + ) + if is_err(result): + error = unwrap_err(result) + typer.echo(f"Error creating pivot table: {error}", err=True) raise typer.Exit(1) - pivot_count = len(pivot_table) - pivot_cols = len(pivot_table.columns) + df_pivot = unwrap(result) - # Step 11: Display summary - typer.echo(f"Original rows: {original_count}") - typer.echo(f"Pivoted rows: {pivot_count}") - typer.echo(f"Rows: {', '.join(row_columns)}") - typer.echo(f"Columns: {', '.join(col_columns)}") - typer.echo(f"Values: {', '.join(value_columns)}") + # 7. Display summary + typer.echo(f"Created pivot table with {len(df_pivot)} rows x {len(df_pivot.columns)} columns") + typer.echo(f"Rows: {rows}") + if columns: + typer.echo(f"Columns: {columns}") + typer.echo(f"Values: {values}") typer.echo(f"Aggregation: {aggfunc}") - if fill_value is not None: + if fill_value: typer.echo(f"Fill value: {fill_value}") typer.echo("") - # Step 12: Handle dry-run mode - if dry_run: - typer.echo("Preview of pivot table:") - preview_rows = min(5, pivot_count) - display_table(pivot_table.head(preview_rows)) - raise typer.Exit(0) - - # Step 13: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(pivot_table, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display data - display_table(pivot_table) + # 8. 
Write or display + factory = HandlerFactory() + write_or_display(df_pivot, factory, output, format) # Create CLI app for this command -app = typer.Typer(help="Create pivot table summaries") +app = typer.Typer(help="Create pivot tables from data") # Register the command app.command()(pivot) diff --git a/excel_toolkit/commands/rename.py b/excel_toolkit/commands/rename.py index ca70de9..1544fbe 100644 --- a/excel_toolkit/commands/rename.py +++ b/excel_toolkit/commands/rename.py @@ -4,14 +4,17 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, + display_table, +) def rename( @@ -29,20 +32,12 @@ def rename( xl rename data.xlsx --mapping "old_name:new_name,first_name:fname" --output renamed.xlsx xl rename data.csv --mapping "id:ID,name:FullName" --output renamed.csv """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Validate mapping specified + # 1. Validate mapping specified if not mapping: typer.echo("Error: Must specify --mapping", err=True) raise typer.Exit(1) - # Step 3: Parse mapping + # 2. 
Parse mapping rename_dict = {} parse_errors = [] @@ -76,48 +71,17 @@ def rename( typer.echo("Error: No valid rename mappings", err=True) raise typer.Exit(1) - # Step 4: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 5: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # 3. Read file + df = read_data_file(file_path, sheet) original_count = len(df) original_cols = len(df.columns) - # Step 6: Handle empty file + # 4. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 7: Validate old column names exist + # 5. Validate old column names exist missing_cols = [old for old in rename_dict.keys() if old not in df.columns] if missing_cols: typer.echo(f"Error: Columns not found: {', '.join(missing_cols)}", err=True) @@ -132,10 +96,10 @@ def rename( typer.echo(f"Error: New column names conflict with existing columns: {', '.join(overlap)}", err=True) raise typer.Exit(1) - # Step 8: Apply rename + # 6. 
Apply rename df_renamed = df.rename(columns=rename_dict) - # Step 9: Display summary + # 7. Display summary renamed_count = len(rename_dict) typer.echo(f"Renamed {renamed_count} column(s)") for old_name, new_name in rename_dict.items(): @@ -143,25 +107,16 @@ def rename( typer.echo(f"Rows: {original_count}") typer.echo("") - # Step 10: Handle dry-run mode + # 8. Handle dry-run mode if dry_run: typer.echo("Preview of renamed data:") preview_rows = min(5, original_count) display_table(df_renamed.head(preview_rows)) raise typer.Exit(0) - # Step 11: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(df_renamed, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display data - display_table(df_renamed) + # 9. Write or display + factory = HandlerFactory() + write_or_display(df_renamed, factory, output, "table") # Create CLI app for this command diff --git a/excel_toolkit/commands/search.py b/excel_toolkit/commands/search.py index deb2264..46c08c8 100644 --- a/excel_toolkit/commands/search.py +++ b/excel_toolkit/commands/search.py @@ -4,15 +4,18 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd import re -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, + display_table, +) def search( @@ -34,60 +37,21 @@ def search( xl search data.csv --pattern "^[A-Z]" --regex --columns "Name" xl search logs.xlsx --pattern "error|warning" --regex --case-sensitive """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - 
typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Validate pattern specified + # 1. Validate pattern specified if not pattern: typer.echo("Error: Must specify --pattern", err=True) raise typer.Exit(1) - # Step 3: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 4: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # 2. Read file + df = read_data_file(file_path, sheet) original_count = len(df) - # Step 5: Handle empty file + # 3. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 6: Determine columns to search + # 4. Determine columns to search if columns: column_list = [c.strip() for c in columns.split(",")] # Validate columns exist @@ -101,7 +65,7 @@ def search( # Search all columns search_columns = df.columns.tolist() - # Step 7: Compile regex pattern if needed + # 5. 
Compile regex pattern if needed flags = 0 if case_sensitive else re.IGNORECASE if regex: @@ -115,7 +79,7 @@ def search( pattern_literal = re.escape(pattern) search_pattern = re.compile(pattern_literal, flags) - # Step 8: Search for pattern + # 6. Search for pattern matches = [] for col in search_columns: @@ -141,14 +105,14 @@ def search( typer.echo(f"No matches found for pattern: {pattern}") raise typer.Exit(0) - # Step 9: Create results DataFrame + # 7. Create results DataFrame df_results = pd.DataFrame(matches) # Get matching rows (unique rows that have at least one match) matching_row_indices = df_results['row'].unique() df_matched = df.loc[matching_row_indices].reset_index(drop=True) - # Step 10: Display summary + # 8. Display summary typer.echo(f"Pattern: {pattern}") if columns: typer.echo(f"Columns: {', '.join(search_columns)}") @@ -160,15 +124,10 @@ def search( typer.echo(f"Regex: {regex}") typer.echo("") - # Step 11: Write output or display + # 9. Write or display + factory = HandlerFactory() if output: - output_path = Path(output) - write_result = factory.write_file(df_matched, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") + write_or_display(df_matched, factory, output, "table") else: # Display matching rows display_table(df_matched) diff --git a/excel_toolkit/commands/select.py b/excel_toolkit/commands/select.py index 8ea80de..cf7d6c6 100644 --- a/excel_toolkit/commands/select.py +++ b/excel_toolkit/commands/select.py @@ -4,15 +4,18 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd import numpy as np -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.commands.common 
import ( + read_data_file, + write_or_display, + display_table, +) def select( @@ -38,15 +41,7 @@ def select( xl select large.xlsx --only-numeric --output numbers.xlsx xl select data.xlsx --columns "id,name->full_name,email" --output renamed.xlsx """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Check selection options + # 1. Check selection options selection_options = [ columns is not None, exclude is not None, @@ -66,48 +61,17 @@ def select( typer.echo("Use only one of: --columns, --exclude, --only-numeric, --only-string, --only-datetime, --only-non-empty") raise typer.Exit(1) - # Step 3: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 4: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # 2. Read file + df = read_data_file(file_path, sheet) original_count = len(df) original_cols = len(df.columns) - # Step 5: Handle empty file + # 3. 
Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 6: Determine columns to select + # 4. Determine columns to select selected_columns = [] if columns: @@ -131,15 +95,13 @@ def select( elif only_datetime: selected_columns = df.select_dtypes(include=['datetime64']).columns.tolist() elif only_non_empty: - for col in df.columns: - if df[col].notna().all(): - selected_columns.append(col) + selected_columns = [col for col in df.columns if df[col].notna().all()] if not selected_columns: typer.echo("No columns match the selection criteria") raise typer.Exit(0) - # Step 7: Validate column names exist + # 5. Validate and select columns if columns: # Parse original column names (before renaming) column_names = [] @@ -165,11 +127,7 @@ def select( raise typer.Exit(1) # Select columns - try: - df_selected = df[column_names].copy() - except Exception as e: - typer.echo(f"Error selecting columns: {str(e)}", err=True) - raise typer.Exit(1) + df_selected = df[column_names].copy() # Apply renaming if specified if rename_mapping: @@ -177,16 +135,8 @@ def select( selected_column_names = [rename_mapping.get(c, c) for c in column_names] else: selected_column_names = column_names - else: - # For other selection methods, validate columns exist - if columns or exclude: - missing_cols = [c for c in selected_columns if c not in df.columns] - if missing_cols: - typer.echo(f"Error: Columns not found: {', '.join(missing_cols)}", err=True) - typer.echo(f"Available columns: {', '.join(df.columns)}") - raise typer.Exit(1) - + # For other selection methods try: df_selected = df[selected_columns].copy() except Exception as e: @@ -195,7 +145,7 @@ def select( selected_column_names = selected_columns - # Step 8: Display summary + # 6. 
Display summary typer.echo(f"Selected {len(selected_column_names)} of {original_cols} columns") if columns: typer.echo(f"Columns: {', '.join(selected_column_names)}") @@ -212,25 +162,16 @@ def select( typer.echo(f"Rows: {original_count}") typer.echo("") - # Step 9: Handle dry-run mode + # 7. Handle dry-run mode if dry_run: typer.echo("Preview of selected data:") preview_rows = min(5, original_count) display_table(df_selected.head(preview_rows)) raise typer.Exit(0) - # Step 10: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(df_selected, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display data - display_table(df_selected) + # 8. Write or display + factory = HandlerFactory() + write_or_display(df_selected, factory, output, "table") # Create CLI app for this command diff --git a/excel_toolkit/commands/sort.py b/excel_toolkit/commands/sort.py index 3c578bd..4e91157 100644 --- a/excel_toolkit/commands/sort.py +++ b/excel_toolkit/commands/sort.py @@ -4,19 +4,22 @@ """ from pathlib import Path -from typing import Any - import typer -import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler -from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err, ok, err -from excel_toolkit.fp._result import Result +from excel_toolkit.core import HandlerFactory +from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err +from excel_toolkit.operations.sorting import ( + validate_sort_columns, + sort_dataframe, +) +from excel_toolkit.operations.filtering import ( + validate_condition, + normalize_condition, + apply_filter, +) from excel_toolkit.commands.common import ( - display_table, - display_csv, - display_json, - format_file_info, + read_data_file, + write_or_display, ) @@ -43,168 +46,80 @@ def sort( xl sort data.xlsx --columns name 
--where "age > 30" xl sort data.csv --columns date --na-placement first """ - path = Path(file_path) - factory = HandlerFactory() + # 1. Read file + df = read_data_file(file_path, sheet) - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Validate na_placement + # 2. Validate na_placement if na_placement not in ["first", "last"]: typer.echo(f"Invalid na_placement: {na_placement}. Must be 'first' or 'last'", err=True) raise typer.Exit(1) - # Step 3: Parse column list + # 3. Parse column list column_list = [c.strip() for c in columns.split(",")] if not column_list: typer.echo("Error: At least one column must be specified", err=True) raise typer.Exit(1) - # Step 4: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 5: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) + # 4. 
Validate columns + validation = validate_sort_columns(df, column_list) + if is_err(validation): + error = unwrap_err(validation) + typer.echo(f"Error: {error}", err=True) raise typer.Exit(1) - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) - original_count = len(df) - - # Step 6: Handle empty file - if df.empty: - typer.echo("File is empty (no data rows)") - raise typer.Exit(0) - - # Step 7: Validate column names - missing_cols = [c for c in column_list if c not in df.columns] - if missing_cols: - typer.echo(f"Error: Columns not found: {', '.join(missing_cols)}", err=True) - typer.echo(f"Available columns: {', '.join(df.columns)}") - raise typer.Exit(1) - - # Step 8: Apply filter if specified + # 5. Apply filter if specified if where: - # Import validation from filter command - from excel_toolkit.commands.filter import _validate_condition, _normalize_condition - - validation_result = _validate_condition(where) - if is_err(validation_result): - error_msg = unwrap_err(validation_result) - typer.echo(f"Invalid filter condition: {error_msg}", err=True) + # Validate condition + validation = validate_condition(where) + if is_err(validation): + error = unwrap_err(validation) + typer.echo(f"Invalid filter condition: {error}", err=True) raise typer.Exit(1) - normalized_condition = _normalize_condition(where) - - try: - df = df.query(normalized_condition) - except pd.errors.UndefinedVariableError as e: - import re - error_str = str(e) - col_match = re.search(r"'([^']+)'", error_str) - if col_match: - col = col_match.group(1) - typer.echo(f"Error: Column '{col}' not found", err=True) - typer.echo(f"Available columns: {', '.join(df.columns)}") - else: - typer.echo(f"Error: {error_str}", err=True) - raise typer.Exit(1) - except Exception as e: - error_msg = str(e) - typer.echo(f"Error filtering data: {error_msg}", err=True) - typer.echo(f"\nCondition: {where}", 
err=True) + # Normalize and apply + normalized = unwrap(normalize_condition(where)) + result = apply_filter(df, normalized) + if is_err(result): + error = unwrap_err(result) + typer.echo(f"Error filtering data: {error}", err=True) raise typer.Exit(1) + df = unwrap(result) filtered_count = len(df) + if filtered_count == 0: typer.echo("No rows match the filter condition") typer.echo(f"Condition: {where}") raise typer.Exit(0) else: - filtered_count = original_count - - # Step 9: Sort data - try: - # Convert single column to list for consistency - df_sorted = df.sort_values( - by=column_list, - ascending=not desc, - na_position=na_placement, - ) - except TypeError as e: - error_msg = str(e) - if "not comparable" in error_msg or "unorderable types" in error_msg: - typer.echo("Error: Cannot sort mixed data types in column", err=True) - typer.echo("Ensure all values in the column are of the same type", err=True) - else: - typer.echo(f"Error sorting data: {error_msg}", err=True) - raise typer.Exit(1) - except Exception as e: - typer.echo(f"Error sorting data: {str(e)}", err=True) - raise typer.Exit(1) + filtered_count = len(df) - # Step 10: Limit rows if specified - if rows is not None: - df_sorted = df_sorted.head(rows) + # 6. Build sort columns specification + sort_columns = [{"column": col, "ascending": not desc} for col in column_list] - # Step 11: Display summary + # 7. Sort data + result = sort_dataframe(df, sort_columns, na_placement=na_placement, limit=rows) + if is_err(result): + error = unwrap_err(result) + typer.echo(f"Error sorting data: {error}", err=True) + raise typer.Exit(1) + + df_sorted = unwrap(result) final_count = len(df_sorted) + + # 8. 
Display summary
     typer.echo(f"Sorted {final_count} rows")
     typer.echo(f"Columns: {columns}")
     typer.echo(f"Order: {'descending' if desc else 'ascending'}")
     if where:
-        typer.echo(f"Filter: {where} ({filtered_count} of {original_count} rows matched)")
+        typer.echo(f"Filter: {where} ({filtered_count} rows matched)")
     if na_placement:
         typer.echo(f"NaN placement: {na_placement}")
     typer.echo("")
 
-    # Step 12: Write output or display
-    if output:
-        output_path = Path(output)
-        write_result = factory.write_file(df_sorted, output_path)
-        if is_err(write_result):
-            error = unwrap_err(write_result)
-            typer.echo(f"Error writing file: {error}", err=True)
-            raise typer.Exit(1)
-        typer.echo(f"Written to: {output}")
-    else:
-        # Display data
-        if format == "table":
-            display_table(df_sorted)
-        elif format == "csv":
-            display_csv(df_sorted)
-        elif format == "json":
-            display_json(df_sorted)
-        else:
-            typer.echo(f"Unknown format: {format}", err=True)
-            typer.echo("Supported formats: table, csv, json")
-            raise typer.Exit(1)
+    # 9. 
Write or display + factory = HandlerFactory() + write_or_display(df_sorted, factory, output, format) # Create CLI app for this command diff --git a/excel_toolkit/commands/stats.py b/excel_toolkit/commands/stats.py index c713438..c8c7fac 100644 --- a/excel_toolkit/commands/stats.py +++ b/excel_toolkit/commands/stats.py @@ -11,9 +11,12 @@ import pandas as pd import numpy as np -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.commands.common import ( + read_data_file, + display_table, +) def stats( @@ -38,15 +41,7 @@ def stats( xl stats data.csv --all-columns --percentiles 10,25,50,75,90,95,99 xl stats data.xlsx --all-columns --include categorical """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Parse percentiles + # 1. Parse percentiles try: percentile_list = [float(p.strip()) for p in percentiles.split(",")] if not all(0 <= p <= 100 for p in percentile_list): @@ -57,7 +52,7 @@ def stats( typer.echo("Expected comma-separated values (e.g., 25,50,75)", err=True) raise typer.Exit(1) - # Step 3: Parse include types + # 2. 
Parse include types include_types = [t.strip().lower() for t in include.split(",")] valid_types = {"numeric", "categorical", "datetime", "all"} invalid_types = [t for t in include_types if t not in valid_types] @@ -66,46 +61,15 @@ def stats( typer.echo(f"Valid types: {', '.join(valid_types)}", err=True) raise typer.Exit(1) - # Step 4: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 5: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # 3. Read file + df = read_data_file(file_path, sheet) - # Step 6: Handle empty file + # 4. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 7: Determine columns to analyze + # 5. 
Determine columns to analyze if column: if column not in df.columns: typer.echo(f"Error: Column '{column}' not found", err=True) diff --git a/excel_toolkit/commands/strip.py b/excel_toolkit/commands/strip.py index 436d793..da2d8eb 100644 --- a/excel_toolkit/commands/strip.py +++ b/excel_toolkit/commands/strip.py @@ -4,14 +4,18 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.operations.cleaning import trim_whitespace +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, + display_table, +) def strip( @@ -32,55 +36,16 @@ def strip( xl strip data.csv --columns "Name,Email" --output cleaned.csv xl strip data.xlsx --left --right --output cleaned.xlsx """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 3: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, 
delimiter=delimiter)
-    else:
-        typer.echo("Unsupported handler type", err=True)
-        raise typer.Exit(1)
-
-    if is_err(read_result):
-        error = unwrap_err(read_result)
-        typer.echo(f"Error reading file: {error}", err=True)
-        raise typer.Exit(1)
-
-    df = unwrap(read_result)
+    # 1. Read file
+    df = read_data_file(file_path, sheet)
     original_count = len(df)
 
-    # Step 4: Handle empty file
+    # 2. Handle empty file
     if df.empty:
         typer.echo("File is empty (no data rows)")
         raise typer.Exit(0)
 
-    # Step 5: Determine columns to process
+    # 3. Determine columns to process
     if columns:
         column_list = [c.strip() for c in columns.split(",")]
         # Validate columns exist
@@ -93,28 +58,39 @@
         # Default: all string columns
         column_list = df.select_dtypes(include=['object']).columns.tolist()
 
-    # Step 6: Strip whitespace from specified columns
+    # 4. Count cells modified before stripping
     cells_modified = 0
-    for col in column_list:
-        if col in df.columns:
-            # Check if column is string type
-            if df[col].dtype == 'object':
-                # Count cells with leading/trailing whitespace before stripping
-                if left and right:
-                    before = df[col].str.strip().ne(df[col]).sum()
-                    df[col] = df[col].str.strip()
-                    cells_modified += before
-                elif left:
-                    before = df[col].str.lstrip().ne(df[col]).sum()
-                    df[col] = df[col].str.lstrip()
-                    cells_modified += before
-                elif right:
-                    before = df[col].str.rstrip().ne(df[col]).sum()
-                    df[col] = df[col].str.rstrip()
-                    cells_modified += before
-
-    # Step 7: Display summary
+    for col in column_list:
+        if col in df.columns and df[col].dtype == 'object':
+            if left and right:
+                cells_modified += df[col].str.strip().ne(df[col]).sum()
+            elif left:
+                cells_modified += df[col].str.lstrip().ne(df[col]).sum()
+            elif right:
+                cells_modified += df[col].str.rstrip().ne(df[col]).sum()
+
+    # 5. Determine strip side
+    if left and right:
+        side = "both"
+    elif left:
+        side = "left"
+    elif right:
+        side = "right"
+    else:
+        side = "both"
+
+    # 6. 
Strip whitespace using operation + result = trim_whitespace(df, columns=column_list, side=side) + + if is_err(result): + error = unwrap_err(result) + typer.echo(f"Error stripping whitespace: {error}", err=True) + raise typer.Exit(1) + + df_stripped = unwrap(result) + + # 7. Display summary typer.echo(f"Total rows: {original_count}") typer.echo(f"Columns processed: {len(column_list)}") if columns: @@ -125,18 +100,13 @@ def strip( typer.echo(f"Strip mode: {'left' if left else ''}{'/' if left and right else ''}{'right' if right else ''}") typer.echo("") - # Step 8: Write output or display + # 8. Write or display + factory = HandlerFactory() if output: - output_path = Path(output) - write_result = factory.write_file(df, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") + write_or_display(df_stripped, factory, output, "table") else: # Display preview - display_table(df.head(20)) + display_table(df_stripped.head(20)) if original_count > 20: typer.echo(f"\n... 
and {original_count - 20} more rows") diff --git a/excel_toolkit/commands/tail.py b/excel_toolkit/commands/tail.py index 0cdfdf5..47332bf 100644 --- a/excel_toolkit/commands/tail.py +++ b/excel_toolkit/commands/tail.py @@ -4,14 +4,14 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err from excel_toolkit.commands.common import ( + read_data_file, display_table, display_csv, display_json, @@ -44,110 +44,37 @@ def tail( Raises: typer.Exit: If file cannot be read """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - typer.echo("\nSupported formats: .xlsx, .xls, .csv") - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 3: Read file - sheet_name_display = None - if isinstance(handler, ExcelHandler): - # Determine which sheet to read - sheet_name = sheet - if sheet_name is None: - # Get first sheet name - names_result = handler.get_sheet_names(path) - if is_ok(names_result): - sheets = unwrap(names_result) - sheet_name = sheets[0] if sheets else None - - sheet_name_display = sheet_name - - # Read Excel file - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading Excel file: {error}", err=True) - raise typer.Exit(1) - - elif isinstance(handler, CSVHandler): - # Detect encoding - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else 
"utf-8" - - # Detect delimiter - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - # Read CSV file - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading CSV file: {error}", err=True) - raise typer.Exit(1) - - else: - typer.echo("Unsupported file format", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) - - # Step 4: Handle empty file + # 1. Read file + df = read_data_file(file_path, sheet) + + # 2. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") - # Still show column info if requested - if show_columns: - display_column_types(df) raise typer.Exit(0) - # Step 5: Display file info if columns requested + # 3. Limit columns if requested + if max_columns and len(df.columns) > max_columns: + df = df.iloc[:, :max_columns] + + # 4. Get last N rows + df_tail = df.tail(rows) + + # 5. Display file info + path = Path(file_path) + format_file_info(path, len(df), len(df.columns)) + + # 6. Show column information if requested if show_columns: - file_info = format_file_info( - str(path), sheet=sheet_name_display, total_rows=len(df), total_cols=len(df.columns) - ) - typer.echo(file_info) display_column_types(df) - raise typer.Exit(0) - - # Step 6: Get last N rows - tail_rows = min(rows, len(df)) - df_tail = df.tail(tail_rows) - # Step 7: Display based on format + # 7. 
Display data based on format if format == "table": - # Limit columns if requested - if max_columns and len(df_tail.columns) > max_columns: - df_tail = df_tail.iloc[:, :max_columns] - display_table(df_tail) - elif format == "csv": display_csv(df_tail) - elif format == "json": display_json(df_tail) - else: - typer.echo(f"Unknown format: {format}", err=True) - typer.echo("Supported formats: table, csv, json") - raise typer.Exit(1) - # Create CLI app for this command app = typer.Typer(help="Display the last N rows of a data file") diff --git a/excel_toolkit/commands/transform.py b/excel_toolkit/commands/transform.py index 8c2cf88..1f74190 100644 --- a/excel_toolkit/commands/transform.py +++ b/excel_toolkit/commands/transform.py @@ -4,14 +4,17 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, + display_table, +) def transform( @@ -39,15 +42,7 @@ def transform( xl transform data.xlsx --columns "Description" --operation "strip" --output clean.xlsx xl transform sales.xlsx --columns "Amount" --add "100" --output adjusted.xlsx """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Validate transformation options + # 1. 
Validate transformation options math_operations = { 'multiply': multiply, 'add': add, @@ -71,6 +66,9 @@ def transform( raise typer.Exit(1) # Validate only one math operation + math_op = None + numeric_value = None + if has_math_op: active_math_ops = [k for k, v in math_operations.items() if v is not None] if len(active_math_ops) > 1: @@ -87,7 +85,7 @@ def transform( typer.echo(f"Error: Invalid numeric value '{math_value}' for --{math_op}", err=True) raise typer.Exit(1) - # Step 3: Validate string operation + # 2. Validate string operation valid_string_ops = ["uppercase", "lowercase", "titlecase", "strip", "replace", "length"] if operation and operation not in valid_string_ops: typer.echo(f"Error: Invalid operation '{operation}'", err=True) @@ -100,47 +98,16 @@ def transform( typer.echo("Format: --replace \"old_pattern,new_pattern\"") raise typer.Exit(1) - # Step 4: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 5: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # 3. 
Read file + df = read_data_file(file_path, sheet) original_count = len(df) - # Step 6: Handle empty file + # 4. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 7: Parse columns + # 5. Parse columns column_list = [c.strip() for c in columns.split(",")] # Validate columns exist missing_cols = [c for c in column_list if c not in df.columns] @@ -149,7 +116,7 @@ def transform( typer.echo(f"Available columns: {', '.join(df.columns)}") raise typer.Exit(1) - # Step 8: Apply transformation + # 6. Apply transformation df_transformed = df.copy() for col in column_list: @@ -190,7 +157,7 @@ def transform( elif operation == "length": df_transformed[col] = df_transformed[col].astype(str).str.len() - # Step 9: Display summary + # 7. Display summary typer.echo(f"Transformed {len(column_list)} column(s)") typer.echo(f"Columns: {', '.join(column_list)}") if has_math_op: @@ -200,25 +167,16 @@ def transform( typer.echo(f"Rows: {original_count}") typer.echo("") - # Step 10: Handle dry-run mode + # 8. Handle dry-run mode if dry_run: typer.echo("Preview of transformed data:") preview_rows = min(5, original_count) display_table(df_transformed.head(preview_rows)) raise typer.Exit(0) - # Step 11: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(df_transformed, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display data - display_table(df_transformed) + # 9. 
Write or display + factory = HandlerFactory() + write_or_display(df_transformed, factory, output, "table") # Create CLI app for this command diff --git a/excel_toolkit/commands/unique.py b/excel_toolkit/commands/unique.py index ed1da41..8d4c351 100644 --- a/excel_toolkit/commands/unique.py +++ b/excel_toolkit/commands/unique.py @@ -4,14 +4,17 @@ """ from pathlib import Path -from typing import Any import typer import pandas as pd -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler +from excel_toolkit.core import HandlerFactory from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err -from excel_toolkit.commands.common import display_table +from excel_toolkit.commands.common import ( + read_data_file, + write_or_display, + display_table, +) def unique( @@ -31,60 +34,21 @@ def unique( xl unique data.csv --columns "Region,Product" --output unique.xlsx xl unique contacts.xlsx --columns "Email" --count --output email-counts.xlsx """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Validate columns specified + # 1. 
Validate columns specified if not columns: typer.echo("Error: Must specify --columns", err=True) raise typer.Exit(1) - # Step 3: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) - - handler = unwrap(handler_result) - - # Step 4: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) + # 2. Read file + df = read_data_file(file_path, sheet) original_count = len(df) - # Step 5: Handle empty file + # 3. Handle empty file if df.empty: typer.echo("File is empty (no data rows)") raise typer.Exit(0) - # Step 6: Parse columns + # 4. Parse columns column_list = [c.strip() for c in columns.split(",")] # Validate columns exist missing_cols = [c for c in column_list if c not in df.columns] @@ -93,7 +57,7 @@ def unique( typer.echo(f"Available columns: {', '.join(df.columns)}") raise typer.Exit(1) - # Step 7: Get unique values + # 5. Get unique values if len(column_list) == 1: # Single column - get unique values col = column_list[0] @@ -118,7 +82,7 @@ def unique( unique_count = len(df_unique) - # Step 8: Display summary + # 6. 
Display summary typer.echo(f"Original rows: {original_count}") typer.echo(f"Unique rows: {unique_count}") if len(column_list) == 1: @@ -127,25 +91,16 @@ def unique( typer.echo(f"Columns: {', '.join(column_list)}") typer.echo("") - # Step 9: Handle dry-run mode + # 7. Handle dry-run mode if dry_run: typer.echo("Preview of unique values:") preview_rows = min(5, unique_count) display_table(df_unique.head(preview_rows)) raise typer.Exit(0) - # Step 10: Write output or display - if output: - output_path = Path(output) - write_result = factory.write_file(df_unique, output_path) - if is_err(write_result): - error = unwrap_err(write_result) - typer.echo(f"Error writing file: {error}", err=True) - raise typer.Exit(1) - typer.echo(f"Written to: {output}") - else: - # Display data - display_table(df_unique) + # 8. Write or display + factory = HandlerFactory() + write_or_display(df_unique, factory, output, "table") # Create CLI app for this command diff --git a/excel_toolkit/commands/validate.py b/excel_toolkit/commands/validate.py index 8bd7a1c..ca2c364 100644 --- a/excel_toolkit/commands/validate.py +++ b/excel_toolkit/commands/validate.py @@ -1,497 +1,182 @@ """Validate command implementation. -Validates data against rules and constraints. +Validates data quality against various rules. 
""" from pathlib import Path -from typing import Any -import json -import re -from datetime import datetime - import typer -import pandas as pd -import numpy as np -from excel_toolkit.core import HandlerFactory, ExcelHandler, CSVHandler -from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err, ok, err -from excel_toolkit.fp._result import Result -from excel_toolkit.commands.common import display_table +from excel_toolkit.core import HandlerFactory +from excel_toolkit.fp import is_ok, is_err, unwrap, unwrap_err +from excel_toolkit.operations.validation import ( + validate_column_exists, + validate_column_type, + validate_value_range, + validate_unique, + check_null_values, + validate_dataframe, + ValidationReport, +) +from excel_toolkit.commands.common import read_data_file def validate( file_path: str = typer.Argument(..., help="Path to input file"), - rules: str | None = typer.Option(None, "--rules", "-r", help="Validation rules (comma-separated)"), - rules_file: str | None = typer.Option(None, "--rules-file", help="Path to JSON rules file"), - columns: str | None = typer.Option(None, "--columns", "-c", help="Specific columns to validate"), - output: str | None = typer.Option(None, "--output", "-o", help="Output report file"), - fail_fast: bool = typer.Option(False, "--fail-fast", help="Stop on first validation error"), + columns: str | None = typer.Option(None, "--columns", "-c", help="Comma-separated columns to check"), + types: str | None = typer.Option(None, "--types", "-t", help="Type checks (format: col:type,col:type)"), + range: str | None = typer.Option(None, "--range", "-r", help="Range check (format: col:min:max)"), + unique: str | None = typer.Option(None, "--unique", "-u", help="Check uniqueness of column(s)"), + null_threshold: float | None = typer.Option(None, "--null-threshold", help="Max null percentage (0-1)"), + min_value: float | None = typer.Option(None, "--min", help="Minimum value for range check"), + max_value: float | None = 
typer.Option(None, "--max", help="Maximum value for range check"), + verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed validation info"), + fail_fast: bool = typer.Option(False, "--fail-fast", help="Stop on first validation failure"), sheet: str | None = typer.Option(None, "--sheet", "-s", help="Sheet name for Excel files"), ) -> None: - """Validate data against rules and constraints. + """Validate data quality against various rules. - Supports various validation types: - - Type checking: int, float, str, bool, datetime - - Range validation: min:max (e.g., age:int:0-120) - - Pattern matching: email, url, phone, regex - - Null checking: required, optional - - Uniqueness: unique, duplicate + Performs comprehensive validation checks: + - Column existence: Verify columns exist + - Type checking: Validate data types (int, float, str, bool, datetime, numeric) + - Range validation: Ensure values within specified range + - Uniqueness: Check for duplicate values + - Null threshold: Verify null values don't exceed threshold Examples: - xl validate data.csv --rules "age:int:0-120,email:email" - xl validate sales.xlsx --rules-file validation.json - xl validate data.csv --columns "email,phone" --rules "email:email,phone:phone" - xl validate data.xlsx --rules "*" --output report.json --fail-fast + xl validate data.xlsx --columns id,name,email + xl validate data.csv --types "age:int,salary:float" + xl validate data.xlsx --range "age:0:120" + xl validate data.csv --unique id --null-threshold 0.1 + xl validate data.xlsx --columns id --types "id:int" --unique id --verbose """ - path = Path(file_path) - factory = HandlerFactory() - - # Step 1: Validate file exists - if not path.exists(): - typer.echo(f"File not found: {file_path}", err=True) - raise typer.Exit(1) - - # Step 2: Parse validation rules - if not rules and not rules_file: - typer.echo("Error: Either --rules or --rules-file must be specified", err=True) - typer.echo("Use --rules '*' to validate all 
columns with basic type checking", err=True) - raise typer.Exit(1) - - if rules and rules_file: - typer.echo("Error: Cannot specify both --rules and --rules-file", err=True) - raise typer.Exit(1) - - # Parse rules - if rules: - rules_result = _parse_rules_string(rules) - else: - rules_result = _parse_rules_file(rules_file) - - if is_err(rules_result): - error_msg = unwrap_err(rules_result) - typer.echo(f"Error parsing rules: {error_msg}", err=True) - raise typer.Exit(1) - - validation_rules = unwrap(rules_result) - - # Step 3: Get handler - handler_result = factory.get_handler(path) - if is_err(handler_result): - error = unwrap_err(handler_result) - typer.echo(f"{error}", err=True) - raise typer.Exit(1) + # 1. Read file + df = read_data_file(file_path, sheet) - handler = unwrap(handler_result) + # 2. Build validation rules + rules = [] - # Step 4: Read file - if isinstance(handler, ExcelHandler): - sheet_name = sheet - kwargs = {"sheet_name": sheet_name} if sheet_name else {} - read_result = handler.read(path, **kwargs) - elif isinstance(handler, CSVHandler): - # Auto-detect encoding and delimiter - encoding_result = handler.detect_encoding(path) - encoding = unwrap(encoding_result) if is_ok(encoding_result) else "utf-8" - - delimiter_result = handler.detect_delimiter(path, encoding) - delimiter = unwrap(delimiter_result) if is_ok(delimiter_result) else "," - - read_result = handler.read(path, encoding=encoding, delimiter=delimiter) - else: - typer.echo("Unsupported handler type", err=True) - raise typer.Exit(1) - - if is_err(read_result): - error = unwrap_err(read_result) - typer.echo(f"Error reading file: {error}", err=True) - raise typer.Exit(1) - - df = unwrap(read_result) - - # Step 5: Handle empty file - if df.empty: - typer.echo("File is empty (no data rows)") - raise typer.Exit(0) - - # Step 6: Determine columns to validate + # Column existence rule if columns: - column_list = [c.strip() for c in columns.split(",")] - # Validate column names exist - 
missing_cols = [c for c in column_list if c not in df.columns] - if missing_cols: - typer.echo(f"Error: Columns not found: {', '.join(missing_cols)}", err=True) - typer.echo(f"Available columns: {', '.join(df.columns)}") - raise typer.Exit(1) - elif "*" in validation_rules: - # Validate all columns - column_list = list(df.columns) - else: - # Use columns from rules - column_list = [col for col in validation_rules.keys() if col in df.columns] - - if not column_list: - typer.echo("No columns to validate") - raise typer.Exit(0) - - # Step 7: Perform validation - all_errors = [] - all_warnings = [] - total_checked = 0 - - for col in column_list: - col_rules = validation_rules.get("*", {}) if "*" in validation_rules else validation_rules.get(col, {}) - - if not col_rules: - # Basic validation only - col_errors, col_warnings = _validate_column_basic(df[col], col) + col_list = [c.strip() for c in columns.split(",")] + rules.append({ + "type": "column_exists", + "columns": col_list + }) + + # Type validation rule + if types: + type_dict = {} + for spec in types.split(","): + col, col_type = spec.split(":") + type_dict[col.strip()] = col_type.strip() + rules.append({ + "type": "column_type", + "column_types": type_dict + }) + + # Range validation rule + if range or (min_value is not None or max_value is not None): + if range: + # Parse range spec "col:min:max" + col_name, min_val, max_val = range.split(":") + range_col = col_name.strip() + range_min = float(min_val) + range_max = float(max_val) else: - col_errors, col_warnings = _validate_column_with_rules(df[col], col, col_rules) - - all_errors.extend(col_errors) - all_warnings.extend(col_warnings) - total_checked += 1 - - # Fail fast if requested - if fail_fast and col_errors: - break - - # Step 8: Generate report - total_errors = len(all_errors) - total_warnings = len(all_warnings) - - report = { - "file": str(path), - "total_rows": len(df), - "columns_checked": total_checked, - "total_errors": total_errors, - 
"total_warnings": total_warnings, - "errors": all_errors[:100], # Limit to first 100 - "warnings": all_warnings[:100], - "has_errors": total_errors > 0, - "has_warnings": total_warnings > 0, - } - - # Step 9: Display results - if total_errors == 0 and total_warnings == 0: - typer.echo("Validation passed!") - typer.echo(f"Checked {total_checked} columns across {len(df)} rows") - else: - typer.echo(f"Validation completed with {total_errors} error(s) and {total_warnings} warning(s)") - typer.echo(f"Checked {total_checked} columns across {len(df)} rows") - typer.echo("") - - if total_errors > 0: - typer.echo(f"Errors ({min(total_errors, 100)} shown):") - for i, error in enumerate(all_errors[:20], 1): - typer.echo(f" {i}. {error}") - if total_errors > 20: - typer.echo(f" ... and {total_errors - 20} more errors") - typer.echo("") - - if total_warnings > 0: - typer.echo(f"Warnings ({min(total_warnings, 100)} shown):") - for i, warning in enumerate(all_warnings[:10], 1): - typer.echo(f" {i}. {warning}") - if total_warnings > 10: - typer.echo(f" ... 
and {total_warnings - 10} more warnings") - - # Step 10: Write output if specified - if output: - output_path = Path(output) - try: - with open(output_path, "w") as f: - json.dump(report, f, indent=2, default=str) - typer.echo(f"Report written to: {output}") - except Exception as e: - typer.echo(f"Error writing report: {str(e)}", err=True) - raise typer.Exit(1) + # Use --min and --max options (need a column) + if not columns: + typer.echo("Error: Must specify --columns with --min/--max", err=True) + raise typer.Exit(1) + range_col = columns.split(",")[0].strip() + range_min = min_value + range_max = max_value + + rules.append({ + "type": "value_range", + "column": range_col, + "min": range_min, + "max": range_max + }) + + # Uniqueness rule + if unique: + unique_cols = [c.strip() for c in unique.split(",")] + rules.append({ + "type": "unique", + "columns": unique_cols + }) + + # Null threshold rule + if null_threshold is not None: + cols_to_check = [c.strip() for c in columns.split(",")] if columns else None + rules.append({ + "type": "null_threshold", + "columns": cols_to_check, + "threshold": null_threshold + }) + + # If no rules specified, check all columns exist and have no nulls + if not rules: + typer.echo("No validation rules specified. Use --columns, --types, --range, --unique, or --null-threshold") + typer.echo("Run 'xl validate --help' for examples") + raise typer.Exit(0) - # Exit with error code if validation failed - if total_errors > 0: + # 3. Run validation + result = validate_dataframe(df, rules) + if is_err(result): + error = unwrap_err(result) + typer.echo(f"Validation error: {error}", err=True) raise typer.Exit(1) + report: ValidationReport = unwrap(result) -def _parse_rules_string(rules_str: str) -> Result[dict, str]: - """Parse rules from command-line string. + # 4. 
Display results + _display_validation_report(report, verbose) - Format: "column1:rule1,rule2;column2:rule3" - Examples: - "age:int:0-120,email:email" - "name:required,email:email:optional" - "*" # Wildcard for all columns with basic validation - """ - rules = {} - - # Handle wildcard - if rules_str.strip() == "*": - return ok({"*": {}}) - - try: - # Split by semicolon for multiple columns - col_rules_list = rules_str.split(";") - - for col_rule in col_rules_list: - # Split by first colon to get column name and its rules - if ":" not in col_rule: - return err(f"Invalid rule format: {col_rule}. Expected 'column:rule'") - - parts = col_rule.split(":") - col_name = parts[0].strip() - rule_specs = parts[1:] - - if col_name == "*": - # Wildcard rule applies to all columns - rules["*"] = _parse_rule_specs(rule_specs) - else: - rules[col_name] = _parse_rule_specs(rule_specs) - - return ok(rules) - - except Exception as e: - return err(f"Error parsing rules: {str(e)}") + # 5. Exit with error if failures + if report.failed > 0 and fail_fast: + raise typer.Exit(1) -def _parse_rule_specs(rule_specs: list[str]) -> dict: - """Parse rule specifications into a dictionary. +def _display_validation_report(report: ValidationReport, verbose: bool) -> None: + """Display validation report in user-friendly format. 
- Examples: - ["int", "0-120"] -> {"type": "int", "min": 0, "max": 120} - ["email"] -> {"type": "str", "pattern": "email"} + Args: + report: Validation report from validate_dataframe + verbose: Whether to show detailed warnings """ - rule_dict = {} - - for spec in rule_specs: - spec = spec.strip().lower() - - # Type specification - if spec in ["int", "float", "str", "bool", "datetime"]: - rule_dict["type"] = spec - - # Range specification - elif "-" in spec and spec.replace("-", "").replace(".", "").isdigit(): - parts = spec.split("-") - if len(parts) == 2: - try: - rule_dict["min"] = float(parts[0]) - rule_dict["max"] = float(parts[1]) - except ValueError: - pass - - # Pattern specification - elif spec in ["email", "url", "phone"]: - rule_dict["pattern"] = spec - - # Null specification - elif spec in ["required", "optional"]: - rule_dict["nullable"] = (spec == "optional") - - # Uniqueness specification - elif spec in ["unique", "duplicate"]: - rule_dict["unique"] = (spec == "unique") - - # Regex pattern - elif spec.startswith("regex:"): - rule_dict["pattern"] = spec[6:] - rule_dict["pattern_type"] = "regex" - - return rule_dict - - -def _parse_rules_file(file_path: str) -> Result[dict, str]: - """Parse rules from JSON file.""" - try: - path = Path(file_path) - if not path.exists(): - return err(f"Rules file not found: {file_path}") - - with open(path, "r") as f: - rules = json.load(f) - - # Basic validation of rules structure - if not isinstance(rules, dict): - return err("Rules file must contain a JSON object") - - return ok(rules) - - except json.JSONDecodeError as e: - return err(f"Invalid JSON in rules file: {str(e)}") - except Exception as e: - return err(f"Error reading rules file: {str(e)}") - - -def _validate_column_basic(series: pd.Series, col_name: str) -> tuple[list[str], list[str]]: - """Perform basic validation on a column.""" - errors = [] - warnings = [] - - # Check for null values - null_count = series.isna().sum() - if null_count > 0: - 
warnings.append(f"Column '{col_name}': {null_count} null values ({null_count / len(series) * 100:.1f}%)") - - # Check data type consistency - if len(series) > 0: - # Get the dtype of non-null values - non_null = series.dropna() - if len(non_null) > 0: - dtype = non_null.dtype - # Check if string column has mixed types - if dtype == "object": - try: - # Try to convert to numeric - pd.to_numeric(non_null, errors="coerce") - except: - pass - - return errors, warnings - - -def _validate_column_with_rules(series: pd.Series, col_name: str, rules: dict) -> tuple[list[str], list[str]]: - """Validate a column against specific rules.""" - errors = [] - warnings = [] - - # Null validation - nullable = rules.get("nullable", True) # Default: nullable - - null_count = series.isna().sum() - if not nullable and null_count > 0: - errors.append(f"Column '{col_name}': {null_count} null values found (column is required)") - elif null_count > 0: - warnings.append(f"Column '{col_name}': {null_count} null values ({null_count / len(series) * 100:.1f}%)") - - # Get non-null values for further validation - non_null = series.dropna() - - if len(non_null) == 0: - return errors, warnings - - # Type validation - expected_type = rules.get("type") - if expected_type: - type_errors = _validate_type(non_null, col_name, expected_type) - errors.extend(type_errors) - # If type validation failed, skip further validations - if type_errors: - return errors, warnings - - # Range validation - if "min" in rules or "max" in rules: - range_errors = _validate_range(non_null, col_name, rules.get("min"), rules.get("max")) - errors.extend(range_errors) - - # Pattern validation - pattern = rules.get("pattern") - if pattern: - pattern_errors, pattern_warnings = _validate_pattern(non_null, col_name, pattern, rules.get("pattern_type", "name")) - errors.extend(pattern_errors) - warnings.extend(pattern_warnings) - - # Uniqueness validation - if rules.get("unique"): - unique_errors, unique_warnings = 
_validate_uniqueness(series, col_name) - errors.extend(unique_errors) - warnings.extend(unique_warnings) - - return errors, warnings - - -def _validate_type(series: pd.Series, col_name: str, expected_type: str) -> list[str]: - """Validate data type of a series.""" - errors = [] - - try: - if expected_type == "int": - # Check if all values can be converted to int - pd.to_numeric(series, errors="coerce") - elif expected_type == "float": - pd.to_numeric(series, errors="coerce") - elif expected_type == "bool": - # Check if values are boolean-like - for val in series.head(100): # Sample first 100 - if val not in [True, False, 1, 0, "True", "False", "true", "false", "1", "0"]: - return [f"Column '{col_name}': Contains non-boolean values"] - elif expected_type == "datetime": - pd.to_datetime(series, errors="coerce") - except Exception: - errors.append(f"Column '{col_name}': Cannot validate as {expected_type} type") - - return errors - - -def _validate_range(series: pd.Series, col_name: str, min_val: float | None, max_val: float | None) -> list[str]: - """Validate numeric range.""" - errors = [] - - try: - numeric_series = pd.to_numeric(series, errors="coerce") - - if min_val is not None: - below_min = (numeric_series < min_val).sum() - if below_min > 0: - errors.append(f"Column '{col_name}': {below_min} values below minimum {min_val}") - - if max_val is not None: - above_max = (numeric_series > max_val).sum() - if above_max > 0: - errors.append(f"Column '{col_name}': {above_max} values above maximum {max_val}") - - except Exception: - pass # Range validation only applies to numeric values - - return errors - - -def _validate_pattern(series: pd.Series, col_name: str, pattern: str, pattern_type: str) -> tuple[list[str], list[str]]: - """Validate pattern matching.""" - errors = [] - warnings = [] - - # Convert to string for pattern matching - str_series = series.astype(str) - - if pattern_type == "regex": - regex = pattern - elif pattern == "email": - regex = 
r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' - elif pattern == "url": - regex = r'^https?://[^\s/$.?#].[^\s]*$' - elif pattern == "phone": - regex = r'^\+?[\d\s\-\(\)]+$' + # Summary + typer.echo(f"✅ Passed: {report.passed}") + if report.failed > 0: + typer.echo(f"❌ Failed: {report.failed}", err=True) else: - return errors, warnings - - try: - compiled_regex = re.compile(regex) - matches = str_series.apply(lambda x: bool(compiled_regex.match(x))) - non_matches = (~matches).sum() - - if non_matches > 0: - errors.append(f"Column '{col_name}': {non_matches} values don't match pattern '{pattern}'") - - except Exception: - warnings.append(f"Column '{col_name}': Invalid regex pattern '{pattern}'") - - return errors, warnings - - -def _validate_uniqueness(series: pd.Series, col_name: str) -> tuple[list[str], list[str]]: - """Validate uniqueness constraint.""" - errors = [] - warnings = [] - - total_count = len(series) - unique_count = series.nunique() - duplicate_count = total_count - unique_count - - if duplicate_count > 0: - errors.append(f"Column '{col_name}': {duplicate_count} duplicate values found ({unique_count} unique out of {total_count})") + typer.echo("❌ Failed: 0") + + typer.echo("") + + # Errors + if report.errors: + typer.echo("Errors:", err=True) + for i, error in enumerate(report.errors, 1): + rule_num = error.get("rule", "?") + error_type = error.get("type", "unknown") + error_msg = error.get("error", str(error)) + typer.echo(f" {i}. Rule #{rule_num} ({error_type}): {error_msg}", err=True) + typer.echo("") - return errors, warnings + # Warnings (only if verbose) + if report.warnings and verbose: + typer.echo("Warnings:") + for i, warning in enumerate(report.warnings, 1): + col = warning.get("column", "?") + null_count = warning.get("null_count", 0) + null_percent = warning.get("null_percent", 0.0) + typer.echo(f" {i}. 
Column '{col}': {null_count} nulls ({null_percent:.1%})") + typer.echo("") # Create CLI app for this command -app = typer.Typer(help="Validate data against rules and constraints") +app = typer.Typer(help="Validate data quality") # Register the command app.command()(validate) diff --git a/pyproject.toml b/pyproject.toml index 4768020..ecb1541 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "excel-toolkit-cwd" -version = "0.1.0" +version = "0.2.0" description = "Command-line toolkit for Excel data manipulation and analysis" readme = "README.md" requires-python = ">=3.10"