From 497fa34de7bbc4bc64b49e5113169b2713df81c8 Mon Sep 17 00:00:00 2001 From: John Ajera Date: Tue, 25 Nov 2025 17:08:03 +1300 Subject: [PATCH] feat: add client requests data adds client requests in the dashboard --- .gitignore | 2 - .vscode/extensions.json | 11 ++++ .vscode/settings.json | 60 ++++++++++++++++++++ README.md | 86 +++++++++++++++++++++++++++- dashboard/dashboard.py | 27 +++++++++ scripts/analyze_logs.py | 64 ++++++++++++++++++++- scripts/clear_logs.py | 121 ++++++++++++++++++++++++++++++++++++++++ scripts/query_logs.py | 73 +++++++++++++++++++----- 8 files changed, 425 insertions(+), 19 deletions(-) create mode 100644 .vscode/extensions.json create mode 100644 .vscode/settings.json create mode 100755 scripts/clear_logs.py diff --git a/.gitignore b/.gitignore index 360b974..b6d9ff1 100644 --- a/.gitignore +++ b/.gitignore @@ -26,7 +26,6 @@ config/log_sources.yaml # logs/analytics.json # IDE -.vscode/ .idea/ *.swp *.swo @@ -39,4 +38,3 @@ Thumbs.db # Temporary files *.tmp *.bak - diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..b3ed25c --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,11 @@ +{ + "recommendations": [ + "ms-python.python", + "ms-python.vscode-pylance", + "ms-python.black-formatter", + "ms-python.flake8", + "timonwong.shellcheck", + "davidanson.vscode-markdownlint", + "redhat.vscode-yaml" + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..3ddcad6 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,60 @@ +{ + // Python settings + "python.defaultInterpreterPath": "python3", + "python.linting.enabled": true, + "python.linting.pylintEnabled": false, + "python.linting.flake8Enabled": true, + "python.formatting.provider": "black", + "python.formatting.blackArgs": ["--line-length", "100"], + "python.analysis.typeCheckingMode": "basic", + "python.analysis.autoImportCompletions": true, + + // File associations + "files.associations": { + "*.sh": "shellscript" + }, + + // Editor settings + "editor.formatOnSave": true, + "editor.rulers": [100], + "editor.tabSize": 4, + "editor.insertSpaces": true, + "editor.trimAutoWhitespace": true, + "files.trimTrailingWhitespace": true, + "files.insertFinalNewline": true, + "files.exclude": { + "**/__pycache__": true, + "**/*.pyc": true, + "**/.DS_Store": true + }, + + // Shell script settings + "[shellscript]": { + "editor.tabSize": 2, + "editor.insertSpaces": true + }, + + // Markdown settings + "[markdown]": { + "editor.wordWrap": "on", + "editor.quickSuggestions": { + "comments": "off", + "strings": "off", + "other": "off" + } + }, + + // Search settings + "search.exclude": { + "**/logs": true, + "**/__pycache__": true, + "**/*.pyc": true, + "**/.git": true + }, + + // File watcher settings (exclude large log directories) + "files.watcherExclude": { + "**/logs/**": true, + "**/__pycache__/**": true + } +} diff --git a/README.md b/README.md index 098a4b6..1c8ea53 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,11 @@ A comprehensive tool for syncing Fastly logs from S3, parsing them, and generati - **UTC timezone handling**: All date operations use UTC to match log file organization - **Log parsing**: Converts syslog-style Fastly logs to structured JSON/CSV - **Comprehensive analytics**: Traffic patterns, error analysis, performance metrics, user agent analysis, query parameter patterns, endpoint drill-down, daily summaries, and slowness investigation +- **Time-based filtering**: Filter logs to analyze only the last N hours 
(e.g., last hour, last 24 hours) +- **Client IP analysis**: Top client IPs by request volume for security and traffic analysis - **Interactive dashboard**: Streamlit-based web dashboard for visualizing log analytics - **Modular design**: Run sync, parse, or analyze operations independently or as a pipeline +- **Log management**: Utility script to clear all log files when needed ## Prerequisites @@ -78,6 +81,35 @@ python3 scripts/query_logs.py --start-date 2025-11-10 --end-date 2025-11-12 **Timezone Note**: All date operations use UTC to match the log file organization in S3. Log timestamps are preserved in UTC throughout parsing and analysis. +### Time-Based Filtering + +Filter logs to analyze only entries from the last N hours: + +**Analyze last hour from existing parsed logs** (no sync needed): + +```bash +python3 scripts/query_logs.py --last-hours 1.0 +``` + +**Sync, parse, and analyze with time filter**: + +```bash +python3 scripts/query_logs.py --date 2025-11-23 --last-hours 1.0 +``` + +**Other time filter examples**: + +```bash +python3 scripts/query_logs.py --last-hours 0.5 # Last 30 minutes +python3 scripts/query_logs.py --last-hours 24.0 # Last 24 hours +python3 scripts/query_logs.py --last-hours 2.5 # Last 2.5 hours +``` + +**Note**: When using `--last-hours` without dates: + +- If parsed logs exist, it will analyze them with the time filter +- If no parsed logs exist, it will automatically sync today's logs, parse them, then analyze with the time filter + ### Individual Operations #### Sync Only @@ -137,7 +169,14 @@ The dashboard will open in your browser (typically at `http://localhost:8501`) a - **Performance Metrics**: Cache hit/miss rates, response size statistics - **User Agent Analysis**: Top user agents and agent type distribution - **Query Patterns**: Most common query parameters and value distributions -- **Slowness Investigation**: Cache miss patterns, large response endpoints, peak traffic times +- **Slowness Investigation**: Cache miss patterns, large response endpoints, peak traffic times, **top client IPs by request volume** + +**New in Slowness Investigation**: Top client IPs analysis helps identify: + +- Bots and crawlers generating high traffic +- Potential abuse or DDoS patterns +- Most active clients +- Security investigation You can specify a custom parsed log file path in the dashboard sidebar. @@ -151,6 +190,7 @@ You can specify a custom parsed log file path in the dashboard sidebar. - `--operation OP`: Operation to perform: `sync`, `parse`, `analyze`, or `all` (default: `all`) - `--parsed-output FILE`: Output file for parsed logs (default: first enabled source's parsed output) - `--analytics-output FILE`: Output file for analytics report (optional) +- `--last-hours HOURS`: Filter to only entries from the last N hours (e.g., `1.0` for last hour). Only applies to analyze operation. If no parsed logs exist, automatically syncs today's logs first. ### sync_logs.sh @@ -171,6 +211,22 @@ You can specify a custom parsed log file path in the dashboard sidebar. 
- `--input FILE`: Input file (parsed JSON or CSV) - **required** - `--output FILE`: Output file path (optional) - `--format FORMAT`: Output format: `json` or `console` (default: `console`) +- `--last-hours HOURS`: Filter to only entries from the last N hours (e.g., `1.0` for last hour) + +### clear_logs.py + +Utility script to clear all log files (raw and parsed): + +- `--logs-dir DIR`: Path to logs directory (default: `./logs`) +- `--yes, -y`: Skip confirmation prompt + +**Examples**: + +```bash +python3 scripts/clear_logs.py # Clear with confirmation +python3 scripts/clear_logs.py --yes # Clear without confirmation +python3 scripts/clear_logs.py -y # Short form +``` ## Log Format @@ -222,7 +278,7 @@ The analytics report includes: - **Performance Metrics**: Cache hit/miss rates, response size statistics, hourly cache performance, hourly response size trends - **User Agent Analysis**: Top user agents, agent type distribution - **Query Patterns**: Most common query parameters, parameter value distributions, top query signatures -- **Slowness Investigation**: Traffic spikes, cache miss patterns, large response endpoints, peak traffic times, rate of change analysis +- **Slowness Investigation**: Traffic spikes, cache miss patterns, large response endpoints, peak traffic times, rate of change analysis, **top client IPs by request volume** - **Endpoint Drill-Down**: Detailed analysis for specific endpoints (time patterns, errors, cache, query params) - **Daily Summaries**: Daily request totals with status code breakdown by day @@ -243,7 +299,8 @@ fastly_log_query/ │ ├── query_logs.py # Main orchestration script │ ├── sync_logs.py # S3 sync script │ ├── parse_logs.py # Log parser -│ └── analyze_logs.py # Analytics engine +│ ├── analyze_logs.py # Analytics engine +│ └── clear_logs.py # Utility to clear all log files ├── src/ # Source code modules │ ├── sync/ # Sync implementations │ ├── parse/ # Parsing logic @@ -347,6 +404,29 @@ streamlit run dashboard/dashboard.py The dashboard will automatically load the parsed logs and display interactive visualizations. You can change the log file path in the sidebar if needed. +### Example 5: Analyze last hour of logs + +```bash +# Analyze last hour from existing parsed logs (no sync needed) +python3 scripts/query_logs.py --last-hours 1.0 + +# Or sync today's logs and analyze last hour +python3 scripts/query_logs.py --date 2025-11-23 --last-hours 1.0 + +# Analyze last 30 minutes +python3 scripts/query_logs.py --last-hours 0.5 +``` + +### Example 6: Clear all logs + +```bash +# Clear all log files with confirmation +python3 scripts/clear_logs.py + +# Clear without confirmation +python3 scripts/clear_logs.py --yes +``` + ## License This tool is provided as-is for internal use. 
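The README above documents the new top-client-IPs analysis and the dashboard hunk below reads it from `analytics['slowness']['top_request_ips']`, but the `scripts/analyze_logs.py` hunk that builds that mapping is not shown in this patch. A minimal sketch of how such a counter is typically produced, assuming parsed entries expose a `client_ip` field and a cap of 20 addresses (both are assumptions, not confirmed by the patch):

```python
from collections import Counter
from typing import Dict, List


def top_request_ips(entries: List[Dict], limit: int = 20) -> Dict[str, int]:
    """Requests per client IP; the 'client_ip' field name is an assumption."""
    counts = Counter(e["client_ip"] for e in entries if e.get("client_ip"))
    # Most active clients first, e.g. [{"client_ip": "10.0.0.1"}] * 3 -> {"10.0.0.1": 3}
    return dict(counts.most_common(limit))
```

The dashboard below trims this further (15 bars in the chart, 20 rows in the table), so the cap here only bounds the payload handed to the UI.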
diff --git a/dashboard/dashboard.py b/dashboard/dashboard.py index d621213..52578b2 100644 --- a/dashboard/dashboard.py +++ b/dashboard/dashboard.py @@ -524,6 +524,33 @@ def main(): if fig: st.plotly_chart(fig, width='stretch') + # Top client IPs + if analytics['slowness'].get('top_request_ips'): + st.subheader("Top Client IPs by Request Volume") + st.markdown("**Identifying clients generating the most traffic can help detect bots, crawlers, or potential abuse.**") + + col1, col2 = st.columns([2, 1]) + + with col1: + top_ips = analytics['slowness']['top_request_ips'] + fig = create_bar_chart( + top_ips, + "Top Client IPs", + "IP Address", + "Request Count", + limit=15 + ) + if fig: + st.plotly_chart(fig, width='stretch') + + with col2: + # Table with top IPs + df_ips = pd.DataFrame({ + 'IP Address': list(top_ips.keys())[:20], + 'Requests': list(top_ips.values())[:20] + }) + st.dataframe(df_ips, width='stretch', hide_index=True) + if __name__ == "__main__": main() diff --git a/scripts/analyze_logs.py b/scripts/analyze_logs.py index 55114c7..6307659 100755 --- a/scripts/analyze_logs.py +++ b/scripts/analyze_logs.py @@ -10,7 +10,7 @@ import sys from pathlib import Path from collections import Counter, defaultdict -from datetime import datetime +from datetime import datetime, timedelta, timezone from typing import Dict, List, Optional import pandas as pd import numpy as np @@ -33,6 +33,57 @@ def load_data(input_path: Path) -> List[Dict]: raise ValueError(f"Unsupported file format: {input_path.suffix}") +def filter_by_time(entries: List[Dict], last_hours: float) -> List[Dict]: + """ + Filter entries to only include those from the last N hours. + + Args: + entries: List of log entries + last_hours: Number of hours to look back (e.g., 1.0 for last hour) + + Returns: + Filtered list of entries + """ + if not entries or last_hours is None: + return entries + + # Get current time in UTC + now = datetime.now(timezone.utc) + cutoff_time = now - timedelta(hours=last_hours) + + filtered = [] + for entry in entries: + timestamp_str = entry.get('timestamp') + if not timestamp_str: + continue + + try: + # Parse timestamp (handle both ISO format and other formats) + if isinstance(timestamp_str, str): + # Try ISO format first + try: + entry_time = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00')) + except ValueError: + # Try other common formats + try: + entry_time = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S') + # Assume UTC if no timezone info + entry_time = entry_time.replace(tzinfo=timezone.utc) + except ValueError: + continue + else: + continue + + # Filter entries within the time window + if entry_time >= cutoff_time: + filtered.append(entry) + except (ValueError, TypeError, AttributeError): + # Skip entries with invalid timestamps + continue + + return filtered + + def analyze_traffic_patterns(entries: List[Dict]) -> Dict: """Analyze traffic patterns.""" if not entries: @@ -589,6 +640,11 @@ def main(): default='console', help='Output format (default: console)' ) + parser.add_argument( + '--last-hours', + type=float, + help='Filter to only entries from the last N hours (e.g., 1.0 for last hour)' + ) args = parser.parse_args() @@ -601,6 +657,12 @@ def main(): entries = load_data(input_path) print(f"Loaded {len(entries):,} log entries") + # Apply time filter if specified + if args.last_hours: + print(f"Filtering to last {args.last_hours} hour(s)...") + entries = filter_by_time(entries, args.last_hours) + print(f"Filtered to {len(entries):,} log entries") + print("Generating 
analytics...") analytics = { 'traffic': analyze_traffic_patterns(entries), diff --git a/scripts/clear_logs.py b/scripts/clear_logs.py new file mode 100755 index 0000000..fcfa62b --- /dev/null +++ b/scripts/clear_logs.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +""" +Clear all log files (raw and parsed) from the logs directory. +""" + +import argparse +import sys +from pathlib import Path + +# Colors for output +class Colors: + RED = '\033[0;31m' + GREEN = '\033[0;32m' + YELLOW = '\033[1;33m' + BLUE = '\033[0;34m' + NC = '\033[0m' # No Color + + +def clear_logs(logs_dir: Path = None, confirm: bool = True): + """ + Clear all log files from the logs directory. + + Args: + logs_dir: Path to logs directory (default: ./logs) + confirm: Whether to ask for confirmation (default: True) + """ + if logs_dir is None: + logs_dir = Path(__file__).parent.parent / "logs" + + if not logs_dir.exists(): + print(f"{Colors.YELLOW}Logs directory does not exist: {logs_dir}{Colors.NC}") + return 0 + + # Find all log files + log_files = [] + for pattern in ["*.log*", "*.json", "*.csv"]: + log_files.extend(logs_dir.rglob(pattern)) + + if not log_files: + print(f"{Colors.BLUE}No log files found to clear{Colors.NC}") + return 0 + + # Count files + file_count = len(log_files) + total_size = sum(f.stat().st_size for f in log_files if f.is_file()) + size_mb = total_size / (1024 * 1024) + + print(f"{Colors.BLUE}Found {file_count:,} log file(s) ({size_mb:.2f} MB){Colors.NC}") + + if confirm: + response = input(f"{Colors.YELLOW}Are you sure you want to delete all log files? (yes/no): {Colors.NC}") + if response.lower() not in ['yes', 'y']: + print(f"{Colors.BLUE}Cancelled{Colors.NC}") + return 0 + + # Delete files + deleted = 0 + for log_file in log_files: + try: + if log_file.is_file(): + log_file.unlink() + deleted += 1 + except Exception as e: + print(f"{Colors.RED}Error deleting {log_file}: {e}{Colors.NC}", file=sys.stderr) + + # Remove empty directories + try: + for dir_path in sorted(logs_dir.rglob("*"), reverse=True): + if dir_path.is_dir(): + try: + dir_path.rmdir() + except OSError: + pass # Directory not empty, skip + except Exception: + pass # Ignore errors when removing directories + + print(f"{Colors.GREEN}Cleared {deleted:,} log file(s){Colors.NC}") + + # Try to remove logs directory if empty + try: + if logs_dir.exists() and not any(logs_dir.iterdir()): + logs_dir.rmdir() + print(f"{Colors.GREEN}Removed empty logs directory{Colors.NC}") + except Exception: + pass + + return 0 + + +def main(): + parser = argparse.ArgumentParser( + description="Clear all log files (raw and parsed) from the logs directory", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s # Clear logs with confirmation + %(prog)s --yes # Clear logs without confirmation + %(prog)s --logs-dir ./logs # Clear logs from custom directory + """ + ) + parser.add_argument( + "--logs-dir", + type=str, + help="Path to logs directory (default: ./logs)" + ) + parser.add_argument( + "--yes", "-y", + action="store_true", + help="Skip confirmation prompt" + ) + + args = parser.parse_args() + + logs_dir = Path(args.logs_dir) if args.logs_dir else None + + return clear_logs(logs_dir, confirm=not args.yes) + + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/scripts/query_logs.py b/scripts/query_logs.py index 5d8a5a3..59b9be6 100755 --- a/scripts/query_logs.py +++ b/scripts/query_logs.py @@ -122,7 +122,7 @@ def run_parse(): return parsed_outputs[0] if parsed_outputs else None -def run_analyze(parsed_output: 
str, analytics_output: str = None): +def run_analyze(parsed_output: str, analytics_output: str = None, last_hours: float = None): """Run analyze operation.""" script_dir = Path(__file__).parent analyze_script = script_dir / "analyze_logs.py" @@ -143,6 +143,9 @@ def run_analyze(parsed_output: str, analytics_output: str = None): if analytics_output: args.extend(["--output", analytics_output]) + + if last_hours: + args.extend(["--last-hours", str(last_hours)]) try: subprocess.run(args, check=True) @@ -184,21 +187,65 @@ def main(): type=str, help="Output file for analytics report (optional)" ) + parser.add_argument( + "--last-hours", + type=float, + help="Filter to only entries from the last N hours (e.g., 1.0 for last hour). Only applies to analyze operation." + ) args = parser.parse_args() - # Parse date range - try: - start_date, end_date = parse_date_range( - args.start_date, args.end_date, args.date - ) - except ValueError as e: - print(f"{Colors.RED}Error: {e}{Colors.NC}", file=sys.stderr) - parser.print_help() - sys.exit(1) + # Parse date range (only if dates are provided) + start_date = None + end_date = None + has_dates = bool(args.start_date or args.end_date or args.date) + if has_dates: + try: + start_date, end_date = parse_date_range( + args.start_date, args.end_date, args.date + ) + except ValueError as e: + print(f"{Colors.RED}Error: {e}{Colors.NC}", file=sys.stderr) + parser.print_help() + sys.exit(1) + + if args.date: + print(f"{Colors.YELLOW}Note: --date specified, syncing from {start_date} to {end_date} (today UTC){Colors.NC}") - if args.date: - print(f"{Colors.YELLOW}Note: --date specified, syncing from {start_date} to {end_date} (today UTC){Colors.NC}") + # If --last-hours is specified without dates and without explicit operation: + # - Check if parsed logs exist + # - If not, automatically sync today's logs and parse them, then analyze + # - If yes, just analyze + if args.last_hours and not has_dates and args.operation == "all": + # Check if parsed logs exist + try: + sources = load_config() + enabled = get_enabled_sources(sources) + if enabled: + first_source = next(iter(enabled)) + first_config = enabled[first_source] + parsed_dir = first_config.get('parsed_dir', f"logs/{first_source}/parsed") + parsed_output_path = Path(f"{parsed_dir}/parsed_logs.json") + + if parsed_output_path.exists(): + # Parsed logs exist, just analyze + args.operation = "analyze" + else: + # No parsed logs, sync today's logs and parse, then analyze + print(f"{Colors.YELLOW}No parsed logs found. Syncing today's logs...{Colors.NC}") + today = datetime.now(timezone.utc).strftime("%Y-%m-%d") + try: + start_date, end_date = parse_date_range(today, today, None) + has_dates = True + args.operation = "all" + except ValueError as e: + print(f"{Colors.RED}Error setting date: {e}{Colors.NC}", file=sys.stderr) + args.operation = "analyze" + else: + args.operation = "analyze" + except Exception: + # If we can't determine, default to analyze + args.operation = "analyze" # Validate date parameters for sync operation if args.operation in ["sync", "all"]: @@ -245,7 +292,7 @@ def main(): print(f"{Colors.RED}Error: Failed to determine parsed output: {e}{Colors.NC}", file=sys.stderr) sys.exit(1) - if not run_analyze(parsed_output, args.analytics_output): + if not run_analyze(parsed_output, args.analytics_output, args.last_hours): sys.exit(1) print(f"{Colors.GREEN}All operations completed successfully!{Colors.NC}")
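A minimal smoke test (not from the patch) for the new `filter_by_time` helper, assuming it is run from the repository root with the project's Python dependencies (pandas, numpy) installed and that the script keeps its CLI behind the usual `if __name__ == "__main__"` guard. Only timezone-aware timestamps are used here; `filter_by_time` reads nothing but the `timestamp` field.

```python
import importlib.util
from datetime import datetime, timedelta, timezone
from pathlib import Path

# Load scripts/analyze_logs.py as a module so filter_by_time can be called directly.
spec = importlib.util.spec_from_file_location("analyze_logs", Path("scripts/analyze_logs.py"))
analyze_logs = importlib.util.module_from_spec(spec)
spec.loader.exec_module(analyze_logs)

now = datetime.now(timezone.utc)
entries = [
    {"timestamp": (now - timedelta(minutes=10)).isoformat()},  # inside the 1-hour window
    {"timestamp": (now - timedelta(hours=3)).isoformat()},     # outside the window
    {"timestamp": None},                                       # skipped: no timestamp
]

recent = analyze_logs.filter_by_time(entries, last_hours=1.0)
print(f"kept {len(recent)} of {len(entries)} entries")  # expected: kept 1 of 3
```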