From 497fa34de7bbc4bc64b49e5113169b2713df81c8 Mon Sep 17 00:00:00 2001 From: John Ajera Date: Tue, 25 Nov 2025 17:08:03 +1300 Subject: [PATCH] feat: add client requests data adds client requests in the dashboard --- .gitignore | 2 - .vscode/extensions.json | 11 ++++ .vscode/settings.json | 60 ++++++++++++++++++++ README.md | 86 +++++++++++++++++++++++++++- dashboard/dashboard.py | 27 +++++++++ scripts/analyze_logs.py | 64 ++++++++++++++++++++- scripts/clear_logs.py | 121 ++++++++++++++++++++++++++++++++++++++++ scripts/query_logs.py | 73 +++++++++++++++++++----- 8 files changed, 425 insertions(+), 19 deletions(-) create mode 100644 .vscode/extensions.json create mode 100644 .vscode/settings.json create mode 100755 scripts/clear_logs.py diff --git a/.gitignore b/.gitignore index 360b974..b6d9ff1 100644 --- a/.gitignore +++ b/.gitignore @@ -26,7 +26,6 @@ config/log_sources.yaml # logs/analytics.json # IDE -.vscode/ .idea/ *.swp *.swo @@ -39,4 +38,3 @@ Thumbs.db # Temporary files *.tmp *.bak - diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..b3ed25c --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,11 @@ +{ + "recommendations": [ + "ms-python.python", + "ms-python.vscode-pylance", + "ms-python.black-formatter", + "ms-python.flake8", + "timonwong.shellcheck", + "davidanson.vscode-markdownlint", + "redhat.vscode-yaml" + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..3ddcad6 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,60 @@ +{ + // Python settings + "python.defaultInterpreterPath": "python3", + "python.linting.enabled": true, + "python.linting.pylintEnabled": false, + "python.linting.flake8Enabled": true, + "python.formatting.provider": "black", + "python.formatting.blackArgs": ["--line-length", "100"], + "python.analysis.typeCheckingMode": "basic", + "python.analysis.autoImportCompletions": true, + + // File associations + "files.associations": { + "*.sh": "shellscript" + }, + + // Editor settings + "editor.formatOnSave": true, + "editor.rulers": [100], + "editor.tabSize": 4, + "editor.insertSpaces": true, + "editor.trimAutoWhitespace": true, + "files.trimTrailingWhitespace": true, + "files.insertFinalNewline": true, + "files.exclude": { + "**/__pycache__": true, + "**/*.pyc": true, + "**/.DS_Store": true + }, + + // Shell script settings + "[shellscript]": { + "editor.tabSize": 2, + "editor.insertSpaces": true + }, + + // Markdown settings + "[markdown]": { + "editor.wordWrap": "on", + "editor.quickSuggestions": { + "comments": "off", + "strings": "off", + "other": "off" + } + }, + + // Search settings + "search.exclude": { + "**/logs": true, + "**/__pycache__": true, + "**/*.pyc": true, + "**/.git": true + }, + + // File watcher settings (exclude large log directories) + "files.watcherExclude": { + "**/logs/**": true, + "**/__pycache__/**": true + } +} diff --git a/README.md b/README.md index 098a4b6..1c8ea53 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,11 @@ A comprehensive tool for syncing Fastly logs from S3, parsing them, and generati - **UTC timezone handling**: All date operations use UTC to match log file organization - **Log parsing**: Converts syslog-style Fastly logs to structured JSON/CSV - **Comprehensive analytics**: Traffic patterns, error analysis, performance metrics, user agent analysis, query parameter patterns, endpoint drill-down, daily summaries, and slowness investigation +- **Time-based filtering**: Filter logs to analyze only the last N hours 
(e.g., last hour, last 24 hours) +- **Client IP analysis**: Top client IPs by request volume for security and traffic analysis - **Interactive dashboard**: Streamlit-based web dashboard for visualizing log analytics - **Modular design**: Run sync, parse, or analyze operations independently or as a pipeline +- **Log management**: Utility script to clear all log files when needed ## Prerequisites @@ -78,6 +81,35 @@ python3 scripts/query_logs.py --start-date 2025-11-10 --end-date 2025-11-12 **Timezone Note**: All date operations use UTC to match the log file organization in S3. Log timestamps are preserved in UTC throughout parsing and analysis. +### Time-Based Filtering + +Filter logs to analyze only entries from the last N hours: + +**Analyze last hour from existing parsed logs** (no sync needed): + +```bash +python3 scripts/query_logs.py --last-hours 1.0 +``` + +**Sync, parse, and analyze with time filter**: + +```bash +python3 scripts/query_logs.py --date 2025-11-23 --last-hours 1.0 +``` + +**Other time filter examples**: + +```bash +python3 scripts/query_logs.py --last-hours 0.5 # Last 30 minutes +python3 scripts/query_logs.py --last-hours 24.0 # Last 24 hours +python3 scripts/query_logs.py --last-hours 2.5 # Last 2.5 hours +``` + +**Note**: When using `--last-hours` without dates: + +- If parsed logs exist, it will analyze them with the time filter +- If no parsed logs exist, it will automatically sync today's logs, parse them, then analyze with the time filter + ### Individual Operations #### Sync Only @@ -137,7 +169,14 @@ The dashboard will open in your browser (typically at `http://localhost:8501`) a - **Performance Metrics**: Cache hit/miss rates, response size statistics - **User Agent Analysis**: Top user agents and agent type distribution - **Query Patterns**: Most common query parameters and value distributions -- **Slowness Investigation**: Cache miss patterns, large response endpoints, peak traffic times +- **Slowness Investigation**: Cache miss patterns, large response endpoints, peak traffic times, **top client IPs by request volume** + +**New in Slowness Investigation**: Top client IPs analysis helps identify: + +- Bots and crawlers generating high traffic +- Potential abuse or DDoS patterns +- Most active clients +- Security investigation You can specify a custom parsed log file path in the dashboard sidebar. @@ -151,6 +190,7 @@ You can specify a custom parsed log file path in the dashboard sidebar. - `--operation OP`: Operation to perform: `sync`, `parse`, `analyze`, or `all` (default: `all`) - `--parsed-output FILE`: Output file for parsed logs (default: first enabled source's parsed output) - `--analytics-output FILE`: Output file for analytics report (optional) +- `--last-hours HOURS`: Filter to only entries from the last N hours (e.g., `1.0` for last hour). Only applies to analyze operation. If no parsed logs exist, automatically syncs today's logs first. ### sync_logs.sh @@ -171,6 +211,22 @@ You can specify a custom parsed log file path in the dashboard sidebar. 
- `--input FILE`: Input file (parsed JSON or CSV) - **required** - `--output FILE`: Output file path (optional) - `--format FORMAT`: Output format: `json` or `console` (default: `console`) +- `--last-hours HOURS`: Filter to only entries from the last N hours (e.g., `1.0` for last hour) + +### clear_logs.py + +Utility script to clear all log files (raw and parsed): + +- `--logs-dir DIR`: Path to logs directory (default: `./logs`) +- `--yes, -y`: Skip confirmation prompt + +**Examples**: + +```bash +python3 scripts/clear_logs.py # Clear with confirmation +python3 scripts/clear_logs.py --yes # Clear without confirmation +python3 scripts/clear_logs.py -y # Short form +``` ## Log Format @@ -222,7 +278,7 @@ The analytics report includes: - **Performance Metrics**: Cache hit/miss rates, response size statistics, hourly cache performance, hourly response size trends - **User Agent Analysis**: Top user agents, agent type distribution - **Query Patterns**: Most common query parameters, parameter value distributions, top query signatures -- **Slowness Investigation**: Traffic spikes, cache miss patterns, large response endpoints, peak traffic times, rate of change analysis +- **Slowness Investigation**: Traffic spikes, cache miss patterns, large response endpoints, peak traffic times, rate of change analysis, **top client IPs by request volume** - **Endpoint Drill-Down**: Detailed analysis for specific endpoints (time patterns, errors, cache, query params) - **Daily Summaries**: Daily request totals with status code breakdown by day @@ -243,7 +299,8 @@ fastly_log_query/ │ ├── query_logs.py # Main orchestration script │ ├── sync_logs.py # S3 sync script │ ├── parse_logs.py # Log parser -│ └── analyze_logs.py # Analytics engine +│ ├── analyze_logs.py # Analytics engine +│ └── clear_logs.py # Utility to clear all log files ├── src/ # Source code modules │ ├── sync/ # Sync implementations │ ├── parse/ # Parsing logic @@ -347,6 +404,29 @@ streamlit run dashboard/dashboard.py The dashboard will automatically load the parsed logs and display interactive visualizations. You can change the log file path in the sidebar if needed. +### Example 5: Analyze last hour of logs + +```bash +# Analyze last hour from existing parsed logs (no sync needed) +python3 scripts/query_logs.py --last-hours 1.0 + +# Or sync today's logs and analyze last hour +python3 scripts/query_logs.py --date 2025-11-23 --last-hours 1.0 + +# Analyze last 30 minutes +python3 scripts/query_logs.py --last-hours 0.5 +``` + +### Example 6: Clear all logs + +```bash +# Clear all log files with confirmation +python3 scripts/clear_logs.py + +# Clear without confirmation +python3 scripts/clear_logs.py --yes +``` + ## License This tool is provided as-is for internal use. 
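The README above documents the new top-client-IPs analysis and the dashboard hunk below reads it from `analytics['slowness']['top_request_ips']`, but the `scripts/analyze_logs.py` hunk that builds that mapping is not shown in this patch. A minimal sketch of how such a counter is typically produced, assuming parsed entries expose a `client_ip` field and a cap of 20 addresses (both are assumptions, not confirmed by the patch):

```python
from collections import Counter
from typing import Dict, List


def top_request_ips(entries: List[Dict], limit: int = 20) -> Dict[str, int]:
    """Requests per client IP; the 'client_ip' field name is an assumption."""
    counts = Counter(e["client_ip"] for e in entries if e.get("client_ip"))
    # Most active clients first, e.g. [{"client_ip": "10.0.0.1"}] * 3 -> {"10.0.0.1": 3}
    return dict(counts.most_common(limit))
```

The dashboard below trims this further (15 bars in the chart, 20 rows in the table), so the cap here only bounds the payload handed to the UI.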
diff --git a/dashboard/dashboard.py b/dashboard/dashboard.py index d621213..52578b2 100644 --- a/dashboard/dashboard.py +++ b/dashboard/dashboard.py @@ -524,6 +524,33 @@ def main(): if fig: st.plotly_chart(fig, width='stretch') + # Top client IPs + if analytics['slowness'].get('top_request_ips'): + st.subheader("Top Client IPs by Request Volume") + st.markdown("**Identifying clients generating the most traffic can help detect bots, crawlers, or potential abuse.**") + + col1, col2 = st.columns([2, 1]) + + with col1: + top_ips = analytics['slowness']['top_request_ips'] + fig = create_bar_chart( + top_ips, + "Top Client IPs", + "IP Address", + "Request Count", + limit=15 + ) + if fig: + st.plotly_chart(fig, width='stretch') + + with col2: + # Table with top IPs + df_ips = pd.DataFrame({ + 'IP Address': list(top_ips.keys())[:20], + 'Requests': list(top_ips.values())[:20] + }) + st.dataframe(df_ips, width='stretch', hide_index=True) + if __name__ == "__main__": main() diff --git a/scripts/analyze_logs.py b/scripts/analyze_logs.py index 55114c7..6307659 100755 --- a/scripts/analyze_logs.py +++ b/scripts/analyze_logs.py @@ -10,7 +10,7 @@ import sys from pathlib import Path from collections import Counter, defaultdict -from datetime import datetime +from datetime import datetime, timedelta, timezone from typing import Dict, List, Optional import pandas as pd import numpy as np @@ -33,6 +33,57 @@ def load_data(input_path: Path) -> List[Dict]: raise ValueError(f"Unsupported file format: {input_path.suffix}") +def filter_by_time(entries: List[Dict], last_hours: float) -> List[Dict]: + """ + Filter entries to only include those from the last N hours. + + Args: + entries: List of log entries + last_hours: Number of hours to look back (e.g., 1.0 for last hour) + + Returns: + Filtered list of entries + """ + if not entries or last_hours is None: + return entries + + # Get current time in UTC + now = datetime.now(timezone.utc) + cutoff_time = now - timedelta(hours=last_hours) + + filtered = [] + for entry in entries: + timestamp_str = entry.get('timestamp') + if not timestamp_str: + continue + + try: + # Parse timestamp (handle both ISO format and other formats) + if isinstance(timestamp_str, str): + # Try ISO format first + try: + entry_time = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00')) + except ValueError: + # Try other common formats + try: + entry_time = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S') + # Assume UTC if no timezone info + entry_time = entry_time.replace(tzinfo=timezone.utc) + except ValueError: + continue + else: + continue + + # Filter entries within the time window + if entry_time >= cutoff_time: + filtered.append(entry) + except (ValueError, TypeError, AttributeError): + # Skip entries with invalid timestamps + continue + + return filtered + + def analyze_traffic_patterns(entries: List[Dict]) -> Dict: """Analyze traffic patterns.""" if not entries: @@ -589,6 +640,11 @@ def main(): default='console', help='Output format (default: console)' ) + parser.add_argument( + '--last-hours', + type=float, + help='Filter to only entries from the last N hours (e.g., 1.0 for last hour)' + ) args = parser.parse_args() @@ -601,6 +657,12 @@ def main(): entries = load_data(input_path) print(f"Loaded {len(entries):,} log entries") + # Apply time filter if specified + if args.last_hours: + print(f"Filtering to last {args.last_hours} hour(s)...") + entries = filter_by_time(entries, args.last_hours) + print(f"Filtered to {len(entries):,} log entries") + print("Generating 
analytics...") analytics = { 'traffic': analyze_traffic_patterns(entries), diff --git a/scripts/clear_logs.py b/scripts/clear_logs.py new file mode 100755 index 0000000..fcfa62b --- /dev/null +++ b/scripts/clear_logs.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +""" +Clear all log files (raw and parsed) from the logs directory. +""" + +import argparse +import sys +from pathlib import Path + +# Colors for output +class Colors: + RED = '\033[0;31m' + GREEN = '\033[0;32m' + YELLOW = '\033[1;33m' + BLUE = '\033[0;34m' + NC = '\033[0m' # No Color + + +def clear_logs(logs_dir: Path = None, confirm: bool = True): + """ + Clear all log files from the logs directory. + + Args: + logs_dir: Path to logs directory (default: ./logs) + confirm: Whether to ask for confirmation (default: True) + """ + if logs_dir is None: + logs_dir = Path(__file__).parent.parent / "logs" + + if not logs_dir.exists(): + print(f"{Colors.YELLOW}Logs directory does not exist: {logs_dir}{Colors.NC}") + return 0 + + # Find all log files + log_files = [] + for pattern in ["*.log*", "*.json", "*.csv"]: + log_files.extend(logs_dir.rglob(pattern)) + + if not log_files: + print(f"{Colors.BLUE}No log files found to clear{Colors.NC}") + return 0 + + # Count files + file_count = len(log_files) + total_size = sum(f.stat().st_size for f in log_files if f.is_file()) + size_mb = total_size / (1024 * 1024) + + print(f"{Colors.BLUE}Found {file_count:,} log file(s) ({size_mb:.2f} MB){Colors.NC}") + + if confirm: + response = input(f"{Colors.YELLOW}Are you sure you want to delete all log files? (yes/no): {Colors.NC}") + if response.lower() not in ['yes', 'y']: + print(f"{Colors.BLUE}Cancelled{Colors.NC}") + return 0 + + # Delete files + deleted = 0 + for log_file in log_files: + try: + if log_file.is_file(): + log_file.unlink() + deleted += 1 + except Exception as e: + print(f"{Colors.RED}Error deleting {log_file}: {e}{Colors.NC}", file=sys.stderr) + + # Remove empty directories + try: + for dir_path in sorted(logs_dir.rglob("*"), reverse=True): + if dir_path.is_dir(): + try: + dir_path.rmdir() + except OSError: + pass # Directory not empty, skip + except Exception: + pass # Ignore errors when removing directories + + print(f"{Colors.GREEN}Cleared {deleted:,} log file(s){Colors.NC}") + + # Try to remove logs directory if empty + try: + if logs_dir.exists() and not any(logs_dir.iterdir()): + logs_dir.rmdir() + print(f"{Colors.GREEN}Removed empty logs directory{Colors.NC}") + except Exception: + pass + + return 0 + + +def main(): + parser = argparse.ArgumentParser( + description="Clear all log files (raw and parsed) from the logs directory", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s # Clear logs with confirmation + %(prog)s --yes # Clear logs without confirmation + %(prog)s --logs-dir ./logs # Clear logs from custom directory + """ + ) + parser.add_argument( + "--logs-dir", + type=str, + help="Path to logs directory (default: ./logs)" + ) + parser.add_argument( + "--yes", "-y", + action="store_true", + help="Skip confirmation prompt" + ) + + args = parser.parse_args() + + logs_dir = Path(args.logs_dir) if args.logs_dir else None + + return clear_logs(logs_dir, confirm=not args.yes) + + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/scripts/query_logs.py b/scripts/query_logs.py index 5d8a5a3..59b9be6 100755 --- a/scripts/query_logs.py +++ b/scripts/query_logs.py @@ -122,7 +122,7 @@ def run_parse(): return parsed_outputs[0] if parsed_outputs else None -def run_analyze(parsed_output: 
str, analytics_output: str = None): +def run_analyze(parsed_output: str, analytics_output: str = None, last_hours: float = None): """Run analyze operation.""" script_dir = Path(__file__).parent analyze_script = script_dir / "analyze_logs.py" @@ -143,6 +143,9 @@ def run_analyze(parsed_output: str, analytics_output: str = None): if analytics_output: args.extend(["--output", analytics_output]) + + if last_hours: + args.extend(["--last-hours", str(last_hours)]) try: subprocess.run(args, check=True) @@ -184,21 +187,65 @@ def main(): type=str, help="Output file for analytics report (optional)" ) + parser.add_argument( + "--last-hours", + type=float, + help="Filter to only entries from the last N hours (e.g., 1.0 for last hour). Only applies to analyze operation." + ) args = parser.parse_args() - # Parse date range - try: - start_date, end_date = parse_date_range( - args.start_date, args.end_date, args.date - ) - except ValueError as e: - print(f"{Colors.RED}Error: {e}{Colors.NC}", file=sys.stderr) - parser.print_help() - sys.exit(1) + # Parse date range (only if dates are provided) + start_date = None + end_date = None + has_dates = bool(args.start_date or args.end_date or args.date) + if has_dates: + try: + start_date, end_date = parse_date_range( + args.start_date, args.end_date, args.date + ) + except ValueError as e: + print(f"{Colors.RED}Error: {e}{Colors.NC}", file=sys.stderr) + parser.print_help() + sys.exit(1) + + if args.date: + print(f"{Colors.YELLOW}Note: --date specified, syncing from {start_date} to {end_date} (today UTC){Colors.NC}") - if args.date: - print(f"{Colors.YELLOW}Note: --date specified, syncing from {start_date} to {end_date} (today UTC){Colors.NC}") + # If --last-hours is specified without dates and without explicit operation: + # - Check if parsed logs exist + # - If not, automatically sync today's logs and parse them, then analyze + # - If yes, just analyze + if args.last_hours and not has_dates and args.operation == "all": + # Check if parsed logs exist + try: + sources = load_config() + enabled = get_enabled_sources(sources) + if enabled: + first_source = next(iter(enabled)) + first_config = enabled[first_source] + parsed_dir = first_config.get('parsed_dir', f"logs/{first_source}/parsed") + parsed_output_path = Path(f"{parsed_dir}/parsed_logs.json") + + if parsed_output_path.exists(): + # Parsed logs exist, just analyze + args.operation = "analyze" + else: + # No parsed logs, sync today's logs and parse, then analyze + print(f"{Colors.YELLOW}No parsed logs found. Syncing today's logs...{Colors.NC}") + today = datetime.now(timezone.utc).strftime("%Y-%m-%d") + try: + start_date, end_date = parse_date_range(today, today, None) + has_dates = True + args.operation = "all" + except ValueError as e: + print(f"{Colors.RED}Error setting date: {e}{Colors.NC}", file=sys.stderr) + args.operation = "analyze" + else: + args.operation = "analyze" + except Exception: + # If we can't determine, default to analyze + args.operation = "analyze" # Validate date parameters for sync operation if args.operation in ["sync", "all"]: @@ -245,7 +292,7 @@ def main(): print(f"{Colors.RED}Error: Failed to determine parsed output: {e}{Colors.NC}", file=sys.stderr) sys.exit(1) - if not run_analyze(parsed_output, args.analytics_output): + if not run_analyze(parsed_output, args.analytics_output, args.last_hours): sys.exit(1) print(f"{Colors.GREEN}All operations completed successfully!{Colors.NC}")
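A minimal smoke test (not from the patch) for the new `filter_by_time` helper, assuming it is run from the repository root with the project's Python dependencies (pandas, numpy) installed and that the script keeps its CLI behind the usual `if __name__ == "__main__"` guard. Only timezone-aware timestamps are used here; `filter_by_time` reads nothing but the `timestamp` field.

```python
import importlib.util
from datetime import datetime, timedelta, timezone
from pathlib import Path

# Load scripts/analyze_logs.py as a module so filter_by_time can be called directly.
spec = importlib.util.spec_from_file_location("analyze_logs", Path("scripts/analyze_logs.py"))
analyze_logs = importlib.util.module_from_spec(spec)
spec.loader.exec_module(analyze_logs)

now = datetime.now(timezone.utc)
entries = [
    {"timestamp": (now - timedelta(minutes=10)).isoformat()},  # inside the 1-hour window
    {"timestamp": (now - timedelta(hours=3)).isoformat()},     # outside the window
    {"timestamp": None},                                       # skipped: no timestamp
]

recent = analyze_logs.filter_by_time(entries, last_hours=1.0)
print(f"kept {len(recent)} of {len(entries)} entries")  # expected: kept 1 of 3
```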