2 changes: 0 additions & 2 deletions .gitignore
@@ -26,7 +26,6 @@ config/log_sources.yaml
# logs/analytics.json

# IDE
-.vscode/
.idea/
*.swp
*.swo
@@ -39,4 +38,3 @@ Thumbs.db
# Temporary files
*.tmp
*.bak

11 changes: 11 additions & 0 deletions .vscode/extensions.json
@@ -0,0 +1,11 @@
{
    "recommendations": [
        "ms-python.python",
        "ms-python.vscode-pylance",
        "ms-python.black-formatter",
        "ms-python.flake8",
        "timonwong.shellcheck",
        "davidanson.vscode-markdownlint",
        "redhat.vscode-yaml"
    ]
}
60 changes: 60 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,60 @@
{
    // Python settings
    "python.defaultInterpreterPath": "python3",
    "python.linting.enabled": true,
    "python.linting.pylintEnabled": false,
    "python.linting.flake8Enabled": true,
    "python.formatting.provider": "black",
    "python.formatting.blackArgs": ["--line-length", "100"],
    "python.analysis.typeCheckingMode": "basic",
    "python.analysis.autoImportCompletions": true,

    // File associations
    "files.associations": {
        "*.sh": "shellscript"
    },

    // Editor settings
    "editor.formatOnSave": true,
    "editor.rulers": [100],
    "editor.tabSize": 4,
    "editor.insertSpaces": true,
    "editor.trimAutoWhitespace": true,
    "files.trimTrailingWhitespace": true,
    "files.insertFinalNewline": true,
    "files.exclude": {
        "**/__pycache__": true,
        "**/*.pyc": true,
        "**/.DS_Store": true
    },

    // Shell script settings
    "[shellscript]": {
        "editor.tabSize": 2,
        "editor.insertSpaces": true
    },

    // Markdown settings
    "[markdown]": {
        "editor.wordWrap": "on",
        "editor.quickSuggestions": {
            "comments": "off",
            "strings": "off",
            "other": "off"
        }
    },

    // Search settings
    "search.exclude": {
        "**/logs": true,
        "**/__pycache__": true,
        "**/*.pyc": true,
        "**/.git": true
    },

    // File watcher settings (exclude large log directories)
    "files.watcherExclude": {
        "**/logs/**": true,
        "**/__pycache__/**": true
    }
}
86 changes: 83 additions & 3 deletions README.md
@@ -26,8 +26,11 @@ A comprehensive tool for syncing Fastly logs from S3, parsing them, and generati
- **UTC timezone handling**: All date operations use UTC to match log file organization
- **Log parsing**: Converts syslog-style Fastly logs to structured JSON/CSV
- **Comprehensive analytics**: Traffic patterns, error analysis, performance metrics, user agent analysis, query parameter patterns, endpoint drill-down, daily summaries, and slowness investigation
- **Time-based filtering**: Filter logs to analyze only the last N hours (e.g., last hour, last 24 hours)
- **Client IP analysis**: Top client IPs by request volume for security and traffic analysis
- **Interactive dashboard**: Streamlit-based web dashboard for visualizing log analytics
- **Modular design**: Run sync, parse, or analyze operations independently or as a pipeline
- **Log management**: Utility script to clear all log files when needed

## Prerequisites

@@ -78,6 +81,35 @@ python3 scripts/query_logs.py --start-date 2025-11-10 --end-date 2025-11-12

**Timezone Note**: All date operations use UTC to match the log file organization in S3. Log timestamps are preserved in UTC throughout parsing and analysis.
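
For illustration only, a tiny sketch of keeping a timestamp in UTC (a hypothetical helper, not code from this repository):

```python
from datetime import datetime, timezone

def to_utc(ts: str) -> datetime:
    """Parse an ISO-8601 timestamp and normalize it to UTC."""
    dt = datetime.fromisoformat(ts.replace('Z', '+00:00'))
    # Treat naive timestamps as UTC, matching the log file convention
    return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc)

print(to_utc('2025-11-23T14:05:00Z').isoformat())  # 2025-11-23T14:05:00+00:00
```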

### Time-Based Filtering

Filter logs to analyze only entries from the last N hours:

**Analyze last hour from existing parsed logs** (no sync needed):

```bash
python3 scripts/query_logs.py --last-hours 1.0
```

**Sync, parse, and analyze with time filter**:

```bash
python3 scripts/query_logs.py --date 2025-11-23 --last-hours 1.0
```

**Other time filter examples**:

```bash
python3 scripts/query_logs.py --last-hours 0.5 # Last 30 minutes
python3 scripts/query_logs.py --last-hours 24.0 # Last 24 hours
python3 scripts/query_logs.py --last-hours 2.5 # Last 2.5 hours
```

**Note**: When using `--last-hours` without dates:

- If parsed logs exist, it will analyze them with the time filter
- If no parsed logs exist, it will automatically sync today's logs, parse them, then analyze with the time filter (a sketch of this fallback logic follows below)
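
A minimal sketch of that fallback decision, with hypothetical paths and helper names (the real logic lives in `scripts/query_logs.py`):

```python
from datetime import datetime, timezone
from pathlib import Path
from typing import List

def plan_last_hours_run(parsed_output: Path, last_hours: float) -> List[str]:
    """Decide which pipeline steps must run before the time-filtered analysis."""
    steps: List[str] = []
    if not parsed_output.exists():
        # No parsed logs yet: sync and parse today's (UTC) logs first
        today = datetime.now(timezone.utc).strftime('%Y-%m-%d')
        steps += [f"sync --date {today}", "parse"]
    steps.append(f"analyze --last-hours {last_hours}")
    return steps

print(plan_last_hours_run(Path("logs/parsed/requests.json"), 1.0))
```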

### Individual Operations

#### Sync Only
@@ -137,7 +169,14 @@ The dashboard will open in your browser (typically at `http://localhost:8501`) a
- **Performance Metrics**: Cache hit/miss rates, response size statistics
- **User Agent Analysis**: Top user agents and agent type distribution
- **Query Patterns**: Most common query parameters and value distributions
- **Slowness Investigation**: Cache miss patterns, large response endpoints, peak traffic times
- **Slowness Investigation**: Cache miss patterns, large response endpoints, peak traffic times, **top client IPs by request volume**

**New in Slowness Investigation**: Top client IPs analysis helps identify (a rough counting sketch follows this list):

- Bots and crawlers generating high traffic
- Potential abuse or DDoS patterns
- The most active clients overall
- Starting points for security investigations
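
A rough sketch of how such a ranking can be computed from parsed entries (the `client_ip` field name is an assumption about the parsed schema, not confirmed by this PR):

```python
from collections import Counter
from typing import Dict, List

def top_request_ips(entries: List[Dict], limit: int = 20) -> Dict[str, int]:
    """Count requests per client IP and return the heaviest hitters."""
    counts = Counter(e['client_ip'] for e in entries if e.get('client_ip'))
    return dict(counts.most_common(limit))

# Toy input using documentation-range addresses
sample = [{'client_ip': '203.0.113.10'}, {'client_ip': '203.0.113.10'},
          {'client_ip': '198.51.100.7'}]
print(top_request_ips(sample))  # {'203.0.113.10': 2, '198.51.100.7': 1}
```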

You can specify a custom parsed log file path in the dashboard sidebar.

@@ -151,6 +190,7 @@ You can specify a custom parsed log file path in the dashboard sidebar.
- `--operation OP`: Operation to perform: `sync`, `parse`, `analyze`, or `all` (default: `all`)
- `--parsed-output FILE`: Output file for parsed logs (default: first enabled source's parsed output)
- `--analytics-output FILE`: Output file for analytics report (optional)
- `--last-hours HOURS`: Filter to only entries from the last N hours (e.g., `1.0` for last hour). Only applies to the analyze operation. If no parsed logs exist, automatically syncs today's logs first.

### sync_logs.sh

@@ -171,6 +211,22 @@ You can specify a custom parsed log file path in the dashboard sidebar.
- `--input FILE`: Input file (parsed JSON or CSV) - **required**
- `--output FILE`: Output file path (optional)
- `--format FORMAT`: Output format: `json` or `console` (default: `console`)
- `--last-hours HOURS`: Filter to only entries from the last N hours (e.g., `1.0` for last hour)

### clear_logs.py

Utility script to clear all log files (raw and parsed):

- `--logs-dir DIR`: Path to logs directory (default: `./logs`)
- `--yes, -y`: Skip confirmation prompt

**Examples**:

```bash
python3 scripts/clear_logs.py # Clear with confirmation
python3 scripts/clear_logs.py --yes # Clear without confirmation
python3 scripts/clear_logs.py -y # Short form
```

## Log Format

@@ -222,7 +278,7 @@ The analytics report includes:
- **Performance Metrics**: Cache hit/miss rates, response size statistics, hourly cache performance, hourly response size trends
- **User Agent Analysis**: Top user agents, agent type distribution
- **Query Patterns**: Most common query parameters, parameter value distributions, top query signatures
- **Slowness Investigation**: Traffic spikes, cache miss patterns, large response endpoints, peak traffic times, rate of change analysis
- **Slowness Investigation**: Traffic spikes, cache miss patterns, large response endpoints, peak traffic times, rate of change analysis, **top client IPs by request volume**
- **Endpoint Drill-Down**: Detailed analysis for specific endpoints (time patterns, errors, cache, query params)
- **Daily Summaries**: Daily request totals with status code breakdown by day

@@ -243,7 +299,8 @@ fastly_log_query/
│   ├── query_logs.py       # Main orchestration script
│   ├── sync_logs.py        # S3 sync script
│   ├── parse_logs.py       # Log parser
-│   └── analyze_logs.py     # Analytics engine
+│   ├── analyze_logs.py     # Analytics engine
+│   └── clear_logs.py       # Utility to clear all log files
├── src/                    # Source code modules
│   ├── sync/               # Sync implementations
│   ├── parse/              # Parsing logic
@@ -347,6 +404,29 @@ streamlit run dashboard/dashboard.py

The dashboard will automatically load the parsed logs and display interactive visualizations. You can change the log file path in the sidebar if needed.

### Example 5: Analyze last hour of logs

```bash
# Analyze last hour from existing parsed logs (no sync needed)
python3 scripts/query_logs.py --last-hours 1.0

# Or sync today's logs and analyze last hour
python3 scripts/query_logs.py --date 2025-11-23 --last-hours 1.0

# Analyze last 30 minutes
python3 scripts/query_logs.py --last-hours 0.5
```

### Example 6: Clear all logs

```bash
# Clear all log files with confirmation
python3 scripts/clear_logs.py

# Clear without confirmation
python3 scripts/clear_logs.py --yes
```

## License

This tool is provided as-is for internal use.
27 changes: 27 additions & 0 deletions dashboard/dashboard.py
@@ -524,6 +524,33 @@ def main():
        if fig:
            st.plotly_chart(fig, width='stretch')

    # Top client IPs
    if analytics['slowness'].get('top_request_ips'):
        st.subheader("Top Client IPs by Request Volume")
        st.markdown("**Identifying clients generating the most traffic can help detect bots, crawlers, or potential abuse.**")

        col1, col2 = st.columns([2, 1])

        with col1:
            top_ips = analytics['slowness']['top_request_ips']
            fig = create_bar_chart(
                top_ips,
                "Top Client IPs",
                "IP Address",
                "Request Count",
                limit=15
            )
            if fig:
                st.plotly_chart(fig, width='stretch')

        with col2:
            # Table with top IPs
            df_ips = pd.DataFrame({
                'IP Address': list(top_ips.keys())[:20],
                'Requests': list(top_ips.values())[:20]
            })
            st.dataframe(df_ips, width='stretch', hide_index=True)


if __name__ == "__main__":
    main()
64 changes: 63 additions & 1 deletion scripts/analyze_logs.py
@@ -10,7 +10,7 @@
import sys
from pathlib import Path
from collections import Counter, defaultdict
-from datetime import datetime
+from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional
import pandas as pd
import numpy as np
@@ -33,6 +33,57 @@ def load_data(input_path: Path) -> List[Dict]:
        raise ValueError(f"Unsupported file format: {input_path.suffix}")


def filter_by_time(entries: List[Dict], last_hours: float) -> List[Dict]:
    """
    Filter entries to only include those from the last N hours.

    Args:
        entries: List of log entries
        last_hours: Number of hours to look back (e.g., 1.0 for last hour)

    Returns:
        Filtered list of entries
    """
    if not entries or last_hours is None:
        return entries

    # Get current time in UTC
    now = datetime.now(timezone.utc)
    cutoff_time = now - timedelta(hours=last_hours)

    filtered = []
    for entry in entries:
        timestamp_str = entry.get('timestamp')
        if not timestamp_str:
            continue

        try:
            # Parse timestamp (handle both ISO format and other formats)
            if isinstance(timestamp_str, str):
                # Try ISO format first
                try:
                    entry_time = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00'))
                except ValueError:
                    # Try other common formats
                    try:
                        entry_time = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S')
                        # Assume UTC if no timezone info
                        entry_time = entry_time.replace(tzinfo=timezone.utc)
                    except ValueError:
                        continue
            else:
                continue

            # Filter entries within the time window
            if entry_time >= cutoff_time:
                filtered.append(entry)
        except (ValueError, TypeError, AttributeError):
            # Skip entries with invalid timestamps
            continue

    return filtered


def analyze_traffic_patterns(entries: List[Dict]) -> Dict:
    """Analyze traffic patterns."""
    if not entries:
@@ -589,6 +640,11 @@ def main():
        default='console',
        help='Output format (default: console)'
    )
    parser.add_argument(
        '--last-hours',
        type=float,
        help='Filter to only entries from the last N hours (e.g., 1.0 for last hour)'
    )

    args = parser.parse_args()

@@ -601,6 +657,12 @@
    entries = load_data(input_path)
    print(f"Loaded {len(entries):,} log entries")

    # Apply time filter if specified
    if args.last_hours:
        print(f"Filtering to last {args.last_hours} hour(s)...")
        entries = filter_by_time(entries, args.last_hours)
        print(f"Filtered to {len(entries):,} log entries")

    print("Generating analytics...")
    analytics = {
        'traffic': analyze_traffic_patterns(entries),