From 6eaa8c778375c55561ede333ff0cb5139395d8cb Mon Sep 17 00:00:00 2001 From: Johnny Miller Date: Wed, 2 Jul 2025 15:44:14 +0200 Subject: [PATCH 1/2] examples --- Makefile | 102 ++++ examples/README.md | 107 +++- examples/context_manager_safety_demo.py | 27 +- examples/exampleoutput/.gitignore | 6 + examples/exampleoutput/README.md | 26 + examples/export_large_table.py | 33 +- examples/export_to_parquet.py | 554 ++++++++++++++++++ examples/metrics_example.py | 27 +- examples/metrics_simple.py | 27 +- examples/realtime_processing.py | 27 +- examples/requirements.txt | 8 + examples/streaming_basic.py | 27 +- examples/streaming_non_blocking_demo.py | 303 ++++++++++ tests/integration/test_example_scripts.py | 298 ++++++++-- .../test_streaming_non_blocking.py | 341 +++++++++++ 15 files changed, 1859 insertions(+), 54 deletions(-) create mode 100644 examples/exampleoutput/.gitignore create mode 100644 examples/exampleoutput/README.md create mode 100644 examples/export_to_parquet.py create mode 100644 examples/requirements.txt create mode 100644 examples/streaming_non_blocking_demo.py create mode 100644 tests/integration/test_streaming_non_blocking.py diff --git a/Makefile b/Makefile index 25737c0..04eb5f5 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,7 @@ help: @echo "Installation:" @echo " install Install the package" @echo " install-dev Install with development dependencies" + @echo " install-examples Install example dependencies (e.g., pyarrow)" @echo "" @echo "Quick Test Commands:" @echo " test-quick Run quick validation tests (~30s)" @@ -43,6 +44,17 @@ help: @echo " build Build distribution packages" @echo " clean Clean build artifacts" @echo "" + @echo "Examples:" + @echo " example-streaming Run streaming basic example" + @echo " example-export-csv Run CSV export example" + @echo " example-export-parquet Run Parquet export example" + @echo " example-realtime Run real-time processing example" + @echo " example-metrics Run metrics collection example" + @echo " example-non-blocking Run non-blocking demo" + @echo " example-context Run context manager safety demo" + @echo " example-fastapi Run FastAPI example app" + @echo " examples-all Run all examples sequentially" + @echo "" @echo "Environment variables:" @echo " CASSANDRA_CONTACT_POINTS Cassandra contact points (default: localhost)" @echo " SKIP_INTEGRATION_TESTS=1 Skip integration tests" @@ -56,6 +68,10 @@ install-dev: pip install -r requirements-lint.txt pre-commit install +install-examples: + @echo "Installing example dependencies..." 
+	pip install -r examples/requirements.txt
+
 # Environment setup
 CONTAINER_RUNTIME ?= $(shell command -v podman >/dev/null 2>&1 && echo podman || echo docker)
 CASSANDRA_CONTACT_POINTS ?= 127.0.0.1
@@ -322,3 +338,89 @@ clean:
 clean-all: clean cassandra-stop
 	@echo "All cleaned up"
+
+# Example targets
+.PHONY: example-streaming example-export-csv example-export-parquet example-realtime example-metrics example-non-blocking example-context example-fastapi examples-all
+
+# Ensure examples can connect to Cassandra
+EXAMPLES_ENV = CASSANDRA_CONTACT_POINTS=$(CASSANDRA_CONTACT_POINTS)
+
+example-streaming: cassandra-wait
+	@echo "=== Running Streaming Basic Example ==="
+	@echo "This example demonstrates memory-efficient streaming of large result sets"
+	@echo "Contact points: $(CASSANDRA_CONTACT_POINTS)"
+	@$(EXAMPLES_ENV) python examples/streaming_basic.py
+
+example-export-csv: cassandra-wait
+	@echo "=== Running CSV Export Example ==="
+	@echo "This example exports a large Cassandra table to CSV format"
+	@echo "Contact points: $(CASSANDRA_CONTACT_POINTS)"
+	@echo "Output will be saved to examples/exampleoutput/ (override with EXAMPLE_OUTPUT_DIR)"
+	@$(EXAMPLES_ENV) python examples/export_large_table.py
+
+example-export-parquet: cassandra-wait
+	@echo "=== Running Parquet Export Example ==="
+	@echo "This example exports Cassandra tables to Parquet format with streaming"
+	@echo "Contact points: $(CASSANDRA_CONTACT_POINTS)"
+	@echo "Output will be saved to examples/exampleoutput/ (override with EXAMPLE_OUTPUT_DIR)"
+	@echo "Installing pyarrow if needed..."
+	@pip install pyarrow >/dev/null 2>&1 || echo "Warning: pyarrow install failed; run 'make install-examples'"
+	@$(EXAMPLES_ENV) python examples/export_to_parquet.py
+
+example-realtime: cassandra-wait
+	@echo "=== Running Real-time Processing Example ==="
+	@echo "This example demonstrates real-time streaming analytics on sensor data"
+	@echo "Contact points: $(CASSANDRA_CONTACT_POINTS)"
+	@$(EXAMPLES_ENV) python examples/realtime_processing.py
+
+example-metrics: cassandra-wait
+	@echo "=== Running Metrics Collection Examples ==="
+	@echo "Running simple metrics example..."
+	@echo "Contact points: $(CASSANDRA_CONTACT_POINTS)"
+	@$(EXAMPLES_ENV) python examples/metrics_simple.py
+	@echo ""
+	@echo "Running advanced metrics example..."
+	@$(EXAMPLES_ENV) python examples/metrics_example.py
+
+example-non-blocking: cassandra-wait
+	@echo "=== Running Non-Blocking Streaming Demo ==="
+	@echo "This demonstrates that streaming doesn't block the event loop"
+	@echo "Watch for heartbeat indicators showing continuous operation!"
+ @echo "Contact points: $(CASSANDRA_CONTACT_POINTS)" + @$(EXAMPLES_ENV) python examples/streaming_non_blocking_demo.py + +example-context: cassandra-wait + @echo "=== Running Context Manager Safety Demo ===" + @echo "This demonstrates proper resource management with context managers" + @echo "Contact points: $(CASSANDRA_CONTACT_POINTS)" + @$(EXAMPLES_ENV) python examples/context_manager_safety_demo.py + +example-fastapi: + @echo "=== Running FastAPI Example App ===" + @echo "This starts a full REST API with async Cassandra integration" + @echo "The app includes Docker Compose for easy setup" + @echo "See examples/fastapi_app/README.md for details" + @cd examples/fastapi_app && $(MAKE) run + +examples-all: cassandra-wait + @echo "=== Running All Examples ===" + @echo "This will run each example in sequence" + @echo "Contact points: $(CASSANDRA_CONTACT_POINTS)" + @echo "" + @$(MAKE) example-streaming + @echo "\n----------------------------------------\n" + @$(MAKE) example-export-csv + @echo "\n----------------------------------------\n" + @$(MAKE) example-export-parquet + @echo "\n----------------------------------------\n" + @$(MAKE) example-realtime + @echo "\n----------------------------------------\n" + @$(MAKE) example-metrics + @echo "\n----------------------------------------\n" + @$(MAKE) example-non-blocking + @echo "\n----------------------------------------\n" + @$(MAKE) example-context + @echo "\nโœ… All examples completed!" + @echo "" + @echo "Note: FastAPI example not included as it starts a server" + @echo "Run 'make example-fastapi' separately to start the FastAPI app" diff --git a/examples/README.md b/examples/README.md index 2b51c4d..3b15055 100644 --- a/examples/README.md +++ b/examples/README.md @@ -2,6 +2,48 @@ This directory contains working examples demonstrating various features and use cases of async-cassandra. +## Quick Start + +### Running Examples with Make + +The easiest way to run examples is using the provided Make targets: + +```bash +# Run a specific example (automatically starts Cassandra if needed) +make example-streaming +make example-export-csv +make example-export-parquet +make example-realtime +make example-metrics +make example-non-blocking +make example-context + +# Run all examples in sequence +make examples-all + +# Use external Cassandra cluster +CASSANDRA_CONTACT_POINTS=node1.example.com,node2.example.com make example-streaming +``` + +### Installing Example Dependencies + +Some examples require additional dependencies: + +```bash +# Install all example dependencies (including pyarrow for Parquet export) +make install-examples + +# Or manually +pip install -r examples/requirements.txt +``` + +### Environment Variables + +All examples support these environment variables: +- `CASSANDRA_CONTACT_POINTS`: Comma-separated list of contact points (default: localhost) +- `CASSANDRA_PORT`: Port number (default: 9042) +- `EXAMPLE_OUTPUT_DIR`: Directory for output files like CSV and Parquet exports (default: examples/exampleoutput) + ## Available Examples ### 1. [FastAPI Integration](fastapi_app/) @@ -50,10 +92,38 @@ Shows how to export large Cassandra tables to CSV: **Run:** ```bash python export_large_table.py -# Exports will be saved in ./exports/ directory +# Exports will be saved in examples/exampleoutput/ directory (default) + +# Or with custom output directory: +EXAMPLE_OUTPUT_DIR=/tmp/my-exports python export_large_table.py ``` -### 4. [Real-time Data Processing](realtime_processing.py) +### 4. 
[Export to Parquet Format](export_to_parquet.py) + +Advanced example of exporting large Cassandra tables to Parquet format: +- Memory-efficient streaming with page-by-page processing +- Automatic schema inference from Cassandra data types +- Multiple compression options (snappy, gzip, lz4) +- Progress tracking during export +- Handles all Cassandra data types including collections +- Configurable row group sizes for optimization +- Export statistics and performance metrics + +**Run:** +```bash +python export_to_parquet.py +# Exports will be saved in examples/exampleoutput/ directory (default) + +# Or with custom output directory: +EXAMPLE_OUTPUT_DIR=/tmp/my-parquet-exports python export_to_parquet.py +``` + +**Note:** Requires PyArrow to be installed: +```bash +pip install pyarrow +``` + +### 5. [Real-time Data Processing](realtime_processing.py) Example of processing time-series data in real-time: - Sliding window analytics @@ -67,7 +137,7 @@ Example of processing time-series data in real-time: python realtime_processing.py ``` -### 5. [Metrics Collection](metrics_simple.py) +### 6. [Metrics Collection](metrics_simple.py) Simple example of metrics collection: - Query performance tracking @@ -80,7 +150,7 @@ Simple example of metrics collection: python metrics_simple.py ``` -### 6. [Advanced Metrics](metrics_example.py) +### 7. [Advanced Metrics](metrics_example.py) Comprehensive metrics and observability example: - Multiple metrics collectors setup @@ -94,7 +164,21 @@ Comprehensive metrics and observability example: python metrics_example.py ``` -### 7. [Context Manager Safety](context_manager_safety_demo.py) +### 8. [Non-Blocking Streaming Demo](streaming_non_blocking_demo.py) + +Visual demonstration that streaming doesn't block the event loop: +- Heartbeat monitoring to detect event loop blocking +- Concurrent queries during streaming +- Visual feedback showing event loop responsiveness +- Performance analysis of concurrent operations +- Proves the async wrapper truly keeps the event loop free + +**Run:** +```bash +python streaming_non_blocking_demo.py +``` + +### 9. [Context Manager Safety](context_manager_safety_demo.py) Demonstrates proper context manager usage: - Context manager isolation @@ -119,6 +203,19 @@ Production-ready monitoring configurations: - Connection health status - Error rates and trends +## Output Files + +Examples that generate output files (CSV exports, Parquet exports, etc.) save them to a configurable directory: + +- **Default location**: `examples/exampleoutput/` +- **Configure via environment variable**: `EXAMPLE_OUTPUT_DIR=/path/to/output` +- **Git ignored**: All files in the default output directory are ignored by Git (except README.md and .gitignore) +- **Cleanup**: Files are not automatically deleted; clean up manually when needed: + ```bash + rm -f examples/exampleoutput/*.csv + rm -f examples/exampleoutput/*.parquet + ``` + ## Prerequisites All examples require: diff --git a/examples/context_manager_safety_demo.py b/examples/context_manager_safety_demo.py index 6c4bf03..f934fd3 100644 --- a/examples/context_manager_safety_demo.py +++ b/examples/context_manager_safety_demo.py @@ -4,10 +4,29 @@ This example shows how context managers properly isolate resource cleanup, ensuring that errors in one operation don't close shared resources. + +How to run: +----------- +1. Using Make (automatically starts Cassandra if needed): + make example-context-safety + +2. 
With external Cassandra cluster: + CASSANDRA_CONTACT_POINTS=10.0.0.1,10.0.0.2 make example-context-safety + +3. Direct Python execution: + python examples/context_manager_safety_demo.py + +4. With custom contact points: + CASSANDRA_CONTACT_POINTS=cassandra.example.com python examples/context_manager_safety_demo.py + +Environment variables: +- CASSANDRA_CONTACT_POINTS: Comma-separated list of contact points (default: localhost) +- CASSANDRA_PORT: Port number (default: 9042) """ import asyncio import logging +import os import uuid from cassandra import InvalidRequest @@ -168,8 +187,14 @@ async def main(): """Run all demonstrations.""" logger.info("Starting Context Manager Safety Demonstration") + # Get contact points from environment or use localhost + contact_points = os.environ.get("CASSANDRA_CONTACT_POINTS", "localhost").split(",") + port = int(os.environ.get("CASSANDRA_PORT", "9042")) + + logger.info(f"Connecting to Cassandra at {contact_points}:{port}") + # Use cluster in context manager for automatic cleanup - async with AsyncCluster(["localhost"]) as cluster: + async with AsyncCluster(contact_points, port=port) as cluster: await demonstrate_query_error_safety(cluster) await demonstrate_streaming_error_safety(cluster) await demonstrate_context_manager_isolation(cluster) diff --git a/examples/exampleoutput/.gitignore b/examples/exampleoutput/.gitignore new file mode 100644 index 0000000..ba6cd86 --- /dev/null +++ b/examples/exampleoutput/.gitignore @@ -0,0 +1,6 @@ +# Ignore all files in this directory +* +# Except this .gitignore file +!.gitignore +# And the README +!README.md diff --git a/examples/exampleoutput/README.md b/examples/exampleoutput/README.md new file mode 100644 index 0000000..24df511 --- /dev/null +++ b/examples/exampleoutput/README.md @@ -0,0 +1,26 @@ +# Example Output Directory + +This directory is used by the async-cassandra examples to store output files such as: +- CSV exports +- Parquet exports +- Any other generated files + +All files in this directory (except .gitignore and README.md) are ignored by git. + +## Configuring Output Location + +You can override the output directory using the `EXAMPLE_OUTPUT_DIR` environment variable: + +```bash +EXAMPLE_OUTPUT_DIR=/tmp/my-output make example-export-csv +``` + +## Cleaning Up + +To remove all generated files: +```bash +rm -rf examples/exampleoutput/* +# Or just remove specific file types +rm -f examples/exampleoutput/*.csv +rm -f examples/exampleoutput/*.parquet +``` diff --git a/examples/export_large_table.py b/examples/export_large_table.py index d5051e3..8a69506 100644 --- a/examples/export_large_table.py +++ b/examples/export_large_table.py @@ -7,6 +7,25 @@ - Progress tracking during export - Async file I/O with aiofiles - Proper error handling + +How to run: +----------- +1. Using Make (automatically starts Cassandra if needed): + make example-export-large-table + +2. With external Cassandra cluster: + CASSANDRA_CONTACT_POINTS=10.0.0.1,10.0.0.2 make example-export-large-table + +3. Direct Python execution: + python examples/export_large_table.py + +4. 
With custom contact points: + CASSANDRA_CONTACT_POINTS=cassandra.example.com python examples/export_large_table.py + +Environment variables: +- CASSANDRA_CONTACT_POINTS: Comma-separated list of contact points (default: localhost) +- CASSANDRA_PORT: Port number (default: 9042) +- EXAMPLE_OUTPUT_DIR: Directory for output files (default: examples/exampleoutput) """ import asyncio @@ -85,6 +104,7 @@ def progress_callback(page_num: int, rows_so_far: int): fieldnames = row._fields header = ",".join(fieldnames) + "\n" await f.write(header) + writer = True # Mark that header has been written # Write row data row_data = [] @@ -262,15 +282,22 @@ async def setup_sample_data(session): async def main(): """Run the export example.""" + # Get contact points from environment or use localhost + contact_points = os.environ.get("CASSANDRA_CONTACT_POINTS", "localhost").split(",") + port = int(os.environ.get("CASSANDRA_PORT", "9042")) + + logger.info(f"Connecting to Cassandra at {contact_points}:{port}") + # Connect to Cassandra using context manager - async with AsyncCluster(["localhost"]) as cluster: + async with AsyncCluster(contact_points, port=port) as cluster: async with await cluster.connect() as session: # Setup sample data await setup_sample_data(session) # Create output directory - output_dir = Path("exports") - output_dir.mkdir(exist_ok=True) + output_dir = Path(os.environ.get("EXAMPLE_OUTPUT_DIR", "examples/exampleoutput")) + output_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"Output directory: {output_dir}") # Export using async I/O if available if ASYNC_FILE_IO: diff --git a/examples/export_to_parquet.py b/examples/export_to_parquet.py new file mode 100644 index 0000000..9fd0a41 --- /dev/null +++ b/examples/export_to_parquet.py @@ -0,0 +1,554 @@ +#!/usr/bin/env python3 +""" +Export large Cassandra tables to Parquet format efficiently. + +This example demonstrates: +- Memory-efficient streaming of large result sets +- Exporting data to Parquet format without loading entire dataset in memory +- Progress tracking during export +- Schema inference from Cassandra data +- Handling different data types +- Batch writing for optimal performance + +How to run: +----------- +1. Using Make (automatically starts Cassandra if needed): + make example-export-parquet + +2. With external Cassandra cluster: + CASSANDRA_CONTACT_POINTS=10.0.0.1,10.0.0.2 make example-export-parquet + +3. Direct Python execution: + python examples/export_to_parquet.py + +4. With custom contact points: + CASSANDRA_CONTACT_POINTS=cassandra.example.com python examples/export_to_parquet.py + +Environment variables: +- CASSANDRA_CONTACT_POINTS: Comma-separated list of contact points (default: localhost) +- CASSANDRA_PORT: Port number (default: 9042) +- EXAMPLE_OUTPUT_DIR: Directory for output files (default: examples/exampleoutput) +""" + +import asyncio +import logging +import os +from datetime import datetime, timedelta +from decimal import Decimal +from pathlib import Path +from typing import Any, Dict, List, Optional + +import pyarrow as pa +import pyarrow.parquet as pq + +from async_cassandra import AsyncCluster, StreamConfig + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class ParquetExporter: + """Export Cassandra tables to Parquet format with streaming.""" + + def __init__(self, output_dir: str = "parquet_exports"): + """ + Initialize the exporter. 
+ + Args: + output_dir: Directory to save Parquet files + """ + self.output_dir = Path(output_dir) + self.output_dir.mkdir(parents=True, exist_ok=True) + + @staticmethod + def infer_arrow_type(cassandra_type: Any) -> pa.DataType: + """ + Infer PyArrow data type from Cassandra column type. + + Args: + cassandra_type: Cassandra column type + + Returns: + Corresponding PyArrow data type + """ + # Map common Cassandra types to PyArrow types + type_name = str(cassandra_type).lower() + + if "text" in type_name or "varchar" in type_name or "ascii" in type_name: + return pa.string() + elif "int" in type_name and "big" in type_name: + return pa.int64() + elif "int" in type_name: + return pa.int32() + elif "float" in type_name: + return pa.float32() + elif "double" in type_name: + return pa.float64() + elif "decimal" in type_name: + return pa.decimal128(38, 10) # Default precision/scale + elif "boolean" in type_name: + return pa.bool_() + elif "timestamp" in type_name: + return pa.timestamp("ms") + elif "date" in type_name: + return pa.date32() + elif "time" in type_name: + return pa.time64("ns") + elif "uuid" in type_name: + return pa.string() # Store UUIDs as strings + elif "blob" in type_name: + return pa.binary() + else: + # Default to string for unknown types + return pa.string() + + async def export_table( + self, + session, + table_name: str, + keyspace: str, + fetch_size: int = 10000, + row_group_size: int = 50000, + where_clause: Optional[str] = None, + compression: str = "snappy", + ) -> Dict[str, Any]: + """ + Export a Cassandra table to Parquet format. + + Args: + session: AsyncCassandraSession instance + table_name: Name of the table to export + keyspace: Keyspace containing the table + fetch_size: Number of rows to fetch per page + row_group_size: Number of rows per Parquet row group + where_clause: Optional WHERE clause for filtering + compression: Parquet compression codec + + Returns: + Export statistics + """ + start_time = datetime.now() + output_file = self.output_dir / f"{keyspace}.{table_name}.parquet" + temp_file = self.output_dir / f"{keyspace}.{table_name}.parquet.tmp" + + logger.info(f"Starting export of {keyspace}.{table_name} to {output_file}") + + # Build query + query = f"SELECT * FROM {keyspace}.{table_name}" + if where_clause: + query += f" WHERE {where_clause}" + + # Statistics + total_rows = 0 + total_pages = 0 + total_bytes = 0 + + # Progress callback + def progress_callback(page_num: int, rows_in_page: int): + nonlocal total_pages + total_pages = page_num + if page_num % 10 == 0: + logger.info(f"Exported page {page_num} ({total_rows + rows_in_page:,} rows total)") + + # Configure streaming + config = StreamConfig( + fetch_size=fetch_size, + page_callback=progress_callback, + ) + + schema = None + writer = None + batch_data: Dict[str, List[Any]] = {} + + try: + # Stream data from Cassandra + async with await session.execute_stream(query, stream_config=config) as result: + # Process pages for memory efficiency + async for page in result.pages(): + if not page: + continue + + # Infer schema from first page + if schema is None and page: + first_row = page[0] + + # Get column names from first row + column_names = list(first_row._fields) + + # Build PyArrow schema by inspecting actual values + fields = [] + for name in column_names: + value = getattr(first_row, name) + + # Infer type from actual value + if value is None: + # For None values, we'll need to look at other rows + # For now, default to string which can handle nulls + arrow_type = pa.string() + elif 
isinstance(value, bool): + arrow_type = pa.bool_() + elif isinstance(value, int): + arrow_type = pa.int64() + elif isinstance(value, float): + arrow_type = pa.float64() + elif isinstance(value, Decimal): + arrow_type = pa.float64() # Convert Decimal to float64 + elif isinstance(value, datetime): + arrow_type = pa.timestamp("ms") + elif isinstance(value, str): + arrow_type = pa.string() + elif isinstance(value, bytes): + arrow_type = pa.binary() + elif isinstance(value, (list, set, dict)): + arrow_type = pa.string() # Convert collections to string + elif hasattr(value, "__class__") and value.__class__.__name__ in [ + "OrderedMapSerializedKey", + "SortedSet", + ]: + arrow_type = pa.string() # Cassandra special types + else: + arrow_type = pa.string() # Default to string for unknown types + + fields.append(pa.field(name, arrow_type)) + + schema = pa.schema(fields) + + # Create Parquet writer + writer = pq.ParquetWriter( + temp_file, + schema, + compression=compression, + version="2.6", # Latest format + use_dictionary=True, + ) + + # Initialize batch data structure + batch_data = {name: [] for name in column_names} + + # Process rows in page + for row in page: + # Add row data to batch + for field in column_names: + value = getattr(row, field) + + # Handle special types + if isinstance(value, datetime): + # Keep as datetime - PyArrow handles conversion + pass + elif isinstance(value, Decimal): + # Convert Decimal to float + value = float(value) + elif isinstance(value, (list, set, dict)): + # Convert collections to string + value = str(value) + elif value is not None and not isinstance( + value, (str, bytes, int, float, bool, datetime) + ): + # Convert other objects like UUID to string + value = str(value) + + batch_data[field].append(value) + + total_rows += 1 + + # Write batch when it reaches the desired size + if total_rows % row_group_size == 0: + batch = pa.record_batch(batch_data, schema=schema) + writer.write_batch(batch) + + # Clear batch data + batch_data = {name: [] for name in column_names} + + logger.info(f"Written {total_rows:,} rows to Parquet") + + # Write final partial batch + if any(batch_data.values()): + batch = pa.record_batch(batch_data, schema=schema) + writer.write_batch(batch) + + finally: + if writer: + writer.close() + + # Get file size + total_bytes = temp_file.stat().st_size + + # Rename temp file to final name + temp_file.rename(output_file) + + # Calculate statistics + duration = (datetime.now() - start_time).total_seconds() + rows_per_second = total_rows / duration if duration > 0 else 0 + mb_per_second = (total_bytes / (1024 * 1024)) / duration if duration > 0 else 0 + + stats = { + "table": f"{keyspace}.{table_name}", + "output_file": str(output_file), + "total_rows": total_rows, + "total_pages": total_pages, + "total_bytes": total_bytes, + "total_mb": round(total_bytes / (1024 * 1024), 2), + "duration_seconds": round(duration, 2), + "rows_per_second": round(rows_per_second), + "mb_per_second": round(mb_per_second, 2), + "compression": compression, + "row_group_size": row_group_size, + } + + logger.info("\nExport completed successfully!") + logger.info(f"Statistics: {stats}") + + return stats + + +async def setup_test_data(session): + """Create test data for export demonstration.""" + logger.info("Setting up test data...") + + # Create keyspace + await session.execute( + """ + CREATE KEYSPACE IF NOT EXISTS analytics + WITH REPLICATION = { + 'class': 'SimpleStrategy', + 'replication_factor': 1 + } + """ + ) + + await session.set_keyspace("analytics") + + # 
Create a table with various data types
+    await session.execute(
+        """
+        CREATE TABLE IF NOT EXISTS user_events (
+            user_id UUID,
+            event_time TIMESTAMP,
+            event_type TEXT,
+            device_type TEXT,
+            country_code TEXT,
+            city TEXT,
+            revenue DECIMAL,
+            duration_seconds INT,
+            is_premium BOOLEAN,
+            metadata MAP<TEXT, TEXT>,
+            tags SET<TEXT>,
+            PRIMARY KEY (user_id, event_time)
+        ) WITH CLUSTERING ORDER BY (event_time DESC)
+        """
+    )
+
+    # Insert test data
+    insert_stmt = await session.prepare(
+        """
+        INSERT INTO user_events (
+            user_id, event_time, event_type, device_type,
+            country_code, city, revenue, duration_seconds,
+            is_premium, metadata, tags
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        """
+    )
+
+    # Generate substantial test data
+    logger.info("Inserting test data...")
+
+    import random
+    import uuid
+    from decimal import Decimal
+
+    event_types = ["view", "click", "purchase", "signup", "logout"]
+    device_types = ["mobile", "desktop", "tablet", "tv"]
+    countries = ["US", "UK", "DE", "FR", "JP", "BR", "IN", "AU"]
+    cities = ["New York", "London", "Berlin", "Paris", "Tokyo", "São Paulo", "Mumbai", "Sydney"]
+
+    base_time = datetime.now() - timedelta(days=30)
+    tasks = []
+    total_inserted = 0
+
+    # Insert data for 100 users over 30 days
+    for user_num in range(100):
+        user_id = uuid.uuid4()
+        is_premium = random.random() > 0.7
+
+        # Each user has 100-500 events
+        num_events = random.randint(100, 500)
+
+        for event_num in range(num_events):
+            event_time = base_time + timedelta(
+                days=random.randint(0, 29),
+                hours=random.randint(0, 23),
+                minutes=random.randint(0, 59),
+                seconds=random.randint(0, 59),
+            )
+
+            event_type = random.choice(event_types)
+            revenue = (
+                Decimal(str(round(random.uniform(0, 100), 2)))
+                if event_type == "purchase"
+                else Decimal("0")
+            )
+
+            metadata = {
+                "session_id": str(uuid.uuid4()),
+                "version": f"{random.randint(1, 5)}.{random.randint(0, 9)}.{random.randint(0, 9)}",
+                "platform": random.choice(["iOS", "Android", "Web"]),
+            }
+
+            tags = set(
+                random.sample(
+                    ["mobile", "desktop", "premium", "trial", "organic", "paid", "social"],
+                    k=random.randint(1, 4),
+                )
+            )
+
+            tasks.append(
+                session.execute(
+                    insert_stmt,
+                    [
+                        user_id,
+                        event_time,
+                        event_type,
+                        random.choice(device_types),
+                        random.choice(countries),
+                        random.choice(cities),
+                        revenue,
+                        random.randint(10, 3600),
+                        is_premium,
+                        metadata,
+                        tags,
+                    ],
+                )
+            )
+
+            # Execute in batches
+            if len(tasks) >= 100:
+                await asyncio.gather(*tasks)
+                tasks = []
+                total_inserted += 100
+
+                if total_inserted % 5000 == 0:
+                    logger.info(f"Inserted {total_inserted:,} events...")
+
+    # Execute remaining tasks
+    if tasks:
+        await asyncio.gather(*tasks)
+        total_inserted += len(tasks)
+
+    logger.info(f"Test data setup complete: {total_inserted:,} events inserted")
+
+
+async def demonstrate_exports(session):
+    """Demonstrate various export scenarios."""
+    output_dir = os.environ.get("EXAMPLE_OUTPUT_DIR", "examples/exampleoutput")
+    logger.info(f"Output directory: {output_dir}")
+
+    # Example 1: Export entire table
+    logger.info("\n=== Example 1: Export Entire Table ===")
+    exporter1 = ParquetExporter(str(Path(output_dir) / "example1"))
+    stats1 = await exporter1.export_table(
+        session,
+        table_name="user_events",
+        keyspace="analytics",
+        fetch_size=5000,
+        row_group_size=25000,
+    )
+
+    # Example 2: Export with filtering
+    logger.info("\n=== Example 2: Export Filtered Data ===")
+    exporter2 = ParquetExporter(str(Path(output_dir) / "example2"))
+    stats2 = await exporter2.export_table(
+        session,
+        table_name="user_events",
+        
keyspace="analytics", + fetch_size=5000, + row_group_size=25000, + where_clause="event_type = 'purchase' ALLOW FILTERING", + compression="gzip", + ) + + # Example 3: Export with different compression + logger.info("\n=== Example 3: Export with Different Compression ===") + exporter3 = ParquetExporter(str(Path(output_dir) / "example3")) + stats3 = await exporter3.export_table( + session, + table_name="user_events", + keyspace="analytics", + fetch_size=10000, + row_group_size=50000, + compression="lz4", + ) + + return [stats1, stats2, stats3] + + +async def verify_parquet_files(): + """Verify the exported Parquet files.""" + logger.info("\n=== Verifying Exported Files ===") + + export_dir = Path(os.environ.get("EXAMPLE_OUTPUT_DIR", "examples/exampleoutput")) + + # Look for Parquet files in subdirectories too + for parquet_file in export_dir.rglob("*.parquet"): + logger.info(f"\nVerifying {parquet_file.name}:") + + # Read Parquet file metadata + parquet_file_obj = pq.ParquetFile(parquet_file) + + # Display metadata + logger.info(f" Schema: {parquet_file_obj.schema}") + logger.info(f" Num row groups: {parquet_file_obj.num_row_groups}") + logger.info(f" Total rows: {parquet_file_obj.metadata.num_rows}") + logger.info( + f" Compression: {parquet_file_obj.metadata.row_group(0).column(0).compression}" + ) + + # Read first few rows + table = pq.read_table(parquet_file, columns=None) + df = table.to_pandas() + + logger.info(f" Columns: {list(df.columns)}") + logger.info(f" Shape: {df.shape}") + logger.info(f" Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB") + + # Show sample data + logger.info("\n First 5 rows:") + logger.info(df.head()) + + +async def main(): + """Run the Parquet export examples.""" + # Get contact points from environment or use localhost + contact_points = os.environ.get("CASSANDRA_CONTACT_POINTS", "localhost").split(",") + port = int(os.environ.get("CASSANDRA_PORT", "9042")) + + logger.info(f"Connecting to Cassandra at {contact_points}:{port}") + + # Connect to Cassandra using context manager + async with AsyncCluster(contact_points, port=port) as cluster: + async with await cluster.connect() as session: + # Setup test data + await setup_test_data(session) + + # Run export demonstrations + export_stats = await demonstrate_exports(session) + + # Verify exported files + await verify_parquet_files() + + # Summary + logger.info("\n=== Export Summary ===") + for stats in export_stats: + logger.info( + f"- {stats['table']}: {stats['total_rows']:,} rows, " + f"{stats['total_mb']} MB, {stats['duration_seconds']}s " + f"({stats['rows_per_second']:,} rows/s)" + ) + + # Cleanup + logger.info("\nCleaning up...") + await session.execute("DROP KEYSPACE analytics") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/metrics_example.py b/examples/metrics_example.py index ffcabad..b22e8c1 100644 --- a/examples/metrics_example.py +++ b/examples/metrics_example.py @@ -8,9 +8,28 @@ - Tracking connection health - Basic metrics analysis - Performance optimization patterns + +How to run: +----------- +1. Using Make (automatically starts Cassandra if needed): + make example-metrics + +2. With external Cassandra cluster: + CASSANDRA_CONTACT_POINTS=10.0.0.1,10.0.0.2 make example-metrics + +3. Direct Python execution: + python examples/metrics_example.py + +4. 
With custom contact points: + CASSANDRA_CONTACT_POINTS=cassandra.example.com python examples/metrics_example.py + +Environment variables: +- CASSANDRA_CONTACT_POINTS: Comma-separated list of contact points (default: localhost) +- CASSANDRA_PORT: Port number (default: 9042) """ import asyncio +import os import time import uuid from datetime import datetime, timezone @@ -144,6 +163,12 @@ async def main(): print("๐Ÿš€ Async-Cassandra Metrics Example") print("=" * 60) + # Get contact points from environment or use localhost + contact_points = os.environ.get("CASSANDRA_CONTACT_POINTS", "localhost").split(",") + port = int(os.environ.get("CASSANDRA_PORT", "9042")) + + print(f"\nConnecting to Cassandra at {contact_points}:{port}") + # 1. Set up metrics collectors print("\n๐Ÿ”ง Setting up metrics system...") @@ -166,7 +191,7 @@ async def main(): metrics = MetricsMiddleware(collectors) # 2. Create cluster and run workload - async with AsyncCluster(contact_points=["localhost"]) as cluster: + async with AsyncCluster(contact_points=contact_points, port=port) as cluster: async with await cluster.connect() as session: # Set up test environment print("\n๐Ÿ“ฆ Setting up test database...") diff --git a/examples/metrics_simple.py b/examples/metrics_simple.py index 768844f..4771af2 100644 --- a/examples/metrics_simple.py +++ b/examples/metrics_simple.py @@ -3,9 +3,28 @@ Simple metrics collection example with async-cassandra. This example shows basic metrics collection and monitoring. + +How to run: +----------- +1. Using Make (automatically starts Cassandra if needed): + make example-metrics-simple + +2. With external Cassandra cluster: + CASSANDRA_CONTACT_POINTS=10.0.0.1,10.0.0.2 make example-metrics-simple + +3. Direct Python execution: + python examples/metrics_simple.py + +4. With custom contact points: + CASSANDRA_CONTACT_POINTS=cassandra.example.com python examples/metrics_simple.py + +Environment variables: +- CASSANDRA_CONTACT_POINTS: Comma-separated list of contact points (default: localhost) +- CASSANDRA_PORT: Port number (default: 9042) """ import asyncio +import os import time import uuid from datetime import datetime @@ -18,6 +37,12 @@ async def main(): """Run basic metrics example.""" print("๐Ÿš€ async-cassandra Metrics Example\n") + # Get contact points from environment or use localhost + contact_points = os.environ.get("CASSANDRA_CONTACT_POINTS", "localhost").split(",") + port = int(os.environ.get("CASSANDRA_PORT", "9042")) + + print(f"Connecting to Cassandra at {contact_points}:{port}\n") + # Create metrics collector collector = InMemoryMetricsCollector(max_entries=1000) @@ -25,7 +50,7 @@ async def main(): metrics_middleware = MetricsMiddleware([collector]) # Create cluster using context manager - async with AsyncCluster(["localhost"]) as cluster: + async with AsyncCluster(contact_points, port=port) as cluster: # Create session using context manager async with await cluster.connect() as session: diff --git a/examples/realtime_processing.py b/examples/realtime_processing.py index 7a30b66..7b68871 100644 --- a/examples/realtime_processing.py +++ b/examples/realtime_processing.py @@ -7,10 +7,29 @@ - Aggregating data while streaming - Handling continuous data ingestion - Implementing sliding window analytics + +How to run: +----------- +1. Using Make (automatically starts Cassandra if needed): + make example-realtime + +2. With external Cassandra cluster: + CASSANDRA_CONTACT_POINTS=10.0.0.1,10.0.0.2 make example-realtime + +3. 
Direct Python execution: + python examples/realtime_processing.py + +4. With custom contact points: + CASSANDRA_CONTACT_POINTS=cassandra.example.com python examples/realtime_processing.py + +Environment variables: +- CASSANDRA_CONTACT_POINTS: Comma-separated list of contact points (default: localhost) +- CASSANDRA_PORT: Port number (default: 9042) """ import asyncio import logging +import os from collections import defaultdict, deque from dataclasses import dataclass from datetime import datetime, timedelta @@ -402,8 +421,14 @@ async def simulate_realtime_processing(session, processor: RealTimeProcessor): async def main(): """Run real-time processing example.""" + # Get contact points from environment or use localhost + contact_points = os.environ.get("CASSANDRA_CONTACT_POINTS", "localhost").split(",") + port = int(os.environ.get("CASSANDRA_PORT", "9042")) + + logger.info(f"Connecting to Cassandra at {contact_points}:{port}") + # Connect to Cassandra using context manager - async with AsyncCluster(["localhost"]) as cluster: + async with AsyncCluster(contact_points, port=port) as cluster: async with await cluster.connect() as session: # Setup test data await setup_sensor_data(session) diff --git a/examples/requirements.txt b/examples/requirements.txt new file mode 100644 index 0000000..a16b1c2 --- /dev/null +++ b/examples/requirements.txt @@ -0,0 +1,8 @@ +# Requirements for running the examples +# Install with: pip install -r examples/requirements.txt + +# For Parquet export example +pyarrow>=10.0.0 + +# The main async-cassandra package (install from parent directory) +# pip install -e .. diff --git a/examples/streaming_basic.py b/examples/streaming_basic.py index 0684978..03f78da 100644 --- a/examples/streaming_basic.py +++ b/examples/streaming_basic.py @@ -7,10 +7,29 @@ - Configuring fetch size - Processing rows one at a time - Handling empty results + +How to run: +----------- +1. Using Make (automatically starts Cassandra if needed): + make example-streaming + +2. With external Cassandra cluster: + CASSANDRA_CONTACT_POINTS=10.0.0.1,10.0.0.2 make example-streaming + +3. Direct Python execution: + python examples/streaming_basic.py + +4. With custom contact points: + CASSANDRA_CONTACT_POINTS=cassandra.example.com python examples/streaming_basic.py + +Environment variables: +- CASSANDRA_CONTACT_POINTS: Comma-separated list of contact points (default: localhost) +- CASSANDRA_PORT: Port number (default: 9042) """ import asyncio import logging +import os from datetime import datetime from async_cassandra import AsyncCluster, StreamConfig @@ -226,8 +245,14 @@ async def page_based_streaming_example(session): async def main(): """Run all streaming examples.""" + # Get contact points from environment or use localhost + contact_points = os.environ.get("CASSANDRA_CONTACT_POINTS", "localhost").split(",") + port = int(os.environ.get("CASSANDRA_PORT", "9042")) + + logger.info(f"Connecting to Cassandra at {contact_points}:{port}") + # Connect to Cassandra using context manager - async with AsyncCluster(["localhost"]) as cluster: + async with AsyncCluster(contact_points, port=port) as cluster: async with await cluster.connect() as session: # Setup test data await setup_test_data(session) diff --git a/examples/streaming_non_blocking_demo.py b/examples/streaming_non_blocking_demo.py new file mode 100644 index 0000000..17cccc4 --- /dev/null +++ b/examples/streaming_non_blocking_demo.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 +""" +Visual demonstration that streaming doesn't block the event loop. 
+ +This example shows that while pages are being fetched from Cassandra, +the event loop remains free to handle other tasks like updating a +progress bar, handling other requests, or maintaining heartbeats. + +How to run: +----------- +1. Using Make (automatically starts Cassandra if needed): + make example-streaming-demo + +2. With external Cassandra cluster: + CASSANDRA_CONTACT_POINTS=10.0.0.1,10.0.0.2 make example-streaming-demo + +3. Direct Python execution: + python examples/streaming_non_blocking_demo.py + +4. With custom contact points: + CASSANDRA_CONTACT_POINTS=cassandra.example.com python examples/streaming_non_blocking_demo.py + +Environment variables: +- CASSANDRA_CONTACT_POINTS: Comma-separated list of contact points (default: localhost) +- CASSANDRA_PORT: Port number (default: 9042) +""" + +import asyncio +import os +import time +from datetime import datetime + +from async_cassandra import AsyncCluster, StreamConfig + + +class EventLoopMonitor: + """Monitor event loop responsiveness during streaming.""" + + def __init__(self): + self.heartbeats = [] + self.page_fetch_times = [] + self.monitoring = True + + async def heartbeat(self): + """Send a heartbeat every 50ms to detect blocking.""" + print("๐Ÿ’“ Starting heartbeat monitor...") + while self.monitoring: + self.heartbeats.append(time.perf_counter()) + await asyncio.sleep(0.05) # 50ms + + def record_page_fetch(self, page_num: int): + """Record when a page is fetched.""" + self.page_fetch_times.append((page_num, time.perf_counter())) + + def analyze(self): + """Analyze heartbeat gaps to detect blocking.""" + if len(self.heartbeats) < 2: + return + + gaps = [] + for i in range(1, len(self.heartbeats)): + gap = self.heartbeats[i] - self.heartbeats[i - 1] + gaps.append(gap * 1000) # Convert to ms + + avg_gap = sum(gaps) / len(gaps) + max_gap = max(gaps) + blocked_count = sum(1 for gap in gaps if gap > 100) # >100ms gaps + + print("\n๐Ÿ“Š Event Loop Analysis:") + print(f" Average heartbeat gap: {avg_gap:.1f}ms (target: 50ms)") + print(f" Maximum gap: {max_gap:.1f}ms") + print(f" Gaps > 100ms: {blocked_count}") + + if max_gap < 100: + print(" โœ… Event loop remained responsive!") + elif max_gap < 200: + print(" โš ๏ธ Minor blocking detected") + else: + print(" โŒ Significant blocking detected") + + +async def setup_demo_data(session): + """Create demo data with enough rows for multiple pages.""" + print("๐Ÿ”ง Setting up demo data...") + + # Create keyspace and table + await session.execute( + """ + CREATE KEYSPACE IF NOT EXISTS streaming_demo + WITH REPLICATION = { + 'class': 'SimpleStrategy', + 'replication_factor': 1 + } + """ + ) + + await session.set_keyspace("streaming_demo") + + await session.execute( + """ + CREATE TABLE IF NOT EXISTS sensor_data ( + sensor_id TEXT, + reading_time TIMESTAMP, + temperature DOUBLE, + humidity DOUBLE, + location TEXT, + PRIMARY KEY (sensor_id, reading_time) + ) WITH CLUSTERING ORDER BY (reading_time DESC) + """ + ) + + # Insert data - enough for multiple pages + insert_stmt = await session.prepare( + """ + INSERT INTO sensor_data (sensor_id, reading_time, temperature, humidity, location) + VALUES (?, ?, ?, ?, ?) 
+ """ + ) + + sensors = [f"sensor_{i:03d}" for i in range(20)] + locations = ["Building A", "Building B", "Building C", "Outdoor"] + + tasks = [] + total_rows = 0 + base_time = datetime.now() + + for sensor_id in sensors: + for hour in range(24): + for minute in range(0, 60, 5): # Every 5 minutes + reading_time = base_time.replace(hour=hour, minute=minute, second=0) + temperature = 20.0 + (hash(f"{sensor_id}{hour}{minute}") % 100) / 10.0 + humidity = 40.0 + (hash(f"{sensor_id}{hour}{minute}h") % 400) / 10.0 + location = locations[hash(sensor_id) % len(locations)] + + tasks.append( + session.execute( + insert_stmt, [sensor_id, reading_time, temperature, humidity, location] + ) + ) + total_rows += 1 + + # Execute in batches + if len(tasks) >= 100: + await asyncio.gather(*tasks) + tasks = [] + + # Execute remaining + if tasks: + await asyncio.gather(*tasks) + + print(f"โœ… Created {total_rows:,} sensor readings") + + +async def demonstrate_non_blocking_streaming(session): + """Show that streaming doesn't block the event loop.""" + monitor = EventLoopMonitor() + + print("\n๐Ÿš€ Starting non-blocking streaming demonstration...") + print("Watch how the heartbeat continues while pages are fetched!\n") + + # Start heartbeat monitor + heartbeat_task = asyncio.create_task(monitor.heartbeat()) + + # Configure streaming with page callback + def page_callback(page_num: int, rows_in_page: int): + monitor.record_page_fetch(page_num) + print(f"๐Ÿ“„ Page {page_num} fetched ({rows_in_page} rows) - " f"Heartbeat still running! ๐Ÿ’“") + + config = StreamConfig( + fetch_size=500, page_callback=page_callback # Small pages to see multiple fetches + ) + + # Stream data + start_time = time.perf_counter() + rows_processed = 0 + + print("๐Ÿ”„ Starting to stream sensor data...\n") + + async with await session.execute_stream( + "SELECT * FROM sensor_data", stream_config=config + ) as result: + async for row in result: + rows_processed += 1 + + # Show progress periodically + if rows_processed % 1000 == 0: + elapsed = time.perf_counter() - start_time + rate = rows_processed / elapsed + print(f"โšก Processed {rows_processed:,} rows " f"({rate:.0f} rows/sec)") + + # Simulate some async processing + if rows_processed % 100 == 0: + await asyncio.sleep(0.001) # 1ms + + # Stop monitoring + monitor.monitoring = False + await heartbeat_task + + # Show results + duration = time.perf_counter() - start_time + print("\nโœ… Streaming complete!") + print(f" Total rows: {rows_processed:,}") + print(f" Duration: {duration:.2f}s") + print(f" Rate: {rows_processed/duration:.0f} rows/sec") + print(f" Pages fetched: {len(monitor.page_fetch_times)}") + + # Analyze event loop blocking + monitor.analyze() + + +async def demonstrate_concurrent_operations(session): + """Show that other operations can run during streaming.""" + print("\n\n๐ŸŽฏ Demonstrating concurrent operations during streaming...") + + # Prepare queries + count_stmt = await session.prepare("SELECT COUNT(*) FROM sensor_data WHERE sensor_id = ?") + + concurrent_results = [] + + async def run_concurrent_queries(): + """Run other queries while streaming is happening.""" + for i in range(10): + await asyncio.sleep(0.5) # Every 500ms + + start = time.perf_counter() + result = await session.execute(count_stmt, [f"sensor_{i:03d}"]) + duration = time.perf_counter() - start + count = result.one()[0] + + concurrent_results.append((i, duration, count)) + print(f" ๐Ÿ” Query {i+1} completed in {duration*1000:.1f}ms " f"(count: {count})") + + async def stream_data(): + """Stream data 
concurrently.""" + config = StreamConfig(fetch_size=1000) + rows = 0 + + async with await session.execute_stream( + "SELECT * FROM sensor_data", stream_config=config + ) as result: + async for row in result: + rows += 1 + if rows % 2000 == 0: + print(f" ๐Ÿ“Š Streaming progress: {rows:,} rows") + + return rows + + # Run both concurrently + print(" Running streaming and queries concurrently...\n") + + streaming_task = asyncio.create_task(stream_data()) + queries_task = asyncio.create_task(run_concurrent_queries()) + + rows_streamed, _ = await asyncio.gather(streaming_task, queries_task) + + # Analyze concurrent query performance + if concurrent_results: + avg_duration = sum(d for _, d, _ in concurrent_results) / len(concurrent_results) + max_duration = max(d for _, d, _ in concurrent_results) + + print("\n โœ… Concurrent operations summary:") + print(f" Rows streamed: {rows_streamed:,}") + print(f" Concurrent queries: {len(concurrent_results)}") + print(f" Average query time: {avg_duration*1000:.1f}ms") + print(f" Max query time: {max_duration*1000:.1f}ms") + + if max_duration < 0.1: # 100ms + print(" ๐ŸŽ‰ All queries remained fast during streaming!") + + +async def main(): + """Run the non-blocking streaming demonstration.""" + # Get contact points from environment or use localhost + contact_points = os.environ.get("CASSANDRA_CONTACT_POINTS", "localhost").split(",") + port = int(os.environ.get("CASSANDRA_PORT", "9042")) + + print(f"Connecting to Cassandra at {contact_points}:{port}\n") + + # Connect to Cassandra + async with AsyncCluster(contact_points, port=port) as cluster: + async with await cluster.connect() as session: + # Setup demo data + await setup_demo_data(session) + + # Run demonstrations + await demonstrate_non_blocking_streaming(session) + await demonstrate_concurrent_operations(session) + + # Cleanup + print("\n๐Ÿงน Cleaning up...") + await session.execute("DROP KEYSPACE streaming_demo") + + print("\nโœจ Demonstration complete!") + print("\nKey takeaways:") + print("1. The event loop stays responsive during page fetches") + print("2. Other queries can execute concurrently with streaming") + print("3. async-cassandra bridges the driver's threads without blocking") + + +if __name__ == "__main__": + print("๐ŸŽช async-cassandra Non-Blocking Streaming Demo") + print("=" * 50) + asyncio.run(main()) diff --git a/tests/integration/test_example_scripts.py b/tests/integration/test_example_scripts.py index 53b5cb4..7ed2629 100644 --- a/tests/integration/test_example_scripts.py +++ b/tests/integration/test_example_scripts.py @@ -28,6 +28,8 @@ """ import asyncio +import os +import shutil import subprocess import sys from pathlib import Path @@ -105,7 +107,7 @@ async def test_streaming_basic_example(self, cassandra_cluster): ) assert result.one() is None, "Keyspace was not cleaned up" - async def test_export_large_table_example(self, cassandra_cluster): + async def test_export_large_table_example(self, cassandra_cluster, tmp_path): """ Test the table export example. @@ -116,6 +118,7 @@ async def test_export_large_table_example(self, cassandra_cluster): 3. Handles different data types properly 4. Shows progress during export 5. Cleans up resources + 6. 
Validates output file content Why this matters: ---------------- @@ -127,48 +130,75 @@ async def test_export_large_table_example(self, cassandra_cluster): script_path = EXAMPLES_DIR / "export_large_table.py" assert script_path.exists(), f"Example script not found: {script_path}" - # Ensure export directory doesn't exist - export_dir = Path("exports") - if export_dir.exists(): - for file in export_dir.glob("*.csv"): - file.unlink() - - # Run the example script - result = subprocess.run( - [sys.executable, str(script_path)], - capture_output=True, - text=True, - timeout=60, - ) + # Use temp directory for output + export_dir = tmp_path / "example_output" + export_dir.mkdir(exist_ok=True) + + try: + # Run the example script with custom output directory + env = os.environ.copy() + env["EXAMPLE_OUTPUT_DIR"] = str(export_dir) + + result = subprocess.run( + [sys.executable, str(script_path)], + capture_output=True, + text=True, + timeout=60, + env=env, + ) + + # Check execution succeeded + assert result.returncode == 0, f"Script failed with: {result.stderr}" + + # Verify expected output (might be in stdout or stderr due to logging) + output = result.stdout + result.stderr + assert "Created 5000 sample products" in output + assert "Export completed:" in output + assert "Rows exported: 5,000" in output + assert f"Output directory: {export_dir}" in output + + # Verify CSV file was created + csv_files = list(export_dir.glob("*.csv")) + assert len(csv_files) > 0, "No CSV files were created" + + # Verify CSV content + csv_file = csv_files[0] + assert csv_file.stat().st_size > 0, "CSV file is empty" + + # Read and validate CSV content + with open(csv_file, "r") as f: + header = f.readline().strip() + # Verify header contains expected columns + assert "product_id" in header + assert "category" in header + assert "price" in header + assert "in_stock" in header + assert "tags" in header + assert "attributes" in header + assert "created_at" in header + + # Read a few data rows to verify content + row_count = 0 + for line in f: + row_count += 1 + if row_count > 10: # Check first 10 rows + break + # Basic validation that row has content + assert len(line.strip()) > 0 + assert "," in line # CSV format - # Check execution succeeded - assert result.returncode == 0, f"Script failed with: {result.stderr}" + # Verify we have the expected number of rows (5000 + header) + f.seek(0) + total_lines = sum(1 for _ in f) + assert ( + total_lines == 5001 + ), f"Expected 5001 lines (header + 5000 rows), got {total_lines}" - # Verify expected output (might be in stdout or stderr due to logging) - output = result.stdout + result.stderr - assert "Created 5000 sample products" in output - assert "Export completed:" in output - assert "Rows exported: 5,000" in output - - # Verify CSV file was created - csv_files = list(export_dir.glob("*.csv")) - assert len(csv_files) > 0, "No CSV files were created" - - # Verify CSV content - csv_file = csv_files[0] - assert csv_file.stat().st_size > 0, "CSV file is empty" - - # Read first few lines to verify format - with open(csv_file, "r") as f: - header = f.readline().strip() - # Verify header contains expected columns - assert "product_id" in header - assert "category" in header - assert "price" in header - - # Cleanup - for file in export_dir.glob("*.csv"): - file.unlink() + finally: + # Cleanup - always clean up even if test fails + # pytest's tmp_path fixture also cleans up automatically + if export_dir.exists(): + shutil.rmtree(export_dir) async def test_context_manager_safety_demo(self, 
cassandra_cluster): """ @@ -354,6 +384,188 @@ async def test_metrics_advanced_example(self, cassandra_cluster): assert "Metrics" in output or "metrics" in output assert "queries" in output.lower() or "Queries" in output + @pytest.mark.timeout(240) # Override default timeout for this test + async def test_export_to_parquet_example(self, cassandra_cluster, tmp_path): + """ + Test the Parquet export example. + + What this tests: + --------------- + 1. Creates test data with various types + 2. Exports data to Parquet format + 3. Handles different compression formats + 4. Shows progress during export + 5. Verifies exported files + 6. Validates Parquet file content + 7. Cleans up resources automatically + + Why this matters: + ---------------- + - Parquet is popular for analytics + - Memory-efficient export critical for large datasets + - Type handling must be correct + - Shows advanced streaming patterns + """ + script_path = EXAMPLES_DIR / "export_to_parquet.py" + assert script_path.exists(), f"Example script not found: {script_path}" + + # Use temp directory for output + export_dir = tmp_path / "parquet_output" + export_dir.mkdir(exist_ok=True) + + try: + # Run the example script with custom output directory + env = os.environ.copy() + env["EXAMPLE_OUTPUT_DIR"] = str(export_dir) + + result = subprocess.run( + [sys.executable, str(script_path)], + capture_output=True, + text=True, + timeout=180, # Allow time for data generation and export + env=env, + ) + + # Check execution succeeded + if result.returncode != 0: + print(f"STDOUT:\n{result.stdout}") + print(f"STDERR:\n{result.stderr}") + assert result.returncode == 0, f"Script failed with return code {result.returncode}" + + # Verify expected output + output = result.stderr if result.stderr else result.stdout + assert "Setting up test data" in output + assert "Test data setup complete" in output + assert "Example 1: Export Entire Table" in output + assert "Example 2: Export Filtered Data" in output + assert "Example 3: Export with Different Compression" in output + assert "Export completed successfully!" 
in output + assert "Verifying Exported Files" in output + assert f"Output directory: {export_dir}" in output + + # Verify Parquet files were created (look recursively in subdirectories) + parquet_files = list(export_dir.rglob("*.parquet")) + assert ( + len(parquet_files) >= 3 + ), f"Expected at least 3 Parquet files, found {len(parquet_files)}" + + # Verify files have content + for parquet_file in parquet_files: + assert parquet_file.stat().st_size > 0, f"Parquet file {parquet_file} is empty" + + # Verify we can read and validate the Parquet files + try: + import pyarrow as pa + import pyarrow.parquet as pq + + # Track total rows across all files + total_rows = 0 + + for parquet_file in parquet_files: + table = pq.read_table(parquet_file) + assert table.num_rows > 0, f"Parquet file {parquet_file} has no rows" + total_rows += table.num_rows + + # Verify expected columns exist + column_names = [field.name for field in table.schema] + assert "user_id" in column_names + assert "event_time" in column_names + assert "event_type" in column_names + assert "device_type" in column_names + assert "country_code" in column_names + assert "city" in column_names + assert "revenue" in column_names + assert "duration_seconds" in column_names + assert "is_premium" in column_names + assert "metadata" in column_names + assert "tags" in column_names + + # Verify data types are preserved + schema = table.schema + assert schema.field("is_premium").type == pa.bool_() + assert ( + schema.field("duration_seconds").type == pa.int64() + ) # We use int64 in our schema + + # Read first few rows to validate content + df = table.to_pandas() + assert len(df) > 0 + + # Validate some data characteristics + assert ( + df["event_type"] + .isin(["view", "click", "purchase", "signup", "logout"]) + .all() + ) + assert df["device_type"].isin(["mobile", "desktop", "tablet", "tv"]).all() + assert df["duration_seconds"].between(10, 3600).all() + + # Verify we generated substantial test data (should be > 10k rows) + assert total_rows > 10000, f"Expected > 10000 total rows, got {total_rows}" + + except ImportError: + # PyArrow not available in test environment + pytest.skip("PyArrow not available for full validation") + + finally: + # Cleanup - always clean up even if test fails + # pytest's tmp_path fixture also cleans up automatically + if export_dir.exists(): + shutil.rmtree(export_dir) + + async def test_streaming_non_blocking_demo(self, cassandra_cluster): + """ + Test the non-blocking streaming demonstration. + + What this tests: + --------------- + 1. Creates test data for streaming + 2. Demonstrates event loop responsiveness + 3. Shows concurrent operations during streaming + 4. Provides visual feedback of non-blocking behavior + 5. 
Cleans up resources + + Why this matters: + ---------------- + - Proves async wrapper doesn't block + - Critical for understanding async benefits + - Shows real concurrent execution + - Validates our architecture claims + """ + script_path = EXAMPLES_DIR / "streaming_non_blocking_demo.py" + assert script_path.exists(), f"Example script not found: {script_path}" + + # Run the example script + result = subprocess.run( + [sys.executable, str(script_path)], + capture_output=True, + text=True, + timeout=120, # Allow time for demonstrations + ) + + # Check execution succeeded + if result.returncode != 0: + print(f"STDOUT:\n{result.stdout}") + print(f"STDERR:\n{result.stderr}") + assert result.returncode == 0, f"Script failed with return code {result.returncode}" + + # Verify expected output + output = result.stdout + result.stderr + assert "Starting non-blocking streaming demonstration" in output + assert "Heartbeat still running!" in output + assert "Event Loop Analysis:" in output + assert "Event loop remained responsive!" in output + assert "Demonstrating concurrent operations" in output + assert "Demonstration complete!" in output + + # Verify keyspace was cleaned up + async with AsyncCluster(["localhost"]) as cluster: + async with await cluster.connect() as session: + result = await session.execute( + "SELECT keyspace_name FROM system_schema.keyspaces WHERE keyspace_name = 'streaming_demo'" + ) + assert result.one() is None, "Keyspace was not cleaned up" + @pytest.mark.parametrize( "script_name", [ @@ -361,6 +573,8 @@ async def test_metrics_advanced_example(self, cassandra_cluster): "export_large_table.py", "context_manager_safety_demo.py", "metrics_simple.py", + "export_to_parquet.py", + "streaming_non_blocking_demo.py", ], ) async def test_example_uses_context_managers(self, script_name): @@ -406,6 +620,8 @@ async def test_example_uses_context_managers(self, script_name): "export_large_table.py", "context_manager_safety_demo.py", "metrics_simple.py", + "export_to_parquet.py", + "streaming_non_blocking_demo.py", ], ) async def test_example_uses_prepared_statements(self, script_name): diff --git a/tests/integration/test_streaming_non_blocking.py b/tests/integration/test_streaming_non_blocking.py new file mode 100644 index 0000000..4ca51b4 --- /dev/null +++ b/tests/integration/test_streaming_non_blocking.py @@ -0,0 +1,341 @@ +""" +Integration tests demonstrating that streaming doesn't block the event loop. + +This test proves that while the driver fetches pages in its thread pool, +the asyncio event loop remains free to handle other tasks. 
+""" + +import asyncio +import time +from typing import List + +import pytest + +from async_cassandra import AsyncCluster, StreamConfig + + +class TestStreamingNonBlocking: + """Test that streaming operations don't block the event loop.""" + + @pytest.fixture(autouse=True) + async def setup_test_data(self, cassandra_cluster): + """Create test data for streaming tests.""" + async with AsyncCluster(["localhost"]) as cluster: + async with await cluster.connect() as session: + # Create keyspace and table + await session.execute( + """ + CREATE KEYSPACE IF NOT EXISTS test_streaming + WITH REPLICATION = { + 'class': 'SimpleStrategy', + 'replication_factor': 1 + } + """ + ) + await session.set_keyspace("test_streaming") + + await session.execute( + """ + CREATE TABLE IF NOT EXISTS large_table ( + partition_key INT, + clustering_key INT, + data TEXT, + PRIMARY KEY (partition_key, clustering_key) + ) + """ + ) + + # Insert enough data to ensure multiple pages + # With fetch_size=1000 and 10k rows, we'll have 10 pages + insert_stmt = await session.prepare( + "INSERT INTO large_table (partition_key, clustering_key, data) VALUES (?, ?, ?)" + ) + + tasks = [] + for partition in range(10): + for cluster in range(1000): + # Create some data that takes time to process + data = f"Data for partition {partition}, cluster {cluster}" * 10 + tasks.append(session.execute(insert_stmt, [partition, cluster, data])) + + # Execute in batches + if len(tasks) >= 100: + await asyncio.gather(*tasks) + tasks = [] + + # Execute remaining + if tasks: + await asyncio.gather(*tasks) + + yield + + # Cleanup + await session.execute("DROP KEYSPACE test_streaming") + + async def test_event_loop_not_blocked_during_paging(self, cassandra_cluster): + """ + Test that the event loop remains responsive while pages are being fetched. + + This test runs a streaming query that fetches multiple pages while + simultaneously running a "heartbeat" task that increments a counter + every 10ms. If the event loop was blocked during page fetches, + we would see gaps in the heartbeat counter. 
+ """ + heartbeat_count = 0 + heartbeat_times: List[float] = [] + streaming_events: List[tuple[float, str]] = [] + stop_heartbeat = False + + async def heartbeat_task(): + """Increment counter every 10ms to detect event loop blocking.""" + nonlocal heartbeat_count + start_time = time.perf_counter() + + while not stop_heartbeat: + heartbeat_count += 1 + current_time = time.perf_counter() + heartbeat_times.append(current_time - start_time) + await asyncio.sleep(0.01) # 10ms + + async def streaming_task(): + """Stream data and record when pages are fetched.""" + nonlocal streaming_events + + async with AsyncCluster(["localhost"]) as cluster: + async with await cluster.connect() as session: + await session.set_keyspace("test_streaming") + + rows_seen = 0 + pages_fetched = 0 + + def page_callback(page_num: int, rows_in_page: int): + nonlocal pages_fetched + pages_fetched = page_num + current_time = time.perf_counter() - start_time + streaming_events.append((current_time, f"Page {page_num} fetched")) + + # Use small fetch_size to ensure multiple pages + config = StreamConfig(fetch_size=1000, page_callback=page_callback) + + start_time = time.perf_counter() + + async with await session.execute_stream( + "SELECT * FROM large_table", stream_config=config + ) as result: + async for row in result: + rows_seen += 1 + + # Simulate some processing time + await asyncio.sleep(0.001) # 1ms per row + + # Record progress at key points + if rows_seen % 1000 == 0: + current_time = time.perf_counter() - start_time + streaming_events.append( + (current_time, f"Processed {rows_seen} rows") + ) + + return rows_seen, pages_fetched + + # Run both tasks concurrently + heartbeat = asyncio.create_task(heartbeat_task()) + + # Run streaming and measure time + stream_start = time.perf_counter() + rows_processed, pages = await streaming_task() + stream_duration = time.perf_counter() - stream_start + + # Stop heartbeat + stop_heartbeat = True + await heartbeat + + # Analyze results + print("\n=== Event Loop Blocking Test Results ===") + print(f"Total rows processed: {rows_processed:,}") + print(f"Total pages fetched: {pages}") + print(f"Streaming duration: {stream_duration:.2f}s") + print(f"Heartbeat count: {heartbeat_count}") + print(f"Expected heartbeats: ~{int(stream_duration / 0.01)}") + + # Check heartbeat consistency + if len(heartbeat_times) > 1: + # Calculate gaps between heartbeats + heartbeat_gaps = [] + for i in range(1, len(heartbeat_times)): + gap = heartbeat_times[i] - heartbeat_times[i - 1] + heartbeat_gaps.append(gap) + + avg_gap = sum(heartbeat_gaps) / len(heartbeat_gaps) + max_gap = max(heartbeat_gaps) + gaps_over_50ms = sum(1 for gap in heartbeat_gaps if gap > 0.05) + + print("\nHeartbeat Analysis:") + print(f"Average gap: {avg_gap*1000:.1f}ms (target: 10ms)") + print(f"Max gap: {max_gap*1000:.1f}ms") + print(f"Gaps > 50ms: {gaps_over_50ms}") + + # Print streaming events timeline + print("\nStreaming Events Timeline:") + for event_time, event in streaming_events: + print(f" {event_time:.3f}s: {event}") + + # Assertions + assert heartbeat_count > 0, "Heartbeat task didn't run" + + # The average gap should be close to 10ms + # Allow some tolerance for scheduling + assert avg_gap < 0.02, f"Average heartbeat gap too large: {avg_gap*1000:.1f}ms" + + # Max gap shows worst-case blocking + # Even with page fetches, should not block for long + assert max_gap < 0.1, f"Max heartbeat gap too large: {max_gap*1000:.1f}ms" + + # Should have very few large gaps + assert gaps_over_50ms < 5, f"Too many large gaps: 
{gaps_over_50ms}" + + # Verify streaming completed successfully + assert rows_processed == 10000, f"Expected 10000 rows, got {rows_processed}" + assert pages >= 10, f"Expected at least 10 pages, got {pages}" + + async def test_concurrent_queries_during_streaming(self, cassandra_cluster): + """ + Test that other queries can execute while streaming is in progress. + + This proves that the thread pool isn't completely blocked by streaming. + """ + async with AsyncCluster(["localhost"]) as cluster: + async with await cluster.connect() as session: + await session.set_keyspace("test_streaming") + + # Prepare a simple query + count_stmt = await session.prepare( + "SELECT COUNT(*) FROM large_table WHERE partition_key = ?" + ) + + query_times: List[float] = [] + queries_completed = 0 + + async def run_concurrent_queries(): + """Run queries every 100ms during streaming.""" + nonlocal queries_completed + + for i in range(20): # 20 queries over 2 seconds + start = time.perf_counter() + await session.execute(count_stmt, [i % 10]) + duration = time.perf_counter() - start + query_times.append(duration) + queries_completed += 1 + + # Log slow queries + if duration > 0.1: + print(f"Slow query {i}: {duration:.3f}s") + + await asyncio.sleep(0.1) # 100ms between queries + + async def stream_large_dataset(): + """Stream the entire table.""" + config = StreamConfig(fetch_size=1000) + rows = 0 + + async with await session.execute_stream( + "SELECT * FROM large_table", stream_config=config + ) as result: + async for row in result: + rows += 1 + # Minimal processing + if rows % 2000 == 0: + await asyncio.sleep(0.001) + + return rows + + # Run both concurrently + streaming_task = asyncio.create_task(stream_large_dataset()) + queries_task = asyncio.create_task(run_concurrent_queries()) + + # Wait for both to complete + rows_streamed, _ = await asyncio.gather(streaming_task, queries_task) + + # Analyze results + print("\n=== Concurrent Queries Test Results ===") + print(f"Rows streamed: {rows_streamed:,}") + print(f"Concurrent queries completed: {queries_completed}") + + if query_times: + avg_query_time = sum(query_times) / len(query_times) + max_query_time = max(query_times) + + print(f"Average query time: {avg_query_time*1000:.1f}ms") + print(f"Max query time: {max_query_time*1000:.1f}ms") + + # Assertions + assert queries_completed >= 15, "Not enough queries completed" + assert avg_query_time < 0.1, f"Queries too slow: {avg_query_time:.3f}s" + + # Even the slowest query shouldn't be terribly slow + assert max_query_time < 0.5, f"Max query time too high: {max_query_time:.3f}s" + + async def test_multiple_streams_concurrent(self, cassandra_cluster): + """ + Test that multiple streaming operations can run concurrently. + + This demonstrates that streaming doesn't monopolize the thread pool. + """ + async with AsyncCluster(["localhost"]) as cluster: + async with await cluster.connect() as session: + await session.set_keyspace("test_streaming") + + async def stream_partition(partition: int) -> tuple[int, float]: + """Stream a specific partition.""" + config = StreamConfig(fetch_size=500) + rows = 0 + start = time.perf_counter() + + stmt = await session.prepare( + "SELECT * FROM large_table WHERE partition_key = ?" 
+ ) + + async with await session.execute_stream( + stmt, [partition], stream_config=config + ) as result: + async for row in result: + rows += 1 + + duration = time.perf_counter() - start + return rows, duration + + # Start multiple streams concurrently + print("\n=== Multiple Concurrent Streams Test ===") + start_time = time.perf_counter() + + # Stream 5 partitions concurrently + tasks = [stream_partition(i) for i in range(5)] + + results = await asyncio.gather(*tasks) + + total_duration = time.perf_counter() - start_time + + # Analyze results + total_rows = sum(rows for rows, _ in results) + individual_durations = [duration for _, duration in results] + + print(f"Total rows streamed: {total_rows:,}") + print(f"Total duration: {total_duration:.2f}s") + print(f"Individual stream durations: {[f'{d:.2f}s' for d in individual_durations]}") + + # If streams were serialized, total duration would be sum of individual + sum_durations = sum(individual_durations) + concurrency_factor = sum_durations / total_duration + + print(f"Sum of individual durations: {sum_durations:.2f}s") + print(f"Concurrency factor: {concurrency_factor:.1f}x") + + # Assertions + assert total_rows == 5000, f"Expected 5000 rows total, got {total_rows}" + + # Should show significant concurrency (at least 2x) + assert ( + concurrency_factor > 2.0 + ), f"Insufficient concurrency: {concurrency_factor:.1f}x" + + # Total time should be much less than sum of individual times + assert total_duration < sum_durations * 0.7, "Streams appear to be serialized" From 30fd99fc63b19c5a5dfe980efe53ad09a517da43 Mon Sep 17 00:00:00 2001 From: Johnny Miller Date: Wed, 2 Jul 2025 16:20:48 +0200 Subject: [PATCH 2/2] examples --- Makefile | 230 +++++++++++++++++++----- README.md | 44 +++++ examples/context_manager_safety_demo.py | 103 +++++++---- examples/export_large_table.py | 92 ++++++---- examples/export_to_parquet.py | 108 +++++++---- examples/metrics_example.py | 159 +++++++++------- examples/metrics_simple.py | 103 ++++++----- examples/realtime_processing.py | 113 +++++++----- examples/streaming_basic.py | 65 ++++--- examples/streaming_non_blocking_demo.py | 14 +- 10 files changed, 703 insertions(+), 328 deletions(-) diff --git a/Makefile b/Makefile index 04eb5f5..044f49c 100644 --- a/Makefile +++ b/Makefile @@ -346,81 +346,225 @@ clean-all: clean cassandra-stop EXAMPLES_ENV = CASSANDRA_CONTACT_POINTS=$(CASSANDRA_CONTACT_POINTS) example-streaming: cassandra-wait - @echo "=== Running Streaming Basic Example ===" - @echo "This example demonstrates memory-efficient streaming of large result sets" - @echo "Contact points: $(CASSANDRA_CONTACT_POINTS)" + @echo "" + @echo "โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—" + @echo "โ•‘ STREAMING BASIC EXAMPLE โ•‘" + @echo "โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ" + @echo "โ•‘ This example demonstrates memory-efficient streaming of large result sets โ•‘" + @echo "โ•‘ โ•‘" + @echo "โ•‘ What you'll see: โ•‘" + @echo "โ•‘ โ€ข Streaming 100,000 events without loading all into memory โ•‘" + @echo "โ•‘ โ€ข Progress tracking with page-by-page processing โ•‘" + @echo "โ•‘ โ€ข True Async Paging - pages fetched on-demand as 
you process โ•‘" + @echo "โ•‘ โ€ข Different streaming patterns (basic, filtered, page-based) โ•‘" + @echo "โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" + @echo "๐Ÿ“ก Connecting to Cassandra at $(CASSANDRA_CONTACT_POINTS)..." + @echo "" @$(EXAMPLES_ENV) python examples/streaming_basic.py example-export-csv: cassandra-wait - @echo "=== Running CSV Export Example ===" - @echo "This example exports a large Cassandra table to CSV format" - @echo "Contact points: $(CASSANDRA_CONTACT_POINTS)" - @echo "Output will be saved to ./exports/ directory" + @echo "" + @echo "โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—" + @echo "โ•‘ CSV EXPORT EXAMPLE โ•‘" + @echo "โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ" + @echo "โ•‘ This example exports a large Cassandra table to CSV format efficiently โ•‘" + @echo "โ•‘ โ•‘" + @echo "โ•‘ What you'll see: โ•‘" + @echo "โ•‘ โ€ข Creating and populating a sample products table (5,000 items) โ•‘" + @echo "โ•‘ โ€ข Streaming export with progress tracking โ•‘" + @echo "โ•‘ โ€ข Memory-efficient processing (no loading entire table into memory) โ•‘" + @echo "โ•‘ โ€ข Export statistics (rows/sec, file size, duration) โ•‘" + @echo "โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" + @echo "๐Ÿ“ก Connecting to Cassandra at $(CASSANDRA_CONTACT_POINTS)..." + @echo "๐Ÿ’พ Output will be saved to: $(EXAMPLE_OUTPUT_DIR)" + @echo "" @$(EXAMPLES_ENV) python examples/export_large_table.py example-export-parquet: cassandra-wait - @echo "=== Running Parquet Export Example ===" - @echo "This example exports Cassandra tables to Parquet format with streaming" - @echo "Contact points: $(CASSANDRA_CONTACT_POINTS)" - @echo "Output will be saved to ./parquet_exports/ directory" - @echo "Installing pyarrow if needed..." 
- @pip install pyarrow >/dev/null 2>&1 || echo "PyArrow already installed" + @echo "" + @echo "โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—" + @echo "โ•‘ PARQUET EXPORT EXAMPLE โ•‘" + @echo "โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ" + @echo "โ•‘ This example exports Cassandra tables to Parquet format with streaming โ•‘" + @echo "โ•‘ โ•‘" + @echo "โ•‘ What you'll see: โ•‘" + @echo "โ•‘ โ€ข Creating time-series data with complex types (30,000+ events) โ•‘" + @echo "โ•‘ โ€ข Three export scenarios: โ•‘" + @echo "โ•‘ - Full table export with snappy compression โ•‘" + @echo "โ•‘ - Filtered export (purchase events only) with gzip โ•‘" + @echo "โ•‘ - Different compression comparison (lz4) โ•‘" + @echo "โ•‘ โ€ข Automatic schema inference from Cassandra types โ•‘" + @echo "โ•‘ โ€ข Verification of exported Parquet files โ•‘" + @echo "โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" + @echo "๐Ÿ“ก Connecting to Cassandra at $(CASSANDRA_CONTACT_POINTS)..." + @echo "๐Ÿ’พ Output will be saved to: $(EXAMPLE_OUTPUT_DIR)" + @echo "๐Ÿ“ฆ Installing PyArrow if needed..." + @pip install pyarrow >/dev/null 2>&1 || echo "โœ… PyArrow ready" + @echo "" @$(EXAMPLES_ENV) python examples/export_to_parquet.py example-realtime: cassandra-wait - @echo "=== Running Real-time Processing Example ===" - @echo "This example demonstrates real-time streaming analytics on sensor data" - @echo "Contact points: $(CASSANDRA_CONTACT_POINTS)" + @echo "" + @echo "โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—" + @echo "โ•‘ REAL-TIME PROCESSING EXAMPLE โ•‘" + @echo "โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ" + @echo "โ•‘ This example demonstrates real-time streaming analytics on sensor data โ•‘" + @echo "โ•‘ โ•‘" + @echo "โ•‘ What you'll see: โ•‘" + @echo "โ•‘ โ€ข Simulating IoT sensor network (50 sensors, time-series data) โ•‘" + @echo "โ•‘ โ€ข Sliding window analytics with time-based queries โ•‘" + @echo "โ•‘ โ€ข Real-time anomaly detection and alerting โ•‘" + @echo "โ•‘ โ€ข Continuous monitoring with aggregations โ•‘" + @echo "โ•‘ โ€ข High-performance streaming of time-series data โ•‘" + @echo "โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" + @echo "๐Ÿ“ก Connecting to Cassandra at $(CASSANDRA_CONTACT_POINTS)..." + @echo "๐ŸŒก๏ธ Simulating sensor network..." 
+ @echo "" @$(EXAMPLES_ENV) python examples/realtime_processing.py example-metrics: cassandra-wait - @echo "=== Running Metrics Collection Examples ===" - @echo "Running simple metrics example..." - @echo "Contact points: $(CASSANDRA_CONTACT_POINTS)" + @echo "" + @echo "โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—" + @echo "โ•‘ METRICS COLLECTION EXAMPLES โ•‘" + @echo "โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ" + @echo "โ•‘ These examples demonstrate query performance monitoring and metrics โ•‘" + @echo "โ•‘ โ•‘" + @echo "โ•‘ Part 1 - Simple Metrics: โ•‘" + @echo "โ•‘ โ€ข Basic query performance tracking โ•‘" + @echo "โ•‘ โ€ข Connection health monitoring โ•‘" + @echo "โ•‘ โ€ข Error rate calculation โ•‘" + @echo "โ•‘ โ•‘" + @echo "โ•‘ Part 2 - Advanced Metrics: โ•‘" + @echo "โ•‘ โ€ข Multiple metrics collectors โ•‘" + @echo "โ•‘ โ€ข Prometheus integration patterns โ•‘" + @echo "โ•‘ โ€ข FastAPI integration examples โ•‘" + @echo "โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" + @echo "๐Ÿ“ก Connecting to Cassandra at $(CASSANDRA_CONTACT_POINTS)..." + @echo "" + @echo "๐Ÿ“Š Part 1: Simple Metrics..." + @echo "โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€" @$(EXAMPLES_ENV) python examples/metrics_simple.py @echo "" - @echo "Running advanced metrics example..." + @echo "๐Ÿ“ˆ Part 2: Advanced Metrics..." + @echo "โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€" @$(EXAMPLES_ENV) python examples/metrics_example.py example-non-blocking: cassandra-wait - @echo "=== Running Non-Blocking Streaming Demo ===" - @echo "This demonstrates that streaming doesn't block the event loop" - @echo "Watch for heartbeat indicators showing continuous operation!" - @echo "Contact points: $(CASSANDRA_CONTACT_POINTS)" + @echo "" + @echo "โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—" + @echo "โ•‘ NON-BLOCKING STREAMING DEMO โ•‘" + @echo "โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ" + @echo "โ•‘ This PROVES that streaming doesn't block the asyncio event loop! โ•‘" + @echo "โ•‘ โ•‘" + @echo "โ•‘ What you'll see: โ•‘" + @echo "โ•‘ โ€ข ๐Ÿ’“ Heartbeat indicators pulsing every 10ms โ•‘" + @echo "โ•‘ โ€ข Streaming 50,000 rows while heartbeat continues โ•‘" + @echo "โ•‘ โ€ข Event loop responsiveness analysis โ•‘" + @echo "โ•‘ โ€ข Concurrent queries executing during streaming โ•‘" + @echo "โ•‘ โ€ข Multiple streams running in parallel โ•‘" + @echo "โ•‘ โ•‘" + @echo "โ•‘ ๐Ÿ” Watch the heartbeats - they should NEVER stop! 
โ•‘" + @echo "โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" + @echo "๐Ÿ“ก Connecting to Cassandra at $(CASSANDRA_CONTACT_POINTS)..." + @echo "" @$(EXAMPLES_ENV) python examples/streaming_non_blocking_demo.py example-context: cassandra-wait - @echo "=== Running Context Manager Safety Demo ===" - @echo "This demonstrates proper resource management with context managers" - @echo "Contact points: $(CASSANDRA_CONTACT_POINTS)" + @echo "" + @echo "โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—" + @echo "โ•‘ CONTEXT MANAGER SAFETY DEMO โ•‘" + @echo "โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ" + @echo "โ•‘ This demonstrates proper resource management with context managers โ•‘" + @echo "โ•‘ โ•‘" + @echo "โ•‘ What you'll see: โ•‘" + @echo "โ•‘ โ€ข Query errors DON'T close sessions (resilience) โ•‘" + @echo "โ•‘ โ€ข Streaming errors DON'T affect other operations โ•‘" + @echo "โ•‘ โ€ข Context managers provide proper isolation โ•‘" + @echo "โ•‘ โ€ข Multiple concurrent operations share resources safely โ•‘" + @echo "โ•‘ โ€ข Automatic cleanup even during exceptions โ•‘" + @echo "โ•‘ โ•‘" + @echo "โ•‘ ๐Ÿ’ก Key lesson: ALWAYS use context managers! โ•‘" + @echo "โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" + @echo "๐Ÿ“ก Connecting to Cassandra at $(CASSANDRA_CONTACT_POINTS)..." 
+ @echo "" @$(EXAMPLES_ENV) python examples/context_manager_safety_demo.py example-fastapi: - @echo "=== Running FastAPI Example App ===" - @echo "This starts a full REST API with async Cassandra integration" - @echo "The app includes Docker Compose for easy setup" - @echo "See examples/fastapi_app/README.md for details" + @echo "" + @echo "โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—" + @echo "โ•‘ FASTAPI EXAMPLE APP โ•‘" + @echo "โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ" + @echo "โ•‘ This starts a full REST API with async Cassandra integration โ•‘" + @echo "โ•‘ โ•‘" + @echo "โ•‘ Features: โ•‘" + @echo "โ•‘ โ€ข Complete CRUD operations with async patterns โ•‘" + @echo "โ•‘ โ€ข Streaming endpoints for large datasets โ•‘" + @echo "โ•‘ โ€ข Performance comparison endpoints (async vs sync) โ•‘" + @echo "โ•‘ โ€ข Connection lifecycle management โ•‘" + @echo "โ•‘ โ€ข Docker Compose for easy development โ•‘" + @echo "โ•‘ โ•‘" + @echo "โ•‘ ๐Ÿ“š See examples/fastapi_app/README.md for API documentation โ•‘" + @echo "โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" + @echo "๐Ÿš€ Starting FastAPI application..." + @echo "" @cd examples/fastapi_app && $(MAKE) run examples-all: cassandra-wait - @echo "=== Running All Examples ===" - @echo "This will run each example in sequence" - @echo "Contact points: $(CASSANDRA_CONTACT_POINTS)" + @echo "" + @echo "โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—" + @echo "โ•‘ RUNNING ALL EXAMPLES โ•‘" + @echo "โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ" + @echo "โ•‘ This will run each example in sequence to demonstrate all features โ•‘" + @echo "โ•‘ โ•‘" + @echo "โ•‘ Examples to run: โ•‘" + @echo "โ•‘ 1. Streaming Basic - Memory-efficient data processing โ•‘" + @echo "โ•‘ 2. CSV Export - Large table export with progress tracking โ•‘" + @echo "โ•‘ 3. Parquet Export - Complex types and compression options โ•‘" + @echo "โ•‘ 4. Real-time Processing - IoT sensor analytics โ•‘" + @echo "โ•‘ 5. Metrics Collection - Performance monitoring โ•‘" + @echo "โ•‘ 6. Non-blocking Demo - Event loop responsiveness proof โ•‘" + @echo "โ•‘ 7. 
Context Managers - Resource management patterns โ•‘" + @echo "โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" + @echo "๐Ÿ“ก Using Cassandra at $(CASSANDRA_CONTACT_POINTS)" @echo "" @$(MAKE) example-streaming - @echo "\n----------------------------------------\n" + @echo "" + @echo "โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" @$(MAKE) example-export-csv - @echo "\n----------------------------------------\n" + @echo "" + @echo "โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" @$(MAKE) example-export-parquet - @echo "\n----------------------------------------\n" + @echo "" + @echo "โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" @$(MAKE) example-realtime - @echo "\n----------------------------------------\n" + @echo "" + @echo "โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" @$(MAKE) example-metrics - @echo "\n----------------------------------------\n" + @echo "" + @echo "โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" @$(MAKE) example-non-blocking - @echo "\n----------------------------------------\n" + @echo "" + @echo "โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" + @echo "" @$(MAKE) example-context - @echo "\nโœ… All examples completed!" @echo "" - @echo "Note: FastAPI example not included as it starts a server" - @echo "Run 'make example-fastapi' separately to start the FastAPI app" + @echo "โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—" + @echo "โ•‘ โœ… ALL EXAMPLES COMPLETED SUCCESSFULLY! โ•‘" + @echo "โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ" + @echo "โ•‘ Note: FastAPI example not included as it starts a server. โ•‘" + @echo "โ•‘ Run 'make example-fastapi' separately to start the FastAPI app. 
โ•‘" + @echo "โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" diff --git a/README.md b/README.md index 58de197..edc2db1 100644 --- a/README.md +++ b/README.md @@ -287,6 +287,50 @@ We welcome contributions! Please see: - [FastAPI Integration](examples/fastapi_app/README.md) - Complete REST API example - [More Examples](examples/) - Additional usage patterns +## ๐ŸŽฏ Running the Examples + +The project includes comprehensive examples demonstrating various features and use cases. Each example can be run using the provided Makefile, which automatically handles Cassandra setup if needed. + +### Available Examples + +Run any example with: `make example-` + +- **`make example-basic`** - Basic connection and query execution +- **`make example-streaming`** - Memory-efficient streaming of large result sets with True Async Paging +- **`make example-context-safety`** - Demonstrates proper context manager usage and resource isolation +- **`make example-export-large-table`** - Export large tables to CSV with progress tracking +- **`make example-export-parquet`** - Export data to Parquet format with complex data types +- **`make example-metrics`** - Comprehensive metrics collection and performance monitoring +- **`make example-metrics-simple`** - Basic metrics collection example +- **`make example-realtime`** - Real-time data processing with sliding window analytics +- **`make example-streaming-demo`** - Visual demonstration that streaming doesn't block the event loop + +### Running with External Cassandra + +If you have Cassandra running elsewhere: + +```bash +# Single node +CASSANDRA_CONTACT_POINTS=10.0.0.1 make example-streaming + +# Multiple nodes +CASSANDRA_CONTACT_POINTS=10.0.0.1,10.0.0.2,10.0.0.3 make example-streaming + +# With custom port +CASSANDRA_CONTACT_POINTS=cassandra.example.com CASSANDRA_PORT=9043 make example-basic +``` + +### Example Descriptions + +- **Basic Example**: Shows fundamental operations like connecting, executing queries, and using prepared statements +- **Streaming Examples**: Demonstrate True Async Paging for processing millions of rows without memory issues +- **Export Examples**: Show how to export Cassandra data to various formats (CSV, Parquet) with progress tracking +- **Metrics Examples**: Illustrate performance monitoring, query tracking, and connection health checking +- **Real-time Processing**: Demonstrates processing time-series IoT data with concurrent operations +- **Context Safety Demo**: Proves that errors in one operation don't affect others when using context managers + +Each example includes detailed comments explaining the concepts and best practices. Start with `example-basic` if you're new to the library. + ## โšก Performance async-cassandra enables your async Python application to work with Cassandra without blocking the event loop. While it doesn't eliminate the underlying driver's thread pool, it prevents those blocking operations from freezing your entire application. This is crucial for web servers where a blocked event loop means no requests can be processed. 
diff --git a/examples/context_manager_safety_demo.py b/examples/context_manager_safety_demo.py index f934fd3..7b4101a 100644 --- a/examples/context_manager_safety_demo.py +++ b/examples/context_manager_safety_demo.py @@ -40,26 +40,33 @@ async def demonstrate_query_error_safety(cluster): """Show that query errors don't close the session.""" - logger.info("\n=== Demonstrating Query Error Safety ===") + logger.info("\n" + "=" * 80) + logger.info("๐Ÿ›ก๏ธ QUERY ERROR SAFETY DEMONSTRATION") + logger.info("=" * 80) async with await cluster.connect() as session: + logger.info("\n๐Ÿงช Test 1: Execute a failing query") try: # This will fail await session.execute("SELECT * FROM non_existent_table") except InvalidRequest as e: - logger.info(f"Query failed as expected: {e}") + logger.info(f" โœ“ Query failed as expected: {type(e).__name__}") # Session should still work - logger.info("Session still works after error:") + logger.info("\n๐Ÿงช Test 2: Verify session still works after error") result = await session.execute("SELECT release_version FROM system.local") - logger.info(f"Cassandra version: {result.one().release_version}") + logger.info(f" โœ… Session is healthy! Cassandra version: {result.one().release_version}") + logger.info("\n๐Ÿ’ก Key insight: Query errors are isolated - they don't affect the session!") async def demonstrate_streaming_error_safety(cluster): """Show that streaming errors don't close the session.""" - logger.info("\n=== Demonstrating Streaming Error Safety ===") + logger.info("\n" + "=" * 80) + logger.info("๐ŸŒŠ STREAMING ERROR SAFETY DEMONSTRATION") + logger.info("=" * 80) async with await cluster.connect() as session: + logger.info("\n๐Ÿ› ๏ธ Setting up test data...") # Create test keyspace and data await session.execute( """ @@ -70,11 +77,9 @@ async def demonstrate_streaming_error_safety(cluster): } """ ) - await session.set_keyspace("context_demo") - await session.execute( """ - CREATE TABLE IF NOT EXISTS test_data ( + CREATE TABLE IF NOT EXISTS context_demo.test_data ( id UUID PRIMARY KEY, value TEXT ) @@ -82,26 +87,31 @@ async def demonstrate_streaming_error_safety(cluster): ) # Insert some data using prepared statement - insert_stmt = await session.prepare("INSERT INTO test_data (id, value) VALUES (?, ?)") + insert_stmt = await session.prepare( + "INSERT INTO context_demo.test_data (id, value) VALUES (?, ?)" + ) for i in range(10): await session.execute(insert_stmt, [uuid.uuid4(), f"value_{i}"]) + logger.info(" โœ“ Created 10 test records") # Try streaming from non-existent table (will fail) + logger.info("\n๐Ÿงช Test 1: Stream from non-existent table") try: async with await session.execute_stream("SELECT * FROM non_existent_table") as stream: async for row in stream: pass except Exception as e: - logger.info(f"Streaming failed as expected: {e}") + logger.info(f" โœ“ Streaming failed as expected: {type(e).__name__}") # Session should still work for new streaming - logger.info("Starting new streaming operation after error:") + logger.info("\n๐Ÿงช Test 2: Start new streaming operation after error") count = 0 - async with await session.execute_stream("SELECT * FROM test_data") as stream: + async with await session.execute_stream("SELECT * FROM context_demo.test_data") as stream: async for row in stream: count += 1 - logger.info(f"Successfully streamed {count} rows after error") + logger.info(f" โœ… Successfully streamed {count} rows after error!") + logger.info("\n๐Ÿ’ก Key insight: Streaming errors are isolated - session remains healthy!") # Cleanup await 
session.execute("DROP KEYSPACE context_demo") @@ -109,57 +119,66 @@ async def demonstrate_streaming_error_safety(cluster): async def demonstrate_context_manager_isolation(cluster): """Show how context managers isolate resource cleanup.""" - logger.info("\n=== Demonstrating Context Manager Isolation ===") + logger.info("\n" + "=" * 80) + logger.info("๐Ÿ”’ CONTEXT MANAGER ISOLATION DEMONSTRATION") + logger.info("=" * 80) # Scenario 1: Session context doesn't affect cluster - logger.info("\nScenario 1: Session context with error") + logger.info("\n๐Ÿงช Scenario 1: Session error doesn't affect cluster") try: async with await cluster.connect() as session: result = await session.execute("SELECT now() FROM system.local") - logger.info(f"Query succeeded: {result.one()[0]}") + logger.info(f" โœ“ Query succeeded: {result.one()[0]}") + logger.info(" ๐Ÿ’ฅ Simulating error...") raise ValueError("Simulated error in session context") except ValueError: - logger.info("Error handled, session was closed by context manager") + logger.info(" โœ“ Error handled, session closed by context manager") # Cluster should still work - logger.info("Creating new session from same cluster:") + logger.info("\n๐Ÿงช Creating new session from same cluster:") async with await cluster.connect() as session2: result = await session2.execute("SELECT now() FROM system.local") - logger.info(f"New session works: {result.one()[0]}") + logger.info(f" โœ… New session works perfectly: {result.one()[0]}") # Scenario 2: Streaming context doesn't affect session - logger.info("\nScenario 2: Streaming context with early exit") + logger.info("\n๐Ÿงช Scenario 2: Early streaming exit doesn't affect session") async with await cluster.connect() as session3: # Stream with early exit count = 0 + logger.info(" ๐Ÿ”„ Starting streaming with early exit...") async with await session3.execute_stream("SELECT * FROM system.local") as stream: async for row in stream: count += 1 + logger.info(f" โœ“ Read {count} row, exiting early...") break # Early exit - logger.info(f"Exited streaming early after {count} row") - # Session should still work + logger.info("\n ๐Ÿงช Testing session after early streaming exit:") result = await session3.execute("SELECT now() FROM system.local") - logger.info(f"Session still works: {result.one()[0]}") + logger.info(f" โœ… Session still healthy: {result.one()[0]}") + + logger.info("\n๐Ÿ’ก Key insight: Context managers provide proper isolation!") async def demonstrate_concurrent_safety(cluster): """Show that multiple operations can use shared resources safely.""" - logger.info("\n=== Demonstrating Concurrent Safety ===") + logger.info("\n" + "=" * 80) + logger.info("๐Ÿš€ CONCURRENT OPERATIONS SAFETY DEMONSTRATION") + logger.info("=" * 80) # Create shared session + logger.info("\n๐Ÿ”„ Running multiple concurrent operations on shared session...") async with await cluster.connect() as session: async def worker(worker_id, query_count): """Worker that executes queries.""" for i in range(query_count): try: - result = await session.execute("SELECT now() FROM system.local") - logger.info(f"Worker {worker_id} query {i+1}: {result.one()[0]}") + await session.execute("SELECT now() FROM system.local") + logger.info(f" ๐Ÿ‘ท Worker {worker_id} query {i+1}: Success") await asyncio.sleep(0.1) except Exception as e: - logger.error(f"Worker {worker_id} error: {e}") + logger.error(f" โŒ Worker {worker_id} error: {e}") async def streamer(): """Worker that uses streaming.""" @@ -171,27 +190,30 @@ async def streamer(): async for row in stream: count += 
1 if count % 5 == 0: - logger.info(f"Streamer: Processed {count} keyspaces") + logger.info(f" ๐ŸŒŠ Streamer: Processed {count} keyspaces") await asyncio.sleep(0.1) - logger.info(f"Streamer: Total {count} keyspaces") + logger.info(f" โœ… Streamer: Completed ({count} keyspaces)") except Exception as e: - logger.error(f"Streamer error: {e}") + logger.error(f" โŒ Streamer error: {e}") # Run workers concurrently await asyncio.gather(worker(1, 3), worker(2, 3), streamer(), return_exceptions=True) - logger.info("All concurrent operations completed") + logger.info("\nโœ… All concurrent operations completed successfully!") + logger.info("\n๐Ÿ’ก Key insight: Multiple operations can safely share a session!") async def main(): """Run all demonstrations.""" - logger.info("Starting Context Manager Safety Demonstration") + logger.info("\n" + "=" * 80) + logger.info("๐Ÿ›ก๏ธ CONTEXT MANAGER SAFETY DEMONSTRATION") + logger.info("=" * 80) # Get contact points from environment or use localhost contact_points = os.environ.get("CASSANDRA_CONTACT_POINTS", "localhost").split(",") port = int(os.environ.get("CASSANDRA_PORT", "9042")) - logger.info(f"Connecting to Cassandra at {contact_points}:{port}") + logger.info(f"\n๐Ÿ“ก Connecting to Cassandra at {contact_points}:{port}") # Use cluster in context manager for automatic cleanup async with AsyncCluster(contact_points, port=port) as cluster: @@ -200,12 +222,15 @@ async def main(): await demonstrate_context_manager_isolation(cluster) await demonstrate_concurrent_safety(cluster) - logger.info("\nAll demonstrations completed successfully!") - logger.info("Key takeaways:") - logger.info("1. Query errors don't close sessions") - logger.info("2. Streaming errors don't close sessions") - logger.info("3. Context managers only close their own resources") - logger.info("4. Multiple operations can safely share sessions and clusters") + logger.info("\n" + "=" * 80) + logger.info("โœ… ALL DEMONSTRATIONS COMPLETED SUCCESSFULLY!") + logger.info("=" * 80) + logger.info("\n๐ŸŽฏ Key Takeaways:") + logger.info(" 1. Query errors don't close sessions") + logger.info(" 2. Streaming errors don't close sessions") + logger.info(" 3. Context managers only close their own resources") + logger.info(" 4. 
Multiple operations can safely share sessions and clusters") + logger.info("\n๐Ÿ’ก Best Practice: Always use context managers for proper resource management!") if __name__ == "__main__": diff --git a/examples/export_large_table.py b/examples/export_large_table.py index 8a69506..ed4824f 100644 --- a/examples/export_large_table.py +++ b/examples/export_large_table.py @@ -64,11 +64,15 @@ async def count_table_rows(session, keyspace: str, table_name: str) -> int: async def export_table_async(session, keyspace: str, table_name: str, output_file: str): """Export table using async file I/O (requires aiofiles).""" - logger.info(f"Starting async export of {keyspace}.{table_name} to {output_file}") + logger.info("\n" + "=" * 80) + logger.info("๐Ÿ“ค CSV EXPORT WITH ASYNC FILE I/O") + logger.info("=" * 80) + logger.info(f"\n๐Ÿ“Š Exporting: {keyspace}.{table_name}") + logger.info(f"๐Ÿ’พ Output file: {output_file}") # Get approximate row count for progress tracking total_rows = await count_table_rows(session, keyspace, table_name) - logger.info(f"Table has approximately {total_rows:,} rows") + logger.info(f"๐Ÿ“‹ Table size: ~{total_rows:,} rows") # Configure streaming with progress callback rows_exported = 0 @@ -78,9 +82,12 @@ def progress_callback(page_num: int, rows_so_far: int): rows_exported = rows_so_far if total_rows > 0: progress = (rows_so_far / total_rows) * 100 + bar_length = 40 + filled = int(bar_length * progress / 100) + bar = "โ–ˆ" * filled + "โ–‘" * (bar_length - filled) logger.info( - f"Export progress: {rows_so_far:,}/{total_rows:,} rows " - f"({progress:.1f}%) - Page {page_num}" + f"๐Ÿ“Š Progress: [{bar}] {progress:.1f}% " + f"({rows_so_far:,}/{total_rows:,} rows) - Page {page_num}" ) config = StreamConfig(fetch_size=5000, page_callback=progress_callback) @@ -127,32 +134,45 @@ def progress_callback(page_num: int, rows_so_far: int): row_count += 1 elapsed = (datetime.now() - start_time).total_seconds() - logger.info("\nExport completed:") - logger.info(f"- Rows exported: {row_count:,}") - logger.info(f"- Time elapsed: {elapsed:.2f} seconds") - logger.info(f"- Export rate: {row_count/elapsed:.0f} rows/sec") - logger.info(f"- Output file: {output_file}") - logger.info(f"- File size: {os.path.getsize(output_file):,} bytes") + file_size_mb = os.path.getsize(output_file) / (1024 * 1024) + + logger.info("\n" + "โ”€" * 80) + logger.info("โœ… EXPORT COMPLETED SUCCESSFULLY!") + logger.info("โ”€" * 80) + logger.info("\n๐Ÿ“Š Export Statistics:") + logger.info(f" โ€ข Rows exported: {row_count:,}") + logger.info(f" โ€ข Time elapsed: {elapsed:.2f} seconds") + logger.info(f" โ€ข Export rate: {row_count/elapsed:,.0f} rows/sec") + logger.info(f" โ€ข File size: {file_size_mb:.2f} MB ({os.path.getsize(output_file):,} bytes)") + logger.info(f" โ€ข Output path: {output_file}") def export_table_sync(session, keyspace: str, table_name: str, output_file: str): """Export table using synchronous file I/O.""" - logger.info(f"Starting sync export of {keyspace}.{table_name} to {output_file}") + logger.info("\n" + "=" * 80) + logger.info("๐Ÿ“ค CSV EXPORT WITH SYNC FILE I/O") + logger.info("=" * 80) + logger.info(f"\n๐Ÿ“Š Exporting: {keyspace}.{table_name}") + logger.info(f"๐Ÿ’พ Output file: {output_file}") async def _export(): # Get approximate row count total_rows = await count_table_rows(session, keyspace, table_name) - logger.info(f"Table has approximately {total_rows:,} rows") + logger.info(f"๐Ÿ“‹ Table size: ~{total_rows:,} rows") # Configure streaming - config = StreamConfig( - fetch_size=5000, - 
page_callback=lambda p, t: ( - logger.info(f"Exported {t:,}/{total_rows:,} rows ({100*t/total_rows:.1f}%)") - if total_rows > 0 - else None - ), - ) + def sync_progress(page_num: int, rows_so_far: int): + if total_rows > 0: + progress = (rows_so_far / total_rows) * 100 + bar_length = 40 + filled = int(bar_length * progress / 100) + bar = "โ–ˆ" * filled + "โ–‘" * (bar_length - filled) + logger.info( + f"๐Ÿ“Š Progress: [{bar}] {progress:.1f}% " + f"({rows_so_far:,}/{total_rows:,} rows) - Page {page_num}" + ) + + config = StreamConfig(fetch_size=5000, page_callback=sync_progress) start_time = datetime.now() @@ -189,12 +209,19 @@ async def _export(): row_count += 1 elapsed = (datetime.now() - start_time).total_seconds() - logger.info("\nExport completed:") - logger.info(f"- Rows exported: {row_count:,}") - logger.info(f"- Time elapsed: {elapsed:.2f} seconds") - logger.info(f"- Export rate: {row_count/elapsed:.0f} rows/sec") - logger.info(f"- Output file: {output_file}") - logger.info(f"- File size: {os.path.getsize(output_file):,} bytes") + file_size_mb = os.path.getsize(output_file) / (1024 * 1024) + + logger.info("\n" + "โ”€" * 80) + logger.info("โœ… EXPORT COMPLETED SUCCESSFULLY!") + logger.info("โ”€" * 80) + logger.info("\n๐Ÿ“Š Export Statistics:") + logger.info(f" โ€ข Rows exported: {row_count:,}") + logger.info(f" โ€ข Time elapsed: {elapsed:.2f} seconds") + logger.info(f" โ€ข Export rate: {row_count/elapsed:,.0f} rows/sec") + logger.info( + f" โ€ข File size: {file_size_mb:.2f} MB ({os.path.getsize(output_file):,} bytes)" + ) + logger.info(f" โ€ข Output path: {output_file}") # Run the async export function return _export() @@ -202,7 +229,7 @@ async def _export(): async def setup_sample_data(session): """Create sample table with data for testing.""" - logger.info("Setting up sample data...") + logger.info("\n๐Ÿ› ๏ธ Setting up sample data...") # Create keyspace await session.execute( @@ -215,12 +242,10 @@ async def setup_sample_data(session): """ ) - await session.set_keyspace("export_example") - # Create table await session.execute( """ - CREATE TABLE IF NOT EXISTS products ( + CREATE TABLE IF NOT EXISTS export_example.products ( category text, product_id int, name text, @@ -237,7 +262,7 @@ async def setup_sample_data(session): # Insert sample data insert_stmt = await session.prepare( """ - INSERT INTO products ( + INSERT INTO export_example.products ( category, product_id, name, price, in_stock, tags, attributes, created_at ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
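Both the async and sync export paths in this file follow the same pattern: a `StreamConfig` with a `page_callback` for progress reporting, `execute_stream` for on-demand paging, and a CSV writer for output. A condensed, hypothetical sketch of that pattern (assuming the driver's default namedtuple row factory, so rows can be written to CSV directly):

```python
import csv

from async_cassandra import StreamConfig


async def stream_table_to_csv(session, query: str, output_file: str, total_rows: int) -> int:
    """Stream query results to a CSV file, reporting progress after each page."""

    def progress(page_num: int, rows_so_far: int) -> None:
        pct = (rows_so_far / total_rows) * 100 if total_rows else 0.0
        print(f"Page {page_num}: {rows_so_far:,} rows ({pct:.1f}%)")

    config = StreamConfig(fetch_size=5000, page_callback=progress)
    rows_written = 0

    # Synchronous file I/O variant; swap in aiofiles for fully async writes.
    with open(output_file, "w", newline="") as f:
        writer = csv.writer(f)
        async with await session.execute_stream(query, stream_config=config) as result:
            async for row in result:
                writer.writerow(row)  # namedtuple rows unpack into CSV columns
                rows_written += 1

    return rows_written
```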
@@ -277,7 +302,7 @@ async def setup_sample_data(session): await asyncio.gather(*tasks) - logger.info(f"Created {total_products} sample products") + logger.info(f"โœ… Created {total_products:,} sample products in 'export_example.products' table") async def main(): @@ -310,8 +335,9 @@ async def main(): ) # Cleanup (optional) - logger.info("\nCleaning up...") + logger.info("\n๐Ÿงน Cleaning up...") await session.execute("DROP KEYSPACE export_example") + logger.info("โœ… Keyspace dropped") if __name__ == "__main__": diff --git a/examples/export_to_parquet.py b/examples/export_to_parquet.py index 9fd0a41..d40cfd7 100644 --- a/examples/export_to_parquet.py +++ b/examples/export_to_parquet.py @@ -132,7 +132,9 @@ async def export_table( output_file = self.output_dir / f"{keyspace}.{table_name}.parquet" temp_file = self.output_dir / f"{keyspace}.{table_name}.parquet.tmp" - logger.info(f"Starting export of {keyspace}.{table_name} to {output_file}") + logger.info(f"\n๐ŸŽฏ Starting export of {keyspace}.{table_name}") + logger.info(f"๐Ÿ“„ Output: {output_file}") + logger.info(f"๐Ÿ—œ๏ธ Compression: {compression}") # Build query query = f"SELECT * FROM {keyspace}.{table_name}" @@ -149,7 +151,9 @@ def progress_callback(page_num: int, rows_in_page: int): nonlocal total_pages total_pages = page_num if page_num % 10 == 0: - logger.info(f"Exported page {page_num} ({total_rows + rows_in_page:,} rows total)") + logger.info( + f"๐Ÿ“ฆ Processing page {page_num} ({total_rows + rows_in_page:,} rows exported so far)" + ) # Configure streaming config = StreamConfig( @@ -260,7 +264,9 @@ def progress_callback(page_num: int, rows_in_page: int): # Clear batch data batch_data = {name: [] for name in column_names} - logger.info(f"Written {total_rows:,} rows to Parquet") + logger.info( + f"๐Ÿ’พ Written {total_rows:,} rows to Parquet (row group {total_rows // row_group_size})" + ) # Write final partial batch if any(batch_data.values()): @@ -296,15 +302,27 @@ def progress_callback(page_num: int, rows_in_page: int): "row_group_size": row_group_size, } - logger.info("\nExport completed successfully!") - logger.info(f"Statistics: {stats}") + logger.info("\n" + "โ”€" * 80) + logger.info("โœ… PARQUET EXPORT COMPLETED!") + logger.info("โ”€" * 80) + logger.info("\n๐Ÿ“Š Export Statistics:") + logger.info(f" โ€ข Table: {stats['table']}") + logger.info(f" โ€ข Rows: {stats['total_rows']:,}") + logger.info(f" โ€ข Pages: {stats['total_pages']}") + logger.info(f" โ€ข Size: {stats['total_mb']} MB") + logger.info(f" โ€ข Duration: {stats['duration_seconds']}s") + logger.info( + f" โ€ข Speed: {stats['rows_per_second']:,} rows/sec ({stats['mb_per_second']} MB/s)" + ) + logger.info(f" โ€ข Compression: {stats['compression']}") + logger.info(f" โ€ข Row Group Size: {stats['row_group_size']:,}") return stats async def setup_test_data(session): """Create test data for export demonstration.""" - logger.info("Setting up test data...") + logger.info("\n๐Ÿ› ๏ธ Setting up test data for Parquet export demonstration...") # Create keyspace await session.execute( @@ -317,12 +335,10 @@ async def setup_test_data(session): """ ) - await session.set_keyspace("analytics") - # Create a table with various data types await session.execute( """ - CREATE TABLE IF NOT EXISTS user_events ( + CREATE TABLE IF NOT EXISTS analytics.user_events ( user_id UUID, event_time TIMESTAMP, event_type TEXT, @@ -342,7 +358,7 @@ async def setup_test_data(session): # Insert test data insert_stmt = await session.prepare( """ - INSERT INTO user_events ( + INSERT INTO analytics.user_events 
( user_id, event_time, event_type, device_type, country_code, city, revenue, duration_seconds, is_premium, metadata, tags @@ -351,7 +367,7 @@ async def setup_test_data(session): ) # Generate substantial test data - logger.info("Inserting test data...") + logger.info("๐Ÿ“ Inserting test data with complex types (maps, sets, decimals)...") import random import uuid @@ -428,23 +444,27 @@ async def setup_test_data(session): total_inserted += 100 if total_inserted % 5000 == 0: - logger.info(f"Inserted {total_inserted:,} events...") + logger.info(f" ๐Ÿ“Š Progress: {total_inserted:,} events inserted...") # Execute remaining tasks if tasks: await asyncio.gather(*tasks) total_inserted += len(tasks) - logger.info(f"Test data setup complete: {total_inserted:,} events inserted") + logger.info( + f"โœ… Test data setup complete: {total_inserted:,} events inserted into analytics.user_events" + ) async def demonstrate_exports(session): """Demonstrate various export scenarios.""" output_dir = os.environ.get("EXAMPLE_OUTPUT_DIR", "examples/exampleoutput") - logger.info(f"Output directory: {output_dir}") + logger.info(f"\n๐Ÿ“ Output directory: {output_dir}") # Example 1: Export entire table - logger.info("\n=== Example 1: Export Entire Table ===") + logger.info("\n" + "=" * 80) + logger.info("EXAMPLE 1: Export Entire Table with Snappy Compression") + logger.info("=" * 80) exporter1 = ParquetExporter(str(Path(output_dir) / "example1")) stats1 = await exporter1.export_table( session, @@ -455,7 +475,9 @@ async def demonstrate_exports(session): ) # Example 2: Export with filtering - logger.info("\n=== Example 2: Export Filtered Data ===") + logger.info("\n" + "=" * 80) + logger.info("EXAMPLE 2: Export Filtered Data (Purchase Events Only)") + logger.info("=" * 80) exporter2 = ParquetExporter(str(Path(output_dir) / "example2")) stats2 = await exporter2.export_table( session, @@ -468,7 +490,9 @@ async def demonstrate_exports(session): ) # Example 3: Export with different compression - logger.info("\n=== Example 3: Export with Different Compression ===") + logger.info("\n" + "=" * 80) + logger.info("EXAMPLE 3: Export with LZ4 Compression") + logger.info("=" * 80) exporter3 = ParquetExporter(str(Path(output_dir) / "example3")) stats3 = await exporter3.export_table( session, @@ -484,36 +508,44 @@ async def demonstrate_exports(session): async def verify_parquet_files(): """Verify the exported Parquet files.""" - logger.info("\n=== Verifying Exported Files ===") + logger.info("\n" + "=" * 80) + logger.info("๐Ÿ” VERIFYING EXPORTED PARQUET FILES") + logger.info("=" * 80) export_dir = Path(os.environ.get("EXAMPLE_OUTPUT_DIR", "examples/exampleoutput")) # Look for Parquet files in subdirectories too for parquet_file in export_dir.rglob("*.parquet"): - logger.info(f"\nVerifying {parquet_file.name}:") + logger.info(f"\n๐Ÿ“„ Verifying: {parquet_file.name}") + logger.info("โ”€" * 60) # Read Parquet file metadata parquet_file_obj = pq.ParquetFile(parquet_file) # Display metadata - logger.info(f" Schema: {parquet_file_obj.schema}") - logger.info(f" Num row groups: {parquet_file_obj.num_row_groups}") - logger.info(f" Total rows: {parquet_file_obj.metadata.num_rows}") + logger.info(f" ๐Ÿ“‹ Schema columns: {len(parquet_file_obj.schema)}") + logger.info(f" ๐Ÿ“Š Row groups: {parquet_file_obj.num_row_groups}") + logger.info(f" ๐Ÿ“ˆ Total rows: {parquet_file_obj.metadata.num_rows:,}") logger.info( - f" Compression: {parquet_file_obj.metadata.row_group(0).column(0).compression}" + f" ๐Ÿ—œ๏ธ Compression: 
{parquet_file_obj.metadata.row_group(0).column(0).compression}" ) # Read first few rows table = pq.read_table(parquet_file, columns=None) df = table.to_pandas() - logger.info(f" Columns: {list(df.columns)}") - logger.info(f" Shape: {df.shape}") - logger.info(f" Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB") + logger.info(f" ๐Ÿ“ Dimensions: {df.shape[0]:,} rows ร— {df.shape[1]} columns") + logger.info(f" ๐Ÿ’พ Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB") + logger.info( + f" ๐Ÿท๏ธ Columns: {', '.join(list(df.columns)[:5])}{' ...' if len(df.columns) > 5 else ''}" + ) - # Show sample data - logger.info("\n First 5 rows:") - logger.info(df.head()) + # Show data types + logger.info("\n ๐Ÿ“Š Sample data (first 3 rows):") + for idx, row in df.head(3).iterrows(): + logger.info( + f" Row {idx}: event_type='{row['event_type']}', revenue={row['revenue']}, city='{row['city']}'" + ) async def main(): @@ -537,17 +569,23 @@ async def main(): await verify_parquet_files() # Summary - logger.info("\n=== Export Summary ===") - for stats in export_stats: + logger.info("\n" + "=" * 80) + logger.info("๐Ÿ“Š EXPORT SUMMARY") + logger.info("=" * 80) + logger.info("\n๐ŸŽฏ Three exports completed:") + for i, stats in enumerate(export_stats, 1): logger.info( - f"- {stats['table']}: {stats['total_rows']:,} rows, " - f"{stats['total_mb']} MB, {stats['duration_seconds']}s " - f"({stats['rows_per_second']:,} rows/s)" + f"\n {i}. {stats['compression'].upper()} compression:" + f"\n โ€ข {stats['total_rows']:,} rows exported" + f"\n โ€ข {stats['total_mb']} MB file size" + f"\n โ€ข {stats['duration_seconds']}s duration" + f"\n โ€ข {stats['rows_per_second']:,} rows/sec throughput" ) # Cleanup - logger.info("\nCleaning up...") + logger.info("\n๐Ÿงน Cleaning up...") await session.execute("DROP KEYSPACE analytics") + logger.info("โœ… Keyspace dropped") if __name__ == "__main__": diff --git a/examples/metrics_example.py b/examples/metrics_example.py index b22e8c1..5df67c3 100644 --- a/examples/metrics_example.py +++ b/examples/metrics_example.py @@ -44,17 +44,19 @@ async def run_workload(session, metrics: MetricsMiddleware): """Run a workload and collect metrics.""" - print("\n๐Ÿ“Š Running queries with metrics collection...") + print("\n" + "=" * 80) + print("๐Ÿ“Š RUNNING QUERY WORKLOAD") + print("=" * 80) # Prepare statements for better performance insert_stmt = await session.prepare( - "INSERT INTO users (id, name, email, created_at) VALUES (?, ?, ?, ?)" + "INSERT INTO metrics_demo.users (id, name, email, created_at) VALUES (?, ?, ?, ?)" ) - select_stmt = await session.prepare("SELECT * FROM users WHERE id = ?") + select_stmt = await session.prepare("SELECT * FROM metrics_demo.users WHERE id = ?") # Insert users with metrics tracking user_ids = [] - print("Inserting users...") + print("\n๐Ÿ“ Inserting users...") for i in range(20): user_id = uuid.uuid4() user_ids.append(user_id) @@ -67,24 +69,29 @@ async def run_workload(session, metrics: MetricsMiddleware): ) duration = time.time() - start_time await metrics.record_query_metrics( - "INSERT INTO users", duration, success=True, parameters_count=4 + "INSERT INTO metrics_demo.users", duration, success=True, parameters_count=4 ) except Exception as e: duration = time.time() - start_time await metrics.record_query_metrics( - "INSERT INTO users", duration, success=False, error_type=type(e).__name__ + "INSERT INTO metrics_demo.users", + duration, + success=False, + error_type=type(e).__name__, ) raise + print(f" โœ“ Inserted 
{len(user_ids)} users successfully") + # Select queries - print("Reading users...") + print("\n๐Ÿ” Reading users...") for user_id in user_ids[:10]: start_time = time.time() try: result = await session.execute(select_stmt, [user_id]) duration = time.time() - start_time await metrics.record_query_metrics( - "SELECT * FROM users WHERE id = ?", + "SELECT * FROM metrics_demo.users WHERE id = ?", duration, success=True, parameters_count=1, @@ -93,32 +100,42 @@ async def run_workload(session, metrics: MetricsMiddleware): except Exception as e: duration = time.time() - start_time await metrics.record_query_metrics( - "SELECT * FROM users WHERE id = ?", + "SELECT * FROM metrics_demo.users WHERE id = ?", duration, success=False, error_type=type(e).__name__, ) raise + print(" โœ“ Read 10 users successfully") + # Batch query simulation - print("Running batch query...") + print("\n๐Ÿ“ฆ Running batch query...") start_time = time.time() try: - result = await session.execute("SELECT * FROM users LIMIT 100") + result = await session.execute("SELECT * FROM metrics_demo.users LIMIT 100") rows = list(result) duration = time.time() - start_time await metrics.record_query_metrics( - "SELECT * FROM users LIMIT 100", duration, success=True, result_size=len(rows) + "SELECT * FROM metrics_demo.users LIMIT 100", + duration, + success=True, + result_size=len(rows), ) except Exception as e: duration = time.time() - start_time await metrics.record_query_metrics( - "SELECT * FROM users LIMIT 100", duration, success=False, error_type=type(e).__name__ + "SELECT * FROM metrics_demo.users LIMIT 100", + duration, + success=False, + error_type=type(e).__name__, ) raise + print(f" โœ“ Batch query returned {len(rows)} rows") + # Simulate an error - print("Testing error handling...") + print("\n๐Ÿงช Testing error handling...") try: start_time = time.time() await session.execute("SELECT * FROM non_existent_table") @@ -127,12 +144,15 @@ async def run_workload(session, metrics: MetricsMiddleware): await metrics.record_query_metrics( "SELECT * FROM non_existent_table", duration, success=False, error_type=type(e).__name__ ) - print(f" โŒ Expected error caught: {type(e).__name__}") + print(f" โœ“ Expected error properly tracked: {type(e).__name__}") async def monitor_connections(cluster, metrics: MetricsMiddleware): """Monitor connection health.""" - print("\n๐Ÿฅ Monitoring connection health...") + print("\n" + "=" * 80) + print("๐Ÿฅ CONNECTION HEALTH MONITORING") + print("=" * 80) + print("\n๐Ÿ” Checking cluster nodes...") hosts = cluster._cluster.metadata.all_hosts() for host in hosts: @@ -149,25 +169,26 @@ async def monitor_connections(cluster, metrics: MetricsMiddleware): response_time=response_time, total_queries=30, # Example value ) - print(f" โœ… {host.address}: Healthy (response: {response_time*1000:.1f}ms)") + print(f" โœ… {host.address}: Healthy (response: {response_time*1000:.1f}ms)") except Exception: response_time = time.time() - start_time await metrics.record_connection_metrics( str(host.address), is_healthy=False, response_time=response_time, error_count=1 ) - print(f" โŒ {host.address}: Unhealthy") + print(f" โŒ {host.address}: Unhealthy") async def main(): """Demonstrate comprehensive metrics collection.""" - print("๐Ÿš€ Async-Cassandra Metrics Example") - print("=" * 60) + print("\n" + "=" * 80) + print("๐Ÿš€ ADVANCED ASYNC-CASSANDRA METRICS EXAMPLE") + print("=" * 80) # Get contact points from environment or use localhost contact_points = os.environ.get("CASSANDRA_CONTACT_POINTS", "localhost").split(",") 
port = int(os.environ.get("CASSANDRA_PORT", "9042")) - print(f"\nConnecting to Cassandra at {contact_points}:{port}") + print(f"\n๐Ÿ“ก Connecting to Cassandra at {contact_points}:{port}") # 1. Set up metrics collectors print("\n๐Ÿ”ง Setting up metrics system...") @@ -181,11 +202,11 @@ async def main(): prometheus_collector = PrometheusMetricsCollector() if prometheus_collector._available: collectors.append(prometheus_collector) - print(" โœ… Prometheus metrics enabled") + print(" โœ… Prometheus metrics enabled") else: - print(" โ„น๏ธ Prometheus client not available") + print(" โ„น๏ธ Prometheus client not available (pip install prometheus_client)") except Exception: - print(" โ„น๏ธ Prometheus client not available") + print(" โ„น๏ธ Prometheus client not available (pip install prometheus_client)") # Create metrics middleware metrics = MetricsMiddleware(collectors) @@ -194,7 +215,7 @@ async def main(): async with AsyncCluster(contact_points=contact_points, port=port) as cluster: async with await cluster.connect() as session: # Set up test environment - print("\n๐Ÿ“ฆ Setting up test database...") + print("\n๐Ÿ› ๏ธ Setting up test database...") await session.execute( """ CREATE KEYSPACE IF NOT EXISTS metrics_demo @@ -205,11 +226,9 @@ async def main(): """ ) - await session.set_keyspace("metrics_demo") - await session.execute( """ - CREATE TABLE IF NOT EXISTS users ( + CREATE TABLE IF NOT EXISTS metrics_demo.users ( id UUID PRIMARY KEY, name TEXT, email TEXT, @@ -225,8 +244,9 @@ async def main(): await monitor_connections(cluster, metrics) # 3. Display collected metrics - print("\n๐Ÿ“ˆ Metrics Summary") - print("=" * 60) + print("\n" + "=" * 80) + print("๐Ÿ“ˆ METRICS ANALYSIS RESULTS") + print("=" * 80) # Get stats from memory collector stats = await memory_collector.get_stats() @@ -234,63 +254,78 @@ async def main(): if "query_performance" in stats: perf = stats["query_performance"] if isinstance(perf, dict) and "total_queries" in perf: - print("\n๐Ÿ“Š Query Performance:") - print(f" Total Queries: {perf['total_queries']}") - print(f" Recent Queries (5min): {perf.get('recent_queries_5min', 0)}") - print(f" Success Rate: {perf.get('success_rate', 0)*100:.1f}%") - print(f" Average Duration: {perf.get('avg_duration_ms', 0):.2f}ms") - print(f" Min Duration: {perf.get('min_duration_ms', 0):.2f}ms") - print(f" Max Duration: {perf.get('max_duration_ms', 0):.2f}ms") - print(f" Queries/Second: {perf.get('queries_per_second', 0):.2f}") + print("\n๐Ÿ“Š Query Performance Metrics:") + print(f" โ€ข Total Queries: {perf['total_queries']}") + print(f" โ€ข Recent Queries (5min): {perf.get('recent_queries_5min', 0)}") + print(f" โ€ข Success Rate: {perf.get('success_rate', 0)*100:.1f}%") + print(f" โ€ข Average Duration: {perf.get('avg_duration_ms', 0):.2f}ms") + print(f" โ€ข Min Duration: {perf.get('min_duration_ms', 0):.2f}ms") + print(f" โ€ข Max Duration: {perf.get('max_duration_ms', 0):.2f}ms") + print(f" โ€ข Queries/Second: {perf.get('queries_per_second', 0):.2f}") if "error_summary" in stats and stats["error_summary"]: print("\nโŒ Error Summary:") for error_type, count in stats["error_summary"].items(): - print(f" {error_type}: {count}") + print(f" โ€ข {error_type}: {count} occurrences") if "top_queries" in stats and stats["top_queries"]: print("\n๐Ÿ”ฅ Top Queries by Frequency:") - for query_hash, count in list(stats["top_queries"].items())[:5]: - print(f" Query {query_hash}: {count} executions") + for i, (query_hash, count) in enumerate(list(stats["top_queries"].items())[:5], 1): + print(f" {i}. 
Query {query_hash}: {count} executions") if "connection_health" in stats and stats["connection_health"]: - print("\n๐Ÿ”— Connection Health:") + print("\n๐Ÿ”— Connection Health Details:") for host, health in stats["connection_health"].items(): - status = "UP" if health["healthy"] else "DOWN" - print(f" {host}: {status}") - print(f" Response Time: {health['response_time_ms']:.2f}ms") - print(f" Total Queries: {health.get('total_queries', 0)}") - print(f" Error Count: {health.get('error_count', 0)}") + status = "โœ… UP" if health["healthy"] else "โŒ DOWN" + print(f"\n ๐Ÿ“ก {host}: {status}") + print(f" โ€ข Response Time: {health['response_time_ms']:.2f}ms") + print(f" โ€ข Total Queries: {health.get('total_queries', 0)}") + print(f" โ€ข Error Count: {health.get('error_count', 0)}") # 4. Show optimization tips based on metrics - print("\n๐Ÿ’ก Performance Insights:") + print("\n" + "=" * 80) + print("๐Ÿ’ก PERFORMANCE INSIGHTS & RECOMMENDATIONS") + print("=" * 80) + if "query_performance" in stats and isinstance(stats["query_performance"], dict): perf = stats["query_performance"] avg_duration = perf.get("avg_duration_ms", 0) + + print("\n๐ŸŽฏ Query Performance Analysis:") if avg_duration > 10: - print(" โš ๏ธ Average query duration is high. Consider:") - print(" - Using prepared statements") - print(" - Adding appropriate indexes") - print(" - Reviewing data model") + print(" โš ๏ธ Average query duration is high ({:.2f}ms)".format(avg_duration)) + print(" ๐Ÿ“Œ Recommendations:") + print(" โ€ข Use prepared statements for repeated queries") + print(" โ€ข Add appropriate secondary indexes") + print(" โ€ข Review your data model for optimization") + print(" โ€ข Consider partitioning strategy") elif avg_duration < 1: - print(" โœ… Excellent query performance!") + print( + " โœ… Excellent query performance! ({:.2f}ms average)".format(avg_duration) + ) else: - print(" โœ… Good query performance") + print(" โœ… Good query performance ({:.2f}ms average)".format(avg_duration)) success_rate = perf.get("success_rate", 1) if success_rate < 0.95: - print(f" โš ๏ธ Success rate is {success_rate*100:.1f}%. Check error logs.") + print(f"\n โš ๏ธ Success rate is {success_rate*100:.1f}%") + print(" ๐Ÿ“Œ Action required: Check error logs for failure patterns") # Cleanup print("\n๐Ÿงน Cleaning up...") await session.execute("DROP KEYSPACE IF EXISTS metrics_demo") - - print("\nโœ… Example complete!") - print("\nNext steps:") - print("- Install prometheus_client for production metrics") - print("- Integrate with your monitoring dashboard") - print("- Set up alerts based on thresholds") - print("- Use metrics to optimize slow queries") + print("โœ… Keyspace dropped") + + print("\n" + "=" * 80) + print("โœ… EXAMPLE COMPLETE!") + print("=" * 80) + print("\n๐Ÿš€ Next Steps for Production:") + print(" 1. Install prometheus_client for production metrics") + print(" 2. Integrate with your monitoring dashboard (Grafana, etc.)") + print(" 3. Set up alerts based on performance thresholds") + print(" 4. Use metrics to identify and optimize slow queries") + print(" 5. 
Monitor connection health continuously") + print("\n๐Ÿ’ก Pro Tip: Export metrics to /metrics endpoint for Prometheus scraping!") if __name__ == "__main__": diff --git a/examples/metrics_simple.py b/examples/metrics_simple.py index 4771af2..4f5aee8 100644 --- a/examples/metrics_simple.py +++ b/examples/metrics_simple.py @@ -35,13 +35,15 @@ async def main(): """Run basic metrics example.""" - print("๐Ÿš€ async-cassandra Metrics Example\n") + print("\n" + "=" * 80) + print("๐Ÿš€ ASYNC-CASSANDRA METRICS COLLECTION EXAMPLE") + print("=" * 80) # Get contact points from environment or use localhost contact_points = os.environ.get("CASSANDRA_CONTACT_POINTS", "localhost").split(",") port = int(os.environ.get("CASSANDRA_PORT", "9042")) - print(f"Connecting to Cassandra at {contact_points}:{port}\n") + print(f"\n๐Ÿ“ก Connecting to Cassandra at {contact_points}:{port}") # Create metrics collector collector = InMemoryMetricsCollector(max_entries=1000) @@ -55,7 +57,7 @@ async def main(): async with await cluster.connect() as session: # Set up test keyspace - print("Setting up test database...") + print("\n๐Ÿ› ๏ธ Setting up test database...") await session.execute( """ CREATE KEYSPACE IF NOT EXISTS metrics_demo @@ -66,12 +68,10 @@ async def main(): """ ) - await session.set_keyspace("metrics_demo") - # Create test table await session.execute( """ - CREATE TABLE IF NOT EXISTS users ( + CREATE TABLE IF NOT EXISTS metrics_demo.users ( id UUID PRIMARY KEY, name TEXT, email TEXT, @@ -80,17 +80,19 @@ async def main(): """ ) - print("โœ… Database ready\n") + print("โœ… Database ready!") # Execute some queries and collect metrics - print("\n=== Query Metrics Example ===") - print("Executing queries...") + print("\n" + "=" * 80) + print("๐Ÿ“Š QUERY METRICS COLLECTION") + print("=" * 80) + print("\n๐Ÿ”„ Executing queries with metrics tracking...") # Prepare statements insert_stmt = await session.prepare( - "INSERT INTO users (id, name, email, created_at) VALUES (?, ?, ?, ?)" + "INSERT INTO metrics_demo.users (id, name, email, created_at) VALUES (?, ?, ?, ?)" ) - select_stmt = await session.prepare("SELECT * FROM users WHERE id = ?") + select_stmt = await session.prepare("SELECT * FROM metrics_demo.users WHERE id = ?") # Insert some users with metrics tracking user_ids = [] @@ -106,16 +108,19 @@ async def main(): ) duration = time.time() - start_time await metrics_middleware.record_query_metrics( - "INSERT INTO users", duration, success=True, parameters_count=4 + "INSERT INTO metrics_demo.users", duration, success=True, parameters_count=4 ) except Exception as e: duration = time.time() - start_time await metrics_middleware.record_query_metrics( - "INSERT INTO users", duration, success=False, error_type=type(e).__name__ + "INSERT INTO metrics_demo.users", + duration, + success=False, + error_type=type(e).__name__, ) raise - print(f"โœ… Inserted {len(user_ids)} users") + print(f"\nโœ… Inserted {len(user_ids)} users successfully") # Select users with metrics for user_id in user_ids[:5]: @@ -125,18 +130,18 @@ async def main(): user = result.one() duration = time.time() - start_time await metrics_middleware.record_query_metrics( - "SELECT * FROM users WHERE id = ?", + "SELECT * FROM metrics_demo.users WHERE id = ?", duration, success=True, parameters_count=1, result_size=1 if user else 0, ) if user: - print(f" Found user: {user.name}") + print(f" โ€ข Found: {user.name}") except Exception as e: duration = time.time() - start_time await metrics_middleware.record_query_metrics( - "SELECT * FROM users WHERE id = ?", + 
"SELECT * FROM metrics_demo.users WHERE id = ?", duration, success=False, error_type=type(e).__name__, @@ -144,7 +149,10 @@ async def main(): raise # Execute a failing query - print("\n=== Error Tracking Example ===") + print("\n" + "=" * 80) + print("โŒ ERROR TRACKING DEMONSTRATION") + print("=" * 80) + print("\n๐Ÿงช Testing error metrics collection...") try: start_time = time.time() await session.execute("SELECT * FROM non_existent_table") @@ -156,10 +164,13 @@ async def main(): success=False, error_type=type(e).__name__, ) - print(f" โŒ Expected error recorded: {type(e).__name__}") + print(f" โœ“ Expected error captured: {type(e).__name__}") # Connection health monitoring - print("\n=== Connection Health Monitoring ===") + print("\n" + "=" * 80) + print("๐Ÿ”— CONNECTION HEALTH MONITORING") + print("=" * 80) + print("\n๐Ÿฅ Checking cluster health...") # Record connection health metrics hosts = cluster._cluster.metadata.all_hosts() @@ -175,7 +186,7 @@ async def main(): response_time=response_time, total_queries=20, # Example value ) - print(f"โœ… {host.address}: UP (response time: {response_time*1000:.1f}ms)") + print(f" โœ… {host.address}: UP (response time: {response_time*1000:.1f}ms)") except Exception: response_time = time.time() - start_time await metrics_middleware.record_connection_metrics( @@ -184,45 +195,55 @@ async def main(): response_time=response_time, error_count=1, ) - print(f"โŒ {host.address}: DOWN") + print(f" โŒ {host.address}: DOWN") # Get and display metrics summary - print("\n=== Performance Summary ===") + print("\n" + "=" * 80) + print("๐Ÿ“Š PERFORMANCE METRICS SUMMARY") + print("=" * 80) stats = await collector.get_stats() if "query_performance" in stats: perf = stats["query_performance"] if "total_queries" in perf: - print("\n๐Ÿ“Š Query Metrics:") - print(f" Total queries: {perf['total_queries']}") - print(f" Recent queries (5min): {perf.get('recent_queries_5min', 0)}") - print(f" Success rate: {perf.get('success_rate', 0)*100:.1f}%") - print(f" Average latency: {perf.get('avg_duration_ms', 0):.1f}ms") - print(f" Min latency: {perf.get('min_duration_ms', 0):.1f}ms") - print(f" Max latency: {perf.get('max_duration_ms', 0):.1f}ms") - print(f" Queries/second: {perf.get('queries_per_second', 0):.2f}") + print("\n๐Ÿ“ˆ Query Performance:") + print(f" โ€ข Total queries: {perf['total_queries']}") + print(f" โ€ข Recent queries (5min): {perf.get('recent_queries_5min', 0)}") + print(f" โ€ข Success rate: {perf.get('success_rate', 0)*100:.1f}%") + print(f" โ€ข Average latency: {perf.get('avg_duration_ms', 0):.1f}ms") + print(f" โ€ข Min latency: {perf.get('min_duration_ms', 0):.1f}ms") + print(f" โ€ข Max latency: {perf.get('max_duration_ms', 0):.1f}ms") + print(f" โ€ข Queries/second: {perf.get('queries_per_second', 0):.2f}") if "error_summary" in stats and stats["error_summary"]: print("\nโŒ Error Summary:") for error_type, count in stats["error_summary"].items(): - print(f" {error_type}: {count}") + print(f" โ€ข {error_type}: {count} occurrences") if "top_queries" in stats and stats["top_queries"]: - print("\n๐Ÿ”ฅ Top Queries:") - for query_hash, count in list(stats["top_queries"].items())[:5]: - print(f" Query {query_hash}: {count} executions") + print("\n๐Ÿ”ฅ Top Queries by Frequency:") + for i, (query_hash, count) in enumerate(list(stats["top_queries"].items())[:5], 1): + print(f" {i}. 
Query {query_hash}: {count} executions") if "connection_health" in stats: - print("\n๐Ÿ”— Connection Health:") + print("\n๐Ÿ”— Connection Health Status:") for host, health in stats["connection_health"].items(): - status = "UP" if health["healthy"] else "DOWN" - print(f" {host}: {status} (response: {health['response_time_ms']:.1f}ms)") + status = "โœ… UP" if health["healthy"] else "โŒ DOWN" + print(f" โ€ข {host}: {status} (response: {health['response_time_ms']:.1f}ms)") # Clean up - print("\nCleaning up...") + print("\n๐Ÿงน Cleaning up...") await session.execute("DROP KEYSPACE metrics_demo") - - print("\nโœ… Example complete!") + print("โœ… Keyspace dropped") + + print("\n" + "=" * 80) + print("โœ… METRICS EXAMPLE COMPLETE!") + print("=" * 80) + print("\n๐Ÿ’ก This example demonstrated:") + print(" โ€ข Query performance tracking") + print(" โ€ข Error rate monitoring") + print(" โ€ข Connection health checks") + print(" โ€ข Metrics aggregation and reporting") if __name__ == "__main__": diff --git a/examples/realtime_processing.py b/examples/realtime_processing.py index 7b68871..758fdcb 100644 --- a/examples/realtime_processing.py +++ b/examples/realtime_processing.py @@ -115,7 +115,7 @@ def _check_alerts(self, reading: SensorReading): if reading.temperature > 35.0 or reading.temperature < -10.0: self.alerts_triggered += 1 logger.warning( - f"ALERT: Sensor {reading.sensor_id} temperature out of range: " + f"๐Ÿšจ ALERT: Sensor {reading.sensor_id} temperature out of range: " f"{reading.temperature}ยฐC" ) @@ -123,7 +123,7 @@ def _check_alerts(self, reading: SensorReading): if reading.humidity > 90.0: self.alerts_triggered += 1 logger.warning( - f"ALERT: Sensor {reading.sensor_id} high humidity: " f"{reading.humidity}%" + f"๐Ÿšจ ALERT: Sensor {reading.sensor_id} high humidity: {reading.humidity}%" ) def get_summary(self) -> Dict: @@ -149,7 +149,7 @@ def get_summary(self) -> Dict: async def setup_sensor_data(session): """Create sensor data table and insert sample data.""" - logger.info("Setting up sensor data...") + logger.info("\n๐Ÿ› ๏ธ Setting up IoT sensor data simulation...") # Create keyspace await session.execute( @@ -162,12 +162,10 @@ async def setup_sensor_data(session): """ ) - await session.set_keyspace("iot_data") - # Create time-series table await session.execute( """ - CREATE TABLE IF NOT EXISTS sensor_readings ( + CREATE TABLE IF NOT EXISTS iot_data.sensor_readings ( date date, sensor_id text, timestamp timestamp, @@ -182,7 +180,7 @@ async def setup_sensor_data(session): # Insert sample data for the last hour insert_stmt = await session.prepare( """ - INSERT INTO sensor_readings ( + INSERT INTO iot_data.sensor_readings ( date, sensor_id, timestamp, temperature, humidity, pressure ) VALUES (?, ?, ?, ?, ?, ?) 
""" @@ -192,7 +190,8 @@ async def setup_sensor_data(session): sensors = [f"sensor_{i:03d}" for i in range(50)] base_time = datetime.now() - timedelta(hours=6) # 6 hours of data - logger.info("Inserting sample sensor data...") + logger.info(f"๐Ÿ“ก Generating data for {len(sensors)} sensors over 6 hours...") + logger.info("๐ŸŒก๏ธ Data includes temperature, humidity, and pressure readings") tasks = [] total_readings = 0 @@ -228,18 +227,24 @@ async def setup_sensor_data(session): await asyncio.gather(*tasks) tasks = [] if total_readings % 10000 == 0: - logger.info(f"Inserted {total_readings:,} readings...") + logger.info(f" ๐Ÿ“Š Progress: {total_readings:,} readings inserted...") # Execute remaining tasks if tasks: await asyncio.gather(*tasks) - logger.info(f"Sample data inserted: {total_readings:,} total readings") + logger.info(f"โœ… Sample data setup complete: {total_readings:,} sensor readings created") + logger.info(f" โ€ข {len(sensors)} sensors") + logger.info(" โ€ข 6 hours of historical data") + logger.info(" โ€ข 10-second intervals") + logger.info(" โ€ข Includes simulated anomalies for alert testing") async def process_historical_data(session, processor: RealTimeProcessor): """Process historical data using streaming.""" - logger.info("\n=== Processing Historical Data ===") + logger.info("\n" + "=" * 80) + logger.info("๐Ÿ“ˆ PROCESSING HISTORICAL DATA WITH STREAMING") + logger.info("=" * 80) # Query last 6 hours of data six_hours_ago = datetime.now() - timedelta(hours=6) @@ -248,7 +253,7 @@ async def process_historical_data(session, processor: RealTimeProcessor): # Prepare query for specific date partition stmt = await session.prepare( """ - SELECT * FROM sensor_readings + SELECT * FROM iot_data.sensor_readings WHERE date = ? AND timestamp > ? ALLOW FILTERING @@ -258,7 +263,7 @@ async def process_historical_data(session, processor: RealTimeProcessor): # Configure streaming with appropriate page size for True Async Paging config = StreamConfig( fetch_size=5000, # Process 5000 rows per page - page_callback=lambda p, t: logger.info(f"Processing page {p} ({t:,} readings)"), + page_callback=lambda p, t: logger.info(f"๐Ÿ“„ Processing page {p} ({t:,} readings so far)"), ) # Stream and process data @@ -284,27 +289,34 @@ async def process_historical_data(session, processor: RealTimeProcessor): if readings_processed % 10000 == 0: summary = processor.get_summary() logger.info( - f"Progress: {readings_processed:,} readings - " - f"{summary['active_sensors']} sensors - " - f"{summary['alerts_triggered']} alerts" + f"๐Ÿ“Š Progress: {readings_processed:,} readings โ€ข " + f"{summary['active_sensors']} sensors โ€ข " + f"{summary['alerts_triggered']} alerts triggered" ) elapsed = (datetime.now() - start_time).total_seconds() - logger.info(f"\nProcessing completed in {elapsed:.2f} seconds") - logger.info( - f"Processed {readings_processed:,} readings " - f"({readings_processed/elapsed:,.0f} readings/sec)" - ) + + logger.info("\n" + "โ”€" * 80) + logger.info("โœ… HISTORICAL DATA PROCESSING COMPLETE") + logger.info("โ”€" * 80) + logger.info("\n๐Ÿ“Š Processing Statistics:") + logger.info(f" โ€ข Total readings: {readings_processed:,}") + logger.info(f" โ€ข Duration: {elapsed:.2f} seconds") + logger.info(f" โ€ข Throughput: {readings_processed/elapsed:,.0f} readings/sec") + logger.info(f" โ€ข Active sensors: {processor.get_summary()['active_sensors']}") + logger.info(f" โ€ข Alerts triggered: {processor.alerts_triggered}") async def process_data_in_pages(session): """Demonstrate True Async Paging for batch 
processing.""" - logger.info("\n=== True Async Paging Example ===") + logger.info("\n" + "=" * 80) + logger.info("๐Ÿ“ฆ TRUE ASYNC PAGING - BATCH PROCESSING DEMONSTRATION") + logger.info("=" * 80) # Query all data for batch processing stmt = await session.prepare( """ - SELECT * FROM sensor_readings + SELECT * FROM iot_data.sensor_readings WHERE date = ? ALLOW FILTERING """ @@ -317,7 +329,8 @@ async def process_data_in_pages(session): total_readings = 0 sensor_data = defaultdict(list) - logger.info("Processing sensor data in pages...") + logger.info("\n๐Ÿ”„ Processing sensor data in pages...") + logger.info("๐Ÿ’ก Note: Each page is fetched ONLY when needed (True Async Paging)") # Use True Async Paging to process large dataset efficiently async with await session.execute_stream( @@ -328,7 +341,7 @@ async def process_data_in_pages(session): readings_in_page = len(page) total_readings += readings_in_page - logger.info(f"Processing page {page_count} with {readings_in_page:,} readings") + logger.info(f"\n๐Ÿ“„ Page {page_count}: {readings_in_page:,} readings") # Process each page (e.g., aggregate by sensor) for row in page: @@ -347,24 +360,30 @@ async def process_data_in_pages(session): # Log memory-efficient processing if page_count % 5 == 0: logger.info( - f" Processed {total_readings:,} total readings across {len(sensor_data)} sensors" + f" โœ“ Progress: {total_readings:,} total readings โ€ข {len(sensor_data)} sensors" ) - logger.info("\nPage-based processing completed:") - logger.info(f" - Total pages: {page_count}") - logger.info(f" - Total readings: {total_readings:,}") - logger.info(f" - Unique sensors: {len(sensor_data)}") - logger.info(" - Memory usage remains constant due to page-by-page processing!") + logger.info("\n" + "โ”€" * 80) + logger.info("โœ… PAGE-BASED PROCESSING COMPLETE") + logger.info("โ”€" * 80) + logger.info("\n๐Ÿ“Š Results:") + logger.info(f" โ€ข Total pages: {page_count}") + logger.info(f" โ€ข Total readings: {total_readings:,}") + logger.info(f" โ€ข Unique sensors: {len(sensor_data)}") + logger.info("\n๐Ÿ’ก Key benefit: Memory usage remains constant regardless of dataset size!") + logger.info(" Pages are fetched on-demand as you process them.") async def simulate_realtime_processing(session, processor: RealTimeProcessor): """Simulate real-time data processing.""" - logger.info("\n=== Simulating Real-Time Processing ===") + logger.info("\n" + "=" * 80) + logger.info("๐Ÿ”ด SIMULATING REAL-TIME PROCESSING") + logger.info("=" * 80) # Prepare query for recent data stmt = await session.prepare( """ - SELECT * FROM sensor_readings + SELECT * FROM iot_data.sensor_readings WHERE date = ? AND sensor_id = ? AND timestamp > ? 
@@ -377,7 +396,7 @@ async def simulate_realtime_processing(session, processor: RealTimeProcessor):
    iterations = 10

    for i in range(iterations):
-        logger.info(f"\nProcessing cycle {i+1}/{iterations}")
+        logger.info(f"\n🔄 Processing cycle {i+1}/{iterations}")

        # Query recent data for each sensor
        cutoff_time = datetime.now() - timedelta(minutes=processor.window_minutes)
@@ -402,7 +421,11 @@ async def simulate_realtime_processing(session, processor: RealTimeProcessor):
        # Show current statistics
        summary = processor.get_summary()
-        logger.info(f"Current state: {summary}")
+        logger.info(
+            f"📊 Current state: {summary['active_sensors']} sensors • "
+            f"{summary['total_readings']} readings • "
+            f"{summary['alerts_triggered']} alerts"
+        )

        # Show sensor details
        for sensor_id, stats in processor.sensor_stats.items():
@@ -444,19 +467,25 @@ async def main():
        # Show final summary
        summary = processor.get_summary()
-        logger.info("\nFinal Summary:")
-        logger.info(f"- Active sensors: {summary['active_sensors']}")
-        logger.info(f"- Total readings: {summary['total_readings']}")
-        logger.info(f"- Alerts triggered: {summary['alerts_triggered']}")
-        logger.info(f"- Avg temperature: {summary['avg_temperature']}°C")
-        logger.info(f"- Avg humidity: {summary['avg_humidity']}%")
+        logger.info("\n" + "=" * 80)
+        logger.info("📊 FINAL ANALYTICS SUMMARY")
+        logger.info("=" * 80)
+        logger.info("\n🌡️ Environmental Metrics:")
+        logger.info(f"   • Active sensors: {summary['active_sensors']}")
+        logger.info(f"   • Total readings processed: {summary['total_readings']:,}")
+        logger.info(f"   • Average temperature: {summary['avg_temperature']}°C")
+        logger.info(f"   • Average humidity: {summary['avg_humidity']}%")
+        logger.info("\n🚨 Alert Summary:")
+        logger.info(f"   • Total alerts triggered: {summary['alerts_triggered']}")
+        logger.info("   • Alert types: Temperature anomalies, High humidity")

        # Simulate real-time processing
        await simulate_realtime_processing(session, processor)

        # Cleanup
-        logger.info("\nCleaning up...")
+        logger.info("\n🧹 Cleaning up...")
        await session.execute("DROP KEYSPACE iot_data")
+        logger.info("✅ Keyspace dropped")


if __name__ == "__main__":
diff --git a/examples/streaming_basic.py b/examples/streaming_basic.py
index 03f78da..dd829da 100644
--- a/examples/streaming_basic.py
+++ b/examples/streaming_basic.py
@@ -52,12 +52,10 @@ async def setup_test_data(session):
        """
    )

-    await session.set_keyspace("streaming_example")
-
    # Create table
    await session.execute(
        """
-        CREATE TABLE IF NOT EXISTS events (
+        CREATE TABLE IF NOT EXISTS streaming_example.events (
            partition_id int,
            event_id int,
            event_time timestamp,
@@ -72,7 +70,7 @@ async def setup_test_data(session):
    logger.info("Inserting test data...")
    insert_stmt = await session.prepare(
        """
-        INSERT INTO events (partition_id, event_id, event_time, event_type, data)
+        INSERT INTO streaming_example.events (partition_id, event_id, event_time, event_type, data)
        VALUES (?, ?, ?, ?, ?)
""" ) @@ -112,7 +110,9 @@ async def setup_test_data(session): async def basic_streaming_example(session): """Demonstrate basic streaming.""" - logger.info("\n=== Basic Streaming Example ===") + logger.info("\n" + "=" * 80) + logger.info("BASIC STREAMING EXAMPLE") + logger.info("=" * 80) # Configure streaming with smaller page size to demonstrate True Async Paging # IMPORTANT: The driver fetches pages on-demand, not all at once @@ -131,7 +131,9 @@ async def basic_streaming_example(session): start_time = datetime.now() # CRITICAL: Always use context manager to prevent memory leaks - async with await session.execute_stream("SELECT * FROM events", stream_config=config) as result: + async with await session.execute_stream( + "SELECT * FROM streaming_example.events", stream_config=config + ) as result: # Process rows one at a time event_count = 0 event_types = {} @@ -150,22 +152,25 @@ async def basic_streaming_example(session): logger.info(f"Processed {event_count:,} events ({rate:,.0f} events/sec)") elapsed = (datetime.now() - start_time).total_seconds() - logger.info("\nStreaming completed:") - logger.info(f"- Total events: {event_count:,}") - logger.info(f"- Time elapsed: {elapsed:.2f} seconds") - logger.info(f"- Rate: {event_count/elapsed:,.0f} events/sec") - logger.info(f"- Event types: {event_types}") + logger.info("\nโœ… Streaming completed!") + logger.info("๐Ÿ“Š Statistics:") + logger.info(f" โ€ข Total events: {event_count:,}") + logger.info(f" โ€ข Time elapsed: {elapsed:.2f} seconds") + logger.info(f" โ€ข Processing rate: {event_count/elapsed:,.0f} events/sec") + logger.info(f" โ€ข Event types distribution: {event_types}") async def filtered_streaming_example(session): """Demonstrate streaming with WHERE clause.""" - logger.info("\n=== Filtered Streaming Example ===") + logger.info("\n" + "=" * 80) + logger.info("FILTERED STREAMING EXAMPLE") + logger.info("=" * 80) # Prepare a filtered query # Note: event_type is not part of primary key, so we need ALLOW FILTERING stmt = await session.prepare( """ - SELECT * FROM events + SELECT * FROM streaming_example.events WHERE partition_id = ? 
        ALLOW FILTERING
        """
    )
@@ -187,13 +192,19 @@ async def filtered_streaming_example(session):
            event_type = row.event_type
            type_counts[event_type] = type_counts.get(event_type, 0) + 1

-    logger.info(f"Found {count} events in partition {partition_id}")
-    logger.info(f"Event types in partition: {type_counts}")
+    logger.info("\n✅ Filtered streaming completed!")
+    logger.info(f"📊 Results for partition {partition_id}:")
+    logger.info(f"   • Total events: {count}")
+    logger.info(f"   • Event type breakdown: {type_counts}")


async def page_based_streaming_example(session):
    """Demonstrate True Async Paging with page-by-page processing."""
-    logger.info("\n=== Page-Based Streaming Example (True Async Paging) ===")
+    logger.info("\n" + "=" * 80)
+    logger.info("PAGE-BASED STREAMING EXAMPLE (True Async Paging)")
+    logger.info("=" * 80)
+    logger.info("\n💡 Key Insight: Pages are fetched ON-DEMAND as you process them!")
+    logger.info("   The driver fetches the next page WHILE you process the current one.\n")

    # Page Size Recommendations:
    # - Smaller pages (1000-5000): Better for memory, responsiveness, real-time processing
@@ -204,7 +215,9 @@ async def page_based_streaming_example(session):
    config = StreamConfig(fetch_size=7500)  # Will result in ~13-14 pages

    # Use context manager for automatic resource cleanup
-    async with await session.execute_stream("SELECT * FROM events", stream_config=config) as result:
+    async with await session.execute_stream(
+        "SELECT * FROM streaming_example.events", stream_config=config
+    ) as result:
        # Process data page by page using True Async Paging
        page_count = 0
        total_events = 0
@@ -217,7 +230,7 @@ async def page_based_streaming_example(session):
            events_in_page = len(page)
            total_events += events_in_page

-            logger.info(f"Processing page {page_count} with {events_in_page:,} events")
+            logger.info(f"📄 Processing page {page_count} ({events_in_page:,} events)...")

            # Simulate batch processing (e.g., writing to another system)
            # In real scenarios, this could be bulk writes to S3, another DB, etc.
@@ -231,16 +244,16 @@ async def page_based_streaming_example(session):
            page_time = (datetime.now() - page_start).total_seconds()
            processing_times.append(page_time)

-            logger.info(
-                f"  Page {page_count} processed in {page_time:.3f}s - Event types: {event_types}"
-            )
+            logger.info(f"  ✓ Page {page_count} done in {page_time:.3f}s | Types: {event_types}")

    avg_page_time = sum(processing_times) / len(processing_times) if processing_times else 0
-    logger.info("\nCompleted True Async Paging:")
-    logger.info(f"  - Total pages: {page_count}")
-    logger.info(f"  - Total events: {total_events:,}")
-    logger.info(f"  - Average page processing time: {avg_page_time:.3f}s")
-    logger.info("  - Pages are fetched asynchronously while previous pages are being processed!")
+    logger.info("\n✅ Page-based streaming completed!")
+    logger.info("📊 Statistics:")
+    logger.info(f"   • Total pages processed: {page_count}")
+    logger.info(f"   • Total events: {total_events:,}")
+    logger.info(f"   • Average page processing time: {avg_page_time:.3f}s")
+    logger.info("\n🚀 Performance Note: Pages were fetched asynchronously!")
+    logger.info("   While you processed each page, the driver was already fetching the next one.")


async def main():
diff --git a/examples/streaming_non_blocking_demo.py b/examples/streaming_non_blocking_demo.py
index 17cccc4..58a185d 100644
--- a/examples/streaming_non_blocking_demo.py
+++ b/examples/streaming_non_blocking_demo.py
@@ -94,11 +94,9 @@ async def setup_demo_data(session):
        """
    )

-    await session.set_keyspace("streaming_demo")
-
    await session.execute(
        """
-        CREATE TABLE IF NOT EXISTS sensor_data (
+        CREATE TABLE IF NOT EXISTS streaming_demo.sensor_data (
            sensor_id TEXT,
            reading_time TIMESTAMP,
            temperature DOUBLE,
            humidity DOUBLE,
            location TEXT,
@@ -112,7 +110,7 @@ async def setup_demo_data(session):
    # Insert data - enough for multiple pages
    insert_stmt = await session.prepare(
        """
-        INSERT INTO sensor_data (sensor_id, reading_time, temperature, humidity, location)
+        INSERT INTO streaming_demo.sensor_data (sensor_id, reading_time, temperature, humidity, location)
        VALUES (?, ?, ?, ?, ?)
        """
    )
@@ -177,7 +175,7 @@ def page_callback(page_num: int, rows_in_page: int):
    print("🔄 Starting to stream sensor data...\n")

    async with await session.execute_stream(
-        "SELECT * FROM sensor_data", stream_config=config
+        "SELECT * FROM streaming_demo.sensor_data", stream_config=config
    ) as result:
        async for row in result:
            rows_processed += 1
@@ -213,7 +211,9 @@ async def demonstrate_concurrent_operations(session):
    print("\n\n🎯 Demonstrating concurrent operations during streaming...")

    # Prepare queries
-    count_stmt = await session.prepare("SELECT COUNT(*) FROM sensor_data WHERE sensor_id = ?")
+    count_stmt = await session.prepare(
+        "SELECT COUNT(*) FROM streaming_demo.sensor_data WHERE sensor_id = ?"
+    )

    concurrent_results = []
@@ -236,7 +236,7 @@ async def stream_data():
        rows = 0

        async with await session.execute_stream(
-            "SELECT * FROM sensor_data", stream_config=config
+            "SELECT * FROM streaming_demo.sensor_data", stream_config=config
        ) as result:
            async for row in result:
                rows += 1