Extract structured carrier participation data from insurance tower schematic Excel files.
- Extract carrier data from Excel tower schematics (layer limits, participation %, premiums)
- Preflight analysis to assess extraction viability before processing
- AI-powered verification using Google Gemini to validate extraction accuracy
- CLI tools for batch processing and verification
pip install schematic-explorer

For development:
git clone https://github.com/arqu-co/schematic-explorer.git
cd schematic-explorer
pip install -e ".[dev]"from schematic_explorer import extract_schematic
# Extract carrier data from an Excel tower schematic
entries = extract_schematic("tower.xlsx")
for entry in entries:
print(f"{entry['carrier']}: {entry['participation_pct']:.1%} - ${entry['premium']:,.0f}")Extract carrier entries from an Excel file.
from schematic_explorer import extract_schematic
entries = extract_schematic("tower.xlsx")
# Returns list of dicts with carrier data

Returns a list of dictionaries with:
- carrier - Carrier name
- layer_limit - Layer limit (e.g., "$50M")
- layer_description - Layer description text
- participation_pct - Participation percentage (0.0-1.0)
- premium - Premium amount
- premium_share - Premium share percentage
- terms - Terms/conditions text
- policy_number - Policy number if found
- excel_range - Source cell reference
- col_span, row_span - Cell dimensions
- fill_color - Cell background color
- attachment_point - Attachment point if found
Extract carrier entries along with layer-level summaries.
from schematic_explorer import extract_schematic_with_summaries
entries, layer_summaries = extract_schematic_with_summaries("tower.xlsx")
for summary in layer_summaries:
print(f"Layer {summary['layer_limit']}: ${summary['layer_bound_premium']:,.0f}")Analyze an Excel file before extraction to assess viability and confidence.
from schematic_explorer import preflight
result = preflight("tower.xlsx")
if result.can_extract:
print(f"Ready to extract with {result.confidence:.0%} confidence")
print(f"Found {result.layers_found} layers, {result.carriers_found} carriers")
else:
print("Issues detected:")
for issue in result.issues:
print(f" - {issue}")PreflightResult attributes:
- can_extract - Whether extraction is likely to succeed
- confidence - Confidence score (0.0-1.0)
- layers_found - Number of layers detected
- carriers_found - Number of carriers detected
- has_percentages - Whether participation % were found
- has_currency - Whether premium values were found
- has_terms - Whether terms text was found
- issues - List of detected issues
- suggestions - List of suggestions
Requires google-generativeai and a Gemini API key.
Extract and verify a file using AI-powered analysis.
from schematic_explorer import verify_file
result = verify_file("tower.xlsx")
print(f"Accuracy: {result.score:.0%}")
print(f"Summary: {result.summary}")
if result.issues:
print("Issues found:")
for issue in result.issues:
print(f" - {issue}")Verify already-extracted data against the source file.
from schematic_explorer import extract_schematic, verify_extraction
entries = extract_schematic("tower.xlsx")
result = verify_extraction("tower.xlsx", entries)

VerificationResult attributes:
- score - Accuracy score (0.0-1.0)
- summary - Brief summary of verification
- issues - List of issues found
- suggestions - List of improvement suggestions
- metadata - Parsing metadata (fallback_used, parsing_method)
from flask import Flask, request, jsonify
from schematic_explorer import extract_schematic, preflight
app = Flask(__name__)
@app.route("/extract", methods=["POST"])
def extract():
file = request.files["file"]
filepath = f"/tmp/{file.filename}"
file.save(filepath)
# Check viability first
check = preflight(filepath)
if not check.can_extract:
return jsonify({"error": "Cannot extract", "issues": check.issues}), 400
# Extract data
entries = extract_schematic(filepath)
return jsonify({"entries": entries, "confidence": check.confidence})# views.py
from django.http import JsonResponse
from schematic_explorer import extract_schematic, preflight
def extract_tower(request):
    """Django view: extract carrier data from an uploaded Excel schematic.

    Runs a preflight check and returns {"success": False, "issues": [...]}
    when extraction is not viable; otherwise returns the extracted entries
    plus the layer/carrier counts from the preflight result.
    """
    import os
    import tempfile

    uploaded_file = request.FILES["file"]
    # Save to a private temp file rather than /tmp/<client name>: the
    # client controls the filename, so using it directly risks path
    # traversal and collisions between concurrent uploads.
    fd, filepath = tempfile.mkstemp(suffix=".xlsx")
    try:
        with os.fdopen(fd, "wb") as f:
            for chunk in uploaded_file.chunks():
                f.write(chunk)
        # Preflight check
        result = preflight(filepath)
        if not result.can_extract:
            return JsonResponse({"success": False, "issues": result.issues})
        # Extract
        entries = extract_schematic(filepath)
        return JsonResponse({
            "success": True,
            "entries": entries,
            "layers_found": result.layers_found,
            "carriers_found": result.carriers_found,
        })
    finally:
        os.unlink(filepath)  # clean up the temp file regardless of outcome
from pathlib import Path
from schematic_explorer import extract_schematic_with_summaries, preflight
import json
def process_directory(input_dir: str, output_dir: str) -> None:
    """Batch-extract every .xlsx schematic in *input_dir* to JSON files.

    Each viable workbook produces <output_dir>/<stem>.json containing the
    extracted carrier entries and per-layer summaries; files that fail the
    preflight check are skipped with a message.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    # parents=True lets a nested output path be created in one call.
    output_path.mkdir(parents=True, exist_ok=True)
    for xlsx_file in input_path.glob("*.xlsx"):
        if xlsx_file.name.startswith("~$"):  # Skip Excel lock/temp files
            continue
        print(f"Processing {xlsx_file.name}...")
        # Preflight check
        check = preflight(str(xlsx_file))
        if not check.can_extract:
            print(f" Skipping: {check.issues}")
            continue
        # Extract
        entries, summaries = extract_schematic_with_summaries(str(xlsx_file))
        # Save JSON output; explicit UTF-8 so non-ASCII carrier names are
        # written identically on every platform (write_text otherwise uses
        # the locale encoding).
        output_file = output_path / f"{xlsx_file.stem}.json"
        output_file.write_text(json.dumps({
            "entries": entries,
            "summaries": [s.to_dict() for s in summaries],
        }, indent=2), encoding="utf-8")
        print(f" Extracted {len(entries)} entries -> {output_file.name}")
if __name__ == "__main__":
    process_directory("./input", "./output")
import pandas as pd
from schematic_explorer import extract_schematic
# Pull the carrier entries and load them into a DataFrame for analysis.
entries = extract_schematic("tower.xlsx")
df = pd.DataFrame(entries)

# Per-layer rollup: number of carriers plus summed participation/premium.
aggregations = {
    "carrier": "count",
    "participation_pct": "sum",
    "premium": "sum",
}
grouped = df.groupby("layer_limit").agg(aggregations)
layer_summary = grouped.rename(columns={"carrier": "num_carriers"})
print(layer_summary)
import asyncio
from concurrent.futures import ProcessPoolExecutor
from schematic_explorer import extract_schematic
async def extract_async(filepath: str):
loop = asyncio.get_event_loop()
with ProcessPoolExecutor() as pool:
return await loop.run_in_executor(pool, extract_schematic, filepath)
async def process_files(filepaths: list[str]):
tasks = [extract_async(fp) for fp in filepaths]
return await asyncio.gather(*tasks)
# Usage
results = asyncio.run(process_files(["tower1.xlsx", "tower2.xlsx"]))The package includes CLI tools for development and batch processing:
# Process files
bin/process all # Process all files in input/
bin/process random # Process a random file
bin/process tower.xlsx # Process a specific file
bin/process all --verify # Process and verify
# Verify extractions
bin/verify all # Verify all files
bin/verify tower.xlsx # Verify a specific file
bin/verify --verbose # Show detailed metadata
# Preflight check
bin/preflight tower.xlsx # Check extraction viability

For verification features:
export GEMINI_API_KEY="your-api-key"
export GEMINI_MODEL_ID="gemini-2.5-flash" # Optional, defaults to gemini-2.5-flashfrom schematic_explorer import CarrierEntry, LayerSummary, VerificationResult, PreflightResult
# All types are dataclasses with .to_dict() methods
entry = CarrierEntry(...)
data = entry.to_dict()

Requirements:
- Python 3.12+
- openpyxl >= 3.1.5
- pyyaml >= 6.0.3
Optional for verification:
- google-generativeai
- python-dotenv
- Pillow
# Install dev dependencies
pip install -e ".[dev]"
# Run tests
pytest tests/
# Run tests with coverage
pytest tests/ --cov=schematic_explorer --cov-report=term-missing
# Lint
ruff check src/ tests/
ruff format src/ tests/

License: MIT