10 changes: 10 additions & 0 deletions .gitignore
@@ -0,0 +1,10 @@
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
*.egg-info/
dist/
build/
.env
*.json.bak
94 changes: 94 additions & 0 deletions scrapers/README.md
@@ -0,0 +1,94 @@
# Scrapers

Python web scrapers for collecting data relevant to the simulation-theory research repository.

## Available scrapers

| Script | Source | Topics |
|--------|--------|--------|
| [`arxiv_scraper.py`](./arxiv_scraper.py) | [arXiv](https://arxiv.org) | Simulation hypothesis, Gödel incompleteness, Riemann zeta, qutrit/ternary quantum, halting problem, IIT consciousness |
| [`wikipedia_scraper.py`](./wikipedia_scraper.py) | [Wikipedia](https://en.wikipedia.org) | SHA-256, Riemann hypothesis, quantum computing, Euler's identity, fine-structure constant, Turing machine, DNA, Blockchain |
| [`oeis_scraper.py`](./oeis_scraper.py) | [OEIS](https://oeis.org) | Prime numbers, Fibonacci, pi digits, Euler–Mascheroni constant, Catalan numbers, partition numbers |

## Setup

```bash
# From the repo root:
pip install -r scrapers/requirements.txt

# Or, from within the scrapers/ directory:
pip install -r requirements.txt
```

## Usage

### arXiv scraper

```bash
# Use default topic list
python arxiv_scraper.py

# Custom query, limit to 3 results per query
python arxiv_scraper.py --query "Riemann hypothesis zeros" --max 3

# Save to file
python arxiv_scraper.py --output arxiv_results.json
```

### Wikipedia scraper

```bash
# Use default topic list
python wikipedia_scraper.py

# Custom topics
python wikipedia_scraper.py --topics "Riemann hypothesis" "SHA-2" "Turing machine"

# Save to file
python wikipedia_scraper.py --output wikipedia_results.json
```

### OEIS scraper

```bash
# Use default sequence list
python oeis_scraper.py

# Custom sequence IDs
python oeis_scraper.py --ids A000040 A000045 A000796

# Save to file
python oeis_scraper.py --output oeis_results.json
```

## Output format

All scrapers emit their results as JSON. When run without `--output`, the JSON is printed to stdout along with progress and error messages, so redirected stdout is not guaranteed to be clean JSON; pass `--output` to write pure JSON to a file.

**arXiv** — dict keyed by query, each value is a list of:
```json
{
  "title": "...",
  "authors": ["..."],
  "published": "2024-01-01T00:00:00Z",
  "abstract": "...",
  "url": "https://arxiv.org/abs/..."
}
```

**Wikipedia** — list of:
```json
{
  "topic": "SHA-2",
  "title": "SHA-2",
  "url": "https://en.wikipedia.org/wiki/SHA-2",
  "summary": "..."
}
```

**OEIS** — list of:
```json
{
  "id": "A000040",
  "name": "The prime numbers.",
  "description": "...",
  "values": ["2", "3", "5", "7", "11", "..."],
  "url": "https://oeis.org/A000040"
}
```
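The shapes above can be consumed directly with the standard library. A minimal sketch that round-trips an OEIS-style record (the sample data and the `oeis_results.json` filename mirror the usage examples above; the integer conversion is just one illustrative post-processing step, since OEIS terms are serialized as strings):

```python
import json

# A sample record in the OEIS output shape documented above.
sample = [
    {
        "id": "A000040",
        "name": "The prime numbers.",
        "description": "...",
        "values": ["2", "3", "5", "7", "11"],
        "url": "https://oeis.org/A000040",
    }
]

# Write the records the same way the scrapers do with --output.
with open("oeis_results.json", "w", encoding="utf-8") as fh:
    json.dump(sample, fh, indent=2, ensure_ascii=False)

# Load them back and post-process: terms are strings, so convert to ints.
with open("oeis_results.json", encoding="utf-8") as fh:
    records = json.load(fh)

terms = [int(v) for v in records[0]["values"]]
print(terms)  # → [2, 3, 5, 7, 11]
```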
119 changes: 119 additions & 0 deletions scrapers/arxiv_scraper.py
@@ -0,0 +1,119 @@
"""
arXiv scraper — fetches abstracts for papers related to simulation theory research topics.

Topics covered: simulation hypothesis, Gödel incompleteness, Riemann hypothesis,
quantum computation, SHA-256/cryptographic hash functions, consciousness/integrated
information theory, ternary/qutrit systems.

Usage:
    python arxiv_scraper.py
    python arxiv_scraper.py --query "Riemann hypothesis" --max 5
    python arxiv_scraper.py --output results.json
"""

import argparse
import json
import time
import xml.etree.ElementTree as ET

import requests

ARXIV_API = "https://export.arxiv.org/api/query"

DEFAULT_QUERIES = [
    "simulation hypothesis computational reality",
    "Gödel incompleteness self-reference formal systems",
    "Riemann zeta function trivial zeros",
    "SHA-256 hash chain cryptographic proof",
    "qutrit ternary quantum computation",
    "integrated information theory consciousness",
    "halting problem quantum physics undecidability",
]

NS = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}


def fetch_papers(query: str, max_results: int = 5) -> list[dict]:
    """Return a list of paper dicts for the given arXiv search query."""
    params = {
        "search_query": f"all:{query}",
        "start": 0,
        "max_results": max_results,
        "sortBy": "relevance",
        "sortOrder": "descending",
    }
    resp = requests.get(ARXIV_API, params=params, timeout=30)
    resp.raise_for_status()

    try:
        root = ET.fromstring(resp.text)
    except ET.ParseError:
        # Malformed or non-XML response (e.g. an HTML error page); fail gracefully.
        return []

    papers = []
    for entry in root.findall("atom:entry", NS):
        title_el = entry.find("atom:title", NS)
        summary_el = entry.find("atom:summary", NS)
        id_el = entry.find("atom:id", NS)
        published_el = entry.find("atom:published", NS)
        authors = [
            a.find("atom:name", NS).text
            for a in entry.findall("atom:author", NS)
            if a.find("atom:name", NS) is not None
        ]
        papers.append(
            {
                "title": title_el.text.strip() if title_el is not None else "",
                "authors": authors,
                "published": published_el.text.strip() if published_el is not None else "",
                "abstract": summary_el.text.strip() if summary_el is not None else "",
                "url": id_el.text.strip() if id_el is not None else "",
            }
        )
    return papers


def scrape(queries: list[str], max_per_query: int = 5) -> dict[str, list[dict]]:
    """Scrape arXiv for each query and return results keyed by query string."""
    results = {}
    for query in queries:
        print(f"Fetching: {query!r} …")
        try:
            results[query] = fetch_papers(query, max_results=max_per_query)
        except requests.RequestException as exc:
            print(f"  Error: {exc}")
            results[query] = []
        time.sleep(1)  # be polite to the API
    return results


def main() -> None:
    parser = argparse.ArgumentParser(description="Scrape arXiv for simulation-theory topics.")
    parser.add_argument(
        "--query",
        nargs="*",
        default=DEFAULT_QUERIES,
        help="Search queries (defaults to built-in topic list).",
    )
    parser.add_argument(
        "--max",
        type=int,
        default=5,
        dest="max_results",
        help="Maximum results per query (default: 5).",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Write results to a JSON file instead of stdout.",
    )
    args = parser.parse_args()

    results = scrape(args.query, max_per_query=args.max_results)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as fh:
            json.dump(results, fh, indent=2, ensure_ascii=False)
        print(f"Results written to {args.output}")
    else:
        print(json.dumps(results, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
100 changes: 100 additions & 0 deletions scrapers/oeis_scraper.py
@@ -0,0 +1,100 @@
"""
OEIS (On-Line Encyclopedia of Integer Sequences) scraper — fetches sequence
metadata for integer sequences relevant to simulation-theory research.

Sequences of interest: primes, Fibonacci, pi digits, Euler–Mascheroni constant
digits, Pascal's triangle, Catalan numbers, SHA-256 round constants, and others.

Usage:
    python oeis_scraper.py
    python oeis_scraper.py --ids A000040 A000045
    python oeis_scraper.py --output results.json
"""

import argparse
import json
import time

import requests

OEIS_SEARCH_URL = "https://oeis.org/search"

# Default sequence IDs relevant to the repository topics
DEFAULT_IDS = [
    "A000040",  # prime numbers
    "A000045",  # Fibonacci numbers
    "A000796",  # decimal expansion of pi
    "A001620",  # decimal expansion of Euler–Mascheroni constant
    "A000108",  # Catalan numbers
    "A000012",  # the all-1s sequence (trivial zero analogue)
    "A000720",  # pi(n): number of primes <= n
    "A006862",  # Euclid numbers: 1 + product of first n primes
    "A000041",  # number of partitions of n
    "A001358",  # semiprimes
]


def fetch_sequence(oeis_id: str) -> dict:
    """Fetch metadata for a single OEIS sequence via the JSON search endpoint."""
    params = {"q": f"id:{oeis_id}", "fmt": "json"}
    resp = requests.get(OEIS_SEARCH_URL, params=params, timeout=30)
    resp.raise_for_status()
    try:
        data = resp.json()
    except ValueError:
        # OEIS returned a non-JSON response (e.g. an HTML error page); return an empty record.
        return {"id": oeis_id, "name": "", "description": "", "values": [], "url": ""}

    results = data.get("results") or []
    if not results:
        return {"id": oeis_id, "name": "", "description": "", "values": [], "url": ""}

    seq = results[0]
    # Normalize the data field: a missing or blank "data" yields [] rather than [""],
    # and individual terms are stripped of whitespace.
    data_str = str(seq.get("data", ""))
    values = [term.strip() for term in data_str.split(",") if term.strip()][:20]  # first 20 terms
    return {
        "id": oeis_id,
        "name": seq.get("name", ""),
        "description": seq.get("comment", [""])[0] if seq.get("comment") else "",
        "values": values,
        "url": f"https://oeis.org/{oeis_id}",
    }


def scrape(ids: list[str]) -> list[dict]:
    """Scrape OEIS for each sequence ID."""
    results = []
    for oeis_id in ids:
        print(f"Fetching: {oeis_id} …")
        try:
            results.append(fetch_sequence(oeis_id))
        except requests.RequestException as exc:
            print(f"  Error: {exc}")
            results.append({"id": oeis_id, "name": "", "description": "", "values": [], "url": ""})
        time.sleep(0.5)  # be polite
    return results


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Scrape OEIS sequences relevant to simulation-theory research."
    )
    parser.add_argument(
        "--ids",
        nargs="*",
        default=DEFAULT_IDS,
        help="OEIS sequence IDs (e.g. A000040). Defaults to built-in list.",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Write results to a JSON file instead of stdout.",
    )
    args = parser.parse_args()

    results = scrape(args.ids)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as fh:
            json.dump(results, fh, indent=2, ensure_ascii=False)
        print(f"Results written to {args.output}")
    else:
        print(json.dumps(results, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions scrapers/requirements.txt
@@ -0,0 +1,3 @@
requests>=2.31.0
beautifulsoup4>=4.12.0
lxml>=4.9.0