diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..db23414
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+*.egg-info/
+dist/
+build/
+.env
+*.json.bak
diff --git a/scrapers/README.md b/scrapers/README.md
new file mode 100644
index 0000000..0ce56ac
--- /dev/null
+++ b/scrapers/README.md
@@ -0,0 +1,94 @@
+# Scrapers
+
+Python web scrapers for collecting data relevant to the simulation-theory research repository.
+
+## Scrapers
+
+| Script | Source | Topics |
+|--------|--------|--------|
+| [`arxiv_scraper.py`](./arxiv_scraper.py) | [arXiv](https://arxiv.org) | Simulation hypothesis, Gödel incompleteness, Riemann zeta, SHA-256 hash chains, qutrit/ternary quantum, halting problem, IIT consciousness |
+| [`wikipedia_scraper.py`](./wikipedia_scraper.py) | [Wikipedia](https://en.wikipedia.org) | SHA-256, Riemann hypothesis, quantum computing, Euler's identity, fine-structure constant, Turing machine, DNA, Blockchain |
+| [`oeis_scraper.py`](./oeis_scraper.py) | [OEIS](https://oeis.org) | Prime numbers, Fibonacci, pi digits, Euler–Mascheroni constant, Catalan numbers, partition numbers |
+
+## Setup
+
+```bash
+pip install -r requirements.txt
+```
+
+## Usage
+
+### arXiv scraper
+
+```bash
+# Use default topic list
+python arxiv_scraper.py
+
+# Custom query, limit to 3 results per query
+python arxiv_scraper.py --query "Riemann hypothesis zeros" --max 3
+
+# Save to file
+python arxiv_scraper.py --output arxiv_results.json
+```
+
+### Wikipedia scraper
+
+```bash
+# Use default topic list
+python wikipedia_scraper.py
+
+# Custom topics
+python wikipedia_scraper.py --topics "Riemann hypothesis" "SHA-2" "Turing machine"
+
+# Save to file
+python wikipedia_scraper.py --output wikipedia_results.json
+```
+
+### OEIS scraper
+
+```bash
+# Use default sequence list
+python oeis_scraper.py
+
+# Custom sequence IDs
+python oeis_scraper.py --ids A000040 A000045 A000796
+
+# Save to file
+python oeis_scraper.py --output oeis_results.json
+```
+
+## Output format
+
+All scrapers output JSON to stdout by default, or to a file with `--output`. Progress and error messages are written to stderr, so stdout can be piped straight into a JSON tool.
+
+**arXiv** — dict keyed by query, each value is a list of:
+```json
+{
+  "title": "...",
+  "authors": ["..."],
+  "published": "2024-01-01T00:00:00Z",
+  "abstract": "...",
+  "url": "https://arxiv.org/abs/..."
+}
+```
+
+**Wikipedia** — list of:
+```json
+{
+  "topic": "SHA-2",
+  "title": "SHA-2",
+  "url": "https://en.wikipedia.org/wiki/SHA-2",
+  "summary": "..."
+}
+```
+
+**OEIS** — list of:
+```json
+{
+  "id": "A000040",
+  "name": "The prime numbers.",
+  "description": "...",
+  "values": ["2", "3", "5", "7", "11", "..."],
+  "url": "https://oeis.org/A000040"
+}
+```
diff --git a/scrapers/arxiv_scraper.py b/scrapers/arxiv_scraper.py
new file mode 100644
index 0000000..a67779b
--- /dev/null
+++ b/scrapers/arxiv_scraper.py
@@ -0,0 +1,134 @@
+"""
+arXiv scraper — fetches abstracts for papers related to simulation theory research topics.
+
+Topics covered: simulation hypothesis, Gödel incompleteness, Riemann hypothesis,
+quantum computation, SHA-256/cryptographic hash functions, consciousness/integrated
+information theory, ternary/qutrit systems.
+
+Progress and error messages go to stderr so that stdout carries only the JSON.
+
+Usage:
+    python arxiv_scraper.py
+    python arxiv_scraper.py --query "Riemann hypothesis" --max 5
+    python arxiv_scraper.py --output results.json
+"""
+
+import argparse
+import json
+import sys
+import time
+import xml.etree.ElementTree as ET
+
+import requests
+
+ARXIV_API = "https://export.arxiv.org/api/query"
+
+DEFAULT_QUERIES = [
+    "simulation hypothesis computational reality",
+    "Gödel incompleteness self-reference formal systems",
+    "Riemann zeta function trivial zeros",
+    "SHA-256 hash chain cryptographic proof",
+    "qutrit ternary quantum computation",
+    "integrated information theory consciousness",
+    "halting problem quantum physics undecidability",
+]
+
+NS = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
+
+
+def _text(element) -> str:
+    """Return the stripped text of an XML element, or "" if it is missing or empty."""
+    if element is None or element.text is None:
+        return ""
+    return element.text.strip()
+
+
+def fetch_papers(query: str, max_results: int = 5) -> list[dict]:
+    """Return a list of paper dicts for the given arXiv search query.
+
+    Raises requests.RequestException on network/HTTP failure and
+    xml.etree.ElementTree.ParseError if the response body is not valid XML.
+    """
+    params = {
+        "search_query": f"all:{query}",
+        "start": 0,
+        "max_results": max_results,
+        "sortBy": "relevance",
+        "sortOrder": "descending",
+    }
+    resp = requests.get(ARXIV_API, params=params, timeout=30)
+    resp.raise_for_status()
+
+    # Parse the raw bytes so the XML parser applies the declared encoding itself.
+    root = ET.fromstring(resp.content)
+    papers = []
+    for entry in root.findall("atom:entry", NS):
+        author_names = [
+            _text(author.find("atom:name", NS))
+            for author in entry.findall("atom:author", NS)
+        ]
+        papers.append(
+            {
+                # Long titles arrive hard-wrapped; collapse the embedded newlines.
+                "title": " ".join(_text(entry.find("atom:title", NS)).split()),
+                "authors": [name for name in author_names if name],
+                "published": _text(entry.find("atom:published", NS)),
+                "abstract": _text(entry.find("atom:summary", NS)),
+                "url": _text(entry.find("atom:id", NS)),
+            }
+        )
+    return papers
+
+
+def scrape(queries: list[str], max_per_query: int = 5) -> dict[str, list[dict]]:
+    """Scrape arXiv for each query and return results keyed by query string.
+
+    A query that fails (network error or malformed XML) maps to an empty list.
+    """
+    results = {}
+    for query in queries:
+        print(f"Fetching: {query!r} …", file=sys.stderr)
+        try:
+            results[query] = fetch_papers(query, max_results=max_per_query)
+        except (requests.RequestException, ET.ParseError) as exc:
+            print(f"  Error: {exc}", file=sys.stderr)
+            results[query] = []
+        time.sleep(1)  # be polite to the API
+    return results
+
+
+def main() -> None:
+    """Parse command-line arguments, run the scrape and emit the JSON result."""
+    parser = argparse.ArgumentParser(description="Scrape arXiv for simulation-theory topics.")
+    parser.add_argument(
+        "--query",
+        nargs="*",
+        default=DEFAULT_QUERIES,
+        help="Search queries (defaults to built-in topic list).",
+    )
+    parser.add_argument(
+        "--max",
+        type=int,
+        default=5,
+        dest="max_results",
+        help="Maximum results per query (default: 5).",
+    )
+    parser.add_argument(
+        "--output",
+        default=None,
+        help="Write results to a JSON file instead of stdout.",
+    )
+    args = parser.parse_args()
+
+    results = scrape(args.query, max_per_query=args.max_results)
+
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as fh:
+            json.dump(results, fh, indent=2, ensure_ascii=False)
+        print(f"Results written to {args.output}")
+    else:
+        print(json.dumps(results, indent=2, ensure_ascii=False))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scrapers/oeis_scraper.py b/scrapers/oeis_scraper.py
new file mode 100644
index 0000000..ee12e57
--- /dev/null
+++ b/scrapers/oeis_scraper.py
@@ -0,0 +1,126 @@
+"""
+OEIS (On-Line Encyclopedia of Integer Sequences) scraper — fetches sequence
+metadata for integer sequences relevant to simulation-theory research.
+
+Sequences of interest: primes, Fibonacci, pi digits, Euler–Mascheroni constant
+digits, Pascal's triangle, Catalan numbers, SHA-256 round constants, and others.
+
+Progress and error messages go to stderr so that stdout carries only the JSON.
+
+Usage:
+    python oeis_scraper.py
+    python oeis_scraper.py --ids A000040 A000045
+    python oeis_scraper.py --output results.json
+"""
+
+import argparse
+import json
+import sys
+import time
+
+import requests
+
+OEIS_SEARCH_URL = "https://oeis.org/search"
+
+# Default sequence IDs relevant to the repository topics
+DEFAULT_IDS = [
+    "A000040",  # prime numbers
+    "A000045",  # Fibonacci numbers
+    "A000796",  # decimal expansion of pi
+    "A001620",  # decimal expansion of Euler–Mascheroni constant
+    "A000108",  # Catalan numbers
+    "A000012",  # the all-1s sequence (trivial zero analogue)
+    "A000720",  # pi(n): number of primes <= n
+    "A006862",  # Euclid numbers: 1 + product of first n primes
+    "A000041",  # number of partitions of n
+    "A001358",  # semiprimes
+]
+
+# Number of leading terms of each sequence kept in the output.
+MAX_TERMS = 20
+
+
+def _empty_record(oeis_id: str) -> dict:
+    """Return the placeholder record used when a sequence cannot be fetched."""
+    return {"id": oeis_id, "name": "", "description": "", "values": [], "url": ""}
+
+
+def fetch_sequence(oeis_id: str) -> dict:
+    """Fetch metadata for a single OEIS sequence via the JSON search endpoint.
+
+    Returns a placeholder record with empty fields when the search has no hits.
+    Raises requests.RequestException on network/HTTP failure or invalid JSON.
+    """
+    params = {"q": f"id:{oeis_id}", "fmt": "json"}
+    resp = requests.get(OEIS_SEARCH_URL, params=params, timeout=30)
+    resp.raise_for_status()
+    data = resp.json()
+
+    # Accept both a {"results": [...]} wrapper and a bare list of sequence
+    # objects, and treat a null/absent result set as "no hits".
+    if isinstance(data, dict):
+        data = data.get("results")
+    results = data if isinstance(data, list) else []
+    if not results:
+        return _empty_record(oeis_id)
+
+    seq = results[0]
+    comments = seq.get("comment") or [""]
+    # Drop empty fragments so a missing "data" field yields [] rather than [""].
+    terms = [term for term in (seq.get("data") or "").split(",") if term]
+    return {
+        "id": oeis_id,
+        "name": seq.get("name", ""),
+        "description": comments[0],
+        "values": terms[:MAX_TERMS],
+        "url": f"https://oeis.org/{oeis_id}",
+    }
+
+
+def scrape(ids: list[str]) -> list[dict]:
+    """Scrape OEIS for each sequence ID.
+
+    An ID whose request fails is reported with a placeholder record.
+    """
+    results = []
+    for oeis_id in ids:
+        print(f"Fetching: {oeis_id} …", file=sys.stderr)
+        try:
+            results.append(fetch_sequence(oeis_id))
+        except requests.RequestException as exc:
+            print(f"  Error: {exc}", file=sys.stderr)
+            results.append(_empty_record(oeis_id))
+        time.sleep(0.5)  # be polite
+    return results
+
+
+def main() -> None:
+    """Parse command-line arguments, run the scrape and emit the JSON result."""
+    parser = argparse.ArgumentParser(
+        description="Scrape OEIS sequences relevant to simulation-theory research."
+    )
+    parser.add_argument(
+        "--ids",
+        nargs="*",
+        default=DEFAULT_IDS,
+        help="OEIS sequence IDs (e.g. A000040). Defaults to built-in list.",
+    )
+    parser.add_argument(
+        "--output",
+        default=None,
+        help="Write results to a JSON file instead of stdout.",
+    )
+    args = parser.parse_args()
+
+    results = scrape(args.ids)
+
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as fh:
+            json.dump(results, fh, indent=2, ensure_ascii=False)
+        print(f"Results written to {args.output}")
+    else:
+        print(json.dumps(results, indent=2, ensure_ascii=False))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scrapers/requirements.txt b/scrapers/requirements.txt
new file mode 100644
index 0000000..4cca127
--- /dev/null
+++ b/scrapers/requirements.txt
@@ -0,0 +1,3 @@
+requests>=2.31.0
+beautifulsoup4>=4.12.0
+lxml>=4.9.0
diff --git a/scrapers/wikipedia_scraper.py b/scrapers/wikipedia_scraper.py
new file mode 100644
index 0000000..44a2fb0
--- /dev/null
+++ b/scrapers/wikipedia_scraper.py
@@ -0,0 +1,136 @@
+"""
+Wikipedia scraper — fetches introductory summaries for key topics in the
+simulation-theory research repository.
+
+Topics covered: simulation hypothesis, SHA-256, Gödel incompleteness,
+Riemann hypothesis, quantum computing, halting problem, integrated information
+theory, fine-structure constant, Euler's identity, and more.
+
+Progress and error messages go to stderr so that stdout carries only the JSON.
+
+Usage:
+    python wikipedia_scraper.py
+    python wikipedia_scraper.py --topics "Riemann hypothesis" "SHA-2"
+    python wikipedia_scraper.py --output results.json
+"""
+
+import argparse
+import json
+import sys
+import time
+
+import requests
+from bs4 import BeautifulSoup  # noqa: F401  (not used by this module yet)
+
+WIKIPEDIA_API = "https://en.wikipedia.org/w/api.php"
+
+# Wikimedia asks API clients to send a descriptive User-Agent; requests that
+# carry only a generic library default may be rejected.
+HEADERS = {"User-Agent": "simulation-theory-scrapers/1.0 (python-requests)"}
+
+DEFAULT_TOPICS = [
+    "Simulation hypothesis",
+    "SHA-2",
+    "Gödel's incompleteness theorems",
+    "Riemann hypothesis",
+    "Quantum computing",
+    "Halting problem",
+    "Integrated information theory",
+    "Fine-structure constant",
+    "Euler's identity",
+    "Ternary numeral system",
+    "DNA",
+    "Blockchain",
+    "Boltzmann entropy formula",
+    "Turing machine",
+]
+
+
+def _empty_record(topic: str) -> dict:
+    """Return the placeholder record used when a topic cannot be fetched."""
+    return {"topic": topic, "title": topic, "url": "", "summary": ""}
+
+
+def fetch_summary(topic: str) -> dict:
+    """Return a dict with title, url and plain-text intro for a Wikipedia topic.
+
+    Returns a placeholder record with empty url/summary when the article is
+    missing or the title is invalid. Raises requests.RequestException on
+    network/HTTP failure or invalid JSON.
+    """
+    params = {
+        "action": "query",
+        "prop": "extracts|info",
+        "exintro": True,
+        "explaintext": True,
+        "inprop": "url",
+        "titles": topic,
+        "format": "json",
+        "redirects": 1,
+    }
+    resp = requests.get(WIKIPEDIA_API, params=params, headers=HEADERS, timeout=30)
+    resp.raise_for_status()
+    data = resp.json()
+
+    pages = data.get("query", {}).get("pages", {})
+    # An empty "pages" map (e.g. for an empty title) must not raise StopIteration.
+    page = next(iter(pages.values()), None)
+
+    if page is None or "missing" in page or "invalid" in page:
+        return _empty_record(topic)
+
+    return {
+        "topic": topic,
+        "title": page.get("title", topic),
+        "url": page.get("fullurl", ""),
+        "summary": page.get("extract", "").strip(),
+    }
+
+
+def scrape(topics: list[str]) -> list[dict]:
+    """Scrape Wikipedia summaries for each topic.
+
+    A topic whose request fails is reported with a placeholder record.
+    """
+    results = []
+    for topic in topics:
+        print(f"Fetching: {topic!r} …", file=sys.stderr)
+        try:
+            results.append(fetch_summary(topic))
+        except requests.RequestException as exc:
+            print(f"  Error: {exc}", file=sys.stderr)
+            results.append(_empty_record(topic))
+        time.sleep(0.5)  # be polite
+    return results
+
+
+def main() -> None:
+    """Parse command-line arguments, run the scrape and emit the JSON result."""
+    parser = argparse.ArgumentParser(
+        description="Scrape Wikipedia summaries for simulation-theory topics."
+    )
+    parser.add_argument(
+        "--topics",
+        nargs="*",
+        default=DEFAULT_TOPICS,
+        help="Wikipedia article titles to scrape (defaults to built-in topic list).",
+    )
+    parser.add_argument(
+        "--output",
+        default=None,
+        help="Write results to a JSON file instead of stdout.",
+    )
+    args = parser.parse_args()
+
+    results = scrape(args.topics)
+
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as fh:
+            json.dump(results, fh, indent=2, ensure_ascii=False)
+        print(f"Results written to {args.output}")
+    else:
+        print(json.dumps(results, indent=2, ensure_ascii=False))
+
+
+if __name__ == "__main__":
+    main()