diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3bf780b --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.idea +.env \ No newline at end of file diff --git a/README.md b/README.md index 2f9cb51..6bd94b1 100644 --- a/README.md +++ b/README.md @@ -3,26 +3,41 @@ A python tool for downloading & processing the [stackexchange data dumps](https: Download the whole processed dataset [here](https://eaidata.bmk.sh/data/stackexchange_dataset.tar) -# Setup +## Setup ``` git clone https://github.com/EleutherAI/stackexchange_dataset/ cd stackexchange_dataset pip install -r requirements.txt ``` -# Usage +## Usage -To download *every* stackexchange dump & parse to text, simply run + +### List all available StackExchange dumps + +``` +python3 main.py --list +``` + + + +### Download every StackExchange dump + +To download *every* stackexchange dump & parse to text, simply run ``` python3 main.py --names all ``` +### Download a single StackExchange dump + To download only a single stackexchange, you can add the name as an optional argument. E.G: ``` python3 main.py --names security.stackexchange ``` +### Download a list of StackExchange dumps + To download a list of multiple stackexchanges, you can add the names separated by commas. E.G: ``` @@ -31,7 +46,7 @@ python3 main.py --names ru.stackoverflow,money.stackexchange The name should be the url of the stackoverflow site, minus `http(s)://` and `.com`. You can view all available stackoverflow dumps [here](https://archive.org/download/stackexchange). 
-## All Usage Options: +### All Usage Options: ``` usage: main.py [-h] [--names NAMES] @@ -47,6 +62,19 @@ optional arguments: *every* stackoverflow site ``` +### Proxy support + +If you need to pass through a proxy, you can configure an `.env` file and add as follows: + +``` +HTTP_PROXY=http://proxy:port +http_proxy=http://proxy:port +HTTPS_PROXY=http://proxy:port +https_proxy=http://proxy:port +NO_PROXY=address to ignore,localhost +no_proxy=address to ignore,localhost +``` + # TODO: - [ ] should we add metadata to the text (i.e name of stackexchange & tags)? diff --git a/main.py b/main.py index 50a727a..131653d 100644 --- a/main.py +++ b/main.py @@ -1,12 +1,17 @@ -import argparse, traceback -from multiprocessing import Pool, cpu_count -from utils import * -from downloader import Stack_Exchange_Downloader -from pairer import QA_Pairer +import argparse import os +import traceback +import zipfile from itertools import repeat +from multiprocessing import Pool, cpu_count + +import dotenv from lm_dataformat import Archive -import zipfile + +from downloader import Stack_Exchange_Downloader +from pairer import QA_Pairer + +dotenv.load_dotenv(override=True) def download_and_process_single(name, out_format, min_score, max_responses): @@ -14,6 +19,10 @@ def download_and_process_single(name, out_format, min_score, max_responses): name = name.strip().lower() os.makedirs("dumps", exist_ok=True) s = Stack_Exchange_Downloader(name) + if name not in s.sites: + similar_entries = list(filter(lambda key: key.startswith(name) or key.endswith(name), s.sites.keys())) + print("StackExchange source not found. 
Perhaps you meant", similar_entries) + return path_to_xml = "dumps/{}/Posts.xml".format(name) if name != "stackoverflow": path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"]) @@ -51,6 +60,12 @@ def download_and_process_single(name, out_format, min_score, max_responses): def main(args): + if args.list: + s = Stack_Exchange_Downloader("all") + print("List of all the sources of StackExchange: ") + print("- "+"\n- ".join(sorted(s.sites.keys()))) + return + names = args.names.split(',') if names[0].strip().lower() == "all": s = Stack_Exchange_Downloader("all") @@ -72,6 +87,10 @@ def main(args): parser = argparse.ArgumentParser( description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw ' 'question-answer pair text dataset for Language Models') + + parser.add_argument('--list', help='list of all the sources from stackexchange', + required=False, action="store_true") + parser.add_argument('--names', help='names of stackexchanges to download, extract & parse, separated by commas. ' 'If "all", will download, extract & parse *every* stackoverflow site', default="3dprinting.stackexchange,3dprinting.meta.stackexchange", + parser.add_argument('--out_format', help='format of out file - if you are processing everything this will need to be '
                                             'lm_dataformat, as you will run into number of files per directory limits. ', default="zip", type=str, choices=["txt", "lm_dataformat", "zip"]) parser.add_argument('--min_score', help='minimum score of a question in order to be included. Default 3.', type=int, default=3) parser.add_argument('--max_responses', help='maximum number of responses (sorted by score) to include for each question. ' 'Default 3.', type=int, default=3) args = parser.parse_args() + main(args)