diff --git a/README.md b/README.md index 2f9cb51..19c672c 100644 --- a/README.md +++ b/README.md @@ -11,18 +11,33 @@ pip install -r requirements.txt ``` # Usage -To download *every* stackexchange dump & parse to text, simply run + +## List all available StackExchange dumps + +``` +python3 main.py --list +``` + + + +## Download every StackExchange dump + +To download *every* stackexchange dump & parse to text, simply run ``` python3 main.py --names all ``` +## Download a single StackExchange dump + To download only a single stackexchange, you can add the name as an optional argument. E.G: ``` python3 main.py --names security.stackexchange ``` +## Download a list of StackExchange dumps + To download a list of multiple stackexchanges, you can add the names separated by commas. E.G: ``` @@ -31,6 +46,17 @@ python3 main.py --names ru.stackoverflow,money.stackexchange The name should be the url of the stackoverflow site, minus `http(s)://` and `.com`. You can view all available stackoverflow dumps [here](https://archive.org/download/stackexchange). +## List available sources in Stack Exchange + +This will list all the available sources: + +``` +python3 main.py --list +``` + +They are printed as a list, one per line, which can be parsed with `grep` and other command-line utilities. 
+ + ## All Usage Options: ``` diff --git a/main.py b/main.py index 50a727a..131653d 100644 --- a/main.py +++ b/main.py @@ -1,12 +1,17 @@ -import argparse, traceback -from multiprocessing import Pool, cpu_count -from utils import * -from downloader import Stack_Exchange_Downloader -from pairer import QA_Pairer +import argparse import os +import traceback +import zipfile from itertools import repeat +from multiprocessing import Pool, cpu_count + +import dotenv from lm_dataformat import Archive -import zipfile + +from downloader import Stack_Exchange_Downloader +from pairer import QA_Pairer + +dotenv.load_dotenv(override=True) def download_and_process_single(name, out_format, min_score, max_responses): @@ -14,6 +19,10 @@ def download_and_process_single(name, out_format, min_score, max_responses): name = name.strip().lower() os.makedirs("dumps", exist_ok=True) s = Stack_Exchange_Downloader(name) + if name not in s.sites: + similar_entries = list(filter(lambda key: key.startswith(name) or key.endswith(name), s.sites.keys())) + print("StackExchange source not found. 
Perhaps you meant", similar_entries) + return path_to_xml = "dumps/{}/Posts.xml".format(name) if name != "stackoverflow": path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"]) @@ -51,6 +60,12 @@ def download_and_process_single(name, out_format, min_score, max_responses): def main(args): + if args.list: + s = Stack_Exchange_Downloader("all") + print("List of all the sources of StackExchange: ") + print("- "+"\n- ".join(sorted(s.sites.keys()))) + return + names = args.names.split(',') if names[0].strip().lower() == "all": s = Stack_Exchange_Downloader("all") @@ -72,6 +87,10 @@ def main(args): parser = argparse.ArgumentParser( description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw ' 'question-answer pair text dataset for Language Models') + + parser.add_argument('--list', help='list of all the sources from stackechange', + required=False, action="store_true") + parser.add_argument('--names', help='names of stackexchanges to download, extract & parse, separated by commas. ' 'If "all", will download, extract & parse *every* stackoverflow site', default="3dprinting.stackexchange,3dprinting.meta.stackexchange", @@ -85,6 +104,7 @@ def main(args): parser.add_argument('--max_responses', help='maximum number of responses (sorted by score) to include for each question. ' 'Default 3.', type=int, default=3) args = parser.parse_args() + main(args) diff --git a/pairer.py b/pairer.py index 880bee7..e561b5d 100644 --- a/pairer.py +++ b/pairer.py @@ -29,7 +29,7 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo self.ar = archiver def main(self): - """iterates through SE xmls and: + """iterates through SE XMLs and: - stores PostTypeId="1" with AcceptedAnswerIds / Answers. - when an AcceptedAnswerId or Answer > min_score is reached, it should: