2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.idea
.env
36 changes: 32 additions & 4 deletions README.md
@@ -3,26 +3,41 @@ A python tool for downloading & processing the [stackexchange data dumps](https:

Download the whole processed dataset [here](https://eaidata.bmk.sh/data/stackexchange_dataset.tar)

# Setup
## Setup
```
git clone https://github.com/EleutherAI/stackexchange_dataset/
cd stackexchange_dataset
pip install -r requirements.txt
```
# Usage
## Usage

### List all available StackExchange dumps

```
python3 main.py --list
```
### Download every StackExchange dump

To download *every* StackExchange dump & parse it to text, simply run

```
python3 main.py --names all
```

### Download a single StackExchange dump

To download only a single StackExchange site, pass its name as an argument, e.g.:

```
python3 main.py --names security.stackexchange
```

### Download a list of StackExchange dumps

To download multiple StackExchange sites, pass their names separated by commas, e.g.:

```
@@ -31,7 +46,7 @@ python3 main.py --names ru.stackoverflow,money.stackexchange

The name should be the URL of the StackExchange site, minus `http(s)://` and `.com`. You can view all available StackExchange dumps [here](https://archive.org/download/stackexchange).
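The URL-to-name conversion described above can be sketched as follows (a minimal illustration; `site_name` is a hypothetical helper, not part of this repo):

```python
from urllib.parse import urlparse

def site_name(url: str) -> str:
    """Derive the dump name from a site URL: drop the scheme and ".com"."""
    host = urlparse(url).netloc or url  # tolerate bare hostnames too
    return host.removesuffix(".com")

print(site_name("https://security.stackexchange.com"))  # security.stackexchange
print(site_name("https://ru.stackoverflow.com"))        # ru.stackoverflow
```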

## All Usage Options:
### All Usage Options:

```
usage: main.py [-h] [--names NAMES]
@@ -47,6 +62,19 @@ optional arguments:
*every* stackoverflow site
```

### Proxy support

If you need to go through a proxy, create an `.env` file in the repository root and add entries as follows:

```
HTTP_PROXY=http://proxy:port
http_proxy=http://proxy:port
HTTPS_PROXY=http://proxy:port
https_proxy=http://proxy:port
NO_PROXY=address to ignore,localhost
no_proxy=address to ignore,localhost
```
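Once `dotenv` loads the file, these variables sit in the process environment, where the standard library (and HTTP clients built on it) resolves proxies from them. A minimal sketch of that lookup, using only the standard library and a hypothetical proxy address:

```python
import os
import urllib.request

# Simulate what loading the .env file would put into the environment
# (proxy.example:3128 is a hypothetical address):
os.environ["HTTP_PROXY"] = "http://proxy.example:3128"
os.environ["http_proxy"] = "http://proxy.example:3128"
os.environ["HTTPS_PROXY"] = "http://proxy.example:3128"
os.environ["https_proxy"] = "http://proxy.example:3128"
os.environ["NO_PROXY"] = "localhost"

# The standard library resolves proxies from these variables:
proxies = urllib.request.getproxies()
print(proxies["http"])   # http://proxy.example:3128
```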

# TODO:

- [ ] should we add metadata to the text (i.e name of stackexchange & tags)?
32 changes: 26 additions & 6 deletions main.py
@@ -1,19 +1,28 @@
import argparse, traceback
from multiprocessing import Pool, cpu_count
from utils import *
from downloader import Stack_Exchange_Downloader
from pairer import QA_Pairer
import argparse
import os
import traceback
import zipfile
from itertools import repeat
from multiprocessing import Pool, cpu_count

import dotenv
from lm_dataformat import Archive
import zipfile

from downloader import Stack_Exchange_Downloader
from pairer import QA_Pairer

dotenv.load_dotenv(override=True)


def download_and_process_single(name, out_format, min_score, max_responses):
try:
name = name.strip().lower()
os.makedirs("dumps", exist_ok=True)
s = Stack_Exchange_Downloader(name)
if name not in s.sites:
similar_entries = list(filter(lambda key: key.startswith(name) or key.endswith(name), s.sites.keys()))
print("StackExchange source not found. Perhaps you meant", similar_entries)
return
path_to_xml = "dumps/{}/Posts.xml".format(name)
if name != "stackoverflow":
path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"])
@@ -51,6 +60,12 @@ def download_and_process_single(name, out_format, min_score, max_responses):


def main(args):
if args.list:
s = Stack_Exchange_Downloader("all")
print("Available StackExchange sources:")
print("- " + "\n- ".join(sorted(s.sites.keys())))
return

names = args.names.split(',')
if names[0].strip().lower() == "all":
s = Stack_Exchange_Downloader("all")
@@ -72,6 +87,10 @@ def main(args):
parser = argparse.ArgumentParser(
description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw '
'question-answer pair text dataset for Language Models')

parser.add_argument('--list', help='list all available StackExchange sources',
required=False, action="store_true")

parser.add_argument('--names', help='names of stackexchanges to download, extract & parse, separated by commas. '
'If "all", will download, extract & parse *every* stackoverflow site',
default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
@@ -85,6 +104,7 @@ def main(args):
parser.add_argument('--max_responses', help='maximum number of responses (sorted by score) to include for each question. '
'Default 3.', type=int, default=3)
args = parser.parse_args()

main(args)