Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,33 @@ pip install -r requirements.txt
```
# Usage

To download *every* stackexchange dump & parse to text, simply run

## List all available StackExchange dumps

```
python3 main.py --list
```



## Download every StackExchange dump

To download *every* StackExchange dump & parse it to text, simply run:

```
python3 main.py --names all
```

## Download a single StackExchange dump

To download only a single StackExchange site, pass its name with the `--names` argument, e.g.:

```
python3 main.py --names security.stackexchange
```

## Download a list of StackExchange dumps

To download several StackExchange sites at once, pass their names separated by commas, e.g.:

```
Expand All @@ -31,6 +46,17 @@ python3 main.py --names ru.stackoverflow,money.stackexchange

The name should be the URL of the StackExchange site, minus `http(s)://` and `.com`. You can view all available StackExchange dumps [here](https://archive.org/download/stackexchange).

## List available sources in Stack Exchange

This will list all the available sources:

```
python3 main.py --list
```

They will be printed as a list, one source per line, which can be filtered with `grep` and other command-line utilities.


## All Usage Options:

```
Expand Down
32 changes: 26 additions & 6 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,28 @@
import argparse, traceback
from multiprocessing import Pool, cpu_count
from utils import *
from downloader import Stack_Exchange_Downloader
from pairer import QA_Pairer
import argparse
import os
import traceback
import zipfile
from itertools import repeat
from multiprocessing import Pool, cpu_count

import dotenv
from lm_dataformat import Archive
import zipfile

from downloader import Stack_Exchange_Downloader
from pairer import QA_Pairer

dotenv.load_dotenv(override=True)


def download_and_process_single(name, out_format, min_score, max_responses):
try:
name = name.strip().lower()
os.makedirs("dumps", exist_ok=True)
s = Stack_Exchange_Downloader(name)
if name not in s.sites:
similar_entries = list(filter(lambda key: key.startswith(name) or key.endswith(name), s.sites.keys()))
print("StackExchange source not found. Perhaps you meant", similar_entries)
return
path_to_xml = "dumps/{}/Posts.xml".format(name)
if name != "stackoverflow":
path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"])
Expand Down Expand Up @@ -51,6 +60,12 @@ def download_and_process_single(name, out_format, min_score, max_responses):


def main(args):
if args.list:
s = Stack_Exchange_Downloader("all")
print("List of all the sources of StackExchange: ")
print("- "+"\n- ".join(sorted(s.sites.keys())))
return

names = args.names.split(',')
if names[0].strip().lower() == "all":
s = Stack_Exchange_Downloader("all")
Expand All @@ -72,6 +87,10 @@ def main(args):
# Command-line interface definition for the dump downloader/parser.
parser = argparse.ArgumentParser(
    description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw '
                'question-answer pair text dataset for Language Models')

# Boolean flag (defaults to False); main() checks it first and, when set,
# prints the sorted site names and returns without downloading anything.
parser.add_argument('--list', help='list all the available StackExchange sources',
                    required=False, action="store_true")

parser.add_argument('--names', help='names of stackexchanges to download, extract & parse, separated by commas. '
'If "all", will download, extract & parse *every* stackoverflow site',
default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
Expand All @@ -85,6 +104,7 @@ def main(args):
parser.add_argument('--max_responses', help='maximum number of responses (sorted by score) to include for each question. '
'Default 3.', type=int, default=3)
args = parser.parse_args()

main(args)


2 changes: 1 addition & 1 deletion pairer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo
self.ar = archiver

def main(self):
"""iterates through SE xmls and:
"""iterates through SE XMLs and:

- stores PostTypeId="1" with AcceptedAnswerIds / Answers.
- when an AcceptedAnswerId or Answer > min_score is reached, it should:
Expand Down