From 22a907538a90af9f410691a181436c8e7bf40d78 Mon Sep 17 00:00:00 2001
From: suriya
Date: Fri, 10 Feb 2023 12:11:51 -0800
Subject: [PATCH 1/6] fix links for stackoverflow archive

---
 downloader.py | 2 +-
 main.py       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/downloader.py b/downloader.py
index 228b18e..4131ac6 100644
--- a/downloader.py
+++ b/downloader.py
@@ -22,7 +22,7 @@ def parse_sitesmap(self, sitesmap):
             site_name = url.replace(".com", "").replace(".net", "")
             download_link = "https://archive.org/download/stackexchange/" + url + ".7z"
             if url == "stackoverflow.com":
-                download_link = "https://archive.org/download/stackexchange/stackoverflow.com-Posts.7z"
+                download_link = "https://archive.org/download/stackexchange/Stackoverflow.com-Posts.7z"
             self.sites[site_name] = {"url" : url, "download" : download_link}

     def download(self):
diff --git a/main.py b/main.py
index 50a727a..03300d2 100644
--- a/main.py
+++ b/main.py
@@ -18,8 +18,8 @@ def download_and_process_single(name, out_format, min_score, max_responses):
         if name != "stackoverflow":
             path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"])
         else:
-            path_to_7z = "dumps/stackoverflow.com-Posts.7z"
-        out_folder = "out".format(name)
+            path_to_7z = "dumps/Stackoverflow.com-Posts.7z"
+        out_folder = "out"
         os.makedirs(out_folder, exist_ok=True)
         if not os.path.isfile(path_to_7z):
             # download 7z if it's not downloaded already

From 89e1ed1644b2f39032c758316da17f41b8326a20 Mon Sep 17 00:00:00 2001
From: suriya
Date: Mon, 13 Feb 2023 16:03:31 -0800
Subject: [PATCH 2/6] modified parsing to emit a detailed meta dict -- takes
 more space but is more customizable

---
 .gitignore       |  5 +++
 README.md        |  2 ++
 downloader.py    | 12 +++----
 main.py          | 81 +++++++++++++++++++++++++++++------------------
 pairer.py        | 80 +++++++++++++++++++++++++++++++++++------------
 requirements.txt |  3 +-
 utils.py         |  7 +++++
 7 files changed, 131 insertions(+), 59 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2664379
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+__pycache__
+dumps
+out*
+logs*
+test*
diff --git a/README.md b/README.md
index 2f9cb51..2229079 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,6 @@
 # stackexchange_dataset
 A python tool for downloading & processing the [stackexchange data dumps](https://archive.org/details/stackexchange) into a text dataset for Language Models.
+The schema documentation of the dump can be found [here](https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede/2678#2678).

 Download the whole processed dataset [here](https://eaidata.bmk.sh/data/stackexchange_dataset.tar)

@@ -10,6 +11,7 @@ cd stackexchange_dataset
 pip install -r requirements.txt
 ```
 # Usage
+The default output format for parsed data is .zip. To create an lm_dataformat dataset instead, pass the option --out_format=lm_dataformat to the commands below.
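+For example, an illustrative single-site invocation (the flags and defaults are the ones defined in main.py):
+```
+python3 main.py --names 3dprinting.stackexchange --out_format=lm_dataformat
+```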
 To download *every* stackexchange dump & parse to text, simply run
diff --git a/downloader.py b/downloader.py
index 4131ac6..ba51683 100644
--- a/downloader.py
+++ b/downloader.py
@@ -3,7 +3,7 @@
 from utils import *
 import py7zr

-
+curr_dir = os.path.dirname(__file__)

 class Stack_Exchange_Downloader():

     def __init__(self, name):
@@ -28,12 +28,12 @@ def parse_sitesmap(self, sitesmap):
     def download(self):
         if self.name == "all":
             for k in self.sites:
-                command = "wget {} -P dumps".format(self.sites[k]["download"])
+                command = "wget {} -P {}/dumps".format(self.sites[k]["download"], curr_dir)
                 print(command)
                 if os.system(command):
                     print('Download for {} failed!'.format(k))
         else:
-            command = "wget {} -P dumps".format(self.sites[self.name]["download"])
+            command = "wget {} -P {}/dumps".format(self.sites[self.name]["download"], curr_dir)
             print(command)
             if os.system(command):
                 print('Download for {} failed!'.format(self.name))
@@ -45,8 +45,7 @@ def extract(self):
             #                                , mode='r'))
             # archive.extractall()
             # archive.close()
-            command = "py7zr x dumps/{} dumps/{}".format(self.sites[k]["download"].replace("https://archive.org/download/stackexchange/", ""),
-                                                         k)
+            command = "py7zr x {}/dumps/{} {}/dumps/{}".format(curr_dir, self.sites[k]["download"].replace("https://archive.org/download/stackexchange/", ""), curr_dir, k)
             print(command)
             if os.system(command):
                 print('Extraction for {} failed!'.format(k))
@@ -56,8 +55,7 @@ def extract(self):
             #                                , mode='r'))
             # archive.extractall()
             # archive.close()
-            command = "py7zr x dumps/{} dumps/{}".format(self.sites[self.name]["download"].replace("https://archive.org/download/stackexchange/", ""),
-                                                         self.name)
+            command = "py7zr x {}/dumps/{} {}/dumps/{}".format(curr_dir, self.sites[self.name]["download"].replace("https://archive.org/download/stackexchange/", ""), curr_dir, self.name)
             print(command)
             if os.system(command):
                 print('Extraction for {} failed!'.format(self.name))
diff --git a/main.py b/main.py
index 03300d2..183d5d8 100644
--- a/main.py
+++ b/main.py
@@ -7,45 +7,52 @@
 from itertools import repeat
 from lm_dataformat import Archive
 import zipfile
+import os

-
+curr_dir = os.path.dirname(__file__)
 def download_and_process_single(name, out_format, min_score, max_responses):
     try:
         name = name.strip().lower()
-        os.makedirs("dumps", exist_ok=True)
+        os.makedirs("{}/dumps".format(curr_dir), exist_ok=True)
         s = Stack_Exchange_Downloader(name)
-        path_to_xml = "dumps/{}/Posts.xml".format(name)
+        # *.7z files are downloaded from https://archive.org/download/stackexchange/
         if name != "stackoverflow":
-            path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"])
+            path_to_7z = "{}/dumps/{}.7z".format(curr_dir, s.sites[name]["url"])
         else:
-            path_to_7z = "dumps/Stackoverflow.com-Posts.7z"
-        out_folder = "out"
-        os.makedirs(out_folder, exist_ok=True)
+            path_to_7z = "{}/dumps/Stackoverflow.com-Posts.7z".format(curr_dir)
         if not os.path.isfile(path_to_7z):
             # download 7z if it's not downloaded already
             s.download()
+
+        # *.xml files are extracted from *.7z files using py7zr
+        path_to_xml = "{}/dumps/{}/Posts.xml".format(curr_dir, name)
         if not os.path.isfile(path_to_xml):
             # extract 7z if it's not extracted already
             s.extract()
+
+        out_folder = "{}/out".format(curr_dir)
+        os.makedirs(out_folder, exist_ok=True)
+        os.makedirs("{}/samples".format(out_folder), exist_ok=True)
         if out_format == "lm_dataformat":
             archiver = Archive(out_folder)
         elif out_format == "zip":
             archiver = zipfile.ZipFile('{}/{}.zip'.format(out_folder, name), 'a')
         else:
             archiver = None
-        qa = QA_Pairer(path_to_xml, name=name, 
out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses)
+
+        qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses)
         qa.main()
         if out_format == "lm_dataformat":
             archiver.commit(name)
         elif out_format == "zip":
             archiver.close()
-        try:
-            os.remove(path_to_7z)
-        except FileNotFoundError:
-            print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
-        filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]
-        for f in filelist:
-            os.remove(os.path.join("dumps/{}".format(name), f))
+        # try:
+        #     os.remove(path_to_7z)
+        # except FileNotFoundError:
+        #     print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
+        # filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]
+        # for f in filelist:
+        #     os.remove(os.path.join("dumps/{}".format(name), f))
     except:
         traceback.print_exc()

@@ -57,9 +64,8 @@ def main(args):
         names = []
         for k in s.sites:
             names.append(k)
-        # bring stackoverflow to the front so it is always processed first, since it's the largest
-        if "stackoverflow" in names:
-            names.insert(0, names.pop(names.index("stackoverflow")))
+        print('Removing stackoverflow from the list of sites to process. Process it separately.')
+        names.pop(names.index("stackoverflow"))
     print('Downloading and processing stackexchange dumps for {}'.format(names))
     # Download & Process
     # init pool with as many CPUs as available
@@ -70,20 +76,31 @@ def main(args):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw '
-                    'question-answer pair text dataset for Language Models')
-    parser.add_argument('--names', help='names of stackexchanges to download, extract & parse, separated by commas. '
-                                        'If "all", will download, extract & parse *every* stackoverflow site',
-                        default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
-                        type=str)
-    parser.add_argument('--out_format', help='format of out file - if you are processing everything this will need to be '
-                                             'lm_dataformat, as you will run into number of files per directory limits.',
-                        default="zip",
-                        type=str)
-    parser.add_argument('--min_score', help='minimum score of a response in order to be included in the dataset. Default 3.',
-                        type=int, default=3)
-    parser.add_argument('--max_responses', help='maximum number of responses (sorted by score) to include for each question. '
-                                                'Default 3.', type=int, default=3)
+        description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps from xml into a raw question-answer pair text dataset for Language Models')
+    parser.add_argument(
+        '--names',
+        help='names of stackexchanges to download, extract & parse, separated by commas. If "all", will download, extract & parse *every* stackexchange site',
+        default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
+        type=str
+    )
+    parser.add_argument(
+        '--out_format',
+        help='format of the output file - if you are processing everything this will need to be lm_dataformat, as you will otherwise run into per-directory file count limits.',
+        default="zip",
+        type=str
+    )
+    parser.add_argument(
+        '--min_score',
+        help='minimum score of a response in order to be included in the dataset. 
Default 3.',
+        type=int,
+        default=3
+    )
+    parser.add_argument(
+        '--max_responses',
+        help='maximum number of responses (sorted by score) to include for each question. Default 100.',
+        type=int,
+        default=100
+    )

     args = parser.parse_args()
     main(args)
diff --git a/pairer.py b/pairer.py
index 880bee7..b8c7ca0 100644
--- a/pairer.py
+++ b/pairer.py
@@ -1,6 +1,6 @@
 import traceback
 import xml.etree.ElementTree as etree
-from collections import defaultdict
+from collections import defaultdict, OrderedDict
 from bs4 import BeautifulSoup
 from tqdm import tqdm
 from utils import *
@@ -12,7 +12,7 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo
         """Makes a text dataset from StackExchange dumps"""
         self.xml_path = xml_path
         if name is None:
-            self.name = os.path.dirname(xml_path).replace("dumps/", "")
+            self.name = os.path.basename(os.path.dirname(xml_path))
         else:
             self.name = name
         # dict to save questions
@@ -27,6 +27,11 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo
         if out_format in ["lm_dataformat", "zip"]:
             assert archiver is not None
             self.ar = archiver
+        self.sample = True
+        self.num_discarded_answers = 0
+        self.num_discarded_questions = 0
+        self.num_questions = 0
+        self.num_answers = 0

     def main(self):
         """iterates through SE xmls and:
@@ -40,26 +45,34 @@ def main(self):
         """
         os.makedirs(self.out_folder, exist_ok=True)
-        for event, elem in tqdm(etree.iterparse(self.xml_path, events=('end',)), desc="Parsing {} XML file".format(self.name)):
+        for event, elem in tqdm(etree.iterparse(self.xml_path, events=('end',)), desc="Parsing {} XML file".format(self.name), disable=True):
             if elem.tag == "row":
                 try:
                     attribs = defaultdict(lambda: None, elem.attrib)
+                    # checks if PostTypeId=1
                     if is_question(attribs):
                         if has_answers(attribs):
+                            # trim post data to ['Id', 'Body', 'Title', 'Tags', 'AnswerCount', 'AcceptedAnswerId', 'PostTypeId']
+                            # other potentially useful keys: ['CreationDate', 'Score', 'ViewCount', 'OwnerUserId', 'LastActivityDate', 'CommentCount', 'ContentLicense']
                             trim_attribs(attribs, "question")
                             self.questions[attribs["Id"]] = attribs
                         else:
                             # if the question has no answers, discard it
+                            self.num_discarded_questions += 1
                             continue
+                    # checks if PostTypeId=2
                     elif is_answer(attribs):
                         # if is accepted answer, append answer Body to relevant questions "AcceptedAnswer" field
-                        # if the answer's score > min_score
-                        # append the answer to the relevant question's OtherAnswers dict
+                        # if the answer's score > min_score append the answer to the relevant question's OtherAnswers dict
                         self.add_answer(attribs)
                         self.check_complete(attribs)
                     elem.clear()
                 except:
                     traceback.print_exc()
+        print("##### Stats #####")
+        print(f"num_questions={self.num_questions}, num_discarded_questions={self.num_discarded_questions}")
+        print(f"num_answers={self.num_answers}, num_discarded_answers={self.num_discarded_answers}")
+        print("###### End ######")

     def is_above_threshold(self, a_attribs):
         """
@@ -77,11 +90,9 @@ def is_above_threshold(self, a_attribs):
     def add_answer(self, a_attribs):
         """
         Adds answer to its parent question in self.questions if it's either an accepted answer or above self.min_score.
-        If answer is an accepted answer, it gets appended to the AcceptedAnswer field, otherwise it gets appended to
-        OtherAnswers.
+        If answer is an accepted answer, it gets appended to the AcceptedAnswer field, otherwise it gets appended to OtherAnswers.

-        Also increments the question's 'ParsedAnswers' field. 
When ParsedAnswers = AnswerCount, the question is deleted - from memory and saved to a text file. + Also increments the question's 'ParsedAnswers' field. When ParsedAnswers = AnswerCount, the question is deleted from memory and saved to a text file. :param a_attribs: Answer's attribute dict """ @@ -99,6 +110,7 @@ def add_answer(self, a_attribs): else: self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 else: + self.num_discarded_answers += 1 self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 def check_complete(self, a_attribs): @@ -113,7 +125,17 @@ def check_complete(self, a_attribs): if int(parent["ParsedAnswers"]) == int(parent['AnswerCount']): keys_to_del.append(a_attribs["ParentId"]) if parent["Answers"] is not None and len(parent["Answers"]) > 0: - out_name = "{}_{}.txt".format(self.name, parent["Id"].zfill(10)) + out_tags = tags_as_list(parent["Tags"]) + out_name = "{}_{}_{}.txt".format(self.name, parent["Id"].zfill(10),"_".join(out_tags)) + out_dict = dict( + name=out_name, + tags=out_tags, + title=BeautifulSoup(parent["Title"], "html.parser").get_text(), + question=BeautifulSoup(parent["Body"], "html.parser").get_text(), + answers=[], + answers_scores=[], + ) + # fmt: "Q:\n\n{question.title}\n\n{question.body}\n\nA:\n\n{answer.body\n\n for answer.sortby(score)}" out_str = "" out_str += 'Q:\n\n' if parent["Title"] is not None: @@ -122,14 +144,17 @@ def check_complete(self, a_attribs): out_str += '{}\n\n'.format(BeautifulSoup(parent["Body"], "html.parser").get_text()) if parent["Answers"] is not None: key_score_dict = {} - for k, a in parent["Answers"].items(): - key_score_dict[k] = int(a["Score"]) - key_score_dict = {k: v for k, v in sorted(key_score_dict.items(), key=lambda item: item[1], reverse=True)} + for ans_id, ans_attrib in parent["Answers"].items(): + key_score_dict[ans_id] = int(ans_attrib["Score"]) + key_score_dict = OrderedDict((ans_id, score) for ans_id, score in sorted(key_score_dict.items(), key=lambda item: item[1], reverse=True)) count = 0 - for k in key_score_dict: + for ans_id, score in key_score_dict.items(): if count >= self.max_responses: break - out_str += 'A:\n\n{}\n\n'.format(BeautifulSoup(parent["Answers"][k]["Body"], "html.parser").get_text()) + ans_text = BeautifulSoup(parent["Answers"][ans_id]["Body"], "html.parser").get_text() + out_str += 'A:\n\n{}\n\n'.format(ans_text) + out_dict['answers'].append(ans_text) + out_dict['answers_scores'].append(score) count += 1 if self.out_format == "txt": with open("{}/{}".format(self.out_folder, out_name), 'w') as f: @@ -144,10 +169,27 @@ def check_complete(self, a_attribs): self.ar.writestr(out_name, filter_newlines(handle_unicode_errors(out_str))) elif self.out_format == "lm_dataformat": try: - self.ar.add_data(filter_newlines(out_str), meta={ - 'name': out_name}) + self.ar.add_data( + filter_newlines(out_str), + meta=out_dict + ) except: - self.ar.add_data(filter_newlines(handle_unicode_errors(out_str)), meta={ - 'name': out_name}) + self.ar.add_data( + filter_newlines(handle_unicode_errors(out_str)), + meta=out_dict + ) + if self.sample and len(out_dict['answers'])>=3: + with open("{}/samples/sample_{}".format(self.out_folder, out_name), 'w') as f: + try: + f.write(filter_newlines(out_str)) + except: + f.write(filter_newlines(handle_unicode_errors(out_str))) + self.sample = False + self.num_questions += 1 + self.num_answers += len(out_dict['answers']) + self.num_discarded_answers += len(key_score_dict)-len(out_dict['answers']) # cases where number of answers is > max responses + else: + # discard 
questions with no accepted answers + self.num_discarded_questions += 1 for key in keys_to_del: self.questions.pop(key, None) diff --git a/requirements.txt b/requirements.txt index 9d3cf3e..3833202 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ lxml py7zr tqdm lm-dataformat -jsonlines \ No newline at end of file +jsonlines +requests \ No newline at end of file diff --git a/utils.py b/utils.py index a6c3dc1..e1f1a04 100644 --- a/utils.py +++ b/utils.py @@ -61,3 +61,10 @@ def trim_attribs(elem_attribs, attrib_type="question"): return new_dict else: raise Exception('Unrecognized attribute type - please specify either question or answer') + +def tags_as_list(tag_str): + tag_str = tag_str.strip(">").strip("<") + tag_str = tag_str.replace("-","_") + tag_list = tag_str.split("><") + tag_list.sort() + return tag_list \ No newline at end of file From 39606c5281de1334429657b13b2891aa0d0b16c3 Mon Sep 17 00:00:00 2001 From: suriya Date: Mon, 13 Feb 2023 17:19:18 -0800 Subject: [PATCH 3/6] filter by tags --- main.py | 17 ++++++++++++----- pairer.py | 5 +++-- utils.py | 11 +++++++++++ 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index 183d5d8..79bcef3 100644 --- a/main.py +++ b/main.py @@ -10,7 +10,7 @@ import os curr_dir = os.path.dirname(__file__) -def download_and_process_single(name, out_format, min_score, max_responses): +def download_and_process_single(name, out_format, min_score, max_responses, chk_tags): try: name = name.strip().lower() os.makedirs("{}/dumps".format(curr_dir), exist_ok=True) @@ -40,7 +40,7 @@ def download_and_process_single(name, out_format, min_score, max_responses): else: archiver = None - qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses) + qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses, chk_tags=chk_tags) qa.main() if out_format == "lm_dataformat": archiver.commit(name) @@ -58,20 +58,21 @@ def download_and_process_single(name, out_format, min_score, max_responses): def main(args): - names = args.names.split(',') + names = args.names.split(',') + tags = args.tags.split(',') if len(args.tags) else [] if names[0].strip().lower() == "all": s = Stack_Exchange_Downloader("all") names = [] for k in s.sites: names.append(k) print('Removing stackoverflow from the list of sites to process. 
Process it separately.')
-        names.pop(names.index("stackoverflow"))
+        names.pop(names.index("stackoverflow"))
     print('Downloading and processing stackexchange dumps for {}'.format(names))
     # Download & Process
     # init pool with as many CPUs as available
     cpu_no = cpu_count() - 1
     p = Pool(cpu_no)
-    p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses)))
+    p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses)), repeat(tags))


 if __name__ == "__main__":
@@ -101,6 +102,12 @@ def main(args):
         type=int,
         default=100
     )
+    parser.add_argument(
+        '--tags',
+        help='list of tags to include.',
+        type=str,
+        default=""
+    )

     args = parser.parse_args()
     main(args)
diff --git a/pairer.py b/pairer.py
index b8c7ca0..74da416 100644
--- a/pairer.py
+++ b/pairer.py
@@ -8,7 +8,7 @@ class QA_Pairer():

-    def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_responses=3, out_format="txt", archiver=None):
+    def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_responses=3, out_format="txt", archiver=None, chk_tags=""):
         """Makes a text dataset from StackExchange dumps"""
         self.xml_path = xml_path
         if name is None:
             self.name = os.path.basename(os.path.dirname(xml_path))
@@ -27,6 +27,7 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo
         if out_format in ["lm_dataformat", "zip"]:
             assert archiver is not None
             self.ar = archiver
+        self.chk_tags = chk_tags
         self.sample = True
         self.num_discarded_answers = 0
         self.num_discarded_questions = 0
@@ -52,7 +53,7 @@ def main(self):
                     attribs = defaultdict(lambda: None, elem.attrib)
                     # checks if PostTypeId=1
                     if is_question(attribs):
-                        if has_answers(attribs):
+                        if has_answers(attribs) and match_tags_or(attribs, self.chk_tags):
                             # trim post data to ['Id', 'Body', 'Title', 'Tags', 'AnswerCount', 'AcceptedAnswerId', 'PostTypeId']
                             # other potentially useful keys: ['CreationDate', 'Score', 'ViewCount', 'OwnerUserId', 'LastActivityDate', 'CommentCount', 'ContentLicense']
                             trim_attribs(attribs, "question")
                             self.questions[attribs["Id"]] = attribs
diff --git a/utils.py b/utils.py
index e1f1a04..6380611 100644
--- a/utils.py
+++ b/utils.py
@@ -44,6 +44,17 @@ def has_answers(elem_attribs):
             return True
     return False

+def match_tags_or(elem_attribs, chk_tags):
+    assert is_question(elem_attribs), "Must be a question to match tags"
+    if not chk_tags:
+        return True
+    if elem_attribs["Tags"] is not None:
+        elem_tags = tags_as_list(elem_attribs["Tags"])
+        for tag in chk_tags:
+            if tag in elem_tags:
+                return True
+    return False
+
 def trim_attribs(elem_attribs, attrib_type="question"):
     """deletes non-useful data from attribs dict for questions / answers, returns remaining"""

From d9fffd8825e3c4f6d3b6d184e7b697296fd23bdd Mon Sep 17 00:00:00 2001
From: suriya
Date: Mon, 13 Feb 2023 23:57:02 -0800
Subject: [PATCH 4/6] minor edits to tags filtering

---
 main.py   | 5 +++--
 pairer.py | 6 ++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index 79bcef3..be1f53f 100644
--- a/main.py
+++ b/main.py
@@ -31,6 +31,7 @@ def download_and_process_single(name, out_format, min_score, max_responses, chk_tags):
         s.extract()

         out_folder = "{}/out".format(curr_dir)
+        # out_folder = "{}/../../../suriyagwu/stackexchange/stackoverflow_python".format(curr_dir)
         os.makedirs(out_folder, exist_ok=True)
         os.makedirs("{}/samples".format(out_folder), exist_ok=True)
         if out_format == "lm_dataformat":
             archiver = Archive(out_folder)
         elif out_format == "zip":
             archiver 
= zipfile.ZipFile('{}/{}.zip'.format(out_folder, name), 'a') else: archiver = None - + name = name+"_"+"_".join(chk_tags) if len(chk_tags) else name qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses, chk_tags=chk_tags) qa.main() if out_format == "lm_dataformat": @@ -72,7 +73,7 @@ def main(args): # init pool with as many CPUs as available cpu_no = cpu_count() - 1 p = Pool(cpu_no) - p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses)), repeat(tags)) + p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses), repeat(tags))) if __name__ == "__main__": diff --git a/pairer.py b/pairer.py index 74da416..e0b6758 100644 --- a/pairer.py +++ b/pairer.py @@ -46,10 +46,12 @@ def main(self): """ os.makedirs(self.out_folder, exist_ok=True) + i=0 for event, elem in tqdm(etree.iterparse(self.xml_path, events=('end',)), desc="Parsing {} XML file".format(self.name), disable=True): if elem.tag == "row": try: attribs = defaultdict(lambda: None, elem.attrib) + i=i+1 # checks if PostTypeId=1 if is_question(attribs): if has_answers(attribs) and match_tags_or(attribs, self.chk_tags): @@ -98,7 +100,7 @@ def add_answer(self, a_attribs): :param a_attribs: Answer's attribute dict """ assert is_answer(a_attribs), "Must be an answer to add to parent" - if a_attribs is not None and self.questions[a_attribs["ParentId"]] is not None: + if a_attribs is not None and self.questions.get(a_attribs["ParentId"], None) is not None: if is_accepted_answer(a_attribs, self.questions[a_attribs["ParentId"]]): self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer") self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 @@ -120,7 +122,7 @@ def check_complete(self, a_attribs): removes from dict and prints to file. 
""" keys_to_del = [] - parent = self.questions[a_attribs["ParentId"]] + parent = self.questions.get(a_attribs["ParentId"], None) if a_attribs is not None and parent is not None: if parent["AnswerCount"] is not None and parent["ParsedAnswers"] is not None: if int(parent["ParsedAnswers"]) == int(parent['AnswerCount']): From 898c25563a652810a5a8bdf7f90db81cee28a1f0 Mon Sep 17 00:00:00 2001 From: suriya Date: Tue, 14 Feb 2023 12:59:59 -0800 Subject: [PATCH 5/6] refactor filter logic and better stats --- main.py | 16 ++++------------ main_filter.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ pairer.py | 49 ++++++++++++++++++++++++++++++++++++++----------- utils.py | 3 +++ 4 files changed, 90 insertions(+), 23 deletions(-) create mode 100644 main_filter.py diff --git a/main.py b/main.py index be1f53f..2d9d4f5 100644 --- a/main.py +++ b/main.py @@ -10,7 +10,7 @@ import os curr_dir = os.path.dirname(__file__) -def download_and_process_single(name, out_format, min_score, max_responses, chk_tags): +def download_and_process_single(name, out_format, min_score, max_responses): try: name = name.strip().lower() os.makedirs("{}/dumps".format(curr_dir), exist_ok=True) @@ -31,7 +31,7 @@ def download_and_process_single(name, out_format, min_score, max_responses, chk_ s.extract() out_folder = "{}/out".format(curr_dir) - # out_folder = "{}/../../../suriyagwu/stackexchange/stackoverflow_python".format(curr_dir) + # out_folder = "{}/../../../suriyagwu/stackexchange/stackoverflow".format(curr_dir) os.makedirs(out_folder, exist_ok=True) os.makedirs("{}/samples".format(out_folder), exist_ok=True) if out_format == "lm_dataformat": @@ -40,8 +40,7 @@ def download_and_process_single(name, out_format, min_score, max_responses, chk_ archiver = zipfile.ZipFile('{}/{}.zip'.format(out_folder, name), 'a') else: archiver = None - name = name+"_"+"_".join(chk_tags) if len(chk_tags) else name - qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses, chk_tags=chk_tags) + qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses) qa.main() if out_format == "lm_dataformat": archiver.commit(name) @@ -60,7 +59,6 @@ def download_and_process_single(name, out_format, min_score, max_responses, chk_ def main(args): names = args.names.split(',') - tags = args.tags.split(',') if len(args.tags) else [] if names[0].strip().lower() == "all": s = Stack_Exchange_Downloader("all") names = [] @@ -73,7 +71,7 @@ def main(args): # init pool with as many CPUs as available cpu_no = cpu_count() - 1 p = Pool(cpu_no) - p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses), repeat(tags))) + p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses))) if __name__ == "__main__": @@ -103,12 +101,6 @@ def main(args): type=int, default=100 ) - parser.add_argument( - '--tags', - help='list of tags to include.', - type=str, - default="" - ) args = parser.parse_args() main(args) diff --git a/main_filter.py b/main_filter.py new file mode 100644 index 0000000..0c9108f --- /dev/null +++ b/main_filter.py @@ -0,0 +1,45 @@ +''' +Filter the stackoverflow dataset generated by main.py in lm_dataformat +Edit filter_logic to implement filtering of any meta dict sample +''' +from lm_dataformat import Reader, Archive +import os +import glob + 
+
+curr_dir = os.path.dirname(__file__)
+out_folder = "{}/../../../suriyagwu/stackexchange/stackoverflow".format(curr_dir)
+full_data_file = "{}/data_*stackoverflow.jsonl.zst".format(out_folder)
+
+class FilterMeta:
+    # currently implements an OR over the filter tags
+    def __init__(self):
+        self.filter_tags = ['python']
+        self.file_suffix = "_"+"_".join(self.filter_tags)
+
+    def filter_logic(self, meta):
+        for tag in self.filter_tags:
+            if tag in meta['tags']:
+                return True
+        return False
+
+new_archive = Archive(out_folder)
+filter_meta = FilterMeta()
+num_accepted = 0
+num_rejected = 0
+for filename in glob.glob(full_data_file):
+    # create a reader object for the file
+    print("Filtering {}".format(filename))
+    rdr = Reader(filename)
+    # iterate over the samples in the file
+    for doc, meta in rdr.stream_data(get_meta=True):
+        if filter_meta.filter_logic(meta):
+            # add the sample to the new archive
+            new_archive.add_data(doc, meta)
+            num_accepted += 1
+        else:
+            num_rejected += 1
+
+filtered_data_file_name = "stack_overflow"+filter_meta.file_suffix
+new_archive.commit(filtered_data_file_name)
+print("##### Stats #####")
+print(f"num_accepted: {num_accepted}, num_rejected: {num_rejected}")
diff --git a/pairer.py b/pairer.py
index e0b6758..2598630 100644
--- a/pairer.py
+++ b/pairer.py
@@ -8,7 +8,7 @@ class QA_Pairer():

-    def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_responses=3, out_format="txt", archiver=None, chk_tags=""):
+    def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_responses=3, out_format="txt", archiver=None):
         """Makes a text dataset from StackExchange dumps"""
         self.xml_path = xml_path
         if name is None:
             self.name = os.path.basename(os.path.dirname(xml_path))
@@ -27,8 +27,9 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo
         if out_format in ["lm_dataformat", "zip"]:
             assert archiver is not None
             self.ar = archiver
-        self.chk_tags = chk_tags
         self.sample = True
+        self.num_posts = 0
+        self.num_nonQA_posts = 0
         self.num_discarded_answers = 0
         self.num_discarded_questions = 0
         self.num_questions = 0
         self.num_answers = 0
@@ -45,16 +46,15 @@ def main(self):
         """
-        os.makedirs(self.out_folder, exist_ok=True) 
-        i=0
+        os.makedirs(self.out_folder, exist_ok=True) 
         for event, elem in tqdm(etree.iterparse(self.xml_path, events=('end',)), desc="Parsing {} XML file".format(self.name), disable=True):
             if elem.tag == "row":
                 try:
                     attribs = defaultdict(lambda: None, elem.attrib)
-                    i=i+1
+                    self.num_posts += 1
                     # checks if PostTypeId=1
                     if is_question(attribs):
-                        if has_answers(attribs) and match_tags_or(attribs, self.chk_tags):
+                        if has_answers(attribs):
                             # trim post data to ['Id', 'Body', 'Title', 'Tags', 'AnswerCount', 'AcceptedAnswerId', 'PostTypeId']
                             # other potentially useful keys: ['CreationDate', 'Score', 'ViewCount', 'OwnerUserId', 'LastActivityDate', 'CommentCount', 'ContentLicense']
                             trim_attribs(attribs, "question")
                             self.questions[attribs["Id"]] = attribs
@@ -69,12 +69,16 @@ def main(self):
                     # checks if PostTypeId=2
                     elif is_answer(attribs):
                         # if is accepted answer, append answer Body to relevant questions "AcceptedAnswer" field
                         # if the answer's score > min_score append the answer to the relevant question's OtherAnswers dict
                         self.add_answer(attribs)
                         self.check_complete(attribs)
+                    else:
+                        self.num_nonQA_posts += 1
                     elem.clear()
                 except:
                     traceback.print_exc()
+
         print("##### Stats #####")
         print(f"num_questions={self.num_questions}, num_discarded_questions={self.num_discarded_questions}")
         print(f"num_answers={self.num_answers}, num_discarded_answers={self.num_discarded_answers}")
+        print(f"num_posts={self.num_posts}, num_nonQA_posts={self.num_nonQA_posts}, unprocessed_questions={len(self.questions.items())}")
         print("###### End ######")

     def is_above_threshold(self, 
a_attribs): @@ -100,7 +104,7 @@ def add_answer(self, a_attribs): :param a_attribs: Answer's attribute dict """ assert is_answer(a_attribs), "Must be an answer to add to parent" - if a_attribs is not None and self.questions.get(a_attribs["ParentId"], None) is not None: + if self.questions.get(a_attribs["ParentId"], None) is not None: if is_accepted_answer(a_attribs, self.questions[a_attribs["ParentId"]]): self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer") self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 @@ -112,9 +116,15 @@ def add_answer(self, a_attribs): self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 else: self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 - else: + else: self.num_discarded_answers += 1 + # print("Discarded answer with score {}".format(a_attribs["Score"]), self.num_discarded_answers) + self.questions[a_attribs["ParentId"]]["NonAnswers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer") self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 + else: + parentid = a_attribs["ParentId"] + self.num_discarded_answers += 1 + # print(f"ParentId {parentid} not found", self.num_discarded_answers) def check_complete(self, a_attribs): """ @@ -124,10 +134,12 @@ def check_complete(self, a_attribs): keys_to_del = [] parent = self.questions.get(a_attribs["ParentId"], None) if a_attribs is not None and parent is not None: - if parent["AnswerCount"] is not None and parent["ParsedAnswers"] is not None: + if parent["AnswerCount"] and parent["ParsedAnswers"]: if int(parent["ParsedAnswers"]) == int(parent['AnswerCount']): keys_to_del.append(a_attribs["ParentId"]) - if parent["Answers"] is not None and len(parent["Answers"]) > 0: + ## Filter change: still use quesions with no accepted answers + # if parent["Answers"] is not None and len(parent["Answers"]) > 0: + if 1: out_tags = tags_as_list(parent["Tags"]) out_name = "{}_{}_{}.txt".format(self.name, parent["Id"].zfill(10),"_".join(out_tags)) out_dict = dict( @@ -137,6 +149,8 @@ def check_complete(self, a_attribs): question=BeautifulSoup(parent["Body"], "html.parser").get_text(), answers=[], answers_scores=[], + non_answers=[], + non_answers_scores=[] ) # fmt: "Q:\n\n{question.title}\n\n{question.body}\n\nA:\n\n{answer.body\n\n for answer.sortby(score)}" out_str = "" @@ -159,6 +173,17 @@ def check_complete(self, a_attribs): out_dict['answers'].append(ans_text) out_dict['answers_scores'].append(score) count += 1 + + if parent["NonAnswers"] is not None: + nona_key_score_dict = {} + for ans_id, ans_attrib in parent["NonAnswers"].items(): + nona_key_score_dict[ans_id] = int(ans_attrib["Score"]) + nona_key_score_dict = OrderedDict((ans_id, score) for ans_id, score in sorted(nona_key_score_dict.items(), key=lambda item: item[1], reverse=True)) + for ans_id, score in nona_key_score_dict.items(): + ans_text = BeautifulSoup(parent["NonAnswers"][ans_id]["Body"], "html.parser").get_text() + out_dict['non_answers'].append(ans_text) + out_dict['non_answers_scores'].append(score) + if self.out_format == "txt": with open("{}/{}".format(self.out_folder, out_name), 'w') as f: try: @@ -190,7 +215,9 @@ def check_complete(self, a_attribs): self.sample = False self.num_questions += 1 self.num_answers += len(out_dict['answers']) - self.num_discarded_answers += len(key_score_dict)-len(out_dict['answers']) # cases where number of answers is > max responses + if len(key_score_dict)-len(out_dict['answers'])>0: + self.num_discarded_answers += 
len(key_score_dict)-len(out_dict['answers']) # cases where number of answers is > max responses + print(f"Discarding {len(key_score_dict)-len(out_dict['answers'])} of {len(key_score_dict)} answers", self.num_discarded_answers) else: # discard questions with no accepted answers self.num_discarded_questions += 1 diff --git a/utils.py b/utils.py index 6380611..1587aae 100644 --- a/utils.py +++ b/utils.py @@ -64,6 +64,9 @@ def trim_attribs(elem_attribs, attrib_type="question"): [elem_attribs.pop(x, None) for x in to_delete] elem_attribs["ParsedAnswers"] = 0 elem_attribs["Answers"] = {} + elem_attribs["NonAnswers"] = {} + if 'AnswerCount' not in elem_attribs.keys(): + elem_attribs['AnswerCount']=-1 elif attrib_type == "answer": to_keep = ['Id', 'Body', 'Score'] new_dict = {} From 875044f510049e2ca227a0dc71243438ee569180 Mon Sep 17 00:00:00 2001 From: suriya Date: Fri, 17 Feb 2023 18:40:31 -0800 Subject: [PATCH 6/6] minor edits --- main.py | 8 +++++++- main_filter.py | 2 +- pairer.py | 14 ++++++-------- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/main.py b/main.py index 2d9d4f5..658f44b 100644 --- a/main.py +++ b/main.py @@ -8,6 +8,7 @@ from lm_dataformat import Archive import zipfile import os +import json curr_dir = os.path.dirname(__file__) def download_and_process_single(name, out_format, min_score, max_responses): @@ -31,9 +32,10 @@ def download_and_process_single(name, out_format, min_score, max_responses): s.extract() out_folder = "{}/out".format(curr_dir) - # out_folder = "{}/../../../suriyagwu/stackexchange/stackoverflow".format(curr_dir) + # out_folder = "{}/../../../suriyagwu/stackexchange/all".format(curr_dir) os.makedirs(out_folder, exist_ok=True) os.makedirs("{}/samples".format(out_folder), exist_ok=True) + os.makedirs("{}/misc".format(out_folder), exist_ok=True) if out_format == "lm_dataformat": archiver = Archive(out_folder) elif out_format == "zip": @@ -46,6 +48,10 @@ def download_and_process_single(name, out_format, min_score, max_responses): archiver.commit(name) elif out_format == "zip": archiver.close() + + # save qa.questions dictionary data to a file + json.dump(qa.questions, open("{}/misc/{}_unprocessed_questions.json".format(out_folder, name), "w"), indent=4) + # try: # os.remove(path_to_7z) # except FileNotFoundError: diff --git a/main_filter.py b/main_filter.py index 0c9108f..d4f7e0e 100644 --- a/main_filter.py +++ b/main_filter.py @@ -39,7 +39,7 @@ def filter_logic(self,meta): else: num_rejected += 1 -filtered_data_file_name = "stack_overflow"+filter_meta.file_suffix +filtered_data_file_name = "stackoverflow"+filter_meta.file_suffix new_archive.commit(filtered_data_file_name) print("##### Stats #####") print(f"num_accepted: {num_accepted}, num_rejected: {num_rejected}") diff --git a/pairer.py b/pairer.py index 2598630..227e501 100644 --- a/pairer.py +++ b/pairer.py @@ -78,7 +78,10 @@ def main(self): print("##### Stats #####") print(f"num_questions={self.num_questions}, num_discarded_questions={self.num_discarded_questions}") print(f"num_answers={self.num_answers}, num_discarded_answers={self.num_discarded_answers}") - print(f"num_posts={self.num_posts}, num_nonQA_posts={self.num_nonQA_posts}, unprocessed_questions={len(self.questions.items())}") + print(f"num_posts={self.num_posts}, num_nonQA_posts={self.num_nonQA_posts}") + unprocessed_questions = len(self.questions.items()) + unprocessed_answers = sum([len(q_att['Answers'].items()) for q, q_att in self.questions.items()]) + print(f"unprocessed_questions={unprocessed_questions}, 
unprocessed_answers={unprocessed_answers}") print("###### End ######") def is_above_threshold(self, a_attribs): @@ -109,13 +112,8 @@ def add_answer(self, a_attribs): self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer") self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 elif self.is_above_threshold(a_attribs): - if a_attribs["Id"] is not None: - parent = self.questions[a_attribs["ParentId"]] - if parent is not None: - self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer") - self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 - else: - self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 + self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer") + self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 else: self.num_discarded_answers += 1 # print("Discarded answer with score {}".format(a_attribs["Score"]), self.num_discarded_answers)