From 22a907538a90af9f410691a181436c8e7bf40d78 Mon Sep 17 00:00:00 2001
From: suriya
Date: Fri, 10 Feb 2023 12:11:51 -0800
Subject: [PATCH 1/6] fix links for stackoverflow archive

---
 downloader.py | 2 +-
 main.py       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/downloader.py b/downloader.py
index 228b18e..4131ac6 100644
--- a/downloader.py
+++ b/downloader.py
@@ -22,7 +22,7 @@ def parse_sitesmap(self, sitesmap):
             site_name = url.replace(".com", "").replace(".net", "")
             download_link = "https://archive.org/download/stackexchange/" + url + ".7z"
             if url == "stackoverflow.com":
-                download_link = "https://archive.org/download/stackexchange/stackoverflow.com-Posts.7z"
+                download_link = "https://archive.org/download/stackexchange/Stackoverflow.com-Posts.7z"
             self.sites[site_name] = {"url" : url, "download" : download_link}

     def download(self):
diff --git a/main.py b/main.py
index 50a727a..03300d2 100644
--- a/main.py
+++ b/main.py
@@ -18,8 +18,8 @@ def download_and_process_single(name, out_format, min_score, max_responses):
         if name != "stackoverflow":
             path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"])
         else:
-            path_to_7z = "dumps/stackoverflow.com-Posts.7z"
-        out_folder = "out".format(name)
+            path_to_7z = "dumps/Stackoverflow.com-Posts.7z"
+        out_folder = "out"
         os.makedirs(out_folder, exist_ok=True)
         if not os.path.isfile(path_to_7z):
             # download 7z if it's not downloaded already

From 89e1ed1644b2f39032c758316da17f41b8326a20 Mon Sep 17 00:00:00 2001
From: suriya
Date: Mon, 13 Feb 2023 16:03:31 -0800
Subject: [PATCH 2/6] modified parsing to emit a detailed meta dict -- takes
 more space but is more customizable

---
 .gitignore       |  5 +++
 README.md        |  2 ++
 downloader.py    | 12 +++----
 main.py          | 81 +++++++++++++++++++++++++++++------------------
 pairer.py        | 80 +++++++++++++++++++++++++++++++++++------------
 requirements.txt |  3 +-
 utils.py         |  7 +++++
 7 files changed, 131 insertions(+), 59 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2664379
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+__pycache__
+dumps
+out*
+logs*
+test*
diff --git a/README.md b/README.md
index 2f9cb51..2229079 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,6 @@
 # stackexchange_dataset
 A python tool for downloading & processing the [stackexchange data dumps](https://archive.org/details/stackexchange) into a text dataset for Language Models.
+The schema documentation of the dump can be found [here](https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede/2678#2678).

 Download the whole processed dataset [here](https://eaidata.bmk.sh/data/stackexchange_dataset.tar)

@@ -10,6 +11,7 @@ cd stackexchange_dataset
 pip install -r requirements.txt
 ```
 # Usage
+The default output format for parsed data is .zip. To create an lm_dataformat dataset instead, pass the option --out_format=lm_dataformat to the commands below.
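+For example, an illustrative single-site invocation (the flags and defaults are the ones defined in main.py):
+```
+python3 main.py --names 3dprinting.stackexchange --out_format=lm_dataformat
+```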
 To download *every* stackexchange dump & parse to text, simply run
diff --git a/downloader.py b/downloader.py
index 4131ac6..ba51683 100644
--- a/downloader.py
+++ b/downloader.py
@@ -3,7 +3,7 @@
 from utils import *
 import py7zr

-
+curr_dir = os.path.dirname(__file__)

 class Stack_Exchange_Downloader():

     def __init__(self, name):
@@ -28,12 +28,12 @@ def parse_sitesmap(self, sitesmap):
     def download(self):
         if self.name == "all":
             for k in self.sites:
-                command = "wget {} -P dumps".format(self.sites[k]["download"])
+                command = "wget {} -P {}/dumps".format(self.sites[k]["download"], curr_dir)
                 print(command)
                 if os.system(command):
                     print('Download for {} failed!'.format(k))
         else:
-            command = "wget {} -P dumps".format(self.sites[self.name]["download"])
+            command = "wget {} -P {}/dumps".format(self.sites[self.name]["download"], curr_dir)
             print(command)
             if os.system(command):
                 print('Download for {} failed!'.format(self.name))
@@ -45,8 +45,7 @@ def extract(self):
             #                                , mode='r'))
             # archive.extractall()
             # archive.close()
-            command = "py7zr x dumps/{} dumps/{}".format(self.sites[k]["download"].replace("https://archive.org/download/stackexchange/", ""),
-                                                         k)
+            command = "py7zr x {}/dumps/{} {}/dumps/{}".format(curr_dir, self.sites[k]["download"].replace("https://archive.org/download/stackexchange/", ""), curr_dir, k)
             print(command)
             if os.system(command):
                 print('Extraction for {} failed!'.format(k))
@@ -56,8 +55,7 @@ def extract(self):
             #                                , mode='r'))
             # archive.extractall()
             # archive.close()
-            command = "py7zr x dumps/{} dumps/{}".format(self.sites[self.name]["download"].replace("https://archive.org/download/stackexchange/", ""),
-                                                         self.name)
+            command = "py7zr x {}/dumps/{} {}/dumps/{}".format(curr_dir, self.sites[self.name]["download"].replace("https://archive.org/download/stackexchange/", ""), curr_dir, self.name)
             print(command)
             if os.system(command):
                 print('Extraction for {} failed!'.format(self.name))
diff --git a/main.py b/main.py
index 03300d2..183d5d8 100644
--- a/main.py
+++ b/main.py
@@ -7,45 +7,52 @@
 from itertools import repeat
 from lm_dataformat import Archive
 import zipfile
+import os

-
+curr_dir = os.path.dirname(__file__)
 def download_and_process_single(name, out_format, min_score, max_responses):
     try:
         name = name.strip().lower()
-        os.makedirs("dumps", exist_ok=True)
+        os.makedirs("{}/dumps".format(curr_dir), exist_ok=True)
         s = Stack_Exchange_Downloader(name)
-        path_to_xml = "dumps/{}/Posts.xml".format(name)
+        # *.7z files are downloaded from https://archive.org/download/stackexchange/
         if name != "stackoverflow":
-            path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"])
+            path_to_7z = "{}/dumps/{}.7z".format(curr_dir, s.sites[name]["url"])
         else:
-            path_to_7z = "dumps/Stackoverflow.com-Posts.7z"
-        out_folder = "out"
-        os.makedirs(out_folder, exist_ok=True)
+            path_to_7z = "{}/dumps/Stackoverflow.com-Posts.7z".format(curr_dir)
         if not os.path.isfile(path_to_7z):
             # download 7z if it's not downloaded already
             s.download()
+
+        # *.xml files are extracted from *.7z files using py7zr
+        path_to_xml = "{}/dumps/{}/Posts.xml".format(curr_dir, name)
         if not os.path.isfile(path_to_xml):
             # extract 7z if it's not extracted already
             s.extract()
+
+        out_folder = "{}/out".format(curr_dir)
+        os.makedirs(out_folder, exist_ok=True)
+        os.makedirs("{}/samples".format(out_folder), exist_ok=True)
         if out_format == "lm_dataformat":
             archiver = Archive(out_folder)
         elif out_format == "zip":
             archiver = zipfile.ZipFile('{}/{}.zip'.format(out_folder, name), 'a')
         else:
             archiver = None
-        qa = QA_Pairer(path_to_xml, name=name, 
out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses)
+
+        qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses)
         qa.main()
         if out_format == "lm_dataformat":
             archiver.commit(name)
         elif out_format == "zip":
             archiver.close()
-        try:
-            os.remove(path_to_7z)
-        except FileNotFoundError:
-            print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
-        filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]
-        for f in filelist:
-            os.remove(os.path.join("dumps/{}".format(name), f))
+        # try:
+        #     os.remove(path_to_7z)
+        # except FileNotFoundError:
+        #     print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
+        # filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]
+        # for f in filelist:
+        #     os.remove(os.path.join("dumps/{}".format(name), f))
     except:
         traceback.print_exc()

@@ -57,9 +64,8 @@ def main(args):
         names = []
         for k in s.sites:
             names.append(k)
-        # bring stackoverflow to the front so it is always processed first, since it's the largest
-        if "stackoverflow" in names:
-            names.insert(0, names.pop(names.index("stackoverflow")))
+        print('Removing stackoverflow from the list of sites to process. Process it separately.')
+        names.pop(names.index("stackoverflow"))
     print('Downloading and processing stackexchange dumps for {}'.format(names))
     # Download & Process
     # init pool with as many CPUs as available
@@ -70,20 +76,31 @@ def main(args):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw '
-                    'question-answer pair text dataset for Language Models')
-    parser.add_argument('--names', help='names of stackexchanges to download, extract & parse, separated by commas. '
-                                        'If "all", will download, extract & parse *every* stackoverflow site',
-                        default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
-                        type=str)
-    parser.add_argument('--out_format', help='format of out file - if you are processing everything this will need to be '
-                                             'lm_dataformat, as you will run into number of files per directory limits.',
-                        default="zip",
-                        type=str)
-    parser.add_argument('--min_score', help='minimum score of a response in order to be included in the dataset. Default 3.',
-                        type=int, default=3)
-    parser.add_argument('--max_responses', help='maximum number of responses (sorted by score) to include for each question. '
-                                                'Default 3.', type=int, default=3)
+        description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps from xml into a raw question-answer pair text dataset for Language Models')
+    parser.add_argument(
+        '--names',
+        help='names of stackexchanges to download, extract & parse, separated by commas. If "all", will download, extract & parse *every* stackexchange site',
+        default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
+        type=str
+    )
+    parser.add_argument(
+        '--out_format',
+        help='format of the output file - if you are processing everything this will need to be lm_dataformat, as you will otherwise run into per-directory file count limits.',
+        default="zip",
+        type=str
+    )
+    parser.add_argument(
+        '--min_score',
+        help='minimum score of a response in order to be included in the dataset. 
Default 3.',
+        type=int,
+        default=3
+    )
+    parser.add_argument(
+        '--max_responses',
+        help='maximum number of responses (sorted by score) to include for each question. Default 100.',
+        type=int,
+        default=100
+    )

     args = parser.parse_args()
     main(args)
diff --git a/pairer.py b/pairer.py
index 880bee7..b8c7ca0 100644
--- a/pairer.py
+++ b/pairer.py
@@ -1,6 +1,6 @@
 import traceback
 import xml.etree.ElementTree as etree
-from collections import defaultdict
+from collections import defaultdict, OrderedDict
 from bs4 import BeautifulSoup
 from tqdm import tqdm
 from utils import *
@@ -12,7 +12,7 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo
         """Makes a text dataset from StackExchange dumps"""
         self.xml_path = xml_path
         if name is None:
-            self.name = os.path.dirname(xml_path).replace("dumps/", "")
+            self.name = os.path.basename(os.path.dirname(xml_path))
         else:
             self.name = name
         # dict to save questions
@@ -27,6 +27,11 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo
         if out_format in ["lm_dataformat", "zip"]:
             assert archiver is not None
             self.ar = archiver
+        self.sample = True
+        self.num_discarded_answers = 0
+        self.num_discarded_questions = 0
+        self.num_questions = 0
+        self.num_answers = 0

     def main(self):
         """iterates through SE xmls and:
@@ -40,26 +45,34 @@ def main(self):
         """
         os.makedirs(self.out_folder, exist_ok=True)
-        for event, elem in tqdm(etree.iterparse(self.xml_path, events=('end',)), desc="Parsing {} XML file".format(self.name)):
+        for event, elem in tqdm(etree.iterparse(self.xml_path, events=('end',)), desc="Parsing {} XML file".format(self.name), disable=True):
             if elem.tag == "row":
                 try:
                     attribs = defaultdict(lambda: None, elem.attrib)
+                    # checks if PostTypeId=1
                     if is_question(attribs):
                         if has_answers(attribs):
+                            # trim post data to ['Id', 'Body', 'Title', 'Tags', 'AnswerCount', 'AcceptedAnswerId', 'PostTypeId']
+                            # other potentially useful keys: ['CreationDate', 'Score', 'ViewCount', 'OwnerUserId', 'LastActivityDate', 'CommentCount', 'ContentLicense']
                             trim_attribs(attribs, "question")
                             self.questions[attribs["Id"]] = attribs
                         else:
                             # if the question has no answers, discard it
+                            self.num_discarded_questions += 1
                             continue
+                    # checks if PostTypeId=2
                     elif is_answer(attribs):
                         # if is accepted answer, append answer Body to relevant questions "AcceptedAnswer" field
-                        # if the answer's score > min_score
-                        # append the answer to the relevant question's OtherAnswers dict
+                        # if the answer's score > min_score append the answer to the relevant question's OtherAnswers dict
                         self.add_answer(attribs)
                         self.check_complete(attribs)
                     elem.clear()
                 except:
                     traceback.print_exc()
+        print("##### Stats #####")
+        print(f"num_questions={self.num_questions}, num_discarded_questions={self.num_discarded_questions}")
+        print(f"num_answers={self.num_answers}, num_discarded_answers={self.num_discarded_answers}")
+        print("###### End ######")

     def is_above_threshold(self, a_attribs):
         """
@@ -77,11 +90,9 @@ def is_above_threshold(self, a_attribs):
     def add_answer(self, a_attribs):
         """
         Adds answer to its parent question in self.questions if it's either an accepted answer or above self.min_score.
-        If answer is an accepted answer, it gets appended to the AcceptedAnswer field, otherwise it gets appended to
-        OtherAnswers.
+        If answer is an accepted answer, it gets appended to the AcceptedAnswer field, otherwise it gets appended to OtherAnswers.

-        Also increments the question's 'ParsedAnswers' field. 
When ParsedAnswers = AnswerCount, the question is deleted - from memory and saved to a text file. + Also increments the question's 'ParsedAnswers' field. When ParsedAnswers = AnswerCount, the question is deleted from memory and saved to a text file. :param a_attribs: Answer's attribute dict """ @@ -99,6 +110,7 @@ def add_answer(self, a_attribs): else: self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 else: + self.num_discarded_answers += 1 self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 def check_complete(self, a_attribs): @@ -113,7 +125,17 @@ def check_complete(self, a_attribs): if int(parent["ParsedAnswers"]) == int(parent['AnswerCount']): keys_to_del.append(a_attribs["ParentId"]) if parent["Answers"] is not None and len(parent["Answers"]) > 0: - out_name = "{}_{}.txt".format(self.name, parent["Id"].zfill(10)) + out_tags = tags_as_list(parent["Tags"]) + out_name = "{}_{}_{}.txt".format(self.name, parent["Id"].zfill(10),"_".join(out_tags)) + out_dict = dict( + name=out_name, + tags=out_tags, + title=BeautifulSoup(parent["Title"], "html.parser").get_text(), + question=BeautifulSoup(parent["Body"], "html.parser").get_text(), + answers=[], + answers_scores=[], + ) + # fmt: "Q:\n\n{question.title}\n\n{question.body}\n\nA:\n\n{answer.body\n\n for answer.sortby(score)}" out_str = "" out_str += 'Q:\n\n' if parent["Title"] is not None: @@ -122,14 +144,17 @@ def check_complete(self, a_attribs): out_str += '{}\n\n'.format(BeautifulSoup(parent["Body"], "html.parser").get_text()) if parent["Answers"] is not None: key_score_dict = {} - for k, a in parent["Answers"].items(): - key_score_dict[k] = int(a["Score"]) - key_score_dict = {k: v for k, v in sorted(key_score_dict.items(), key=lambda item: item[1], reverse=True)} + for ans_id, ans_attrib in parent["Answers"].items(): + key_score_dict[ans_id] = int(ans_attrib["Score"]) + key_score_dict = OrderedDict((ans_id, score) for ans_id, score in sorted(key_score_dict.items(), key=lambda item: item[1], reverse=True)) count = 0 - for k in key_score_dict: + for ans_id, score in key_score_dict.items(): if count >= self.max_responses: break - out_str += 'A:\n\n{}\n\n'.format(BeautifulSoup(parent["Answers"][k]["Body"], "html.parser").get_text()) + ans_text = BeautifulSoup(parent["Answers"][ans_id]["Body"], "html.parser").get_text() + out_str += 'A:\n\n{}\n\n'.format(ans_text) + out_dict['answers'].append(ans_text) + out_dict['answers_scores'].append(score) count += 1 if self.out_format == "txt": with open("{}/{}".format(self.out_folder, out_name), 'w') as f: @@ -144,10 +169,27 @@ def check_complete(self, a_attribs): self.ar.writestr(out_name, filter_newlines(handle_unicode_errors(out_str))) elif self.out_format == "lm_dataformat": try: - self.ar.add_data(filter_newlines(out_str), meta={ - 'name': out_name}) + self.ar.add_data( + filter_newlines(out_str), + meta=out_dict + ) except: - self.ar.add_data(filter_newlines(handle_unicode_errors(out_str)), meta={ - 'name': out_name}) + self.ar.add_data( + filter_newlines(handle_unicode_errors(out_str)), + meta=out_dict + ) + if self.sample and len(out_dict['answers'])>=3: + with open("{}/samples/sample_{}".format(self.out_folder, out_name), 'w') as f: + try: + f.write(filter_newlines(out_str)) + except: + f.write(filter_newlines(handle_unicode_errors(out_str))) + self.sample = False + self.num_questions += 1 + self.num_answers += len(out_dict['answers']) + self.num_discarded_answers += len(key_score_dict)-len(out_dict['answers']) # cases where number of answers is > max responses + else: + # discard 
questions with no accepted answers + self.num_discarded_questions += 1 for key in keys_to_del: self.questions.pop(key, None) diff --git a/requirements.txt b/requirements.txt index 9d3cf3e..3833202 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ lxml py7zr tqdm lm-dataformat -jsonlines \ No newline at end of file +jsonlines +requests \ No newline at end of file diff --git a/utils.py b/utils.py index a6c3dc1..e1f1a04 100644 --- a/utils.py +++ b/utils.py @@ -61,3 +61,10 @@ def trim_attribs(elem_attribs, attrib_type="question"): return new_dict else: raise Exception('Unrecognized attribute type - please specify either question or answer') + +def tags_as_list(tag_str): + tag_str = tag_str.strip(">").strip("<") + tag_str = tag_str.replace("-","_") + tag_list = tag_str.split("><") + tag_list.sort() + return tag_list \ No newline at end of file From 39606c5281de1334429657b13b2891aa0d0b16c3 Mon Sep 17 00:00:00 2001 From: suriya Date: Mon, 13 Feb 2023 17:19:18 -0800 Subject: [PATCH 3/6] filter by tags --- main.py | 17 ++++++++++++----- pairer.py | 5 +++-- utils.py | 11 +++++++++++ 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index 183d5d8..79bcef3 100644 --- a/main.py +++ b/main.py @@ -10,7 +10,7 @@ import os curr_dir = os.path.dirname(__file__) -def download_and_process_single(name, out_format, min_score, max_responses): +def download_and_process_single(name, out_format, min_score, max_responses, chk_tags): try: name = name.strip().lower() os.makedirs("{}/dumps".format(curr_dir), exist_ok=True) @@ -40,7 +40,7 @@ def download_and_process_single(name, out_format, min_score, max_responses): else: archiver = None - qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses) + qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses, chk_tags=chk_tags) qa.main() if out_format == "lm_dataformat": archiver.commit(name) @@ -58,20 +58,21 @@ def download_and_process_single(name, out_format, min_score, max_responses): def main(args): - names = args.names.split(',') + names = args.names.split(',') + tags = args.tags.split(',') if len(args.tags) else [] if names[0].strip().lower() == "all": s = Stack_Exchange_Downloader("all") names = [] for k in s.sites: names.append(k) print('Removing stackoverflow from the list of sites to process. 
Process it separately.')
-        names.pop(names.index("stackoverflow"))
+        names.pop(names.index("stackoverflow"))
     print('Downloading and processing stackexchange dumps for {}'.format(names))
     # Download & Process
     # init pool with as many CPUs as available
     cpu_no = cpu_count() - 1
     p = Pool(cpu_no)
-    p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses)))
+    p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses)), repeat(tags))


 if __name__ == "__main__":
@@ -101,6 +102,12 @@ def main(args):
         type=int,
         default=100
     )
+    parser.add_argument(
+        '--tags',
+        help='list of tags to include.',
+        type=str,
+        default=""
+    )

     args = parser.parse_args()
     main(args)
diff --git a/pairer.py b/pairer.py
index b8c7ca0..74da416 100644
--- a/pairer.py
+++ b/pairer.py
@@ -8,7 +8,7 @@ class QA_Pairer():

-    def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_responses=3, out_format="txt", archiver=None):
+    def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_responses=3, out_format="txt", archiver=None, chk_tags=""):
         """Makes a text dataset from StackExchange dumps"""
         self.xml_path = xml_path
         if name is None:
             self.name = os.path.basename(os.path.dirname(xml_path))
@@ -27,6 +27,7 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo
         if out_format in ["lm_dataformat", "zip"]:
             assert archiver is not None
             self.ar = archiver
+        self.chk_tags = chk_tags
         self.sample = True
         self.num_discarded_answers = 0
         self.num_discarded_questions = 0
@@ -52,7 +53,7 @@ def main(self):
                     attribs = defaultdict(lambda: None, elem.attrib)
                     # checks if PostTypeId=1
                     if is_question(attribs):
-                        if has_answers(attribs):
+                        if has_answers(attribs) and match_tags_or(attribs, self.chk_tags):
                             # trim post data to ['Id', 'Body', 'Title', 'Tags', 'AnswerCount', 'AcceptedAnswerId', 'PostTypeId']
                             # other potentially useful keys: ['CreationDate', 'Score', 'ViewCount', 'OwnerUserId', 'LastActivityDate', 'CommentCount', 'ContentLicense']
                             trim_attribs(attribs, "question")
                             self.questions[attribs["Id"]] = attribs
diff --git a/utils.py b/utils.py
index e1f1a04..6380611 100644
--- a/utils.py
+++ b/utils.py
@@ -44,6 +44,17 @@ def has_answers(elem_attribs):
             return True
     return False

+def match_tags_or(elem_attribs, chk_tags):
+    assert is_question(elem_attribs), "Must be a question to match tags"
+    if not chk_tags:
+        return True
+    if elem_attribs["Tags"] is not None:
+        elem_tags = tags_as_list(elem_attribs["Tags"])
+        for tag in chk_tags:
+            if tag in elem_tags:
+                return True
+    return False
+
 def trim_attribs(elem_attribs, attrib_type="question"):
     """deletes non-useful data from attribs dict for questions / answers, returns remaining"""

From d9fffd8825e3c4f6d3b6d184e7b697296fd23bdd Mon Sep 17 00:00:00 2001
From: suriya
Date: Mon, 13 Feb 2023 23:57:02 -0800
Subject: [PATCH 4/6] minor edits to tags filtering

---
 main.py   | 5 +++--
 pairer.py | 6 ++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index 79bcef3..be1f53f 100644
--- a/main.py
+++ b/main.py
@@ -31,6 +31,7 @@ def download_and_process_single(name, out_format, min_score, max_responses, chk_tags):
         s.extract()

         out_folder = "{}/out".format(curr_dir)
+        # out_folder = "{}/../../../suriyagwu/stackexchange/stackoverflow_python".format(curr_dir)
         os.makedirs(out_folder, exist_ok=True)
         os.makedirs("{}/samples".format(out_folder), exist_ok=True)
         if out_format == "lm_dataformat":
             archiver = Archive(out_folder)
         elif out_format == "zip":
             archiver 
= zipfile.ZipFile('{}/{}.zip'.format(out_folder, name), 'a') else: archiver = None - + name = name+"_"+"_".join(chk_tags) if len(chk_tags) else name qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses, chk_tags=chk_tags) qa.main() if out_format == "lm_dataformat": @@ -72,7 +73,7 @@ def main(args): # init pool with as many CPUs as available cpu_no = cpu_count() - 1 p = Pool(cpu_no) - p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses)), repeat(tags)) + p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses), repeat(tags))) if __name__ == "__main__": diff --git a/pairer.py b/pairer.py index 74da416..e0b6758 100644 --- a/pairer.py +++ b/pairer.py @@ -46,10 +46,12 @@ def main(self): """ os.makedirs(self.out_folder, exist_ok=True) + i=0 for event, elem in tqdm(etree.iterparse(self.xml_path, events=('end',)), desc="Parsing {} XML file".format(self.name), disable=True): if elem.tag == "row": try: attribs = defaultdict(lambda: None, elem.attrib) + i=i+1 # checks if PostTypeId=1 if is_question(attribs): if has_answers(attribs) and match_tags_or(attribs, self.chk_tags): @@ -98,7 +100,7 @@ def add_answer(self, a_attribs): :param a_attribs: Answer's attribute dict """ assert is_answer(a_attribs), "Must be an answer to add to parent" - if a_attribs is not None and self.questions[a_attribs["ParentId"]] is not None: + if a_attribs is not None and self.questions.get(a_attribs["ParentId"], None) is not None: if is_accepted_answer(a_attribs, self.questions[a_attribs["ParentId"]]): self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer") self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 @@ -120,7 +122,7 @@ def check_complete(self, a_attribs): removes from dict and prints to file. 
""" keys_to_del = [] - parent = self.questions[a_attribs["ParentId"]] + parent = self.questions.get(a_attribs["ParentId"], None) if a_attribs is not None and parent is not None: if parent["AnswerCount"] is not None and parent["ParsedAnswers"] is not None: if int(parent["ParsedAnswers"]) == int(parent['AnswerCount']): From 898c25563a652810a5a8bdf7f90db81cee28a1f0 Mon Sep 17 00:00:00 2001 From: suriya Date: Tue, 14 Feb 2023 12:59:59 -0800 Subject: [PATCH 5/6] refactor filter logic and better stats --- main.py | 16 ++++------------ main_filter.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ pairer.py | 49 ++++++++++++++++++++++++++++++++++++++----------- utils.py | 3 +++ 4 files changed, 90 insertions(+), 23 deletions(-) create mode 100644 main_filter.py diff --git a/main.py b/main.py index be1f53f..2d9d4f5 100644 --- a/main.py +++ b/main.py @@ -10,7 +10,7 @@ import os curr_dir = os.path.dirname(__file__) -def download_and_process_single(name, out_format, min_score, max_responses, chk_tags): +def download_and_process_single(name, out_format, min_score, max_responses): try: name = name.strip().lower() os.makedirs("{}/dumps".format(curr_dir), exist_ok=True) @@ -31,7 +31,7 @@ def download_and_process_single(name, out_format, min_score, max_responses, chk_ s.extract() out_folder = "{}/out".format(curr_dir) - # out_folder = "{}/../../../suriyagwu/stackexchange/stackoverflow_python".format(curr_dir) + # out_folder = "{}/../../../suriyagwu/stackexchange/stackoverflow".format(curr_dir) os.makedirs(out_folder, exist_ok=True) os.makedirs("{}/samples".format(out_folder), exist_ok=True) if out_format == "lm_dataformat": @@ -40,8 +40,7 @@ def download_and_process_single(name, out_format, min_score, max_responses, chk_ archiver = zipfile.ZipFile('{}/{}.zip'.format(out_folder, name), 'a') else: archiver = None - name = name+"_"+"_".join(chk_tags) if len(chk_tags) else name - qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses, chk_tags=chk_tags) + qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses) qa.main() if out_format == "lm_dataformat": archiver.commit(name) @@ -60,7 +59,6 @@ def download_and_process_single(name, out_format, min_score, max_responses, chk_ def main(args): names = args.names.split(',') - tags = args.tags.split(',') if len(args.tags) else [] if names[0].strip().lower() == "all": s = Stack_Exchange_Downloader("all") names = [] @@ -73,7 +71,7 @@ def main(args): # init pool with as many CPUs as available cpu_no = cpu_count() - 1 p = Pool(cpu_no) - p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses), repeat(tags))) + p.starmap(download_and_process_single, zip(names, repeat(args.out_format), repeat(args.min_score), repeat(args.max_responses))) if __name__ == "__main__": @@ -103,12 +101,6 @@ def main(args): type=int, default=100 ) - parser.add_argument( - '--tags', - help='list of tags to include.', - type=str, - default="" - ) args = parser.parse_args() main(args) diff --git a/main_filter.py b/main_filter.py new file mode 100644 index 0000000..0c9108f --- /dev/null +++ b/main_filter.py @@ -0,0 +1,45 @@ +''' +Filter the stackoverflow dataset generated by main.py in lm_dataformat +Edit filter_logic to implement filtering of any meta dict sample +''' +from lm_dataformat import Reader, Archive +import os +import glob + 
+
+curr_dir = os.path.dirname(__file__)
+out_folder = "{}/../../../suriyagwu/stackexchange/stackoverflow".format(curr_dir)
+full_data_file = "{}/data_*stackoverflow.jsonl.zst".format(out_folder)
+
+class FilterMeta:
+    # currently implements an OR over the filter tags
+    def __init__(self):
+        self.filter_tags = ['python']
+        self.file_suffix = "_"+"_".join(self.filter_tags)
+
+    def filter_logic(self, meta):
+        for tag in self.filter_tags:
+            if tag in meta['tags']:
+                return True
+        return False
+
+new_archive = Archive(out_folder)
+filter_meta = FilterMeta()
+num_accepted = 0
+num_rejected = 0
+for filename in glob.glob(full_data_file):
+    # create a reader object for the file
+    print("Filtering {}".format(filename))
+    rdr = Reader(filename)
+    # iterate over the samples in the file
+    for doc, meta in rdr.stream_data(get_meta=True):
+        if filter_meta.filter_logic(meta):
+            # add the sample to the new archive
+            new_archive.add_data(doc, meta)
+            num_accepted += 1
+        else:
+            num_rejected += 1
+
+filtered_data_file_name = "stack_overflow"+filter_meta.file_suffix
+new_archive.commit(filtered_data_file_name)
+print("##### Stats #####")
+print(f"num_accepted: {num_accepted}, num_rejected: {num_rejected}")
diff --git a/pairer.py b/pairer.py
index e0b6758..2598630 100644
--- a/pairer.py
+++ b/pairer.py
@@ -8,7 +8,7 @@ class QA_Pairer():

-    def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_responses=3, out_format="txt", archiver=None, chk_tags=""):
+    def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_responses=3, out_format="txt", archiver=None):
         """Makes a text dataset from StackExchange dumps"""
         self.xml_path = xml_path
         if name is None:
             self.name = os.path.basename(os.path.dirname(xml_path))
@@ -27,8 +27,9 @@ def __init__(self, xml_path, name=None, out_folder="out", min_score=3, max_respo
         if out_format in ["lm_dataformat", "zip"]:
             assert archiver is not None
             self.ar = archiver
-        self.chk_tags = chk_tags
         self.sample = True
+        self.num_posts = 0
+        self.num_nonQA_posts = 0
         self.num_discarded_answers = 0
         self.num_discarded_questions = 0
         self.num_questions = 0
         self.num_answers = 0
@@ -45,16 +46,15 @@ def main(self):
         """
-        os.makedirs(self.out_folder, exist_ok=True) 
-        i=0
+        os.makedirs(self.out_folder, exist_ok=True) 
         for event, elem in tqdm(etree.iterparse(self.xml_path, events=('end',)), desc="Parsing {} XML file".format(self.name), disable=True):
             if elem.tag == "row":
                 try:
                     attribs = defaultdict(lambda: None, elem.attrib)
-                    i=i+1
+                    self.num_posts += 1
                     # checks if PostTypeId=1
                     if is_question(attribs):
-                        if has_answers(attribs) and match_tags_or(attribs, self.chk_tags):
+                        if has_answers(attribs):
                             # trim post data to ['Id', 'Body', 'Title', 'Tags', 'AnswerCount', 'AcceptedAnswerId', 'PostTypeId']
                             # other potentially useful keys: ['CreationDate', 'Score', 'ViewCount', 'OwnerUserId', 'LastActivityDate', 'CommentCount', 'ContentLicense']
                             trim_attribs(attribs, "question")
                             self.questions[attribs["Id"]] = attribs
@@ -69,12 +69,16 @@ def main(self):
                     # checks if PostTypeId=2
                     elif is_answer(attribs):
                         # if is accepted answer, append answer Body to relevant questions "AcceptedAnswer" field
                         # if the answer's score > min_score append the answer to the relevant question's OtherAnswers dict
                         self.add_answer(attribs)
                         self.check_complete(attribs)
+                    else:
+                        self.num_nonQA_posts += 1
                     elem.clear()
                 except:
                     traceback.print_exc()
+
         print("##### Stats #####")
         print(f"num_questions={self.num_questions}, num_discarded_questions={self.num_discarded_questions}")
         print(f"num_answers={self.num_answers}, num_discarded_answers={self.num_discarded_answers}")
+        print(f"num_posts={self.num_posts}, num_nonQA_posts={self.num_nonQA_posts}, unprocessed_questions={len(self.questions.items())}")
         print("###### End ######")

     def is_above_threshold(self, 
a_attribs): @@ -100,7 +104,7 @@ def add_answer(self, a_attribs): :param a_attribs: Answer's attribute dict """ assert is_answer(a_attribs), "Must be an answer to add to parent" - if a_attribs is not None and self.questions.get(a_attribs["ParentId"], None) is not None: + if self.questions.get(a_attribs["ParentId"], None) is not None: if is_accepted_answer(a_attribs, self.questions[a_attribs["ParentId"]]): self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer") self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 @@ -112,9 +116,15 @@ def add_answer(self, a_attribs): self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 else: self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 - else: + else: self.num_discarded_answers += 1 + # print("Discarded answer with score {}".format(a_attribs["Score"]), self.num_discarded_answers) + self.questions[a_attribs["ParentId"]]["NonAnswers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer") self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 + else: + parentid = a_attribs["ParentId"] + self.num_discarded_answers += 1 + # print(f"ParentId {parentid} not found", self.num_discarded_answers) def check_complete(self, a_attribs): """ @@ -124,10 +134,12 @@ def check_complete(self, a_attribs): keys_to_del = [] parent = self.questions.get(a_attribs["ParentId"], None) if a_attribs is not None and parent is not None: - if parent["AnswerCount"] is not None and parent["ParsedAnswers"] is not None: + if parent["AnswerCount"] and parent["ParsedAnswers"]: if int(parent["ParsedAnswers"]) == int(parent['AnswerCount']): keys_to_del.append(a_attribs["ParentId"]) - if parent["Answers"] is not None and len(parent["Answers"]) > 0: + ## Filter change: still use quesions with no accepted answers + # if parent["Answers"] is not None and len(parent["Answers"]) > 0: + if 1: out_tags = tags_as_list(parent["Tags"]) out_name = "{}_{}_{}.txt".format(self.name, parent["Id"].zfill(10),"_".join(out_tags)) out_dict = dict( @@ -137,6 +149,8 @@ def check_complete(self, a_attribs): question=BeautifulSoup(parent["Body"], "html.parser").get_text(), answers=[], answers_scores=[], + non_answers=[], + non_answers_scores=[] ) # fmt: "Q:\n\n{question.title}\n\n{question.body}\n\nA:\n\n{answer.body\n\n for answer.sortby(score)}" out_str = "" @@ -159,6 +173,17 @@ def check_complete(self, a_attribs): out_dict['answers'].append(ans_text) out_dict['answers_scores'].append(score) count += 1 + + if parent["NonAnswers"] is not None: + nona_key_score_dict = {} + for ans_id, ans_attrib in parent["NonAnswers"].items(): + nona_key_score_dict[ans_id] = int(ans_attrib["Score"]) + nona_key_score_dict = OrderedDict((ans_id, score) for ans_id, score in sorted(nona_key_score_dict.items(), key=lambda item: item[1], reverse=True)) + for ans_id, score in nona_key_score_dict.items(): + ans_text = BeautifulSoup(parent["NonAnswers"][ans_id]["Body"], "html.parser").get_text() + out_dict['non_answers'].append(ans_text) + out_dict['non_answers_scores'].append(score) + if self.out_format == "txt": with open("{}/{}".format(self.out_folder, out_name), 'w') as f: try: @@ -190,7 +215,9 @@ def check_complete(self, a_attribs): self.sample = False self.num_questions += 1 self.num_answers += len(out_dict['answers']) - self.num_discarded_answers += len(key_score_dict)-len(out_dict['answers']) # cases where number of answers is > max responses + if len(key_score_dict)-len(out_dict['answers'])>0: + self.num_discarded_answers += 
len(key_score_dict)-len(out_dict['answers']) # cases where number of answers is > max responses + print(f"Discarding {len(key_score_dict)-len(out_dict['answers'])} of {len(key_score_dict)} answers", self.num_discarded_answers) else: # discard questions with no accepted answers self.num_discarded_questions += 1 diff --git a/utils.py b/utils.py index 6380611..1587aae 100644 --- a/utils.py +++ b/utils.py @@ -64,6 +64,9 @@ def trim_attribs(elem_attribs, attrib_type="question"): [elem_attribs.pop(x, None) for x in to_delete] elem_attribs["ParsedAnswers"] = 0 elem_attribs["Answers"] = {} + elem_attribs["NonAnswers"] = {} + if 'AnswerCount' not in elem_attribs.keys(): + elem_attribs['AnswerCount']=-1 elif attrib_type == "answer": to_keep = ['Id', 'Body', 'Score'] new_dict = {} From 875044f510049e2ca227a0dc71243438ee569180 Mon Sep 17 00:00:00 2001 From: suriya Date: Fri, 17 Feb 2023 18:40:31 -0800 Subject: [PATCH 6/6] minor edits --- main.py | 8 +++++++- main_filter.py | 2 +- pairer.py | 14 ++++++-------- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/main.py b/main.py index 2d9d4f5..658f44b 100644 --- a/main.py +++ b/main.py @@ -8,6 +8,7 @@ from lm_dataformat import Archive import zipfile import os +import json curr_dir = os.path.dirname(__file__) def download_and_process_single(name, out_format, min_score, max_responses): @@ -31,9 +32,10 @@ def download_and_process_single(name, out_format, min_score, max_responses): s.extract() out_folder = "{}/out".format(curr_dir) - # out_folder = "{}/../../../suriyagwu/stackexchange/stackoverflow".format(curr_dir) + # out_folder = "{}/../../../suriyagwu/stackexchange/all".format(curr_dir) os.makedirs(out_folder, exist_ok=True) os.makedirs("{}/samples".format(out_folder), exist_ok=True) + os.makedirs("{}/misc".format(out_folder), exist_ok=True) if out_format == "lm_dataformat": archiver = Archive(out_folder) elif out_format == "zip": @@ -46,6 +48,10 @@ def download_and_process_single(name, out_format, min_score, max_responses): archiver.commit(name) elif out_format == "zip": archiver.close() + + # save qa.questions dictionary data to a file + json.dump(qa.questions, open("{}/misc/{}_unprocessed_questions.json".format(out_folder, name), "w"), indent=4) + # try: # os.remove(path_to_7z) # except FileNotFoundError: diff --git a/main_filter.py b/main_filter.py index 0c9108f..d4f7e0e 100644 --- a/main_filter.py +++ b/main_filter.py @@ -39,7 +39,7 @@ def filter_logic(self,meta): else: num_rejected += 1 -filtered_data_file_name = "stack_overflow"+filter_meta.file_suffix +filtered_data_file_name = "stackoverflow"+filter_meta.file_suffix new_archive.commit(filtered_data_file_name) print("##### Stats #####") print(f"num_accepted: {num_accepted}, num_rejected: {num_rejected}") diff --git a/pairer.py b/pairer.py index 2598630..227e501 100644 --- a/pairer.py +++ b/pairer.py @@ -78,7 +78,10 @@ def main(self): print("##### Stats #####") print(f"num_questions={self.num_questions}, num_discarded_questions={self.num_discarded_questions}") print(f"num_answers={self.num_answers}, num_discarded_answers={self.num_discarded_answers}") - print(f"num_posts={self.num_posts}, num_nonQA_posts={self.num_nonQA_posts}, unprocessed_questions={len(self.questions.items())}") + print(f"num_posts={self.num_posts}, num_nonQA_posts={self.num_nonQA_posts}") + unprocessed_questions = len(self.questions.items()) + unprocessed_answers = sum([len(q_att['Answers'].items()) for q, q_att in self.questions.items()]) + print(f"unprocessed_questions={unprocessed_questions}, 
unprocessed_answers={unprocessed_answers}") print("###### End ######") def is_above_threshold(self, a_attribs): @@ -109,13 +112,8 @@ def add_answer(self, a_attribs): self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer") self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 elif self.is_above_threshold(a_attribs): - if a_attribs["Id"] is not None: - parent = self.questions[a_attribs["ParentId"]] - if parent is not None: - self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer") - self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 - else: - self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 + self.questions[a_attribs["ParentId"]]["Answers"][a_attribs["Id"]] = trim_attribs(a_attribs, "answer") + self.questions[a_attribs["ParentId"]]["ParsedAnswers"] += 1 else: self.num_discarded_answers += 1 # print("Discarded answer with score {}".format(a_attribs["Score"]), self.num_discarded_answers)