stackovaflo/parse_stack_overflow.py at master · mattratt/stackovaflo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import sys
import logging
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np


# Emit DEBUG-and-above log records, each prefixed with a timestamp and its level name.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.DEBUG)
# Allow pandas to use up to 240 characters per line when printing frames.
pd.set_option('display.width', 240)


# Row attributes extracted for every post, whether it is a question or an answer.
POST_FIELDS = ['Id', 'CreationDate', 'OwnerUserId', 'Score', 'CommentCount']
# Questions: the common attributes followed by the question-only ones.
QUESTION_FIELDS = POST_FIELDS + [
    'ViewCount',
    'AnswerCount',
    'FavoriteCount',
    'AcceptedAnswerId',
]
# Answers: the common attributes followed by the id of the parent post.
ANSWER_FIELDS = POST_FIELDS + ['ParentId']
# Post attributes deliberately not extracted: PostTypeId, Body, Title, Tags, LastEditorUserId,
# LastEditorDisplayName, LastEditDate, LastActivityDate

# Row attributes extracted for every user.
# TODO: decide which id field to use -- Id or AccountId?
USER_FIELDS = [
    'Id',
    'Reputation',
    'CreationDate',
    'Location',
    'Views',
    'UpVotes',
    'DownVotes',
    'Age',
    'AccountId',
]
# User attributes deliberately not extracted: DisplayName, LastAccessDate, WebsiteUrl, AboutMe


def parse_posts(infile):
    """Parse a posts dump into a question table and an answer table.

    Each line of ``infile`` is parsed on its own as an XML element.  Lines that
    are not well-formed XML by themselves are logged and skipped, as are
    elements whose tag is not ``row``.

    Rows with ``PostTypeId == '1'`` become questions and rows with
    ``PostTypeId == '2'`` become answers.  Rows of any other post type (the
    Stack Exchange dump also stores e.g. tag wikis in the posts table) are
    skipped instead of being counted as answers.

    Args:
        infile: iterable of text lines, e.g. an open file.

    Returns:
        A ``(question_df, answer_df)`` tuple of DataFrames indexed by the
        integer post ``Id``.  ``question_df`` has the ``QUESTION_FIELDS``
        columns plus ``Length`` (number of characters in ``Body``, or None
        when the row has no body).  ``answer_df`` has the ``ANSWER_FIELDS``
        columns plus ``Length`` and ``Accepted`` (whether some question in
        the file names this answer as its ``AcceptedAnswerId``).
    """
    questions = []
    answers = []
    accepted_answer_ids = set()
    for i, line in enumerate(infile):
        if i % 10000 == 0:
            logging.debug("\t{}\t{} questions,\t{} answers".format(i, len(questions), len(answers)))

        try:
            rec = ET.fromstring(line)
        except ET.ParseError:
            logging.debug("can't parse line {}: {}".format(i, line.strip()))
            continue

        if rec.tag != 'row':
            continue

        attrib = rec.attrib

        # this one's special since it's calculated
        body = attrib.get('Body')
        body_len = len(body) if body is not None else None

        post_type = attrib.get('PostTypeId')
        if post_type == '1':  # question
            accepted_id = attrib.get('AcceptedAnswerId')
            if accepted_id is not None:
                accepted_answer_ids.add(accepted_id)
            questions.append(tuple([attrib.get(attr) for attr in QUESTION_FIELDS] + [body_len]))
        elif post_type == '2':  # answer
            # The accepted flag is filled in after the loop, once every question has been seen.
            answers.append(tuple([attrib.get(attr) for attr in ANSWER_FIELDS] + [body_len]))
        # any other post type is neither a question nor an answer: skip it

    # Resolve the accepted flag only now, so the result does not depend on whether an answer
    # appears before or after its question in the file.
    id_pos = ANSWER_FIELDS.index('Id')
    answers = [vals + (vals[id_pos] in accepted_answer_ids,) for vals in answers]

    # NOTE(review): dtype=np.int64 is requested for whole frames that also hold date strings
    # and missing values; this relies on pandas leaving such columns uncast -- confirm against
    # the pandas version in use.
    logging.info("creating DataFrame for {} questions".format(len(questions)))
    index_vals = get_index_vals(questions, QUESTION_FIELDS, 'Id', int)
    question_df = pd.DataFrame(questions, index=index_vals, columns=QUESTION_FIELDS+['Length'],
                               dtype=np.int64)

    logging.info("creating DataFrame for {} posts".format(len(answers)))
    index_vals = get_index_vals(answers, ANSWER_FIELDS, 'Id', int)
    answer_df = pd.DataFrame(answers, index=index_vals, columns=ANSWER_FIELDS+['Length', 'Accepted'],
                             dtype=np.int64)

    return question_df, answer_df


def get_index_vals(tups, col_names, index_col_name, index_col_type=str):
    """Extract one column from a list of row tuples, converting each value.

    Args:
        tups: sequence of row tuples whose fields are ordered like ``col_names``.
        col_names: list of column names describing the tuple layout.
        index_col_name: name of the column to extract; must be in ``col_names``.
        index_col_type: callable applied to every extracted value (default ``str``).

    Returns:
        A list with one converted value per row, in row order.
    """
    position = col_names.index(index_col_name)
    index_vals = []
    for row in tups:
        index_vals.append(index_col_type(row[position]))
    return index_vals


def parse_users(infile, selects=None):
    """Parse a users dump into a DataFrame indexed by integer user ``Id``.

    Each line of ``infile`` is parsed on its own as an XML element.  Lines that
    are not well-formed XML by themselves are logged and skipped, as are
    elements whose tag is not ``row``.

    Args:
        infile: iterable of text lines, e.g. an open file.
        selects: optional iterable of user ids to keep.  Membership is tested
            against the raw ``Id`` attribute string of each row; rows whose id
            is not in it are counted as rejects and dropped.  ``None`` keeps
            every user.

    Returns:
        A DataFrame with the ``USER_FIELDS`` columns, one row per kept user.
    """
    wanted_ids = None
    if selects is not None:
        wanted_ids = set(selects)
        logging.debug("selecting from {} unique users".format(len(wanted_ids)))

    rows = []
    n_rejected = 0
    for line_no, raw in enumerate(infile):
        if line_no % 10000 == 0:
            logging.debug("\t{}\t{} users,\t{} rejects".format(line_no, len(rows), n_rejected))

        try:
            elem = ET.fromstring(raw)
        except ET.ParseError:
            logging.debug("can't parse line {}: {}".format(line_no, raw.strip()))
            continue

        if elem.tag == 'row':
            attrs = elem.attrib
            if wanted_ids is None or attrs['Id'] in wanted_ids:
                rows.append(tuple([attrs.get(field) for field in USER_FIELDS]))
            else:
                n_rejected += 1

    logging.info("creating DataFrame for {} users".format(len(rows)))
    user_ids = get_index_vals(rows, USER_FIELDS, 'Id', int)
    user_df = pd.DataFrame(rows, index=user_ids, columns=USER_FIELDS, dtype=np.int64)
    return user_df


####################################

if __name__ == '__main__':
    # Script entry point: parse the posts and users files named on the command line, attach
    # per-question answer averages, then build the user/question/answer join tables and log
    # their sizes.

    USAGE = sys.argv[0] + " postfile userfile"  # e.g. parse_stack_overflow.py Posts.xml Users.xml
    if len(sys.argv) < 3:
        sys.exit(" usage: " + USAGE)

    # A single parenthesised argument keeps each print valid both as a Python 2 print
    # statement and as a Python 3 print() call.
    with open(sys.argv[1], 'r') as infile:
        quest_df, ans_df = parse_posts(infile)
    print("{}\n{}".format(quest_df.head(), quest_df.dtypes))
    print("{}\n{}".format(ans_df.head(), ans_df.dtypes))

    with open(sys.argv[2], 'r') as infile:
        user_df = parse_users(infile)
    print("{}\n{}".format(user_df.head(), user_df.dtypes))

    # Some columns come back from the parsers as object dtype (presumably because rows with
    # missing values defeat the int64 cast).  Make every key used for grouping or joining
    # numeric up front -- including the answer-side keys -- so that they can match the integer
    # Id indexes of the question and user tables.
    quest_df['OwnerUserId'] = pd.to_numeric(quest_df['OwnerUserId'], downcast='integer')
    ans_df['OwnerUserId'] = pd.to_numeric(ans_df['OwnerUserId'], downcast='integer')
    ans_df['ParentId'] = pd.to_numeric(ans_df['ParentId'], downcast='integer')
    user_df['Age'] = pd.to_numeric(user_df['Age'], downcast='integer')

    # add answer aggregs to questions, then join user table
    logging.info("aggreg answer cols")
    # Select-then-mean plus an explicit rename yields the same mean_* columns as the
    # nested-dict form of agg(), which newer pandas releases no longer accept.
    mean_names = {'Score': 'mean_Score',
                  'CommentCount': 'mean_CommCount',
                  'Length': 'mean_Length'}
    answer_aggregs_df = ans_df.groupby('ParentId')[['Score', 'CommentCount', 'Length']].mean()
    answer_aggregs_df = answer_aggregs_df.rename(columns=mean_names)

    logging.info("joining answer cols to questions")
    # Index-to-index join: question Id against the ParentId the answers were grouped by.
    quest_df = quest_df.join(answer_aggregs_df, rsuffix='_answer')

    logging.info("joining users and questions")
    user_question_df = quest_df.join(user_df, on='OwnerUserId', rsuffix='_user')
    logging.debug("joined table has {} rows".format(len(user_question_df)))

    logging.info("joining users and answers")
    user_answer_df = ans_df.join(user_df, on='OwnerUserId', rsuffix='_user')
    logging.debug("joined table has {} rows".format(len(user_answer_df)))

    logging.info("joining answers and questions")
    question_answer_df = ans_df.join(quest_df, on='ParentId', rsuffix='_quest')
    logging.debug("joined table has {} rows".format(len(question_answer_df)))