forked from cbogart/githubscraper
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_membership.py
More file actions
87 lines (79 loc) · 3.03 KB
/
get_membership.py
File metadata and controls
87 lines (79 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import sys
import re
import pdb
from pymongo import MongoClient
from dateutil.parser import parse
from datetime import timedelta
import datetime
from collections import defaultdict
import glob
import csv
from config import Config
from mapper import get_usermap, usermap
from util import forceString
# Load the run configuration named by the first command-line argument and
# obtain the database handle that the event queries further down go through.
confg = Config(sys.argv[1])
r_db = confg.proc_db()

# Far-future sentinel used as the "nothing recorded yet" default below, so the
# first real event time replaces it (assuming event times precede year 2970).
# A membership date still equal to it is written out as an empty CSV field.
end_of_days = parse("2970-01-01")

# Earliest-event tables, all keyed by (owner, project name, actor).
participation = defaultdict(lambda: end_of_days)  # earliest event time of any kind
membership = defaultdict(lambda: end_of_days)     # earliest membership-granting event time
first_part = defaultdict(str)                     # rectype of the earliest event
first_memb = defaultdict(str)                     # "owner" or the granting rectype

# "owner/name" strings of every project that produced at least one record.
projects = set()

# Distinct actor names per project, plus an aggregate under the key "all";
# at_count holds only the names that contain "@".
user_count = defaultdict(set)
at_count = defaultdict(set)
def register(r):
    """Fold one event record into the module-level participation tables.

    r is a mapping providing the keys "actor", "rectype", "project_owner",
    "project_name" and "time". Records whose actor is None are ignored.

    Nothing is returned; the function only updates participation,
    first_part, membership, first_memb, projects, user_count and at_count.
    """
    who = r["actor"]
    if who is None:
        return
    # An actor containing "@" is swapped for its usermap entry when one exists.
    if "@" in who and who in usermap:
        who = usermap[who]

    kind = r["rectype"]
    owner = r["project_owner"]
    repo = r["project_name"]
    when = r["time"]
    full_name = owner + "/" + repo
    key = (owner, repo, who)

    # Adding to a set is already a no-op for existing members.
    projects.add(full_name)

    # Earliest event of any kind for this actor on this project.
    if when < participation[key]:
        participation[key] = when
        first_part[key] = kind

    # Earliest membership evidence: being the project owner counts for any
    # record type; otherwise only merge/commit record types count.
    if when < membership[key]:
        if who == owner:
            membership[key] = when
            first_memb[key] = "owner"
        elif kind in ("pull_request_merged", "commit_messages", "pull_request_commit"):
            membership[key] = when
            first_memb[key] = kind

    # Tally the actor under the project and under the "all" aggregate.
    label = forceString(who)
    if "@" in label:
        at_count[full_name].add(label)
        at_count["all"].add(label)
    user_count[full_name].add(label)
    user_count["all"].add(label)
# Projects to scan: the configured sample set, lower-cased, each expected to
# be an "owner/name" string (it is split on "/" below).
# NOTE(review): the lower-cased parts are used verbatim in the database
# queries — confirm the stored owner/name values are lower-case too.
interesting_projects = [k.lower() for k in confg.get_sample_set_project_names()]

# Alternative project source, kept disabled:
#interesting_projects = set()
#burstsf = csv.DictReader(open("micro_burst_congruence.csv","r"))
#for burstday in burstsf:
#    interesting_projects.add((burstday["project_owner"] , burstday["project_name"]))

# Runs before any register() call; register() consults usermap.
# NOTE(review): presumably this fills usermap in place — confirm in mapper.
get_usermap(confg)

for (ix, pr) in enumerate(interesting_projects):
    p = pr.split("/")
    if ix % 100 == 0:
        # Progress line every 100 projects. Built as one string so the output
        # is the same whether print is a statement (Python 2) or a function.
        print("%s of %s : %s" % (ix, len(interesting_projects), p))
    # Both collections are filtered by the same owner/name pair; issue events
    # are registered before project events.
    query = {"project_owner": p[0], "project_name": p[1]}
    for events in (r_db.issue_events, r_db.project_events):
        for r in events.find(query):
            register(r)
# Write participation.csv: one row per (owner, project, actor) key in sorted
# order, giving the first participation date/action and the first membership
# date/action. The membership date is left empty when none was recorded.
# The with-block closes (and therefore flushes) the file deterministically.
with open(confg["data_dir"] + "/participation.csv", "w") as participation_file:
    csvf = csv.writer(participation_file)
    csvf.writerow(["owner","project","actor","first_participation_date","first_participation_action","first_member_date","first_member_action"])
    for k in sorted(participation):
        try:
            owner, project, actor = k
            joined = membership[k]
            row = [owner, project, forceString(actor), participation[k].isoformat(), first_part[k],
                   joined.isoformat() if joined != end_of_days else "",
                   first_memb[k]]
            csvf.writerow(row)
        except Exception as e:
            # Best effort: report the row and carry on. repr() is used so the
            # report itself cannot fail on whatever value broke the row
            # (the conversion of actor may be the very thing that raised).
            print("Problem: skipping %r: %r" % (k, e))
# Write atcount2.csv: for every project (and the "all" aggregate) the number
# of distinct actors seen and how many of those names contain "@".
# The with-block closes (and therefore flushes) the file deterministically.
with open(confg["data_dir"] + "/atcount2.csv", "w") as atcount_file:
    csvf2 = csv.writer(atcount_file)
    csvf2.writerow(["project","Users","Users with @ in name"])
    for p in user_count:
        csvf2.writerow([p, len(user_count[p]), len(at_count[p])])