forked from cbogart/githubscraper
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmapper.py
More file actions
46 lines (40 loc) · 1.38 KB
/
mapper.py
File metadata and controls
46 lines (40 loc) · 1.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from collections import defaultdict
import csv
# Module-level alias table shared by the functions in this file.
# add_alias() fills it as {value containing "@": set of the paired aliases};
# get_usermap() later overwrites each set with one chosen alias string, or
# deletes the key when no single alias can be chosen.
usermap = defaultdict(set)
def add_alias(mapping, a1, a2):
    """Record one alias pair in *mapping*, keyed by the value containing "@".

    Exactly one of *a1* / *a2* must contain "@" for the pair to be stored:
    that value becomes the key and the other one is added to the key's set.
    Pairs where both or neither value contains "@" are ignored, as are pairs
    with a missing (``None``) or empty value.

    Args:
        mapping: dict-like object whose values are sets; a missing key must
            be creatable on access (e.g. ``defaultdict(set)``).
        a1: first alias string (or ``None``).
        a2: second alias string (or ``None``).
    """
    # csv.DictReader yields None for columns missing from a short row, and
    # `"@" in None` raises TypeError.  An empty string is never a usable
    # identity either, so skip both instead of storing junk.
    if not a1 or not a2:
        return

    a1_has_at = "@" in a1
    a2_has_at = "@" in a2
    if a1_has_at == a2_has_at:
        # Both or neither contain "@": nothing to map.
        return

    if a1_has_at:
        mapping[a1].add(a2)
    else:
        mapping[a2].add(a1)
def find_close_enough(emailfront, candidates):
    """Return the first candidate equal to *emailfront* after normalisation.

    *emailfront* is normalised by removing every "." and space and
    lower-casing; each candidate is only lower-cased.  The first candidate
    (in iteration order) whose lower-cased form equals the normalised
    *emailfront* is printed to stdout and returned.

    Args:
        emailfront: string to match (callers in this file pass the part of
            an address before "@").
        candidates: iterable of candidate alias strings.

    Returns:
        The matching candidate, unchanged, or ``None`` when nothing matches
        (including when *candidates* is empty).
    """
    stripped = emailfront.replace(".", "").replace(" ", "").lower()
    for c in candidates:
        if c.lower() == stripped:
            # Single-argument print(...) is valid on both Python 2 and 3 and
            # emits the same text as the old `print a, "matches", c`.
            print("%s matches %s" % (emailfront, c))
            return c
    return None
def _load_alias_csv(path):
    """Feed every row of the alias CSV at *path* into the global ``usermap``."""
    # `with` guarantees the handle is closed; the previous code left both
    # CSV files open until garbage collection.
    with open(path, "r") as handle:
        for row in csv.DictReader(handle):
            add_alias(usermap, row["alias2"], row["alias1"])


def get_usermap(confg, include_pushes = False):
    """Load alias CSVs into ``usermap`` and reduce each key to one alias.

    Reads ``aliases_prs.csv`` (and, when *include_pushes* is true,
    ``aliases_pushes.csv`` first) from ``confg["data_dir"]``.  Each file must
    have ``alias1`` and ``alias2`` columns.  After loading, every key in the
    module-level ``usermap`` is resolved:

    * exactly one alias  -> that alias becomes the value;
    * several aliases    -> ``find_close_enough`` picks the one matching the
      part of the key before "@"; if none matches, the key is deleted;
    * no aliases         -> the key is deleted.

    Args:
        confg: mapping that provides the ``"data_dir"`` directory path.
        include_pushes: also read ``aliases_pushes.csv`` when true.

    Returns:
        The module-level ``usermap``, now mapping each key to a single alias
        string.  (Earlier versions returned ``None``; ``usermap`` is still
        updated in place, so callers reading the global are unaffected.)

    NOTE: values in ``usermap`` are strings after this runs, so calling it a
    second time will fail in ``add_alias`` (``str`` has no ``add``) for any
    key that appears again.
    """
    data_dir = confg["data_dir"]

    if include_pushes:
        print("Reading pushes")
        _load_alias_csv(data_dir + "/aliases_pushes.csv")

    print("Reading pulls")
    _load_alias_csv(data_dir + "/aliases_prs.csv")

    print("Erasing duplicates")
    # Forget any learned ambiguities.  Iterate over a snapshot so entries can
    # be replaced or deleted while walking the table.
    for email, aliases in list(usermap.items()):
        if len(aliases) == 1:
            chosen = next(iter(aliases))
        else:
            # Several aliases: keep the one resembling the address's local
            # part.  An empty set also lands here and yields None.
            chosen = find_close_enough(email.split("@")[0], aliases)

        if chosen is None:
            del usermap[email]
        else:
            usermap[email] = chosen

    print("Done with heuristic")
    return usermap