-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathget_time.py
More file actions
123 lines (102 loc) · 3.07 KB
/
get_time.py
File metadata and controls
123 lines (102 loc) · 3.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import bs4
from bs4 import BeautifulSoup
import urllib2
from spacy.en import English
from Company import Company
import sys
import wikipedia
# Classify one Wikipedia category string against the seed phrases in `base`.
def cat_max(cat_entry):
    """Return the label from the module-level `base` dict whose seed phrase
    is most spaCy-similar to `cat_entry`, or 'misc' when no label clears
    the 0.55 confidence threshold. Uses the module-level `nlp` pipeline.
    """
    cat_entry = unicode(cat_entry)
    entry_doc = nlp(cat_entry)  # parse once; reused against every seed phrase
    best_label = ''
    best_score = 0
    for label, cases in base.items():
        label_best = 0
        for case in cases:
            score = entry_doc.similarity(nlp(unicode(case)))
            # print(cat_entry, case, score)
            if score > label_best:
                label_best = score
        if label_best > best_score:
            best_label = label
            best_score = label_best
    threshold = 0.55  # confidence level
    return 'misc' if best_score < threshold else best_label
def check_categories(category):
    """Return True when `category` is a stock-listing or Wikipedia
    maintenance category (boilerplate to skip), False when it may carry
    descriptive information about the company.

    The match is a plain substring test against a fixed set of markers.
    """
    # Tuple of substrings that flag non-descriptive categories.
    skip_markers = ("Dow Jones", "New York Stock Exchange", "NASDAQ",
                    "Wikidata", "needing additional references", "infobox")
    # any() replaces the original `if <or-chain>: return True else: return False`.
    return any(marker in category for marker in skip_markers)
# Seed phrases per label: cat_max assigns a Wikipedia category to whichever
# label has the most-similar seed phrase. 'misc' is the fallback bucket and
# deliberately has no seeds.
base = {
    'location': ["Based in", "Located in"],
    'industry': ["Technology Company", "Financial", "Consulting", "Manufacturing"],
    'time': ["Established in", "Created in"],
    'products': ["Produces", "Creates", "Develops", "Provides"],
    'misc': [],
}
companies = []
nlp = English()  # spaCy English pipeline; loading the models is slow
############################################
# Scrape company names from a pinned revision of the S&P 500 constituents
# table (oldid fixes the revision so the table layout is stable).
wiki = "https://en.wikipedia.org/w/index.php?title=List_of_S%26P_500_companies&oldid=697200065"
page = urllib2.urlopen(wiki)
soup = BeautifulSoup(page, "html.parser")
table = soup.find("table", { "class" : "wikitable sortable" })
allNames = []
numberOfCompanies = 0
# assumes each table row spans 8 <td> cells and cell index 1 within a row is
# the company name, so names live at flat indices 1, 9, 17, ... — TODO confirm
# against this revision's markup
i = 1
values = table.findAll("td")
while(numberOfCompanies < 510):  # 510 rows attempted; excess indices are skipped below
    try:
        allNames.append(values[i].text)
    except:  # NOTE(review): bare except silently drops out-of-range indices — consider IndexError
        pass
    i += 8
    numberOfCompanies += 1
counter = 0
# name -> [] placeholder map. NOTE(review): dicRef is filled with empty lists
# here but never populated afterwards in this file — possibly dead code.
dicRef = {}
for each in allNames:
    dicRef[each] = []
# Fetch each company's Wikipedia page. companies_pages stays index-aligned
# with allNames: direct title lookup first, then the top search hit, and the
# string "NA" when both fail.
companies_pages = []
for each in allNames:
    try:
        page = wikipedia.page(each)
        companies_pages.append(page)
    except:  # direct lookup failed (disambiguation / missing page) — try search
        try:
            a = wikipedia.search(each)
            page = wikipedia.page(a[0])
            companies_pages.append(page)
        except:  # search also failed; sentinel keeps the list aligned
            companies_pages.append("NA")
    counter = counter + 1
print(len(companies_pages))
# Build one Company object per name and attach each informative Wikipedia
# category under the label chosen by cat_max.
companies_obj = []
# NOTE(review): hard-coded 504 here vs the 510 rows attempted above — confirm
# the intended count. Also, failed lookups stored the string "NA" in
# companies_pages, and "NA".categories would raise AttributeError — verify
# this loop never hits such an entry.
for i in range(0, 504, 1):
    companies_obj.append(Company(allNames[i]))
    comp_cats = companies_pages[i].categories
    for category in comp_cats:
        if(check_categories(category)):
            continue  # skip listing/maintenance boilerplate categories
        label = cat_max(category)
        companies_obj[-1].add_parameter(label,category)
print(len(companies_obj))
# Map company name -> founding year pulled from its 'time'-labelled
# categories; -1 marks companies where no year could be extracted.
compIndDic = {}
# print(companies_obj[0].data)
for each in companies_obj:
    for values in each.data:
        if(values == 'time'):
            try:
                a = str(each.data[values][0])
                # Keep only the digit characters, e.g.
                # "Companies established in 1923" -> 1923.
                # "".join(...) fixes the original int(filter(...)), which is
                # Python-2-only: on Python 3 filter returns an iterator, so
                # int() always raised and every company silently got -1.
                time_value = int("".join(filter(str.isdigit, a)))
                compIndDic[each.name] = time_value
            except Exception:  # no digits / bad entry: best-effort fallback
                compIndDic[each.name] = -1
print(compIndDic)
import json

# Persist the company -> founding-year mapping for downstream consumers.
with open('time.json', 'w') as out_handle:
    json.dump(compIndDic, out_handle)