Skip to content

Commit b22483e

Browse files
authored
Merge pull request #23 from edyirdaw/dev
LDA service almost done
2 parents 74e6e94 + e38372c commit b22483e

File tree

4 files changed

+350
-5
lines changed

4 files changed

+350
-5
lines changed

Docker/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ RUN mkdir /home/top/pay/ \
8080

8181

8282

83-
ENV DAEMON_VERSION 0.1.7
83+
ENV DAEMON_VERSION 1.0.0
8484

8585
RUN wget -O snet-daemon.tar.gz https://github.com/singnet/snet-daemon/releases/download/v$DAEMON_VERSION/snet-daemon-v$DAEMON_VERSION-linux-amd64.tar.gz && \
8686
tar --strip-components=1 -xzf snet-daemon.tar.gz && \

lda-service/lda_wrapper.py

Lines changed: 336 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
__author__ = 'eyob'
# Tested on python3.6


# Report process memory at startup (index 3 of psutil.virtual_memory() is
# the "used" field; 1073741824 bytes = 1 GiB).
import psutil
print('===================ram used at program start:', float(list(psutil.virtual_memory())[3]) / 1073741824.0, 'GB')

import os
import sys
import pathlib
import csv
import random
import datetime
import time
import json
import logging
import re

import numpy as np
import pandas as pd
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Enable logging for gensim - optional.
# NOTE(review): a duplicate `import logging` was removed here; the module is
# already imported once above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Make the shared preprocessing helpers importable whether this file is run
# from inside lda-service/ or from one directory further up.
# sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/plsa')
sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/preprocessing')
# sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/plsa')
sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/topic-analysis/plsa-service/preprocessing')

# Project-local preprocessing modules (resolved via the sys.path entries above).
import cleansing as pclean
import porter_dictionary
class LDA_wrapper:
    """Wraps gensim LDA topic modelling over a set of raw documents.

    Workflow: write the documents to a per-run ``extracted.json`` working
    file, run the shared cleansing/stemming preprocessing, train an LDA
    model, then de-stem the resulting topic terms via the porter dictionary.
    Results land in ``self.lda_model`` and ``self.topics_destemmed``.
    """

    def __init__(self, docs, local=False):
        """Store the documents and derive the appData working paths.

        docs  -- list of raw document strings.
        local -- when True, resolve appData one directory further up
                 (repository-root layout) instead of the service layout.
        """
        self.docs = docs
        if not local:
            self.root_path = str(pathlib.Path(os.path.abspath('')).parents[0]) + '/appData/lda/'
        else:
            self.root_path = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/lda/'
        print('>>>>>>>>>>>>>self.root_path>>>>>>>>>>>')
        print(self.root_path)
        self.extracted_folder = self.root_path + 'extracted/'
        self.file_dict = self.root_path + 'dict/'
        self.source_texts = self.root_path + 'extracted/'
        self.output_dir = self.root_path + 'cleaned/'
        print(self.output_dir)
        self.folder = self.root_path + 'cleaned/'
        self.dict_path = self.root_path + 'dict/'
        self.lda_parameters_path = self.root_path + 'lda-parameters/'
        self.LDA_PARAMETERS_PATH = ''

        # Per-run state; unique_folder_naming must be set by the caller
        # before write_to_json()/generate_topics_gensim() are used.
        self.unique_folder_naming = None
        self.num_topics = None
        self.topic_divider = None
        self.max_iter = None

    def __del__(self):
        # Placeholder: close db connections here if any are ever opened.
        pass

    def write_to_json(self):
        """Dump self.docs as {index: text} into <extracted>/<run>/extracted.json."""
        print(self.unique_folder_naming)

        os.mkdir(self.extracted_folder + self.unique_folder_naming)

        contents_dict = {}
        file = self.extracted_folder + self.unique_folder_naming + 'extracted' + '.json'

        for i in range(len(self.docs)):
            contents_dict[str(i)] = self.docs[i]

        with open(file, "w") as f:
            json.dump(contents_dict, f, indent=4)

        print("len(contents_dict):", len(contents_dict))

    def generate_topics_gensim(self, num_topics, passes, chunksize,
                               update_every=0, alpha='auto', eta='auto', decay=0.5, offset=1.0, eval_every=1,
                               iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None,
                               minimum_phi_value=0.01, per_word_topics=True, callbacks=None):
        """Preprocess the extracted documents, train LDA, and de-stem topics.

        Parameters mirror gensim.models.ldamodel.LdaModel. On return,
        ``self.lda_model`` holds the trained model and
        ``self.topics_destemmed`` holds one list of de-stemmed terms per
        topic (up to 300 terms each).
        """
        # Point the shared cleansing module at this run's working files.
        pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict'
        pclean.source_texts = self.source_texts + self.unique_folder_naming + 'extracted.json'
        pclean.output_dir = self.output_dir + self.unique_folder_naming

        os.mkdir(pclean.output_dir)

        # Do cleansing on the data, turning it into a bag-of-words model.
        # Status file lets the gRPC caller poll long-running progress.
        with open(self.lda_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f:
            f.write('Preprocessing started.')

        pclean.pre_pro()

        with open(self.lda_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f:
            f.write('Preprocessing finished. Topic analysis started.')

        with open(pclean.output_dir + 'cleaned.json', "r") as read_file:
            ret = json.load(read_file)

        # Each cleaned document is stored one token per line.
        data_lemmatized = []
        for k in ret:
            data_lemmatized.append(ret[k].splitlines())

        # Create Dictionary
        id2word = corpora.Dictionary(data_lemmatized)

        # Create Corpus: term-document frequency (bag-of-words per document)
        texts = data_lemmatized
        corpus = [id2word.doc2bow(text) for text in texts]

        self.lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                         id2word=id2word,
                                                         num_topics=num_topics,
                                                         random_state=random_state,
                                                         update_every=update_every,
                                                         chunksize=chunksize,
                                                         passes=passes,
                                                         alpha=alpha,
                                                         eta=eta,
                                                         per_word_topics=per_word_topics,
                                                         decay=decay,
                                                         offset=offset,
                                                         eval_every=eval_every,
                                                         iterations=iterations,
                                                         gamma_threshold=gamma_threshold,
                                                         minimum_probability=minimum_probability,
                                                         minimum_phi_value=minimum_phi_value,
                                                         callbacks=callbacks)

        port_dict = porter_dictionary.porter_dictionary()

        topics = self.lda_model.show_topics(num_topics=num_topics, num_words=300, formatted=False)

        # topic[1] is the (term, weight) list; keep the term, drop the weight.
        extracted_topics = []
        for topic in topics:
            a_topic = []
            for item in topic[1]:
                a_topic.append(item[0])
            extracted_topics.append(a_topic)

        port_dict.load_dict(self.dict_path + self.unique_folder_naming[:-1] + '_dict')

        self.topics_destemmed = []
        for stemmed_topic in extracted_topics:
            destemmed = []
            for stem in stemmed_topic:
                try:
                    destemmed.append(port_dict.dictionary[stem][0])
                except Exception:
                    # Was a bare `except:`; narrowed so SystemExit and
                    # KeyboardInterrupt are not swallowed. A stem missing
                    # from the porter dictionary is logged and skipped.
                    logging.exception('message')
            self.topics_destemmed.append(destemmed)

        # TODO(review): extract remaining model parameters (e.g. weights) and
        # define proto formats for them; write a final status line with the
        # total elapsed running time (an unused start-time local was removed
        # here); align the returned outputs with the PLSA service where
        # possible.
def run_lda():
    """Local driver: load extracted documents, run gensim LDA, print topics.

    Reads the pre-extracted corpus from appData/misc, trains a 70-topic
    model, and prints the de-stemmed topic term lists. Intended for manual
    runs (`python lda_wrapper.py`), not for the gRPC service path.
    """
    # Other corpora used during experimentation:
    # extracted_2.json, extracted_singnet_all.json, extracted_bio_all.json,
    # extracted_hersheys_all.json, extracted_hr_all.json
    path = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/misc/extracted_all.json'

    with open(path, "r") as read_file:
        fileList = json.load(read_file)

    # NOTE(review): removed a first LDA_wrapper instance that was built on an
    # empty docs list and immediately discarded, plus a duplicate `docs = []`.
    docs = []
    for k in fileList:
        docs.append(fileList[k])

    s = LDA_wrapper(docs, local=True)
    s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
    os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/lda/lda-parameters/' + s.unique_folder_naming)
    s.write_to_json()
    # Experimented settings: (3, 22, 200), (2, 22, 200), (2, 100, 200, random_state=2)
    s.generate_topics_gensim(num_topics=70, passes=22, chunksize=20000)

    print_two_d(s.topics_destemmed)
    # NOTE(review): a large block of commented-out PLSA/LDA topic-overlap
    # comparison code (dot_product experiments against *_plsa_topics files)
    # was removed as dead code; see repository history if it is needed again.
def dot_product(list_1, list_2, depth=30):
    """Count the overlap between the top-`depth` terms of two ranked lists.

    Each of the first `depth` items of `list_1` that also appears among the
    first `depth` items of `list_2` adds 1; duplicates in `list_1` count
    each time they match, mirroring the original behaviour.

    Returns the overlap count as an int (0 for empty input).
    """
    # Hoist the invariant slice out of the loop: the original re-sliced
    # list_2 on every iteration, making the comparison needlessly O(n^2)
    # in allocations.
    window = list_2[0:depth]
    count = 0
    for term in list_1[0:depth]:
        if term in window:
            count += 1
    return count
def print_two_d(two_d):
    """Print each row of a 2-D sequence on its own line."""
    for row in two_d:
        print(row)
326+
327+
328+
329+
__end__ = '__end__'
330+
331+
332+
if __name__ == '__main__':
333+
334+
run_lda()
335+
336+
pass

plsa-service/plsa/plsa_wrapper.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ def generate_topics_json(self):
145145

146146
def run_plsa():
147147

148-
path = str(pathlib.Path(os.path.abspath('')).parents[2])+'/appData/misc/extracted.json'
148+
path = str(pathlib.Path(os.path.abspath('')).parents[2])+'/appData/misc/extracted_2.json'
149149

150150
docs = []
151151

@@ -157,6 +157,12 @@ def run_plsa():
157157
docs.append(fileList[k])
158158

159159
s = PLSA_wrapper(docs,local=True)
160+
s.topic_divider = 0
161+
s.num_topics = 2
162+
s.max_iter = 22
163+
s.beta = 1
164+
s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
165+
os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[2])+'/appData/plsa/plsa-parameters/'+s.unique_folder_naming)
160166
s.write_to_json()
161167
s.generate_topics_json()
162168

topic_analysis_grpc.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
import plsa_wrapper
2626
import threading
27+
import multiprocessing as mp
2728

2829
from service_spec import topic_analysis_pb2
2930
from service_spec import topic_analysis_pb2_grpc
@@ -96,8 +97,10 @@ def PLSA(self,request,context):
9697

9798
unique_folder_naming = str(datetime.datetime.now()).replace(':', '-').replace('.', '-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
9899

99-
thread1 = threading.Thread(target=generate_topics_plsa, args=(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta))
100-
thread1.start()
100+
# thread1 = threading.Thread(target=generate_topics_plsa, args=(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta))
101+
p1 = mp.Process(target=generate_topics_plsa, args=(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta))
102+
103+
p1.start()
101104

102105
resp = topic_analysis_pb2.PLSAResponse(status=True, message='success', handle=unique_folder_naming[:-1].replace('-','e').replace(' ','d').replace('^','y'))
103106

@@ -157,7 +160,7 @@ def generate_topics_plsa(docs,unique_folder_naming,num_topics,topic_divider,maxi
157160

158161
def serve():
159162

160-
server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
163+
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
161164
topic_analysis_pb2_grpc.add_TopicAnalysisServicer_to_server(TopicAnalysis(), server)
162165
print('Starting server. Listening on port 5000.')
163166
server.add_insecure_port('127.0.0.1:5000')

0 commit comments

Comments
 (0)