From bb8d4159a168669618700776fa421bbe0aade8e1 Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Mon, 6 May 2019 17:25:03 +0300 Subject: [PATCH 01/11] test run plsa --- plsa-service/plsa/plsa_wrapper.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/plsa-service/plsa/plsa_wrapper.py b/plsa-service/plsa/plsa_wrapper.py index 82e3d3d..f4ef541 100644 --- a/plsa-service/plsa/plsa_wrapper.py +++ b/plsa-service/plsa/plsa_wrapper.py @@ -145,7 +145,7 @@ def generate_topics_json(self): def run_plsa(): - path = str(pathlib.Path(os.path.abspath('')).parents[2])+'/appData/misc/extracted.json' + path = str(pathlib.Path(os.path.abspath('')).parents[2])+'/appData/misc/extracted_2.json' docs = [] @@ -157,6 +157,12 @@ def run_plsa(): docs.append(fileList[k]) s = PLSA_wrapper(docs,local=True) + s.topic_divider = 0 + s.num_topics = 2 + s.max_iter = 22 + s.beta = 1 + s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/' + os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[2])+'/appData/plsa/plsa-parameters/'+s.unique_folder_naming) s.write_to_json() s.generate_topics_json() From 4d87ca437875a0be51b48957d89369b406ab672d Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Mon, 6 May 2019 17:26:58 +0300 Subject: [PATCH 02/11] init lda service code --- lda-service/lda_wrapper.py | 128 +++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 lda-service/lda_wrapper.py diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py new file mode 100644 index 0000000..4b71523 --- /dev/null +++ b/lda-service/lda_wrapper.py @@ -0,0 +1,128 @@ +__author__ = 'eyob' +# Tested on python3.6 + + +import psutil +print('===================ram used at program start:',float(list(psutil.virtual_memory())[3])/1073741824.0,'GB') + +import os +import sys +import pathlib +import csv +import random +import datetime +import time +import json +import logging + +sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/preprocessing') +sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/topic-analysis/plsa-service/preprocessing') + +import cleansing as pclean + + +class LDA_wrapper: + + def __init__(self, docs,local=False): + + self.docs = docs + if not local: + self.root_path = str(pathlib.Path(os.path.abspath('')).parents[0]) + '/appData/lda/' + else: + self.root_path = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/lda/' + print('>>>>>>>>>>>>>self.root_path>>>>>>>>>>>') + print(self.root_path) + self.extracted_folder = self.root_path + 'extracted/' + self.file_dict = self.root_path + 'dict/' + self.source_texts = self.root_path + 'extracted/' + self.output_dir = self.root_path + 'cleaned/' + print(self.output_dir) + self.folder = self.root_path + 'cleaned/' + self.dict_path = self.root_path + 'dict/' + self.lda_parameters_path = self.root_path + 'lda-parameters/' + self.LDA_PARAMETERS_PATH = '' + + # self.messages + self.unique_folder_naming = None + self.num_topics = None + self.topic_divider = None + self.max_iter = None + + def __del__(self): + + # Close db connections + pass + + + + def write_to_json(self): + + + + # self.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/' + print(self.unique_folder_naming) + + os.mkdir(self.extracted_folder+self.unique_folder_naming) + + contents_dict = {} + + file = self.extracted_folder + self.unique_folder_naming + 'extracted' + '.json' + + for i in range(len(self.docs)): + contents_dict[str(i)] = self.docs[i] + + with open(file, "w") as f: + json.dump(contents_dict, f, indent=4) + + print("len(contents_dict):",len(contents_dict)) + + + + + + + + + + + + + + +def run_lda(): + + docs = [] + s = LDA_wrapper(docs, local=True) + + path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_2.json' + + docs = [] + + + with open(path, "r") as read_file: + fileList = json.load(read_file) + + for k in fileList: + docs.append(fileList[k]) + + s = LDA_wrapper(docs,local=True) + # s.topic_divider = 0 + # s.num_topics = 2 + # s.max_iter = 22 + # s.beta = 1 + s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/' + os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/plsa/plsa-parameters/'+s.unique_folder_naming) + s.write_to_json() + # s.generate_topics_json() + + + + +__end__ = '__end__' + + +if __name__ == '__main__': + + run_lda() + + pass \ No newline at end of file From f072f75bff83f0ca93769d0a2a55bdfcf9589299 Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Mon, 6 May 2019 18:28:33 +0300 Subject: [PATCH 03/11] finished upto preprocessing --- lda-service/lda_wrapper.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py index 4b71523..631d8c6 100644 --- a/lda-service/lda_wrapper.py +++ b/lda-service/lda_wrapper.py @@ -15,12 +15,14 @@ import json import logging +sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/plsa') sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/preprocessing') +sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/plsa') sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/topic-analysis/plsa-service/preprocessing') +import example_plsa as pplsa import cleansing as pclean - class LDA_wrapper: def __init__(self, docs,local=False): @@ -78,6 +80,27 @@ def write_to_json(self): + def generate_topics_json(self): + + start_time_1 = time.time() + + pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict' + pclean.source_texts = self.source_texts + self.unique_folder_naming + 'extracted.json' + pclean.output_dir = self.output_dir + self.unique_folder_naming + + os.mkdir(pclean.output_dir) + + # Do cleansing on the data and turing it to bad-of-words model + + with open(self.lda_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f: + f.write('Preprocessing started.') + + pclean.pre_pro() + + with open(self.lda_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f: + f.write('Preprocessing finished. Topic analysis started.') + + @@ -111,9 +134,9 @@ def run_lda(): # s.max_iter = 22 # s.beta = 1 s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/' - os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/plsa/plsa-parameters/'+s.unique_folder_naming) + os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/lda/lda-parameters/'+s.unique_folder_naming) s.write_to_json() - # s.generate_topics_json() + s.generate_topics_json() From 08ff9e1febb3962c81f605cb80739f490d244d31 Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Mon, 6 May 2019 18:31:29 +0300 Subject: [PATCH 04/11] comment out unnecessary file include --- lda-service/lda_wrapper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py index 631d8c6..b637001 100644 --- a/lda-service/lda_wrapper.py +++ b/lda-service/lda_wrapper.py @@ -15,12 +15,12 @@ import json import logging -sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/plsa') +# sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/plsa') sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/preprocessing') -sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/plsa') +# sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/plsa') sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/topic-analysis/plsa-service/preprocessing') -import example_plsa as pplsa +# import example_plsa as pplsa import cleansing as pclean class LDA_wrapper: From b24a0ff4be18892b3feb6e0cd2b33e63a0afe55a Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Mon, 6 May 2019 20:37:02 +0300 Subject: [PATCH 05/11] about to start lda training --- lda-service/lda_wrapper.py | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py index b637001..dcbcd5a 100644 --- a/lda-service/lda_wrapper.py +++ b/lda-service/lda_wrapper.py @@ -15,6 +15,22 @@ import json import logging +import re +import numpy as np +import pandas as pd + +import gensim +import gensim.corpora as corpora +from gensim.utils import simple_preprocess +from gensim.models import CoherenceModel + +# Enable logging for gensim - optional +import logging +logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR) + +import warnings +warnings.filterwarnings("ignore",category=DeprecationWarning) + # sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/plsa') sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/preprocessing') # sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/plsa') @@ -100,6 +116,31 @@ def generate_topics_json(self): with open(self.lda_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f: f.write('Preprocessing finished. Topic analysis started.') + with open(pclean.output_dir+'cleaned.json', "r") as read_file: + ret = json.load(read_file) + + data_lemmatized = [] + + for k in ret: + data_lemmatized.append(ret[k].splitlines()) + + # Create Dictionary + id2word = corpora.Dictionary(data_lemmatized) + + # Create Corpus + texts = data_lemmatized + + # Term Document Frequency + corpus = [id2word.doc2bow(text) for text in texts] + + # View + # print(corpus[0:1]) + # print(id2word[1]) + + + + + From 9f6fdc540a7d19728a39e75e83c32e7db6bbc549 Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Wed, 8 May 2019 13:08:17 +0300 Subject: [PATCH 06/11] Finished basic lda topic extraction --- lda-service/lda_wrapper.py | 61 ++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py index dcbcd5a..606c898 100644 --- a/lda-service/lda_wrapper.py +++ b/lda-service/lda_wrapper.py @@ -18,6 +18,7 @@ import re import numpy as np import pandas as pd +from pprint import pprint import gensim import gensim.corpora as corpora @@ -38,6 +39,7 @@ # import example_plsa as pplsa import cleansing as pclean +import porter_dictionary class LDA_wrapper: @@ -96,7 +98,10 @@ def write_to_json(self): - def generate_topics_json(self): + def generate_topics_gensim(self,num_topics, passes, chunksize, + update_every=0, alpha='auto', eta='auto', decay=0.5, offset=1.0, eval_every=1, + iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None, + minimum_phi_value=0.01, per_word_topics=True, callbacks=None): start_time_1 = time.time() @@ -137,16 +142,50 @@ def generate_topics_json(self): # print(corpus[0:1]) # print(id2word[1]) + self.lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, + id2word=id2word, + num_topics=num_topics, + random_state=random_state, + update_every=update_every, + chunksize=chunksize, + passes=passes, + alpha=alpha, + eta=eta, + per_word_topics=per_word_topics, + decay=decay, + offset=offset, + eval_every=eval_every, + iterations=iterations, + gamma_threshold=gamma_threshold, + minimum_probability=minimum_probability, + minimum_phi_value=minimum_phi_value, + callbacks=callbacks) + port_dict = porter_dictionary.porter_dictionary() + topics = self.lda_model.show_topics(num_topics=num_topics,num_words=300,formatted=False) + extracted_topics = [] + for topic in topics: + a_topic = [] + for item in topic[1]: + a_topic.append(item[0]) + extracted_topics.append(a_topic) + port_dict.load_dict(self.dict_path + self.unique_folder_naming[:-1] + '_dict') + self.topics_destemmed = [] - - + for i in extracted_topics: + destemmed = [] + for j in i: + try: + destemmed.append(port_dict.dictionary[j][0]) + except: + logging.exception('message') + self.topics_destemmed.append(destemmed) @@ -158,7 +197,9 @@ def run_lda(): docs = [] s = LDA_wrapper(docs, local=True) - path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_2.json' + # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_2.json' + path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_singnet_all.json' + # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_hersheys_all.json' docs = [] @@ -177,7 +218,17 @@ def run_lda(): s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/' os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/lda/lda-parameters/'+s.unique_folder_naming) s.write_to_json() - s.generate_topics_json() + # s.generate_topics_gensim(num_topics=3,passes=100,chunksize=200) + s.generate_topics_gensim(num_topics=2,passes=100,chunksize=200,random_state=2) + + # pprint(s.lda_model.print_topics(3,50)) + # topics = s.lda_model.show_topics(2,5,formatted=False) + # print(topics) + print_two_d(s.topics_destemmed) + +def print_two_d(two_d): + for i in two_d: + print(i) From 5ace658fd3623534b54c9acf1d88eb0bd2a8f771 Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Wed, 8 May 2019 20:21:00 +0300 Subject: [PATCH 07/11] experimental code comparing lda with plsa --- lda-service/lda_wrapper.py | 85 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 3 deletions(-) diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py index 606c898..28bdc9e 100644 --- a/lda-service/lda_wrapper.py +++ b/lda-service/lda_wrapper.py @@ -198,8 +198,10 @@ def run_lda(): s = LDA_wrapper(docs, local=True) # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_2.json' - path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_singnet_all.json' + # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_singnet_all.json' + # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_bio_all.json' # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_hersheys_all.json' + path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_hr_all.json' docs = [] @@ -218,14 +220,91 @@ def run_lda(): s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/' os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/lda/lda-parameters/'+s.unique_folder_naming) s.write_to_json() - # s.generate_topics_gensim(num_topics=3,passes=100,chunksize=200) - s.generate_topics_gensim(num_topics=2,passes=100,chunksize=200,random_state=2) + s.generate_topics_gensim(num_topics=3,passes=22,chunksize=200) + # s.generate_topics_gensim(num_topics=2,passes=22,chunksize=200) + # s.generate_topics_gensim(num_topics=2,passes=100,chunksize=200,random_state=2) + # pprint(s.lda_model.print_topics(3,50)) # topics = s.lda_model.show_topics(2,5,formatted=False) # print(topics) print_two_d(s.topics_destemmed) + + # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/singnet_all_plsa_topics_2.txt' + # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/hersheys_all_plsa_topics.txt' + # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/bio_all_plsa_topics.txt' + topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/hr_all_plsa_topics.txt' + with open(topics_snet_all_plsa_file,'r') as f: + temp_list = f.readlines() + topics_snet_all_plsa = [] + for l in temp_list: + topics_snet_all_plsa.append(l.split(',')) + + for i in range(len(topics_snet_all_plsa)): + for j in range(len(topics_snet_all_plsa[0])): + topics_snet_all_plsa[i][j] = topics_snet_all_plsa[i][j].strip() + + topics_snet_all_plsa_file_2 = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/misc/topics/hr_all_plsa_topics_2.txt' + with open(topics_snet_all_plsa_file_2, 'r') as f: + temp_list = f.readlines() + topics_snet_all_plsa_2 = [] + for l in temp_list: + topics_snet_all_plsa_2.append(l.split(',')) + + for i in range(len(topics_snet_all_plsa_2)): + for j in range(len(topics_snet_all_plsa_2[0])): + topics_snet_all_plsa_2[i][j] = topics_snet_all_plsa_2[i][j].strip() + + + + # two topics + # print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[0],depth=30)) + # print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[1],depth=30)) + # print('=========================') + # print(dot_product(topics_snet_all_plsa[1],s.topics_destemmed[0],depth=30)) + # print(dot_product(topics_snet_all_plsa[1],s.topics_destemmed[1],depth=30)) + # two topics + + # three topics + print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[0],depth=30)) + print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[1],depth=30)) + print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[2],depth=30)) + print('=========================') + print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[0], depth=30)) + print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[1], depth=30)) + print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[2], depth=30)) + print('=========================') + print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[0], depth=30)) + print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[1], depth=30)) + print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[2], depth=30)) + print('=========================') + # three topics + + # plsa self + # print(dot_product(topics_snet_all_plsa[0], topics_snet_all_plsa_2[0], depth=30)) + # print(dot_product(topics_snet_all_plsa[0], topics_snet_all_plsa_2[1], depth=30)) + # print(dot_product(topics_snet_all_plsa[0], topics_snet_all_plsa_2[2], depth=30)) + # print('=========================') + # print(dot_product(topics_snet_all_plsa[1], topics_snet_all_plsa_2[0], depth=30)) + # print(dot_product(topics_snet_all_plsa[1], topics_snet_all_plsa_2[1], depth=30)) + # print(dot_product(topics_snet_all_plsa[1], topics_snet_all_plsa_2[2], depth=30)) + # print('=========================') + # print(dot_product(topics_snet_all_plsa[2], topics_snet_all_plsa_2[0], depth=30)) + # print(dot_product(topics_snet_all_plsa[2], topics_snet_all_plsa_2[1], depth=30)) + # print(dot_product(topics_snet_all_plsa[2], topics_snet_all_plsa_2[2], depth=30)) + # print('=========================') + # plsa self + + +def dot_product(list_1,list_2,depth=30): + + count = 0 + for i in list_1[0:depth]: + if i in list_2[0:depth]: + count = count + 1 + return count + def print_two_d(two_d): for i in two_d: print(i) From c4ac8d50fc45cc5502d1aafd68abe33c8eb35972 Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Mon, 19 Aug 2019 12:08:49 +0300 Subject: [PATCH 08/11] converted threading to mp to spawn plsa engine --- topic_analysis_grpc.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/topic_analysis_grpc.py b/topic_analysis_grpc.py index fabff29..700c0d6 100644 --- a/topic_analysis_grpc.py +++ b/topic_analysis_grpc.py @@ -24,6 +24,7 @@ import plsa_wrapper import threading +import multiprocessing as mp from service_spec import topic_analysis_pb2 from service_spec import topic_analysis_pb2_grpc @@ -96,8 +97,10 @@ def PLSA(self,request,context): unique_folder_naming = str(datetime.datetime.now()).replace(':', '-').replace('.', '-') + '^' + str(random.randint(100000000000, 999999999999)) + '/' - thread1 = threading.Thread(target=generate_topics_plsa, args=(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta)) - thread1.start() + # thread1 = threading.Thread(target=generate_topics_plsa, args=(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta)) + p1 = mp.Process(target=generate_topics_plsa, args=(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta)) + + p1.start() resp = topic_analysis_pb2.PLSAResponse(status=True, message='success', handle=unique_folder_naming[:-1].replace('-','e').replace(' ','d').replace('^','y')) From 43cd443a9588f0d42b6b8f9aa38282127d6fbafa Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Tue, 20 Aug 2019 14:01:44 +0300 Subject: [PATCH 09/11] changed number of threads to 1 --- topic_analysis_grpc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topic_analysis_grpc.py b/topic_analysis_grpc.py index 700c0d6..83890b1 100644 --- a/topic_analysis_grpc.py +++ b/topic_analysis_grpc.py @@ -160,7 +160,7 @@ def generate_topics_plsa(docs,unique_folder_naming,num_topics,topic_divider,maxi def serve(): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) + server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) topic_analysis_pb2_grpc.add_TopicAnalysisServicer_to_server(TopicAnalysis(), server) print('Starting server. Listening on port 5000.') server.add_insecure_port('127.0.0.1:5000') From 823ad744964a6401cabc7220349643123d8c125a Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Tue, 20 Aug 2019 14:02:53 +0300 Subject: [PATCH 10/11] bumped up docker version --- Docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Docker/Dockerfile b/Docker/Dockerfile index 17075cd..4cd1ced 100644 --- a/Docker/Dockerfile +++ b/Docker/Dockerfile @@ -80,7 +80,7 @@ RUN mkdir /home/top/pay/ \ -ENV DAEMON_VERSION 0.1.7 +ENV DAEMON_VERSION 1.0.0 RUN wget -O snet-daemon.tar.gz https://github.com/singnet/snet-daemon/releases/download/v$DAEMON_VERSION/snet-daemon-v$DAEMON_VERSION-linux-amd64.tar.gz && \ tar --strip-components=1 -xzf snet-daemon.tar.gz && \ From e38372c9ff06116a01361a21e806edc7fc74de25 Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Wed, 21 Aug 2019 10:24:20 +0300 Subject: [PATCH 11/11] remaining todo for lda --- lda-service/lda_wrapper.py | 82 ++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py index 28bdc9e..4c44ad6 100644 --- a/lda-service/lda_wrapper.py +++ b/lda-service/lda_wrapper.py @@ -187,6 +187,17 @@ def generate_topics_gensim(self,num_topics, passes, chunksize, logging.exception('message') self.topics_destemmed.append(destemmed) + ''' + Seems remaining code is to extract any produced parameters from the resulting lda model, like the weights. We need to define the proto formats of course + for all the returned parameters + + also code that writes the final status that shows total running time that elapsed + + in general, compare the outputs of plsa and as much as possible try to apply it to the results that are returned by lda + ''' + + + @@ -201,7 +212,8 @@ def run_lda(): # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_singnet_all.json' # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_bio_all.json' # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_hersheys_all.json' - path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_hr_all.json' + # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_hr_all.json' + path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_all.json' docs = [] @@ -220,7 +232,8 @@ def run_lda(): s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/' os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/lda/lda-parameters/'+s.unique_folder_naming) s.write_to_json() - s.generate_topics_gensim(num_topics=3,passes=22,chunksize=200) + # s.generate_topics_gensim(num_topics=3,passes=22,chunksize=200) + s.generate_topics_gensim(num_topics=70,passes=22,chunksize=20000) # s.generate_topics_gensim(num_topics=2,passes=22,chunksize=200) # s.generate_topics_gensim(num_topics=2,passes=100,chunksize=200,random_state=2) @@ -234,27 +247,28 @@ def run_lda(): # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/singnet_all_plsa_topics_2.txt' # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/hersheys_all_plsa_topics.txt' # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/bio_all_plsa_topics.txt' - topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/hr_all_plsa_topics.txt' - with open(topics_snet_all_plsa_file,'r') as f: - temp_list = f.readlines() - topics_snet_all_plsa = [] - for l in temp_list: - topics_snet_all_plsa.append(l.split(',')) - - for i in range(len(topics_snet_all_plsa)): - for j in range(len(topics_snet_all_plsa[0])): - topics_snet_all_plsa[i][j] = topics_snet_all_plsa[i][j].strip() - - topics_snet_all_plsa_file_2 = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/misc/topics/hr_all_plsa_topics_2.txt' - with open(topics_snet_all_plsa_file_2, 'r') as f: - temp_list = f.readlines() - topics_snet_all_plsa_2 = [] - for l in temp_list: - topics_snet_all_plsa_2.append(l.split(',')) - for i in range(len(topics_snet_all_plsa_2)): - for j in range(len(topics_snet_all_plsa_2[0])): - topics_snet_all_plsa_2[i][j] = topics_snet_all_plsa_2[i][j].strip() + # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/hr_all_plsa_topics.txt' + # with open(topics_snet_all_plsa_file,'r') as f: + # temp_list = f.readlines() + # topics_snet_all_plsa = [] + # for l in temp_list: + # topics_snet_all_plsa.append(l.split(',')) + # + # for i in range(len(topics_snet_all_plsa)): + # for j in range(len(topics_snet_all_plsa[0])): + # topics_snet_all_plsa[i][j] = topics_snet_all_plsa[i][j].strip() + # + # topics_snet_all_plsa_file_2 = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/misc/topics/hr_all_plsa_topics_2.txt' + # with open(topics_snet_all_plsa_file_2, 'r') as f: + # temp_list = f.readlines() + # topics_snet_all_plsa_2 = [] + # for l in temp_list: + # topics_snet_all_plsa_2.append(l.split(',')) + # + # for i in range(len(topics_snet_all_plsa_2)): + # for j in range(len(topics_snet_all_plsa_2[0])): + # topics_snet_all_plsa_2[i][j] = topics_snet_all_plsa_2[i][j].strip() @@ -267,18 +281,18 @@ def run_lda(): # two topics # three topics - print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[0],depth=30)) - print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[1],depth=30)) - print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[2],depth=30)) - print('=========================') - print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[0], depth=30)) - print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[1], depth=30)) - print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[2], depth=30)) - print('=========================') - print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[0], depth=30)) - print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[1], depth=30)) - print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[2], depth=30)) - print('=========================') + # print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[0],depth=30)) + # print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[1],depth=30)) + # print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[2],depth=30)) + # print('=========================') + # print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[0], depth=30)) + # print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[1], depth=30)) + # print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[2], depth=30)) + # print('=========================') + # print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[0], depth=30)) + # print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[1], depth=30)) + # print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[2], depth=30)) + # print('=========================') # three topics # plsa self