From bb8d4159a168669618700776fa421bbe0aade8e1 Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw <edyirdaw@gmail.com>
Date: Mon, 6 May 2019 17:25:03 +0300
Subject: [PATCH 01/11] test run plsa

---
 plsa-service/plsa/plsa_wrapper.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/plsa-service/plsa/plsa_wrapper.py b/plsa-service/plsa/plsa_wrapper.py
index 82e3d3d..f4ef541 100644
--- a/plsa-service/plsa/plsa_wrapper.py
+++ b/plsa-service/plsa/plsa_wrapper.py
@@ -145,7 +145,7 @@ def generate_topics_json(self):
 
 def run_plsa():
 
-    path = str(pathlib.Path(os.path.abspath('')).parents[2])+'/appData/misc/extracted.json'
+    path = str(pathlib.Path(os.path.abspath('')).parents[2])+'/appData/misc/extracted_2.json'
 
     docs = []
 
@@ -157,6 +157,12 @@ def run_plsa():
         docs.append(fileList[k])
 
     s = PLSA_wrapper(docs,local=True)
+    s.topic_divider = 0
+    s.num_topics = 2
+    s.max_iter = 22
+    s.beta = 1
+    s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
+    os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[2])+'/appData/plsa/plsa-parameters/'+s.unique_folder_naming)
     s.write_to_json()
     s.generate_topics_json()
 

From 4d87ca437875a0be51b48957d89369b406ab672d Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw <edyirdaw@gmail.com>
Date: Mon, 6 May 2019 17:26:58 +0300
Subject: [PATCH 02/11] init lda service code

---
 lda-service/lda_wrapper.py | 128 +++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 lda-service/lda_wrapper.py

diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py
new file mode 100644
index 0000000..4b71523
--- /dev/null
+++ b/lda-service/lda_wrapper.py
@@ -0,0 +1,128 @@
+__author__ = 'eyob'
+# Tested on python3.6
+
+
+import psutil
+print('===================ram used at program start:',float(list(psutil.virtual_memory())[3])/1073741824.0,'GB')
+
+import os
+import sys
+import pathlib
+import csv
+import random
+import datetime
+import time
+import json
+import logging
+
+sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/preprocessing')
+sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/topic-analysis/plsa-service/preprocessing')
+
+import cleansing as pclean
+
+
+class LDA_wrapper:
+
+    def __init__(self, docs,local=False):
+
+        self.docs = docs
+        if not local:
+            self.root_path = str(pathlib.Path(os.path.abspath('')).parents[0]) + '/appData/lda/'
+        else:
+            self.root_path = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/lda/'
+        print('>>>>>>>>>>>>>self.root_path>>>>>>>>>>>')
+        print(self.root_path)
+        self.extracted_folder = self.root_path + 'extracted/'
+        self.file_dict = self.root_path + 'dict/'
+        self.source_texts = self.root_path + 'extracted/'
+        self.output_dir = self.root_path + 'cleaned/'
+        print(self.output_dir)
+        self.folder = self.root_path + 'cleaned/'
+        self.dict_path = self.root_path + 'dict/'
+        self.lda_parameters_path = self.root_path + 'lda-parameters/'
+        self.LDA_PARAMETERS_PATH = ''
+
+        # self.messages
+        self.unique_folder_naming = None
+        self.num_topics = None
+        self.topic_divider = None
+        self.max_iter = None
+
+    def __del__(self):
+
+        # Close db connections
+        pass
+
+
+
+    def write_to_json(self):
+
+
+
+        # self.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
+        print(self.unique_folder_naming)
+
+        os.mkdir(self.extracted_folder+self.unique_folder_naming)
+
+        contents_dict = {}
+
+        file = self.extracted_folder + self.unique_folder_naming + 'extracted' + '.json'
+
+        for i in range(len(self.docs)):
+            contents_dict[str(i)] = self.docs[i]
+
+        with open(file, "w") as f:
+            json.dump(contents_dict, f, indent=4)
+
+        print("len(contents_dict):",len(contents_dict))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+def run_lda():
+
+    docs = []
+    s = LDA_wrapper(docs, local=True)
+
+    path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_2.json'
+
+    docs = []
+
+
+    with open(path, "r") as read_file:
+        fileList = json.load(read_file)
+
+    for k in fileList:
+        docs.append(fileList[k])
+
+    s = LDA_wrapper(docs,local=True)
+    # s.topic_divider = 0
+    # s.num_topics = 2
+    # s.max_iter = 22
+    # s.beta = 1
+    s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
+    os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/plsa/plsa-parameters/'+s.unique_folder_naming)
+    s.write_to_json()
+    # s.generate_topics_json()
+
+
+
+
+__end__ = '__end__'
+
+
+if __name__ == '__main__':
+
+    run_lda()
+
+    pass
\ No newline at end of file

From f072f75bff83f0ca93769d0a2a55bdfcf9589299 Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw <edyirdaw@gmail.com>
Date: Mon, 6 May 2019 18:28:33 +0300
Subject: [PATCH 03/11] finished up to preprocessing

---
 lda-service/lda_wrapper.py | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py
index 4b71523..631d8c6 100644
--- a/lda-service/lda_wrapper.py
+++ b/lda-service/lda_wrapper.py
@@ -15,12 +15,14 @@
 import json
 import logging
 
+sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/plsa')
 sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/preprocessing')
+sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/plsa')
 sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/topic-analysis/plsa-service/preprocessing')
 
+import example_plsa as pplsa
 import cleansing as pclean
 
-
 class LDA_wrapper:
 
     def __init__(self, docs,local=False):
@@ -78,6 +80,27 @@ def write_to_json(self):
 
 
 
+    def generate_topics_json(self):
+
+        start_time_1 = time.time()
+
+        pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict'
+        pclean.source_texts = self.source_texts + self.unique_folder_naming + 'extracted.json'
+        pclean.output_dir = self.output_dir + self.unique_folder_naming
+
+        os.mkdir(pclean.output_dir)
+
+        # Cleanse the data and turn it into a bag-of-words model
+
+        with open(self.lda_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f:
+            f.write('Preprocessing started.')
+
+        pclean.pre_pro()
+
+        with open(self.lda_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f:
+            f.write('Preprocessing finished. Topic analysis started.')
+
+
 
 
 
@@ -111,9 +134,9 @@ def run_lda():
     # s.max_iter = 22
     # s.beta = 1
     s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
-    os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/plsa/plsa-parameters/'+s.unique_folder_naming)
+    os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/lda/lda-parameters/'+s.unique_folder_naming)
     s.write_to_json()
-    # s.generate_topics_json()
+    s.generate_topics_json()
 
 
 

From 08ff9e1febb3962c81f605cb80739f490d244d31 Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw <edyirdaw@gmail.com>
Date: Mon, 6 May 2019 18:31:29 +0300
Subject: [PATCH 04/11] comment out unnecessary file include

---
 lda-service/lda_wrapper.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py
index 631d8c6..b637001 100644
--- a/lda-service/lda_wrapper.py
+++ b/lda-service/lda_wrapper.py
@@ -15,12 +15,12 @@
 import json
 import logging
 
-sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/plsa')
+# sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/plsa')
 sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/preprocessing')
-sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/plsa')
+# sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/plsa')
 sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/topic-analysis/plsa-service/preprocessing')
 
-import example_plsa as pplsa
+# import example_plsa as pplsa
 import cleansing as pclean
 
 class LDA_wrapper:

From b24a0ff4be18892b3feb6e0cd2b33e63a0afe55a Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw <edyirdaw@gmail.com>
Date: Mon, 6 May 2019 20:37:02 +0300
Subject: [PATCH 05/11] about to start lda training

---
 lda-service/lda_wrapper.py | 41 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py
index b637001..dcbcd5a 100644
--- a/lda-service/lda_wrapper.py
+++ b/lda-service/lda_wrapper.py
@@ -15,6 +15,22 @@
 import json
 import logging
 
+import re
+import numpy as np
+import pandas as pd
+
+import gensim
+import gensim.corpora as corpora
+from gensim.utils import simple_preprocess
+from gensim.models import CoherenceModel
+
+# Enable logging for gensim - optional
+import logging
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
+
+import warnings
+warnings.filterwarnings("ignore",category=DeprecationWarning)
+
 # sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/plsa')
 sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/preprocessing')
 # sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/plsa')
@@ -100,6 +116,31 @@ def generate_topics_json(self):
         with open(self.lda_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f:
             f.write('Preprocessing finished. Topic analysis started.')
 
+        with open(pclean.output_dir+'cleaned.json', "r") as read_file:
+            ret = json.load(read_file)
+
+        data_lemmatized = []
+
+        for k in ret:
+            data_lemmatized.append(ret[k].splitlines())
+
+        # Create Dictionary
+        id2word = corpora.Dictionary(data_lemmatized)
+
+        # Create Corpus
+        texts = data_lemmatized
+
+        # Term Document Frequency
+        corpus = [id2word.doc2bow(text) for text in texts]
+
+        # View
+        # print(corpus[0:1])
+        # print(id2word[1])
+
+
+
+
+
 
 
 

From 9f6fdc540a7d19728a39e75e83c32e7db6bbc549 Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw <edyirdaw@gmail.com>
Date: Wed, 8 May 2019 13:08:17 +0300
Subject: [PATCH 06/11] Finished basic lda topic extraction

---
 lda-service/lda_wrapper.py | 61 ++++++++++++++++++++++++++++++++++----
 1 file changed, 56 insertions(+), 5 deletions(-)

diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py
index dcbcd5a..606c898 100644
--- a/lda-service/lda_wrapper.py
+++ b/lda-service/lda_wrapper.py
@@ -18,6 +18,7 @@
 import re
 import numpy as np
 import pandas as pd
+from pprint import pprint
 
 import gensim
 import gensim.corpora as corpora
@@ -38,6 +39,7 @@
 
 # import example_plsa as pplsa
 import cleansing as pclean
+import porter_dictionary
 
 class LDA_wrapper:
 
@@ -96,7 +98,10 @@ def write_to_json(self):
 
 
 
-    def generate_topics_json(self):
+    def generate_topics_gensim(self,num_topics, passes, chunksize,
+                               update_every=0, alpha='auto', eta='auto', decay=0.5, offset=1.0, eval_every=1,
+                               iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None,
+                               minimum_phi_value=0.01, per_word_topics=True, callbacks=None):
 
         start_time_1 = time.time()
 
@@ -137,16 +142,50 @@ def generate_topics_json(self):
         # print(corpus[0:1])
         # print(id2word[1])
 
+        self.lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
+                                                    id2word=id2word,
+                                                    num_topics=num_topics,
+                                                    random_state=random_state,
+                                                    update_every=update_every,
+                                                    chunksize=chunksize,
+                                                    passes=passes,
+                                                    alpha=alpha,
+                                                    eta=eta,
+                                                    per_word_topics=per_word_topics,
+                                                    decay=decay,
+                                                    offset=offset,
+                                                    eval_every=eval_every,
+                                                    iterations=iterations,
+                                                    gamma_threshold=gamma_threshold,
+                                                    minimum_probability=minimum_probability,
+                                                    minimum_phi_value=minimum_phi_value,
+                                                    callbacks=callbacks)
 
+        port_dict = porter_dictionary.porter_dictionary()
 
+        topics = self.lda_model.show_topics(num_topics=num_topics,num_words=300,formatted=False)
 
+        extracted_topics = []
 
+        for topic in topics:
+            a_topic = []
+            for item in topic[1]:
+                a_topic.append(item[0])
+            extracted_topics.append(a_topic)
 
+        port_dict.load_dict(self.dict_path + self.unique_folder_naming[:-1] + '_dict')
 
 
+        self.topics_destemmed = []
 
-
-
+        for i in extracted_topics:
+            destemmed = []
+            for j in i:
+                try:
+                    destemmed.append(port_dict.dictionary[j][0])
+                except:
+                    logging.exception('message')
+            self.topics_destemmed.append(destemmed)
 
 
 
@@ -158,7 +197,9 @@ def run_lda():
     docs = []
     s = LDA_wrapper(docs, local=True)
 
-    path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_2.json'
+    # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_2.json'
+    path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_singnet_all.json'
+    # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_hersheys_all.json'
 
     docs = []
 
@@ -177,7 +218,17 @@ def run_lda():
     s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
     os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/lda/lda-parameters/'+s.unique_folder_naming)
     s.write_to_json()
-    s.generate_topics_json()
+    # s.generate_topics_gensim(num_topics=3,passes=100,chunksize=200)
+    s.generate_topics_gensim(num_topics=2,passes=100,chunksize=200,random_state=2)
+
+    # pprint(s.lda_model.print_topics(3,50))
+    # topics = s.lda_model.show_topics(2,5,formatted=False)
+    # print(topics)
+    print_two_d(s.topics_destemmed)
+
+def print_two_d(two_d):
+    for i in two_d:
+        print(i)
 
 
 

From 5ace658fd3623534b54c9acf1d88eb0bd2a8f771 Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw <edyirdaw@gmail.com>
Date: Wed, 8 May 2019 20:21:00 +0300
Subject: [PATCH 07/11] experimental code comparing lda with plsa

---
 lda-service/lda_wrapper.py | 85 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 82 insertions(+), 3 deletions(-)

diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py
index 606c898..28bdc9e 100644
--- a/lda-service/lda_wrapper.py
+++ b/lda-service/lda_wrapper.py
@@ -198,8 +198,10 @@ def run_lda():
     s = LDA_wrapper(docs, local=True)
 
     # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_2.json'
-    path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_singnet_all.json'
+    # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_singnet_all.json'
+    # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_bio_all.json'
     # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_hersheys_all.json'
+    path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_hr_all.json'
 
     docs = []
 
@@ -218,14 +220,91 @@ def run_lda():
     s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
     os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/lda/lda-parameters/'+s.unique_folder_naming)
     s.write_to_json()
-    # s.generate_topics_gensim(num_topics=3,passes=100,chunksize=200)
-    s.generate_topics_gensim(num_topics=2,passes=100,chunksize=200,random_state=2)
+    s.generate_topics_gensim(num_topics=3,passes=22,chunksize=200)
+    # s.generate_topics_gensim(num_topics=2,passes=22,chunksize=200)
+    # s.generate_topics_gensim(num_topics=2,passes=100,chunksize=200,random_state=2)
+
 
     # pprint(s.lda_model.print_topics(3,50))
     # topics = s.lda_model.show_topics(2,5,formatted=False)
     # print(topics)
     print_two_d(s.topics_destemmed)
 
+
+    # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/singnet_all_plsa_topics_2.txt'
+    # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/hersheys_all_plsa_topics.txt'
+    # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/bio_all_plsa_topics.txt'
+    topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/hr_all_plsa_topics.txt'
+    with open(topics_snet_all_plsa_file,'r') as f:
+        temp_list = f.readlines()
+        topics_snet_all_plsa = []
+        for l in temp_list:
+            topics_snet_all_plsa.append(l.split(','))
+
+        for i in range(len(topics_snet_all_plsa)):
+            for j in range(len(topics_snet_all_plsa[0])):
+                topics_snet_all_plsa[i][j] = topics_snet_all_plsa[i][j].strip()
+
+    topics_snet_all_plsa_file_2 = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/misc/topics/hr_all_plsa_topics_2.txt'
+    with open(topics_snet_all_plsa_file_2, 'r') as f:
+        temp_list = f.readlines()
+        topics_snet_all_plsa_2 = []
+        for l in temp_list:
+            topics_snet_all_plsa_2.append(l.split(','))
+
+        for i in range(len(topics_snet_all_plsa_2)):
+            for j in range(len(topics_snet_all_plsa_2[0])):
+                topics_snet_all_plsa_2[i][j] = topics_snet_all_plsa_2[i][j].strip()
+
+
+
+    # two topics
+    # print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[0],depth=30))
+    # print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[1],depth=30))
+    # print('=========================')
+    # print(dot_product(topics_snet_all_plsa[1],s.topics_destemmed[0],depth=30))
+    # print(dot_product(topics_snet_all_plsa[1],s.topics_destemmed[1],depth=30))
+    # two topics
+
+    # three topics
+    print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[0],depth=30))
+    print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[1],depth=30))
+    print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[2],depth=30))
+    print('=========================')
+    print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[0], depth=30))
+    print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[1], depth=30))
+    print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[2], depth=30))
+    print('=========================')
+    print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[0], depth=30))
+    print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[1], depth=30))
+    print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[2], depth=30))
+    print('=========================')
+    # three topics
+
+    # plsa self
+    # print(dot_product(topics_snet_all_plsa[0], topics_snet_all_plsa_2[0], depth=30))
+    # print(dot_product(topics_snet_all_plsa[0], topics_snet_all_plsa_2[1], depth=30))
+    # print(dot_product(topics_snet_all_plsa[0], topics_snet_all_plsa_2[2], depth=30))
+    # print('=========================')
+    # print(dot_product(topics_snet_all_plsa[1], topics_snet_all_plsa_2[0], depth=30))
+    # print(dot_product(topics_snet_all_plsa[1], topics_snet_all_plsa_2[1], depth=30))
+    # print(dot_product(topics_snet_all_plsa[1], topics_snet_all_plsa_2[2], depth=30))
+    # print('=========================')
+    # print(dot_product(topics_snet_all_plsa[2], topics_snet_all_plsa_2[0], depth=30))
+    # print(dot_product(topics_snet_all_plsa[2], topics_snet_all_plsa_2[1], depth=30))
+    # print(dot_product(topics_snet_all_plsa[2], topics_snet_all_plsa_2[2], depth=30))
+    # print('=========================')
+    # plsa self
+
+
+def dot_product(list_1,list_2,depth=30):
+
+    count = 0
+    for i in list_1[0:depth]:
+        if i in list_2[0:depth]:
+            count = count + 1
+    return count
+
 def print_two_d(two_d):
     for i in two_d:
         print(i)

From c4ac8d50fc45cc5502d1aafd68abe33c8eb35972 Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw <edyirdaw@gmail.com>
Date: Mon, 19 Aug 2019 12:08:49 +0300
Subject: [PATCH 08/11] converted threading to mp to spawn plsa engine

---
 topic_analysis_grpc.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/topic_analysis_grpc.py b/topic_analysis_grpc.py
index fabff29..700c0d6 100644
--- a/topic_analysis_grpc.py
+++ b/topic_analysis_grpc.py
@@ -24,6 +24,7 @@
 
 import plsa_wrapper
 import threading
+import multiprocessing as mp
 
 from service_spec import topic_analysis_pb2
 from service_spec import topic_analysis_pb2_grpc
@@ -96,8 +97,10 @@ def PLSA(self,request,context):
 
             unique_folder_naming = str(datetime.datetime.now()).replace(':', '-').replace('.', '-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
 
-            thread1 = threading.Thread(target=generate_topics_plsa, args=(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta))
-            thread1.start()
+            # thread1 = threading.Thread(target=generate_topics_plsa, args=(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta))
+            p1 = mp.Process(target=generate_topics_plsa, args=(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta))
+
+            p1.start()
 
             resp = topic_analysis_pb2.PLSAResponse(status=True, message='success', handle=unique_folder_naming[:-1].replace('-','e').replace(' ','d').replace('^','y'))
 

From 43cd443a9588f0d42b6b8f9aa38282127d6fbafa Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw <edyirdaw@gmail.com>
Date: Tue, 20 Aug 2019 14:01:44 +0300
Subject: [PATCH 09/11] changed number of threads to 1

---
 topic_analysis_grpc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topic_analysis_grpc.py b/topic_analysis_grpc.py
index 700c0d6..83890b1 100644
--- a/topic_analysis_grpc.py
+++ b/topic_analysis_grpc.py
@@ -160,7 +160,7 @@ def generate_topics_plsa(docs,unique_folder_naming,num_topics,topic_divider,maxi
 
 def serve():
 
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
     topic_analysis_pb2_grpc.add_TopicAnalysisServicer_to_server(TopicAnalysis(), server)
     print('Starting server. Listening on port 5000.')
     server.add_insecure_port('127.0.0.1:5000')

From 823ad744964a6401cabc7220349643123d8c125a Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw <edyirdaw@gmail.com>
Date: Tue, 20 Aug 2019 14:02:53 +0300
Subject: [PATCH 10/11] bumped up snet-daemon version in Dockerfile

---
 Docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Docker/Dockerfile b/Docker/Dockerfile
index 17075cd..4cd1ced 100644
--- a/Docker/Dockerfile
+++ b/Docker/Dockerfile
@@ -80,7 +80,7 @@ RUN mkdir /home/top/pay/ \
 
 
 
-ENV DAEMON_VERSION 0.1.7
+ENV DAEMON_VERSION 1.0.0
 
 RUN wget -O snet-daemon.tar.gz https://github.com/singnet/snet-daemon/releases/download/v$DAEMON_VERSION/snet-daemon-v$DAEMON_VERSION-linux-amd64.tar.gz && \
     tar  --strip-components=1 -xzf snet-daemon.tar.gz && \

From e38372c9ff06116a01361a21e806edc7fc74de25 Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw <edyirdaw@gmail.com>
Date: Wed, 21 Aug 2019 10:24:20 +0300
Subject: [PATCH 11/11] remaining todo for lda

---
 lda-service/lda_wrapper.py | 82 ++++++++++++++++++++++----------------
 1 file changed, 48 insertions(+), 34 deletions(-)

diff --git a/lda-service/lda_wrapper.py b/lda-service/lda_wrapper.py
index 28bdc9e..4c44ad6 100644
--- a/lda-service/lda_wrapper.py
+++ b/lda-service/lda_wrapper.py
@@ -187,6 +187,17 @@ def generate_topics_gensim(self,num_topics, passes, chunksize,
                     logging.exception('message')
             self.topics_destemmed.append(destemmed)
 
+        '''
+        TODO: the remaining code should extract the parameters produced by the resulting LDA model, such as the weights.
+        The proto formats for all of the returned parameters also still need to be defined.
+        
+        Also add code that writes the final status, showing the total running time that elapsed.
+        
+        In general, compare against the outputs of PLSA and, as far as possible, return the equivalent results for LDA.
+        '''
+
+
+
 
 
 
@@ -201,7 +212,8 @@ def run_lda():
     # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_singnet_all.json'
     # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_bio_all.json'
     # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_hersheys_all.json'
-    path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_hr_all.json'
+    # path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_hr_all.json'
+    path = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/extracted_all.json'
 
     docs = []
 
@@ -220,7 +232,8 @@ def run_lda():
     s.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
     os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/lda/lda-parameters/'+s.unique_folder_naming)
     s.write_to_json()
-    s.generate_topics_gensim(num_topics=3,passes=22,chunksize=200)
+    # s.generate_topics_gensim(num_topics=3,passes=22,chunksize=200)
+    s.generate_topics_gensim(num_topics=70,passes=22,chunksize=20000)
     # s.generate_topics_gensim(num_topics=2,passes=22,chunksize=200)
     # s.generate_topics_gensim(num_topics=2,passes=100,chunksize=200,random_state=2)
 
@@ -234,27 +247,28 @@ def run_lda():
     # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/singnet_all_plsa_topics_2.txt'
     # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/hersheys_all_plsa_topics.txt'
     # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/bio_all_plsa_topics.txt'
-    topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/hr_all_plsa_topics.txt'
-    with open(topics_snet_all_plsa_file,'r') as f:
-        temp_list = f.readlines()
-        topics_snet_all_plsa = []
-        for l in temp_list:
-            topics_snet_all_plsa.append(l.split(','))
-
-        for i in range(len(topics_snet_all_plsa)):
-            for j in range(len(topics_snet_all_plsa[0])):
-                topics_snet_all_plsa[i][j] = topics_snet_all_plsa[i][j].strip()
-
-    topics_snet_all_plsa_file_2 = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/misc/topics/hr_all_plsa_topics_2.txt'
-    with open(topics_snet_all_plsa_file_2, 'r') as f:
-        temp_list = f.readlines()
-        topics_snet_all_plsa_2 = []
-        for l in temp_list:
-            topics_snet_all_plsa_2.append(l.split(','))
 
-        for i in range(len(topics_snet_all_plsa_2)):
-            for j in range(len(topics_snet_all_plsa_2[0])):
-                topics_snet_all_plsa_2[i][j] = topics_snet_all_plsa_2[i][j].strip()
+    # topics_snet_all_plsa_file = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/misc/topics/hr_all_plsa_topics.txt'
+    # with open(topics_snet_all_plsa_file,'r') as f:
+    #     temp_list = f.readlines()
+    #     topics_snet_all_plsa = []
+    #     for l in temp_list:
+    #         topics_snet_all_plsa.append(l.split(','))
+    #
+    #     for i in range(len(topics_snet_all_plsa)):
+    #         for j in range(len(topics_snet_all_plsa[0])):
+    #             topics_snet_all_plsa[i][j] = topics_snet_all_plsa[i][j].strip()
+    #
+    # topics_snet_all_plsa_file_2 = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/misc/topics/hr_all_plsa_topics_2.txt'
+    # with open(topics_snet_all_plsa_file_2, 'r') as f:
+    #     temp_list = f.readlines()
+    #     topics_snet_all_plsa_2 = []
+    #     for l in temp_list:
+    #         topics_snet_all_plsa_2.append(l.split(','))
+    #
+    #     for i in range(len(topics_snet_all_plsa_2)):
+    #         for j in range(len(topics_snet_all_plsa_2[0])):
+    #             topics_snet_all_plsa_2[i][j] = topics_snet_all_plsa_2[i][j].strip()
 
 
 
@@ -267,18 +281,18 @@ def run_lda():
     # two topics
 
     # three topics
-    print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[0],depth=30))
-    print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[1],depth=30))
-    print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[2],depth=30))
-    print('=========================')
-    print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[0], depth=30))
-    print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[1], depth=30))
-    print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[2], depth=30))
-    print('=========================')
-    print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[0], depth=30))
-    print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[1], depth=30))
-    print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[2], depth=30))
-    print('=========================')
+    # print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[0],depth=30))
+    # print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[1],depth=30))
+    # print(dot_product(topics_snet_all_plsa[0],s.topics_destemmed[2],depth=30))
+    # print('=========================')
+    # print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[0], depth=30))
+    # print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[1], depth=30))
+    # print(dot_product(topics_snet_all_plsa[1], s.topics_destemmed[2], depth=30))
+    # print('=========================')
+    # print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[0], depth=30))
+    # print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[1], depth=30))
+    # print(dot_product(topics_snet_all_plsa[2], s.topics_destemmed[2], depth=30))
+    # print('=========================')
     # three topics
 
     # plsa self