From 600497ae8cdad3231b1875ff7196ea47fda0f04c Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw <edyirdaw@gmail.com>
Date: Tue, 5 Feb 2019 18:34:52 +0300
Subject: [PATCH 1/2] make number of topics a parameter

---
 plsa/plsa/example_plsa.py   | 13 ++++++++++---
 plsa/plsa/tester_python3.py |  2 ++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/plsa/plsa/example_plsa.py b/plsa/plsa/example_plsa.py
index 8bc0435..657457d 100644
--- a/plsa/plsa/example_plsa.py
+++ b/plsa/plsa/example_plsa.py
@@ -113,10 +113,13 @@ def feat(folder):
     print ("term-document matrix size", td.shape)
     print(td.shape[0],'terms by',td.shape[1],'docs')
     print("size of term-document matrix in bytes according to sys.getsizeof =",sys.getsizeof(td))
-    # num_topics = int(td.shape[1] / topic_divider)
+    if topic_divider == 0:
+        pass
+    else:
+        num_topics = int(td.shape[1] / topic_divider)
     # num_topics = 30
     # num_topics = 7
-    num_topics = 2
+    # num_topics = 2
     if (num_topics < 2):
         num_topics = 2
     # matrix_to_file(td)
@@ -127,7 +130,11 @@ def feat(folder):
     return td, vocab
 
 def K(D):
-    num_topics = int(D/topic_divider)
+    global num_topics
+    if topic_divider == 0:
+        pass
+    else:
+        num_topics = int(D/topic_divider)
     if (num_topics < 2):
         num_topics = 2
     return num_topics
diff --git a/plsa/plsa/tester_python3.py b/plsa/plsa/tester_python3.py
index 2f6eefa..1704169 100644
--- a/plsa/plsa/tester_python3.py
+++ b/plsa/plsa/tester_python3.py
@@ -102,6 +102,8 @@ def generate_topics(self):
         pclean.pre_pro()
 
         # Train using PLSA
+        pplsa.topic_divider = 0
+        pplsa.num_topics = 2
         pplsa.folder = pclean.output_dir[:-1]
         pplsa.dict_path = pclean.file_dict
         pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming

From 2b79e81fe929b3d1c2458a27000a75ae34163493 Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw <edyirdaw@gmail.com>
Date: Wed, 6 Feb 2019 16:22:29 +0300
Subject: [PATCH 2/2] finished using json instead of using many files for plsa

---
 plsa/plsa/example_plsa.py        | 48 ++++++++---------
 plsa/plsa/tester_python3.py      | 93 +++++++++++++++++++++++++++++++-
 plsa/plsa/tfidf/preprocessing.py | 25 +++++++++
 plsa/preprocessing/cleansing.py  | 31 ++++++++---
 4 files changed, 164 insertions(+), 33 deletions(-)

diff --git a/plsa/plsa/example_plsa.py b/plsa/plsa/example_plsa.py
index 657457d..6454cf2 100644
--- a/plsa/plsa/example_plsa.py
+++ b/plsa/plsa/example_plsa.py
@@ -11,7 +11,7 @@
 
 # import taskmanager as tm
 import pandas as pd
-from tfidf.preprocessing import read_files, preprocess_documents
+from tfidf.preprocessing import read_files, preprocess_documents, read_json, json_files_list
 # from tfidf.porter import PorterStemmer
 from tfidf.tfidf import *
 # from tfidf.preprocessing import file_list, empty_file_list
@@ -67,21 +67,24 @@
 
 def feat(folder):
     global num_topics
-    docs = list(preprocess_documents(read_files(os.path.join(folder, "*.txt"))))
+    # docs = list(preprocess_documents(read_files(os.path.join(folder, "*.txt"))))
+    docs = list(preprocess_documents(read_json(folder+"/cleaned.json")))
     assert(len(docs) > 0)
     print("len(docs) =",len(docs))
-    docs_2 = list(docs)
-    docs_reduced = reduce_docs(docs)
-
-
-    if docs_reduced.__len__() != docs_2.__len__():
-
-        list_1 = docs_to_delete(docs=docs_2, docs_red=docs_reduced)
-        delete_docs(list_1)
-
-        docs = preprocess_documents(read_files(os.path.join(folder, "*.txt")))
-        assert(len(docs) > 0)
-        print("len(docs) =",len(docs))
+    # Uncomment this later and adapt it to the new JSON-based input
+    # docs_2 = list(docs)
+    # docs_reduced = reduce_docs(docs)
+    #
+    #
+    # if docs_reduced.__len__() != docs_2.__len__():
+    #
+    #     list_1 = docs_to_delete(docs=docs_2, docs_red=docs_reduced)
+    #     delete_docs(list_1)
+    #
+    #     docs = preprocess_documents(read_files(os.path.join(folder, "*.txt")))
+    #     assert(len(docs) > 0)
+    #     print("len(docs) =",len(docs))
+    # Uncomment ends here
 
     # num_topics = int(len(docs) / topic_divider)
     # if(num_topics < 2):
@@ -90,9 +93,12 @@ def feat(folder):
     #docs = stemmer.stem_documents(docs)
     td_dict, vocab = tc(docs)
 
+    print("'''''''''''''''''''''''''''''''")
+    # print(td_dict)
+
     for doc in range(len(docs)):
         if docs[doc] == '':
-            print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Empty doc detected with id:',doc)
+            print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Empty doc detected with id:',doc,' and file name is:',json_files_list[doc])
             empty_docs_list.append(doc)
 
     print ('len(td_dict) =', len(td_dict))
@@ -309,15 +315,9 @@ def train(data, maxiter=500, debug=True):
     # Bug update over
 
 
-    file_list = []
-    pattern_1 = os.path.join(folder, "*.txt")
-
-    f = glob.glob(pattern_1)
-
-    for f_i in f:
-        # print(f_i)
-        file_i=str(f_i).split('/')[file_parts_number]
-        file_list.append(file_i)
+    file_list = json_files_list
+    print('"""""""""""""""""""""""""""""')
+    # print(file_list)
 
     print('>>>>>>> In method train:', empty_docs_list)
     for edl in empty_docs_list:
diff --git a/plsa/plsa/tester_python3.py b/plsa/plsa/tester_python3.py
index 1704169..0138177 100644
--- a/plsa/plsa/tester_python3.py
+++ b/plsa/plsa/tester_python3.py
@@ -11,6 +11,7 @@
 import random
 import datetime
 import time
+import json
 
 sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/plsa')
 sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/preprocessing')
@@ -85,6 +86,89 @@ def write_to_files_slack(self):
             # idx = idx + 1
 
 
+
+    def write_to_json(self):
+
+        self.read_csv()
+
+
+        self.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + self.channel + '/'
+        print(self.unique_folder_naming)
+
+        os.mkdir(self.extracted_folder+self.unique_folder_naming)
+
+        # idx = 0
+        contents_dict = {}
+
+        file = self.extracted_folder + self.unique_folder_naming + 'extracted' + '.json'
+
+        for row in self.messages:
+            if row['subtype'] == 'chat':
+                if self.channel == '':
+                    contents_dict[row['id']]=row['text']
+                    # with open(file, 'w') as f:
+                    #     f.write(row['text'])
+                else:
+                    if row['channel_name'] == self.channel:
+                        contents_dict[row['id']] = row['text']
+                        # with open(file, 'w') as f:
+                        #     f.write(row['text'])
+                    else:
+                        continue
+
+            # idx = idx + 1
+
+        with open(file, "w") as f:
+            json.dump(contents_dict, f, indent=4)
+
+        print("len(contents_dict):",len(contents_dict))
+
+
+
+    def generate_topics_json(self):
+
+        start_time_1 = time.time()
+
+        pplsa.file_parts_number=10
+        pclean.file_parts_number = 10
+        pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] +'_dict'
+        pclean.source_texts = self.source_texts + self.unique_folder_naming + 'extracted.json'
+        pclean.output_dir = self.output_dir + self.unique_folder_naming
+
+        os.mkdir(pclean.output_dir)
+
+
+        # Do cleansing on the data and turning it into a bag-of-words model
+        pclean.pre_pro()
+
+        # Train using PLSA
+        pplsa.topic_divider = 0
+        pplsa.num_topics = 2
+        pplsa.folder = pclean.output_dir[:-1]
+        pplsa.dict_path = pclean.file_dict
+        pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming
+        pplsa.PATH = pplsa.PLSA_PARAMETERS_PATH + 'topic-by-doc-matirx'
+        pplsa.PATH_word_by_topic_conditional = pplsa.PLSA_PARAMETERS_PATH + 'word_by_topic_conditional'
+        pplsa.logL_pic = pplsa.PLSA_PARAMETERS_PATH + 'logL.png'
+
+        # Folder paths to delete
+        self.PLSA_PARAMETERS_PATH = pplsa.PLSA_PARAMETERS_PATH
+        self.output_dir_stream = pclean.output_dir
+        self.file_dict_stream = pclean.file_dict
+
+
+
+        os.mkdir(pplsa.PLSA_PARAMETERS_PATH)
+
+        pplsa.main()
+
+        end_time_1 = time.time()
+
+        print('Total training time took:',round((end_time_1 - start_time_1) / 60, 4))
+
+
+
+
     def generate_topics(self):
 
         start_time_1 = time.time()
@@ -131,6 +215,12 @@ def generate_topics(self):
 
 
 
+def run_plsa_slack_json():
+    path_1 = str(pathlib.Path(os.path.abspath('')).parents[2]) + '/appData/misc/slack_messages.csv'
+    print(path_1)
+    s = TopicAnalysis(path_1,'singnet')
+    s.write_to_json()
+    s.generate_topics_json()
 
 
 
@@ -200,7 +290,8 @@ def isEnglish(s):
 
 if __name__ == '__main__':
 
-    run_plsa_slack()
+    run_plsa_slack_json()
+    # run_plsa_slack()
     # run_1()
     # test_preprocessing()
     #
diff --git a/plsa/plsa/tfidf/preprocessing.py b/plsa/plsa/tfidf/preprocessing.py
index 61d687a..ae05242 100644
--- a/plsa/plsa/tfidf/preprocessing.py
+++ b/plsa/plsa/tfidf/preprocessing.py
@@ -1,11 +1,14 @@
 import re
 import string
 import glob
+import json
 
 # file_list = []
 # file_parts_number = 9
 # file_parts_number = 8
 
+json_files_list = []
+
 def strip_punctuation(s):
     return re.sub("([%s]+)" % string.punctuation, " ", s)
 
@@ -94,6 +97,28 @@ def read_files(pattern):
     # file_list = []
     return map(read_file, glob.glob(pattern))
 
+def read_json(path):
+    global json_files_list
+    with open(path, "r") as read_file:
+        ret = json.load(read_file)
+
+    for k in ret:
+        json_files_list.append(k)
+
+    print("||||||||||||||||||||||||||||||||")
+    # print(ret)
+
+    docs = []
+
+    for k in ret:
+        docs.append(ret[k])
+
+    # return docs
+    return map(mapper,docs)
+
+def mapper(s):
+    return s
+
 # def empty_file_list():
 #     global file_list
 #     file_list = []
\ No newline at end of file
diff --git a/plsa/preprocessing/cleansing.py b/plsa/preprocessing/cleansing.py
index ecf2d9d..d13ed72 100644
--- a/plsa/preprocessing/cleansing.py
+++ b/plsa/preprocessing/cleansing.py
@@ -6,6 +6,7 @@
 import sys
 import pathlib
 import os
+import json
 
 sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa')
 sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/preprocessing')
@@ -176,14 +177,19 @@ def return_cleaned(t1):
 
 
 def pre_pro():
-    fileList = glob.glob(source_texts)
-    fileList_len = fileList.__len__() - 1
+    # fileList = glob.glob(source_texts)
+    # fileList_len = fileList.__len__() - 1
+
+    cleaned_dict = {}
+
+    with open(source_texts, "r") as read_file:
+        fileList = json.load(read_file)
     k = 0
     print('------pre-process started-------')
     for files in fileList:
-        tFile = open(files)
+        tFile = fileList[files]
         # tFile = codecs.open(files, 'r', 'utf-8')
-        line = tFile.read().lower()
+        line = tFile.lower()
         # print(line)
         # print(type(line))
         # line = line.decode('utf-8')
@@ -268,14 +274,23 @@ def pre_pro():
 
                 port_dict.add_element(stemmed=term1,nonstemmed=temp_term1)
 
-        file_txt = open(output_dir + path.PurePath(files).parts[file_parts_number], "w")
+        # file_txt = open(output_dir + path.PurePath(files).parts[file_parts_number], "w")
+
+        temp = ''
 
         for i in por_tokens:
-            file_txt.write(i)
-            file_txt.write('\n')
-        file_txt.close()
+            temp = temp + i + '\n'
+
+        cleaned_dict[files] = temp
+
         # print('Processed ',k,'of',fileList_len)
         k = k + 1
+
+    file_json = output_dir + 'cleaned.json'
+
+    with open(file_json, "w") as f:
+        json.dump(cleaned_dict, f, indent=4)
+
     port_dict.write_dict_to_file(file_dict)
     print('***------pre-process finished--------')