From 600497ae8cdad3231b1875ff7196ea47fda0f04c Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Tue, 5 Feb 2019 18:34:52 +0300 Subject: [PATCH 1/2] make number of topics a parameter --- plsa/plsa/example_plsa.py | 13 ++++++++++--- plsa/plsa/tester_python3.py | 2 ++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/plsa/plsa/example_plsa.py b/plsa/plsa/example_plsa.py index 8bc0435..657457d 100644 --- a/plsa/plsa/example_plsa.py +++ b/plsa/plsa/example_plsa.py @@ -113,10 +113,13 @@ def feat(folder): print ("term-document matrix size", td.shape) print(td.shape[0],'terms by',td.shape[1],'docs') print("size of term-document matrix in bytes according to sys.getsizeof =",sys.getsizeof(td)) - # num_topics = int(td.shape[1] / topic_divider) + if topic_divider == 0: + pass + else: + num_topics = int(td.shape[1] / topic_divider) # num_topics = 30 # num_topics = 7 - num_topics = 2 + # num_topics = 2 if (num_topics < 2): num_topics = 2 # matrix_to_file(td) @@ -127,7 +130,11 @@ def feat(folder): return td, vocab def K(D): - num_topics = int(D/topic_divider) + global num_topics + if topic_divider == 0: + pass + else: + num_topics = int(D/topic_divider) if (num_topics < 2): num_topics = 2 return num_topics diff --git a/plsa/plsa/tester_python3.py b/plsa/plsa/tester_python3.py index 2f6eefa..1704169 100644 --- a/plsa/plsa/tester_python3.py +++ b/plsa/plsa/tester_python3.py @@ -102,6 +102,8 @@ def generate_topics(self): pclean.pre_pro() # Train using PLSA + pplsa.topic_divider = 0 + pplsa.num_topics = 2 pplsa.folder = pclean.output_dir[:-1] pplsa.dict_path = pclean.file_dict pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming From 2b79e81fe929b3d1c2458a27000a75ae34163493 Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Wed, 6 Feb 2019 16:22:29 +0300 Subject: [PATCH 2/2] finished using json instead of using many files for plsa --- plsa/plsa/example_plsa.py | 48 ++++++++--------- plsa/plsa/tester_python3.py | 93 
+++++++++++++++++++++++++++++++- plsa/plsa/tfidf/preprocessing.py | 25 +++++++++ plsa/preprocessing/cleansing.py | 31 ++++++++--- 4 files changed, 164 insertions(+), 33 deletions(-) diff --git a/plsa/plsa/example_plsa.py b/plsa/plsa/example_plsa.py index 657457d..6454cf2 100644 --- a/plsa/plsa/example_plsa.py +++ b/plsa/plsa/example_plsa.py @@ -11,7 +11,7 @@ # import taskmanager as tm import pandas as pd -from tfidf.preprocessing import read_files, preprocess_documents +from tfidf.preprocessing import read_files, preprocess_documents, read_json, json_files_list # from tfidf.porter import PorterStemmer from tfidf.tfidf import * # from tfidf.preprocessing import file_list, empty_file_list @@ -67,21 +67,24 @@ def feat(folder): global num_topics - docs = list(preprocess_documents(read_files(os.path.join(folder, "*.txt")))) + # docs = list(preprocess_documents(read_files(os.path.join(folder, "*.txt")))) + docs = list(preprocess_documents(read_json(folder+"/cleaned.json"))) assert(len(docs) > 0) print("len(docs) =",len(docs)) - docs_2 = list(docs) - docs_reduced = reduce_docs(docs) - - - if docs_reduced.__len__() != docs_2.__len__(): - - list_1 = docs_to_delete(docs=docs_2, docs_red=docs_reduced) - delete_docs(list_1) - - docs = preprocess_documents(read_files(os.path.join(folder, "*.txt"))) - assert(len(docs) > 0) - print("len(docs) =",len(docs)) + # Uncomment this later and fix it with the new json theme + # docs_2 = list(docs) + # docs_reduced = reduce_docs(docs) + # + # + # if docs_reduced.__len__() != docs_2.__len__(): + # + # list_1 = docs_to_delete(docs=docs_2, docs_red=docs_reduced) + # delete_docs(list_1) + # + # docs = preprocess_documents(read_files(os.path.join(folder, "*.txt"))) + # assert(len(docs) > 0) + # print("len(docs) =",len(docs)) + # Uncomment ends here # num_topics = int(len(docs) / topic_divider) # if(num_topics < 2): @@ -90,9 +93,12 @@ def feat(folder): #docs = stemmer.stem_documents(docs) td_dict, vocab = tc(docs) + 
print("'''''''''''''''''''''''''''''''") + # print(td_dict) + for doc in range(len(docs)): if docs[doc] == '': - print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Empty doc detected with id:',doc) + print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Empty doc detected with id:',doc,' and file name is:',json_files_list[doc]) empty_docs_list.append(doc) print ('len(td_dict) =', len(td_dict)) @@ -309,15 +315,9 @@ def train(data, maxiter=500, debug=True): # Bug update over - file_list = [] - pattern_1 = os.path.join(folder, "*.txt") - - f = glob.glob(pattern_1) - - for f_i in f: - # print(f_i) - file_i=str(f_i).split('/')[file_parts_number] - file_list.append(file_i) + file_list = json_files_list + print('"""""""""""""""""""""""""""""') + # print(file_list) print('>>>>>>> In method train:', empty_docs_list) for edl in empty_docs_list: diff --git a/plsa/plsa/tester_python3.py b/plsa/plsa/tester_python3.py index 1704169..0138177 100644 --- a/plsa/plsa/tester_python3.py +++ b/plsa/plsa/tester_python3.py @@ -11,6 +11,7 @@ import random import datetime import time +import json sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/plsa') sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/preprocessing') @@ -85,6 +86,89 @@ def write_to_files_slack(self): # idx = idx + 1 + + def write_to_json(self): + + self.read_csv() + + + self.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + self.channel + '/' + print(self.unique_folder_naming) + + os.mkdir(self.extracted_folder+self.unique_folder_naming) + + # idx = 0 + contents_dict = {} + + file = self.extracted_folder + self.unique_folder_naming + 'extracted' + '.json' + + for row in self.messages: + if row['subtype'] == 'chat': + if self.channel == '': + contents_dict[row['id']]=row['text'] + # with open(file, 'w') as f: + # f.write(row['text']) + else: + if row['channel_name'] == self.channel: + contents_dict[row['id']] = 
row['text'] + # with open(file, 'w') as f: + # f.write(row['text']) + else: + continue + + # idx = idx + 1 + + with open(file, "w") as f: + json.dump(contents_dict, f, indent=4) + + print("len(contents_dict):",len(contents_dict)) + + + + def generate_topics_json(self): + + start_time_1 = time.time() + + pplsa.file_parts_number=10 + pclean.file_parts_number = 10 + pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] +'_dict' + pclean.source_texts = self.source_texts + self.unique_folder_naming + 'extracted.json' + pclean.output_dir = self.output_dir + self.unique_folder_naming + + os.mkdir(pclean.output_dir) + + + # Do cleansing on the data and turing it to bad-of-words model + pclean.pre_pro() + + # Train using PLSA + pplsa.topic_divider = 0 + pplsa.num_topics = 2 + pplsa.folder = pclean.output_dir[:-1] + pplsa.dict_path = pclean.file_dict + pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming + pplsa.PATH = pplsa.PLSA_PARAMETERS_PATH + 'topic-by-doc-matirx' + pplsa.PATH_word_by_topic_conditional = pplsa.PLSA_PARAMETERS_PATH + 'word_by_topic_conditional' + pplsa.logL_pic = pplsa.PLSA_PARAMETERS_PATH + 'logL.png' + + # Folder paths to delete + self.PLSA_PARAMETERS_PATH = pplsa.PLSA_PARAMETERS_PATH + self.output_dir_stream = pclean.output_dir + self.file_dict_stream = pclean.file_dict + + + + os.mkdir(pplsa.PLSA_PARAMETERS_PATH) + + pplsa.main() + + end_time_1 = time.time() + + print('Total training time took:',round((end_time_1 - start_time_1) / 60, 4)) + + + + def generate_topics(self): start_time_1 = time.time() @@ -131,6 +215,12 @@ def generate_topics(self): +def run_plsa_slack_json(): + path_1 = str(pathlib.Path(os.path.abspath('')).parents[2]) + '/appData/misc/slack_messages.csv' + print(path_1) + s = TopicAnalysis(path_1,'singnet') + s.write_to_json() + s.generate_topics_json() @@ -200,7 +290,8 @@ def isEnglish(s): if __name__ == '__main__': - run_plsa_slack() + run_plsa_slack_json() + # run_plsa_slack() # run_1() # 
test_preprocessing() # diff --git a/plsa/plsa/tfidf/preprocessing.py b/plsa/plsa/tfidf/preprocessing.py index 61d687a..ae05242 100644 --- a/plsa/plsa/tfidf/preprocessing.py +++ b/plsa/plsa/tfidf/preprocessing.py @@ -1,11 +1,14 @@ import re import string import glob +import json # file_list = [] # file_parts_number = 9 # file_parts_number = 8 +json_files_list = [] + def strip_punctuation(s): return re.sub("([%s]+)" % string.punctuation, " ", s) @@ -94,6 +97,28 @@ def read_files(pattern): # file_list = [] return map(read_file, glob.glob(pattern)) +def read_json(path): + global json_files_list + with open(path, "r") as read_file: + ret = json.load(read_file) + + for k in ret: + json_files_list.append(k) + + print("||||||||||||||||||||||||||||||||") + # print(ret) + + docs = [] + + for k in ret: + docs.append(ret[k]) + + # return docs + return map(mapper,docs) + +def mapper(s): + return s + # def empty_file_list(): # global file_list # file_list = [] \ No newline at end of file diff --git a/plsa/preprocessing/cleansing.py b/plsa/preprocessing/cleansing.py index ecf2d9d..d13ed72 100644 --- a/plsa/preprocessing/cleansing.py +++ b/plsa/preprocessing/cleansing.py @@ -6,6 +6,7 @@ import sys import pathlib import os +import json sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa') sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/preprocessing') @@ -176,14 +177,19 @@ def return_cleaned(t1): def pre_pro(): - fileList = glob.glob(source_texts) - fileList_len = fileList.__len__() - 1 + # fileList = glob.glob(source_texts) + # fileList_len = fileList.__len__() - 1 + + cleaned_dict = {} + + with open(source_texts, "r") as read_file: + fileList = json.load(read_file) k = 0 print('------pre-process started-------') for files in fileList: - tFile = open(files) + tFile = fileList[files] # tFile = codecs.open(files, 'r', 'utf-8') - line = tFile.read().lower() + line = tFile.lower() # print(line) # print(type(line)) # line = 
line.decode('utf-8') @@ -268,14 +274,23 @@ def pre_pro(): port_dict.add_element(stemmed=term1,nonstemmed=temp_term1) - file_txt = open(output_dir + path.PurePath(files).parts[file_parts_number], "w") + # file_txt = open(output_dir + path.PurePath(files).parts[file_parts_number], "w") + + temp = '' for i in por_tokens: - file_txt.write(i) - file_txt.write('\n') - file_txt.close() + temp = temp + i + '\n' + + cleaned_dict[files] = temp + # print('Processed ',k,'of',fileList_len) k = k + 1 + + file_json = output_dir + 'cleaned.json' + + with open(file_json, "w") as f: + json.dump(cleaned_dict, f, indent=4) + port_dict.write_dict_to_file(file_dict) print('***------pre-process finished--------')