From 8eabe30f80decb4c42fb39c28baf4ca947dc351a Mon Sep 17 00:00:00 2001
From: Eyob Yirdaw
Date: Tue, 5 Feb 2019 18:07:51 +0300
Subject: [PATCH] fixed empty doc bug

---
 plsa/plsa/example_plsa.py   | 16 +++++++++++++++-
 plsa/plsa/plsa.py           |  8 ++++----
 plsa/plsa/tester_python3.py |  6 +++---
 plsa/plsa/tfidf/tfidf.py    |  1 +
 4 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/plsa/plsa/example_plsa.py b/plsa/plsa/example_plsa.py
index 511ce4b..8bc0435 100644
--- a/plsa/plsa/example_plsa.py
+++ b/plsa/plsa/example_plsa.py
@@ -21,6 +21,7 @@
 #
 s_file_list = []
+empty_docs_list = []
 file_parts_number = 8
 # file_parts_number = 7 # Inspire
 
@@ -88,6 +89,12 @@ def feat(folder):
     #stemmer = PorterStemmer()
     #docs = stemmer.stem_documents(docs)
     td_dict, vocab = tc(docs)
+
+    for doc in range(len(docs)):
+        if docs[doc] == '':
+            print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Empty doc detected with id:',doc)
+            empty_docs_list.append(doc)
+
     print ('len(td_dict) =', len(td_dict))
     print ('len(vocab) =',len(vocab))
     global number_of_words
@@ -305,6 +312,10 @@ def train(data, maxiter=500, debug=True):
         file_i=str(f_i).split('/')[file_parts_number]
         file_list.append(file_i)
 
+    print('>>>>>>> In method train:', empty_docs_list)
+    for edl in empty_docs_list:
+        # print(file_list[edl])
+        del file_list[edl]
     print('Dimenstionssssssssssssssssss')
     print("topic_list_len =",topic_list.__len__())
 
@@ -313,7 +324,6 @@ def train(data, maxiter=500, debug=True):
     print("p_z_d[0] =", p_z_d[0].__len__())
 
 
-
     topic_by_doc = open(PATH+'.csv', "w")
     for i in range(file_list.__len__()):
         topic_by_doc.write(',')
@@ -328,6 +338,10 @@ def train(data, maxiter=500, debug=True):
         topic_by_doc.write('\n')
     topic_by_doc.close()
 
+    print('////////////////////////////')
+    print(p_z_d.__len__())
+    print(p_z_d[0].__len__())
+
 
 
     word_by_topic_conditional = open(PATH_word_by_topic_conditional+'.csv', "w")
diff --git a/plsa/plsa/plsa.py b/plsa/plsa/plsa.py
index 05e3c22..cdea6b2 100644
--- a/plsa/plsa/plsa.py
+++ b/plsa/plsa/plsa.py
@@ -346,10 +346,10 @@ def topic_document(self):
         Retrun: P(z,d)
         '''
         self.p_z_d= self.p_z*self.p_d_z
-        # print 'p_z_d-----'
-        # print (self.p_z_d.shape)
-        # print 'p_z_d.T-----'
-        # print (self.p_z_d.T.shape)
+        # print('p_z_d-----')
+        # print(self.p_z_d.shape)
+        # print('p_z_d.T-----')
+        # print(self.p_z_d.T.shape)
         return self.p_z_d.T # T is for transpose
 
diff --git a/plsa/plsa/tester_python3.py b/plsa/plsa/tester_python3.py
index 0915eb0..2f6eefa 100644
--- a/plsa/plsa/tester_python3.py
+++ b/plsa/plsa/tester_python3.py
@@ -67,11 +67,11 @@ def write_to_files_slack(self):
         os.mkdir(self.extracted_folder+self.unique_folder_naming)
 
-        idx = 0
+        # idx = 0
         for row in self.messages:
             if row['subtype'] == 'chat':
-                file = self.extracted_folder+self.unique_folder_naming+str(idx+2)+'.txt'
+                file = self.extracted_folder+self.unique_folder_naming+row['id']+'.txt'
                 if self.channel == '':
                     with open(file, 'w') as f:
                         f.write(row['text'])
@@ -82,7 +82,7 @@ def write_to_files_slack(self):
             else:
                 continue
 
-            idx = idx + 1
+            # idx = idx + 1
 
 
 def generate_topics(self):
diff --git a/plsa/plsa/tfidf/tfidf.py b/plsa/plsa/tfidf/tfidf.py
index 3609441..ac59c51 100644
--- a/plsa/plsa/tfidf/tfidf.py
+++ b/plsa/plsa/tfidf/tfidf.py
@@ -16,6 +16,7 @@ def tc(dataset, tokenizer=tokenize):
     for doc in dataset:
         if doc == '':
             continue
+        # print(doc)
         d = {} # token => count
         for term in tokenizer(doc):