Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion plsa/plsa/example_plsa.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

# s_file_list = []

empty_docs_list = []

file_parts_number = 8
# file_parts_number = 7 # Inspire
Expand Down Expand Up @@ -88,6 +89,12 @@ def feat(folder):
#stemmer = PorterStemmer()
#docs = stemmer.stem_documents(docs)
td_dict, vocab = tc(docs)

for doc in range(len(docs)):
if docs[doc] == '':
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Empty doc detected with id:',doc)
empty_docs_list.append(doc)

print ('len(td_dict) =', len(td_dict))
print ('len(vocab) =',len(vocab))
global number_of_words
Expand Down Expand Up @@ -305,6 +312,10 @@ def train(data, maxiter=500, debug=True):
file_i=str(f_i).split('/')[file_parts_number]
file_list.append(file_i)

print('>>>>>>> In method train:', empty_docs_list)
for edl in empty_docs_list:
# print(file_list[edl])
del file_list[edl]

print('Dimenstionssssssssssssssssss')
print("topic_list_len =",topic_list.__len__())
Expand All @@ -313,7 +324,6 @@ def train(data, maxiter=500, debug=True):
print("p_z_d[0] =", p_z_d[0].__len__())



topic_by_doc = open(PATH+'.csv', "w")
for i in range(file_list.__len__()):
topic_by_doc.write(',')
Expand All @@ -328,6 +338,10 @@ def train(data, maxiter=500, debug=True):
topic_by_doc.write('\n')
topic_by_doc.close()

print('////////////////////////////')
print(p_z_d.__len__())
print(p_z_d[0].__len__())


word_by_topic_conditional = open(PATH_word_by_topic_conditional+'.csv', "w")

Expand Down
8 changes: 4 additions & 4 deletions plsa/plsa/plsa.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,10 +346,10 @@ def topic_document(self):
Retrun: P(z,d)
'''
self.p_z_d= self.p_z*self.p_d_z
# print 'p_z_d-----'
# print (self.p_z_d.shape)
# print 'p_z_d.T-----'
# print (self.p_z_d.T.shape)
# print('p_z_d-----')
# print(self.p_z_d.shape)
# print('p_z_d.T-----')
# print(self.p_z_d.T.shape)


return self.p_z_d.T # T is for transpose
Expand Down
6 changes: 3 additions & 3 deletions plsa/plsa/tester_python3.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,11 @@ def write_to_files_slack(self):

os.mkdir(self.extracted_folder+self.unique_folder_naming)

idx = 0
# idx = 0

for row in self.messages:
if row['subtype'] == 'chat':
file = self.extracted_folder+self.unique_folder_naming+str(idx+2)+'.txt'
file = self.extracted_folder+self.unique_folder_naming+row['id']+'.txt'
if self.channel == '':
with open(file, 'w') as f:
f.write(row['text'])
Expand All @@ -82,7 +82,7 @@ def write_to_files_slack(self):
else:
continue

idx = idx + 1
# idx = idx + 1


def generate_topics(self):
Expand Down
1 change: 1 addition & 0 deletions plsa/plsa/tfidf/tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def tc(dataset, tokenizer=tokenize):
for doc in dataset:
if doc == '':
continue
# print(doc)
d = {} # token => count

for term in tokenizer(doc):
Expand Down