Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 34 additions & 27 deletions plsa/plsa/example_plsa.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

# import taskmanager as tm
import pandas as pd
from tfidf.preprocessing import read_files, preprocess_documents
from tfidf.preprocessing import read_files, preprocess_documents, read_json, json_files_list
# from tfidf.porter import PorterStemmer
from tfidf.tfidf import *
# from tfidf.preprocessing import file_list, empty_file_list
Expand Down Expand Up @@ -67,21 +67,24 @@

def feat(folder):
global num_topics
docs = list(preprocess_documents(read_files(os.path.join(folder, "*.txt"))))
# docs = list(preprocess_documents(read_files(os.path.join(folder, "*.txt"))))
docs = list(preprocess_documents(read_json(folder+"/cleaned.json")))
assert(len(docs) > 0)
print("len(docs) =",len(docs))
docs_2 = list(docs)
docs_reduced = reduce_docs(docs)


if docs_reduced.__len__() != docs_2.__len__():

list_1 = docs_to_delete(docs=docs_2, docs_red=docs_reduced)
delete_docs(list_1)

docs = preprocess_documents(read_files(os.path.join(folder, "*.txt")))
assert(len(docs) > 0)
print("len(docs) =",len(docs))
# Uncomment this later and fix it with the new json theme
# docs_2 = list(docs)
# docs_reduced = reduce_docs(docs)
#
#
# if docs_reduced.__len__() != docs_2.__len__():
#
# list_1 = docs_to_delete(docs=docs_2, docs_red=docs_reduced)
# delete_docs(list_1)
#
# docs = preprocess_documents(read_files(os.path.join(folder, "*.txt")))
# assert(len(docs) > 0)
# print("len(docs) =",len(docs))
# Uncomment ends here

# num_topics = int(len(docs) / topic_divider)
# if(num_topics < 2):
Expand All @@ -90,9 +93,12 @@ def feat(folder):
#docs = stemmer.stem_documents(docs)
td_dict, vocab = tc(docs)

print("'''''''''''''''''''''''''''''''")
# print(td_dict)

for doc in range(len(docs)):
if docs[doc] == '':
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Empty doc detected with id:',doc)
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Empty doc detected with id:',doc,' and file name is:',json_files_list[doc])
empty_docs_list.append(doc)

print ('len(td_dict) =', len(td_dict))
Expand All @@ -113,10 +119,13 @@ def feat(folder):
print ("term-document matrix size", td.shape)
print(td.shape[0],'terms by',td.shape[1],'docs')
print("size of term-document matrix in bytes according to sys.getsizeof =",sys.getsizeof(td))
# num_topics = int(td.shape[1] / topic_divider)
if topic_divider == 0:
pass
else:
num_topics = int(td.shape[1] / topic_divider)
# num_topics = 30
# num_topics = 7
num_topics = 2
# num_topics = 2
if (num_topics < 2):
num_topics = 2
# matrix_to_file(td)
Expand All @@ -127,7 +136,11 @@ def feat(folder):
return td, vocab

def K(D):
    """Return the number of PLSA topics for a corpus of D documents.

    Uses two module-level settings:
    - topic_divider: when 0, the externally pre-set num_topics is kept
      unchanged; otherwise num_topics is recomputed as int(D / topic_divider).
    - num_topics: updated in place (global) and returned.

    The result is clamped to a minimum of 2 topics.
    """
    global num_topics
    # Recompute only when a divider is configured; topic_divider == 0 means
    # "num_topics was set explicitly by the caller, keep it".
    if topic_divider != 0:
        num_topics = int(D / topic_divider)
    # PLSA needs at least 2 topics to be meaningful.
    if num_topics < 2:
        num_topics = 2
    return num_topics
Expand Down Expand Up @@ -302,15 +315,9 @@ def train(data, maxiter=500, debug=True):
# Bug update over


file_list = []
pattern_1 = os.path.join(folder, "*.txt")

f = glob.glob(pattern_1)

for f_i in f:
# print(f_i)
file_i=str(f_i).split('/')[file_parts_number]
file_list.append(file_i)
file_list = json_files_list
print('"""""""""""""""""""""""""""""')
# print(file_list)

print('>>>>>>> In method train:', empty_docs_list)
for edl in empty_docs_list:
Expand Down
95 changes: 94 additions & 1 deletion plsa/plsa/tester_python3.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import random
import datetime
import time
import json

sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/plsa')
sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/preprocessing')
Expand Down Expand Up @@ -85,6 +86,89 @@ def write_to_files_slack(self):
# idx = idx + 1



def write_to_json(self):
    """Extract chat messages from the loaded CSV and dump them to one JSON file.

    Builds a unique output folder (timestamp + random id + channel name),
    collects every 'chat'-subtype message — filtered by self.channel when it
    is non-empty — into an {id: text} dict, and writes it as extracted.json.
    """
    self.read_csv()

    # Unique folder name: timestamp with ':' and '.' made filesystem-safe,
    # a 12-digit random id, and the channel suffix.
    stamp = str(datetime.datetime.now()).replace(':', '-').replace('.', '-')
    self.unique_folder_naming = (
        stamp + '^' + str(random.randint(100000000000, 999999999999))
        + self.channel + '/'
    )
    print(self.unique_folder_naming)

    os.mkdir(self.extracted_folder + self.unique_folder_naming)

    contents_dict = {}
    out_file = self.extracted_folder + self.unique_folder_naming + 'extracted.json'

    for row in self.messages:
        # Only plain chat messages are extracted.
        if row['subtype'] != 'chat':
            continue
        # Empty channel filter means "take every channel".
        if self.channel == '' or row['channel_name'] == self.channel:
            contents_dict[row['id']] = row['text']

    with open(out_file, "w") as f:
        json.dump(contents_dict, f, indent=4)

    print("len(contents_dict):", len(contents_dict))



def generate_topics_json(self):
    """Configure the cleansing (pclean) and PLSA (pplsa) modules for the
    JSON-based pipeline, run preprocessing, then train the topic model.

    Also remembers the created folder paths on self so they can be
    deleted later.
    """
    t_start = time.time()

    # Depth used when splitting absolute paths into file-name parts.
    pplsa.file_parts_number = 10
    pclean.file_parts_number = 10

    pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict'
    pclean.source_texts = self.source_texts + self.unique_folder_naming + 'extracted.json'
    pclean.output_dir = self.output_dir + self.unique_folder_naming

    os.mkdir(pclean.output_dir)

    # Clean the data and turn it into a bag-of-words model.
    pclean.pre_pro()

    # PLSA setup: topic_divider == 0 means "use num_topics exactly as given".
    pplsa.topic_divider = 0
    pplsa.num_topics = 2
    pplsa.folder = pclean.output_dir[:-1]
    pplsa.dict_path = pclean.file_dict
    pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming
    # NOTE(review): 'matirx' spelling kept as-is — other code may read this exact path.
    pplsa.PATH = pplsa.PLSA_PARAMETERS_PATH + 'topic-by-doc-matirx'
    pplsa.PATH_word_by_topic_conditional = pplsa.PLSA_PARAMETERS_PATH + 'word_by_topic_conditional'
    pplsa.logL_pic = pplsa.PLSA_PARAMETERS_PATH + 'logL.png'

    # Remember folder paths for later cleanup/deletion.
    self.PLSA_PARAMETERS_PATH = pplsa.PLSA_PARAMETERS_PATH
    self.output_dir_stream = pclean.output_dir
    self.file_dict_stream = pclean.file_dict

    os.mkdir(pplsa.PLSA_PARAMETERS_PATH)

    pplsa.main()

    t_end = time.time()
    print('Total training time took:', round((t_end - t_start) / 60, 4))




def generate_topics(self):

start_time_1 = time.time()
Expand All @@ -102,6 +186,8 @@ def generate_topics(self):
pclean.pre_pro()

# Train using PLSA
pplsa.topic_divider = 0
pplsa.num_topics = 2
pplsa.folder = pclean.output_dir[:-1]
pplsa.dict_path = pclean.file_dict
pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming
Expand Down Expand Up @@ -129,6 +215,12 @@ def generate_topics(self):



def run_plsa_slack_json():
    """Entry point: extract the 'singnet' Slack channel from the shared CSV
    into JSON and run the JSON-based topic-generation pipeline on it."""
    # CSV lives three directory levels up, under appData/misc.
    csv_path = str(pathlib.Path(os.path.abspath('')).parents[2]) + '/appData/misc/slack_messages.csv'
    print(csv_path)
    analysis = TopicAnalysis(csv_path, 'singnet')
    analysis.write_to_json()
    analysis.generate_topics_json()



Expand Down Expand Up @@ -198,7 +290,8 @@ def isEnglish(s):

if __name__ == '__main__':

run_plsa_slack()
run_plsa_slack_json()
# run_plsa_slack()
# run_1()
# test_preprocessing()
#
Expand Down
25 changes: 25 additions & 0 deletions plsa/plsa/tfidf/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import re
import string
import glob
import json

# file_list = []
# file_parts_number = 9
# file_parts_number = 8

json_files_list = []

def strip_punctuation(s):
    """Replace every run of punctuation characters in s with a single space."""
    pattern = "([{}]+)".format(string.punctuation)
    return re.sub(pattern, " ", s)

Expand Down Expand Up @@ -94,6 +97,28 @@ def read_files(pattern):
# file_list = []
return map(read_file, glob.glob(pattern))

def read_json(path):
    """Load a {file_name: document_text} JSON file and return an iterator
    over the document texts.

    Side effect: appends every key (file name / document id) to the
    module-level json_files_list, in document order, so callers can map a
    document index back to its source file name.
    """
    global json_files_list
    with open(path, "r") as read_file:
        ret = json.load(read_file)

    # Single pass: keys feed the global name list, values form the corpus.
    # (json.load returns a dict, which preserves the file's key order.)
    docs = []
    for name, text in ret.items():
        json_files_list.append(name)
        docs.append(text)

    # Iterator matches the original map-object return type (lazy iterable).
    return iter(docs)

def mapper(s):
    """Identity transform applied to each document (placeholder hook for
    future per-document processing)."""
    result = s
    return result

# def empty_file_list():
# global file_list
# file_list = []
31 changes: 23 additions & 8 deletions plsa/preprocessing/cleansing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import sys
import pathlib
import os
import json

sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa')
sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/preprocessing')
Expand Down Expand Up @@ -176,14 +177,19 @@ def return_cleaned(t1):


def pre_pro():
fileList = glob.glob(source_texts)
fileList_len = fileList.__len__() - 1
# fileList = glob.glob(source_texts)
# fileList_len = fileList.__len__() - 1

cleaned_dict = {}

with open(source_texts, "r") as read_file:
fileList = json.load(read_file)
k = 0
print('------pre-process started-------')
for files in fileList:
tFile = open(files)
tFile = fileList[files]
# tFile = codecs.open(files, 'r', 'utf-8')
line = tFile.read().lower()
line = tFile.lower()
# print(line)
# print(type(line))
# line = line.decode('utf-8')
Expand Down Expand Up @@ -268,14 +274,23 @@ def pre_pro():

port_dict.add_element(stemmed=term1,nonstemmed=temp_term1)

file_txt = open(output_dir + path.PurePath(files).parts[file_parts_number], "w")
# file_txt = open(output_dir + path.PurePath(files).parts[file_parts_number], "w")

temp = ''

for i in por_tokens:
file_txt.write(i)
file_txt.write('\n')
file_txt.close()
temp = temp + i + '\n'

cleaned_dict[files] = temp

# print('Processed ',k,'of',fileList_len)
k = k + 1

file_json = output_dir + 'cleaned.json'

with open(file_json, "w") as f:
json.dump(cleaned_dict, f, indent=4)

port_dict.write_dict_to_file(file_dict)
print('***------pre-process finished--------')

Expand Down