From b9a27f215bc23db4f5c3cdfd684f7493019c87a4 Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Wed, 9 Jan 2019 17:27:42 +0300 Subject: [PATCH] Init commit --- README.md | 1 + plsa/__init__.py | 0 plsa/plsa/__init__.py | 0 plsa/plsa/_plsa.pyx | 72 ++++ plsa/plsa/example_plsa.py | 482 ++++++++++++++++++++++++ plsa/plsa/plsa.py | 455 ++++++++++++++++++++++ plsa/plsa/plsa_train.c | 271 +++++++++++++ plsa/plsa/porter_dictionary.py | 66 ++++ plsa/plsa/setup.py | 16 + plsa/plsa/tester_python3.py | 206 ++++++++++ plsa/plsa/tests.py | 42 +++ plsa/plsa/tfidf/__init__.py | 0 plsa/plsa/tfidf/featsel.py | 116 ++++++ plsa/plsa/tfidf/porter.py | 375 ++++++++++++++++++ plsa/plsa/tfidf/preprocessing.py | 99 +++++ plsa/plsa/tfidf/smoothing.py | 18 + plsa/plsa/tfidf/tfidf.py | 168 +++++++++ plsa/preprocessing/__init__.py | 0 plsa/preprocessing/cleansing.py | 287 ++++++++++++++ plsa/preprocessing/porter_dictionary.py | 66 ++++ 20 files changed, 2740 insertions(+) create mode 100644 README.md create mode 100644 plsa/__init__.py create mode 100644 plsa/plsa/__init__.py create mode 100644 plsa/plsa/_plsa.pyx create mode 100644 plsa/plsa/example_plsa.py create mode 100644 plsa/plsa/plsa.py create mode 100644 plsa/plsa/plsa_train.c create mode 100644 plsa/plsa/porter_dictionary.py create mode 100644 plsa/plsa/setup.py create mode 100644 plsa/plsa/tester_python3.py create mode 100644 plsa/plsa/tests.py create mode 100644 plsa/plsa/tfidf/__init__.py create mode 100644 plsa/plsa/tfidf/featsel.py create mode 100644 plsa/plsa/tfidf/porter.py create mode 100644 plsa/plsa/tfidf/preprocessing.py create mode 100644 plsa/plsa/tfidf/smoothing.py create mode 100644 plsa/plsa/tfidf/tfidf.py create mode 100644 plsa/preprocessing/__init__.py create mode 100644 plsa/preprocessing/cleansing.py create mode 100644 plsa/preprocessing/porter_dictionary.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..865a0e5 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# topic-analysis \ No newline at 
end of file diff --git a/plsa/__init__.py b/plsa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/plsa/plsa/__init__.py b/plsa/plsa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/plsa/plsa/_plsa.pyx b/plsa/plsa/_plsa.pyx new file mode 100644 index 0000000..14c950c --- /dev/null +++ b/plsa/plsa/_plsa.pyx @@ -0,0 +1,72 @@ +import numpy as np +cimport numpy as np + +cdef extern void _train(unsigned int n_ele, + unsigned int n_z, + unsigned int n_w, + unsigned int n_d, + unsigned int *tdnz, + double *p_z, + double *p_w_z, + double *p_d_z, + double *p_z_old, + double *p_w_z_old, + double *p_d_z_old, + unsigned int maxiter, + double eps, + double beta, + unsigned int min_iteration, + unsigned int folding_in, + unsigned int debug, + double *logL_c) + +def nonzero(td): + """ + Convert a sparse matrix td to a Nx3 matrix where N is the number of non-zero + elements. The 1st column is the word count, the 2nd is the word index + and the 3rd column is the document index. 
+ """ + rows, cols = td.nonzero() + vals = td[rows,cols] + if "scipy.sparse" in str(vals.__class__): vals = vals.toarray() + return np.asfortranarray(np.vstack((vals, rows, cols)).T) + +def train(td, + np.ndarray[np.float64_t, ndim=1, mode='c']p_z, + np.ndarray[np.float64_t, ndim=2, mode='c']p_w_z, + np.ndarray[np.float64_t, ndim=2, mode='c']p_d_z, + np.ndarray[np.float64_t, ndim=1, mode='c']p_z_old, + np.ndarray[np.float64_t, ndim=2, mode='c']p_w_z_old, + np.ndarray[np.float64_t, ndim=2, mode='c']p_d_z_old, + maxiter, + eps, + beta, + min_iteration, + folding_in, + debug, + np.ndarray[np.float64_t, ndim=1, mode='c']logL_c): + + cdef np.ndarray[np.uint32_t, ndim=2, mode='fortran'] tdnz + + tdnz = nonzero(td).astype(np.uint32) + + _train(tdnz.shape[0], + p_z.shape[0], + p_w_z.shape[0], + p_d_z.shape[0], + tdnz.data, + p_z.data, + p_w_z.data, + p_d_z.data, + p_z_old.data, + p_w_z_old.data, + p_d_z_old.data, + maxiter, + eps, + beta, + min_iteration, + folding_in, + debug, + logL_c.data) + + diff --git a/plsa/plsa/example_plsa.py b/plsa/plsa/example_plsa.py new file mode 100644 index 0000000..511ce4b --- /dev/null +++ b/plsa/plsa/example_plsa.py @@ -0,0 +1,482 @@ +#!/usr/bin/env python + +import sys + +import os +import time +import glob +import random +import logging +import numpy as np + +# import taskmanager as tm +import pandas as pd +from tfidf.preprocessing import read_files, preprocess_documents +# from tfidf.porter import PorterStemmer +from tfidf.tfidf import * +# from tfidf.preprocessing import file_list, empty_file_list + +from plsa import pLSA +import porter_dictionary + +# s_file_list = [] + + +file_parts_number = 8 +# file_parts_number = 7 # Inspire + +folder = '' +dict_path = '' + +file= 'plsa_topics.txt' +# file1='word_topics' +file2='topic_probability_pz' + +PLSA_PARAMETERS_PATH = '' +# file_txt1 = open(PLSA_PARAMETERS_PATH + file1, "w") + +PATH='' +PATH_word_by_topic_conditional='' + +matrix_file = '' +# num_topics = 1672 +# num_topics = 5403 +# 
num_topics = 5 + +num_topics = 24 +topic_divider = 5 +# num_topics = 256 +num_topics_generated = 300 +# cc = 0.3 # This is convergence criterion +cc = 0.0000000000000000007 # This is convergence criterion +# cc = 0.7 # This is convergence criterion -- was used for :D +# cc = 0.13 # This is convergence criterion +# maxiter2 = 50 +maxiter2 = 22 +beta = 1 +min_iteration = 10 +logL_pic = '' +number_of_words = 0 +number_of_docs = 0 +RAM_limit = 25 # In giga bytes + +print('RAM usage has been limited to {} GBs >>>>>>>>>>>>>>>>>>>>>>>>>>'.format(RAM_limit)) + +# @tm.task(str) + +def feat(folder): + global num_topics + docs = list(preprocess_documents(read_files(os.path.join(folder, "*.txt")))) + assert(len(docs) > 0) + print("len(docs) =",len(docs)) + docs_2 = list(docs) + docs_reduced = reduce_docs(docs) + + + if docs_reduced.__len__() != docs_2.__len__(): + + list_1 = docs_to_delete(docs=docs_2, docs_red=docs_reduced) + delete_docs(list_1) + + docs = preprocess_documents(read_files(os.path.join(folder, "*.txt"))) + assert(len(docs) > 0) + print("len(docs) =",len(docs)) + + # num_topics = int(len(docs) / topic_divider) + # if(num_topics < 2): + # num_topics = 2 + #stemmer = PorterStemmer() + #docs = stemmer.stem_documents(docs) + td_dict, vocab = tc(docs) + print ('len(td_dict) =', len(td_dict)) + print ('len(vocab) =',len(vocab)) + global number_of_words + global number_of_docs + number_of_words = len(vocab) + number_of_docs = len(td_dict) + print('type(docs):',type(docs)) + print('type(vocab):',type(vocab)) + # print('docs',docs) + # print('td_dict:',td_dict) + # print('vocab',vocab) + td = to_sparse_matrix(td_dict, vocab).toarray() + # print('td:',td) + print('type(td):',type(td)) + # idf = to_vector(idf_from_tc(td_dict), vocab) + print ("term-document matrix size", td.shape) + print(td.shape[0],'terms by',td.shape[1],'docs') + print("size of term-document matrix in bytes according to sys.getsizeof =",sys.getsizeof(td)) + # num_topics = int(td.shape[1] / 
topic_divider) + # num_topics = 30 + # num_topics = 7 + num_topics = 2 + if (num_topics < 2): + num_topics = 2 + # matrix_to_file(td) + # print 'td\n',td + # print 'vocab\n',vocab + # return td, idf, vocab + # exit(0) + return td, vocab + +def K(D): + num_topics = int(D/topic_divider) + if (num_topics < 2): + num_topics = 2 + return num_topics + +def docs_to_delete(docs,docs_red): + + list_1 = [] + + for i in range(docs.__len__()): + + index = next((k for k in range(docs_red.__len__()) if docs[i]==docs_red[k]),-1) + + if index != -1: + del docs_red[index] + else: + list_1.append(i) + + print('Number of files for deletion ',list_1.__len__()) + + return list_1 + +def delete_docs(list_1): + + pattern_1 = os.path.join(folder, "*.txt") + + f = glob.glob(pattern_1) + + print('Deleting files started ....................') + + for i in list_1: + + try: + os.remove(f[i]) + + except Exception as e: + + print('Error during file deletion while reducing docs') + logging.exception("message") + + print('Deleting files ended ||||||||||||||||||||||') + + +def reduce_docs(docs): + + G = 1024**3 + percent_to_delete = 0.05 + + counter = 0 + + + while True: + + counter = counter + 1 + print("counter =", counter) + + # Calculate number of keywords + words_coll = [] + for words in docs: + words_coll.extend(words.split()) + + W = len(set(words_coll)) + + # ram=((d*w)+(d+w)*k*2)*8)/G + ram = (float(len(docs)*W) + float(len(docs)+W) * float(K(len(docs))*2))*8.0/float(G) + + + # No more for debugging purposes + print("ram =", ram) + print('len(docs =', len(docs)) + print("W =", W) + + if ram > RAM_limit: + + # # For debugging + # if counter == 68: + # break + + if counter == 1: + print('Ram limiter code initialted ---------------------------------') + + del_len = int(len(docs) * percent_to_delete) + + del_list = random.sample(range(0, len(docs)), del_len) + + for i in range(del_list.__len__()): + del docs[del_list[i]-i] + + + else: + + break + + print("Final W =", W) + print("Final len(docs) 
=", len(docs)) + print("Final counter =", counter) + + return docs + + + + + +def matrix_to_file(mat): + f = open(matrix_file,'w') + + # print mat.__len__() + # print mat[0].__len__() + + f.write('function[a] = matPlsaFull()\n\n') + + f.write('a = [\n') + + for i in range(mat.__len__()): + print (i) + for j in range(mat[0].__len__()): + f.write(str(mat[i][j])) + f.write(' ') + f.write('\n\n') + + f.write('\n\n];\n\n\n\n\n') + + f.close() + + exit() + + +# @tm.task(feat, int, int) +def train(data, maxiter=500, debug=True): + # td, idf, vocab = data + # s_file_list= sorted(file_list) + # print('file_list:',file_list) + # print s_file_list + topic_list= range(0,num_topics) + # print topic_list + # file_list_2 = list(file_list) + # empty_file_list() + #Bug update + # df= pd.DataFrame(0,index=topic_list,columns=file_list_2) + # Bug update over + + + + td, vocab = data + # td = td[:,:-1] + plsa = pLSA() + plsa.debug = debug + plsa.logL_pic = logL_pic + # model=plsa.train(td, num_topics, maxiter) + model=plsa.train(td=td,Z=num_topics,maxiter=maxiter2,eps=cc,beta=beta,min_iteration=min_iteration) + p_z_d=plsa.topic_document() + ii=0 + + # print 'model2',model[0] + # print 'p_z_D =',p_z_d[0] + # print('row',len(p_z_d)) + # print('column',len(p_z_d[0])) + # print(df.shape) + + # Bug update + # for i in df.index: + # jj=0 + # for j in df.columns: + # df.loc[i,j]= p_z_d[ii][jj] + # # print 'df loc', df.loc[i,j] + # # print'ii jj', p_z_d[ii][jj] + # jj=jj+1 + # ii=ii+1 + # Bug update over + # print('PATH =',PATH) + # Bug update + # df.to_csv(PATH+'.csv') + # Bug update over + + + file_list = [] + pattern_1 = os.path.join(folder, "*.txt") + + f = glob.glob(pattern_1) + + for f_i in f: + # print(f_i) + file_i=str(f_i).split('/')[file_parts_number] + file_list.append(file_i) + + + print('Dimenstionssssssssssssssssss') + print("topic_list_len =",topic_list.__len__()) + print("p_z_d_len =", p_z_d.__len__()) + print("file_list_len =",file_list.__len__()) + print("p_z_d[0] =", 
p_z_d[0].__len__()) + + + + topic_by_doc = open(PATH+'.csv', "w") + for i in range(file_list.__len__()): + topic_by_doc.write(',') + topic_by_doc.write(file_list[i]) + topic_by_doc.write('\n') + + for i in range(p_z_d.__len__()): + topic_by_doc.write(str(i)) + for j in range(p_z_d[0].__len__()): + topic_by_doc.write(',') + topic_by_doc.write(str(p_z_d[i][j])) + topic_by_doc.write('\n') + topic_by_doc.close() + + + word_by_topic_conditional = open(PATH_word_by_topic_conditional+'.csv', "w") + + p_w_z_transposed_truncated = np.sort(plsa.p_w_z.transpose()[:,0:num_topics_generated]) + + for i in range(p_w_z_transposed_truncated.__len__()): + for j in range(p_w_z_transposed_truncated[0].__len__()): + word_by_topic_conditional.write(str(p_w_z_transposed_truncated[i][num_topics_generated-j-1])) + word_by_topic_conditional.write(',') + word_by_topic_conditional.write('\n') + word_by_topic_conditional.close() + + + + + + # print('docs==========================') + # + # for i in file_list: + # print(i) + # for i in p_z_d: + # print(i) + + pz=model[0] + topic_prob_file = open(PLSA_PARAMETERS_PATH + file2, "w") + for z in pz: + topic_prob_file.write(str(z)) + topic_prob_file.write('\n') + topic_prob_file.close() + return model + + +# @tm.task(feat, int, int) +def average_train(data, maxiter=500, debug=True): + td, idf, vocab = data + td = td[:,:-1] + plsa = pLSA() + plsa.debug = debug + return plsa.average_train(10)(td, 10, maxiter) + +# @tm.task(feat, train, int, int) +def folding_in(data, model, maxiter=30, debug=True): + td, idf, vocab = data + d = td[:,-1] + plsa = pLSA(model) + plsa.debug = debug + print (plsa.folding_in(d, maxiter)) + +# @tm.nocache +# @tm.task(train) +def document_topics(model): + plsa = pLSA(model) + for i in plsa.document_topics(): + print (i) + # file_txt1.write(str(i)) + # file_txt1.write('\n') + # print plsa.document_topics() + +# @tm.nocache +# @tm.task(train) +def document_cluster(model): + plsa = pLSA(model) + print (plsa.document_cluster()) + 
+# @tm.nocache +# @tm.task(train) +def word_topics(model): + plsa = pLSA(model) + for i in plsa.word_topics(): + print (i) + # file_txt1.write(str(i)) + # file_txt1.write('\n') + # print plsa.word_topics() + +# @tm.nocache +# @tm.task(train) +def word_cluster(model): + plsa = pLSA(model) + print (plsa.word_cluster()) + +# @tm.nocache +# @tm.task(train) +def unigram_smoothing(model): + plsa = pLSA(model) + print (plsa.unigram_smoothing()) + +# @tm.nocache +# @tm.task(feat, train, int) +def topic_labels(data, model, N=50): + # td, idf, vocab = data + file_txt = open(PLSA_PARAMETERS_PATH + file, "w") + port_dict = porter_dictionary.porter_dictionary() + port_dict.load_dict(dict_path) + # print port_dict.dictionary + td, vocab = data + plsa = pLSA(model) + inv_vocab = inverse_vocab(vocab) + dict_vocab=[] + # vocab_list=[x for x in inv_vocab[1]] + + # print vocab_list + for ind in inv_vocab: + try: + dict_vocab.append(port_dict.dictionary[inv_vocab[ind]][0]) + except: + dict_vocab.append(inv_vocab[ind]) + # print len(dict_vocab) + for i in plsa.topic_labels(dict_vocab, N): + # print (i) + # file_txt.write(str(i)) + for j in i: + file_txt.write(j+', ') + file_txt.write('\n') + file_txt.close() + # print plsa.topic_labels(inv_vocab, N) + +# @tm.nocache +# @tm.task(feat, train) +def global_weights(data, model): + td, idf, vocab = data + plsa = pLSA(model) + print (plsa.global_weights(idf)) + +def main(): + # import sys + + # try: + # # tm.TaskManager.OUTPUT_FOLDER = "./tmp" + # tm.run_command(sys.argv[1:]) + + + # except tm.TaskManagerError, m: + # print >>sys.stderr, m + + print ('Training started at',time.strftime("%c")) + start_time = time.time() + data=feat(folder) + model=train(data) + print ('>>>>>>>>>>>>>Finished training') + end_time = time.time() + print ('Training took ' + str(round((end_time - start_time) / 60, 4)) + ' minutes.') + topic_labels(data,model,num_topics_generated) + end_time = time.time() + print ('Total time ' + str(round((end_time - start_time) / 
60,4)) + ' minutes.') + + # doc_topics=document_topics(model) + # topic_labels(data,model) + # word_cluster(model) + # word_topics(model) + # document_topics(model) +if __name__ == "__main__": + main() diff --git a/plsa/plsa/plsa.py b/plsa/plsa/plsa.py new file mode 100644 index 0000000..e22f95a --- /dev/null +++ b/plsa/plsa/plsa.py @@ -0,0 +1,455 @@ +# Copyright (C) 2010 Mathieu Blondel +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +""" +Implementation of probabilistic Latent Semantic Analysis/Indexing as described +in + +"Probabilistic Latent Semantic Indexing", Hofmann, SIGIR99 + +Notation: + + w: word + d: document + z: topic + + V: vocabulary size + D: number of documents + Z: number of topics + +""" + +import psutil +import numpy as np +import matplotlib +# Force matplotlib to not use any Xwindows backend. +matplotlib.use('Agg') +import matplotlib.pyplot as plt +# import pandas as pd +import inspect +# import _plsa + + +# beta2 = 0.75 +beta2 = 1 +# HAVE_EXT = True +HAVE_EXT = True +min_iteration2 = 10 # 10 was the previous value +init_with_seed = False + + + + +frame = inspect.stack()[1] +file_name = str(inspect.getfile((frame[0]))) + + +logL_pic = '' + + +try: + import _plsa + if HAVE_EXT == True: + print ('xxxxxxxxxxxxxxxxxxxxxxxxxxxxx. 
There is _plsa') + else: + HAVE_EXT = False + print ('^^^^^^^^^^^^^^^^^^^^^^^^^^^. No _plsa') +except: + HAVE_EXT = False + print ('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&. No _plsa') + +# print 'HAVE_EXT =',HAVE_EXT + + +def normalize_1d(a, out=None): + if out is None: out = np.empty_like(a) + s = np.sum(a) + if s != 0.0 and len(a) != 1: + np.divide(a, s, out) + return out + +def normalize(M, axis=0, out=None): + if len(M.shape) == 1: return normalize_1d(M, out) + if out is None: out = np.empty_like(M) + if axis == 0: + M = M.T #M.swapaxes(0,1) + out = out.T + + for i in range(len(M)): + normalize_1d(M[i], out[i]) + + if axis == 0: out = out.T + + return out + +def loglikelihood(td, p_z, p_w_z, p_d_z): + """ + Compute the log-likelihood that the model generated the data. + """ + V, D = td.shape + L = 0.0 + for w,d in zip(*td.nonzero()): + # p_d_w = np.sum(p_z * p_w_z[w,:] * p_d_z[d,:]) + p_d_w = np.sum(p_z * p_w_z[w,:] * p_d_z[d,:]/(np.sum(p_z*p_d_z[d,:]))) + if p_d_w > 0: L += td[w,d] * np.log(p_d_w) + return L + +def train(td, + p_z, p_w_z, p_d_z, + p_z_old, p_w_z_old, p_d_z_old, + maxiter, eps, + folding_in, debug,logL_c): + + logL = [] + + R = td.sum() # total number of word counts + + lik = loglikelihood(td, p_z, p_w_z, p_d_z) + logL.append(lik) + print ('Starting logL =',lik) + # beta2 = 1 + for iteration in range(1, maxiter+1): + # Swap old and new + p_d_z_old, p_d_z = (p_d_z, p_d_z_old) + p_w_z_old, p_w_z = (p_w_z, p_w_z_old) + p_z_old, p_z = (p_z, p_z_old) + + # Set to 0.0 without memory allocation + p_d_z *= 0.0 + if not folding_in: + p_w_z *= 0.0 + p_z *= 0.0 + + if (beta2 != 1): + print ('tempering started') + p_z_old = np.power(p_z_old, beta2) + # print('Finished','p_z_old') + # beta2 = 1 + p_d_z_old = np.power(p_d_z_old, beta2) + # print('Finished','p_d_z_old') + p_w_z_old = np.power(p_w_z_old, beta2) + # print('Finished','p_w_z_old') + print ('tempering finished') + + + for w,d in zip(*td.nonzero()): + # E-step + # beta2 = 0.75 + + p_z_d_w = p_z_old * 
p_d_z_old[d, :] * p_w_z_old[w, :] + + # if (beta2 ==1): + # p_z_d_w = p_z_old * p_d_z_old[d,:] * p_w_z_old[w,:] + # else: + # p_z_old_beta2 = np.power(p_z_old,beta2) + # # print('Finished','p_z_old_beta2') + # # beta2 = 1 + # p_d_z_old_beta2 = np.power(p_d_z_old,beta2) + # # print('Finished','p_d_z_old_beta2') + # p_w_z_old_beta2 = np.power(p_w_z_old,beta2) + # # print('Finished','p_w_z_old_beta2') + # p_z_d_w = p_z_old_beta2 * p_d_z_old_beta2[d,:] * p_w_z_old_beta2[w,:] + # # print('Finished','p_z_d_w') + + + normalize(p_z_d_w, out=p_z_d_w) + # print('Finished normalizing') + + + + # M-step + s = td[w,d] * p_z_d_w + p_d_z[d,:] += s + + if not folding_in: + p_w_z[w,:] += s + p_z += s + + # normalize + normalize(p_d_z, axis=0, out=p_d_z) + + if not folding_in: + normalize(p_w_z, axis=0, out=p_w_z) + p_z /= R + + lik_new = loglikelihood(td, p_z, p_w_z, p_d_z) + logL.append(lik_new) + # lik_diff = lik_new - lik + lik_diff = -(lik_new - lik)*100/lik + # assert(lik_diff >= -1e-10) + lik = lik_new + + + if debug: + print ("Iteration", iteration) + + # I commented this out to save computational time + # print "Parameter change" + # pz=np.abs(p_z - p_z_old).sum() + # pwz=np.abs(p_w_z - p_w_z_old).sum() + # pdz=np.abs(p_d_z - p_d_z_old).sum() + # p_z_d= pz*p_d_z + # print "P(z): ",pz + # print "P(w|z): ",pwz + # print "P(d|z): ",pdz + # print "P(z,d):" , p_z_d + # + # print "L += %f" % lik_diff + # print "Lik_new",lik + print ("lik_diff =",lik_diff,'%') + + + if iteration > min_iteration2 and lik_diff < eps : + print ("No more progress, stopping EM at iteration", iteration) + print ("LogL =", logL) + axis1 = range(1,len(logL)+1) + plt.plot(axis1, logL) + plt.savefig(logL_pic) + break + + + + # print 'p_z-----' + # print p_z + + # ddf=pd.read_csv('/xxx/topic-by-Document.csv') + # ddf = ddf.set_index('Unnamed: 0') + # print ddf + # for i in ddf.index: + # for j in ddf.columns: + # ddf.loc[i,j]=p_d_z.T[i,j] + + + + +# def power + +class pLSA(object): + + def __init__(self, 
model=None): + """ + model: a model, as returned by get_model() or train(). + """ + self.p_z = None + self.p_w_z = None + self.p_d_z = None + self.p_z_d= None + if model is not None: self.set_model(model) + self.debug = False + self.logL_c = np.zeros(503) + self.logL_pic = '' + + def random_init(self, Z, V, D): + """ + Z: the number of topics desired. + V: vocabulary size. + D: number of documents. + """ + if init_with_seed: + np.random.seed(0) # uncomment for deterministic init + if self.p_z is None: + self.p_z = normalize(np.random.random(Z)) + print(self.p_z) + np.savetxt("pz.csv", self.p_z, delimiter=",") + if self.p_w_z is None: + self.p_w_z = normalize(np.random.random((V,Z)), axis=0) + np.savetxt("pwz.csv", self.p_w_z, delimiter=",") + if self.p_d_z is None: + self.p_d_z = normalize(np.random.random((D,Z)), axis=0) + np.savetxt("pdz.csv", self.p_d_z, delimiter=",") + + + def train(self, td, Z, maxiter=500, eps=0.01, folding_in=False,beta=beta2,min_iteration = min_iteration2): # eps=0.01 is default value + """ + Train the model. + + td: a V x D term-document matrix of term-counts. + Z: number of topics desired. + + td can be dense or sparse (dok_matrix recommended). + """ + V, D = td.shape + + self.random_init(Z, V, D) + + p_d_z_old = np.zeros_like(self.p_d_z) + p_w_z_old = np.zeros_like(self.p_w_z) + p_z_old = np.zeros_like(self.p_z) + + train_func = _plsa.train if HAVE_EXT else train + print('RRRRRRRRRRRRRRRAM used just before training:', float(list(psutil.virtual_memory())[3]) / 1073741824.0, 'GB') + train_func(td.astype(np.uint32), + self.p_z, self.p_w_z, self.p_d_z, + p_z_old, p_w_z_old, p_d_z_old, + maxiter, eps,beta,min_iteration, + folding_in, self.debug,self.logL_c) + + # self.P_D_Z = self.p_d_z + # self.P_Z = self.p_z + + # print 'here????????????????????????????' 
+ + if (HAVE_EXT == True): + bound_logL = list(self.logL_c).index(0) + logL_c_list = list(self.logL_c[:bound_logL]) + print ('logL =',logL_c_list) + axis1 = range(1, logL_c_list.__len__() + 1) + plt.clf() + plt.plot(axis1, logL_c_list) + # print('logL_pic=',self.logL_pic) + plt.savefig(self.logL_pic) + + return self.get_model() + + def average_train(self, N): + """ + Return a function compatible with train. + This function executes the training N times and takes the average. + """ + assert(N >= 1) + + def _wrap(*args, **kw): + print ("pLSA 1") + model = list(pLSA().train(*args, **kw)) + + for i in range(1,N): + print ("pLSA", i+1) + model2 = pLSA().train(*args, **kw) + for j in range(len(model)): + model[j] += model2[j] + + normalize(model[0], out=model[0]) + normalize(model[1], axis=0, out=model[1]) + normalize(model[2], axis=0, out=model[2]) + + self.set_model(tuple(model)) + + return self.get_model() + + return _wrap + def topic_document(self): + ''' + Compute the probabilities of topics belonging to document + Retrun: P(z,d) + ''' + self.p_z_d= self.p_z*self.p_d_z + # print 'p_z_d-----' + # print (self.p_z_d.shape) + # print 'p_z_d.T-----' + # print (self.p_z_d.T.shape) + + + return self.p_z_d.T # T is for transpose + + + def document_topics(self): + """ + Compute the probabilities of documents belonging to topics. + + Return: a Z x D matrix of P(z|d) probabilities. + + Note: This can be seen as a dimensionality reduction since a Z x D + matrix is obtained from a V x D matrix, where Z << V. + """ + return normalize((self.p_d_z * self.p_z[np.newaxis,:]).T, axis=0) + + def document_cluster(self): + """ + Find the main topic (cluster) of documents. + + Return: a D-array of cluster indices. + """ + return self.document_topics().argmax(axis=0) + + def word_topics(self): + """ + Compute the probabilities of words belonging to topics. + + Return: a Z x V matrix of P(z|w) probabilities. 
+ """ + return normalize((self.p_w_z * self.p_z[np.newaxis,:]).T, axis=0) + + def word_cluster(self): + """ + Find the main topic (cluster) of words. + + Return: a D-array of cluster indices. + """ + return self.word_topics().argmax(axis=0) + + def topic_labels(self, inv_vocab, N=10): + """ + For each topic z, find the N words w with highest probability P(w|z). + + inv_vocab: a term-index => term-string dictionary + + Return: Z lists of N words. + """ + Z = len(self.p_z) + ret = [] + for z in range(Z): + ind = np.argsort(self.p_w_z[:,z])[-N:][::-1] + ret.append([inv_vocab[i] for i in ind]) + return ret + + def unigram_smoothing(self): + """ + Compute the smoothed probability P(w|d) by "back-projecting" the + features from the latent space to the original space. + + Return: a V x D matrix of smoothed P(w|d) probabilities. + """ + V, Z = self.p_w_z.shape + D, Z = self.p_d_z.shape + p_w_d = np.zeros((V,D), dtype=np.double) + for d in range(D): + for w in range(V): + p_w_d[w,d] = np.sum(self.p_w_z[w,:] * self.p_d_z[d,:]) + return p_w_d + + def folding_in(self, d, maxiter=50, eps=0.01): + """ + Compute the probabilities of a new document d belonging to topics. + + d: a V-array of term-counts. + + Return: a Z-array of P(z|d) probabilities. + """ + V = d.shape[0] + Z = len(self.p_z) + plsa = pLSA() + plsa.debug = self.debug + plsa.p_z = self.p_z + plsa.p_w_z = self.p_w_z + plsa.p_z_d= self.p_z_d + plsa.train(d[:,np.newaxis], Z, maxiter, eps, folding_in=True) + return normalize(self.p_z * plsa.p_d_z[:,0]) + + def global_weights(self, gw): + """ + Compute global weight vector in latent space. + + gw: a V-array of global weights (e.g., idf). + + Return: a Z-array of global weights. 
+ """ + return np.sum(gw[:,np.newaxis] * self.p_w_z, axis=0) + + def get_model(self): + return (self.p_z, self.p_w_z, self.p_d_z, self.p_z_d) + + def set_model(self, model): + self.p_z, self.p_w_z, self.p_d_z,self.p_z_d = model diff --git a/plsa/plsa/plsa_train.c b/plsa/plsa/plsa_train.c new file mode 100644 index 0000000..2d3a0c5 --- /dev/null +++ b/plsa/plsa/plsa_train.c @@ -0,0 +1,271 @@ +/* +# Copyright (C) 2010 Mathieu Blondel +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+*/ + +#include +#include +#include +#include +#include + +#define C_(cols, i, j) (i * cols + j) +#define F_(rows, i, j) (i + j * rows) +#define Z_(i,j) (C_(n_z, i, j)) + +#define SWAP(a,b,tmp) tmp = a; a = b; b = tmp + + +//int min_iteration = 30; +//int min_iteration = 120; +//int min_iteration = 10; +//int min_iteration = 6; +//double beta = 0.75; +//double beta = 1; + +static void +normalize_axis0(double *a, + unsigned int j, + unsigned int rows, + unsigned int cols) +{ + double sum = 0.0; + unsigned int n; + + for (n=0; n < rows; n++) + sum += a[C_(cols,n,j)]; + + if (sum > 0.0) + { + for (n=0; n < rows; n++) + a[C_(cols,n,j)] /= sum; + } +} + +static unsigned int +total_wc(unsigned int n_ele, + unsigned int *tdnz) +{ + unsigned int n, wc=0; + + for (n=0; n < n_ele; n++) + wc += tdnz[n]; + + return wc; +} + +static double +loglikelihood(unsigned int n_ele, + unsigned int n_z, + unsigned int *tdnz, + double *p_z, + double *p_w_z, + double *p_d_z) +{ + unsigned int n, z, w, d, wc; + double sum, L = 0.0; + double sum111 = 0.0; + + for (n=0; n < n_ele; n++) + { + wc = tdnz[n]; + w = tdnz[F_(n_ele,n,1)]; + d = tdnz[F_(n_ele,n,2)]; + + sum = 0.0; + sum111 = 0.0; + for (z=0; z < n_z; z++) + { + sum += p_z[z] * p_w_z[Z_(w,z)] * p_d_z[Z_(d,z)]; + sum111 += p_z[z] * p_d_z[Z_(d,z)]; + } + +// printf("Printeddddd %f %f",sum,sum111); + sum = sum / sum111; + + if (sum > 0.0) + L += wc * log(sum); + } + + return L; +} + +void +_train(unsigned int n_ele, + unsigned int n_z, + unsigned int n_w, + unsigned int n_d, + /* + tdnz is a fortran-style 2d-array where: + - 1st column: word count + - 2nd column: word index + - 3rd column: document index + */ + unsigned int *tdnz, + double *p_z, + double *p_w_z, + double *p_d_z, + double *p_z_old, + double *p_w_z_old, + double *p_d_z_old, + unsigned int maxiter, + double eps, + double beta, + unsigned int min_iteration, + unsigned int folding_in, + unsigned int debug, + double *logL_c) + +{ + + unsigned int iter, n, z, w, d, wc, R; + 
double *p_z_d_w, *tmp, sum, lik, lik_new, lik_diff; + + R = total_wc(n_ele, tdnz); + + p_z_d_w = (double *)calloc(n_z, sizeof(double)); + + lik = loglikelihood(n_ele, n_z, tdnz, p_z, p_w_z, p_d_z); + + logL_c[0] = lik; + + + printf("Starting logL = %f.\n", lik); + + unsigned int max = 0; + + unsigned int iii = 0; + + if(n_z*n_w > n_z*n_d) + { max = n_z*n_w;} + else + { max = n_z*n_d;} + + + + + + + for (iter=0; iter < maxiter; iter++) + { + + SWAP(p_z, p_z_old, tmp); + SWAP(p_w_z, p_w_z_old, tmp); + SWAP(p_d_z, p_d_z_old, tmp); + + bzero(p_z, n_z * sizeof(double)); + if (!folding_in) + { + bzero(p_w_z, n_w * n_z * sizeof(double)); + bzero(p_d_z, n_d * n_z * sizeof(double)); + } + + if (beta != 1) + { + + printf("Tempering started...."); + + for(iii = 0; iii < max;iii++) + { + if(iii < n_z) + { + p_z_old[iii] = pow(p_z_old[iii],beta); + } + + if(iii < n_z*n_w) + { + p_w_z_old[iii] = pow(p_w_z_old[iii],beta); + } + + if(iii < n_z*n_d) + { + p_d_z_old[iii] = pow(p_d_z_old[iii],beta); + } + + } + + + printf("Tempering finished.\n"); + + } + + for (n=0; n < n_ele; n++) + { + wc = tdnz[n]; + w = tdnz[F_(n_ele,n,1)]; + d = tdnz[F_(n_ele,n,2)]; + + sum = 0.0; + + for (z=0; z < n_z; z++) + { + sum += (p_z_d_w[z] = p_z_old[z] * p_d_z_old[Z_(d,z)] * + p_w_z_old[Z_(w,z)]); + } + + if (sum > 0.0) + { + for (z=0; z < n_z; z++) + { + p_z_d_w[z] *= (wc/ sum); + p_d_z[Z_(d,z)] += p_z_d_w[z]; + if (!folding_in) + { + p_w_z[Z_(w,z)] += p_z_d_w[z]; + p_z[z] += p_z_d_w[z]; + } + } + } + } /* end for n */ + + for (z=0; z < n_z; z++) + { + normalize_axis0(p_d_z, z, n_d, n_z); + if (!folding_in) + { + normalize_axis0(p_w_z, z, n_w, n_z); + p_z[z] /= R; + } + } + + lik_new = loglikelihood(n_ele, n_z, tdnz, p_z, p_w_z, p_d_z); + //lik_diff = lik_new - lik; + lik_diff = -(lik_new - lik)*100/lik; + lik = lik_new; + logL_c[iter+1] = lik; + + + /*if (iter > 0 && iter % 5 == 0) + { + printf("."); fflush(stdout); + }*/ + + printf("Iteration %d\n",iter+1); + printf("lik_diff = %f \n",lik_diff); + 
+ if (iter > (min_iteration-1) && lik_diff < eps) + { + printf("\neps = %f \n", eps); + break; + } + + } /* end for iter */ + + printf("\nStopped at iteration %d.\n", iter+1); + + + free(p_z_d_w); +} diff --git a/plsa/plsa/porter_dictionary.py b/plsa/plsa/porter_dictionary.py new file mode 100644 index 0000000..6df0e5c --- /dev/null +++ b/plsa/plsa/porter_dictionary.py @@ -0,0 +1,66 @@ +__author__ = 'eyob' + + +import pickle + +class porter_dictionary: + + + def __init__(self): + + self.dictionary = dict() + + + + def add_element(self,stemmed, nonstemmed): + + self.arr = [] + + if stemmed in self.dictionary: + print('Stemmed word',stemmed,'present') + self.arr = self.dictionary[stemmed] + + if nonstemmed in self.arr: + print('Nonstemmed word',nonstemmed,'present') + pass + else: + print('Nonstemmed word',nonstemmed,'absent') + self.arr.append(nonstemmed) + self.dictionary.update({stemmed:self.arr}) + else: + print('Stemmed word',stemmed,'absent') + self.arr.append(nonstemmed) + self.dictionary.update({stemmed:self.arr}) + + + + def write_dict_to_file(self,file_name): + + with open(file_name, 'wb') as handle: + pickle.dump(self.dictionary,handle,protocol=2) + + + self.sorted_keys = sorted(self.dictionary) + + with open(file_name+'.txt','w') as file1: + for i in self.sorted_keys: + file1.write(i+str(self.dictionary[i])+'\n') + + + + + def load_dict(self,file_name): + with open(file_name, 'rb') as handle: + self.dictionary = pickle.load(handle) + + + + + +if __name__ == '__main__': + + pass + + + + diff --git a/plsa/plsa/setup.py b/plsa/plsa/setup.py new file mode 100644 index 0000000..157dbc5 --- /dev/null +++ b/plsa/plsa/setup.py @@ -0,0 +1,16 @@ +#@ -@*@- coding: utf@-@8 -*- +from distutils.core import setup +from distutils.extension import Extension +from Cython.Distutils import build_ext + +import numpy as np + +setup( + name="plsa", + cmdclass = {'build_ext': build_ext}, + ext_modules = [Extension("_plsa", ["_plsa.pyx", "plsa_train.c"], + include_dirs = 
[np.get_include(),'.'], + extra_compile_args=['-O3'])], + py_modules = ['plsa',], +) + diff --git a/plsa/plsa/tester_python3.py b/plsa/plsa/tester_python3.py new file mode 100644 index 0000000..0915eb0 --- /dev/null +++ b/plsa/plsa/tester_python3.py @@ -0,0 +1,206 @@ +__author__ = 'eyob' +# Tested on python3.6 + +import psutil +print('===================ram used at program start:',float(list(psutil.virtual_memory())[3])/1073741824.0,'GB') + +import os +import sys +import pathlib +import csv +import random +import datetime +import time + +sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/plsa') +sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/preprocessing') + +import example_plsa as pplsa +import cleansing as pclean + +class TopicAnalysis: + + def __init__(self, path,channel=''): + + self.data_path = path + self.channel = channel + self.root_path = str(pathlib.Path(os.path.abspath('')).parents[2]) + '/appData/plsa/' + print(self.root_path) + self.extracted_folder = self.root_path + 'extracted/' + self.file_dict = self.root_path + 'dict/' + self.source_texts = self.root_path + 'extracted/' + self.output_dir = self.root_path + 'cleaned/' + print(self.output_dir) + self.folder = self.root_path + 'cleaned/' + self.dict_path = self.root_path + 'dict/' + self.plsa_parameters_path = self.root_path + 'plsa-parameters/' + self.PLSA_PARAMETERS_PATH = '' + + # self.messages + # self.unique_folder_naming + + def __del__(self): + + # Close db connections + pass + + def read_csv(self): + + messages_list = [] + + with open(self.data_path, mode='r') as csv_file: + csv_reader = csv.DictReader(csv_file, delimiter=',') + for row in csv_reader: + messages_list.append(row) + + self.messages = messages_list + + + + def write_to_files_slack(self): + + self.read_csv() + + + self.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + self.channel + '/' + 
print(self.unique_folder_naming) + + os.mkdir(self.extracted_folder+self.unique_folder_naming) + + idx = 0 + + for row in self.messages: + if row['subtype'] == 'chat': + file = self.extracted_folder+self.unique_folder_naming+str(idx+2)+'.txt' + if self.channel == '': + with open(file, 'w') as f: + f.write(row['text']) + else: + if row['channel_name'] == self.channel: + with open(file, 'w') as f: + f.write(row['text']) + else: + continue + + idx = idx + 1 + + + def generate_topics(self): + + start_time_1 = time.time() + + pplsa.file_parts_number=10 + pclean.file_parts_number = 10 + pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] +'_dict' + pclean.source_texts = self.source_texts + self.unique_folder_naming + '*.txt' + pclean.output_dir = self.output_dir + self.unique_folder_naming + + os.mkdir(pclean.output_dir) + + + # Do cleansing on the data and turing it to bad-of-words model + pclean.pre_pro() + + # Train using PLSA + pplsa.folder = pclean.output_dir[:-1] + pplsa.dict_path = pclean.file_dict + pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming + pplsa.PATH = pplsa.PLSA_PARAMETERS_PATH + 'topic-by-doc-matirx' + pplsa.PATH_word_by_topic_conditional = pplsa.PLSA_PARAMETERS_PATH + 'word_by_topic_conditional' + pplsa.logL_pic = pplsa.PLSA_PARAMETERS_PATH + 'logL.png' + + # Folder paths to delete + self.PLSA_PARAMETERS_PATH = pplsa.PLSA_PARAMETERS_PATH + self.output_dir_stream = pclean.output_dir + self.file_dict_stream = pclean.file_dict + + + + os.mkdir(pplsa.PLSA_PARAMETERS_PATH) + + pplsa.main() + + end_time_1 = time.time() + + print('Total training time took:',round((end_time_1 - start_time_1) / 60, 4)) + + + + + + + + + +def run_plsa_slack(): + path_1 = str(pathlib.Path(os.path.abspath('')).parents[2]) + '/appData/misc/slack_messages.csv' + print(path_1) + s = TopicAnalysis(path_1,'singnet') + s.write_to_files_slack() + s.generate_topics() + + +def run_1(): + + t = TopicAnalysis('local') + + + pass + + +def 
test_preprocessing(): + + root_folder = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/plsa/test/' + + pclean.file_parts_number=10 + pplsa.file_parts_number = 10 + pclean.file_dict = root_folder + 'dict/test_dict' + pclean.source_texts = root_folder + 'extracted/*.txt' + pclean.output_dir = root_folder + 'cleaned/' + + + # Do cleansing on the data and turing it to bad-of-words model + pclean.pre_pro() + + # Train using PLSA + pplsa.folder = pclean.output_dir[:-1] + pplsa.dict_path = pclean.file_dict + pplsa.folder = pclean.output_dir[:-1] + pplsa.main() + + + + +def isEnglish(s): + print(s) + # s = s.decode('utf-8') + # try: + # s.encode('ascii') + # except UnicodeEncodeError: + # return False + # else: + # return True + + try: + s.encode(encoding='utf-8').decode('ascii') + except UnicodeDecodeError: + return False + else: + return True + + + + + + +__end__ = '__end__' + + +if __name__ == '__main__': + + run_plsa_slack() + # run_1() + # test_preprocessing() + # + # print(isEnglish('meeeee')) + pass diff --git a/plsa/plsa/tests.py b/plsa/plsa/tests.py new file mode 100644 index 0000000..dab2a5c --- /dev/null +++ b/plsa/plsa/tests.py @@ -0,0 +1,42 @@ + + +import unittest + +from plsa import pLSA, normalize +import numpy as np + +class UtilsTest(unittest.TestCase): + + def test_normalize(self): + # 1d-case + a = normalize(np.random.random(10)) + self.assertAlmostEqual(1.0, np.sum(a)) + + # 1d-case where all values are 0 + a = normalize(np.array([0.0] * 10)) + self.assertEqual(0.0, np.sum(a)) + + # 2d-case, axis 1 + M = normalize(np.random.random(20).reshape(2,10), axis=1) + res = M.sum(axis=1) # a 2-array + self.assertAlmostEqual(1.0, res[0]) + self.assertAlmostEqual(1.0, res[1]) + + # 2d-case, axis 0 + M = normalize(np.random.random(20).reshape(10,2), axis=0) + res = M.sum(axis=0) + self.assertAlmostEqual(1.0, res[0]) + self.assertAlmostEqual(1.0, res[1]) + + # 2d-case, where the normalized axis has length 1 + M_ = np.random.random(10).reshape(10,1) + M 
= normalize(M_, axis=1) + self.assertTrue(np.array_equal(M, M_)) + + M_ = np.random.random(10).reshape(1,10) + M = normalize(M_, axis=0) + self.assertTrue(np.array_equal(M, M_)) + + +if __name__ == "__main__": + unittest.main() diff --git a/plsa/plsa/tfidf/__init__.py b/plsa/plsa/tfidf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/plsa/plsa/tfidf/featsel.py b/plsa/plsa/tfidf/featsel.py new file mode 100644 index 0000000..f9d096e --- /dev/null +++ b/plsa/plsa/tfidf/featsel.py @@ -0,0 +1,116 @@ +from __future__ import division # enable float division by default + +import numpy as np +from numpy import log2 + +from tfidf import vocab_array, vocab_dict + +def get_counts(td, delta, t, c): + in_c = delta[:,c] == 1 # |X|-array + not_in_c = delta[:,c] == 0 # |X|-array + N11 = np.sum(td[t,in_c] > 0) # contains t and is in c + N01 = np.sum(td[t,in_c] == 0) + N10 = np.sum(td[t,not_in_c] > 0) + N00 = np.sum(td[t,not_in_c] == 0) + return (N11,N01,N10,N00) + +def get_group_counts(N11,N01,N10,N00): + N1_ = N10 + N11 + N_1 = N11 + N01 + N0_ = N01 + N00 + N_0 = N10 + N00 + N = N10 + N11 + N01 + N00 + return (N1_,N_1,N0_,N_0,N) + +def mutual_information(td, delta): + """ + td: a term-document V x |X| matrix + delta: |X| x M matrix where delta(i,j) = 1 if document i belongs to class j + + output: a V x M matrix of scores I(t,c) + """ + V,X = td.shape + X_,M = delta.shape + + assert(X==X_) + + I = np.zeros((V,M), dtype=np.double) + + for t in range(V): + for c in range(M): + N11,N01,N10,N00 = get_counts(td, delta, t, c) + N1_,N_1,N0_,N_0,N = get_group_counts(N11,N01,N10,N00) + + # FIXME: how to deal with log2(0) when the numerator is 0? 
+ + I[t,c] = N11/N * log2((N*N11)/(N1_*N_1)) + \ + N01/N * log2((N*N01)/(N0_*N_1)) + \ + N10/N * log2((N*N10)/(N1_*N_0)) + \ + N00/N * log2((N*N00)/(N0_*N_0)) + + return I + +def chi2(td, delta): + """ + td: a term-document V x |X| matrix + delta: |X| x M matrix where delta(i,j) = 1 if document i belongs to class j + + output: a V x M matrix of scores chi2(t,c) + """ + V,X = td.shape + X_,M = delta.shape + + assert(X==X_) + + chi2m = np.zeros((V,M), dtype=np.double) + + for t in range(V): + for c in range(M): + N11,N01,N10,N00 = get_counts(td, delta, t, c) + N1_,N_1,N0_,N_0,N = get_group_counts(N11,N01,N10,N00) + + chi2m[t,c] = (N*(N11*N00-N10*N01)**2)/(N1_*N_1*N0_*N_0) + + return chi2m + +def select_max(td, vocab, A, K): + """ + Select the best K/M features for each of the M classes + + td: a term-document V x |X| matrix + delta: |X| x M matrix where delta(i,j) = 1 if document i belongs to class j + + A: matrix returned by chi2 or mutual_information + + output: the new reduced term-document matrix and the new vocabulary dict + """ + V, M = A.shape + + d = {} + + for m in range(M): + k = 1 + # best features which are not selected yet + best_feat = [a for a in A[:,m].argsort()[::-1] if not a in d] + d.update(dict((a,1) for a in best_feat[:int(K/M)])) + + best_feat = np.array(d.keys()) + varr = vocab_array(vocab) + + return td[best_feat, :], vocab_dict(varr[best_feat]) + +def select_avg(td, vocab, A, K, weights=None): + """ + Select the best K features by averaging the scores + + td: a term-document V x |X| matrix + delta: |X| x M matrix where delta(i,j) = 1 if document i belongs to class j + + A: matrix returned by chi2 or mutual_information + + output: the new reduced term-document matrix and the new vocabulary dict + """ + a = np.average(A, axis=1, weights=weights) + best_feat = a.argsort()[::-1][:K] + varr = vocab_array(vocab) + return td[best_feat, :], vocab_dict(varr[best_feat]) diff --git a/plsa/plsa/tfidf/porter.py b/plsa/plsa/tfidf/porter.py new file mode 
100644 index 0000000..cb49972 --- /dev/null +++ b/plsa/plsa/tfidf/porter.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python + +"""Porter Stemming Algorithm +This is the Porter stemming algorithm, ported to Python from the +version coded up in ANSI C by the author. It may be be regarded +as canonical, in that it follows the algorithm presented in + +Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, +no. 3, pp 130-137, + +only differing from it at the points maked --DEPARTURE-- below. + +See also http://www.tartarus.org/~martin/PorterStemmer + +The algorithm as described in the paper could be exactly replicated +by adjusting the points of DEPARTURE, but this is barely necessary, +because (a) the points of DEPARTURE are definitely improvements, and +(b) no encoding of the Porter stemmer I have seen is anything like +as exact as this version, even with the points of DEPARTURE! + +Vivake Gupta (v@nano.com) + +Release 1: January 2001 + +Further adjustments by Santiago Bruno (bananabruno@gmail.com) +to allow word input not restricted to one word per line, leading +to: + +release 2: July 2008 +""" + +import sys + +class PorterStemmer: + + def __init__(self): + """The main part of the stemming algorithm starts here. + b is a buffer holding a word to be stemmed. The letters are in b[k0], + b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is + readjusted downwards as the stemming progresses. Zero termination is + not in fact used in the algorithm. + + Note that only lower case sequences are stemmed. Forcing to lower case + should be done before stem(...) is called. 
+ """ + + self.b = "" # buffer for word to be stemmed + self.k = 0 + self.k0 = 0 + self.j = 0 # j is a general offset into the string + + def cons(self, i): + """cons(i) is TRUE <=> b[i] is a consonant.""" + if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u': + return 0 + if self.b[i] == 'y': + if i == self.k0: + return 1 + else: + return (not self.cons(i - 1)) + return 1 + + def m(self): + """m() measures the number of consonant sequences between k0 and j. + if c is a consonant sequence and v a vowel sequence, and <..> + indicates arbitrary presence, + + gives 0 + vc gives 1 + vcvc gives 2 + vcvcvc gives 3 + .... + """ + n = 0 + i = self.k0 + while 1: + if i > self.j: + return n + if not self.cons(i): + break + i = i + 1 + i = i + 1 + while 1: + while 1: + if i > self.j: + return n + if self.cons(i): + break + i = i + 1 + i = i + 1 + n = n + 1 + while 1: + if i > self.j: + return n + if not self.cons(i): + break + i = i + 1 + i = i + 1 + + def vowelinstem(self): + """vowelinstem() is TRUE <=> k0,...j contains a vowel""" + for i in range(self.k0, self.j + 1): + if not self.cons(i): + return 1 + return 0 + + def doublec(self, j): + """doublec(j) is TRUE <=> j,(j-1) contain a double consonant.""" + if j < (self.k0 + 1): + return 0 + if (self.b[j] != self.b[j-1]): + return 0 + return self.cons(j) + + def cvc(self, i): + """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant + and also if the second c is not w,x or y. this is used when trying to + restore an e at the end of a short e.g. + + cav(e), lov(e), hop(e), crim(e), but + snow, box, tray. 
+ """ + if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2): + return 0 + ch = self.b[i] + if ch == 'w' or ch == 'x' or ch == 'y': + return 0 + return 1 + + def ends(self, s): + """ends(s) is TRUE <=> k0,...k ends with the string s.""" + length = len(s) + if s[length - 1] != self.b[self.k]: # tiny speed-up + return 0 + if length > (self.k - self.k0 + 1): + return 0 + if self.b[self.k-length+1:self.k+1] != s: + return 0 + self.j = self.k - length + return 1 + + def setto(self, s): + """setto(s) sets (j+1),...k to the characters in the string s, readjusting k.""" + length = len(s) + self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:] + self.k = self.j + length + + def r(self, s): + """r(s) is used further down.""" + if self.m() > 0: + self.setto(s) + + def step1ab(self): + """step1ab() gets rid of plurals and -ed or -ing. e.g. + + caresses -> caress + ponies -> poni + ties -> ti + caress -> caress + cats -> cat + + feed -> feed + agreed -> agree + disabled -> disable + + matting -> mat + mating -> mate + meeting -> meet + milling -> mill + messing -> mess + + meetings -> meet + """ + if self.b[self.k] == 's': + if self.ends("sses"): + self.k = self.k - 2 + elif self.ends("ies"): + self.setto("i") + elif self.b[self.k - 1] != 's': + self.k = self.k - 1 + if self.ends("eed"): + if self.m() > 0: + self.k = self.k - 1 + elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem(): + self.k = self.j + if self.ends("at"): self.setto("ate") + elif self.ends("bl"): self.setto("ble") + elif self.ends("iz"): self.setto("ize") + elif self.doublec(self.k): + self.k = self.k - 1 + ch = self.b[self.k] + if ch == 'l' or ch == 's' or ch == 'z': + self.k = self.k + 1 + elif (self.m() == 1 and self.cvc(self.k)): + self.setto("e") + + def step1c(self): + """step1c() turns terminal y to i when there is another vowel in the stem.""" + if (self.ends("y") and self.vowelinstem()): + self.b = self.b[:self.k] + 'i' + self.b[self.k+1:] + + def 
step2(self): + """step2() maps double suffices to single ones. + so -ization ( = -ize plus -ation) maps to -ize etc. note that the + string before the suffix must give m() > 0. + """ + if self.b[self.k - 1] == 'a': + if self.ends("ational"): self.r("ate") + elif self.ends("tional"): self.r("tion") + elif self.b[self.k - 1] == 'c': + if self.ends("enci"): self.r("ence") + elif self.ends("anci"): self.r("ance") + elif self.b[self.k - 1] == 'e': + if self.ends("izer"): self.r("ize") + elif self.b[self.k - 1] == 'l': + if self.ends("bli"): self.r("ble") # --DEPARTURE-- + # To match the published algorithm, replace this phrase with + # if self.ends("abli"): self.r("able") + elif self.ends("alli"): self.r("al") + elif self.ends("entli"): self.r("ent") + elif self.ends("eli"): self.r("e") + elif self.ends("ousli"): self.r("ous") + elif self.b[self.k - 1] == 'o': + if self.ends("ization"): self.r("ize") + elif self.ends("ation"): self.r("ate") + elif self.ends("ator"): self.r("ate") + elif self.b[self.k - 1] == 's': + if self.ends("alism"): self.r("al") + elif self.ends("iveness"): self.r("ive") + elif self.ends("fulness"): self.r("ful") + elif self.ends("ousness"): self.r("ous") + elif self.b[self.k - 1] == 't': + if self.ends("aliti"): self.r("al") + elif self.ends("iviti"): self.r("ive") + elif self.ends("biliti"): self.r("ble") + elif self.b[self.k - 1] == 'g': # --DEPARTURE-- + if self.ends("logi"): self.r("log") + # To match the published algorithm, delete this phrase + + def step3(self): + """step3() dels with -ic-, -full, -ness etc. 
similar strategy to step2.""" + if self.b[self.k] == 'e': + if self.ends("icate"): self.r("ic") + elif self.ends("ative"): self.r("") + elif self.ends("alize"): self.r("al") + elif self.b[self.k] == 'i': + if self.ends("iciti"): self.r("ic") + elif self.b[self.k] == 'l': + if self.ends("ical"): self.r("ic") + elif self.ends("ful"): self.r("") + elif self.b[self.k] == 's': + if self.ends("ness"): self.r("") + + def step4(self): + """step4() takes off -ant, -ence etc., in context vcvc.""" + if self.b[self.k - 1] == 'a': + if self.ends("al"): pass + else: return + elif self.b[self.k - 1] == 'c': + if self.ends("ance"): pass + elif self.ends("ence"): pass + else: return + elif self.b[self.k - 1] == 'e': + if self.ends("er"): pass + else: return + elif self.b[self.k - 1] == 'i': + if self.ends("ic"): pass + else: return + elif self.b[self.k - 1] == 'l': + if self.ends("able"): pass + elif self.ends("ible"): pass + else: return + elif self.b[self.k - 1] == 'n': + if self.ends("ant"): pass + elif self.ends("ement"): pass + elif self.ends("ment"): pass + elif self.ends("ent"): pass + else: return + elif self.b[self.k - 1] == 'o': + if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass + elif self.ends("ou"): pass + # takes care of -ous + else: return + elif self.b[self.k - 1] == 's': + if self.ends("ism"): pass + else: return + elif self.b[self.k - 1] == 't': + if self.ends("ate"): pass + elif self.ends("iti"): pass + else: return + elif self.b[self.k - 1] == 'u': + if self.ends("ous"): pass + else: return + elif self.b[self.k - 1] == 'v': + if self.ends("ive"): pass + else: return + elif self.b[self.k - 1] == 'z': + if self.ends("ize"): pass + else: return + else: + return + if self.m() > 1: + self.k = self.j + + def step5(self): + """step5() removes a final -e if m() > 1, and changes -ll to -l if + m() > 1. 
+ """ + self.j = self.k + if self.b[self.k] == 'e': + a = self.m() + if a > 1 or (a == 1 and not self.cvc(self.k-1)): + self.k = self.k - 1 + if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1: + self.k = self.k -1 + + def stem(self, p, i=None, j=None): + """In stem(p,i,j), p is a char pointer, and the string to be stemmed + is from p[i] to p[j] inclusive. Typically i is zero and j is the + offset to the last character of a string, (p[j+1] == '\0'). The + stemmer adjusts the characters p[i] ... p[j] and returns the new + end-point of the string, k. Stemming never increases word length, so + i <= k <= j. To turn the stemmer into a module, declare 'stem' as + extern, and delete the remainder of this file. + """ + # copy the parameters into statics + if i is None: i = 0 + if j is None: j = len(p)-1 + self.b = p + self.k = j + self.k0 = i + if self.k <= self.k0 + 1: + return self.b # --DEPARTURE-- + + # With this line, strings of length 1 or 2 don't go through the + # stemming process, although no mention is made of this in the + # published algorithm. Remove the line to match the published + # algorithm. 
+ + self.step1ab() + self.step1c() + self.step2() + self.step3() + self.step4() + self.step5() + return self.b[self.k0:self.k+1] + + def stem_sentence(self, txt): + return " ".join(map(self.stem, txt.split())) + + def stem_documents(self, docs): + return map(self.stem_sentence, docs) + + +if __name__ == '__main__': + p = PorterStemmer() + if len(sys.argv) > 1: + for f in sys.argv[1:]: + infile = open(f, 'r') + while 1: + output = '' + word = '' + line = infile.readline() + if line == '': + break + for c in line: + if c.isalpha(): + word += c.lower() + else: + if word: + output += p.stem(word, 0,len(word)-1) + word = '' + output += c.lower() + print (output,) + infile.close() diff --git a/plsa/plsa/tfidf/preprocessing.py b/plsa/plsa/tfidf/preprocessing.py new file mode 100644 index 0000000..61d687a --- /dev/null +++ b/plsa/plsa/tfidf/preprocessing.py @@ -0,0 +1,99 @@ +import re +import string +import glob + +# file_list = [] +# file_parts_number = 9 +# file_parts_number = 8 + +def strip_punctuation(s): + return re.sub("([%s]+)" % string.punctuation, " ", s) + +def strip_punctuation2(s): + return s.translate(string.maketrans("",""), string.punctuation) + +def strip_tags(s): + # assumes s is already lowercase + return re.sub(r"<([^>]+)>", "", s) + +def strip_short(s, minsize=3): + return " ".join([e for e in s.split() if len(e) >= minsize]) + +def strip_numeric(s): + return re.sub(r"[0-9]+", "", s) + +def strip_non_alphanum(s): + # assumes s is already lowercase + return re.sub(r"[^a-z0-9\ ]", " ", s) + +def strip_multiple_whitespaces(s): + return re.sub(r"(\s|\\n|\\r|\\t)+", " ", s) + #return s + +def split_alphanum(s): + s = re.sub(r"([a-z]+)([0-9]+)", r"\1 \2", s) + return re.sub(r"([0-9]+)([a-z]+)", r"\1 \2", s) + +STOPWORDS = """ +a about again all almost also although always among an +and another any are as at +be because been before being between both but by +can could +did do does done due during +each either enough especially etc +for found from further +had 
has have having here how however +i if in into is it its itself +just +kg km +made mainly make may mg might ml mm most mostly must +nearly neither no nor not +obtained of often on our overall +perhaps pmid +quite +rather really regarding +seem seen several should show showed shown shows significantly +since so some such +than that the their theirs them then there therefore these they too +this those through thus to +upon use used using +various very +was we were what when which while with within without would will +""" + +STOPWORDS = dict((w,1) for w in STOPWORDS.strip().replace("\n", " ").split()) + +def remove_stopwords(s): + return " ".join([w for w in s.split() if w not in STOPWORDS]) + +# DEFAULT_FILTERS = [str.lower, strip_tags, strip_punctuation, +# strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short] + +DEFAULT_FILTERS = [str.lower, strip_tags, strip_punctuation, +strip_multiple_whitespaces, strip_numeric, strip_short] + +def preprocess_string(s, filters=DEFAULT_FILTERS): + for f in filters: + s = f(s) + return s + +def preprocess_documents(docs): + # print docs + return map(preprocess_string, docs) + +def read_file(path): + f = open(path) + ret = f.read() + # # print path + # file=str(path).split('/')[file_parts_number] + # file_list.append((file)) + return ret + +def read_files(pattern): + # global file_list + # file_list = [] + return map(read_file, glob.glob(pattern)) + +# def empty_file_list(): +# global file_list +# file_list = [] \ No newline at end of file diff --git a/plsa/plsa/tfidf/smoothing.py b/plsa/plsa/tfidf/smoothing.py new file mode 100644 index 0000000..5595646 --- /dev/null +++ b/plsa/plsa/tfidf/smoothing.py @@ -0,0 +1,18 @@ + +def laplace(td): + """ + Laplace smoothing a.k.a as "add one" smoothing. + + In a bayesian setting, this corresponds to a uniform prior on events. + + It has a tendency to over-estimate probabilities of unseen terms for large + matrices. 
+ + td: V x X term document matrix + """ + V, X = td.shape + return (1.0 + td) / (V + td.sum(axis=0)) + +def lidstone(td, lambda_=0.5): + V, X = td.shape + return (lambda_ + td) / (V * lambda_ + td.sum(axis=0)) \ No newline at end of file diff --git a/plsa/plsa/tfidf/tfidf.py b/plsa/plsa/tfidf/tfidf.py new file mode 100644 index 0000000..3609441 --- /dev/null +++ b/plsa/plsa/tfidf/tfidf.py @@ -0,0 +1,168 @@ +## -@@*@@-@@ :D coding: utf@!-8 -*- + +from math import log + +import numpy as np +import scipy.sparse as sp +import tables as tb + +def tokenize(text): + return text.split() + +def tc(dataset, tokenizer=tokenize): + vocab = {} + docs = [] + + for doc in dataset: + if doc == '': + continue + d = {} # token => count + + for term in tokenizer(doc): + vocab[term] = 1 + d[term] = d.get(term, 0) + 1 + + docs.append(d) + + sorted_terms = sorted(vocab.keys()) + vocab = dict([(t, i) for i, t in enumerate(sorted_terms)]) + + + return docs, vocab + +def tf_from_tc(term_counts): + docs = [] + + for doc in term_counts: + d = {} + length = sum(doc.values()) + for term, count in doc.items(): + d[term] = float(count) / length + docs.append(d) + + return docs + +def idc_from_tc(term_counts): + t = {} + for doc in term_counts: + for term in doc: + t[term] = t.get(term, 0) + 1 + return t + +def idf_from_tc(term_counts): + n_docs = len(term_counts) + idf = {} + idc = idc_from_tc(term_counts) + for term in idc: + idf[term] = log(n_docs*1.0/(idc[term])) + return idf + +def tf_mul_idf(tf, idf): + docs = [] + + for doc in tf: + d = {} + for term in doc: + d[term] = doc[term] * idf[term] + docs.append(d) + + return docs + +def to_vector(idf_dict, vocab): + ret = np.zeros(len(idf_dict)) + for term, idx in vocab.items(): + ret[idx] = idf_dict[term] + return ret + +def to_sparse_matrix(tfidf_dict, vocab): + # fd = tb.open_file ( fname, mode = 'w' ) + tfm = sp.lil_matrix((len(vocab), len(tfidf_dict)), dtype=np.double) + # print 'tfm',shape(tfm) + # filters = tb.Filters(complevel=5, 
complib='blosc') + # out_data = f.create_earray(f.root, 'data', tb.Float32Atom(), shape=(0,), filters=filters) + # out_indices = f.create_earray(f.root, 'indices', tb.Int32Atom(),shape=(0,), filters=filters) + # out_indptr = f.create_earray(f.root, 'indptr', tb.Int32Atom(), shape=(0,), filters=filters) + # out_indptr.append(np.array([0])) #this is needed as a first indptr + # max_indptr = 0 + for j, doc in enumerate(tfidf_dict): + for term in doc: + try: + i = vocab[term] + tfm[i,j] = doc[term] + # res=doc[term] + # print res + # + # out_data.append((res[np.newaxis,:])) + except KeyError: + pass + # indices = res.indices + # indptr = res.indptr + # out_indices.append(indices) + # out_indptr.append(max_indptr+indptr[1:]) + # max_indptr = indices.shape[0] + # print tfm.shape + # info = fd.create_group ( '/', 'info' ) + # fd.create_vlarray ( info, 'dtype', tfm.dtype) + # fd.create_vlarray ( info, 'shape', tfm.shape ) + # fd.create_vlarray ( info, 'format', tfm.format ) + + # data = fd.create_group ( '/', 'data' ) + # vld = fd.create_earray ( data, 'data',pt.Float64Atom(),shape=(0,) ) + # vlr = fd.create_earray ( data, 'rows',pt.UInt32Atom(),shape=(0,)) + # for u in xrange ( tfm.shape [ 0 ] ): + # vld.append ( tfm.data [ u ] ) + # vlr.append ( tfm.rows [ u ] ) + # fd.close() + return tfm + +def inverse_vocab(vocab): + """ + Converts a vocab dictionary term => index to index => term + """ + return dict((i,t) for t,i in vocab.items()) + +def vocab_array(vocab): + """ + Converts vocab dictionary to vocab array + """ + return np.char.array(sorted(vocab.keys(), + lambda a,b: cmp(vocab[a],vocab[b]))) + +def vocab_dict(vocab): + """ + Converts vocab array to vocab dictionary + """ + return dict((term,i) for i,term in enumerate(vocab)) + +def replace_vocab(td, oldvocab, newvocab): + """ + td: V x X term-document matrix + oldvocab: dictionary + newvocab: dictionary + """ + newtd = np.zeros((len(newvocab),td.shape[1])) + for term in newvocab: + try: + newtd[newvocab[term]] = 
td[oldvocab[term]] + except KeyError: + newtd[newvocab[term]] = 0 + return newtd + +class tfidf(object): + def __init__(self, dataset, tokenizer=tokenize): + self._dataset = dataset + self._tokenizer = tokenizer + + def as_dict(self): + term_counts, vocab = tc(self._dataset, self._tokenizer) + tf = tf_from_tc(term_counts) + idf = idf_from_tc(term_counts) + return tf_mul_idf(tf, idf), vocab + + def as_sparse_matrix(self): + tfidf_dict, vocab = self.as_dict() + return to_sparse_matrix(tfidf_dict, vocab), vocab + + def as_array(self): + tfm, vocab = self.as_sparse_matrix() + return tfm.toarray(), vocab diff --git a/plsa/preprocessing/__init__.py b/plsa/preprocessing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/plsa/preprocessing/cleansing.py b/plsa/preprocessing/cleansing.py new file mode 100644 index 0000000..ecf2d9d --- /dev/null +++ b/plsa/preprocessing/cleansing.py @@ -0,0 +1,287 @@ +__author__ = 'masresha' +# Runs on python3.6 +# this block of code first reads a files directory + + +import sys +import pathlib +import os + +sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa') +sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/preprocessing') + + +# import codecs + +import glob +import nltk +from nltk.corpus import stopwords +import pathlib as path +from stemming.porter2 import stem +import string +import preprocessing.porter_dictionary as pp +# import re + +port_dict = pp.porter_dictionary() + + + +# file_parts_number = 9 + + +file_dict = '' +source_texts = '' +output_dir = '' +file_parts_number = 8 +# file_parts_number = 7 # Inspire + + + +# fileList = glob.glob(source_texts) + + +punct=['…','•','”','→','↑','“','‘','’','—','£','€','$'] + + + + + + + +pos_dict = {'JJ': 'a', 'JJR': 'a', + 'JJS': 'a', 'NN': 'n', + 'NNS': 'n', 'NNP': 'n', + 'NNPS': 'n', 'PRP': 'n', + 'PRP$': 'n', 'RB': 'r', + 'RBR': 'r', 'RBS': 'r', + 'VB': 'v', 'VBD': 'v', + 'VBG': 'v', 'VBN': 'v', + 'VBZ': 'v', } + +wnl = 
nltk.WordNetLemmatizer()  # (chunk boundary: completes the `wnl = ` assignment split onto the previous line)
stop = stopwords.words('english')

z = []            # module-level scratch lists (shadowed inside pre_pro)
wnl_tokens = []


def is_float(x):
    """Return True if x can be parsed as a float."""
    try:
        float(x)
        return True
    except (TypeError, ValueError):
        # narrowed from a bare except: float() raises only these two
        return False


def has_token_punct(token):
    """Return True if token contains ASCII or extra Unicode punctuation.

    Fixed: the original fell off the end (implicitly returning None) for
    clean tokens and contained an unreachable `break` after `return True`.
    """
    for char in token:
        if char in string.punctuation or char in punct:
            return True
    return False


def find_punct(token):
    """Return the list of punctuation characters occurring in token."""
    return [char for char in token
            if char in string.punctuation or char in punct]


def isEnglish(s):
    """Crude English test: True iff s is pure ASCII."""
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    return True


def return_cleaned(t1):
    """Clean a list of raw tokens.

    Drops tokens outside 2..12 characters, numbers, URL-ish strings
    (www./http/.com/...), and non-ASCII tokens; strips punctuation from the
    rest, turning '-' into a space so hyphenated words can be re-split by the
    caller.
    """
    kept = []
    for i in t1:
        if len(i) < 2 or len(i) > 12:
            continue
        if i[:2] == '//' or is_float(i):
            continue
        if ('www.' in i) or ('xxx.' in i) or ('yyy.' in i) or ('.gov' in i):
            continue
        if str(i).endswith(('.com', '.html', '.php', '.aspx', '.asp', 'htm', 'pdf')):
            continue
        if str(i).startswith(('http', '/')):  # 'http' also covers 'https'
            continue
        kept.append(i)

    # Second pass: strip punctuation; drop what is left if it became a number
    # or is not ASCII.
    token_list = []
    for token in kept:
        if has_token_punct(token):
            new_token = token
            for ch in find_punct(token):
                # '-' becomes a space (word boundary); other punctuation is dropped
                new_token = str(new_token).replace(ch, ' ' if ch == '-' else '')
            if not is_float(new_token) and isEnglish(new_token):
                token_list.append(new_token)
        elif isEnglish(token):
            token_list.append(token)
    return token_list


def pre_pro():
    # NOTE(review): this function continues beyond the end of this chunk;
    # only its first half is visible and edited here.
    fileList = glob.glob(source_texts)
    fileList_len = fileList.__len__() - 1  # kept: used only by a (removed) progress print
    k = 0
    print('------pre-process started-------')
    for files in fileList:
        tFile = open(files)
        line = tFile.read().lower()
        tFile.close()  # fix: the original never closed the input file
        tokens = nltk.word_tokenize(line)
        clean_tokens = return_cleaned(tokens)
        # drop whitespace-only tokens
        clean_tokens = filter(lambda name: name.strip(), clean_tokens)
        final_tok = []
        for tok in clean_tokens:
            tok = str(tok).rstrip(' ')
            if len(str(tok).split(' ')) > 1:
                # punctuation stripping introduced spaces: re-tokenize and re-clean
                for tk in return_cleaned(nltk.word_tokenize(tok)):
                    final_tok.append(tk)
            else:
                final_tok.append(tok)
        # fix: use the module-level stopword list instead of re-loading the
        # stopword corpus once per token
        filtered_words = [w for w in final_tok if w not in stop]
+ # print filtered_words + POS_Tokens = nltk.pos_tag(filtered_words) + + z = [] + for x in POS_Tokens: + try: + z.append(pos_dict[x[1]]) + except: + z.append('n') + + wnl_tokens = [] + for i in range(len(filtered_words)): + # if (len(clean_tokens[i])>3): + # wnl_tokens.append(wnl.lemmatize(clean_tokens[i], z[i])) + # else: + wnl_tokens.append(filtered_words[i]) + por_tokens = [stem(t) for t in wnl_tokens] + # print por_tokens + + temp_term1 = '' + term1 = '' + + for stmd in range(len(filtered_words)): + + term1 = por_tokens[stmd] + temp_term1 = filtered_words[stmd] + + # print term1,temp_term1 + + port_dict.add_element(stemmed=term1,nonstemmed=temp_term1) + + file_txt = open(output_dir + path.PurePath(files).parts[file_parts_number], "w") + + for i in por_tokens: + file_txt.write(i) + file_txt.write('\n') + file_txt.close() + # print('Processed ',k,'of',fileList_len) + k = k + 1 + port_dict.write_dict_to_file(file_dict) + print('***------pre-process finished--------') + +if __name__ == '__main__': + + pre_pro() + + + diff --git a/plsa/preprocessing/porter_dictionary.py b/plsa/preprocessing/porter_dictionary.py new file mode 100644 index 0000000..c9e77cf --- /dev/null +++ b/plsa/preprocessing/porter_dictionary.py @@ -0,0 +1,66 @@ +__author__ = 'eyob' + + +import pickle + +class porter_dictionary: + + + def __init__(self): + + self.dictionary = dict() + + + + def add_element(self,stemmed, nonstemmed): + + self.arr = [] + + if stemmed in self.dictionary: + # print('Stemmed word',stemmed,'present') + self.arr = self.dictionary[stemmed] + + if nonstemmed in self.arr: + # print('Nonstemmed word',nonstemmed,'present') + pass + else: + # print('Nonstemmed word',nonstemmed,'absent') + self.arr.append(nonstemmed) + self.dictionary.update({stemmed:self.arr}) + else: + # print('Stemmed word',stemmed,'absent') + self.arr.append(nonstemmed) + self.dictionary.update({stemmed:self.arr}) + + + + def write_dict_to_file(self,file_name): + + with open(file_name, 'wb') as handle: + 
pickle.dump(self.dictionary,handle,protocol=2) + + + self.sorted_keys = sorted(self.dictionary) + + with open(file_name+'.txt','w') as file1: + for i in self.sorted_keys: + file1.write(i+str(self.dictionary[i])+'\n') + + + + + def load_dict(self,file_name): + with open(file_name, 'rb') as handle: + self.dictionary = pickle.load(handle) + + + + + +if __name__ == '__main__': + + pass + + + +