From b9a27f215bc23db4f5c3cdfd684f7493019c87a4 Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Wed, 9 Jan 2019 17:27:42 +0300 Subject: [PATCH] Init commit --- README.md | 1 + plsa/__init__.py | 0 plsa/plsa/__init__.py | 0 plsa/plsa/_plsa.pyx | 72 ++++ plsa/plsa/example_plsa.py | 482 ++++++++++++++++++++++++ plsa/plsa/plsa.py | 455 ++++++++++++++++++++++ plsa/plsa/plsa_train.c | 271 +++++++++++++ plsa/plsa/porter_dictionary.py | 66 ++++ plsa/plsa/setup.py | 16 + plsa/plsa/tester_python3.py | 206 ++++++++++ plsa/plsa/tests.py | 42 +++ plsa/plsa/tfidf/__init__.py | 0 plsa/plsa/tfidf/featsel.py | 116 ++++++ plsa/plsa/tfidf/porter.py | 375 ++++++++++++++++++ plsa/plsa/tfidf/preprocessing.py | 99 +++++ plsa/plsa/tfidf/smoothing.py | 18 + plsa/plsa/tfidf/tfidf.py | 168 +++++++++ plsa/preprocessing/__init__.py | 0 plsa/preprocessing/cleansing.py | 287 ++++++++++++++ plsa/preprocessing/porter_dictionary.py | 66 ++++ 20 files changed, 2740 insertions(+) create mode 100644 README.md create mode 100644 plsa/__init__.py create mode 100644 plsa/plsa/__init__.py create mode 100644 plsa/plsa/_plsa.pyx create mode 100644 plsa/plsa/example_plsa.py create mode 100644 plsa/plsa/plsa.py create mode 100644 plsa/plsa/plsa_train.c create mode 100644 plsa/plsa/porter_dictionary.py create mode 100644 plsa/plsa/setup.py create mode 100644 plsa/plsa/tester_python3.py create mode 100644 plsa/plsa/tests.py create mode 100644 plsa/plsa/tfidf/__init__.py create mode 100644 plsa/plsa/tfidf/featsel.py create mode 100644 plsa/plsa/tfidf/porter.py create mode 100644 plsa/plsa/tfidf/preprocessing.py create mode 100644 plsa/plsa/tfidf/smoothing.py create mode 100644 plsa/plsa/tfidf/tfidf.py create mode 100644 plsa/preprocessing/__init__.py create mode 100644 plsa/preprocessing/cleansing.py create mode 100644 plsa/preprocessing/porter_dictionary.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..865a0e5 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# topic-analysis \ No newline at 
end of file diff --git a/plsa/__init__.py b/plsa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/plsa/plsa/__init__.py b/plsa/plsa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/plsa/plsa/_plsa.pyx b/plsa/plsa/_plsa.pyx new file mode 100644 index 0000000..14c950c --- /dev/null +++ b/plsa/plsa/_plsa.pyx @@ -0,0 +1,72 @@ +import numpy as np +cimport numpy as np + +cdef extern void _train(unsigned int n_ele, + unsigned int n_z, + unsigned int n_w, + unsigned int n_d, + unsigned int *tdnz, + double *p_z, + double *p_w_z, + double *p_d_z, + double *p_z_old, + double *p_w_z_old, + double *p_d_z_old, + unsigned int maxiter, + double eps, + double beta, + unsigned int min_iteration, + unsigned int folding_in, + unsigned int debug, + double *logL_c) + +def nonzero(td): + """ + Convert a sparse matrix td to a Nx3 matrix where N is the number of non-zero + elements. The 1st column is the word count, the 2nd is the word index + and the 3rd column is the document index. 
+ """ + rows, cols = td.nonzero() + vals = td[rows,cols] + if "scipy.sparse" in str(vals.__class__): vals = vals.toarray() + return np.asfortranarray(np.vstack((vals, rows, cols)).T) + +def train(td, + np.ndarray[np.float64_t, ndim=1, mode='c']p_z, + np.ndarray[np.float64_t, ndim=2, mode='c']p_w_z, + np.ndarray[np.float64_t, ndim=2, mode='c']p_d_z, + np.ndarray[np.float64_t, ndim=1, mode='c']p_z_old, + np.ndarray[np.float64_t, ndim=2, mode='c']p_w_z_old, + np.ndarray[np.float64_t, ndim=2, mode='c']p_d_z_old, + maxiter, + eps, + beta, + min_iteration, + folding_in, + debug, + np.ndarray[np.float64_t, ndim=1, mode='c']logL_c): + + cdef np.ndarray[np.uint32_t, ndim=2, mode='fortran'] tdnz + + tdnz = nonzero(td).astype(np.uint32) + + _train(tdnz.shape[0], + p_z.shape[0], + p_w_z.shape[0], + p_d_z.shape[0], + tdnz.data, + p_z.data, + p_w_z.data, + p_d_z.data, + p_z_old.data, + p_w_z_old.data, + p_d_z_old.data, + maxiter, + eps, + beta, + min_iteration, + folding_in, + debug, + logL_c.data) + + diff --git a/plsa/plsa/example_plsa.py b/plsa/plsa/example_plsa.py new file mode 100644 index 0000000..511ce4b --- /dev/null +++ b/plsa/plsa/example_plsa.py @@ -0,0 +1,482 @@ +#!/usr/bin/env python + +import sys + +import os +import time +import glob +import random +import logging +import numpy as np + +# import taskmanager as tm +import pandas as pd +from tfidf.preprocessing import read_files, preprocess_documents +# from tfidf.porter import PorterStemmer +from tfidf.tfidf import * +# from tfidf.preprocessing import file_list, empty_file_list + +from plsa import pLSA +import porter_dictionary + +# s_file_list = [] + + +file_parts_number = 8 +# file_parts_number = 7 # Inspire + +folder = '' +dict_path = '' + +file= 'plsa_topics.txt' +# file1='word_topics' +file2='topic_probability_pz' + +PLSA_PARAMETERS_PATH = '' +# file_txt1 = open(PLSA_PARAMETERS_PATH + file1, "w") + +PATH='' +PATH_word_by_topic_conditional='' + +matrix_file = '' +# num_topics = 1672 +# num_topics = 5403 +# 
num_topics = 5 + +num_topics = 24 +topic_divider = 5 +# num_topics = 256 +num_topics_generated = 300 +# cc = 0.3 # This is convergence criterion +cc = 0.0000000000000000007 # This is convergence criterion +# cc = 0.7 # This is convergence criterion -- was used for :D +# cc = 0.13 # This is convergence criterion +# maxiter2 = 50 +maxiter2 = 22 +beta = 1 +min_iteration = 10 +logL_pic = '' +number_of_words = 0 +number_of_docs = 0 +RAM_limit = 25 # In giga bytes + +print('RAM usage has been limited to {} GBs >>>>>>>>>>>>>>>>>>>>>>>>>>'.format(RAM_limit)) + +# @tm.task(str) + +def feat(folder): + global num_topics + docs = list(preprocess_documents(read_files(os.path.join(folder, "*.txt")))) + assert(len(docs) > 0) + print("len(docs) =",len(docs)) + docs_2 = list(docs) + docs_reduced = reduce_docs(docs) + + + if docs_reduced.__len__() != docs_2.__len__(): + + list_1 = docs_to_delete(docs=docs_2, docs_red=docs_reduced) + delete_docs(list_1) + + docs = preprocess_documents(read_files(os.path.join(folder, "*.txt"))) + assert(len(docs) > 0) + print("len(docs) =",len(docs)) + + # num_topics = int(len(docs) / topic_divider) + # if(num_topics < 2): + # num_topics = 2 + #stemmer = PorterStemmer() + #docs = stemmer.stem_documents(docs) + td_dict, vocab = tc(docs) + print ('len(td_dict) =', len(td_dict)) + print ('len(vocab) =',len(vocab)) + global number_of_words + global number_of_docs + number_of_words = len(vocab) + number_of_docs = len(td_dict) + print('type(docs):',type(docs)) + print('type(vocab):',type(vocab)) + # print('docs',docs) + # print('td_dict:',td_dict) + # print('vocab',vocab) + td = to_sparse_matrix(td_dict, vocab).toarray() + # print('td:',td) + print('type(td):',type(td)) + # idf = to_vector(idf_from_tc(td_dict), vocab) + print ("term-document matrix size", td.shape) + print(td.shape[0],'terms by',td.shape[1],'docs') + print("size of term-document matrix in bytes according to sys.getsizeof =",sys.getsizeof(td)) + # num_topics = int(td.shape[1] / 
topic_divider) + # num_topics = 30 + # num_topics = 7 + num_topics = 2 + if (num_topics < 2): + num_topics = 2 + # matrix_to_file(td) + # print 'td\n',td + # print 'vocab\n',vocab + # return td, idf, vocab + # exit(0) + return td, vocab + +def K(D): + num_topics = int(D/topic_divider) + if (num_topics < 2): + num_topics = 2 + return num_topics + +def docs_to_delete(docs,docs_red): + + list_1 = [] + + for i in range(docs.__len__()): + + index = next((k for k in range(docs_red.__len__()) if docs[i]==docs_red[k]),-1) + + if index != -1: + del docs_red[index] + else: + list_1.append(i) + + print('Number of files for deletion ',list_1.__len__()) + + return list_1 + +def delete_docs(list_1): + + pattern_1 = os.path.join(folder, "*.txt") + + f = glob.glob(pattern_1) + + print('Deleting files started ....................') + + for i in list_1: + + try: + os.remove(f[i]) + + except Exception as e: + + print('Error during file deletion while reducing docs') + logging.exception("message") + + print('Deleting files ended ||||||||||||||||||||||') + + +def reduce_docs(docs): + + G = 1024**3 + percent_to_delete = 0.05 + + counter = 0 + + + while True: + + counter = counter + 1 + print("counter =", counter) + + # Calculate number of keywords + words_coll = [] + for words in docs: + words_coll.extend(words.split()) + + W = len(set(words_coll)) + + # ram=((d*w)+(d+w)*k*2)*8)/G + ram = (float(len(docs)*W) + float(len(docs)+W) * float(K(len(docs))*2))*8.0/float(G) + + + # No more for debugging purposes + print("ram =", ram) + print('len(docs =', len(docs)) + print("W =", W) + + if ram > RAM_limit: + + # # For debugging + # if counter == 68: + # break + + if counter == 1: + print('Ram limiter code initialted ---------------------------------') + + del_len = int(len(docs) * percent_to_delete) + + del_list = random.sample(range(0, len(docs)), del_len) + + for i in range(del_list.__len__()): + del docs[del_list[i]-i] + + + else: + + break + + print("Final W =", W) + print("Final len(docs) 
=", len(docs)) + print("Final counter =", counter) + + return docs + + + + + +def matrix_to_file(mat): + f = open(matrix_file,'w') + + # print mat.__len__() + # print mat[0].__len__() + + f.write('function[a] = matPlsaFull()\n\n') + + f.write('a = [\n') + + for i in range(mat.__len__()): + print (i) + for j in range(mat[0].__len__()): + f.write(str(mat[i][j])) + f.write(' ') + f.write('\n\n') + + f.write('\n\n];\n\n\n\n\n') + + f.close() + + exit() + + +# @tm.task(feat, int, int) +def train(data, maxiter=500, debug=True): + # td, idf, vocab = data + # s_file_list= sorted(file_list) + # print('file_list:',file_list) + # print s_file_list + topic_list= range(0,num_topics) + # print topic_list + # file_list_2 = list(file_list) + # empty_file_list() + #Bug update + # df= pd.DataFrame(0,index=topic_list,columns=file_list_2) + # Bug update over + + + + td, vocab = data + # td = td[:,:-1] + plsa = pLSA() + plsa.debug = debug + plsa.logL_pic = logL_pic + # model=plsa.train(td, num_topics, maxiter) + model=plsa.train(td=td,Z=num_topics,maxiter=maxiter2,eps=cc,beta=beta,min_iteration=min_iteration) + p_z_d=plsa.topic_document() + ii=0 + + # print 'model2',model[0] + # print 'p_z_D =',p_z_d[0] + # print('row',len(p_z_d)) + # print('column',len(p_z_d[0])) + # print(df.shape) + + # Bug update + # for i in df.index: + # jj=0 + # for j in df.columns: + # df.loc[i,j]= p_z_d[ii][jj] + # # print 'df loc', df.loc[i,j] + # # print'ii jj', p_z_d[ii][jj] + # jj=jj+1 + # ii=ii+1 + # Bug update over + # print('PATH =',PATH) + # Bug update + # df.to_csv(PATH+'.csv') + # Bug update over + + + file_list = [] + pattern_1 = os.path.join(folder, "*.txt") + + f = glob.glob(pattern_1) + + for f_i in f: + # print(f_i) + file_i=str(f_i).split('/')[file_parts_number] + file_list.append(file_i) + + + print('Dimenstionssssssssssssssssss') + print("topic_list_len =",topic_list.__len__()) + print("p_z_d_len =", p_z_d.__len__()) + print("file_list_len =",file_list.__len__()) + print("p_z_d[0] =", 
p_z_d[0].__len__()) + + + + topic_by_doc = open(PATH+'.csv', "w") + for i in range(file_list.__len__()): + topic_by_doc.write(',') + topic_by_doc.write(file_list[i]) + topic_by_doc.write('\n') + + for i in range(p_z_d.__len__()): + topic_by_doc.write(str(i)) + for j in range(p_z_d[0].__len__()): + topic_by_doc.write(',') + topic_by_doc.write(str(p_z_d[i][j])) + topic_by_doc.write('\n') + topic_by_doc.close() + + + word_by_topic_conditional = open(PATH_word_by_topic_conditional+'.csv', "w") + + p_w_z_transposed_truncated = np.sort(plsa.p_w_z.transpose()[:,0:num_topics_generated]) + + for i in range(p_w_z_transposed_truncated.__len__()): + for j in range(p_w_z_transposed_truncated[0].__len__()): + word_by_topic_conditional.write(str(p_w_z_transposed_truncated[i][num_topics_generated-j-1])) + word_by_topic_conditional.write(',') + word_by_topic_conditional.write('\n') + word_by_topic_conditional.close() + + + + + + # print('docs==========================') + # + # for i in file_list: + # print(i) + # for i in p_z_d: + # print(i) + + pz=model[0] + topic_prob_file = open(PLSA_PARAMETERS_PATH + file2, "w") + for z in pz: + topic_prob_file.write(str(z)) + topic_prob_file.write('\n') + topic_prob_file.close() + return model + + +# @tm.task(feat, int, int) +def average_train(data, maxiter=500, debug=True): + td, idf, vocab = data + td = td[:,:-1] + plsa = pLSA() + plsa.debug = debug + return plsa.average_train(10)(td, 10, maxiter) + +# @tm.task(feat, train, int, int) +def folding_in(data, model, maxiter=30, debug=True): + td, idf, vocab = data + d = td[:,-1] + plsa = pLSA(model) + plsa.debug = debug + print (plsa.folding_in(d, maxiter)) + +# @tm.nocache +# @tm.task(train) +def document_topics(model): + plsa = pLSA(model) + for i in plsa.document_topics(): + print (i) + # file_txt1.write(str(i)) + # file_txt1.write('\n') + # print plsa.document_topics() + +# @tm.nocache +# @tm.task(train) +def document_cluster(model): + plsa = pLSA(model) + print (plsa.document_cluster()) + 
+# @tm.nocache +# @tm.task(train) +def word_topics(model): + plsa = pLSA(model) + for i in plsa.word_topics(): + print (i) + # file_txt1.write(str(i)) + # file_txt1.write('\n') + # print plsa.word_topics() + +# @tm.nocache +# @tm.task(train) +def word_cluster(model): + plsa = pLSA(model) + print (plsa.word_cluster()) + +# @tm.nocache +# @tm.task(train) +def unigram_smoothing(model): + plsa = pLSA(model) + print (plsa.unigram_smoothing()) + +# @tm.nocache +# @tm.task(feat, train, int) +def topic_labels(data, model, N=50): + # td, idf, vocab = data + file_txt = open(PLSA_PARAMETERS_PATH + file, "w") + port_dict = porter_dictionary.porter_dictionary() + port_dict.load_dict(dict_path) + # print port_dict.dictionary + td, vocab = data + plsa = pLSA(model) + inv_vocab = inverse_vocab(vocab) + dict_vocab=[] + # vocab_list=[x for x in inv_vocab[1]] + + # print vocab_list + for ind in inv_vocab: + try: + dict_vocab.append(port_dict.dictionary[inv_vocab[ind]][0]) + except: + dict_vocab.append(inv_vocab[ind]) + # print len(dict_vocab) + for i in plsa.topic_labels(dict_vocab, N): + # print (i) + # file_txt.write(str(i)) + for j in i: + file_txt.write(j+', ') + file_txt.write('\n') + file_txt.close() + # print plsa.topic_labels(inv_vocab, N) + +# @tm.nocache +# @tm.task(feat, train) +def global_weights(data, model): + td, idf, vocab = data + plsa = pLSA(model) + print (plsa.global_weights(idf)) + +def main(): + # import sys + + # try: + # # tm.TaskManager.OUTPUT_FOLDER = "./tmp" + # tm.run_command(sys.argv[1:]) + + + # except tm.TaskManagerError, m: + # print >>sys.stderr, m + + print ('Training started at',time.strftime("%c")) + start_time = time.time() + data=feat(folder) + model=train(data) + print ('>>>>>>>>>>>>>Finished training') + end_time = time.time() + print ('Training took ' + str(round((end_time - start_time) / 60, 4)) + ' minutes.') + topic_labels(data,model,num_topics_generated) + end_time = time.time() + print ('Total time ' + str(round((end_time - start_time) / 
60,4)) + ' minutes.') + + # doc_topics=document_topics(model) + # topic_labels(data,model) + # word_cluster(model) + # word_topics(model) + # document_topics(model) +if __name__ == "__main__": + main() diff --git a/plsa/plsa/plsa.py b/plsa/plsa/plsa.py new file mode 100644 index 0000000..e22f95a --- /dev/null +++ b/plsa/plsa/plsa.py @@ -0,0 +1,455 @@ +# Copyright (C) 2010 Mathieu Blondel +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +""" +Implementation of probabilistic Latent Semantic Analysis/Indexing as described +in + +"Probabilistic Latent Semantic Indexing", Hofmann, SIGIR99 + +Notation: + + w: word + d: document + z: topic + + V: vocabulary size + D: number of documents + Z: number of topics + +""" + +import psutil +import numpy as np +import matplotlib +# Force matplotlib to not use any Xwindows backend. +matplotlib.use('Agg') +import matplotlib.pyplot as plt +# import pandas as pd +import inspect +# import _plsa + + +# beta2 = 0.75 +beta2 = 1 +# HAVE_EXT = True +HAVE_EXT = True +min_iteration2 = 10 # 10 was the previous value +init_with_seed = False + + + + +frame = inspect.stack()[1] +file_name = str(inspect.getfile((frame[0]))) + + +logL_pic = '' + + +try: + import _plsa + if HAVE_EXT == True: + print ('xxxxxxxxxxxxxxxxxxxxxxxxxxxxx. 
There is _plsa') + else: + HAVE_EXT = False + print ('^^^^^^^^^^^^^^^^^^^^^^^^^^^. No _plsa') +except: + HAVE_EXT = False + print ('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&. No _plsa') + +# print 'HAVE_EXT =',HAVE_EXT + + +def normalize_1d(a, out=None): + if out is None: out = np.empty_like(a) + s = np.sum(a) + if s != 0.0 and len(a) != 1: + np.divide(a, s, out) + return out + +def normalize(M, axis=0, out=None): + if len(M.shape) == 1: return normalize_1d(M, out) + if out is None: out = np.empty_like(M) + if axis == 0: + M = M.T #M.swapaxes(0,1) + out = out.T + + for i in range(len(M)): + normalize_1d(M[i], out[i]) + + if axis == 0: out = out.T + + return out + +def loglikelihood(td, p_z, p_w_z, p_d_z): + """ + Compute the log-likelihood that the model generated the data. + """ + V, D = td.shape + L = 0.0 + for w,d in zip(*td.nonzero()): + # p_d_w = np.sum(p_z * p_w_z[w,:] * p_d_z[d,:]) + p_d_w = np.sum(p_z * p_w_z[w,:] * p_d_z[d,:]/(np.sum(p_z*p_d_z[d,:]))) + if p_d_w > 0: L += td[w,d] * np.log(p_d_w) + return L + +def train(td, + p_z, p_w_z, p_d_z, + p_z_old, p_w_z_old, p_d_z_old, + maxiter, eps, + folding_in, debug,logL_c): + + logL = [] + + R = td.sum() # total number of word counts + + lik = loglikelihood(td, p_z, p_w_z, p_d_z) + logL.append(lik) + print ('Starting logL =',lik) + # beta2 = 1 + for iteration in range(1, maxiter+1): + # Swap old and new + p_d_z_old, p_d_z = (p_d_z, p_d_z_old) + p_w_z_old, p_w_z = (p_w_z, p_w_z_old) + p_z_old, p_z = (p_z, p_z_old) + + # Set to 0.0 without memory allocation + p_d_z *= 0.0 + if not folding_in: + p_w_z *= 0.0 + p_z *= 0.0 + + if (beta2 != 1): + print ('tempering started') + p_z_old = np.power(p_z_old, beta2) + # print('Finished','p_z_old') + # beta2 = 1 + p_d_z_old = np.power(p_d_z_old, beta2) + # print('Finished','p_d_z_old') + p_w_z_old = np.power(p_w_z_old, beta2) + # print('Finished','p_w_z_old') + print ('tempering finished') + + + for w,d in zip(*td.nonzero()): + # E-step + # beta2 = 0.75 + + p_z_d_w = p_z_old * 
p_d_z_old[d, :] * p_w_z_old[w, :] + + # if (beta2 ==1): + # p_z_d_w = p_z_old * p_d_z_old[d,:] * p_w_z_old[w,:] + # else: + # p_z_old_beta2 = np.power(p_z_old,beta2) + # # print('Finished','p_z_old_beta2') + # # beta2 = 1 + # p_d_z_old_beta2 = np.power(p_d_z_old,beta2) + # # print('Finished','p_d_z_old_beta2') + # p_w_z_old_beta2 = np.power(p_w_z_old,beta2) + # # print('Finished','p_w_z_old_beta2') + # p_z_d_w = p_z_old_beta2 * p_d_z_old_beta2[d,:] * p_w_z_old_beta2[w,:] + # # print('Finished','p_z_d_w') + + + normalize(p_z_d_w, out=p_z_d_w) + # print('Finished normalizing') + + + + # M-step + s = td[w,d] * p_z_d_w + p_d_z[d,:] += s + + if not folding_in: + p_w_z[w,:] += s + p_z += s + + # normalize + normalize(p_d_z, axis=0, out=p_d_z) + + if not folding_in: + normalize(p_w_z, axis=0, out=p_w_z) + p_z /= R + + lik_new = loglikelihood(td, p_z, p_w_z, p_d_z) + logL.append(lik_new) + # lik_diff = lik_new - lik + lik_diff = -(lik_new - lik)*100/lik + # assert(lik_diff >= -1e-10) + lik = lik_new + + + if debug: + print ("Iteration", iteration) + + # I commented this out to save computational time + # print "Parameter change" + # pz=np.abs(p_z - p_z_old).sum() + # pwz=np.abs(p_w_z - p_w_z_old).sum() + # pdz=np.abs(p_d_z - p_d_z_old).sum() + # p_z_d= pz*p_d_z + # print "P(z): ",pz + # print "P(w|z): ",pwz + # print "P(d|z): ",pdz + # print "P(z,d):" , p_z_d + # + # print "L += %f" % lik_diff + # print "Lik_new",lik + print ("lik_diff =",lik_diff,'%') + + + if iteration > min_iteration2 and lik_diff < eps : + print ("No more progress, stopping EM at iteration", iteration) + print ("LogL =", logL) + axis1 = range(1,len(logL)+1) + plt.plot(axis1, logL) + plt.savefig(logL_pic) + break + + + + # print 'p_z-----' + # print p_z + + # ddf=pd.read_csv('/xxx/topic-by-Document.csv') + # ddf = ddf.set_index('Unnamed: 0') + # print ddf + # for i in ddf.index: + # for j in ddf.columns: + # ddf.loc[i,j]=p_d_z.T[i,j] + + + + +# def power + +class pLSA(object): + + def __init__(self, 
model=None): + """ + model: a model, as returned by get_model() or train(). + """ + self.p_z = None + self.p_w_z = None + self.p_d_z = None + self.p_z_d= None + if model is not None: self.set_model(model) + self.debug = False + self.logL_c = np.zeros(503) + self.logL_pic = '' + + def random_init(self, Z, V, D): + """ + Z: the number of topics desired. + V: vocabulary size. + D: number of documents. + """ + if init_with_seed: + np.random.seed(0) # uncomment for deterministic init + if self.p_z is None: + self.p_z = normalize(np.random.random(Z)) + print(self.p_z) + np.savetxt("pz.csv", self.p_z, delimiter=",") + if self.p_w_z is None: + self.p_w_z = normalize(np.random.random((V,Z)), axis=0) + np.savetxt("pwz.csv", self.p_w_z, delimiter=",") + if self.p_d_z is None: + self.p_d_z = normalize(np.random.random((D,Z)), axis=0) + np.savetxt("pdz.csv", self.p_d_z, delimiter=",") + + + def train(self, td, Z, maxiter=500, eps=0.01, folding_in=False,beta=beta2,min_iteration = min_iteration2): # eps=0.01 is default value + """ + Train the model. + + td: a V x D term-document matrix of term-counts. + Z: number of topics desired. + + td can be dense or sparse (dok_matrix recommended). + """ + V, D = td.shape + + self.random_init(Z, V, D) + + p_d_z_old = np.zeros_like(self.p_d_z) + p_w_z_old = np.zeros_like(self.p_w_z) + p_z_old = np.zeros_like(self.p_z) + + train_func = _plsa.train if HAVE_EXT else train + print('RRRRRRRRRRRRRRRAM used just before training:', float(list(psutil.virtual_memory())[3]) / 1073741824.0, 'GB') + train_func(td.astype(np.uint32), + self.p_z, self.p_w_z, self.p_d_z, + p_z_old, p_w_z_old, p_d_z_old, + maxiter, eps,beta,min_iteration, + folding_in, self.debug,self.logL_c) + + # self.P_D_Z = self.p_d_z + # self.P_Z = self.p_z + + # print 'here????????????????????????????' 
+ + if (HAVE_EXT == True): + bound_logL = list(self.logL_c).index(0) + logL_c_list = list(self.logL_c[:bound_logL]) + print ('logL =',logL_c_list) + axis1 = range(1, logL_c_list.__len__() + 1) + plt.clf() + plt.plot(axis1, logL_c_list) + # print('logL_pic=',self.logL_pic) + plt.savefig(self.logL_pic) + + return self.get_model() + + def average_train(self, N): + """ + Return a function compatible with train. + This function executes the training N times and takes the average. + """ + assert(N >= 1) + + def _wrap(*args, **kw): + print ("pLSA 1") + model = list(pLSA().train(*args, **kw)) + + for i in range(1,N): + print ("pLSA", i+1) + model2 = pLSA().train(*args, **kw) + for j in range(len(model)): + model[j] += model2[j] + + normalize(model[0], out=model[0]) + normalize(model[1], axis=0, out=model[1]) + normalize(model[2], axis=0, out=model[2]) + + self.set_model(tuple(model)) + + return self.get_model() + + return _wrap + def topic_document(self): + ''' + Compute the probabilities of topics belonging to document + Retrun: P(z,d) + ''' + self.p_z_d= self.p_z*self.p_d_z + # print 'p_z_d-----' + # print (self.p_z_d.shape) + # print 'p_z_d.T-----' + # print (self.p_z_d.T.shape) + + + return self.p_z_d.T # T is for transpose + + + def document_topics(self): + """ + Compute the probabilities of documents belonging to topics. + + Return: a Z x D matrix of P(z|d) probabilities. + + Note: This can be seen as a dimensionality reduction since a Z x D + matrix is obtained from a V x D matrix, where Z << V. + """ + return normalize((self.p_d_z * self.p_z[np.newaxis,:]).T, axis=0) + + def document_cluster(self): + """ + Find the main topic (cluster) of documents. + + Return: a D-array of cluster indices. + """ + return self.document_topics().argmax(axis=0) + + def word_topics(self): + """ + Compute the probabilities of words belonging to topics. + + Return: a Z x V matrix of P(z|w) probabilities. 
+ """ + return normalize((self.p_w_z * self.p_z[np.newaxis,:]).T, axis=0) + + def word_cluster(self): + """ + Find the main topic (cluster) of words. + + Return: a D-array of cluster indices. + """ + return self.word_topics().argmax(axis=0) + + def topic_labels(self, inv_vocab, N=10): + """ + For each topic z, find the N words w with highest probability P(w|z). + + inv_vocab: a term-index => term-string dictionary + + Return: Z lists of N words. + """ + Z = len(self.p_z) + ret = [] + for z in range(Z): + ind = np.argsort(self.p_w_z[:,z])[-N:][::-1] + ret.append([inv_vocab[i] for i in ind]) + return ret + + def unigram_smoothing(self): + """ + Compute the smoothed probability P(w|d) by "back-projecting" the + features from the latent space to the original space. + + Return: a V x D matrix of smoothed P(w|d) probabilities. + """ + V, Z = self.p_w_z.shape + D, Z = self.p_d_z.shape + p_w_d = np.zeros((V,D), dtype=np.double) + for d in range(D): + for w in range(V): + p_w_d[w,d] = np.sum(self.p_w_z[w,:] * self.p_d_z[d,:]) + return p_w_d + + def folding_in(self, d, maxiter=50, eps=0.01): + """ + Compute the probabilities of a new document d belonging to topics. + + d: a V-array of term-counts. + + Return: a Z-array of P(z|d) probabilities. + """ + V = d.shape[0] + Z = len(self.p_z) + plsa = pLSA() + plsa.debug = self.debug + plsa.p_z = self.p_z + plsa.p_w_z = self.p_w_z + plsa.p_z_d= self.p_z_d + plsa.train(d[:,np.newaxis], Z, maxiter, eps, folding_in=True) + return normalize(self.p_z * plsa.p_d_z[:,0]) + + def global_weights(self, gw): + """ + Compute global weight vector in latent space. + + gw: a V-array of global weights (e.g., idf). + + Return: a Z-array of global weights. 
+ """ + return np.sum(gw[:,np.newaxis] * self.p_w_z, axis=0) + + def get_model(self): + return (self.p_z, self.p_w_z, self.p_d_z, self.p_z_d) + + def set_model(self, model): + self.p_z, self.p_w_z, self.p_d_z,self.p_z_d = model diff --git a/plsa/plsa/plsa_train.c b/plsa/plsa/plsa_train.c new file mode 100644 index 0000000..2d3a0c5 --- /dev/null +++ b/plsa/plsa/plsa_train.c @@ -0,0 +1,271 @@ +/* +# Copyright (C) 2010 Mathieu Blondel +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+*/ + +#include +#include +#include +#include +#include + +#define C_(cols, i, j) (i * cols + j) +#define F_(rows, i, j) (i + j * rows) +#define Z_(i,j) (C_(n_z, i, j)) + +#define SWAP(a,b,tmp) tmp = a; a = b; b = tmp + + +//int min_iteration = 30; +//int min_iteration = 120; +//int min_iteration = 10; +//int min_iteration = 6; +//double beta = 0.75; +//double beta = 1; + +static void +normalize_axis0(double *a, + unsigned int j, + unsigned int rows, + unsigned int cols) +{ + double sum = 0.0; + unsigned int n; + + for (n=0; n < rows; n++) + sum += a[C_(cols,n,j)]; + + if (sum > 0.0) + { + for (n=0; n < rows; n++) + a[C_(cols,n,j)] /= sum; + } +} + +static unsigned int +total_wc(unsigned int n_ele, + unsigned int *tdnz) +{ + unsigned int n, wc=0; + + for (n=0; n < n_ele; n++) + wc += tdnz[n]; + + return wc; +} + +static double +loglikelihood(unsigned int n_ele, + unsigned int n_z, + unsigned int *tdnz, + double *p_z, + double *p_w_z, + double *p_d_z) +{ + unsigned int n, z, w, d, wc; + double sum, L = 0.0; + double sum111 = 0.0; + + for (n=0; n < n_ele; n++) + { + wc = tdnz[n]; + w = tdnz[F_(n_ele,n,1)]; + d = tdnz[F_(n_ele,n,2)]; + + sum = 0.0; + sum111 = 0.0; + for (z=0; z < n_z; z++) + { + sum += p_z[z] * p_w_z[Z_(w,z)] * p_d_z[Z_(d,z)]; + sum111 += p_z[z] * p_d_z[Z_(d,z)]; + } + +// printf("Printeddddd %f %f",sum,sum111); + sum = sum / sum111; + + if (sum > 0.0) + L += wc * log(sum); + } + + return L; +} + +void +_train(unsigned int n_ele, + unsigned int n_z, + unsigned int n_w, + unsigned int n_d, + /* + tdnz is a fortran-style 2d-array where: + - 1st column: word count + - 2nd column: word index + - 3rd column: document index + */ + unsigned int *tdnz, + double *p_z, + double *p_w_z, + double *p_d_z, + double *p_z_old, + double *p_w_z_old, + double *p_d_z_old, + unsigned int maxiter, + double eps, + double beta, + unsigned int min_iteration, + unsigned int folding_in, + unsigned int debug, + double *logL_c) + +{ + + unsigned int iter, n, z, w, d, wc, R; + 
double *p_z_d_w, *tmp, sum, lik, lik_new, lik_diff; + + R = total_wc(n_ele, tdnz); + + p_z_d_w = (double *)calloc(n_z, sizeof(double)); + + lik = loglikelihood(n_ele, n_z, tdnz, p_z, p_w_z, p_d_z); + + logL_c[0] = lik; + + + printf("Starting logL = %f.\n", lik); + + unsigned int max = 0; + + unsigned int iii = 0; + + if(n_z*n_w > n_z*n_d) + { max = n_z*n_w;} + else + { max = n_z*n_d;} + + + + + + + for (iter=0; iter < maxiter; iter++) + { + + SWAP(p_z, p_z_old, tmp); + SWAP(p_w_z, p_w_z_old, tmp); + SWAP(p_d_z, p_d_z_old, tmp); + + bzero(p_z, n_z * sizeof(double)); + if (!folding_in) + { + bzero(p_w_z, n_w * n_z * sizeof(double)); + bzero(p_d_z, n_d * n_z * sizeof(double)); + } + + if (beta != 1) + { + + printf("Tempering started...."); + + for(iii = 0; iii < max;iii++) + { + if(iii < n_z) + { + p_z_old[iii] = pow(p_z_old[iii],beta); + } + + if(iii < n_z*n_w) + { + p_w_z_old[iii] = pow(p_w_z_old[iii],beta); + } + + if(iii < n_z*n_d) + { + p_d_z_old[iii] = pow(p_d_z_old[iii],beta); + } + + } + + + printf("Tempering finished.\n"); + + } + + for (n=0; n < n_ele; n++) + { + wc = tdnz[n]; + w = tdnz[F_(n_ele,n,1)]; + d = tdnz[F_(n_ele,n,2)]; + + sum = 0.0; + + for (z=0; z < n_z; z++) + { + sum += (p_z_d_w[z] = p_z_old[z] * p_d_z_old[Z_(d,z)] * + p_w_z_old[Z_(w,z)]); + } + + if (sum > 0.0) + { + for (z=0; z < n_z; z++) + { + p_z_d_w[z] *= (wc/ sum); + p_d_z[Z_(d,z)] += p_z_d_w[z]; + if (!folding_in) + { + p_w_z[Z_(w,z)] += p_z_d_w[z]; + p_z[z] += p_z_d_w[z]; + } + } + } + } /* end for n */ + + for (z=0; z < n_z; z++) + { + normalize_axis0(p_d_z, z, n_d, n_z); + if (!folding_in) + { + normalize_axis0(p_w_z, z, n_w, n_z); + p_z[z] /= R; + } + } + + lik_new = loglikelihood(n_ele, n_z, tdnz, p_z, p_w_z, p_d_z); + //lik_diff = lik_new - lik; + lik_diff = -(lik_new - lik)*100/lik; + lik = lik_new; + logL_c[iter+1] = lik; + + + /*if (iter > 0 && iter % 5 == 0) + { + printf("."); fflush(stdout); + }*/ + + printf("Iteration %d\n",iter+1); + printf("lik_diff = %f \n",lik_diff); + 
+ if (iter > (min_iteration-1) && lik_diff < eps) + { + printf("\neps = %f \n", eps); + break; + } + + } /* end for iter */ + + printf("\nStopped at iteration %d.\n", iter+1); + + + free(p_z_d_w); +} diff --git a/plsa/plsa/porter_dictionary.py b/plsa/plsa/porter_dictionary.py new file mode 100644 index 0000000..6df0e5c --- /dev/null +++ b/plsa/plsa/porter_dictionary.py @@ -0,0 +1,66 @@ +__author__ = 'eyob' + + +import pickle + +class porter_dictionary: + + + def __init__(self): + + self.dictionary = dict() + + + + def add_element(self,stemmed, nonstemmed): + + self.arr = [] + + if stemmed in self.dictionary: + print('Stemmed word',stemmed,'present') + self.arr = self.dictionary[stemmed] + + if nonstemmed in self.arr: + print('Nonstemmed word',nonstemmed,'present') + pass + else: + print('Nonstemmed word',nonstemmed,'absent') + self.arr.append(nonstemmed) + self.dictionary.update({stemmed:self.arr}) + else: + print('Stemmed word',stemmed,'absent') + self.arr.append(nonstemmed) + self.dictionary.update({stemmed:self.arr}) + + + + def write_dict_to_file(self,file_name): + + with open(file_name, 'wb') as handle: + pickle.dump(self.dictionary,handle,protocol=2) + + + self.sorted_keys = sorted(self.dictionary) + + with open(file_name+'.txt','w') as file1: + for i in self.sorted_keys: + file1.write(i+str(self.dictionary[i])+'\n') + + + + + def load_dict(self,file_name): + with open(file_name, 'rb') as handle: + self.dictionary = pickle.load(handle) + + + + + +if __name__ == '__main__': + + pass + + + + diff --git a/plsa/plsa/setup.py b/plsa/plsa/setup.py new file mode 100644 index 0000000..157dbc5 --- /dev/null +++ b/plsa/plsa/setup.py @@ -0,0 +1,16 @@ +#@ -@*@- coding: utf@-@8 -*- +from distutils.core import setup +from distutils.extension import Extension +from Cython.Distutils import build_ext + +import numpy as np + +setup( + name="plsa", + cmdclass = {'build_ext': build_ext}, + ext_modules = [Extension("_plsa", ["_plsa.pyx", "plsa_train.c"], + include_dirs = 
[np.get_include(),'.'], + extra_compile_args=['-O3'])], + py_modules = ['plsa',], +) + diff --git a/plsa/plsa/tester_python3.py b/plsa/plsa/tester_python3.py new file mode 100644 index 0000000..0915eb0 --- /dev/null +++ b/plsa/plsa/tester_python3.py @@ -0,0 +1,206 @@ +__author__ = 'eyob' +# Tested on python3.6 + +import psutil +print('===================ram used at program start:',float(list(psutil.virtual_memory())[3])/1073741824.0,'GB') + +import os +import sys +import pathlib +import csv +import random +import datetime +import time + +sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/plsa') +sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/preprocessing') + +import example_plsa as pplsa +import cleansing as pclean + +class TopicAnalysis: + + def __init__(self, path,channel=''): + + self.data_path = path + self.channel = channel + self.root_path = str(pathlib.Path(os.path.abspath('')).parents[2]) + '/appData/plsa/' + print(self.root_path) + self.extracted_folder = self.root_path + 'extracted/' + self.file_dict = self.root_path + 'dict/' + self.source_texts = self.root_path + 'extracted/' + self.output_dir = self.root_path + 'cleaned/' + print(self.output_dir) + self.folder = self.root_path + 'cleaned/' + self.dict_path = self.root_path + 'dict/' + self.plsa_parameters_path = self.root_path + 'plsa-parameters/' + self.PLSA_PARAMETERS_PATH = '' + + # self.messages + # self.unique_folder_naming + + def __del__(self): + + # Close db connections + pass + + def read_csv(self): + + messages_list = [] + + with open(self.data_path, mode='r') as csv_file: + csv_reader = csv.DictReader(csv_file, delimiter=',') + for row in csv_reader: + messages_list.append(row) + + self.messages = messages_list + + + + def write_to_files_slack(self): + + self.read_csv() + + + self.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + self.channel + '/' + 
print(self.unique_folder_naming) + + os.mkdir(self.extracted_folder+self.unique_folder_naming) + + idx = 0 + + for row in self.messages: + if row['subtype'] == 'chat': + file = self.extracted_folder+self.unique_folder_naming+str(idx+2)+'.txt' + if self.channel == '': + with open(file, 'w') as f: + f.write(row['text']) + else: + if row['channel_name'] == self.channel: + with open(file, 'w') as f: + f.write(row['text']) + else: + continue + + idx = idx + 1 + + + def generate_topics(self): + + start_time_1 = time.time() + + pplsa.file_parts_number=10 + pclean.file_parts_number = 10 + pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] +'_dict' + pclean.source_texts = self.source_texts + self.unique_folder_naming + '*.txt' + pclean.output_dir = self.output_dir + self.unique_folder_naming + + os.mkdir(pclean.output_dir) + + + # Do cleansing on the data and turing it to bad-of-words model + pclean.pre_pro() + + # Train using PLSA + pplsa.folder = pclean.output_dir[:-1] + pplsa.dict_path = pclean.file_dict + pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming + pplsa.PATH = pplsa.PLSA_PARAMETERS_PATH + 'topic-by-doc-matirx' + pplsa.PATH_word_by_topic_conditional = pplsa.PLSA_PARAMETERS_PATH + 'word_by_topic_conditional' + pplsa.logL_pic = pplsa.PLSA_PARAMETERS_PATH + 'logL.png' + + # Folder paths to delete + self.PLSA_PARAMETERS_PATH = pplsa.PLSA_PARAMETERS_PATH + self.output_dir_stream = pclean.output_dir + self.file_dict_stream = pclean.file_dict + + + + os.mkdir(pplsa.PLSA_PARAMETERS_PATH) + + pplsa.main() + + end_time_1 = time.time() + + print('Total training time took:',round((end_time_1 - start_time_1) / 60, 4)) + + + + + + + + + +def run_plsa_slack(): + path_1 = str(pathlib.Path(os.path.abspath('')).parents[2]) + '/appData/misc/slack_messages.csv' + print(path_1) + s = TopicAnalysis(path_1,'singnet') + s.write_to_files_slack() + s.generate_topics() + + +def run_1(): + + t = TopicAnalysis('local') + + + pass + + +def 
test_preprocessing(): + + root_folder = str(pathlib.Path(os.path.abspath('')).parents[1])+'/appData/plsa/test/' + + pclean.file_parts_number=10 + pplsa.file_parts_number = 10 + pclean.file_dict = root_folder + 'dict/test_dict' + pclean.source_texts = root_folder + 'extracted/*.txt' + pclean.output_dir = root_folder + 'cleaned/' + + + # Do cleansing on the data and turing it to bad-of-words model + pclean.pre_pro() + + # Train using PLSA + pplsa.folder = pclean.output_dir[:-1] + pplsa.dict_path = pclean.file_dict + pplsa.folder = pclean.output_dir[:-1] + pplsa.main() + + + + +def isEnglish(s): + print(s) + # s = s.decode('utf-8') + # try: + # s.encode('ascii') + # except UnicodeEncodeError: + # return False + # else: + # return True + + try: + s.encode(encoding='utf-8').decode('ascii') + except UnicodeDecodeError: + return False + else: + return True + + + + + + +__end__ = '__end__' + + +if __name__ == '__main__': + + run_plsa_slack() + # run_1() + # test_preprocessing() + # + # print(isEnglish('meeeee')) + pass diff --git a/plsa/plsa/tests.py b/plsa/plsa/tests.py new file mode 100644 index 0000000..dab2a5c --- /dev/null +++ b/plsa/plsa/tests.py @@ -0,0 +1,42 @@ + + +import unittest + +from plsa import pLSA, normalize +import numpy as np + +class UtilsTest(unittest.TestCase): + + def test_normalize(self): + # 1d-case + a = normalize(np.random.random(10)) + self.assertAlmostEqual(1.0, np.sum(a)) + + # 1d-case where all values are 0 + a = normalize(np.array([0.0] * 10)) + self.assertEqual(0.0, np.sum(a)) + + # 2d-case, axis 1 + M = normalize(np.random.random(20).reshape(2,10), axis=1) + res = M.sum(axis=1) # a 2-array + self.assertAlmostEqual(1.0, res[0]) + self.assertAlmostEqual(1.0, res[1]) + + # 2d-case, axis 0 + M = normalize(np.random.random(20).reshape(10,2), axis=0) + res = M.sum(axis=0) + self.assertAlmostEqual(1.0, res[0]) + self.assertAlmostEqual(1.0, res[1]) + + # 2d-case, where the normalized axis has length 1 + M_ = np.random.random(10).reshape(10,1) + M 
= normalize(M_, axis=1) + self.assertTrue(np.array_equal(M, M_)) + + M_ = np.random.random(10).reshape(1,10) + M = normalize(M_, axis=0) + self.assertTrue(np.array_equal(M, M_)) + + +if __name__ == "__main__": + unittest.main() diff --git a/plsa/plsa/tfidf/__init__.py b/plsa/plsa/tfidf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/plsa/plsa/tfidf/featsel.py b/plsa/plsa/tfidf/featsel.py new file mode 100644 index 0000000..f9d096e --- /dev/null +++ b/plsa/plsa/tfidf/featsel.py @@ -0,0 +1,116 @@ +from __future__ import division # enable float division by default + +import numpy as np +from numpy import log2 + +from tfidf import vocab_array, vocab_dict + +def get_counts(td, delta, t, c): + in_c = delta[:,c] == 1 # |X|-array + not_in_c = delta[:,c] == 0 # |X|-array + N11 = np.sum(td[t,in_c] > 0) # contains t and is in c + N01 = np.sum(td[t,in_c] == 0) + N10 = np.sum(td[t,not_in_c] > 0) + N00 = np.sum(td[t,not_in_c] == 0) + return (N11,N01,N10,N00) + +def get_group_counts(N11,N01,N10,N00): + N1_ = N10 + N11 + N_1 = N11 + N01 + N0_ = N01 + N00 + N_0 = N10 + N00 + N = N10 + N11 + N01 + N00 + return (N1_,N_1,N0_,N_0,N) + +def mutual_information(td, delta): + """ + td: a term-document V x |X| matrix + delta: |X| x M matrix where delta(i,j) = 1 if document i belongs to class j + + output: a V x M matrix of scores I(t,c) + """ + V,X = td.shape + X_,M = delta.shape + + assert(X==X_) + + I = np.zeros((V,M), dtype=np.double) + + for t in range(V): + for c in range(M): + N11,N01,N10,N00 = get_counts(td, delta, t, c) + N1_,N_1,N0_,N_0,N = get_group_counts(N11,N01,N10,N00) + + # FIXME: how to deal with log2(0) when the numerator is 0? 
+ + I[t,c] = N11/N * log2((N*N11)/(N1_*N_1)) + \ + N01/N * log2((N*N01)/(N0_*N_1)) + \ + N10/N * log2((N*N10)/(N1_*N_0)) + \ + N00/N * log2((N*N00)/(N0_*N_0)) + + return I + +def chi2(td, delta): + """ + td: a term-document V x |X| matrix + delta: |X| x M matrix where delta(i,j) = 1 if document i belongs to class j + + output: a V x M matrix of scores chi2(t,c) + """ + V,X = td.shape + X_,M = delta.shape + + assert(X==X_) + + chi2m = np.zeros((V,M), dtype=np.double) + + for t in range(V): + for c in range(M): + N11,N01,N10,N00 = get_counts(td, delta, t, c) + N1_,N_1,N0_,N_0,N = get_group_counts(N11,N01,N10,N00) + + chi2m[t,c] = (N*(N11*N00-N10*N01)**2)/(N1_*N_1*N0_*N_0) + + return chi2m + +def select_max(td, vocab, A, K): + """ + Select the best K/M features for each of the M classes + + td: a term-document V x |X| matrix + delta: |X| x M matrix where delta(i,j) = 1 if document i belongs to class j + + A: matrix returned by chi2 or mutual_information + + output: the new reduced term-document matrix and the new vocabulary dict + """ + V, M = A.shape + + d = {} + + for m in range(M): + k = 1 + # best features which are not selected yet + best_feat = [a for a in A[:,m].argsort()[::-1] if not a in d] + d.update(dict((a,1) for a in best_feat[:int(K/M)])) + + best_feat = np.array(d.keys()) + varr = vocab_array(vocab) + + return td[best_feat, :], vocab_dict(varr[best_feat]) + +def select_avg(td, vocab, A, K, weights=None): + """ + Select the best K features by averaging the scores + + td: a term-document V x |X| matrix + delta: |X| x M matrix where delta(i,j) = 1 if document i belongs to class j + + A: matrix returned by chi2 or mutual_information + + output: the new reduced term-document matrix and the new vocabulary dict + """ + a = np.average(A, axis=1, weights=weights) + best_feat = a.argsort()[::-1][:K] + varr = vocab_array(vocab) + return td[best_feat, :], vocab_dict(varr[best_feat]) diff --git a/plsa/plsa/tfidf/porter.py b/plsa/plsa/tfidf/porter.py new file mode 
100644 index 0000000..cb49972 --- /dev/null +++ b/plsa/plsa/tfidf/porter.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python + +"""Porter Stemming Algorithm +This is the Porter stemming algorithm, ported to Python from the +version coded up in ANSI C by the author. It may be be regarded +as canonical, in that it follows the algorithm presented in + +Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, +no. 3, pp 130-137, + +only differing from it at the points maked --DEPARTURE-- below. + +See also http://www.tartarus.org/~martin/PorterStemmer + +The algorithm as described in the paper could be exactly replicated +by adjusting the points of DEPARTURE, but this is barely necessary, +because (a) the points of DEPARTURE are definitely improvements, and +(b) no encoding of the Porter stemmer I have seen is anything like +as exact as this version, even with the points of DEPARTURE! + +Vivake Gupta (v@nano.com) + +Release 1: January 2001 + +Further adjustments by Santiago Bruno (bananabruno@gmail.com) +to allow word input not restricted to one word per line, leading +to: + +release 2: July 2008 +""" + +import sys + +class PorterStemmer: + + def __init__(self): + """The main part of the stemming algorithm starts here. + b is a buffer holding a word to be stemmed. The letters are in b[k0], + b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is + readjusted downwards as the stemming progresses. Zero termination is + not in fact used in the algorithm. + + Note that only lower case sequences are stemmed. Forcing to lower case + should be done before stem(...) is called. 
+ """ + + self.b = "" # buffer for word to be stemmed + self.k = 0 + self.k0 = 0 + self.j = 0 # j is a general offset into the string + + def cons(self, i): + """cons(i) is TRUE <=> b[i] is a consonant.""" + if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u': + return 0 + if self.b[i] == 'y': + if i == self.k0: + return 1 + else: + return (not self.cons(i - 1)) + return 1 + + def m(self): + """m() measures the number of consonant sequences between k0 and j. + if c is a consonant sequence and v a vowel sequence, and <..> + indicates arbitrary presence, + + gives 0 + vc gives 1 + vcvc gives 2 + vcvcvc gives 3 + .... + """ + n = 0 + i = self.k0 + while 1: + if i > self.j: + return n + if not self.cons(i): + break + i = i + 1 + i = i + 1 + while 1: + while 1: + if i > self.j: + return n + if self.cons(i): + break + i = i + 1 + i = i + 1 + n = n + 1 + while 1: + if i > self.j: + return n + if not self.cons(i): + break + i = i + 1 + i = i + 1 + + def vowelinstem(self): + """vowelinstem() is TRUE <=> k0,...j contains a vowel""" + for i in range(self.k0, self.j + 1): + if not self.cons(i): + return 1 + return 0 + + def doublec(self, j): + """doublec(j) is TRUE <=> j,(j-1) contain a double consonant.""" + if j < (self.k0 + 1): + return 0 + if (self.b[j] != self.b[j-1]): + return 0 + return self.cons(j) + + def cvc(self, i): + """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant + and also if the second c is not w,x or y. this is used when trying to + restore an e at the end of a short e.g. + + cav(e), lov(e), hop(e), crim(e), but + snow, box, tray. 
+ """ + if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2): + return 0 + ch = self.b[i] + if ch == 'w' or ch == 'x' or ch == 'y': + return 0 + return 1 + + def ends(self, s): + """ends(s) is TRUE <=> k0,...k ends with the string s.""" + length = len(s) + if s[length - 1] != self.b[self.k]: # tiny speed-up + return 0 + if length > (self.k - self.k0 + 1): + return 0 + if self.b[self.k-length+1:self.k+1] != s: + return 0 + self.j = self.k - length + return 1 + + def setto(self, s): + """setto(s) sets (j+1),...k to the characters in the string s, readjusting k.""" + length = len(s) + self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:] + self.k = self.j + length + + def r(self, s): + """r(s) is used further down.""" + if self.m() > 0: + self.setto(s) + + def step1ab(self): + """step1ab() gets rid of plurals and -ed or -ing. e.g. + + caresses -> caress + ponies -> poni + ties -> ti + caress -> caress + cats -> cat + + feed -> feed + agreed -> agree + disabled -> disable + + matting -> mat + mating -> mate + meeting -> meet + milling -> mill + messing -> mess + + meetings -> meet + """ + if self.b[self.k] == 's': + if self.ends("sses"): + self.k = self.k - 2 + elif self.ends("ies"): + self.setto("i") + elif self.b[self.k - 1] != 's': + self.k = self.k - 1 + if self.ends("eed"): + if self.m() > 0: + self.k = self.k - 1 + elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem(): + self.k = self.j + if self.ends("at"): self.setto("ate") + elif self.ends("bl"): self.setto("ble") + elif self.ends("iz"): self.setto("ize") + elif self.doublec(self.k): + self.k = self.k - 1 + ch = self.b[self.k] + if ch == 'l' or ch == 's' or ch == 'z': + self.k = self.k + 1 + elif (self.m() == 1 and self.cvc(self.k)): + self.setto("e") + + def step1c(self): + """step1c() turns terminal y to i when there is another vowel in the stem.""" + if (self.ends("y") and self.vowelinstem()): + self.b = self.b[:self.k] + 'i' + self.b[self.k+1:] + + def 
step2(self): + """step2() maps double suffices to single ones. + so -ization ( = -ize plus -ation) maps to -ize etc. note that the + string before the suffix must give m() > 0. + """ + if self.b[self.k - 1] == 'a': + if self.ends("ational"): self.r("ate") + elif self.ends("tional"): self.r("tion") + elif self.b[self.k - 1] == 'c': + if self.ends("enci"): self.r("ence") + elif self.ends("anci"): self.r("ance") + elif self.b[self.k - 1] == 'e': + if self.ends("izer"): self.r("ize") + elif self.b[self.k - 1] == 'l': + if self.ends("bli"): self.r("ble") # --DEPARTURE-- + # To match the published algorithm, replace this phrase with + # if self.ends("abli"): self.r("able") + elif self.ends("alli"): self.r("al") + elif self.ends("entli"): self.r("ent") + elif self.ends("eli"): self.r("e") + elif self.ends("ousli"): self.r("ous") + elif self.b[self.k - 1] == 'o': + if self.ends("ization"): self.r("ize") + elif self.ends("ation"): self.r("ate") + elif self.ends("ator"): self.r("ate") + elif self.b[self.k - 1] == 's': + if self.ends("alism"): self.r("al") + elif self.ends("iveness"): self.r("ive") + elif self.ends("fulness"): self.r("ful") + elif self.ends("ousness"): self.r("ous") + elif self.b[self.k - 1] == 't': + if self.ends("aliti"): self.r("al") + elif self.ends("iviti"): self.r("ive") + elif self.ends("biliti"): self.r("ble") + elif self.b[self.k - 1] == 'g': # --DEPARTURE-- + if self.ends("logi"): self.r("log") + # To match the published algorithm, delete this phrase + + def step3(self): + """step3() dels with -ic-, -full, -ness etc. 
similar strategy to step2.""" + if self.b[self.k] == 'e': + if self.ends("icate"): self.r("ic") + elif self.ends("ative"): self.r("") + elif self.ends("alize"): self.r("al") + elif self.b[self.k] == 'i': + if self.ends("iciti"): self.r("ic") + elif self.b[self.k] == 'l': + if self.ends("ical"): self.r("ic") + elif self.ends("ful"): self.r("") + elif self.b[self.k] == 's': + if self.ends("ness"): self.r("") + + def step4(self): + """step4() takes off -ant, -ence etc., in context vcvc.""" + if self.b[self.k - 1] == 'a': + if self.ends("al"): pass + else: return + elif self.b[self.k - 1] == 'c': + if self.ends("ance"): pass + elif self.ends("ence"): pass + else: return + elif self.b[self.k - 1] == 'e': + if self.ends("er"): pass + else: return + elif self.b[self.k - 1] == 'i': + if self.ends("ic"): pass + else: return + elif self.b[self.k - 1] == 'l': + if self.ends("able"): pass + elif self.ends("ible"): pass + else: return + elif self.b[self.k - 1] == 'n': + if self.ends("ant"): pass + elif self.ends("ement"): pass + elif self.ends("ment"): pass + elif self.ends("ent"): pass + else: return + elif self.b[self.k - 1] == 'o': + if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass + elif self.ends("ou"): pass + # takes care of -ous + else: return + elif self.b[self.k - 1] == 's': + if self.ends("ism"): pass + else: return + elif self.b[self.k - 1] == 't': + if self.ends("ate"): pass + elif self.ends("iti"): pass + else: return + elif self.b[self.k - 1] == 'u': + if self.ends("ous"): pass + else: return + elif self.b[self.k - 1] == 'v': + if self.ends("ive"): pass + else: return + elif self.b[self.k - 1] == 'z': + if self.ends("ize"): pass + else: return + else: + return + if self.m() > 1: + self.k = self.j + + def step5(self): + """step5() removes a final -e if m() > 1, and changes -ll to -l if + m() > 1. 
+ """ + self.j = self.k + if self.b[self.k] == 'e': + a = self.m() + if a > 1 or (a == 1 and not self.cvc(self.k-1)): + self.k = self.k - 1 + if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1: + self.k = self.k -1 + + def stem(self, p, i=None, j=None): + """In stem(p,i,j), p is a char pointer, and the string to be stemmed + is from p[i] to p[j] inclusive. Typically i is zero and j is the + offset to the last character of a string, (p[j+1] == '\0'). The + stemmer adjusts the characters p[i] ... p[j] and returns the new + end-point of the string, k. Stemming never increases word length, so + i <= k <= j. To turn the stemmer into a module, declare 'stem' as + extern, and delete the remainder of this file. + """ + # copy the parameters into statics + if i is None: i = 0 + if j is None: j = len(p)-1 + self.b = p + self.k = j + self.k0 = i + if self.k <= self.k0 + 1: + return self.b # --DEPARTURE-- + + # With this line, strings of length 1 or 2 don't go through the + # stemming process, although no mention is made of this in the + # published algorithm. Remove the line to match the published + # algorithm. 
+ + self.step1ab() + self.step1c() + self.step2() + self.step3() + self.step4() + self.step5() + return self.b[self.k0:self.k+1] + + def stem_sentence(self, txt): + return " ".join(map(self.stem, txt.split())) + + def stem_documents(self, docs): + return map(self.stem_sentence, docs) + + +if __name__ == '__main__': + p = PorterStemmer() + if len(sys.argv) > 1: + for f in sys.argv[1:]: + infile = open(f, 'r') + while 1: + output = '' + word = '' + line = infile.readline() + if line == '': + break + for c in line: + if c.isalpha(): + word += c.lower() + else: + if word: + output += p.stem(word, 0,len(word)-1) + word = '' + output += c.lower() + print (output,) + infile.close() diff --git a/plsa/plsa/tfidf/preprocessing.py b/plsa/plsa/tfidf/preprocessing.py new file mode 100644 index 0000000..61d687a --- /dev/null +++ b/plsa/plsa/tfidf/preprocessing.py @@ -0,0 +1,99 @@ +import re +import string +import glob + +# file_list = [] +# file_parts_number = 9 +# file_parts_number = 8 + +def strip_punctuation(s): + return re.sub("([%s]+)" % string.punctuation, " ", s) + +def strip_punctuation2(s): + return s.translate(string.maketrans("",""), string.punctuation) + +def strip_tags(s): + # assumes s is already lowercase + return re.sub(r"<([^>]+)>", "", s) + +def strip_short(s, minsize=3): + return " ".join([e for e in s.split() if len(e) >= minsize]) + +def strip_numeric(s): + return re.sub(r"[0-9]+", "", s) + +def strip_non_alphanum(s): + # assumes s is already lowercase + return re.sub(r"[^a-z0-9\ ]", " ", s) + +def strip_multiple_whitespaces(s): + return re.sub(r"(\s|\\n|\\r|\\t)+", " ", s) + #return s + +def split_alphanum(s): + s = re.sub(r"([a-z]+)([0-9]+)", r"\1 \2", s) + return re.sub(r"([0-9]+)([a-z]+)", r"\1 \2", s) + +STOPWORDS = """ +a about again all almost also although always among an +and another any are as at +be because been before being between both but by +can could +did do does done due during +each either enough especially etc +for found from further +had 
has have having here how however +i if in into is it its itself +just +kg km +made mainly make may mg might ml mm most mostly must +nearly neither no nor not +obtained of often on our overall +perhaps pmid +quite +rather really regarding +seem seen several should show showed shown shows significantly +since so some such +than that the their theirs them then there therefore these they too +this those through thus to +upon use used using +various very +was we were what when which while with within without would will +""" + +STOPWORDS = dict((w,1) for w in STOPWORDS.strip().replace("\n", " ").split()) + +def remove_stopwords(s): + return " ".join([w for w in s.split() if w not in STOPWORDS]) + +# DEFAULT_FILTERS = [str.lower, strip_tags, strip_punctuation, +# strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short] + +DEFAULT_FILTERS = [str.lower, strip_tags, strip_punctuation, +strip_multiple_whitespaces, strip_numeric, strip_short] + +def preprocess_string(s, filters=DEFAULT_FILTERS): + for f in filters: + s = f(s) + return s + +def preprocess_documents(docs): + # print docs + return map(preprocess_string, docs) + +def read_file(path): + f = open(path) + ret = f.read() + # # print path + # file=str(path).split('/')[file_parts_number] + # file_list.append((file)) + return ret + +def read_files(pattern): + # global file_list + # file_list = [] + return map(read_file, glob.glob(pattern)) + +# def empty_file_list(): +# global file_list +# file_list = [] \ No newline at end of file diff --git a/plsa/plsa/tfidf/smoothing.py b/plsa/plsa/tfidf/smoothing.py new file mode 100644 index 0000000..5595646 --- /dev/null +++ b/plsa/plsa/tfidf/smoothing.py @@ -0,0 +1,18 @@ + +def laplace(td): + """ + Laplace smoothing a.k.a as "add one" smoothing. + + In a bayesian setting, this corresponds to a uniform prior on events. + + It has a tendency to over-estimate probabilities of unseen terms for large + matrices. 
+ + td: V x X term document matrix + """ + V, X = td.shape + return (1.0 + td) / (V + td.sum(axis=0)) + +def lidstone(td, lambda_=0.5): + V, X = td.shape + return (lambda_ + td) / (V * lambda_ + td.sum(axis=0)) \ No newline at end of file diff --git a/plsa/plsa/tfidf/tfidf.py b/plsa/plsa/tfidf/tfidf.py new file mode 100644 index 0000000..3609441 --- /dev/null +++ b/plsa/plsa/tfidf/tfidf.py @@ -0,0 +1,168 @@ +## -@@*@@-@@ :D coding: utf@!-8 -*- + +from math import log + +import numpy as np +import scipy.sparse as sp +import tables as tb + +def tokenize(text): + return text.split() + +def tc(dataset, tokenizer=tokenize): + vocab = {} + docs = [] + + for doc in dataset: + if doc == '': + continue + d = {} # token => count + + for term in tokenizer(doc): + vocab[term] = 1 + d[term] = d.get(term, 0) + 1 + + docs.append(d) + + sorted_terms = sorted(vocab.keys()) + vocab = dict([(t, i) for i, t in enumerate(sorted_terms)]) + + + return docs, vocab + +def tf_from_tc(term_counts): + docs = [] + + for doc in term_counts: + d = {} + length = sum(doc.values()) + for term, count in doc.items(): + d[term] = float(count) / length + docs.append(d) + + return docs + +def idc_from_tc(term_counts): + t = {} + for doc in term_counts: + for term in doc: + t[term] = t.get(term, 0) + 1 + return t + +def idf_from_tc(term_counts): + n_docs = len(term_counts) + idf = {} + idc = idc_from_tc(term_counts) + for term in idc: + idf[term] = log(n_docs*1.0/(idc[term])) + return idf + +def tf_mul_idf(tf, idf): + docs = [] + + for doc in tf: + d = {} + for term in doc: + d[term] = doc[term] * idf[term] + docs.append(d) + + return docs + +def to_vector(idf_dict, vocab): + ret = np.zeros(len(idf_dict)) + for term, idx in vocab.items(): + ret[idx] = idf_dict[term] + return ret + +def to_sparse_matrix(tfidf_dict, vocab): + # fd = tb.open_file ( fname, mode = 'w' ) + tfm = sp.lil_matrix((len(vocab), len(tfidf_dict)), dtype=np.double) + # print 'tfm',shape(tfm) + # filters = tb.Filters(complevel=5, 
complib='blosc') + # out_data = f.create_earray(f.root, 'data', tb.Float32Atom(), shape=(0,), filters=filters) + # out_indices = f.create_earray(f.root, 'indices', tb.Int32Atom(),shape=(0,), filters=filters) + # out_indptr = f.create_earray(f.root, 'indptr', tb.Int32Atom(), shape=(0,), filters=filters) + # out_indptr.append(np.array([0])) #this is needed as a first indptr + # max_indptr = 0 + for j, doc in enumerate(tfidf_dict): + for term in doc: + try: + i = vocab[term] + tfm[i,j] = doc[term] + # res=doc[term] + # print res + # + # out_data.append((res[np.newaxis,:])) + except KeyError: + pass + # indices = res.indices + # indptr = res.indptr + # out_indices.append(indices) + # out_indptr.append(max_indptr+indptr[1:]) + # max_indptr = indices.shape[0] + # print tfm.shape + # info = fd.create_group ( '/', 'info' ) + # fd.create_vlarray ( info, 'dtype', tfm.dtype) + # fd.create_vlarray ( info, 'shape', tfm.shape ) + # fd.create_vlarray ( info, 'format', tfm.format ) + + # data = fd.create_group ( '/', 'data' ) + # vld = fd.create_earray ( data, 'data',pt.Float64Atom(),shape=(0,) ) + # vlr = fd.create_earray ( data, 'rows',pt.UInt32Atom(),shape=(0,)) + # for u in xrange ( tfm.shape [ 0 ] ): + # vld.append ( tfm.data [ u ] ) + # vlr.append ( tfm.rows [ u ] ) + # fd.close() + return tfm + +def inverse_vocab(vocab): + """ + Converts a vocab dictionary term => index to index => term + """ + return dict((i,t) for t,i in vocab.items()) + +def vocab_array(vocab): + """ + Converts vocab dictionary to vocab array + """ + return np.char.array(sorted(vocab.keys(), + lambda a,b: cmp(vocab[a],vocab[b]))) + +def vocab_dict(vocab): + """ + Converts vocab array to vocab dictionary + """ + return dict((term,i) for i,term in enumerate(vocab)) + +def replace_vocab(td, oldvocab, newvocab): + """ + td: V x X term-document matrix + oldvocab: dictionary + newvocab: dictionary + """ + newtd = np.zeros((len(newvocab),td.shape[1])) + for term in newvocab: + try: + newtd[newvocab[term]] = 
td[oldvocab[term]] + except KeyError: + newtd[newvocab[term]] = 0 + return newtd + +class tfidf(object): + def __init__(self, dataset, tokenizer=tokenize): + self._dataset = dataset + self._tokenizer = tokenizer + + def as_dict(self): + term_counts, vocab = tc(self._dataset, self._tokenizer) + tf = tf_from_tc(term_counts) + idf = idf_from_tc(term_counts) + return tf_mul_idf(tf, idf), vocab + + def as_sparse_matrix(self): + tfidf_dict, vocab = self.as_dict() + return to_sparse_matrix(tfidf_dict, vocab), vocab + + def as_array(self): + tfm, vocab = self.as_sparse_matrix() + return tfm.toarray(), vocab diff --git a/plsa/preprocessing/__init__.py b/plsa/preprocessing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/plsa/preprocessing/cleansing.py b/plsa/preprocessing/cleansing.py new file mode 100644 index 0000000..ecf2d9d --- /dev/null +++ b/plsa/preprocessing/cleansing.py @@ -0,0 +1,287 @@ +__author__ = 'masresha' +# Runs on python3.6 +# this block of code first reads a files directory + + +import sys +import pathlib +import os + +sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa') +sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/preprocessing') + + +# import codecs + +import glob +import nltk +from nltk.corpus import stopwords +import pathlib as path +from stemming.porter2 import stem +import string +import preprocessing.porter_dictionary as pp +# import re + +port_dict = pp.porter_dictionary() + + + +# file_parts_number = 9 + + +file_dict = '' +source_texts = '' +output_dir = '' +file_parts_number = 8 +# file_parts_number = 7 # Inspire + + + +# fileList = glob.glob(source_texts) + + +punct=['…','•','”','→','↑','“','‘','’','—','£','€','$'] + + + + + + + +pos_dict = {'JJ': 'a', 'JJR': 'a', + 'JJS': 'a', 'NN': 'n', + 'NNS': 'n', 'NNP': 'n', + 'NNPS': 'n', 'PRP': 'n', + 'PRP$': 'n', 'RB': 'r', + 'RBR': 'r', 'RBS': 'r', + 'VB': 'v', 'VBD': 'v', + 'VBG': 'v', 'VBN': 'v', + 'VBZ': 'v', } + +wnl = 
nltk.WordNetLemmatizer()  # (chunk boundary: completes the `wnl = ` assignment split onto the previous line)
stop = stopwords.words('english')

z = []            # module-level scratch lists (shadowed inside pre_pro)
wnl_tokens = []


def is_float(x):
    """Return True if x can be parsed as a float."""
    try:
        float(x)
        return True
    except (TypeError, ValueError):
        # narrowed from a bare except: float() raises only these two
        return False


def has_token_punct(token):
    """Return True if token contains ASCII or extra Unicode punctuation.

    Fixed: the original fell off the end (implicitly returning None) for
    clean tokens and contained an unreachable `break` after `return True`.
    """
    for char in token:
        if char in string.punctuation or char in punct:
            return True
    return False


def find_punct(token):
    """Return the list of punctuation characters occurring in token."""
    return [char for char in token
            if char in string.punctuation or char in punct]


def isEnglish(s):
    """Crude English test: True iff s is pure ASCII."""
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    return True


def return_cleaned(t1):
    """Clean a list of raw tokens.

    Drops tokens outside 2..12 characters, numbers, URL-ish strings
    (www./http/.com/...), and non-ASCII tokens; strips punctuation from the
    rest, turning '-' into a space so hyphenated words can be re-split by the
    caller.
    """
    kept = []
    for i in t1:
        if len(i) < 2 or len(i) > 12:
            continue
        if i[:2] == '//' or is_float(i):
            continue
        if ('www.' in i) or ('xxx.' in i) or ('yyy.' in i) or ('.gov' in i):
            continue
        if str(i).endswith(('.com', '.html', '.php', '.aspx', '.asp', 'htm', 'pdf')):
            continue
        if str(i).startswith(('http', '/')):  # 'http' also covers 'https'
            continue
        kept.append(i)

    # Second pass: strip punctuation; drop what is left if it became a number
    # or is not ASCII.
    token_list = []
    for token in kept:
        if has_token_punct(token):
            new_token = token
            for ch in find_punct(token):
                # '-' becomes a space (word boundary); other punctuation is dropped
                new_token = str(new_token).replace(ch, ' ' if ch == '-' else '')
            if not is_float(new_token) and isEnglish(new_token):
                token_list.append(new_token)
        elif isEnglish(token):
            token_list.append(token)
    return token_list


def pre_pro():
    # NOTE(review): this function continues beyond the end of this chunk;
    # only its first half is visible and edited here.
    fileList = glob.glob(source_texts)
    fileList_len = fileList.__len__() - 1  # kept: used only by a (removed) progress print
    k = 0
    print('------pre-process started-------')
    for files in fileList:
        tFile = open(files)
        line = tFile.read().lower()
        tFile.close()  # fix: the original never closed the input file
        tokens = nltk.word_tokenize(line)
        clean_tokens = return_cleaned(tokens)
        # drop whitespace-only tokens
        clean_tokens = filter(lambda name: name.strip(), clean_tokens)
        final_tok = []
        for tok in clean_tokens:
            tok = str(tok).rstrip(' ')
            if len(str(tok).split(' ')) > 1:
                # punctuation stripping introduced spaces: re-tokenize and re-clean
                for tk in return_cleaned(nltk.word_tokenize(tok)):
                    final_tok.append(tk)
            else:
                final_tok.append(tok)
        # fix: use the module-level stopword list instead of re-loading the
        # stopword corpus once per token
        filtered_words = [w for w in final_tok if w not in stop]
+ # print filtered_words + POS_Tokens = nltk.pos_tag(filtered_words) + + z = [] + for x in POS_Tokens: + try: + z.append(pos_dict[x[1]]) + except: + z.append('n') + + wnl_tokens = [] + for i in range(len(filtered_words)): + # if (len(clean_tokens[i])>3): + # wnl_tokens.append(wnl.lemmatize(clean_tokens[i], z[i])) + # else: + wnl_tokens.append(filtered_words[i]) + por_tokens = [stem(t) for t in wnl_tokens] + # print por_tokens + + temp_term1 = '' + term1 = '' + + for stmd in range(len(filtered_words)): + + term1 = por_tokens[stmd] + temp_term1 = filtered_words[stmd] + + # print term1,temp_term1 + + port_dict.add_element(stemmed=term1,nonstemmed=temp_term1) + + file_txt = open(output_dir + path.PurePath(files).parts[file_parts_number], "w") + + for i in por_tokens: + file_txt.write(i) + file_txt.write('\n') + file_txt.close() + # print('Processed ',k,'of',fileList_len) + k = k + 1 + port_dict.write_dict_to_file(file_dict) + print('***------pre-process finished--------') + +if __name__ == '__main__': + + pre_pro() + + + diff --git a/plsa/preprocessing/porter_dictionary.py b/plsa/preprocessing/porter_dictionary.py new file mode 100644 index 0000000..c9e77cf --- /dev/null +++ b/plsa/preprocessing/porter_dictionary.py @@ -0,0 +1,66 @@ +__author__ = 'eyob' + + +import pickle + +class porter_dictionary: + + + def __init__(self): + + self.dictionary = dict() + + + + def add_element(self,stemmed, nonstemmed): + + self.arr = [] + + if stemmed in self.dictionary: + # print('Stemmed word',stemmed,'present') + self.arr = self.dictionary[stemmed] + + if nonstemmed in self.arr: + # print('Nonstemmed word',nonstemmed,'present') + pass + else: + # print('Nonstemmed word',nonstemmed,'absent') + self.arr.append(nonstemmed) + self.dictionary.update({stemmed:self.arr}) + else: + # print('Stemmed word',stemmed,'absent') + self.arr.append(nonstemmed) + self.dictionary.update({stemmed:self.arr}) + + + + def write_dict_to_file(self,file_name): + + with open(file_name, 'wb') as handle: + 
pickle.dump(self.dictionary,handle,protocol=2) + + + self.sorted_keys = sorted(self.dictionary) + + with open(file_name+'.txt','w') as file1: + for i in self.sorted_keys: + file1.write(i+str(self.dictionary[i])+'\n') + + + + + def load_dict(self,file_name): + with open(file_name, 'rb') as handle: + self.dictionary = pickle.load(handle) + + + + + +if __name__ == '__main__': + + pass + + + +