From 4484aba5f9201b6f00c02c16002dc02ed28e63ff Mon Sep 17 00:00:00 2001 From: Eyob Yirdaw Date: Thu, 7 Feb 2019 17:59:32 +0300 Subject: [PATCH] Fixed plsa code to work with python EM algorithm, did init grpc code --- plsa/plsa/example_plsa.py | 1 + plsa/plsa/plsa.py | 57 ++++++++++++++++++++----------- plsa/plsa/plsa_wrapper.py | 2 -- servcie_spec/topic_analysis.proto | 32 +++++++++++++++++ topic_analysis_grpc.py | 7 ++++ 5 files changed, 77 insertions(+), 22 deletions(-) create mode 100644 servcie_spec/topic_analysis.proto create mode 100644 topic_analysis_grpc.py diff --git a/plsa/plsa/example_plsa.py b/plsa/plsa/example_plsa.py index 6454cf2..1b5a36a 100644 --- a/plsa/plsa/example_plsa.py +++ b/plsa/plsa/example_plsa.py @@ -57,6 +57,7 @@ beta = 1 min_iteration = 10 logL_pic = '' +eps = 0.01 number_of_words = 0 number_of_docs = 0 RAM_limit = 25 # In giga bytes diff --git a/plsa/plsa/plsa.py b/plsa/plsa/plsa.py index cdea6b2..72c64eb 100644 --- a/plsa/plsa/plsa.py +++ b/plsa/plsa/plsa.py @@ -45,8 +45,10 @@ # beta2 = 0.75 beta2 = 1 +eps2=0.01 +maxiter2=22 +HAVE_EXT = False # HAVE_EXT = True -HAVE_EXT = True min_iteration2 = 10 # 10 was the previous value init_with_seed = False @@ -110,7 +112,7 @@ def loglikelihood(td, p_z, p_w_z, p_d_z): def train(td, p_z, p_w_z, p_d_z, p_z_old, p_w_z_old, p_d_z_old, - maxiter, eps, + maxiter, eps,beta,min_iteration, folding_in, debug,logL_c): logL = [] @@ -120,7 +122,8 @@ def train(td, lik = loglikelihood(td, p_z, p_w_z, p_d_z) logL.append(lik) print ('Starting logL =',lik) - # beta2 = 1 + # beta = 1 + counter = 0 for iteration in range(1, maxiter+1): # Swap old and new p_d_z_old, p_d_z = (p_d_z, p_d_z_old) @@ -133,35 +136,36 @@ def train(td, p_w_z *= 0.0 p_z *= 0.0 - if (beta2 != 1): + if (beta != 1): print ('tempering started') - p_z_old = np.power(p_z_old, beta2) + p_z_old = np.power(p_z_old, beta) # print('Finished','p_z_old') - # beta2 = 1 - p_d_z_old = np.power(p_d_z_old, beta2) + # beta = 1 + p_d_z_old = np.power(p_d_z_old, beta) # print('Finished','p_d_z_old') - p_w_z_old = np.power(p_w_z_old, beta2) + p_w_z_old = np.power(p_w_z_old, beta) # print('Finished','p_w_z_old') print ('tempering finished') for w,d in zip(*td.nonzero()): # E-step - # beta2 = 0.75 + # beta = 0.75 p_z_d_w = p_z_old * p_d_z_old[d, :] * p_w_z_old[w, :] - # if (beta2 ==1): + # if (beta ==1): # p_z_d_w = p_z_old * p_d_z_old[d,:] * p_w_z_old[w,:] # else: - # p_z_old_beta2 = np.power(p_z_old,beta2) - # # print('Finished','p_z_old_beta2') - # # beta2 = 1 - # p_d_z_old_beta2 = np.power(p_d_z_old,beta2) - # # print('Finished','p_d_z_old_beta2') - # p_w_z_old_beta2 = np.power(p_w_z_old,beta2) - # # print('Finished','p_w_z_old_beta2') - # p_z_d_w = p_z_old_beta2 * p_d_z_old_beta2[d,:] * p_w_z_old_beta2[w,:] + # p_z_old_beta = np.power(p_z_old,beta) + # # print('Finished','p_z_old_beta') + # # beta = 1 + # p_d_z_old_beta = np.power(p_d_z_old,beta) + # # print('Finished','p_d_z_old_beta') + # p_w_z_old_beta = np.power(p_w_z_old,beta) + # # print('Finished','p_w_z_old_beta') + # p_z_d_w = p_z_old_beta * p_d_z_old_beta[d,:] * p_w_z_old_beta[w,:] + # p_z_d_w = p_z_old_beta * p_d_z_old_beta[d,:] * p_w_z_old_beta[w,:] # # print('Finished','p_z_d_w') @@ -178,6 +182,8 @@ def train(td, p_w_z[w,:] += s p_z += s + counter = iteration + # normalize normalize(p_d_z, axis=0, out=p_d_z) @@ -212,7 +218,7 @@ def train(td, print ("lik_diff =",lik_diff,'%') - if iteration > min_iteration2 and lik_diff < eps : + if iteration > min_iteration and lik_diff < eps : print ("No more progress, stopping EM at iteration", iteration) print ("LogL =", logL) axis1 = range(1,len(logL)+1) @@ -221,6 +227,14 @@ def train(td, break + print('Stopped at iteration ',counter) + + print("LogL =", logL) + axis1 = range(1, len(logL) + 1) + plt.plot(axis1, logL) + plt.savefig(logL_pic) + + # print 'p_z-----' # print p_z @@ -272,7 +286,7 @@ def random_init(self, Z, V, D): # np.savetxt("pdz.csv", self.p_d_z, delimiter=",") - def train(self, td, Z, maxiter=500, eps=0.01, folding_in=False,beta=beta2,min_iteration = min_iteration2): # eps=0.01 is default value + def train(self, td, Z, maxiter=maxiter2, eps=eps2, folding_in=False,beta=beta2,min_iteration = min_iteration2): # eps=0.01 is default value """ Train the model. @@ -289,6 +303,9 @@ def train(self, td, Z, maxiter=500, eps=0.01, folding_in=False,beta=beta2,min_it p_w_z_old = np.zeros_like(self.p_w_z) p_z_old = np.zeros_like(self.p_z) + global logL_pic + logL_pic = self.logL_pic + train_func = _plsa.train if HAVE_EXT else train print('RRRRRRRRRRRRRRRAM used just before training:', float(list(psutil.virtual_memory())[3]) / 1073741824.0, 'GB') train_func(td.astype(np.uint32), diff --git a/plsa/plsa/plsa_wrapper.py b/plsa/plsa/plsa_wrapper.py index c2457dc..951c9ec 100644 --- a/plsa/plsa/plsa_wrapper.py +++ b/plsa/plsa/plsa_wrapper.py @@ -100,8 +100,6 @@ def generate_topics_json(self): self.output_dir_stream = pclean.output_dir self.file_dict_stream = pclean.file_dict - - os.mkdir(pplsa.PLSA_PARAMETERS_PATH) pplsa.main() diff --git a/servcie_spec/topic_analysis.proto b/servcie_spec/topic_analysis.proto new file mode 100644 index 0000000..0ee0862 --- /dev/null +++ b/servcie_spec/topic_analysis.proto @@ -0,0 +1,32 @@ +syntax = "proto3"; + + + + +message PLSARequest { + + repeated string docs = 1; + repeated string edges = 2; +} + +message PLSAResponse{ + + repeated string topics = 1; + +} + + + + + +///// Topic analysis Services + +service NetowrkAnalyticsBipartite { + + + rpc BipartiteGraph (PLSARequest) returns (PLSAResponse) {}; + + +} + +///// End Network Analytics Services \ No newline at end of file diff --git a/topic_analysis_grpc.py b/topic_analysis_grpc.py new file mode 100644 index 0000000..b74e849 --- /dev/null +++ b/topic_analysis_grpc.py @@ -0,0 +1,7 @@ +# Tested on python3.6 + +import grpc +from concurrent import futures +import time +import logging +