1 change: 1 addition & 0 deletions plsa/plsa/example_plsa.py
@@ -57,6 +57,7 @@
beta = 1
min_iteration = 10
logL_pic = ''
eps = 0.01
number_of_words = 0
number_of_docs = 0
RAM_limit = 25 # In giga bytes
57 changes: 37 additions & 20 deletions plsa/plsa/plsa.py
@@ -45,8 +45,10 @@

# beta2 = 0.75
beta2 = 1
eps2=0.01
maxiter2=22
HAVE_EXT = False
# HAVE_EXT = True
HAVE_EXT = True
min_iteration2 = 10 # 10 was the previous value
init_with_seed = False

@@ -110,7 +112,7 @@ def loglikelihood(td, p_z, p_w_z, p_d_z):
def train(td,
p_z, p_w_z, p_d_z,
p_z_old, p_w_z_old, p_d_z_old,
maxiter, eps,
maxiter, eps,beta,min_iteration,
folding_in, debug,logL_c):

logL = []
@@ -120,7 +122,8 @@ def train(td,
lik = loglikelihood(td, p_z, p_w_z, p_d_z)
logL.append(lik)
print ('Starting logL =',lik)
# beta2 = 1
# beta = 1
counter = 0
for iteration in range(1, maxiter+1):
# Swap old and new
p_d_z_old, p_d_z = (p_d_z, p_d_z_old)
@@ -133,35 +136,36 @@ def train(td,
p_w_z *= 0.0
p_z *= 0.0

if (beta2 != 1):
if (beta != 1):
print ('tempering started')
p_z_old = np.power(p_z_old, beta2)
p_z_old = np.power(p_z_old, beta)
# print('Finished','p_z_old')
# beta2 = 1
p_d_z_old = np.power(p_d_z_old, beta2)
# beta = 1
p_d_z_old = np.power(p_d_z_old, beta)
# print('Finished','p_d_z_old')
p_w_z_old = np.power(p_w_z_old, beta2)
p_w_z_old = np.power(p_w_z_old, beta)
# print('Finished','p_w_z_old')
print ('tempering finished')


for w,d in zip(*td.nonzero()):
# E-step
# beta2 = 0.75
# beta = 0.75

p_z_d_w = p_z_old * p_d_z_old[d, :] * p_w_z_old[w, :]

# if (beta2 ==1):
# if (beta ==1):
# p_z_d_w = p_z_old * p_d_z_old[d,:] * p_w_z_old[w,:]
# else:
# p_z_old_beta2 = np.power(p_z_old,beta2)
# # print('Finished','p_z_old_beta2')
# # beta2 = 1
# p_d_z_old_beta2 = np.power(p_d_z_old,beta2)
# # print('Finished','p_d_z_old_beta2')
# p_w_z_old_beta2 = np.power(p_w_z_old,beta2)
# # print('Finished','p_w_z_old_beta2')
# p_z_d_w = p_z_old_beta2 * p_d_z_old_beta2[d,:] * p_w_z_old_beta2[w,:]
# p_z_old_beta = np.power(p_z_old,beta)
# # print('Finished','p_z_old_beta')
# # beta = 1
# p_d_z_old_beta = np.power(p_d_z_old,beta)
# # print('Finished','p_d_z_old_beta')
# p_w_z_old_beta = np.power(p_w_z_old,beta)
# # print('Finished','p_w_z_old_beta')
# p_z_d_w = p_z_old_beta * p_d_z_old_beta[d,:] * p_w_z_old_beta[w,:]
# p_z_d_w = p_z_old_beta * p_d_z_old_beta[d,:] * p_w_z_old_beta[w,:]
# # print('Finished','p_z_d_w')


@@ -178,6 +182,8 @@ def train(td,
p_w_z[w,:] += s
p_z += s

counter = iteration

# normalize
normalize(p_d_z, axis=0, out=p_d_z)

@@ -212,7 +218,7 @@ def train(td,
print ("lik_diff =",lik_diff,'%')


if iteration > min_iteration2 and lik_diff < eps :
if iteration > min_iteration and lik_diff < eps :
print ("No more progress, stopping EM at iteration", iteration)
print ("LogL =", logL)
axis1 = range(1,len(logL)+1)
@@ -221,6 +227,14 @@ def train(td,
break


print('Stopped at iteration ',counter)

print("LogL =", logL)
axis1 = range(1, len(logL) + 1)
plt.plot(axis1, logL)
plt.savefig(logL_pic)



# print 'p_z-----'
# print p_z
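
The hunk above threads beta, eps, and min_iteration through the module-level train function instead of relying on the hardcoded beta2/min_iteration2 globals, tempers the old estimates when beta != 1, and stops EM once the relative log-likelihood change falls below eps. A minimal sketch of those two ideas follows; the helper names temper and should_stop are hypothetical and not part of the PR, and the percentage form of lik_diff is an assumption based on the print statement above.

import numpy as np

def temper(p, beta):
    # Tempered EM: raise the previous estimates to the power beta (beta < 1 flattens them).
    return np.power(p, beta) if beta != 1 else p

def should_stop(logL, iteration, eps=0.01, min_iteration=10):
    # Stop once the relative log-likelihood change (in percent) drops below eps,
    # but never before min_iteration iterations have completed.
    if len(logL) < 2:
        return False
    lik_diff = abs((logL[-1] - logL[-2]) / logL[-2]) * 100.0
    return iteration > min_iteration and lik_diff < eps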
@@ -272,7 +286,7 @@ def random_init(self, Z, V, D):
# np.savetxt("pdz.csv", self.p_d_z, delimiter=",")


def train(self, td, Z, maxiter=500, eps=0.01, folding_in=False,beta=beta2,min_iteration = min_iteration2): # eps=0.01 is default value
def train(self, td, Z, maxiter=maxiter2, eps=eps2, folding_in=False,beta=beta2,min_iteration = min_iteration2): # eps=0.01 is default value
"""
Train the model.

@@ -289,6 +303,9 @@ def train(self, td, Z, maxiter=500, eps=0.01, folding_in=False,beta=beta2,min_it
p_w_z_old = np.zeros_like(self.p_w_z)
p_z_old = np.zeros_like(self.p_z)

global logL_pic
logL_pic = self.logL_pic

train_func = _plsa.train if HAVE_EXT else train
print('RRRRRRRRRRRRRRRAM used just before training:', float(list(psutil.virtual_memory())[3]) / 1073741824.0, 'GB')
train_func(td.astype(np.uint32),
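
For reference, a hedged usage sketch of the updated method signature. The model class name and the construction of td are assumptions (the class definition is outside this diff); only the keyword arguments and their defaults (maxiter2=22, eps2=0.01, beta2=1, min_iteration2=10) come from the changes above.

import numpy as np

# Assumed setup: td is a term-by-document count matrix (rows are words,
# matching the `for w, d in zip(*td.nonzero())` loop above), Z is the topic count.
td = np.random.randint(0, 5, size=(1000, 200)).astype(np.uint32)

model = PLSA()  # hypothetical class name exposing the train() method shown above
model.train(td, Z=20, maxiter=22, eps=0.01, folding_in=False, beta=1, min_iteration=10)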
2 changes: 0 additions & 2 deletions plsa/plsa/plsa_wrapper.py
@@ -100,8 +100,6 @@ def generate_topics_json(self):
self.output_dir_stream = pclean.output_dir
self.file_dict_stream = pclean.file_dict



os.mkdir(pplsa.PLSA_PARAMETERS_PATH)

pplsa.main()
32 changes: 32 additions & 0 deletions servcie_spec/topic_analysis.proto
@@ -0,0 +1,32 @@
syntax = "proto3";




message PLSARequest {

repeated string docs = 1;
repeated string edges = 2;
}

message PLSAResponse{

repeated string topics = 1;

}





///// Topic analysis Services

service NetowrkAnalyticsBipartite {


rpc BipartiteGraph (PLSARequest) returns (PLSAResponse) {};


}

///// End Network Analytics Services
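
A hedged sketch of exercising these messages from Python. The stub-generation command and the generated module name topic_analysis_pb2 follow the usual grpcio-tools conventions and are assumptions, not part of the PR.

# Generate stubs (assumes grpcio-tools is installed):
#   python -m grpc_tools.protoc -I servcie_spec --python_out=. --grpc_python_out=. servcie_spec/topic_analysis.proto
import topic_analysis_pb2  # assumed generated module name

request = topic_analysis_pb2.PLSARequest(
    docs=["first document text", "second document text"],
    edges=["doc_a doc_b"],
)
response = topic_analysis_pb2.PLSAResponse(topics=["topic 0: word1 word2"])
print(len(request.docs), len(response.topics))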
7 changes: 7 additions & 0 deletions topic_analysis_grpc.py
@@ -0,0 +1,7 @@
# Tested on python3.6

import grpc
from concurrent import futures
import time
import logging
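
The new module stops after its imports. A minimal server skeleton that could sit below them is sketched here, assuming stubs were generated as topic_analysis_pb2 / topic_analysis_pb2_grpc; the servicer and registration names follow protoc's conventions for the NetowrkAnalyticsBipartite service and are not part of the PR.

import topic_analysis_pb2        # assumed generated message module
import topic_analysis_pb2_grpc   # assumed generated service module

class NetowrkAnalyticsBipartiteServicer(topic_analysis_pb2_grpc.NetowrkAnalyticsBipartiteServicer):
    def BipartiteGraph(self, request, context):
        # Placeholder: a real implementation would run PLSA over request.docs / request.edges.
        return topic_analysis_pb2.PLSAResponse(topics=["topic 0: placeholder"])

def serve():
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
    topic_analysis_pb2_grpc.add_NetowrkAnalyticsBipartiteServicer_to_server(
        NetowrkAnalyticsBipartiteServicer(), server)
    server.add_insecure_port('[::]:50051')
    server.start()
    try:
        while True:
            time.sleep(86400)
    except KeyboardInterrupt:
        server.stop(0)

if __name__ == '__main__':
    serve()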