Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Docker/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ tables
matplotlib
nltk
stemming==1.0
flask-httpauth
122 changes: 122 additions & 0 deletions analysis_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# Tested on python3.6


import time
import csv
import numpy as np

import os
import sys
import pathlib
import logging

sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/topic-analysis/plsa-service/plsa')

from flask import Flask, jsonify
from flask import make_response
from flask import request
from flask_httpauth import HTTPBasicAuth
auth = HTTPBasicAuth()

app = Flask(__name__)



# '/topic-analysis/api/v1.0/results'
@app.route('/topic-analysis/api/v1.0/results', methods=['GET'])
# @auth.login_required
def results():
    """Return the stored PLSA analysis results for a given run handle.

    Query params:
        handle: folder name identifying a previous analysis run (as
            returned by the PLSA service).

    Returns:
        200 with a JSON payload (docs_list, topics, topicByDocMatirx,
        topicProbabilities, wordByTopicConditional, logLikelihoods);
        400 if the request is malformed or the handle is missing/unsafe;
        500 if the result files cannot be read.
    """
    try:
        print('In generate_topics:', time.strftime("%c"))
        handle = request.args['handle']
        print("handle =", handle)

        # SECURITY: 'handle' comes from the query string and is joined
        # into a filesystem path below — reject anything that could
        # escape the parameters directory (path traversal).
        if '..' in handle or '/' in handle or '\\' in handle:
            raise ValueError('invalid handle')

    except Exception as e:
        logging.exception("message")
        return make_response(jsonify({'Error': 'Request was not fulfilled. Please try again.', "error_msg": str(e)}), 400)

    try:
        parameters_path = str(pathlib.Path(os.path.abspath('')).parents[0]) + '/appData/plsa/' + 'plsa-parameters/' + handle + '/'
        print(parameters_path)

        with open(parameters_path + 'plsa_topics.txt', 'r') as f:
            topics = f.read().splitlines()

        topic_by_doc = []
        word_by_topic_conditional = []

        # NOTE: 'matirx' is the spelling used by the file writer — keep it.
        with open(parameters_path + 'topic-by-doc-matirx.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')

            # First row is a header: column 0 is a label, the rest are doc names.
            docs_list = next(csv_reader)[1:]

            for row in csv_reader:
                # BUG FIX: np.float was deprecated in NumPy 1.20 and removed
                # in 1.24; the builtin float is the documented replacement.
                topic_by_doc.append(list(np.array(row[1:]).astype(float)))

        with open(parameters_path + 'topic_probability_pz', 'r') as f:
            topic_probabilities = f.read().splitlines()

        topic_probabilities = list(np.array(topic_probabilities).astype(float))

        with open(parameters_path + 'word_by_topic_conditional.csv') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')

            # Last column is dropped (trailing delimiter in the writer).
            for row in csv_reader:
                word_by_topic_conditional.append(list(np.array(row[:-1]).astype(float)))

        with open(parameters_path + 'logL.txt', 'r') as f:
            logLikelihoods = f.read().splitlines()

        logLikelihoods = list(np.array(logLikelihoods).astype(float))

        # BUG FIX: the original function fell through here without a return,
        # so a successful read produced a None response and Flask raised.
        # The commented-out PLSAResponse construction shows this payload
        # was the intended result.
        return jsonify({
            'status': True,
            'message': 'success',
            'docs_list': docs_list,
            'topics': topics,
            'topicByDocMatirx': topic_by_doc,
            'topicProbabilities': topic_probabilities,
            'wordByTopicConditional': word_by_topic_conditional,
            'logLikelihoods': logLikelihoods,
        })

    except Exception as e:
        logging.exception("message")
        # NOTE: this path is tested: it returns the error message correctly.
        return make_response(jsonify({'Error': 'Request was not fulfilled. Please try again.', "error_msg": str(e)}), 500)







@app.errorhandler(404)
def not_found(error):
    """Serve unknown routes a JSON 404 body instead of Flask's HTML page."""
    timestamp = time.strftime("%c")
    print('In not_found:', timestamp)
    body = jsonify({'Error': 'Not found'})
    return make_response(body, 404)





__end__ = '__end__'



if __name__ == '__main__':



    # Debug mode is kept off for deployed runs; switch to the commented
    # line only for local development.
    # app.run(debug=True)
    app.run(debug=False)
34 changes: 29 additions & 5 deletions plsa-service/plsa/plsa_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import datetime
import time
import json
import logging

sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/plsa')
sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/preprocessing')
Expand Down Expand Up @@ -44,7 +45,11 @@ def __init__(self, docs,local=False):
self.PLSA_PARAMETERS_PATH = ''

# self.messages
# self.unique_folder_naming
self.unique_folder_naming = None
self.num_topics = None
self.topic_divider = None
self.max_iter = None
self.beta = None

def __del__(self):

Expand All @@ -57,7 +62,7 @@ def write_to_json(self):



self.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
# self.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
print(self.unique_folder_naming)

os.mkdir(self.extracted_folder+self.unique_folder_naming)
Expand Down Expand Up @@ -90,11 +95,20 @@ def generate_topics_json(self):


# Do cleansing on the data and turning it into a bag-of-words model

with open(self.plsa_parameters_path + self.unique_folder_naming+'status.txt','w') as f:
f.write('Preprocessing started.')

pclean.pre_pro()

with open(self.plsa_parameters_path + self.unique_folder_naming+'status.txt','w') as f:
f.write('Preprocessing finished. Topic analysis started.')

# Train using PLSA
pplsa.topic_divider = 0
pplsa.num_topics = 2
pplsa.topic_divider = self.topic_divider
pplsa.num_topics = self.num_topics
pplsa.maxiter2 = self.max_iter
pplsa.beta = self.beta
pplsa.folder = pclean.output_dir[:-1]
pplsa.dict_path = pclean.file_dict
pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming
Expand All @@ -107,14 +121,24 @@ def generate_topics_json(self):
self.output_dir_stream = pclean.output_dir
self.file_dict_stream = pclean.file_dict

os.mkdir(pplsa.PLSA_PARAMETERS_PATH)

try:
os.mkdir(pplsa.PLSA_PARAMETERS_PATH)
except:
print('-----------------------Folder exists-------------------------')


pplsa.main()


end_time_1 = time.time()

print('Total training time took:',round((end_time_1 - start_time_1) / 60, 4))

with open(self.plsa_parameters_path + self.unique_folder_naming+'status.txt','w') as f:
f.write('Topic analysis finished.\n')
f.write(str(round((end_time_1 - start_time_1) / 60, 4)))




Expand Down
7 changes: 1 addition & 6 deletions service_spec/topic_analysis.proto
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,7 @@ message PLSAResponse{

bool status = 1;
string message = 2;
repeated string docs_list = 3;
repeated string topics = 4;
repeated FloatRow topicByDocMatirx = 5;
repeated double topicProbabilities = 6;
repeated FloatRow wordByTopicConditional = 7;
repeated double logLikelihoods = 8;
string handle = 3;


}
Expand Down
17 changes: 3 additions & 14 deletions snet_test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,12 @@ def csv_reader():


def try_plsa():
channel = grpc.insecure_channel('localhost:500')
channel = grpc.insecure_channel('localhost:5000')
# channel = grpc.insecure_channel('172.17.0.75:5001')
stub = topic_analysis_pb2_grpc.TopicAnalysisStub(channel)


plsa_request = topic_analysis_pb2.PLSARequest(docs=sample_data(),num_topics=3,maxiter=22,beta=1)
plsa_request = topic_analysis_pb2.PLSARequest(docs=sample_data(),num_topics=3,maxiter=50,beta=0.6)

resp = stub.PLSA(plsa_request)

Expand All @@ -72,18 +72,7 @@ def try_plsa():
print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
print(resp.message)
print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
print(resp.docs_list)
print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
print(resp.topics)
print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
print(resp.topicByDocMatirx)
print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
print(resp.topicProbabilities)
print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
print(resp.wordByTopicConditional)
print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
print(resp.logLikelihoods)
print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
print(resp.handle)



Expand Down
87 changes: 39 additions & 48 deletions topic_analysis_grpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import os
import csv
import numpy as np
import datetime
import random


SLEEP_TIME = 86400 # One day
Expand All @@ -19,15 +21,8 @@

print(sys.path)

# import example_plsa as pplsa
# import plsa as plsa1
# import cleansing as pclean
#
#
# import random
# import json
# import datetime
import plsa_wrapper
import threading

from service_spec import topic_analysis_pb2
from service_spec import topic_analysis_pb2_grpc
Expand Down Expand Up @@ -81,48 +76,12 @@ def PLSA(self,request,context):

try:

s = plsa_wrapper.PLSA_wrapper(docs)
s.write_to_json()
s.generate_topics_json()

with open(s.PLSA_PARAMETERS_PATH+'plsa_topics.txt','r') as f:
topics = f.read().splitlines()

topic_by_doc = []
word_by_topic_conditional = []
logLikelihoods = []
docs_list = []

with open(s.PLSA_PARAMETERS_PATH+'topic-by-doc-matirx.csv') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')

docs_list = next(csv_reader)[1:]

for row in csv_reader:
topic_by_doc.append(topic_analysis_pb2.FloatRow(doubleValue=list((np.array(row[1:])).astype(np.float))))


with open(s.PLSA_PARAMETERS_PATH+'topic_probability_pz','r') as f:
topic_probabilities = f.read().splitlines()

topic_probabilities = list((np.array(topic_probabilities)).astype(np.float))


with open(s.PLSA_PARAMETERS_PATH+'word_by_topic_conditional.csv') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')

for row in csv_reader:
word_by_topic_conditional.append(topic_analysis_pb2.FloatRow(doubleValue=list((np.array(row[:-1])).astype(np.float))))

with open(s.PLSA_PARAMETERS_PATH+'logL.txt','r') as f:
logLikelihoods = f.read().splitlines()

logLikelihoods = list((np.array(logLikelihoods)).astype(np.float))


resp = topic_analysis_pb2.PLSAResponse(status=True,message='success',docs_list=docs_list,topics=topics,topicByDocMatirx=topic_by_doc,topicProbabilities=topic_probabilities,wordByTopicConditional=word_by_topic_conditional,logLikelihoods=logLikelihoods)
unique_folder_naming = str(datetime.datetime.now()).replace(':', '-').replace('.', '-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'

thread1 = threading.Thread(target=generate_topics_plsa, args=(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta))
thread1.start()

resp = topic_analysis_pb2.PLSAResponse(status=True, message='success', handle=unique_folder_naming[:-1])


print('status:',resp.status)
Expand All @@ -145,6 +104,38 @@ def PLSA(self,request,context):
return resp


def generate_topics_plsa(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta):
    """Run a full PLSA analysis in a background thread, recording progress.

    Writes 'status.txt' inside the run's parameter folder so callers
    polling with the handle can observe progress; on any failure the
    file is overwritten with 'Failed.' followed by the exception text.

    Args:
        docs: documents to analyse, passed through to PLSA_wrapper.
        unique_folder_naming: per-run folder name (with trailing '/')
            that doubles as the client-facing handle.
        num_topics, topic_divider, maxiter, beta: PLSA hyperparameters
            forwarded to the wrapper.
    """

    # Put try catch here and add status

    s = plsa_wrapper.PLSA_wrapper(docs)

    try:

        # Create the run folder first; raises if the handle already exists.
        os.mkdir(s.plsa_parameters_path+unique_folder_naming)

        # 1/0

        with open(s.plsa_parameters_path+unique_folder_naming+'status.txt','w') as f:
            f.write('Analysis started.')

        # Configure the wrapper with this run's parameters before kicking
        # off serialization and training.
        s.unique_folder_naming = unique_folder_naming
        s.num_topics = num_topics
        s.topic_divider = topic_divider
        s.max_iter = maxiter
        s.beta = beta
        s.write_to_json()
        s.generate_topics_json()

    except Exception as e:

        logging.exception("message")

        # Record the failure so clients polling this handle see it rather
        # than waiting forever on a run that died.
        with open(s.plsa_parameters_path+unique_folder_naming+'status.txt','w') as f:
            f.write('Failed.')
            f.write('\n')
            f.write(str(e))




Expand Down