diff --git a/Docker/requirements.txt b/Docker/requirements.txt
index 9befa89..56429aa 100644
--- a/Docker/requirements.txt
+++ b/Docker/requirements.txt
@@ -10,3 +10,4 @@ tables
 matplotlib
 nltk
 stemming==1.0
+flask-httpauth
diff --git a/analysis_results.py b/analysis_results.py
new file mode 100644
index 0000000..22d2109
--- /dev/null
+++ b/analysis_results.py
@@ -0,0 +1,122 @@
+# Tested on python3.6
+
+
+import time
+import csv
+import numpy as np
+
+import os
+import sys
+import pathlib
+import logging
+
+sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/topic-analysis/plsa-service/plsa')
+
+from flask import Flask, jsonify
+from flask import make_response
+from flask import request
+from flask_httpauth import HTTPBasicAuth
+auth = HTTPBasicAuth()
+
+app = Flask(__name__)
+
+
+
+# '/topic-analysis/api/v1.0/results'
+@app.route('/topic-analysis/api/v1.0/results', methods=['GET'])
+# @auth.login_required
+def results():
+
+
+    try:
+        # Code to test exception handler for this try
+        # a=1/0
+
+        print('In generate_topics:', time.strftime("%c"))
+        handle = request.args['handle']
+        print("handle =", handle)
+
+    except Exception as e:
+
+        logging.exception("message")
+        return make_response(jsonify({'Error': 'Request was not fulfilled. Please try again.', "error_msg": str(e)}),400)
+
+
+
+
+    try:
+
+        parameters_path = str(pathlib.Path(os.path.abspath('')).parents[0]) + '/appData/plsa/' + 'plsa-parameters/' + handle + '/'
+        print(parameters_path)
+
+        with open(parameters_path + 'plsa_topics.txt', 'r') as f:
+            topics = f.read().splitlines()
+
+        topic_by_doc = []
+        word_by_topic_conditional = []
+        logLikelihoods = []
+        docs_list = []
+
+        with open(parameters_path + 'topic-by-doc-matirx.csv') as csv_file:
+            csv_reader = csv.reader(csv_file, delimiter=',')
+
+            docs_list = next(csv_reader)[1:]
+
+            for row in csv_reader:
+                topic_by_doc.append(list((np.array(row[1:])).astype(np.float)))
+
+        with open(parameters_path + 'topic_probability_pz', 'r') as f:
+            topic_probabilities = f.read().splitlines()
+
+        topic_probabilities = list((np.array(topic_probabilities)).astype(np.float))
+
+        with open(parameters_path + 'word_by_topic_conditional.csv') as csv_file:
+            csv_reader = csv.reader(csv_file, delimiter=',')
+
+            for row in csv_reader:
+                word_by_topic_conditional.append(list((np.array(row[:-1])).astype(np.float)))
+
+        with open(parameters_path + 'logL.txt', 'r') as f:
+            logLikelihoods = f.read().splitlines()
+
+        logLikelihoods = list((np.array(logLikelihoods)).astype(np.float))
+
+        # resp = topic_analysis_pb2.PLSAResponse(status=True, message='success', docs_list=docs_list, topics=topics,
+        #                                        topicByDocMatirx=topic_by_doc, topicProbabilities=topic_probabilities,
+        #                                        wordByTopicConditional=word_by_topic_conditional,
+        #                                        logLikelihoods=logLikelihoods)
+
+
+    except Exception as e:
+
+        logging.exception("message")
+
+        # NOTE: This line is tested: it throws back error message correctly
+
+        return make_response(jsonify({'Error': 'Request was not fulfilled. Please try again.', "error_msg": str(e)}), 500)
+
+
+
+
+
+
+
+@app.errorhandler(404)
+def not_found(error):
+    print ('In not_found:', time.strftime("%c"))
+    return make_response(jsonify({'Error': 'Not found'}), 404)
+
+
+
+
+
+__end__ = '__end__'
+
+
+
+if __name__ == '__main__':
+
+
+
+    # app.run(debug=True)
+    app.run(debug=False)
diff --git a/plsa-service/plsa/plsa_wrapper.py b/plsa-service/plsa/plsa_wrapper.py
index 9ada868..82e3d3d 100644
--- a/plsa-service/plsa/plsa_wrapper.py
+++ b/plsa-service/plsa/plsa_wrapper.py
@@ -12,6 +12,7 @@
 import datetime
 import time
 import json
+import logging
 
 sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/plsa')
 sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/preprocessing')
@@ -44,7 +45,11 @@ def __init__(self, docs,local=False):
 
         self.PLSA_PARAMETERS_PATH = ''
         # self.messages
-        # self.unique_folder_naming
+        self.unique_folder_naming = None
+        self.num_topics = None
+        self.topic_divider = None
+        self.max_iter = None
+        self.beta = None
 
 
     def __del__(self):
@@ -57,7 +62,7 @@ def write_to_json(self):
 
 
 
-        self.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
+        # self.unique_folder_naming = str(datetime.datetime.now()).replace(':','-').replace('.','-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
 
         print(self.unique_folder_naming)
         os.mkdir(self.extracted_folder+self.unique_folder_naming)
@@ -90,11 +95,20 @@ def generate_topics_json(self):
 
         # Do cleansing on the data and turing it to bad-of-words model
 
+
+        with open(self.plsa_parameters_path + self.unique_folder_naming+'status.txt','w') as f:
+            f.write('Preprocessing started.')
+
         pclean.pre_pro()
 
+        with open(self.plsa_parameters_path + self.unique_folder_naming+'status.txt','w') as f:
+            f.write('Preprocessing finished. Topic analysis started.')
+
         # Train using PLSA
-        pplsa.topic_divider = 0
-        pplsa.num_topics = 2
+        pplsa.topic_divider = self.topic_divider
+        pplsa.num_topics = self.num_topics
+        pplsa.maxiter2 = self.max_iter
+        pplsa.beta = self.beta
         pplsa.folder = pclean.output_dir[:-1]
         pplsa.dict_path = pclean.file_dict
         pplsa.PLSA_PARAMETERS_PATH = self.plsa_parameters_path + self.unique_folder_naming
@@ -107,14 +121,24 @@ def generate_topics_json(self):
 
         self.output_dir_stream = pclean.output_dir
         self.file_dict_stream = pclean.file_dict
 
-        os.mkdir(pplsa.PLSA_PARAMETERS_PATH)
+
+        try:
+            os.mkdir(pplsa.PLSA_PARAMETERS_PATH)
+        except:
+            print('-----------------------Folder exists-------------------------')
+
 
         pplsa.main()
 
+
         end_time_1 = time.time()
 
         print('Total training time took:',round((end_time_1 - start_time_1) / 60, 4))
 
+        with open(self.plsa_parameters_path + self.unique_folder_naming+'status.txt','w') as f:
+            f.write('Topic analysis finished.\n')
+            f.write(str(round((end_time_1 - start_time_1) / 60, 4)))
+
 
 
diff --git a/service_spec/topic_analysis.proto b/service_spec/topic_analysis.proto
index faa86e4..865f4ec 100644
--- a/service_spec/topic_analysis.proto
+++ b/service_spec/topic_analysis.proto
@@ -22,12 +22,7 @@ message PLSAResponse{
 
     bool status = 1;
     string message = 2;
-    repeated string docs_list = 3;
-    repeated string topics = 4;
-    repeated FloatRow topicByDocMatirx = 5;
-    repeated double topicProbabilities = 6;
-    repeated FloatRow wordByTopicConditional = 7;
-    repeated double logLikelihoods = 8;
+    string handle = 3;
 
 }
 
diff --git a/snet_test_client.py b/snet_test_client.py
index 060eff4..e67c383 100644
--- a/snet_test_client.py
+++ b/snet_test_client.py
@@ -58,12 +58,12 @@ def csv_reader():
 
 def try_plsa():
 
-    channel = grpc.insecure_channel('localhost:500')
+    channel = grpc.insecure_channel('localhost:5000')
     # channel = grpc.insecure_channel('172.17.0.75:5001')
 
     stub = topic_analysis_pb2_grpc.TopicAnalysisStub(channel)
 
-    plsa_request = topic_analysis_pb2.PLSARequest(docs=sample_data(),num_topics=3,maxiter=22,beta=1)
+    plsa_request = topic_analysis_pb2.PLSARequest(docs=sample_data(),num_topics=3,maxiter=50,beta=0.6)
 
     resp = stub.PLSA(plsa_request)
 
@@ -72,18 +72,7 @@ def try_plsa():
     print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
     print(resp.message)
     print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
-    print(resp.docs_list)
-    print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
-    print(resp.topics)
-    print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
-    print(resp.topicByDocMatirx)
-    print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
-    print(resp.topicProbabilities)
-    print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
-    print(resp.wordByTopicConditional)
-    print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
-    print(resp.logLikelihoods)
-    print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
+    print(resp.handle)
 
 
 
diff --git a/topic_analysis_grpc.py b/topic_analysis_grpc.py
index cea5019..c758a49 100644
--- a/topic_analysis_grpc.py
+++ b/topic_analysis_grpc.py
@@ -9,6 +9,8 @@
 import os
 import csv
 import numpy as np
+import datetime
+import random
 
 SLEEP_TIME = 86400 # One day
 
@@ -19,15 +21,8 @@
 print(sys.path)
 
 
-# import example_plsa as pplsa
-# import plsa as plsa1
-# import cleansing as pclean
-#
-#
-# import random
-# import json
-# import datetime
 import plsa_wrapper
+import threading
 
 from service_spec import topic_analysis_pb2
 from service_spec import topic_analysis_pb2_grpc
@@ -81,48 +76,12 @@ def PLSA(self,request,context):
 
 
 
         try:
-            s = plsa_wrapper.PLSA_wrapper(docs)
-            s.write_to_json()
-            s.generate_topics_json()
-
-            with open(s.PLSA_PARAMETERS_PATH+'plsa_topics.txt','r') as f:
-                topics = f.read().splitlines()
-
-            topic_by_doc = []
-            word_by_topic_conditional = []
-            logLikelihoods = []
-            docs_list = []
-
-            with open(s.PLSA_PARAMETERS_PATH+'topic-by-doc-matirx.csv') as csv_file:
-                csv_reader = csv.reader(csv_file, delimiter=',')
-
-                docs_list = next(csv_reader)[1:]
-
-                for row in csv_reader:
-                    topic_by_doc.append(topic_analysis_pb2.FloatRow(doubleValue=list((np.array(row[1:])).astype(np.float))))
-
-
-            with open(s.PLSA_PARAMETERS_PATH+'topic_probability_pz','r') as f:
-                topic_probabilities = f.read().splitlines()
-
-            topic_probabilities = list((np.array(topic_probabilities)).astype(np.float))
-
-
-            with open(s.PLSA_PARAMETERS_PATH+'word_by_topic_conditional.csv') as csv_file:
-                csv_reader = csv.reader(csv_file, delimiter=',')
-
-                for row in csv_reader:
-                    word_by_topic_conditional.append(topic_analysis_pb2.FloatRow(doubleValue=list((np.array(row[:-1])).astype(np.float))))
-
-            with open(s.PLSA_PARAMETERS_PATH+'logL.txt','r') as f:
-                logLikelihoods = f.read().splitlines()
-
-            logLikelihoods = list((np.array(logLikelihoods)).astype(np.float))
-
-
-            resp = topic_analysis_pb2.PLSAResponse(status=True,message='success',docs_list=docs_list,topics=topics,topicByDocMatirx=topic_by_doc,topicProbabilities=topic_probabilities,wordByTopicConditional=word_by_topic_conditional,logLikelihoods=logLikelihoods)
+            unique_folder_naming = str(datetime.datetime.now()).replace(':', '-').replace('.', '-') + '^' + str(random.randint(100000000000, 999999999999)) + '/'
+            thread1 = threading.Thread(target=generate_topics_plsa, args=(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta))
+            thread1.start()
+            resp = topic_analysis_pb2.PLSAResponse(status=True, message='success', handle=unique_folder_naming[:-1])
 
 
 
             print('status:',resp.status)
@@ -145,6 +104,38 @@
 
     return resp
 
 
+def generate_topics_plsa(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta):
+
+    # Put try catch here and add status
+
+    s = plsa_wrapper.PLSA_wrapper(docs)
+
+    try:
+
+        os.mkdir(s.plsa_parameters_path+unique_folder_naming)
+
+        # 1/0
+
+        with open(s.plsa_parameters_path+unique_folder_naming+'status.txt','w') as f:
+            f.write('Analysis started.')
+
+        s.unique_folder_naming = unique_folder_naming
+        s.num_topics = num_topics
+        s.topic_divider = topic_divider
+        s.max_iter = maxiter
+        s.beta = beta
+        s.write_to_json()
+        s.generate_topics_json()
+
+    except Exception as e:
+
+        logging.exception("message")
+
+        with open(s.plsa_parameters_path+unique_folder_naming+'status.txt','w') as f:
+            f.write('Failed.')
+            f.write('\n')
+            f.write(str(e))
+
 
 