Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 25 additions & 7 deletions analysis_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
app = Flask(__name__)


# Ordered pipeline states written to status.txt by the analysis worker; used to
# validate that a handle's status file contains a recognized (non-corrupt) state.
status_list = ['Analysis started.','Preprocessing started.','Preprocessing finished. Topic analysis started.','Topic analysis finished.','Failed.']

# '/topic-analysis/api/v1.0/results'
@app.route('/topic-analysis/api/v1.0/results', methods=['GET'])
Expand All @@ -32,8 +33,8 @@ def results():
# Code to test exception handler for this try
# a=1/0

print('In generate_topics:', time.strftime("%c"))
handle = request.args['handle']
print('In results:', time.strftime("%c"))
handle = request.args['handle'].replace('e','-').replace('d',' ').replace('y','^')
print("handle =", handle)

except Exception as e:
Expand All @@ -49,6 +50,16 @@ def results():
parameters_path = str(pathlib.Path(os.path.abspath('')).parents[0]) + '/appData/plsa/' + 'plsa-parameters/' + handle + '/'
print(parameters_path)


with open(parameters_path + 'status.txt', 'r') as f:
status = f.read().splitlines()

if status[0] not in status_list:
return make_response(jsonify({'Error': 'Analysis ended unexpectedly, corrupt status file or status file not written yet', "error_msg": ''}), 500)

if status[0] != 'Topic analysis finished.':
return make_response(jsonify({'status':status}), 200)

with open(parameters_path + 'plsa_topics.txt', 'r') as f:
topics = f.read().splitlines()

Expand Down Expand Up @@ -81,10 +92,17 @@ def results():

logLikelihoods = list((np.array(logLikelihoods)).astype(np.float))

# resp = topic_analysis_pb2.PLSAResponse(status=True, message='success', docs_list=docs_list, topics=topics,
# topicByDocMatirx=topic_by_doc, topicProbabilities=topic_probabilities,
# wordByTopicConditional=word_by_topic_conditional,
# logLikelihoods=logLikelihoods)
resp = {}
resp['status'] = status[0]
resp['total running time in minutes'] = float(status[1])
resp['docs_list'] = docs_list
resp['topics'] = topics
resp['topicByDocMatirx'] = topic_by_doc
resp['topicProbabilities'] = topic_probabilities
resp['wordByTopicConditional'] = word_by_topic_conditional
resp['logLikelihoods'] = logLikelihoods

return make_response(jsonify(resp), 200)


except Exception as e:
Expand Down Expand Up @@ -119,4 +137,4 @@ def not_found(error):


# app.run(debug=True)
app.run(debug=False)
app.run(debug=False,port=4999)
82 changes: 82 additions & 0 deletions docs/test_doc.txt

Large diffs are not rendered by default.

20 changes: 13 additions & 7 deletions plsa-service/plsa/example_plsa.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

# import taskmanager as tm
import pandas as pd
from tfidf.preprocessing import read_files, preprocess_documents, read_json, json_files_list
from tfidf.preprocessing import read_files, preprocess_documents, read_json
# from tfidf.porter import PorterStemmer
from tfidf.tfidf import *
# from tfidf.preprocessing import file_list, empty_file_list
Expand All @@ -23,7 +23,6 @@

# s_file_list = []

empty_docs_list = []

file_parts_number = 8
# file_parts_number = 7 # Inspire
Expand Down Expand Up @@ -71,7 +70,9 @@
def feat(folder):
global num_topics
# docs = list(preprocess_documents(read_files(os.path.join(folder, "*.txt"))))
docs = list(preprocess_documents(read_json(folder+"/cleaned.json")))
ret_val_1 = read_json(folder + "/cleaned.json")
json_files_list = ret_val_1[1]
docs = list(preprocess_documents(ret_val_1[0]))
assert(len(docs) > 0)
print("len(docs) =",len(docs))
# Uncomment this later and fix it with the new json theme
Expand Down Expand Up @@ -99,6 +100,8 @@ def feat(folder):
print("'''''''''''''''''''''''''''''''")
# print(td_dict)

empty_docs_list = []

for doc in range(len(docs)):
if docs[doc] == '':
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Empty doc detected with id:',doc,' and file name is:',json_files_list[doc])
Expand Down Expand Up @@ -136,7 +139,7 @@ def feat(folder):
# print 'vocab\n',vocab
# return td, idf, vocab
# exit(0)
return td, vocab
return td, vocab, empty_docs_list, json_files_list

def K(D):
global num_topics
Expand Down Expand Up @@ -271,7 +274,7 @@ def matrix_to_file(mat):


# @tm.task(feat, int, int)
def train(data, maxiter=500, debug=True):
def train(data,empty_docs_list,json_files_list,maxiter=500, debug=True):
# td, idf, vocab = data
# s_file_list= sorted(file_list)
# print('file_list:',file_list)
Expand Down Expand Up @@ -323,7 +326,7 @@ def train(data, maxiter=500, debug=True):
# print(file_list)

print('>>>>>>> In method train:', empty_docs_list)
for edl in empty_docs_list:
for edl in sorted(empty_docs_list, reverse=True):
# print(file_list[edl])
del file_list[edl]

Expand Down Expand Up @@ -489,7 +492,10 @@ def main():
print ('Training started at',time.strftime("%c"))
start_time = time.time()
data=feat(folder)
model=train(data)
empty_docs_list = data[2]
json_files_list = data[3]
data = data[:2]
model=train(data, empty_docs_list, json_files_list)
print ('>>>>>>>>>>>>>Finished training')
end_time = time.time()
print ('Training took ' + str(round((end_time - start_time) / 60, 4)) + ' minutes.')
Expand Down
6 changes: 3 additions & 3 deletions plsa-service/plsa/tfidf/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
# file_parts_number = 9
# file_parts_number = 8

json_files_list = []

def strip_punctuation(s):
return re.sub("([%s]+)" % string.punctuation, " ", s)
Expand Down Expand Up @@ -98,10 +97,11 @@ def read_files(pattern):
return map(read_file, glob.glob(pattern))

def read_json(path):
global json_files_list
with open(path, "r") as read_file:
ret = json.load(read_file)

json_files_list = []

for k in ret:
json_files_list.append(k)

Expand All @@ -114,7 +114,7 @@ def read_json(path):
docs.append(ret[k])

# return docs
return map(mapper,docs)
return map(mapper,docs),json_files_list

def mapper(s):
    """Identity pass-through applied to each document by read_json.

    Kept as an explicit hook so per-document transformations can be
    added later without changing the read_json call site.
    """
    return s
Expand Down
2 changes: 1 addition & 1 deletion snet_test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def try_plsa():
stub = topic_analysis_pb2_grpc.TopicAnalysisStub(channel)


plsa_request = topic_analysis_pb2.PLSARequest(docs=sample_data(),num_topics=3,maxiter=50,beta=0.6)
plsa_request = topic_analysis_pb2.PLSARequest(docs=sample_data(),num_topics=2,maxiter=22,beta=1)

resp = stub.PLSA(plsa_request)

Expand Down
95 changes: 95 additions & 0 deletions test_topic_analysis_grpc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Tested on python3.6


import unittest
import grpc

import json
import time

from service_spec import topic_analysis_pb2
from service_spec import topic_analysis_pb2_grpc

import topic_analysis_grpc
import analysis_results

sleep_time_secs = 10 # This is to allow for topic models to be generated before unit testing occurs in the following code

class TestTopicAnalysisGrpc(unittest.TestCase):
    """End-to-end test: submit a PLSA request over gRPC, then fetch and
    validate the analysis results through the Flask REST endpoint.

    Requires the gRPC service to be reachable on localhost:5000 and the
    sample corpus at docs/test_doc.txt (one document per non-empty line).
    """


    def setUp(self):
        # Flask test client for the results REST API (no real HTTP server needed).
        self.app = analysis_results.app.test_client()
        self.docs = []

        # Load the sample corpus: one document per line, blank lines dropped.
        sample_doc = 'docs/test_doc.txt'
        with open(sample_doc,'r') as f:
            self.docs = f.read().splitlines()

        self.docs = list(filter(lambda a: a != '', self.docs))

        # gRPC channel/stub to the topic-analysis service.
        # NOTE(review): channel creation is lazy, so opening it before the
        # server starts below is fine — grpc connects on first RPC.
        channel = grpc.insecure_channel('localhost:5000')
        self.stub = topic_analysis_pb2_grpc.TopicAnalysisStub(channel)

        # Start an in-process test instance of the gRPC server.
        self.server = topic_analysis_grpc.serve_test()
        self.server.start()

    def tearDown(self):
        # stop(0): shut down immediately, no grace period for in-flight RPCs.
        self.server.stop(0)
        print('Server stopped')

    def test_response_format_grpc(self):
        # Kick off an asynchronous PLSA run (2 topics, 22 iterations).
        plsa_request = topic_analysis_pb2.PLSARequest(docs=self.docs, num_topics=2, maxiter=22, beta=1)

        resp = self.stub.PLSA(plsa_request)

        # The analysis runs in a background thread on the server; wait a fixed
        # interval for it to finish.
        # NOTE(review): sleep-based synchronization is timing-dependent — the
        # test fails spuriously if analysis takes longer than sleep_time_secs.
        print('////////////// Sleeping till topic analysis finishes')
        time.sleep(sleep_time_secs)
        print('\\\\\\\\\\\\\\\\\\\\\\\\\\\\ Wide awake now')

        print(resp)

        # The immediate gRPC reply only acknowledges the job submission.
        self.assertEqual([resp.status,resp.message],[True,'success'])

        # Fetch the finished results via the REST endpoint using the handle
        # returned by the gRPC call.
        resp2 = self.app.get('/topic-analysis/api/v1.0/results?handle='+resp.handle)
        resp2_data = json.loads(resp2.get_data(as_text=True))
        print(';;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;')

        # Validate the response schema and basic probabilistic invariants.
        # NOTE(review): 44 docs / 300 vocab words / 23 log-likelihood entries
        # are properties of docs/test_doc.txt and the maxiter=22 setting —
        # the fixture and these constants must change together.
        self.assertEqual(resp2_data['status'],'Topic analysis finished.')
        self.assertGreater(resp2_data['total running time in minutes'],0.0)
        self.assertEqual(resp2_data['docs_list'], [str(i) for i in range(0,44)])
        self.assertEqual(len(resp2_data['topics']),2)
        self.assertIsInstance(resp2_data['topics'][0],str)
        self.assertIsInstance(resp2_data['topics'][1],str)
        # 'topicByDocMatirx' [sic] — the key is misspelled in the service
        # response itself; keep the spelling in sync with analysis_results.py.
        self.assertEqual(len(resp2_data['topicByDocMatirx']),2)
        self.assertEqual(len(resp2_data['topicByDocMatirx'][0]),44)
        # p(z,d) over all topics and docs should sum to ~1.
        self.assertAlmostEqual(sum(sum(resp2_data['topicByDocMatirx'],[])),1.0,delta=0.1)
        print('sum of p(z,d)=',sum(sum(resp2_data['topicByDocMatirx'],[])))
        # Topic prior probabilities should sum to ~1.
        self.assertAlmostEqual(resp2_data['topicProbabilities'][0]+ resp2_data['topicProbabilities'][1],1.0,delta=0.1)
        self.assertEqual(len(resp2_data['wordByTopicConditional']), 2)
        self.assertEqual(len(resp2_data['wordByTopicConditional'][0]), 300)
        # p(w|z) summed over the whole matrix should be ~1.
        self.assertAlmostEqual(sum(sum(resp2_data['wordByTopicConditional'], [])), 1.0, delta=0.1)
        print('sum of p(w|z)=',sum(sum(resp2_data['wordByTopicConditional'],[])))
        # One log-likelihood per iteration plus the initial value; each must be
        # negative (log of a probability).
        self.assertEqual(len(resp2_data['logLikelihoods']),23)
        for i in range(0,23):
            self.assertLess(resp2_data['logLikelihoods'][i],0)














# Module-end sentinel — presumably a project convention marking file
# completeness; no runtime effect. TODO(review): confirm purpose.
__end__ = '__end__'

if __name__ == '__main__':
    unittest.main()
2 changes: 1 addition & 1 deletion topic_analysis_grpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def PLSA(self,request,context):
thread1 = threading.Thread(target=generate_topics_plsa, args=(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta))
thread1.start()

resp = topic_analysis_pb2.PLSAResponse(status=True, message='success', handle=unique_folder_naming[:-1])
resp = topic_analysis_pb2.PLSAResponse(status=True, message='success', handle=unique_folder_naming[:-1].replace('-','e').replace(' ','d').replace('^','y'))


print('status:',resp.status)
Expand Down