Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 25 additions & 7 deletions analysis_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
app = Flask(__name__)


# Ordered pipeline states written to status.txt by the analysis worker; used to
# validate that a handle's status file contains a recognized (non-corrupt) state.
status_list = ['Analysis started.','Preprocessing started.','Preprocessing finished. Topic analysis started.','Topic analysis finished.','Failed.']

# '/topic-analysis/api/v1.0/results'
@app.route('/topic-analysis/api/v1.0/results', methods=['GET'])
Expand All @@ -32,8 +33,8 @@ def results():
# Code to test exception handler for this try
# a=1/0

print('In generate_topics:', time.strftime("%c"))
handle = request.args['handle']
print('In results:', time.strftime("%c"))
handle = request.args['handle'].replace('e','-').replace('d',' ').replace('y','^')
print("handle =", handle)

except Exception as e:
Expand All @@ -49,6 +50,16 @@ def results():
parameters_path = str(pathlib.Path(os.path.abspath('')).parents[0]) + '/appData/plsa/' + 'plsa-parameters/' + handle + '/'
print(parameters_path)


with open(parameters_path + 'status.txt', 'r') as f:
status = f.read().splitlines()

if status[0] not in status_list:
return make_response(jsonify({'Error': 'Analysis ended unexpectedly, corrupt status file or status file not written yet', "error_msg": ''}), 500)

if status[0] != 'Topic analysis finished.':
return make_response(jsonify({'status':status}), 200)

with open(parameters_path + 'plsa_topics.txt', 'r') as f:
topics = f.read().splitlines()

Expand Down Expand Up @@ -81,10 +92,17 @@ def results():

logLikelihoods = list((np.array(logLikelihoods)).astype(np.float))

# resp = topic_analysis_pb2.PLSAResponse(status=True, message='success', docs_list=docs_list, topics=topics,
# topicByDocMatirx=topic_by_doc, topicProbabilities=topic_probabilities,
# wordByTopicConditional=word_by_topic_conditional,
# logLikelihoods=logLikelihoods)
resp = {}
resp['status'] = status[0]
resp['total running time in minutes'] = float(status[1])
resp['docs_list'] = docs_list
resp['topics'] = topics
resp['topicByDocMatirx'] = topic_by_doc
resp['topicProbabilities'] = topic_probabilities
resp['wordByTopicConditional'] = word_by_topic_conditional
resp['logLikelihoods'] = logLikelihoods

return make_response(jsonify(resp), 200)


except Exception as e:
Expand Down Expand Up @@ -119,4 +137,4 @@ def not_found(error):


# app.run(debug=True)
app.run(debug=False)
app.run(debug=False,port=4999)
82 changes: 82 additions & 0 deletions docs/test_doc.txt

Large diffs are not rendered by default.

20 changes: 13 additions & 7 deletions plsa-service/plsa/example_plsa.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

# import taskmanager as tm
import pandas as pd
from tfidf.preprocessing import read_files, preprocess_documents, read_json, json_files_list
from tfidf.preprocessing import read_files, preprocess_documents, read_json
# from tfidf.porter import PorterStemmer
from tfidf.tfidf import *
# from tfidf.preprocessing import file_list, empty_file_list
Expand All @@ -23,7 +23,6 @@

# s_file_list = []

empty_docs_list = []

file_parts_number = 8
# file_parts_number = 7 # Inspire
Expand Down Expand Up @@ -71,7 +70,9 @@
def feat(folder):
global num_topics
# docs = list(preprocess_documents(read_files(os.path.join(folder, "*.txt"))))
docs = list(preprocess_documents(read_json(folder+"/cleaned.json")))
ret_val_1 = read_json(folder + "/cleaned.json")
json_files_list = ret_val_1[1]
docs = list(preprocess_documents(ret_val_1[0]))
assert(len(docs) > 0)
print("len(docs) =",len(docs))
# Uncomment this later and fix it with the new json theme
Expand Down Expand Up @@ -99,6 +100,8 @@ def feat(folder):
print("'''''''''''''''''''''''''''''''")
# print(td_dict)

empty_docs_list = []

for doc in range(len(docs)):
if docs[doc] == '':
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Empty doc detected with id:',doc,' and file name is:',json_files_list[doc])
Expand Down Expand Up @@ -136,7 +139,7 @@ def feat(folder):
# print 'vocab\n',vocab
# return td, idf, vocab
# exit(0)
return td, vocab
return td, vocab, empty_docs_list, json_files_list

def K(D):
global num_topics
Expand Down Expand Up @@ -271,7 +274,7 @@ def matrix_to_file(mat):


# @tm.task(feat, int, int)
def train(data, maxiter=500, debug=True):
def train(data,empty_docs_list,json_files_list,maxiter=500, debug=True):
# td, idf, vocab = data
# s_file_list= sorted(file_list)
# print('file_list:',file_list)
Expand Down Expand Up @@ -323,7 +326,7 @@ def train(data, maxiter=500, debug=True):
# print(file_list)

print('>>>>>>> In method train:', empty_docs_list)
for edl in empty_docs_list:
for edl in sorted(empty_docs_list, reverse=True):
# print(file_list[edl])
del file_list[edl]

Expand Down Expand Up @@ -489,7 +492,10 @@ def main():
print ('Training started at',time.strftime("%c"))
start_time = time.time()
data=feat(folder)
model=train(data)
empty_docs_list = data[2]
json_files_list = data[3]
data = data[:2]
model=train(data, empty_docs_list, json_files_list)
print ('>>>>>>>>>>>>>Finished training')
end_time = time.time()
print ('Training took ' + str(round((end_time - start_time) / 60, 4)) + ' minutes.')
Expand Down
6 changes: 3 additions & 3 deletions plsa-service/plsa/tfidf/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
# file_parts_number = 9
# file_parts_number = 8

json_files_list = []

def strip_punctuation(s):
return re.sub("([%s]+)" % string.punctuation, " ", s)
Expand Down Expand Up @@ -98,10 +97,11 @@ def read_files(pattern):
return map(read_file, glob.glob(pattern))

def read_json(path):
global json_files_list
with open(path, "r") as read_file:
ret = json.load(read_file)

json_files_list = []

for k in ret:
json_files_list.append(k)

Expand All @@ -114,7 +114,7 @@ def read_json(path):
docs.append(ret[k])

# return docs
return map(mapper,docs)
return map(mapper,docs),json_files_list

def mapper(s):
    """Identity pass-through applied to each document by read_json.

    Kept as an explicit hook so per-document transformations can be
    added later without changing the read_json call site.
    """
    return s
Expand Down
2 changes: 1 addition & 1 deletion snet_test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def try_plsa():
stub = topic_analysis_pb2_grpc.TopicAnalysisStub(channel)


plsa_request = topic_analysis_pb2.PLSARequest(docs=sample_data(),num_topics=3,maxiter=50,beta=0.6)
plsa_request = topic_analysis_pb2.PLSARequest(docs=sample_data(),num_topics=2,maxiter=22,beta=1)

resp = stub.PLSA(plsa_request)

Expand Down
95 changes: 95 additions & 0 deletions test_topic_analysis_grpc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Tested on python3.6


import unittest
import grpc

import json
import time

from service_spec import topic_analysis_pb2
from service_spec import topic_analysis_pb2_grpc

import topic_analysis_grpc
import analysis_results

sleep_time_secs = 10 # This is to allow for topic models to be generated before unit testing occurs in the following code

class TestTopicAnalysisGrpc(unittest.TestCase):
    """End-to-end test: submit a PLSA request over gRPC, then fetch and
    validate the analysis results through the Flask REST endpoint.

    Requires the gRPC service to be reachable on localhost:5000 and the
    sample corpus at docs/test_doc.txt (one document per non-empty line).
    """


    def setUp(self):
        # Flask test client for the results REST API (no real HTTP server needed).
        self.app = analysis_results.app.test_client()
        self.docs = []

        # Load the sample corpus: one document per line, blank lines dropped.
        sample_doc = 'docs/test_doc.txt'
        with open(sample_doc,'r') as f:
            self.docs = f.read().splitlines()

        self.docs = list(filter(lambda a: a != '', self.docs))

        # gRPC channel/stub to the topic-analysis service.
        # NOTE(review): channel creation is lazy, so opening it before the
        # server starts below is fine — grpc connects on first RPC.
        channel = grpc.insecure_channel('localhost:5000')
        self.stub = topic_analysis_pb2_grpc.TopicAnalysisStub(channel)

        # Start an in-process test instance of the gRPC server.
        self.server = topic_analysis_grpc.serve_test()
        self.server.start()

    def tearDown(self):
        # stop(0): shut down immediately, no grace period for in-flight RPCs.
        self.server.stop(0)
        print('Server stopped')

    def test_response_format_grpc(self):
        # Kick off an asynchronous PLSA run (2 topics, 22 iterations).
        plsa_request = topic_analysis_pb2.PLSARequest(docs=self.docs, num_topics=2, maxiter=22, beta=1)

        resp = self.stub.PLSA(plsa_request)

        # The analysis runs in a background thread on the server; wait a fixed
        # interval for it to finish.
        # NOTE(review): sleep-based synchronization is timing-dependent — the
        # test fails spuriously if analysis takes longer than sleep_time_secs.
        print('////////////// Sleeping till topic analysis finishes')
        time.sleep(sleep_time_secs)
        print('\\\\\\\\\\\\\\\\\\\\\\\\\\\\ Wide awake now')

        print(resp)

        # The immediate gRPC reply only acknowledges the job submission.
        self.assertEqual([resp.status,resp.message],[True,'success'])

        # Fetch the finished results via the REST endpoint using the handle
        # returned by the gRPC call.
        resp2 = self.app.get('/topic-analysis/api/v1.0/results?handle='+resp.handle)
        resp2_data = json.loads(resp2.get_data(as_text=True))
        print(';;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;')

        # Validate the response schema and basic probabilistic invariants.
        # NOTE(review): 44 docs / 300 vocab words / 23 log-likelihood entries
        # are properties of docs/test_doc.txt and the maxiter=22 setting —
        # the fixture and these constants must change together.
        self.assertEqual(resp2_data['status'],'Topic analysis finished.')
        self.assertGreater(resp2_data['total running time in minutes'],0.0)
        self.assertEqual(resp2_data['docs_list'], [str(i) for i in range(0,44)])
        self.assertEqual(len(resp2_data['topics']),2)
        self.assertIsInstance(resp2_data['topics'][0],str)
        self.assertIsInstance(resp2_data['topics'][1],str)
        # 'topicByDocMatirx' [sic] — the key is misspelled in the service
        # response itself; keep the spelling in sync with analysis_results.py.
        self.assertEqual(len(resp2_data['topicByDocMatirx']),2)
        self.assertEqual(len(resp2_data['topicByDocMatirx'][0]),44)
        # p(z,d) over all topics and docs should sum to ~1.
        self.assertAlmostEqual(sum(sum(resp2_data['topicByDocMatirx'],[])),1.0,delta=0.1)
        print('sum of p(z,d)=',sum(sum(resp2_data['topicByDocMatirx'],[])))
        # Topic prior probabilities should sum to ~1.
        self.assertAlmostEqual(resp2_data['topicProbabilities'][0]+ resp2_data['topicProbabilities'][1],1.0,delta=0.1)
        self.assertEqual(len(resp2_data['wordByTopicConditional']), 2)
        self.assertEqual(len(resp2_data['wordByTopicConditional'][0]), 300)
        # p(w|z) summed over the whole matrix should be ~1.
        self.assertAlmostEqual(sum(sum(resp2_data['wordByTopicConditional'], [])), 1.0, delta=0.1)
        print('sum of p(w|z)=',sum(sum(resp2_data['wordByTopicConditional'],[])))
        # One log-likelihood per iteration plus the initial value; each must be
        # negative (log of a probability).
        self.assertEqual(len(resp2_data['logLikelihoods']),23)
        for i in range(0,23):
            self.assertLess(resp2_data['logLikelihoods'][i],0)














# Module-end sentinel — presumably a project convention marking file
# completeness; no runtime effect. TODO(review): confirm purpose.
__end__ = '__end__'

if __name__ == '__main__':
    unittest.main()
2 changes: 1 addition & 1 deletion topic_analysis_grpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def PLSA(self,request,context):
thread1 = threading.Thread(target=generate_topics_plsa, args=(docs,unique_folder_naming,num_topics,topic_divider,maxiter,beta))
thread1.start()

resp = topic_analysis_pb2.PLSAResponse(status=True, message='success', handle=unique_folder_naming[:-1])
resp = topic_analysis_pb2.PLSAResponse(status=True, message='success', handle=unique_folder_naming[:-1].replace('-','e').replace(' ','d').replace('^','y'))


print('status:',resp.status)
Expand Down