Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
.idea/
*~
plsa-service/__pycache__/
plsa-service/plsa/_plsa.c
service_spec/__pycache__/
service_spec/topic_analysis_pb2.py
service_spec/topic_analysis_pb2_grpc.py
File renamed without changes.
File renamed without changes.
File renamed without changes.
26 changes: 15 additions & 11 deletions plsa/plsa/example_plsa.py → plsa-service/plsa/example_plsa.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@
import logging
import numpy as np


# import taskmanager as tm
import pandas as pd
from tfidf.preprocessing import read_files, preprocess_documents, read_json, json_files_list
# from tfidf.porter import PorterStemmer
from tfidf.tfidf import *
# from tfidf.preprocessing import file_list, empty_file_list

from plsa import pLSA
# from plsa import pLSA
import plsa as plsa1
import porter_dictionary

# s_file_list = []
Expand Down Expand Up @@ -286,7 +288,7 @@ def train(data, maxiter=500, debug=True):

td, vocab = data
# td = td[:,:-1]
plsa = pLSA()
plsa = plsa1.pLSA()
plsa.debug = debug
plsa.logL_pic = logL_pic
# model=plsa.train(td, num_topics, maxiter)
Expand Down Expand Up @@ -386,22 +388,22 @@ def train(data, maxiter=500, debug=True):
def average_train(data, maxiter=500, debug=True):
    """Run pLSA's averaged training on the term-document data.

    Args:
        data: a ``(td, idf, vocab)`` triple; only ``td`` is used here.
        maxiter: iteration limit forwarded to the trainer.
        debug: value assigned to the model's ``debug`` attribute.

    Returns:
        Whatever ``pLSA.average_train(10)(td, 10, maxiter)`` returns.
    """
    td, idf, vocab = data
    # Drop the last column; folding_in() uses that column as its input.
    td = td[:, :-1]
    # Build the model once, through the module alias. The earlier bare
    # ``pLSA()`` call was a leftover that constructed a throwaway instance.
    plsa = plsa1.pLSA()
    plsa.debug = debug
    return plsa.average_train(10)(td, 10, maxiter)

# @tm.task(feat, train, int, int)
def folding_in(data, model, maxiter=30, debug=True):
    """Fold the last column of the term-document data into a trained model.

    Prints the result of ``pLSA.folding_in``; returns nothing.

    Args:
        data: a ``(td, idf, vocab)`` triple; only ``td`` is used here.
        model: trained model passed to the ``pLSA`` constructor.
        maxiter: iteration limit forwarded to ``pLSA.folding_in``.
        debug: value assigned to the model's ``debug`` attribute.
    """
    td, idf, vocab = data
    # The last column is the one average_train() leaves out of training.
    d = td[:, -1]
    # Single construction via the module alias (the duplicate bare
    # ``pLSA(model)`` call was redundant).
    plsa = plsa1.pLSA(model)
    plsa.debug = debug
    print(plsa.folding_in(d, maxiter))

# @tm.nocache
# @tm.task(train)
def document_topics(model):
plsa = pLSA(model)
plsa = plsa1.pLSA(model)
for i in plsa.document_topics():
print (i)
# file_txt1.write(str(i))
Expand All @@ -411,13 +413,13 @@ def document_topics(model):
# @tm.nocache
# @tm.task(train)
def document_cluster(model):
    """Print the result of ``pLSA.document_cluster()`` for ``model``."""
    # Construct the wrapper once, via the module alias; the duplicate bare
    # ``pLSA(model)`` call only created an instance that was overwritten.
    plsa = plsa1.pLSA(model)
    print(plsa.document_cluster())

# @tm.nocache
# @tm.task(train)
def word_topics(model):
plsa = pLSA(model)
plsa = plsa1.pLSA(model)
for i in plsa.word_topics():
print (i)
# file_txt1.write(str(i))
Expand All @@ -427,13 +429,13 @@ def word_topics(model):
# @tm.nocache
# @tm.task(train)
def word_cluster(model):
    """Print the result of ``pLSA.word_cluster()`` for ``model``."""
    # Construct the wrapper once, via the module alias; the duplicate bare
    # ``pLSA(model)`` call only created an instance that was overwritten.
    plsa = plsa1.pLSA(model)
    print(plsa.word_cluster())

# @tm.nocache
# @tm.task(train)
def unigram_smoothing(model):
    """Print the result of ``pLSA.unigram_smoothing()`` for ``model``."""
    # Construct the wrapper once, via the module alias; the duplicate bare
    # ``pLSA(model)`` call only created an instance that was overwritten.
    plsa = plsa1.pLSA(model)
    print(plsa.unigram_smoothing())

# @tm.nocache
Expand All @@ -445,7 +447,7 @@ def topic_labels(data, model, N=50):
port_dict.load_dict(dict_path)
# print port_dict.dictionary
td, vocab = data
plsa = pLSA(model)
plsa = plsa1.pLSA(model)
inv_vocab = inverse_vocab(vocab)
dict_vocab=[]
# vocab_list=[x for x in inv_vocab[1]]
Expand All @@ -470,7 +472,7 @@ def topic_labels(data, model, N=50):
# @tm.task(feat, train)
def global_weights(data, model):
    """Print the result of ``pLSA.global_weights(idf)`` for ``model``.

    Args:
        data: a ``(td, idf, vocab)`` triple; only ``idf`` is used here.
        model: trained model passed to the ``pLSA`` constructor.
    """
    td, idf, vocab = data
    # Construct the wrapper once, via the module alias; the duplicate bare
    # ``pLSA(model)`` call only created an instance that was overwritten.
    plsa = plsa1.pLSA(model)
    print(plsa.global_weights(idf))

def main():
Expand Down Expand Up @@ -500,5 +502,7 @@ def main():
# word_cluster(model)
# word_topics(model)
# document_topics(model)


if __name__ == "__main__":
main()
1 change: 0 additions & 1 deletion plsa/plsa/plsa.py → plsa-service/plsa/plsa.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
Z: number of topics

"""

import psutil
import numpy as np
import matplotlib
Expand Down
File renamed without changes.
21 changes: 14 additions & 7 deletions plsa/plsa/plsa_wrapper.py → plsa-service/plsa/plsa_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,25 @@
import time
import json

sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/plsa')
sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa/preprocessing')
sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/plsa')
sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/preprocessing')
sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/topic-analysis/plsa-service/plsa')
sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/topic-analysis/plsa-service/preprocessing')


import example_plsa as pplsa
import cleansing as pclean

class TopicAnalysis:
class PLSA_wrapper:

def __init__(self, docs):
def __init__(self, docs,local=False):

self.docs = docs
self.root_path = str(pathlib.Path(os.path.abspath('')).parents[2]) + '/appData/plsa/'
if not local:
self.root_path = str(pathlib.Path(os.path.abspath('')).parents[0]) + '/appData/plsa/'
else:
self.root_path = str(pathlib.Path(os.path.abspath('')).parents[2]) + '/appData/plsa/'
print('>>>>>>>>>>>>>self.root_path>>>>>>>>>>>')
print(self.root_path)
self.extracted_folder = self.root_path + 'extracted/'
self.file_dict = self.root_path + 'dict/'
Expand Down Expand Up @@ -114,7 +121,7 @@ def generate_topics_json(self):

def run_plsa():

path = ''
path = str(pathlib.Path(os.path.abspath('')).parents[2])+'/appData/misc/extracted.json'

docs = []

Expand All @@ -125,7 +132,7 @@ def run_plsa():
for k in fileList:
docs.append(fileList[k])

s = TopicAnalysis(docs)
s = PLSA_wrapper(docs,local=True)
s.write_to_json()
s.generate_topics_json()

Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
import pathlib as path
from stemming.porter2 import stem
import string
import preprocessing.porter_dictionary as pp
# import preprocessing.porter_dictionary as pp
import porter_dictionary as pp
# import re

port_dict = pp.porter_dictionary()
Expand Down
32 changes: 0 additions & 32 deletions servcie_spec/topic_analysis.proto

This file was deleted.

49 changes: 49 additions & 0 deletions service_spec/topic_analysis.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
syntax = "proto3";




// Request message accepted by the TopicAnalysis.PLSA rpc.
message PLSARequest {

// Input documents, one string per entry.
repeated string docs = 1;
// Presumably the number of topics to extract — confirm against the server.
sint64 num_topics = 2;
// NOTE(review): meaning not visible here; the test client leaves it unset.
sint64 topic_divider = 3;
// Presumably the iteration limit for training — confirm against the server.
sint64 maxiter = 4;
// NOTE(review): meaning not visible here; the test client sends 1.
float beta = 5;
}


// A list of floats wrapped in a message so that PLSAResponse can carry a
// repeated list of rows (see PLSAResponse.topicByDocMatirx).
message FloatRow{

repeated float floatRow = 1;
}

// Response message returned by the TopicAnalysis.PLSA rpc.
message PLSAResponse{

// Presumably true on success — confirm against the server.
bool status = 1;
// Free-text message accompanying the status.
string message = 2;
repeated string docs_list = 3;
repeated string topics = 4;
// One FloatRow per matrix row. The field name is spelled "Matirx"; the test
// client reads it under this exact name, so renaming it would break callers.
repeated FloatRow topicByDocMatirx = 5;
repeated float topicProbabilities = 6;
// NOTE(review): a flat float list, unlike topicByDocMatirx — confirm layout.
repeated float wordByTopicConditional = 7;
repeated float logLikelihoods = 8;


}





///// Topic analysis Services

// gRPC service exposing topic analysis; currently a single unary PLSA call.
service TopicAnalysis {


// Takes a PLSARequest and returns a PLSAResponse.
rpc PLSA (PLSARequest) returns (PLSAResponse) {};


}

///// End Topic analysis Services
93 changes: 93 additions & 0 deletions snet_test_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Tested on python3.6

import logging
import os
import pathlib
import json
import csv
import numpy as np

import grpc

from service_spec import topic_analysis_pb2, topic_analysis_pb2_grpc

import subprocess




def sample_data():
    """Load the sample documents used to exercise the PLSA service.

    Reads ``appData/misc/extracted.json`` from the parent of the current
    working directory. The file must hold a JSON object.

    Returns:
        list: the object's values, in the order the keys appear in the file.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    path = (pathlib.Path(os.path.abspath('')).parents[0]
            / 'appData' / 'misc' / 'extracted.json')

    # JSON is UTF-8 by specification; do not depend on the platform's
    # default encoding, which breaks non-ASCII documents on some systems.
    with open(path, "r", encoding="utf-8") as read_file:
        file_list = json.load(read_file)

    return [file_list[key] for key in file_list]


def csv_reader():
    """Print the sample topic-by-document matrix stored as CSV.

    Reads ``appData/misc/topic-by-doc-matirx.csv`` from the parent of the
    current working directory. The first row is treated as a header and its
    cells after the first are printed. For every following row a separator
    and the row's cell count are printed, and the cells after the first are
    converted to floats. Finally the list of converted rows is printed.

    Raises:
        OSError: if the file cannot be opened.
        StopIteration: if the file has no header row.
        ValueError: if a data cell cannot be converted to float.
    """
    path = str(pathlib.Path(os.path.abspath('')).parents[0]) + '/appData/misc/topic-by-doc-matirx.csv'

    resp = []

    # newline='' is what the csv module expects so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(path, newline='') as csv_file:
        # Named ``rows`` so the local does not shadow this function's name.
        rows = csv.reader(csv_file, delimiter=',')

        docs_list = next(rows)

        print(docs_list[1:])

        for row in rows:
            print('^^^^^^^^^^^^^^^^^^^^^^^^^^')
            print(len(row))
            # ``np.float`` was only an alias of the builtin ``float`` and was
            # removed in NumPy 1.24; the builtin is the exact equivalent.
            resp.append(list(np.array(row[1:]).astype(float)))

    print('`````````````````````````````````')
    print(resp)




def try_plsa():
    """Call the PLSA rpc on a local service with the sample documents.

    Connects to ``localhost:5000`` over an insecure channel, sends one
    request (3 topics, 22 iterations, beta 1) and prints the main response
    fields, each followed by a separator line.
    """
    separator = '^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'

    channel = grpc.insecure_channel('localhost:5000')
    stub = topic_analysis_pb2_grpc.TopicAnalysisStub(channel)

    plsa_request = topic_analysis_pb2.PLSARequest(
        docs=sample_data(),
        num_topics=3,
        maxiter=22,
        beta=1,
    )
    resp = stub.PLSA(plsa_request)

    # Same fields, same order, one separator after each.
    for field_name in ('status', 'message', 'docs_list', 'topics', 'topicByDocMatirx'):
        print(getattr(resp, field_name))
        print(separator)








# Script entry point: call the PLSA service when this file is run directly.
if __name__ == '__main__':
    try_plsa()
    # csv_reader()


Loading