1+ __author__ = 'eyob'
2+ # Tested on python3.6
3+
4+
5+ import psutil
6+ print ('===================ram used at program start:' ,float (list (psutil .virtual_memory ())[3 ])/ 1073741824.0 ,'GB' )
7+
8+ import os
9+ import sys
10+ import pathlib
11+ import csv
12+ import random
13+ import datetime
14+ import time
15+ import json
16+ import logging
17+
18+ import re
19+ import numpy as np
20+ import pandas as pd
21+ from pprint import pprint
22+
23+ import gensim
24+ import gensim .corpora as corpora
25+ from gensim .utils import simple_preprocess
26+ from gensim .models import CoherenceModel
27+
28+ # Enable logging for gensim - optional
29+ import logging
30+ logging .basicConfig (format = '%(asctime)s : %(levelname)s : %(message)s' , level = logging .ERROR )
31+
32+ import warnings
33+ warnings .filterwarnings ("ignore" ,category = DeprecationWarning )
34+
35+ # sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[0])+'/plsa-service/plsa')
36+ sys .path .append (str (pathlib .Path (os .path .abspath ('' )).parents [0 ])+ '/plsa-service/preprocessing' )
37+ # sys.path.append(str(pathlib.Path(os.path.abspath('')).parents[1])+'/plsa-service/plsa')
38+ sys .path .append (str (pathlib .Path (os .path .abspath ('' )).parents [1 ])+ '/topic-analysis/plsa-service/preprocessing' )
39+
40+ # import example_plsa as pplsa
41+ import cleansing as pclean
42+ import porter_dictionary
43+
class LDA_wrapper:
    """Run gensim LDA topic analysis over a list of raw documents.

    Pipeline: dump `docs` to an `extracted.json` under a per-run folder
    (`write_to_json`), clean/lemmatize them via the project `cleansing`
    module, fit an LDA model, then de-stem the top topic words with the
    run's porter dictionary into `self.topics_destemmed`.

    Callers must set `self.unique_folder_naming` (a 'name/' style folder
    string) before calling `write_to_json` / `generate_topics_gensim`.
    """

    def __init__(self, docs, local=False):
        """Store the documents and resolve the appData directory layout.

        docs  -- list of raw document strings.
        local -- False: appData lives one level above the CWD (deployed);
                 True: two levels above (local development checkout).
        """
        self.docs = docs
        # Pick how far above the CWD the appData tree sits.
        parent_idx = 1 if local else 0
        self.root_path = str(pathlib.Path(os.path.abspath('')).parents[parent_idx]) + '/appData/lda/'
        print('>>>>>>>>>>>>>self.root_path>>>>>>>>>>>')
        print(self.root_path)

        # NOTE(review): several attributes intentionally alias the same
        # directory (kept for backward compatibility with external users).
        self.extracted_folder = self.root_path + 'extracted/'
        self.file_dict = self.root_path + 'dict/'
        self.source_texts = self.root_path + 'extracted/'
        self.output_dir = self.root_path + 'cleaned/'
        print(self.output_dir)
        self.folder = self.root_path + 'cleaned/'
        self.dict_path = self.root_path + 'dict/'
        self.lda_parameters_path = self.root_path + 'lda-parameters/'
        self.LDA_PARAMETERS_PATH = ''

        # Per-run settings; filled in by the caller before running.
        self.unique_folder_naming = None
        self.num_topics = None
        self.topic_divider = None
        self.max_iter = None

    def __del__(self):
        # Placeholder: close db connections here if any are ever opened.
        pass

    def write_to_json(self):
        """Write `self.docs` as {index: text} to <extracted>/<run>/extracted.json.

        Creates the per-run extraction folder; raises FileExistsError if a
        run with the same `unique_folder_naming` already produced one.
        """
        print(self.unique_folder_naming)
        os.mkdir(self.extracted_folder + self.unique_folder_naming)

        contents_dict = {str(i): doc for i, doc in enumerate(self.docs)}

        out_file = self.extracted_folder + self.unique_folder_naming + 'extracted' + '.json'
        with open(out_file, "w") as f:
            json.dump(contents_dict, f, indent=4)

        print("len(contents_dict):", len(contents_dict))

    def generate_topics_gensim(self, num_topics, passes, chunksize,
            update_every=0, alpha='auto', eta='auto', decay=0.5, offset=1.0, eval_every=1,
            iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None,
            minimum_phi_value=0.01, per_word_topics=True, callbacks=None):
        """Clean the extracted docs, fit an LDA model, and de-stem its topics.

        Parameters mirror gensim.models.ldamodel.LdaModel; `num_topics`,
        `passes` and `chunksize` are required, the rest pass straight through.
        Results: `self.lda_model` (the fitted model) and
        `self.topics_destemmed` (list of word lists, one per topic).
        """
        start_time_1 = time.time()  # TODO: write elapsed time to the final status file

        # Point the cleansing module at this run's input/output locations.
        pclean.file_dict = self.file_dict + self.unique_folder_naming[:-1] + '_dict'
        pclean.source_texts = self.source_texts + self.unique_folder_naming + 'extracted.json'
        pclean.output_dir = self.output_dir + self.unique_folder_naming

        os.mkdir(pclean.output_dir)

        # Clean the data and turn it into a bag-of-words model,
        # recording progress in the run's status file.
        with open(self.lda_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f:
            f.write('Preprocessing started.')

        pclean.pre_pro()

        with open(self.lda_parameters_path + self.unique_folder_naming + 'status.txt', 'w') as f:
            f.write('Preprocessing finished. Topic analysis started.')

        with open(pclean.output_dir + 'cleaned.json', "r") as read_file:
            ret = json.load(read_file)

        # Each cleaned document is one token per line.
        data_lemmatized = [ret[k].splitlines() for k in ret]

        # Create Dictionary
        id2word = corpora.Dictionary(data_lemmatized)

        # Term Document Frequency corpus
        corpus = [id2word.doc2bow(text) for text in data_lemmatized]

        self.lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                         id2word=id2word,
                                                         num_topics=num_topics,
                                                         random_state=random_state,
                                                         update_every=update_every,
                                                         chunksize=chunksize,
                                                         passes=passes,
                                                         alpha=alpha,
                                                         eta=eta,
                                                         per_word_topics=per_word_topics,
                                                         decay=decay,
                                                         offset=offset,
                                                         eval_every=eval_every,
                                                         iterations=iterations,
                                                         gamma_threshold=gamma_threshold,
                                                         minimum_probability=minimum_probability,
                                                         minimum_phi_value=minimum_phi_value,
                                                         callbacks=callbacks)

        port_dict = porter_dictionary.porter_dictionary()

        # Pull the top 300 words per topic (unformatted -> (word, weight) pairs).
        topics = self.lda_model.show_topics(num_topics=num_topics, num_words=300, formatted=False)

        extracted_topics = [[word for word, _weight in topic_words]
                            for _topic_id, topic_words in topics]

        port_dict.load_dict(self.dict_path + self.unique_folder_naming[:-1] + '_dict')

        # Map each stemmed topic word back to its first original form;
        # words missing from the dictionary are logged and skipped
        # (best-effort, as before, but no longer swallowing every exception).
        self.topics_destemmed = []
        for stemmed_topic in extracted_topics:
            destemmed = []
            for word in stemmed_topic:
                try:
                    destemmed.append(port_dict.dictionary[word][0])
                except (KeyError, IndexError):
                    logging.exception('message')
            self.topics_destemmed.append(destemmed)

        # TODO: extract remaining model parameters (e.g. weights), define proto
        # formats for everything returned, and write a final status entry with
        # the total elapsed running time; compare outputs with the plsa service.
198+
199+
200+
201+
202+
203+
204+
205+
def run_lda():
    """Smoke-test driver: load documents from appData and run a 70-topic LDA.

    Reads extracted_all.json (a {key: document} map), feeds the documents to
    LDA_wrapper in local mode, runs preprocessing + LDA, and prints the
    de-stemmed topics. Side effects: creates per-run folders under
    appData/lda/ and writes intermediate json/status files there.
    """
    # Input corpus; other corpora (singnet/bio/hersheys/hr) can be swapped in.
    path = str(pathlib.Path(os.path.abspath('')).parents[1]) + '/appData/misc/extracted_all.json'

    with open(path, "r") as read_file:
        file_map = json.load(read_file)

    docs = [file_map[k] for k in file_map]

    # Fix: previously docs/s were each built twice, with the first wrapper
    # instance (constructed on an empty docs list) silently discarded.
    s = LDA_wrapper(docs, local=True)

    # Unique, filesystem-safe folder name for this run.
    s.unique_folder_naming = (str(datetime.datetime.now()).replace(':', '-').replace('.', '-')
                              + '^' + str(random.randint(100000000000, 999999999999)) + '/')
    os.mkdir(str(pathlib.Path(os.path.abspath('')).parents[1])
             + '/appData/lda/lda-parameters/' + s.unique_folder_naming)

    s.write_to_json()
    s.generate_topics_gensim(num_topics=70, passes=22, chunksize=20000)

    print_two_d(s.topics_destemmed)
312+
313+
def dot_product(list_1, list_2, depth=30):
    """Return how many of the first `depth` items of list_1 also appear
    among the first `depth` items of list_2 (a crude topic-overlap score).
    Duplicates in list_1's prefix are counted once each."""
    window = list_2[0:depth]
    return sum(1 for item in list_1[0:depth] if item in window)
321+
def print_two_d(two_d):
    """Print a two-dimensional list, one row per output line."""
    for row in two_d:
        print(row)
325+
326+
327+
328+
# Sentinel marking the end of the module's definitions.
__end__ = '__end__'


if __name__ == '__main__':
    # Run the LDA smoke-test driver when executed as a script.
    run_lda()

    pass