From ba5d9aa2c28a7e8aae6a1f5c5560845cc77a3a26 Mon Sep 17 00:00:00 2001
From: nicolasfredesfranco <nicolas.fredes.13@sansano.usm.cl>
Date: Fri, 8 Jun 2018 09:14:50 -0400
Subject: [PATCH] preprocesamiento y diccionario listos

Se hace el diccionario de frecuencias, y el preprocesamiento está listo.
---
 ppsing.py | 23 ++++++++++++++++-------
 tweet2.py |  3 ++-
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/ppsing.py b/ppsing.py
index d9a69c5..8d68d36 100644
--- a/ppsing.py
+++ b/ppsing.py
@@ -4,11 +4,11 @@
 
 
 
-abc={'q','w','e','r','t','y','u','i','o','p','a','s','d','f','g','h','j','k','l','z','x','c','v','b','n','m',' ','.'}
+abc={'q','w','e','r','t','y','u','i','o','p','a','s','d','f','g','h','j','k','l','z','x','c','v','b','n','m',' ','.'}#,'1','2','3','4','5','6','7','8','9'}
 
 stop_words=['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se', 'las', 'por', 'un', 'para', 'con', 'no', 'una', 'su', 'al', 'lo', 'como', 'mas', 'pero', 'sus', 'le', 'ya', 'o', 'este', 'si', 'porque', 'esta', 'entre', 'cuando', 'muy', 'sin', 'sobre', 'tambien', 'me', 'hasta', 'hay', 'donde', 'quien', 'desde', 'todo', 'nos', 'durante', 'todos', 'uno', 'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'e', 'esto', 'mi', 'antes', 'algunos', 'que', 'unos', 'yo', 'otro', 'otras', 'otra', 'el', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada', 'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo', 'nosotros', 'mi', 'mis', 'tu', 'te', 'ti', 'tu', 'tus', 'ellas', 'nosotras', 'vosostros', 'vosostras', 'os', 'mio', 'mia', 'mios', 'mias', 'tuyo', 'tuya', 'tuyos', 'tuyas', 'suyo', 'suya', 'suyos', 'suyas', 'nuestro', 'nuestra', 'nuestros', 'nuestras', 'vuestro', 'vuestra', 'vuestros', 'vuestras', 'esos', 'esas', 'estoy', 'estas', 'esta', 'estamos', 'estais', 'estan', 'este', 'estes', 'estemos', 'esteis', 'esten', 'estare', 'estaras', 'estara', 'estaremos', 'estareis', 'estaran', 'estaria', 'estarias', 'estariamos', 'estariais', 'estarian', 'estaba', 'estabas', 'estabamos', 'estabais', 'estaban', 'estuve', 'estuviste', 'estuvo', 'estuvimos', 'estuvisteis', 'estuvieron', 'estuviera', 'estuvieras', 'estuvieramos', 'estuvierais', 'estuvieran', 'estuviese', 'estuvieses', 'estuviesemos', 'estuvieseis', 'estuviesen', 'estando', 'estado', 'estada', 'estados', 'estadas', 'estad', 'he', 'has', 'ha', 'hemos', 'habeis', 'han', 'haya', 'hayas', 'hayamos', 'hayais', 'hayan', 'habre', 'habras', 'habra', 'habremos', 'habreis', 'habran', 'habria', 'habrias', 'habriamos', 'habriais', 'habrian', 'habia', 'habias', 'habiamos', 'habiais', 'habian', 'hube', 'hubiste', 'hubo', 'hubimos', 'hubisteis', 'hubieron', 'hubiera', 'hubieras', 'hubieramos', 'hubierais', 'hubieran', 'hubiese', 'hubieses', 'hubiesemos', 'hubieseis', 'hubiesen', 'habiendo', 'habido', 
'habida', 'habidos', 'habidas', 'soy', 'eres', 'es', 'somos', 'sois', 'son', 'sea', 'seas', 'seamos', 'seais', 'sean', 'sere', 'seras', 'sera', 'seremos', 'sereis', 'seran', 'seria', 'serias', 'seriamos', 'seriais', 'serian', 'era', 'eras', 'eramos', 'erais', 'eran', 'fui', 'fuiste', 'fue', 'fuimos', 'fuisteis', 'fueron', 'fuera', 'fueras', 'fueramos', 'fuerais', 'fueran', 'fuese', 'fueses', 'fuesemos', 'fueseis', 'fuesen', 'sintiendo', 'sentido', 'sentida', 'sentidos', 'sentidas', 'siente', 'sentid', 'tengo', 'tienes', 'tiene', 'tenemos', 'teneis', 'tienen', 'tenga', 'tengas', 'tengamos', 'tengais', 'tengan', 'tendre', 'tendras', 'tendra', 'tendremos', 'tendreis', 'tendran', 'tendria', 'tendrias', 'tendriamos', 'tendriais', 'tendrian', 'tenia', 'tenias', 'teniamos', 'teniais', 'tenian', 'tuve', 'tuviste', 'tuvo', 'tuvimos', 'tuvisteis', 'tuvieron', 'tuviera', 'tuvieras', 'tuvieramos', 'tuvierais', 'tuvieran', 'tuviese', 'tuvieses', 'tuviesemos', 'tuvieseis', 'tuviesen', 'teniendo', 'tenido', 'tenida', 'tenidos', 'tenidas', 'tened','asi','q','d']
 
-frec_x_word=dic()
+frec_words=dict()
 
 #esta funcion elmina todos los signos que no son palabras
 def elimina_sign(text):
@@ -24,8 +24,8 @@ def elimina_tildes(s):
 
 
 def processing(text):
-    output_data=list()        
-    output_dic
+    output=list()        
+    
     #quita el enlace
     text=text.split('https')[0]
 
@@ -45,9 +45,18 @@ def processing(text):
     for fr in frases:
         if len(fr)!=0: 
             words_sw = word_tokenize(fr)
-            #words=[]
-            #for i
-            output.append(list(filter(lambda x: x not in stop_words, words_sw)))
+            words=[]
+            for i in words_sw:
+                if i not in stop_words:
+                    words.append(i)
+                    
+                    if i not in frec_words.keys():
+                        frec_words[i]=1;
+                    else:
+                        frec_words[i]=frec_words[i]+1
+
+
+            output.append(words)
 
     return output
 
diff --git a/tweet2.py b/tweet2.py
index a43fdcc..4b493e4 100644
--- a/tweet2.py
+++ b/tweet2.py
@@ -48,6 +48,7 @@ def get_all_tweets(num_tweets):
 
     num_tweets=int(input('ingrese cantidad de tweets:'))
     tweets=get_all_tweets(num_tweets)
-    print(tweets)    
+    print(tweets)
+    print(ppsing.frec_words)