From 19757bcbb12e1922a1b5d83b3e01fd6a12eaeaf7 Mon Sep 17 00:00:00 2001 From: tlofano Date: Tue, 3 Dec 2019 01:35:33 -0300 Subject: [PATCH 1/3] Mas frecuentes en los k mas baratos/caros --- featuresNLTK.ipynb | 484 ++++++++++++++++++++++++++++++++++++++++- html/featuresNLTK.html | 408 +++++++++++++++++++++++++++++++++- 2 files changed, 884 insertions(+), 8 deletions(-) diff --git a/featuresNLTK.ipynb b/featuresNLTK.ipynb index 7d15fdf..d7d4e89 100644 --- a/featuresNLTK.ipynb +++ b/featuresNLTK.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -61,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -209,6 +209,482 @@ " df_palabras = feature_cantidad_menos_frecuentes(df_train, 'titulo', 200)\n", " df_palabras[['id', 'titulo', 'cant_palabras_menos_frecuentes_titulo']].head()" ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "def palabras_mas_usadas_en_mas_caros_baratos(df, col, n_palabras, n_caros_baratos, df_test=None, mirar_mas_caros=True):\n", + " \"\"\"Busca los n_caros mas caros del df, y verifica las n_palabras mas usadas.\"\"\"\n", + " \"\"\"Para cada texto de la columna col, cuenta cuantas de estas palabras contiene\"\"\"\n", + " \"\"\"Si se pasa un df_test, hace la cuenta teniendo en cuenta las palabras obtenidas en df\"\"\"\n", + " \"\"\"Si mirar_mas_caros es True, va a buscar las palabras mas frecuentos de los mas caros, si fuera False,\n", + " buscaria las palabras mas frecuentes de los mas baratos\"\"\"\n", + " \n", + " info = 'CAROS' if mirar_mas_caros else 'BARATOS'\n", + " \n", + " ver_info_a_filtrar(df, col)\n", + " df_busqueda = df.nlargest(n_caros_baratos, 'precio') if mirar_mas_caros else df.nsmallest(n_caros_baratos, 'precio')\n", + " arr_mas_frecuentes = generar_palabras_mas_frecuentes(df_busqueda, col, n_palabras)\n", + " set_mas_frecuentes = set(arr_mas_frecuentes)\n", + "\n", + " def contador_mas_frecuentes(texto):\n", + " contador = 0\n", + " palabras_del_df = word_tokenize(texto)\n", + " for palabra in palabras_del_df:\n", + " if palabra in set_mas_frecuentes:\n", + " contador = contador + 1\n", + " return contador\n", + " \n", + " df['palabras_mas_frecuentes_' + info + '_' + col] = df[~df[col].isnull()][col].apply(contador_mas_frecuentes)\n", + " df['palabras_mas_frecuentes_' + info + '_' + col] = df['palabras_mas_frecuentes_' + info + '_' + col].fillna(0)\n", + " \n", + " if df_test is None:\n", + " return df, df_test\n", + " \n", + " df_test['palabras_mas_frecuentes_' + info + '_' + col] = df_test[~df_test[col].isnull()][col].apply(contador_mas_frecuentes)\n", + " df_test['palabras_mas_frecuentes_' + info + '_' + col] = df_test['palabras_mas_frecuentes_' + info + '_' + col].fillna(0)\n", + " \n", + " return df, df_test" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "La cantidad de publicaciones con titulo no nulo es: 234613\n", + "La cantidad total de publicaciones es: 240000\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtitulodescripciontipodepropiedaddireccionciudadprovinciaantiguedadhabitacionesgarages...latlngfechagimnasiousosmultiplespiscinaescuelascercanascentroscomercialescercanospreciopalabras_mas_frecuentes_BARATOS_titulo
0254099depto. tipo a-402depto. interior de 80.15m2, consta de sala com...ApartamentoAvenida Division del Norte 2005Benito JuárezDistrito FederalNaN2.01.0...NaNNaN2015-08-23 00:00:000.00.00.00.00.02273000.01.0
153461condominio horizontal en venta<p>entre sonora y guerrero, atr&aacute;s del h...Casa en condominioAV. MEXICOLa Magdalena ContrerasDistrito Federal10.03.02.0...19.310205-99.2276552013-06-28 00:00:000.00.00.01.01.03600000.02.0
2247984casa en venta urbi 3 recamaras tonaladescripcion \\nla mejor ubicacion residencial e...CasaUrbi TonalaTonaláJalisco5.03.02.0...NaNNaN2015-10-17 00:00:000.00.00.00.00.01200000.05.0
3209067casa sola en toluca zinacantepec con credito i...casa en privada con caseta de vigilancia casas...CasaIGNACIO MANUEL ALTAMIRANO 128ZinacantepecEdo. de México1.02.01.0...19.301890-99.6880152012-03-09 00:00:000.00.00.01.01.0650000.02.0
4185997paseos del solbonito departamento en excelentes condiciones ...ApartamentoPASEOS DEL SOLZapopanJalisco10.02.01.0...NaNNaN2016-06-07 00:00:000.00.00.00.00.01150000.02.0
..................................................................
239995119879bonita casas de 2 recamaras a 10 minutos del c...vendo casa en bosques de ica residencial a 10 ...CasaBOSQUESZinacantepecEdo. de México0.02.02.0...NaNNaN2015-02-08 00:00:000.00.00.00.00.0650000.04.0
239996259178casa en condominio a 10 min. del centro de tolucacasa con un jardin amplio, un cuarto de servic...CasaFiliberto Navas 325TolucaEdo. de México0.03.03.0...19.294665-99.6929162014-07-10 00:00:000.00.00.01.01.01940000.03.0
239997131932nicolas san juandepartamento con excelente ubicación, muy cerc...ApartamentoNicolas San JuanBenito JuárezDistrito Federal20.02.01.0...NaNNaN2015-03-03 00:00:000.00.00.00.00.03400000.02.0
239998146867casa sola. javier rojo gomez.casa sola, dividida en cuatro departamentos de...CasaJavier Rojo Gomez 120IztapalapaDistrito Federal20.04.00.0...19.366651-99.0822462014-12-26 00:00:001.00.00.01.01.02890000.01.0
239999121958departamento en bosques de las lomas / av. st...id:19816, muy bonito e iluminado departamento,...ApartamentoAVE. STIMCuajimalpa de MorelosDistrito Federal1.03.02.0...NaNNaN2015-06-19 00:00:000.00.00.00.00.03650000.01.0
\n", + "

240000 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " id titulo \\\n", + "0 254099 depto. tipo a-402 \n", + "1 53461 condominio horizontal en venta \n", + "2 247984 casa en venta urbi 3 recamaras tonala \n", + "3 209067 casa sola en toluca zinacantepec con credito i... \n", + "4 185997 paseos del sol \n", + "... ... ... \n", + "239995 119879 bonita casas de 2 recamaras a 10 minutos del c... \n", + "239996 259178 casa en condominio a 10 min. del centro de toluca \n", + "239997 131932 nicolas san juan \n", + "239998 146867 casa sola. javier rojo gomez. \n", + "239999 121958 departamento en bosques de las lomas / av. st... \n", + "\n", + " descripcion tipodepropiedad \\\n", + "0 depto. interior de 80.15m2, consta de sala com... Apartamento \n", + "1

entre sonora y guerrero, atrás del h... Casa en condominio \n", + "2 descripcion \\nla mejor ubicacion residencial e... Casa \n", + "3 casa en privada con caseta de vigilancia casas... Casa \n", + "4 bonito departamento en excelentes condiciones ... Apartamento \n", + "... ... ... \n", + "239995 vendo casa en bosques de ica residencial a 10 ... Casa \n", + "239996 casa con un jardin amplio, un cuarto de servic... Casa \n", + "239997 departamento con excelente ubicación, muy cerc... Apartamento \n", + "239998 casa sola, dividida en cuatro departamentos de... Casa \n", + "239999 id:19816, muy bonito e iluminado departamento,... Apartamento \n", + "\n", + " direccion ciudad \\\n", + "0 Avenida Division del Norte 2005 Benito Juárez \n", + "1 AV. MEXICO La Magdalena Contreras \n", + "2 Urbi Tonala Tonalá \n", + "3 IGNACIO MANUEL ALTAMIRANO 128 Zinacantepec \n", + "4 PASEOS DEL SOL Zapopan \n", + "... ... ... \n", + "239995 BOSQUES Zinacantepec \n", + "239996 Filiberto Navas 325 Toluca \n", + "239997 Nicolas San Juan Benito Juárez \n", + "239998 Javier Rojo Gomez 120 Iztapalapa \n", + "239999 AVE. STIM Cuajimalpa de Morelos \n", + "\n", + " provincia antiguedad habitaciones garages ... lat \\\n", + "0 Distrito Federal NaN 2.0 1.0 ... NaN \n", + "1 Distrito Federal 10.0 3.0 2.0 ... 19.310205 \n", + "2 Jalisco 5.0 3.0 2.0 ... NaN \n", + "3 Edo. de México 1.0 2.0 1.0 ... 19.301890 \n", + "4 Jalisco 10.0 2.0 1.0 ... NaN \n", + "... ... ... ... ... ... ... \n", + "239995 Edo. de México 0.0 2.0 2.0 ... NaN \n", + "239996 Edo. de México 0.0 3.0 3.0 ... 19.294665 \n", + "239997 Distrito Federal 20.0 2.0 1.0 ... NaN \n", + "239998 Distrito Federal 20.0 4.0 0.0 ... 19.366651 \n", + "239999 Distrito Federal 1.0 3.0 2.0 ... NaN \n", + "\n", + " lng fecha gimnasio usosmultiples piscina \\\n", + "0 NaN 2015-08-23 00:00:00 0.0 0.0 0.0 \n", + "1 -99.227655 2013-06-28 00:00:00 0.0 0.0 0.0 \n", + "2 NaN 2015-10-17 00:00:00 0.0 0.0 0.0 \n", + "3 -99.688015 2012-03-09 00:00:00 0.0 0.0 0.0 \n", + "4 NaN 2016-06-07 00:00:00 0.0 0.0 0.0 \n", + "... ... ... ... ... ... \n", + "239995 NaN 2015-02-08 00:00:00 0.0 0.0 0.0 \n", + "239996 -99.692916 2014-07-10 00:00:00 0.0 0.0 0.0 \n", + "239997 NaN 2015-03-03 00:00:00 0.0 0.0 0.0 \n", + "239998 -99.082246 2014-12-26 00:00:00 1.0 0.0 0.0 \n", + "239999 NaN 2015-06-19 00:00:00 0.0 0.0 0.0 \n", + "\n", + " escuelascercanas centroscomercialescercanos precio \\\n", + "0 0.0 0.0 2273000.0 \n", + "1 1.0 1.0 3600000.0 \n", + "2 0.0 0.0 1200000.0 \n", + "3 1.0 1.0 650000.0 \n", + "4 0.0 0.0 1150000.0 \n", + "... ... ... ... \n", + "239995 0.0 0.0 650000.0 \n", + "239996 1.0 1.0 1940000.0 \n", + "239997 0.0 0.0 3400000.0 \n", + "239998 1.0 1.0 2890000.0 \n", + "239999 0.0 0.0 3650000.0 \n", + "\n", + " palabras_mas_frecuentes_BARATOS_titulo \n", + "0 1.0 \n", + "1 2.0 \n", + "2 5.0 \n", + "3 2.0 \n", + "4 2.0 \n", + "... ... \n", + "239995 4.0 \n", + "239996 3.0 \n", + "239997 2.0 \n", + "239998 1.0 \n", + "239999 1.0 \n", + "\n", + "[240000 rows x 24 columns]" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def ejemplo():\n", + " df = df_train\n", + " df_aut = df_test\n", + " palabras_mas_usadas_en_mas_caros_baratos(df, 'titulo', 80, 200, df_aut, False)" + ] } ], "metadata": { diff --git a/html/featuresNLTK.html b/html/featuresNLTK.html index 52d3f54..397620e 100644 --- a/html/featuresNLTK.html +++ b/html/featuresNLTK.html @@ -13076,7 +13076,7 @@

-
In [2]:
+
In [1]:
import pandas as pd
@@ -13116,7 +13116,7 @@
 
-
In [3]:
+
In [24]:
def ver_info_a_filtrar(df, col):
@@ -13132,7 +13132,7 @@
 
-
In [4]:
+
In [8]:
def generar_palabras_no_queridas(arr=None):
@@ -13157,7 +13157,7 @@
 
-
In [5]:
+
In [11]:
def generar_palabras_mas_frecuentes(df, col, n):
@@ -13339,6 +13339,406 @@
 
+
+
+
+
In [63]:
+
+
+
def palabras_mas_usadas_en_mas_caros_baratos(df, col, n_palabras, n_caros_baratos, df_test=None, mirar_mas_caros=True):
+    """Busca los n_caros mas caros del df, y verifica las n_palabras mas usadas."""
+    """Para cada texto de la columna col, cuenta cuantas de estas palabras contiene"""
+    """Si se pasa un df_test, hace la cuenta teniendo en cuenta las palabras obtenidas en df"""
+    """Si mirar_mas_caros es True, va a buscar las palabras mas frecuentos de los mas caros, si fuera False,
+    buscaria las palabras mas frecuentes de los mas baratos"""
+    
+    info = 'CAROS' if mirar_mas_caros else 'BARATOS'
+    
+    ver_info_a_filtrar(df, col)
+    df_busqueda = df.nlargest(n_caros_baratos, 'precio') if mirar_mas_caros else df.nsmallest(n_caros_baratos, 'precio')
+    arr_mas_frecuentes = generar_palabras_mas_frecuentes(df_busqueda, col, n_palabras)
+    set_mas_frecuentes = set(arr_mas_frecuentes)
+
+    def contador_mas_frecuentes(texto):
+        contador = 0
+        palabras_del_df = word_tokenize(texto)
+        for palabra in palabras_del_df:
+            if palabra in set_mas_frecuentes:
+                contador = contador + 1
+        return contador
+    
+    df['palabras_mas_frecuentes_' + info + '_' + col] = df[~df[col].isnull()][col].apply(contador_mas_frecuentes)
+    df['palabras_mas_frecuentes_' + info + '_' + col] = df['palabras_mas_frecuentes_' + info + '_' + col].fillna(0)
+    
+    if df_test is None:
+        return df, df_test
+    
+    df_test['palabras_mas_frecuentes_' + info + '_' + col] = df_test[~df_test[col].isnull()][col].apply(contador_mas_frecuentes)
+    df_test['palabras_mas_frecuentes_' + info + '_' + col] = df_test['palabras_mas_frecuentes_' + info + '_' + col].fillna(0)
+    
+    return df, df_test
+
+ +
+
+
+ +
+
+
+
In [62]:
+
+
+
def ejemplo():
+    df = df_train
+    df_aut = df_test
+    palabras_mas_usadas_en_mas_caros_baratos(df, 'titulo', 80, 200, df_aut, False)
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
La cantidad de publicaciones con titulo no nulo es: 234613
+La cantidad total de publicaciones es: 240000
+
+
+
+ +
+ +
Out[62]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
idtitulodescripciontipodepropiedaddireccionciudadprovinciaantiguedadhabitacionesgarages...latlngfechagimnasiousosmultiplespiscinaescuelascercanascentroscomercialescercanospreciopalabras_mas_frecuentes_BARATOS_titulo
0254099depto. tipo a-402depto. interior de 80.15m2, consta de sala com...ApartamentoAvenida Division del Norte 2005Benito JuárezDistrito FederalNaN2.01.0...NaNNaN2015-08-23 00:00:000.00.00.00.00.02273000.01.0
153461condominio horizontal en venta<p>entre sonora y guerrero, atr&aacute;s del h...Casa en condominioAV. MEXICOLa Magdalena ContrerasDistrito Federal10.03.02.0...19.310205-99.2276552013-06-28 00:00:000.00.00.01.01.03600000.02.0
2247984casa en venta urbi 3 recamaras tonaladescripcion \nla mejor ubicacion residencial e...CasaUrbi TonalaTonaláJalisco5.03.02.0...NaNNaN2015-10-17 00:00:000.00.00.00.00.01200000.05.0
3209067casa sola en toluca zinacantepec con credito i...casa en privada con caseta de vigilancia casas...CasaIGNACIO MANUEL ALTAMIRANO 128ZinacantepecEdo. de México1.02.01.0...19.301890-99.6880152012-03-09 00:00:000.00.00.01.01.0650000.02.0
4185997paseos del solbonito departamento en excelentes condiciones ...ApartamentoPASEOS DEL SOLZapopanJalisco10.02.01.0...NaNNaN2016-06-07 00:00:000.00.00.00.00.01150000.02.0
..................................................................
239995119879bonita casas de 2 recamaras a 10 minutos del c...vendo casa en bosques de ica residencial a 10 ...CasaBOSQUESZinacantepecEdo. de México0.02.02.0...NaNNaN2015-02-08 00:00:000.00.00.00.00.0650000.04.0
239996259178casa en condominio a 10 min. del centro de tolucacasa con un jardin amplio, un cuarto de servic...CasaFiliberto Navas 325TolucaEdo. de México0.03.03.0...19.294665-99.6929162014-07-10 00:00:000.00.00.01.01.01940000.03.0
239997131932nicolas san juandepartamento con excelente ubicación, muy cerc...ApartamentoNicolas San JuanBenito JuárezDistrito Federal20.02.01.0...NaNNaN2015-03-03 00:00:000.00.00.00.00.03400000.02.0
239998146867casa sola. javier rojo gomez.casa sola, dividida en cuatro departamentos de...CasaJavier Rojo Gomez 120IztapalapaDistrito Federal20.04.00.0...19.366651-99.0822462014-12-26 00:00:001.00.00.01.01.02890000.01.0
239999121958departamento en bosques de las lomas / av. st...id:19816, muy bonito e iluminado departamento,...ApartamentoAVE. STIMCuajimalpa de MorelosDistrito Federal1.03.02.0...NaNNaN2015-06-19 00:00:000.00.00.00.00.03650000.01.0
+

240000 rows × 24 columns

+
+
+ +
+ +
+
+
From 1498f1aa117b5d31604c5e2fa65bc2d98f6c7f9a Mon Sep 17 00:00:00 2001 From: tlofano Date: Tue, 3 Dec 2019 03:03:14 -0300 Subject: [PATCH 2/3] ohe de las palabras mas frecuentes en las propiedades mas caras/baratas --- featuresNLTK.ipynb | 558 +++++++++-------------------------------- html/featuresNLTK.html | 460 +++++++++------------------------ 2 files changed, 247 insertions(+), 771 deletions(-) diff --git a/featuresNLTK.ipynb b/featuresNLTK.ipynb index d7d4e89..4d0bc3b 100644 --- a/featuresNLTK.ipynb +++ b/featuresNLTK.ipynb @@ -19,6 +19,7 @@ "import nltk\n", "from nltk.tokenize import word_tokenize\n", "from nltk.corpus import stopwords\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "nltk.download('stopwords')\n", "\n", @@ -28,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -40,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -61,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -87,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -122,16 +123,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "La cantidad de publicaciones con titulo no nulo es: 234613\n", - "La cantidad total de publicaciones es: 240000\n" - ] - } - ], + "outputs": [], "source": [ "def ejemplo():\n", " # Ejemplo de uso de las palabras mas frecuentes\n", @@ -141,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -167,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -212,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -252,7 +244,63 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "def ejemplo():\n", + " df = df_train\n", + " df_aut = df_test\n", + " palabras_mas_usadas_en_mas_caros_baratos(df, 'titulo', 80, 200, df_aut, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "#Es mas lenteja\n", + "\n", + "def mas_frecuentes_caros_baratos_ohe(df, col, n_palabras, n_caros_baratos, df_test=None, mirar_mas_caros=True):\n", + " \"\"\"Busca los n_caros mas caros del df, y verifica las n_palabras mas usadas.\"\"\"\n", + " \"\"\"Hace ohe de las palabras mas usadas obtenidas, sobre la columna col\"\"\"\n", + " \"\"\"Si se pasa un df_test, hace la cuenta teniendo en cuenta las palabras obtenidas en df\"\"\"\n", + " \"\"\"Si mirar_mas_caros es True, va a buscar las palabras mas frecuentos de los mas caros, si fuera False,\n", + " buscaria las palabras mas frecuentes de los mas baratos\"\"\"\n", + " \n", + " def fill_nans(df, columns, value):\n", + " for column in columns:\n", + " df[column] = df[column].fillna(value)\n", + " return df\n", + " \n", + " ver_info_a_filtrar(df, col)\n", + " df_busqueda = df.nlargest(n_caros_baratos, 'precio') if mirar_mas_caros else df.nsmallest(n_caros_baratos, 'precio')\n", + " arr_mas_frecuentes = generar_palabras_mas_frecuentes(df_busqueda, col, n_palabras)\n", + " df[col] = df[col].fillna('vacio')\n", + " cv = CountVectorizer(vocabulary=arr_mas_frecuentes) \n", + " r = pd.SparseDataFrame(cv.fit_transform(df[col]), df['id'], cv.get_feature_names(), default_fill_value=0)\n", + " r = r.reset_index()\n", + " df_merge = df.merge(r, on='id')\n", + " df_merge = fill_nans(df_merge, cv.get_feature_names(), 0)\n", + " \n", + " if df_test is None:\n", + " return df_merge, df_test\n", + " \n", + " df_test[col] = df_test[col].fillna('vacio')\n", + " r_test = pd.SparseDataFrame(cv.fit_transform(df_test[col]), df_test['id'], cv.get_feature_names(), default_fill_value=0)\n", + " r_test = r_test.reset_index()\n", + " df_merge_aux = df_test.merge(r_test, on='id')\n", + " df_merge_aux = fill_nans(df_merge_aux, cv.get_feature_names(), 0)\n", + " \n", + " return df_merge, df_merge_aux" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "scrolled": false }, @@ -266,424 +314,68 @@ ] }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idtitulodescripciontipodepropiedaddireccionciudadprovinciaantiguedadhabitacionesgarages...latlngfechagimnasiousosmultiplespiscinaescuelascercanascentroscomercialescercanospreciopalabras_mas_frecuentes_BARATOS_titulo
0254099depto. tipo a-402depto. interior de 80.15m2, consta de sala com...ApartamentoAvenida Division del Norte 2005Benito JuárezDistrito FederalNaN2.01.0...NaNNaN2015-08-23 00:00:000.00.00.00.00.02273000.01.0
153461condominio horizontal en venta<p>entre sonora y guerrero, atr&aacute;s del h...Casa en condominioAV. MEXICOLa Magdalena ContrerasDistrito Federal10.03.02.0...19.310205-99.2276552013-06-28 00:00:000.00.00.01.01.03600000.02.0
2247984casa en venta urbi 3 recamaras tonaladescripcion \\nla mejor ubicacion residencial e...CasaUrbi TonalaTonaláJalisco5.03.02.0...NaNNaN2015-10-17 00:00:000.00.00.00.00.01200000.05.0
3209067casa sola en toluca zinacantepec con credito i...casa en privada con caseta de vigilancia casas...CasaIGNACIO MANUEL ALTAMIRANO 128ZinacantepecEdo. de México1.02.01.0...19.301890-99.6880152012-03-09 00:00:000.00.00.01.01.0650000.02.0
4185997paseos del solbonito departamento en excelentes condiciones ...ApartamentoPASEOS DEL SOLZapopanJalisco10.02.01.0...NaNNaN2016-06-07 00:00:000.00.00.00.00.01150000.02.0
..................................................................
239995119879bonita casas de 2 recamaras a 10 minutos del c...vendo casa en bosques de ica residencial a 10 ...CasaBOSQUESZinacantepecEdo. de México0.02.02.0...NaNNaN2015-02-08 00:00:000.00.00.00.00.0650000.04.0
239996259178casa en condominio a 10 min. del centro de tolucacasa con un jardin amplio, un cuarto de servic...CasaFiliberto Navas 325TolucaEdo. de México0.03.03.0...19.294665-99.6929162014-07-10 00:00:000.00.00.01.01.01940000.03.0
239997131932nicolas san juandepartamento con excelente ubicación, muy cerc...ApartamentoNicolas San JuanBenito JuárezDistrito Federal20.02.01.0...NaNNaN2015-03-03 00:00:000.00.00.00.00.03400000.02.0
239998146867casa sola. javier rojo gomez.casa sola, dividida en cuatro departamentos de...CasaJavier Rojo Gomez 120IztapalapaDistrito Federal20.04.00.0...19.366651-99.0822462014-12-26 00:00:001.00.00.01.01.02890000.01.0
239999121958departamento en bosques de las lomas / av. st...id:19816, muy bonito e iluminado departamento,...ApartamentoAVE. STIMCuajimalpa de MorelosDistrito Federal1.03.02.0...NaNNaN2015-06-19 00:00:000.00.00.00.00.03650000.01.0
\n", - "

240000 rows × 24 columns

\n", - "
" - ], - "text/plain": [ - " id titulo \\\n", - "0 254099 depto. tipo a-402 \n", - "1 53461 condominio horizontal en venta \n", - "2 247984 casa en venta urbi 3 recamaras tonala \n", - "3 209067 casa sola en toluca zinacantepec con credito i... \n", - "4 185997 paseos del sol \n", - "... ... ... \n", - "239995 119879 bonita casas de 2 recamaras a 10 minutos del c... \n", - "239996 259178 casa en condominio a 10 min. del centro de toluca \n", - "239997 131932 nicolas san juan \n", - "239998 146867 casa sola. javier rojo gomez. \n", - "239999 121958 departamento en bosques de las lomas / av. st... \n", - "\n", - " descripcion tipodepropiedad \\\n", - "0 depto. interior de 80.15m2, consta de sala com... Apartamento \n", - "1

entre sonora y guerrero, atrás del h... Casa en condominio \n", - "2 descripcion \\nla mejor ubicacion residencial e... Casa \n", - "3 casa en privada con caseta de vigilancia casas... Casa \n", - "4 bonito departamento en excelentes condiciones ... Apartamento \n", - "... ... ... \n", - "239995 vendo casa en bosques de ica residencial a 10 ... Casa \n", - "239996 casa con un jardin amplio, un cuarto de servic... Casa \n", - "239997 departamento con excelente ubicación, muy cerc... Apartamento \n", - "239998 casa sola, dividida en cuatro departamentos de... Casa \n", - "239999 id:19816, muy bonito e iluminado departamento,... Apartamento \n", - "\n", - " direccion ciudad \\\n", - "0 Avenida Division del Norte 2005 Benito Juárez \n", - "1 AV. MEXICO La Magdalena Contreras \n", - "2 Urbi Tonala Tonalá \n", - "3 IGNACIO MANUEL ALTAMIRANO 128 Zinacantepec \n", - "4 PASEOS DEL SOL Zapopan \n", - "... ... ... \n", - "239995 BOSQUES Zinacantepec \n", - "239996 Filiberto Navas 325 Toluca \n", - "239997 Nicolas San Juan Benito Juárez \n", - "239998 Javier Rojo Gomez 120 Iztapalapa \n", - "239999 AVE. STIM Cuajimalpa de Morelos \n", - "\n", - " provincia antiguedad habitaciones garages ... lat \\\n", - "0 Distrito Federal NaN 2.0 1.0 ... NaN \n", - "1 Distrito Federal 10.0 3.0 2.0 ... 19.310205 \n", - "2 Jalisco 5.0 3.0 2.0 ... NaN \n", - "3 Edo. de México 1.0 2.0 1.0 ... 19.301890 \n", - "4 Jalisco 10.0 2.0 1.0 ... NaN \n", - "... ... ... ... ... ... ... \n", - "239995 Edo. de México 0.0 2.0 2.0 ... NaN \n", - "239996 Edo. de México 0.0 3.0 3.0 ... 19.294665 \n", - "239997 Distrito Federal 20.0 2.0 1.0 ... NaN \n", - "239998 Distrito Federal 20.0 4.0 0.0 ... 19.366651 \n", - "239999 Distrito Federal 1.0 3.0 2.0 ... NaN \n", - "\n", - " lng fecha gimnasio usosmultiples piscina \\\n", - "0 NaN 2015-08-23 00:00:00 0.0 0.0 0.0 \n", - "1 -99.227655 2013-06-28 00:00:00 0.0 0.0 0.0 \n", - "2 NaN 2015-10-17 00:00:00 0.0 0.0 0.0 \n", - "3 -99.688015 2012-03-09 00:00:00 0.0 0.0 0.0 \n", - "4 NaN 2016-06-07 00:00:00 0.0 0.0 0.0 \n", - "... ... ... ... ... ... \n", - "239995 NaN 2015-02-08 00:00:00 0.0 0.0 0.0 \n", - "239996 -99.692916 2014-07-10 00:00:00 0.0 0.0 0.0 \n", - "239997 NaN 2015-03-03 00:00:00 0.0 0.0 0.0 \n", - "239998 -99.082246 2014-12-26 00:00:00 1.0 0.0 0.0 \n", - "239999 NaN 2015-06-19 00:00:00 0.0 0.0 0.0 \n", - "\n", - " escuelascercanas centroscomercialescercanos precio \\\n", - "0 0.0 0.0 2273000.0 \n", - "1 1.0 1.0 3600000.0 \n", - "2 0.0 0.0 1200000.0 \n", - "3 1.0 1.0 650000.0 \n", - "4 0.0 0.0 1150000.0 \n", - "... ... ... ... \n", - "239995 0.0 0.0 650000.0 \n", - "239996 1.0 1.0 1940000.0 \n", - "239997 0.0 0.0 3400000.0 \n", - "239998 1.0 1.0 2890000.0 \n", - "239999 0.0 0.0 3650000.0 \n", - "\n", - " palabras_mas_frecuentes_BARATOS_titulo \n", - "0 1.0 \n", - "1 2.0 \n", - "2 5.0 \n", - "3 2.0 \n", - "4 2.0 \n", - "... ... \n", - "239995 4.0 \n", - "239996 3.0 \n", - "239997 2.0 \n", - "239998 1.0 \n", - "239999 1.0 \n", - "\n", - "[240000 rows x 24 columns]" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/ipykernel_launcher.py:20: FutureWarning: SparseDataFrame is deprecated and will be removed in a future version.\n", + "Use a regular DataFrame whose columns are SparseArrays instead.\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/sparse/frame.py:257: FutureWarning: SparseSeries is deprecated and will be removed in a future version.\n", + "Use a Series with sparse values instead.\n", + "\n", + " >>> series = pd.Series(pd.SparseArray(...))\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + " sparse_index=BlockIndex(N, blocs, blens),\n", + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/sparse/frame.py:269: FutureWarning: SparseSeries is deprecated and will be removed in a future version.\n", + "Use a Series with sparse values instead.\n", + "\n", + " >>> series = pd.Series(pd.SparseArray(...))\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + " if column not in sdict\n", + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/generic.py:4583: FutureWarning: SparseSeries is deprecated and will be removed in a future version.\n", + "Use a Series with sparse values instead.\n", + "\n", + " >>> series = pd.Series(pd.SparseArray(...))\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + " return self._constructor(new_data).__finalize__(self)\n", + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/generic.py:5997: FutureWarning: SparseDataFrame is deprecated and will be removed in a future version.\n", + "Use a regular DataFrame whose columns are SparseArrays instead.\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + " return self._constructor(data).__finalize__(self)\n", + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/frame.py:3471: FutureWarning: SparseSeries is deprecated and will be removed in a future version.\n", + "Use a Series with sparse values instead.\n", + "\n", + " >>> series = pd.Series(pd.SparseArray(...))\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + " return klass(values, index=self.index, name=items, fastpath=True)\n", + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/sparse/frame.py:745: FutureWarning: SparseDataFrame is deprecated and will be removed in a future version.\n", + "Use a regular DataFrame whose columns are SparseArrays instead.\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + " default_fill_value=self._default_fill_value,\n" + ] } ], "source": [ - "def ejemplo():\n", - " df = df_train\n", - " df_aut = df_test\n", - " palabras_mas_usadas_en_mas_caros_baratos(df, 'titulo', 80, 200, df_aut, False)" + "# def ejemplo():\n", + "df = df_train\n", + "df_aut = df_test\n", + "df_merge, df_merge_aux = mas_frecuentes_caros_baratos_ohe(df, 'titulo', 200, 1000, df_aut, True)\n", + "display(df_merge)\n", + "display(df_merge_aux)" ] } ], diff --git a/html/featuresNLTK.html b/html/featuresNLTK.html index 397620e..d0457d8 100644 --- a/html/featuresNLTK.html +++ b/html/featuresNLTK.html @@ -13083,6 +13083,7 @@ import nltk from nltk.tokenize import word_tokenize from nltk.corpus import stopwords +from sklearn.feature_extraction.text import CountVectorizer nltk.download('stopwords') @@ -13116,7 +13117,7 @@

-
In [24]:
+
In [2]:
def ver_info_a_filtrar(df, col):
@@ -13132,7 +13133,7 @@
 
-
In [8]:
+
In [3]:
def generar_palabras_no_queridas(arr=None):
@@ -13157,7 +13158,7 @@
 
-
In [11]:
+
In [4]:
def generar_palabras_mas_frecuentes(df, col, n):
@@ -13187,7 +13188,7 @@
 
-
In [6]:
+
In [5]:
def feature_cantidad_mas_frecuentes(df, col, n):
@@ -13237,29 +13238,10 @@
 
-
-
- - -
- -
- - -
-
La cantidad de publicaciones con titulo no nulo es: 234613
-La cantidad total de publicaciones es: 240000
-
-
-
- -
-
-
-
In [ ]:
+
In [6]:
def generar_palabras_menos_frecuentes(df, col, n):
@@ -13289,7 +13271,7 @@
 
-
In [ ]:
+
In [7]:
def feature_cantidad_menos_frecuentes(df, col, n):
@@ -13342,7 +13324,7 @@
 
-
In [63]:
+
In [8]:
def palabras_mas_usadas_en_mas_caros_baratos(df, col, n_palabras, n_caros_baratos, df_test=None, mirar_mas_caros=True):
@@ -13386,7 +13368,7 @@
 
-
In [62]:
+
In [ ]:
def ejemplo():
@@ -13399,6 +13381,70 @@
 
+
+
+
+
In [9]:
+
+
+
#Es mas lenteja
+
+def mas_frecuentes_caros_baratos_ohe(df, col, n_palabras, n_caros_baratos, df_test=None, mirar_mas_caros=True):
+    """Busca los n_caros mas caros del df, y verifica las n_palabras mas usadas."""
+    """Hace ohe de las palabras mas usadas obtenidas, sobre la columna col"""
+    """Si se pasa un df_test, hace la cuenta teniendo en cuenta las palabras obtenidas en df"""
+    """Si mirar_mas_caros es True, va a buscar las palabras mas frecuentos de los mas caros, si fuera False,
+    buscaria las palabras mas frecuentes de los mas baratos"""
+    
+    def fill_nans(df, columns, value):
+        for column in columns:
+            df[column] = df[column].fillna(value)
+        return df
+    
+    ver_info_a_filtrar(df, col)
+    df_busqueda = df.nlargest(n_caros_baratos, 'precio') if mirar_mas_caros else df.nsmallest(n_caros_baratos, 'precio')
+    arr_mas_frecuentes = generar_palabras_mas_frecuentes(df_busqueda, col, n_palabras)
+    df[col] = df[col].fillna('vacio')
+    cv = CountVectorizer(vocabulary=arr_mas_frecuentes)    
+    r = pd.SparseDataFrame(cv.fit_transform(df[col]), df['id'], cv.get_feature_names(), default_fill_value=0)
+    r = r.reset_index()
+    df_merge = df.merge(r, on='id')
+    df_merge = fill_nans(df_merge, cv.get_feature_names(), 0)
+    
+    if df_test is None:
+        return df_merge, df_test
+    
+    df_test[col] = df_test[col].fillna('vacio')
+    r_test = pd.SparseDataFrame(cv.fit_transform(df_test[col]), df_test['id'], cv.get_feature_names(), default_fill_value=0)
+    r_test = r_test.reset_index()
+    df_merge_aux = df_test.merge(r_test, on='id')
+    df_merge_aux = fill_nans(df_merge_aux, cv.get_feature_names(), 0)
+    
+    return df_merge, df_merge_aux
+
+ +
+
+
+ +
+
+
+
In [ ]:
+
+
+
# def ejemplo():
+df = df_train
+df_aut = df_test
+df_merge, df_merge_aux = mas_frecuentes_caros_baratos_ohe(df, 'titulo', 200, 1000, df_aut, True)
+display(df_merge)
+display(df_merge_aux)
+
+ +
+
+
+
@@ -13417,323 +13463,61 @@
-
Out[62]:
+
+
+
/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/ipykernel_launcher.py:20: FutureWarning: SparseDataFrame is deprecated and will be removed in a future version.
+Use a regular DataFrame whose columns are SparseArrays instead.
 
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
idtitulodescripciontipodepropiedaddireccionciudadprovinciaantiguedadhabitacionesgarages...latlngfechagimnasiousosmultiplespiscinaescuelascercanascentroscomercialescercanospreciopalabras_mas_frecuentes_BARATOS_titulo
0254099depto. tipo a-402depto. interior de 80.15m2, consta de sala com...ApartamentoAvenida Division del Norte 2005Benito JuárezDistrito FederalNaN2.01.0...NaNNaN2015-08-23 00:00:000.00.00.00.00.02273000.01.0
153461condominio horizontal en venta<p>entre sonora y guerrero, atr&aacute;s del h...Casa en condominioAV. MEXICOLa Magdalena ContrerasDistrito Federal10.03.02.0...19.310205-99.2276552013-06-28 00:00:000.00.00.01.01.03600000.02.0
2247984casa en venta urbi 3 recamaras tonaladescripcion \nla mejor ubicacion residencial e...CasaUrbi TonalaTonaláJalisco5.03.02.0...NaNNaN2015-10-17 00:00:000.00.00.00.00.01200000.05.0
3209067casa sola en toluca zinacantepec con credito i...casa en privada con caseta de vigilancia casas...CasaIGNACIO MANUEL ALTAMIRANO 128ZinacantepecEdo. de México1.02.01.0...19.301890-99.6880152012-03-09 00:00:000.00.00.01.01.0650000.02.0
4185997paseos del solbonito departamento en excelentes condiciones ...ApartamentoPASEOS DEL SOLZapopanJalisco10.02.01.0...NaNNaN2016-06-07 00:00:000.00.00.00.00.01150000.02.0
..................................................................
239995119879bonita casas de 2 recamaras a 10 minutos del c...vendo casa en bosques de ica residencial a 10 ...CasaBOSQUESZinacantepecEdo. de México0.02.02.0...NaNNaN2015-02-08 00:00:000.00.00.00.00.0650000.04.0
239996259178casa en condominio a 10 min. del centro de tolucacasa con un jardin amplio, un cuarto de servic...CasaFiliberto Navas 325TolucaEdo. de México0.03.03.0...19.294665-99.6929162014-07-10 00:00:000.00.00.01.01.01940000.03.0
239997131932nicolas san juandepartamento con excelente ubicación, muy cerc...ApartamentoNicolas San JuanBenito JuárezDistrito Federal20.02.01.0...NaNNaN2015-03-03 00:00:000.00.00.00.00.03400000.02.0
239998146867casa sola. javier rojo gomez.casa sola, dividida en cuatro departamentos de...CasaJavier Rojo Gomez 120IztapalapaDistrito Federal20.04.00.0...19.366651-99.0822462014-12-26 00:00:001.00.00.01.01.02890000.01.0
239999121958departamento en bosques de las lomas / av. st...id:19816, muy bonito e iluminado departamento,...ApartamentoAVE. STIMCuajimalpa de MorelosDistrito Federal1.03.02.0...NaNNaN2015-06-19 00:00:000.00.00.00.00.03650000.01.0
-

240000 rows × 24 columns

-
-
+ >>> series = pd.Series(pd.SparseArray(...)) + +See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more. + + sparse_index=BlockIndex(N, blocs, blens), +/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/sparse/frame.py:269: FutureWarning: SparseSeries is deprecated and will be removed in a future version. +Use a Series with sparse values instead. + + >>> series = pd.Series(pd.SparseArray(...)) + +See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more. + + if column not in sdict +/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/generic.py:4583: FutureWarning: SparseSeries is deprecated and will be removed in a future version. +Use a Series with sparse values instead. + + >>> series = pd.Series(pd.SparseArray(...)) +See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more. + + return self._constructor(new_data).__finalize__(self) +/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/generic.py:5997: FutureWarning: SparseDataFrame is deprecated and will be removed in a future version. +Use a regular DataFrame whose columns are SparseArrays instead. + +See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more. + + return self._constructor(data).__finalize__(self) +/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/frame.py:3471: FutureWarning: SparseSeries is deprecated and will be removed in a future version. +Use a Series with sparse values instead. + + >>> series = pd.Series(pd.SparseArray(...)) + +See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more. + + return klass(values, index=self.index, name=items, fastpath=True) +/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/sparse/frame.py:745: FutureWarning: SparseDataFrame is deprecated and will be removed in a future version. +Use a regular DataFrame whose columns are SparseArrays instead. + +See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more. + + default_fill_value=self._default_fill_value, +
+
From 0a7c75a6c4650a73a3dbdf69179b47ef2e6289a7 Mon Sep 17 00:00:00 2001 From: tlofano Date: Tue, 3 Dec 2019 03:08:30 -0300 Subject: [PATCH 3/3] Fuera del hilo de ejecucion principal --- featuresNLTK.ipynb | 10 ++++------ html/featuresNLTK.html | 10 ++++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/featuresNLTK.ipynb b/featuresNLTK.ipynb index 4d0bc3b..8ed3841 100644 --- a/featuresNLTK.ipynb +++ b/featuresNLTK.ipynb @@ -370,12 +370,10 @@ } ], "source": [ - "# def ejemplo():\n", - "df = df_train\n", - "df_aut = df_test\n", - "df_merge, df_merge_aux = mas_frecuentes_caros_baratos_ohe(df, 'titulo', 200, 1000, df_aut, True)\n", - "display(df_merge)\n", - "display(df_merge_aux)" + "def ejemplo():\n", + " df = df_train\n", + " df_aut = df_test\n", + " df_merge, df_merge_aux = mas_frecuentes_caros_baratos_ohe(df, 'titulo', 200, 1000, df_aut, True)" ] } ], diff --git a/html/featuresNLTK.html b/html/featuresNLTK.html index d0457d8..a4485f9 100644 --- a/html/featuresNLTK.html +++ b/html/featuresNLTK.html @@ -13433,12 +13433,10 @@
In [ ]:
-
# def ejemplo():
-df = df_train
-df_aut = df_test
-df_merge, df_merge_aux = mas_frecuentes_caros_baratos_ohe(df, 'titulo', 200, 1000, df_aut, True)
-display(df_merge)
-display(df_merge_aux)
+
def ejemplo():
+    df = df_train
+    df_aut = df_test
+    df_merge, df_merge_aux = mas_frecuentes_caros_baratos_ohe(df, 'titulo', 200, 1000, df_aut, True)