diff --git a/featuresNLTK.ipynb b/featuresNLTK.ipynb index 7d15fdf..8ed3841 100644 --- a/featuresNLTK.ipynb +++ b/featuresNLTK.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -19,6 +19,7 @@ "import nltk\n", "from nltk.tokenize import word_tokenize\n", "from nltk.corpus import stopwords\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "nltk.download('stopwords')\n", "\n", @@ -28,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -40,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -61,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -87,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -122,16 +123,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "La cantidad de publicaciones con titulo no nulo es: 234613\n", - "La cantidad total de publicaciones es: 240000\n" - ] - } - ], + "outputs": [], "source": [ "def ejemplo():\n", " # Ejemplo de uso de las palabras mas frecuentes\n", @@ -141,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -167,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -209,6 +201,180 @@ " df_palabras = feature_cantidad_menos_frecuentes(df_train, 'titulo', 200)\n", " df_palabras[['id', 'titulo', 'cant_palabras_menos_frecuentes_titulo']].head()" ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + 
"def palabras_mas_usadas_en_mas_caros_baratos(df, col, n_palabras, n_caros_baratos, df_test=None, mirar_mas_caros=True):\n", + " \"\"\"Busca los n_caros mas caros del df, y verifica las n_palabras mas usadas.\"\"\"\n", + " \"\"\"Para cada texto de la columna col, cuenta cuantas de estas palabras contiene\"\"\"\n", + " \"\"\"Si se pasa un df_test, hace la cuenta teniendo en cuenta las palabras obtenidas en df\"\"\"\n", + " \"\"\"Si mirar_mas_caros es True, va a buscar las palabras mas frecuentos de los mas caros, si fuera False,\n", + " buscaria las palabras mas frecuentes de los mas baratos\"\"\"\n", + " \n", + " info = 'CAROS' if mirar_mas_caros else 'BARATOS'\n", + " \n", + " ver_info_a_filtrar(df, col)\n", + " df_busqueda = df.nlargest(n_caros_baratos, 'precio') if mirar_mas_caros else df.nsmallest(n_caros_baratos, 'precio')\n", + " arr_mas_frecuentes = generar_palabras_mas_frecuentes(df_busqueda, col, n_palabras)\n", + " set_mas_frecuentes = set(arr_mas_frecuentes)\n", + "\n", + " def contador_mas_frecuentes(texto):\n", + " contador = 0\n", + " palabras_del_df = word_tokenize(texto)\n", + " for palabra in palabras_del_df:\n", + " if palabra in set_mas_frecuentes:\n", + " contador = contador + 1\n", + " return contador\n", + " \n", + " df['palabras_mas_frecuentes_' + info + '_' + col] = df[~df[col].isnull()][col].apply(contador_mas_frecuentes)\n", + " df['palabras_mas_frecuentes_' + info + '_' + col] = df['palabras_mas_frecuentes_' + info + '_' + col].fillna(0)\n", + " \n", + " if df_test is None:\n", + " return df, df_test\n", + " \n", + " df_test['palabras_mas_frecuentes_' + info + '_' + col] = df_test[~df_test[col].isnull()][col].apply(contador_mas_frecuentes)\n", + " df_test['palabras_mas_frecuentes_' + info + '_' + col] = df_test['palabras_mas_frecuentes_' + info + '_' + col].fillna(0)\n", + " \n", + " return df, df_test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + 
"def ejemplo():\n", + " df = df_train\n", + " df_aut = df_test\n", + " palabras_mas_usadas_en_mas_caros_baratos(df, 'titulo', 80, 200, df_aut, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "#Es mas lenteja\n", + "\n", + "def mas_frecuentes_caros_baratos_ohe(df, col, n_palabras, n_caros_baratos, df_test=None, mirar_mas_caros=True):\n", + " \"\"\"Busca los n_caros mas caros del df, y verifica las n_palabras mas usadas.\"\"\"\n", + " \"\"\"Hace ohe de las palabras mas usadas obtenidas, sobre la columna col\"\"\"\n", + " \"\"\"Si se pasa un df_test, hace la cuenta teniendo en cuenta las palabras obtenidas en df\"\"\"\n", + " \"\"\"Si mirar_mas_caros es True, va a buscar las palabras mas frecuentos de los mas caros, si fuera False,\n", + " buscaria las palabras mas frecuentes de los mas baratos\"\"\"\n", + " \n", + " def fill_nans(df, columns, value):\n", + " for column in columns:\n", + " df[column] = df[column].fillna(value)\n", + " return df\n", + " \n", + " ver_info_a_filtrar(df, col)\n", + " df_busqueda = df.nlargest(n_caros_baratos, 'precio') if mirar_mas_caros else df.nsmallest(n_caros_baratos, 'precio')\n", + " arr_mas_frecuentes = generar_palabras_mas_frecuentes(df_busqueda, col, n_palabras)\n", + " df[col] = df[col].fillna('vacio')\n", + " cv = CountVectorizer(vocabulary=arr_mas_frecuentes) \n", + " r = pd.SparseDataFrame(cv.fit_transform(df[col]), df['id'], cv.get_feature_names(), default_fill_value=0)\n", + " r = r.reset_index()\n", + " df_merge = df.merge(r, on='id')\n", + " df_merge = fill_nans(df_merge, cv.get_feature_names(), 0)\n", + " \n", + " if df_test is None:\n", + " return df_merge, df_test\n", + " \n", + " df_test[col] = df_test[col].fillna('vacio')\n", + " r_test = pd.SparseDataFrame(cv.fit_transform(df_test[col]), df_test['id'], cv.get_feature_names(), default_fill_value=0)\n", + " r_test = r_test.reset_index()\n", + " df_merge_aux = df_test.merge(r_test, on='id')\n", + " 
df_merge_aux = fill_nans(df_merge_aux, cv.get_feature_names(), 0)\n", + " \n", + " return df_merge, df_merge_aux" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "La cantidad de publicaciones con titulo no nulo es: 234613\n", + "La cantidad total de publicaciones es: 240000\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/ipykernel_launcher.py:20: FutureWarning: SparseDataFrame is deprecated and will be removed in a future version.\n", + "Use a regular DataFrame whose columns are SparseArrays instead.\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/sparse/frame.py:257: FutureWarning: SparseSeries is deprecated and will be removed in a future version.\n", + "Use a Series with sparse values instead.\n", + "\n", + " >>> series = pd.Series(pd.SparseArray(...))\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + " sparse_index=BlockIndex(N, blocs, blens),\n", + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/sparse/frame.py:269: FutureWarning: SparseSeries is deprecated and will be removed in a future version.\n", + "Use a Series with sparse values instead.\n", + "\n", + " >>> series = pd.Series(pd.SparseArray(...))\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + " if column not in sdict\n", + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/generic.py:4583: FutureWarning: SparseSeries is deprecated and will be removed in a future version.\n", + "Use a Series with 
sparse values instead.\n", + "\n", + " >>> series = pd.Series(pd.SparseArray(...))\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + " return self._constructor(new_data).__finalize__(self)\n", + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/generic.py:5997: FutureWarning: SparseDataFrame is deprecated and will be removed in a future version.\n", + "Use a regular DataFrame whose columns are SparseArrays instead.\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + " return self._constructor(data).__finalize__(self)\n", + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/frame.py:3471: FutureWarning: SparseSeries is deprecated and will be removed in a future version.\n", + "Use a Series with sparse values instead.\n", + "\n", + " >>> series = pd.Series(pd.SparseArray(...))\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + " return klass(values, index=self.index, name=items, fastpath=True)\n", + "/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/sparse/frame.py:745: FutureWarning: SparseDataFrame is deprecated and will be removed in a future version.\n", + "Use a regular DataFrame whose columns are SparseArrays instead.\n", + "\n", + "See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n", + "\n", + " default_fill_value=self._default_fill_value,\n" + ] + } + ], + "source": [ + "def ejemplo():\n", + " df = df_train\n", + " df_aut = df_test\n", + " df_merge, df_merge_aux = mas_frecuentes_caros_baratos_ohe(df, 'titulo', 200, 1000, df_aut, True)" + ] } ], "metadata": { diff --git a/html/featuresNLTK.html b/html/featuresNLTK.html index 52d3f54..a4485f9 100644 --- a/html/featuresNLTK.html +++ b/html/featuresNLTK.html @@ 
-13076,13 +13076,14 @@
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
+from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')
@@ -13116,7 +13117,7 @@
def ver_info_a_filtrar(df, col):
@@ -13132,7 +13133,7 @@
def generar_palabras_no_queridas(arr=None):
@@ -13157,7 +13158,7 @@
def generar_palabras_mas_frecuentes(df, col, n):
@@ -13187,7 +13188,7 @@
def feature_cantidad_mas_frecuentes(df, col, n):
@@ -13237,29 +13238,10 @@
def generar_palabras_menos_frecuentes(df, col, n):
@@ -13289,7 +13271,7 @@
def feature_cantidad_menos_frecuentes(df, col, n):
@@ -13339,6 +13321,206 @@
def palabras_mas_usadas_en_mas_caros_baratos(df, col, n_palabras, n_caros_baratos, df_test=None, mirar_mas_caros=True):
    """Count, per row, the tokens of ``col`` that are frequent in extreme-price rows.

    The ``n_caros_baratos`` rows of ``df`` with the largest (or smallest)
    ``'precio'`` are selected, and ``generar_palabras_mas_frecuentes`` is
    asked for the ``n_palabras`` most frequent words of their ``col`` text.
    A column named ``'palabras_mas_frecuentes_<CAROS|BARATOS>_<col>'`` is then
    written with, for every row, the number of token occurrences of its
    ``col`` text that belong to that word set. Rows whose ``col`` is null
    get 0.

    Both ``df`` and ``df_test`` are modified in place (the new column is
    added to them), so callers may ignore the return value.

    Args:
        df: DataFrame the words are learned from; needs ``col`` and
            ``'precio'`` columns.
        col: Name of the text column to analyse.
        n_palabras: How many frequent words to keep.
        n_caros_baratos: How many extreme-price rows are mined for words.
        df_test: Optional second DataFrame that receives the same feature,
            computed with the words learned from ``df``.
        mirar_mas_caros: If True use the most expensive rows, otherwise the
            cheapest ones.

    Returns:
        The tuple ``(df, df_test)``; ``df_test`` is ``None`` when it was not
        supplied.
    """
    info = 'CAROS' if mirar_mas_caros else 'BARATOS'
    # Build the feature name once instead of re-concatenating it everywhere.
    columna = 'palabras_mas_frecuentes_' + info + '_' + col

    ver_info_a_filtrar(df, col)
    if mirar_mas_caros:
        df_busqueda = df.nlargest(n_caros_baratos, 'precio')
    else:
        df_busqueda = df.nsmallest(n_caros_baratos, 'precio')
    # A set gives O(1) membership tests inside the per-token loop.
    set_mas_frecuentes = set(generar_palabras_mas_frecuentes(df_busqueda, col, n_palabras))

    def contador_mas_frecuentes(texto):
        """Return how many tokens of ``texto`` are in the frequent-word set."""
        return sum(1 for palabra in word_tokenize(texto) if palabra in set_mas_frecuentes)

    def agregar_feature(frame):
        """Write the count column into ``frame`` in place (0 for null text)."""
        # Only non-null texts are tokenized; the assignment aligns on the
        # index, leaving NaN on the null rows, which is then replaced by 0.
        frame[columna] = frame.loc[frame[col].notnull(), col].apply(contador_mas_frecuentes)
        frame[columna] = frame[columna].fillna(0)

    agregar_feature(df)
    if df_test is not None:
        # The test frame reuses the words learned from ``df``.
        agregar_feature(df_test)

    return df, df_test
def ejemplo():
    """Usage example: count title words frequent among the 200 cheapest rows.

    Runs the word-count feature on the notebook-level ``df_train`` and
    ``df_test`` frames, using the 80 most frequent ``'titulo'`` words.
    """
    palabras_mas_usadas_en_mas_caros_baratos(
        df_train,
        'titulo',
        80,
        200,
        df_test=df_test,
        mirar_mas_caros=False,
    )
# NOTE: this one is slower.

def mas_frecuentes_caros_baratos_ohe(df, col, n_palabras, n_caros_baratos, df_test=None, mirar_mas_caros=True):
    """Add one count column per frequent word of the extreme-price rows.

    The ``n_caros_baratos`` rows of ``df`` with the largest (or smallest)
    ``'precio'`` are selected, and ``generar_palabras_mas_frecuentes`` is
    asked for the ``n_palabras`` most frequent words of their ``col`` text.
    Those words become the fixed vocabulary of a ``CountVectorizer``; each
    word yields a column holding how many times it occurs in the row's
    ``col`` text. The word columns are joined back onto the frame through
    the ``'id'`` column.

    Side effect: null values of ``col`` are replaced by the literal
    ``'vacio'`` in the *input* frames (``df`` and ``df_test``). The word
    columns themselves are only present in the returned frames.

    NOTE(review): a vocabulary word equal to an existing column name would
    clash in the merge -- verify against the real data.

    Args:
        df: DataFrame the words are learned from; needs ``'id'``,
            ``'precio'`` and ``col`` columns.
        col: Name of the text column to vectorize.
        n_palabras: How many frequent words to keep.
        n_caros_baratos: How many extreme-price rows are mined for words.
        df_test: Optional second DataFrame (with ``'id'`` and ``col``)
            encoded with the vocabulary learned from ``df``.
        mirar_mas_caros: If True use the most expensive rows, otherwise the
            cheapest ones.

    Returns:
        ``(df_merge, df_merge_aux)``: the merged train frame and the merged
        test frame. When ``df_test`` is ``None`` the second item is ``None``.
    """
    def agregar_columnas_por_palabra(frame, matriz, palabras):
        """Merge the sparse count matrix into ``frame`` using the 'id' key."""
        # pd.SparseDataFrame is deprecated (removed in pandas 1.0); a regular
        # DataFrame with sparse columns is the documented replacement.
        conteos = pd.DataFrame.sparse.from_spmatrix(matriz, columns=palabras)
        conteos.insert(0, 'id', frame['id'].values)
        combinado = frame.merge(conteos, on='id')
        # Defensive: an inner merge on 'id' should not leave missing counts.
        for palabra in palabras:
            combinado[palabra] = combinado[palabra].fillna(0)
        return combinado

    ver_info_a_filtrar(df, col)
    if mirar_mas_caros:
        df_busqueda = df.nlargest(n_caros_baratos, 'precio')
    else:
        df_busqueda = df.nsmallest(n_caros_baratos, 'precio')
    arr_mas_frecuentes = generar_palabras_mas_frecuentes(df_busqueda, col, n_palabras)

    # CountVectorizer cannot handle NaN, so nulls get a placeholder text.
    df[col] = df[col].fillna('vacio')
    cv = CountVectorizer(vocabulary=arr_mas_frecuentes)
    matriz_train = cv.fit_transform(df[col])
    # Column order of the matrix, taken from the term -> index mapping; this
    # avoids get_feature_names(), which newer scikit-learn no longer has.
    palabras = sorted(cv.vocabulary_, key=cv.vocabulary_.get)
    df_merge = agregar_columnas_por_palabra(df, matriz_train, palabras)

    if df_test is None:
        return df_merge, df_test

    df_test[col] = df_test[col].fillna('vacio')
    # The vocabulary is fixed, so the test set is only transformed.
    matriz_test = cv.transform(df_test[col])
    df_merge_aux = agregar_columnas_por_palabra(df_test, matriz_test, palabras)

    return df_merge, df_merge_aux
def ejemplo():
    """Usage example: per-word count columns from the 1000 most expensive rows.

    Runs the per-word encoding on the notebook-level ``df_train`` and
    ``df_test`` frames, using the 200 most frequent ``'titulo'`` words.
    """
    resultado = mas_frecuentes_caros_baratos_ohe(
        df_train,
        'titulo',
        200,
        1000,
        df_test=df_test,
        mirar_mas_caros=True,
    )
    df_merge, df_merge_aux = resultado
+