Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 183 additions & 17 deletions featuresNLTK.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [
{
Expand All @@ -19,6 +19,7 @@
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"nltk.download('stopwords')\n",
"\n",
Expand All @@ -28,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -40,7 +41,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -61,7 +62,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -87,7 +88,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -122,16 +123,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"La cantidad de publicaciones con titulo no nulo es: 234613\n",
"La cantidad total de publicaciones es: 240000\n"
]
}
],
"outputs": [],
"source": [
"def ejemplo():\n",
" # Ejemplo de uso de las palabras mas frecuentes\n",
Expand All @@ -141,7 +133,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -167,7 +159,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -209,6 +201,180 @@
" df_palabras = feature_cantidad_menos_frecuentes(df_train, 'titulo', 200)\n",
" df_palabras[['id', 'titulo', 'cant_palabras_menos_frecuentes_titulo']].head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def palabras_mas_usadas_en_mas_caros_baratos(df, col, n_palabras, n_caros_baratos, df_test=None, mirar_mas_caros=True):\n",
"    \"\"\"Count, per row, how many top words of the priciest/cheapest listings appear in `col`.\n",
"\n",
"    Takes the `n_caros_baratos` most expensive listings of `df` (or the\n",
"    cheapest ones when `mirar_mas_caros` is False), extracts their\n",
"    `n_palabras` most frequent words, and adds a column to `df` counting\n",
"    how many of those words each row's `col` text contains.\n",
"    If `df_test` is given, the same count — using the word set derived\n",
"    from `df` — is added to it as well.  Returns (df, df_test).\n",
"    \"\"\"\n",
"    \n",
"    info = 'CAROS' if mirar_mas_caros else 'BARATOS'\n",
"    \n",
"    ver_info_a_filtrar(df, col)\n",
"    df_busqueda = df.nlargest(n_caros_baratos, 'precio') if mirar_mas_caros else df.nsmallest(n_caros_baratos, 'precio')\n",
"    arr_mas_frecuentes = generar_palabras_mas_frecuentes(df_busqueda, col, n_palabras)\n",
"    # A set gives O(1) membership tests inside the per-row counter below.\n",
"    set_mas_frecuentes = set(arr_mas_frecuentes)\n",
"\n",
"    def contador_mas_frecuentes(texto):\n",
"        # Number of tokens of `texto` that belong to the frequent-word set.\n",
"        contador = 0\n",
"        palabras_del_df = word_tokenize(texto)\n",
"        for palabra in palabras_del_df:\n",
"            if palabra in set_mas_frecuentes:\n",
"                contador = contador + 1\n",
"        return contador\n",
"    \n",
"    # Only non-null texts are counted; rows with null `col` get 0 via fillna.\n",
"    df['palabras_mas_frecuentes_' + info + '_' + col] = df[~df[col].isnull()][col].apply(contador_mas_frecuentes)\n",
"    df['palabras_mas_frecuentes_' + info + '_' + col] = df['palabras_mas_frecuentes_' + info + '_' + col].fillna(0)\n",
"    \n",
"    if df_test is None:\n",
"        return df, df_test\n",
"    \n",
"    df_test['palabras_mas_frecuentes_' + info + '_' + col] = df_test[~df_test[col].isnull()][col].apply(contador_mas_frecuentes)\n",
"    df_test['palabras_mas_frecuentes_' + info + '_' + col] = df_test['palabras_mas_frecuentes_' + info + '_' + col].fillna(0)\n",
"    \n",
"    return df, df_test"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"def ejemplo():\n",
"    # Example usage: for each title, count occurrences of the 80 most\n",
"    # frequent words among the 200 CHEAPEST listings (mirar_mas_caros=False),\n",
"    # on both train and test.  Relies on notebook globals df_train / df_test.\n",
"    df = df_train\n",
"    df_aut = df_test\n",
"    palabras_mas_usadas_en_mas_caros_baratos(df, 'titulo', 80, 200, df_aut, False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# NOTE: slower ('mas lenteja') than the count-based variant above.\n",
"\n",
"def mas_frecuentes_caros_baratos_ohe(df, col, n_palabras, n_caros_baratos, df_test=None, mirar_mas_caros=True):\n",
"    \"\"\"One-hot encode, over column `col`, the top words of the priciest/cheapest listings.\n",
"\n",
"    Takes the `n_caros_baratos` most expensive listings of `df` (cheapest\n",
"    when `mirar_mas_caros` is False), extracts their `n_palabras` most\n",
"    frequent words, and merges one count column per word into `df`.\n",
"    If `df_test` is given, it is encoded with the vocabulary derived from\n",
"    `df`.  Returns (df_merge, df_merge_aux).\n",
"    \"\"\"\n",
"    \n",
"    def fill_nans(df, columns, value):\n",
"        # Replace NaNs left by the merge with `value` in the given columns.\n",
"        for column in columns:\n",
"            df[column] = df[column].fillna(value)\n",
"        return df\n",
"    \n",
"    ver_info_a_filtrar(df, col)\n",
"    df_busqueda = df.nlargest(n_caros_baratos, 'precio') if mirar_mas_caros else df.nsmallest(n_caros_baratos, 'precio')\n",
"    arr_mas_frecuentes = generar_palabras_mas_frecuentes(df_busqueda, col, n_palabras)\n",
"    df[col] = df[col].fillna('vacio')\n",
"    cv = CountVectorizer(vocabulary=arr_mas_frecuentes)\n",
"    # FIXME: pd.SparseDataFrame is deprecated (see the FutureWarnings in this\n",
"    # cell's output); migrate to pd.DataFrame.sparse.from_spmatrix on upgrade.\n",
"    r = pd.SparseDataFrame(cv.fit_transform(df[col]), df['id'], cv.get_feature_names(), default_fill_value=0)\n",
"    r = r.reset_index()\n",
"    df_merge = df.merge(r, on='id')\n",
"    df_merge = fill_nans(df_merge, cv.get_feature_names(), 0)\n",
"    \n",
"    if df_test is None:\n",
"        return df_merge, df_test\n",
"    \n",
"    df_test[col] = df_test[col].fillna('vacio')\n",
"    # Use transform (not fit_transform) on the test set: the vocabulary must\n",
"    # stay the one learned from the training dataframe.\n",
"    r_test = pd.SparseDataFrame(cv.transform(df_test[col]), df_test['id'], cv.get_feature_names(), default_fill_value=0)\n",
"    r_test = r_test.reset_index()\n",
"    df_merge_aux = df_test.merge(r_test, on='id')\n",
"    df_merge_aux = fill_nans(df_merge_aux, cv.get_feature_names(), 0)\n",
"    \n",
"    return df_merge, df_merge_aux"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"La cantidad de publicaciones con titulo no nulo es: 234613\n",
"La cantidad total de publicaciones es: 240000\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/ipykernel_launcher.py:20: FutureWarning: SparseDataFrame is deprecated and will be removed in a future version.\n",
"Use a regular DataFrame whose columns are SparseArrays instead.\n",
"\n",
"See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n",
"\n",
"/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/sparse/frame.py:257: FutureWarning: SparseSeries is deprecated and will be removed in a future version.\n",
"Use a Series with sparse values instead.\n",
"\n",
" >>> series = pd.Series(pd.SparseArray(...))\n",
"\n",
"See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n",
"\n",
" sparse_index=BlockIndex(N, blocs, blens),\n",
"/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/sparse/frame.py:269: FutureWarning: SparseSeries is deprecated and will be removed in a future version.\n",
"Use a Series with sparse values instead.\n",
"\n",
" >>> series = pd.Series(pd.SparseArray(...))\n",
"\n",
"See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n",
"\n",
" if column not in sdict\n",
"/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/generic.py:4583: FutureWarning: SparseSeries is deprecated and will be removed in a future version.\n",
"Use a Series with sparse values instead.\n",
"\n",
" >>> series = pd.Series(pd.SparseArray(...))\n",
"\n",
"See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n",
"\n",
" return self._constructor(new_data).__finalize__(self)\n",
"/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/generic.py:5997: FutureWarning: SparseDataFrame is deprecated and will be removed in a future version.\n",
"Use a regular DataFrame whose columns are SparseArrays instead.\n",
"\n",
"See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n",
"\n",
" return self._constructor(data).__finalize__(self)\n",
"/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/frame.py:3471: FutureWarning: SparseSeries is deprecated and will be removed in a future version.\n",
"Use a Series with sparse values instead.\n",
"\n",
" >>> series = pd.Series(pd.SparseArray(...))\n",
"\n",
"See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n",
"\n",
" return klass(values, index=self.index, name=items, fastpath=True)\n",
"/home/tomas/Escritorio/kmeans/datos-tp2/.venv/lib/python3.6/site-packages/pandas/core/sparse/frame.py:745: FutureWarning: SparseDataFrame is deprecated and will be removed in a future version.\n",
"Use a regular DataFrame whose columns are SparseArrays instead.\n",
"\n",
"See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.\n",
"\n",
" default_fill_value=self._default_fill_value,\n"
]
}
],
"source": [
"def ejemplo():\n",
"    # Example usage: one-hot encode, over titles, the 200 most frequent\n",
"    # words among the 1000 MOST EXPENSIVE listings (mirar_mas_caros=True),\n",
"    # on both train and test.  Relies on notebook globals df_train / df_test.\n",
"    df = df_train\n",
"    df_aut = df_test\n",
"    df_merge, df_merge_aux = mas_frecuentes_caros_baratos_ohe(df, 'titulo', 200, 1000, df_aut, True)"
]
}
],
"metadata": {
Expand Down
Loading