diff --git a/BOWpractice.ipynb b/BOWpractice.ipynb
new file mode 100644
index 0000000..4c5f9d3
--- /dev/null
+++ b/BOWpractice.ipynb
@@ -0,0 +1,1808 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "authorship_tag": "ABX9TyOqk0a1WDXiVZJhkncj4kXi",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "669Bj7r8lcn1"
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df = pd.read_csv(\"/content/email_text.csv\")"
+ ],
+ "metadata": {
+ "id": "eb6ElgqmRjSV"
+ },
+ "execution_count": 2,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.head(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 363
+ },
+ "id": "oJkvZkyCSQ_D",
+ "outputId": "aa439ac5-03e3-4552-c486-59fb2a55257c"
+ },
+ "execution_count": 3,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " label text\n",
+ "0 0 user id enrondlr pw bnawebescapenumber origina...\n",
+ "1 0 hi chris tonight we are rolling out a new repo...\n",
+ "2 0 rika r these new original message from thomas ...\n",
+ "3 0 john gerald we are currently trading under gtc...\n",
+ "4 0 gerald and stacy attached is a worksheet for a...\n",
+ "5 0 fyi below is a copy of my communication with m...\n",
+ "6 0 pg e gt nw plans lateral across washington sta...\n",
+ "7 0 mark i am working with the east power desk to ...\n",
+ "8 0 oops here it is kal original message from shah...\n",
+ "9 0 mark and charlie fmpa is ready to bill us for ..."
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " label \n",
+ " text \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 0 \n",
+ " user id enrondlr pw bnawebescapenumber origina... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 0 \n",
+ " hi chris tonight we are rolling out a new repo... \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 0 \n",
+ " rika r these new original message from thomas ... \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 0 \n",
+ " john gerald we are currently trading under gtc... \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 0 \n",
+ " gerald and stacy attached is a worksheet for a... \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " 0 \n",
+ " fyi below is a copy of my communication with m... \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " 0 \n",
+ " pg e gt nw plans lateral across washington sta... \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " 0 \n",
+ " mark i am working with the east power desk to ... \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " 0 \n",
+ " oops here it is kal original message from shah... \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " 0 \n",
+ " mark and charlie fmpa is ready to bill us for ... \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "df",
+ "summary": "{\n \"name\": \"df\",\n \"rows\": 17415,\n \"fields\": [\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 17415,\n \"samples\": [\n \"greetings i would like to take this opportunity to introduce our service please examine the information below and let me know if you have any inquiry we are accepting your m ortgage requirement there is no problem if you have bad cr edit you can get a escapenumber escapenumber loan for a escapenumber monthly payment approval procedure will only take escapenumber minute just visit the link below and fill out the quick and easy form http www masfre info aqwcwdpep thank you for your time best regards forrest rich general manager\",\n \"http www synetix com ammonia original appointment from taylor michael e sent thursday august escapenumber escapenumber escapenumber escapenumber pm to massey ii john woods trevor salhotra rajneesh subject nox model when friday august escapenumber escapenumber escapenumber escapenumber pm escapenumber escapenumber pm gmt escapenumber escapenumber central time us canada where escapenumber feedback suggestions on nox model\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 3
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 178
+ },
+ "id": "16a94e53",
+ "outputId": "06fcc323-589b-4abb-93ee-a1f7d87241b2"
+ },
+ "source": [
+ "label_distribution = df['label'].value_counts()\n",
+ "display(label_distribution)"
+ ],
+ "execution_count": 4,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "label\n",
+ "0 9840\n",
+ "1 7575\n",
+ "Name: count, dtype: int64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count \n",
+ " \n",
+ " \n",
+ " label \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 9840 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 7575 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "cda0fd8f"
+ },
+ "source": [
+ "The above output shows the count of each unique value in the 'label' column. We can also visualize this distribution using a bar plot to better understand the proportion of each label."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 444
+ },
+ "id": "5cdd3939",
+ "outputId": "328e4335-843c-4731-edbf-d7ef61b974e9"
+ },
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "# Bar chart of the row count for each value of the 'label' column.\n",
+ "fig, ax = plt.subplots(figsize=(8, 6))\n",
+ "sns.barplot(\n",
+ "    x=label_distribution.index,\n",
+ "    y=label_distribution.values,\n",
+ "    ax=ax,\n",
+ ")\n",
+ "ax.set_title('Distribution of the Label Column')\n",
+ "ax.set_xlabel('Label')\n",
+ "ax.set_ylabel('Count')\n",
+ "# Keep the tick labels horizontal.\n",
+ "ax.tick_params(axis='x', labelrotation=0)\n",
+ "plt.show()"
+ ],
+ "execution_count": 5,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAskAAAIjCAYAAADx6oYJAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAOkdJREFUeJzt3XtUVXX+//EXyFUU8JIgRoiXTM201JC8JyOlVkw6RTl5GS/VoKWWFpWmllGaeJ/MmSnvk+YvzdEyEbxMSmo0XkdNGy3LASyFI5YgsH9/9GUvzwe8gMhBez7WOmt1Pp/32fu9YXt8td3nc9wsy7IEAAAAwObu6gYAAACAyoaQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAygzMaPHy83N7cK2VeXLl3UpUsX+/mmTZvk5uamFStWVMj+BwwYoPr161fIvsoqJydHgwcPVnBwsNzc3DRixIhSb6Pod/rjjz+Wf4PXyLFjx+Tm5qa333673LZZdH5t2rSp3LZZGub5DqDiEZIBSJLmz58vNzc3++Hj46OQkBBFR0dr5syZOnPmTLns58SJExo/frx27dpVLtsrT5W5tyvxxhtvaP78+Xr66ae1aNEiPfHEE5esXbVqVcU1Zyg637788kuX9VCevvnmGz355JNq0KCBfHx85O/vr/bt22vGjBn65ZdfXN0egDLwcHUDACqXiRMnKjw8XOfPn1d6ero2bdqkESNGKDExUatXr9Ydd9xh177yyit68cUXS7X9EydOaMKECapfv75atWp1xa9bv359qfZTFpfq7a9//asKCwuveQ9XIyUlRe3atdOrr7562do33nhDffr0UUxMzLVv7Aa3du1a/eEPf5C3t7f69eun22+/XXl5efr88881evRo7d+/X/PmzXN1mwBKiZAMwMn999+vNm3a2M/j4+OVkpKiXr166cEHH9SBAwfk6+srSfLw8JCHx7V9G/n5559VtWpVeXl5XdP9XI6np6dL938lMjMz1axZM1e38Zty9OhRxcbGKiwsTCkpKapbt649FxcXpyNHjmjt2rUu7BBAWXG7BYDLuvfeezV27Fh9++23Wrx4sT1e0j3JSUlJ6tChgwIDA1WtWjU1adJEL730kqRf7/Ns27atJGngwIH2rR3z58+X9Ot9mLfffrvS0tLUqVMnVa1a1X7txe7RLCgo0EsvvaTg4GD5+fnpwQcf1PHjx51q6tevrwEDBhR77YXbvFxvJd2TfPbsWT333HMKDQ2Vt7e3mjRporfffluWZTnVubm5adiwYVq1apVuv/12eXt7q3nz5lq3bl3JP3BDZmamBg0apKCgIPn4+Khly5ZasGCBPV90/+zRo0e1du1au/djx46VuD03NzedPXtWCxYssGvNn09WVpYGDBigwMBABQQEaODAgfr555+LbWvx4sVq3bq1fH19VbNmTcXGxhb7+ZdVXl6exo0bp9atWysgIEB+fn7q2LGjNm7ceNHXTJs2TWFhYfL19VXnzp21b9++YjUHDx5Unz59VLNmTfn4+KhNmzZavXp1mXqcPHmycnJy9Pe//90pIBdp1KiRnn32Wft5fn6+XnvtNTVs2FDe3t6qX7++XnrpJeXm5l5yP0W3p5i/05LunS76c7Rnzx517txZVatWVaNGjez79zdv3qyIiAj5+vqqSZMm2rBhg9M2i/5cHzly5IrOAeBGRUgGcEWK7m+91G0P+/fvV69evZSbm6uJEydq6tSpevDBB7V161ZJUtOmTTVx4kRJ0tChQ7Vo0SItWrRInTp1srfx008/6f7771erVq00ffp0de3a9ZJ9TZo0SWvXrtULL7ygZ555RklJSYqKiir1faBX0tuFLMvSgw8+qGnTpum+++5TYmKimjRpotGjR2vUqFHF6j///HP9+c9/VmxsrCZPnqxz586pd+/e+umnny7Z1y+//KIu
Xbpo0aJF6tu3r6ZMmaKAgAANGDBAM2bMsHtftGiRateurVatWtm933TTTSVuc9GiRfL29lbHjh3t2ieffNKp5pFHHtGZM2eUkJCgRx55RPPnz9eECROcaiZNmqR+/fqpcePGSkxM1IgRI5ScnKxOnTopKyvrksd1JRwOh/72t7+pS5cueuuttzR+/HidPHlS0dHRJd43vnDhQs2cOVNxcXGKj4/Xvn37dO+99yojI8Ou2b9/v9q1a6cDBw7oxRdf1NSpU+Xn56eYmBitXLmy1D3+85//VIMGDXTPPfdcUf3gwYM1btw43XXXXZo2bZo6d+6shIQExcbGlnrfl3L69Gn16tVLERERmjx5sry9vRUbG6tly5YpNjZWPXr00JtvvqmzZ8+qT58+JX7m4ErOAeCGZgGAZVnvv/++JcnauXPnRWsCAgKsO++8037+6quvWhe+jUybNs2SZJ08efKi29i5c6clyXr//feLzXXu3NmSZM2dO7fEuc6dO9vPN27caEmy6tWrZzkcDnt8+fLlliRrxowZ9lhYWJjVv3//y27zUr3179/fCgsLs5+vWrXKkmS9/vrrTnV9+vSx3NzcrCNHjthjkiwvLy+nsd27d1uSrFmzZhXb14WmT59uSbIWL15sj+Xl5VmRkZFWtWrVnI49LCzM6tmz5yW3V8TPz6/En0nR7/RPf/qT0/jvf/97q1atWvbzY8eOWVWqVLEmTZrkVLd3717Lw8Oj2LjpSs63/Px8Kzc312ns9OnTVlBQkFN/R48etSRZvr6+1vfff2+Pb9++3ZJkjRw50h7r1q2b1aJFC+vcuXP2WGFhoXXPPfdYjRs3tseKzq+NGzdetL/s7GxLkvXQQw9d8liL7Nq1y5JkDR482Gn8+eeftyRZKSkp9ph5bhb9vI4ePer02pL6LPpztHTpUnvs4MGDliTL3d3d+uKLL+zxzz77rNg5f6XnAHCj40oygCtWrVq1S65yERgYKEn6+OOPy/whN29vbw0cOPCK6/v166fq1avbz/v06aO6devqk08+KdP+r9Qnn3yiKlWq6JlnnnEaf+6552RZlj799FOn8aioKDVs2NB+fscdd8jf31///e9/L7uf4OBgPfbYY/aYp6ennnnmGeXk5Gjz5s3lcDTFPfXUU07PO3bsqJ9++kkOh0OS9NFHH6mwsFCPPPKIfvzxR/sRHBysxo0bX/KWiCtVpUoV+170wsJCnTp1Svn5+WrTpo2++uqrYvUxMTGqV6+e/fzuu+9WRESEfS6cOnVKKSkp9hXSop5/+uknRUdH6/Dhw/rhhx+uuL+in8WF59+lFPVh/kvDc889J0nleu9ytWrVnK5ON2nSRIGBgWratKkiIiLs8aL/Luk8vNw5ANzoCMkArlhOTs4lA8Gjjz6q9u3ba/DgwQoKClJsbKyWL19eqsBcr169Un1Ir3Hjxk7P3dzc1KhRo4vej1tevv32W4WEhBT7eTRt2tSev9Att9xSbBs1atTQ6dOnL7ufxo0by93d+e36YvspL2a/NWrUkCS738OHD8uyLDVu3Fg33XST0+PAgQPKzMwslz4WLFigO+64Qz4+PqpVq5ZuuukmrV27VtnZ2cVqzXNBkm699Vb7XDhy5Igsy9LYsWOL9Vy0Ikhp+vb395ekK14e8dtvv5W7u7saNWrkNB4cHKzAwMBy/V3efPPNxT4vEBAQoNDQ0GJjkko8Dy93DgA3Ola3AHBFvv/+e2VnZxf7C/5Cvr6+2rJlizZu3Ki1a9dq3bp1WrZsme69916tX79eVapUuex+ilbOKE8X+8KTgoKCK+qpPFxsP5bxIb/K4nL9FhYWys3NTZ9++mmJtdWqVbvqHhYvXqwBAwYoJiZGo0ePVp06dVSlShUlJCTom2++KfX2iv5n7fnnn1d0dHSJNZc6v03+/v4KCQkp8cOBl1KWL+C51Dlckov9/kpzHl5v5yxQ3gjJAK7IokWLJOmi4aKIu7u7unXrpm7duikxMVFv
vPGGXn75ZW3cuFFRUVHl/g19hw8fdnpuWZaOHDnitJ5zjRo1Svwg2bfffqsGDRrYz0vTW1hYmDZs2KAzZ844XU0+ePCgPV8ewsLCtGfPHhUWFjpdTb7a/Vzt76Fhw4ayLEvh4eG69dZbr2pbF7NixQo1aNBAH330kVO/F1sH2jwXJOnrr7+2VyUp+l17enoqKiqqXHrs1auX5s2bp9TUVEVGRl6yNiwsTIWFhTp8+LD9LwGSlJGRoaysrEv+Louu4prn8bX6lwQA3G4B4AqkpKTotddeU3h4uPr27XvRulOnThUbK/pSjqIlrvz8/CQV/8u+rBYuXOj0z90rVqzQ//73P91///32WMOGDfXFF18oLy/PHluzZk2xpcpK01uPHj1UUFCg2bNnO41PmzZNbm5uTvu/Gj169FB6erqWLVtmj+Xn52vWrFmqVq2aOnfuXKbt+vn5XdXv4OGHH1aVKlU0YcKEYlcWLcu67KodV6LoSuaF29++fbtSU1NLrF+1apXTPcU7duzQ9u3b7d9FnTp11KVLF7377rv63//+V+z1J0+eLHWPY8aMkZ+fnwYPHuy0ikaRb775xl6FpEePHpKk6dOnO9UkJiZKknr27HnR/RTdz75lyxZ7rKCggC8pAa4hriQDcPLpp5/q4MGDys/PV0ZGhlJSUpSUlKSwsDCtXr1aPj4+F33txIkTtWXLFvXs2VNhYWHKzMzUX/7yF918883q0KGDpF//sg8MDNTcuXNVvXp1+fn5KSIiQuHh4WXqt2bNmurQoYMGDhyojIwMTZ8+XY0aNdKQIUPsmsGDB2vFihW677779Mgjj+ibb77R4sWLnT5IV9reHnjgAXXt2lUvv/yyjh07ppYtW2r9+vX6+OOPNWLEiGLbLquhQ4fq3Xff1YABA5SWlqb69etrxYoV2rp1q6ZPn37FHxoztW7dWhs2bFBiYqJCQkIUHh7u9IGuy2nYsKFef/11xcfH69ixY4qJiVH16tV19OhRrVy5UkOHDtXzzz9/2e289957Ja4X/eyzz6pXr1766KOP9Pvf/149e/bU0aNHNXfuXDVr1kw5OTnFXtOoUSN16NBBTz/9tHJzczV9+nTVqlVLY8aMsWvmzJmjDh06qEWLFhoyZIgaNGigjIwMpaam6vvvv9fu3buv+GdQ9HNYunSpHn30UTVt2tTpG/e2bdumDz/80F6DumXLlurfv7/mzZunrKwsde7cWTt27NCCBQsUExNzyeUOmzdvrnbt2ik+Pl6nTp1SzZo19cEHHyg/P79U/QIoBdcsqgGgsilaYqro4eXlZQUHB1u/+93vrBkzZjgtNVbEXAIuOTnZeuihh6yQkBDLy8vLCgkJsR577DHr66+/dnrdxx9/bDVr1szy8PBwWn6qc+fOVvPmzUvs72JLwP3jH/+w4uPjrTp16li+vr5Wz549rW+//bbY66dOnWrVq1fP8vb2ttq3b299+eWXxbZ5qd7MJeAsy7LOnDljjRw50goJCbE8PT2txo0bW1OmTLEKCwud6iRZcXFxxXq62NJ0poyMDGvgwIFW7dq1LS8vL6tFixYlLlNXmiXgDh48aHXq1Mny9fW1JNl9FP1OzWX8LrYE2f/7f//P6tChg+Xn52f5+flZt912mxUXF2cdOnTokvs3zzfzcfz4cauwsNB64403rLCwMMvb29u68847rTVr1hT7XRQtATdlyhRr6tSpVmhoqOXt7W117NjR2r17d7F9f/PNN1a/fv2s4OBgy9PT06pXr57Vq1cva8WKFXbNlSwBd6Gvv/7aGjJkiFW/fn3Ly8vLql69utW+fXtr1qxZTsvNnT9/3powYYIVHh5ueXp6WqGhoVZ8fLxTjWUVP9+L+o6KirK8vb2toKAg66WXXrKSkpJKXAKupD9HFzs/zPOztOcAcKNysyzuwAcAAAAuxD3JAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGPgykXJSWFioEydO
qHr16uX+tbsAAAC4epZl6cyZMwoJCZG7+6WvFROSy8mJEycUGhrq6jYAAABwGcePH9fNN998yRpCcjkp+mrY48ePy9/f38XdAAAAwORwOBQaGmrntkshJJeTolss/P39CckAAACV2JXcGssH9wAAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwuDQkb9myRQ888IBCQkLk5uamVatWOc1blqVx48apbt268vX1VVRUlA4fPuxUc+rUKfXt21f+/v4KDAzUoEGDlJOT41SzZ88edezYUT4+PgoNDdXkyZOL9fLhhx/qtttuk4+Pj1q0aKFPPvmk3I8XAAAA1weXhuSzZ8+qZcuWmjNnTonzkydP1syZMzV37lxt375dfn5+io6O1rlz5+yavn37av/+/UpKStKaNWu0ZcsWDR061J53OBzq3r27wsLClJaWpilTpmj8+PGaN2+eXbNt2zY99thjGjRokP79738rJiZGMTEx2rdv37U7eAAAAFRabpZlWa5uQvr16wFXrlypmJgYSb9eRQ4JCdFzzz2n559/XpKUnZ2toKAgzZ8/X7GxsTpw4ICaNWumnTt3qk2bNpKkdevWqUePHvr+++8VEhKid955Ry+//LLS09Pl5eUlSXrxxRe1atUqHTx4UJL06KOP6uzZs1qzZo3dT7t27dSqVSvNnTv3ivp3OBwKCAhQdnY2X0sNAABQCZUmr1Xae5KPHj2q9PR0RUVF2WMBAQGKiIhQamqqJCk1NVWBgYF2QJakqKgoubu7a/v27XZNp06d7IAsSdHR0Tp06JBOnz5t11y4n6Kaov2UJDc3Vw6Hw+kBAACAG0OlDcnp6emSpKCgIKfxoKAgey49PV116tRxmvfw8FDNmjWdakraxoX7uFhN0XxJEhISFBAQYD9CQ0NLe4gAAACopCptSK7s4uPjlZ2dbT+OHz/u6pYAAABQTiptSA4ODpYkZWRkOI1nZGTYc8HBwcrMzHSaz8/P16lTp5xqStrGhfu4WE3RfEm8vb3l7+/v9AAAAMCNodKG5PDwcAUHBys5Odkeczgc2r59uyIjIyVJkZGRysrKUlpaml2TkpKiwsJCRURE2DVbtmzR+fPn7ZqkpCQ1adJENWrUsGsu3E9RTdF+AAAA8Nvi0pCck5OjXbt2adeuXZJ+/bDerl279N1338nNzU0jRozQ66+/rtWrV2vv3r3q16+fQkJC7BUwmjZtqvvuu09DhgzRjh07tHXrVg0bNkyxsbEKCQmRJD3++OPy8vLSoEGDtH//fi1btkwzZszQqFGj7D6effZZrVu3TlOnTtXBgwc1fvx4ffnllxo2bFhF/0gAAABQGVgutHHjRktSsUf//v0ty7KswsJCa+zYsVZQUJDl7e1tdevWzTp06JDTNn766Sfrscces6pVq2b5+/tbAwcOtM6cOeNUs3v3bqtDhw6Wt7e3Va9ePevNN98s1svy5cutW2+91fLy8rKaN29urV27tlTHkp2dbUmysrOzS/dDAAAAQIUoTV6rNOskX+9YJxkAAKByK01e86ignnCNtR690NUtALhG0qb0c3ULAPCbU2k/uAcAAAC4CiEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMh
GQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAABDpQ7JBQUFGjt2rMLDw+Xr66uGDRvqtddek2VZdo1lWRo3bpzq1q0rX19fRUVF6fDhw07bOXXqlPr27St/f38FBgZq0KBBysnJcarZs2ePOnbsKB8fH4WGhmry5MkVcowAAACofCp1SH7rrbf0zjvvaPbs2Tpw4IDeeustTZ48WbNmzbJrJk+erJkzZ2ru3Lnavn27/Pz8FB0drXPnztk1ffv21f79+5WUlKQ1a9Zoy5YtGjp0qD3vcDjUvXt3hYWFKS0tTVOmTNH48eM1b968Cj1eAAAAVA4erm7gUrZt26aHHnpIPXv2lCTVr19f//jHP7Rjxw5Jv15Fnj59ul555RU99NBDkqSFCxcqKChIq1atUmxsrA4cOKB169Zp586datOmjSRp1qxZ6tGjh95++22FhIRoyZIlysvL03vvvScvLy81b95cu3btUmJiolOYBgAAwG9Dpb6SfM899yg5OVlff/21JGn37t36/PPPdf/990uSjh49qvT0dEVFRdmvCQgIUEREhFJTUyVJqampCgwMtAOyJEVFRcnd3V3bt2+3azp16iQvLy+7Jjo6WocOHdLp06dL7C03N1cOh8PpAQAAgBtDpb6S/OKLL8rhcOi2225TlSpVVFBQoEmTJqlv376SpPT0dElSUFCQ0+uCgoLsufT0dNWpU8dp3sPDQzVr1nSqCQ8PL7aNorkaNWoU6y0hIUETJkwoh6MEAABAZVOpryQvX75cS5Ys0dKlS/XVV19pwYIFevvtt7VgwQJXt6b4+HhlZ2fbj+PHj7u6JQAAAJSTSn0lefTo0XrxxRcVGxsrSWrRooW+/fZbJSQkqH///goODpYkZWRkqG7duvbrMjIy1KpVK0lScHCwMjMznbabn5+vU6dO2a8PDg5WRkaGU03R86Iak7e3t7y9va/+IAEAAFDpVOoryT///LPc3Z1brFKligoLCyVJ4eHhCg4OVnJysj3vcDi0fft2RUZGSpIiIyOVlZWltLQ0uyYlJUWFhYWKiIiwa7Zs2aLz58/bNUlJSWrSpEmJt1oAAADgxlapQ/IDDzygSZMmae3atTp27JhWrlypxMRE/f73v5ckubm5acSIEXr99de1evVq7d27V/369VNISIhiYmIkSU2bNtV9992nIUOGaMeOHdq6dauGDRum2NhYhYSESJIef/xxeXl5adCgQdq/f7+WLVumGTNmaNSoUa46dAAAALhQpb7dYtasWRo7dqz+/Oc/KzMzUyEhIXryySc1btw4u2bMmDE6e/ashg4dqqysLHXo0EHr1q2Tj4+PXbNkyRINGzZM3bp1k7u7u3r37q2ZM2fa8wEBAVq/fr3i4uLUunVr1a5dW+PGjWP5NwAAgN8oN+vCr69DmTkcDgUEBCg7O1v+/v4Vvv/WoxdW+D4BVIy0Kf1c3QIA3BBKk9cq9e0WAAAAgCsQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAIOHqxsAAKAkrUcvdHULAK6RtCn9XN3CZXElGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMh
GQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkgEAAAADIRkAAAAwEJIBAAAAAyEZAAAAMBCSAQAAAEOlD8k//PCD/vjHP6pWrVry9fVVixYt9OWXX9rzlmVp3Lhxqlu3rnx9fRUVFaXDhw87bePUqVPq27ev/P39FRgYqEGDBiknJ8epZs+ePerYsaN8fHwUGhqqyZMnV8jxAQAAoPKp1CH59OnTat++vTw9PfXpp5/qP//5j6ZOnaoaNWrYNZMnT9bMmTM1d+5cbd++XX5+foqOjta5c+fsmr59+2r//v1KSkrSmjVrtGXLFg0dOtSedzgc6t69u8LCwpSWlqYpU6Zo/PjxmjdvXoUeLwAAACoHD1c3cClvvfWWQkND9f7779tj4eHh9n9blqXp06frlVde0UMPPSRJWrhwoYKCgrRq1SrFxsbqwIEDWrdunXbu3Kk2bdpIkmbNmqUePXro7bffVkhIiJYsWaK8vDy999578vLyUvPmzbVr1y4lJiY6hekL5ebmKjc3137ucDiuxY8AAAAALlCprySvXr1abdq00R/+8AfVqVNHd955p/7617/a80ePHlV6erqioqLssYCAAEVERCg1NVWSlJqaqsDAQDsgS1JUVJTc3d21fft2u6ZTp07y8vKya6Kjo3Xo0CGdPn26xN4SEhIUEBBgP0JDQ8v12AEAAOA6lTok//e//9U777yjxo0b67PPPtPTTz+tZ555RgsWLJAkpaenS5KCgoKcXhcUFGTPpaenq06dOk7zHh4eqlmzplNNSdu4cB+m+Ph4ZWdn24/jx49f5dECAACgsqjUt1sUFhaqTZs2euONNyRJd955p/bt26e5c+eqf//+Lu3N29tb3t7eLu0BAAAA10alvpJct25dNWvWzGmsadOm+u677yRJwcHBkqSMjAynmoyMDHsuODhYmZmZTvP5+fk6deqUU01J27hwHwAAAPjtqNQhuX379jp06JDT2Ndff62wsDBJv36ILzg4WMnJyfa8w+HQ9u3bFRkZKUmKjIxUVlaW0tLS7JqUlBQVFhYqIiLCrtmyZYvOnz9v1yQlJalJkyZOK2kAAADgt6FSh+SRI0fqiy++0BtvvKEjR45o6dKlmjdvnuLi4iRJbm5uGjFihF5//XWtXr1ae/fuVb9+/RQSEqKYmBhJv155vu+++zRkyBDt2LFDW7du1bBhwxQbG6uQkBBJ0uOPPy4vLy8NGjRI+/fv17JlyzRjxgyNGjXKVYcOAAAAF6rU9yS3bdtWK1euVHx8vCZOnKjw8HBNnz5dffv2tWvGjBmjs2fPaujQocrKylKHDh20bt06+fj42DVLlizRsGHD1K1bN7m7u6t3796aOXOmPR8QEKD169crLi5OrVu3Vu3atTVu3LiLLv8GAACAG5ubZVmWq5u4ETgcDgUEBCg7O1v+/v4Vvv/WoxdW+D4BVIy0Kf1c3YJL8L4G3Lhc9b5WmrxWqW+3AAAAAFyBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABjKFJIbNGign376qdh4VlaWGjRocNVNAQAAAK5UppB87NgxFRQUFBvPzc3VDz/8cNVNAQAAAK7kUZri1atX
2//92WefKSAgwH5eUFCg5ORk1a9fv9yaAwAAAFyhVCE5JiZGkuTm5qb+/fs7zXl6eqp+/fqaOnVquTUHAAAAuEKpQnJhYaEkKTw8XDt37lTt2rWvSVMAAACAK5UqJBc5evRoefcBAAAAVBplCsmSlJycrOTkZGVmZtpXmIu89957V90YAAAA4CplCskTJkzQxIkT1aZNG9WtW1dubm7l3RcAAADgMmUKyXPnztX8+fP1xBNPlHc/AAAAgMuVaZ3kvLw83XPPPeXdCwAAAFAplCkkDx48WEuXLi3vXgAAAIBKoUy3W5w7d07z5s3Thg0bdMcdd8jT09NpPjExsVyaAwAAAFyhTCF5z549atWqlSRp3759TnN8iA8AAADXuzKF5I0bN5Z3HwAAAEClUaZ7kgEAAIAbWZmuJHft2vWSt1WkpKSUuSEAAADA1coUkovuRy5y/vx57dq1S/v27VP//v3Loy8AAADAZcoUkqdNm1bi+Pjx45WTk3NVDQEAAACuVq73JP/xj3/Ue++9V56bBAAAACpcuYbk1NRU+fj4lOcmAQAAgApXptstHn74YafnlmXpf//7n7788kuNHTu2XBoDAAAAXKVMITkgIMDpubu7u5o0aaKJEyeqe/fu5dIYAAAA4CplCsnvv/9+efcBAAAAVBplCslF0tLSdODAAUlS8+bNdeedd5ZLUwAAAIArlSkkZ2ZmKjY2Vps2bVJgYKAkKSsrS127dtUHH3ygm266qTx7BAAAACpUmVa3GD58uM6cOaP9+/fr1KlTOnXqlPbt2yeHw6FnnnmmvHsEAAAAKlSZriSvW7dOGzZsUNOmTe2xZs2aac6cOXxwDwAAANe9Ml1JLiwslKenZ7FxT09PFRYWXnVTAAAAgCuVKSTfe++9evbZZ3XixAl77IcfftDIkSPVrVu3cmsOAAAAcIUyheTZs2fL4XCofv36atiwoRo2bKjw8HA5HA7NmjWrvHsEAAAAKlSZ7kkODQ3VV199pQ0bNujgwYOSpKZNmyoqKqpcmwMAAABcoVRXklNSUtSsWTM5HA65ubnpd7/7nYYPH67hw4erbdu2at68uf71r39dq14BAACAClGqkDx9+nQNGTJE/v7+xeYCAgL05JNPKjExsdyaAwAAAFyhVCF59+7duu+++y463717d6WlpV11UwAAAIArlSokZ2RklLj0WxEPDw+dPHnyqpsCAAAAXKlUIblevXrat2/fRef37NmjunXrXnVTAAAAgCuVKiT36NFDY8eO1blz54rN/fLLL3r11VfVq1evcmsOAAAAcIVSLQH3yiuv6KOPPtKtt96qYcOGqUmTJpKkgwcPas6cOSooKNDLL798TRoFAAAAKkqpQnJQUJC2bdump59+WvHx8bIsS5Lk5uam6OhozZkzR0FBQdekUQAAAKCilPrLRMLCwvTJJ5/o9OnTOnLkiCzLUuPGjVWjRo1r0R8AAABQ4cr0jXuSVKNGDbVt27Y8ewEAAAAqhVJ9cA8AAAD4LSAkAwAAAAZCMgAAAGAgJAMAAAAGQjIAAABgICQDAAAABkIyAAAAYCAkAwAAAAZCMgAAAGAgJAMAAAAGQjIAAABgICQDAAAABkIyAAAAYCAkAwAAAAZCMgAAAGAgJAMAAAAGQjIAAABgICQDAAAABkIyAAAAYCAkAwAAAAZCMgAAAGAgJAMAAAAGQjIAAABgICQDAAAABkIyAAAAYLiuQvKbb74pNzc3jRgxwh47d+6c4uLiVKtWLVWrVk29e/dWRkaG0+u+++479ezZU1WrVlWdOnU0evRo5efnO9Vs2rRJd911l7y9vdWoUSPNnz+/Ao4IAAAAldF1E5J37typd999V3fccYfT+MiRI/XPf/5TH374oTZv3qwTJ07o4YcftucLCgrUs2dP5eXladu2bVqwYIHmz5+vcePG2TVHjx5Vz5491bVrV+3atUsjRozQ4MGD9dlnn1XY8QEAAKDyuC5Cck5Ojvr27au//vWv
qlGjhj2enZ2tv//970pMTNS9996r1q1b6/3339e2bdv0xRdfSJLWr1+v//znP1q8eLFatWql+++/X6+99prmzJmjvLw8SdLcuXMVHh6uqVOnqmnTpho2bJj69OmjadOmueR4AQAA4FrXRUiOi4tTz549FRUV5TSelpam8+fPO43fdtttuuWWW5SamipJSk1NVYsWLRQUFGTXREdHy+FwaP/+/XaNue3o6Gh7GyXJzc2Vw+FwegAAAODG4OHqBi7ngw8+0FdffaWdO3cWm0tPT5eXl5cCAwOdxoOCgpSenm7XXBiQi+aL5i5V43A49Msvv8jX17fYvhMSEjRhwoQyHxcAAAAqr0p9Jfn48eN69tlntWTJEvn4+Li6HSfx8fHKzs62H8ePH3d1SwAAACgnlTokp6WlKTMzU3fddZc8PDzk4eGhzZs3a+bMmfLw8FBQUJDy8vKUlZXl9LqMjAwFBwdLkoKDg4utdlH0/HI1/v7+JV5FliRvb2/5+/s7PQAAAHBjqNQhuVu3btq7d6927dplP9q0aaO+ffva/+3p6ank5GT7NYcOHdJ3332nyMhISVJkZKT27t2rzMxMuyYpKUn+/v5q1qyZXXPhNopqirYBAACA35ZKfU9y9erVdfvttzuN+fn5qVatWvb4oEGDNGrUKNWsWVP+/v4aPny4IiMj1a5dO0lS9+7d1axZMz3xxBOaPHmy0tPT9corryguLk7e3t6SpKeeekqzZ8/WmDFj9Kc//UkpKSlavny51q5dW7EHDAAAgEqhUofkKzFt2jS5u7urd+/eys3NVXR0tP7yl7/Y81WqVNGaNWv09NNPKzIyUn5+furfv78mTpxo14SHh2vt2rUaOXKkZsyYoZtvvll/+9vfFB0d7YpDAgAAgIu5WZZlubqJG4HD4VBAQICys7Ndcn9y69ELK3yfACpG2pR+rm7BJXhfA25crnpfK01eq9T3JAMAAACuQEgGAAAADIRkAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMhGQAAADAQEgGAAAADIRkAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMhGQAAADAQEgGAAAADIRkAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMhGQAAADAQEgGAAAADIRkAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMhGQAAADAQEgGAAAADIRkAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMhGQAAADAQEgGAAAADIRkAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMhGQAAADAQEgGAAAADIRkAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMhGQAAADAQEgGAAAADIRkAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMhGQAAADAQEgGAAAADIRkAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMhGQAAADAQEgGAAAADIRkAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMhGQAAADAQEgGAAAADIRkAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMhGQAAADAQEgGAAAADIRkAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMhGQAAADAQEgGAAAADIRkAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMlTokJyQkqG3btqpevbrq1KmjmJgYHTp0yKnm3LlziouLU61atVStWjX17t1bGRkZTjXfffedevbsqapVq6pOnToaPXq08vPznWo2bdqku+66S97e3mrUqJHmz59/rQ8PAAAAlVSlDsmbN29WXFycvvjiCyUlJen8+fPq3r27zp49a9eMHDlS//znP/Xhhx9q8+bNOnHihB5++GF7vqCgQD179lReXp62bdumBQsWaP78+Ro3bpxdc/ToUfXs2VNdu3bVrl27NGLECA0ePFifffZZhR4vAAAAKgc3y7IsVzdxpU6ePKk6depo8+bN6tSpk7Kzs3XTTTdp6dKl6tOnjyTp4MGDatq0qVJT
U9WuXTt9+umn6tWrl06cOKGgoCBJ0ty5c/XCCy/o5MmT8vLy0gsvvKC1a9dq37599r5iY2OVlZWldevWXVFvDodDAQEBys7Olr+/f/kf/GW0Hr2wwvcJoGKkTenn6hZcgvc14Mblqve10uS1Sn0l2ZSdnS1JqlmzpiQpLS1N58+fV1RUlF1z22236ZZbblFqaqokKTU1VS1atLADsiRFR0fL4XBo//79ds2F2yiqKdpGSXJzc+VwOJweAAAAuDFcNyG5sLBQI0aMUPv27XX77bdLktLT0+Xl5aXAwECn2qCgIKWnp9s1FwbkovmiuUvVOBwO/fLLLyX2k5CQoICAAPsRGhp61ccIAACAyuG6CclxcXHat2+fPvjgA1e3IkmKj49Xdna2/Th+/LirWwIAAEA58XB1A1di2LBhWrNmjbZs2aKbb77ZHg8ODlZeXp6ysrKcriZnZGQoODjYrtmxY4fT9opWv7iwxlwRIyMjQ/7+/vL19S2xJ29vb3l7e1/1sQEAAKDyqdRXki3L0rBhw7Ry5UqlpKQoPDzcab5169by9PRUcnKyPXbo0CF99913ioyMlCRFRkZq7969yszMtGuSkpLk7++vZs2a2TUXbqOopmgbAAAA+G2p1FeS4+LitHTpUn388ceqXr26fQ9xQECAfH19FRAQoEGDBmnUqFGqWbOm/P39NXz4cEVGRqpdu3aSpO7du6tZs2Z64oknNHnyZKWnp+uVV15RXFycfSX4qaee0uzZszVmzBj96U9/UkpKipYvX661a9e67NgBAADgOpX6SvI777yj7OxsdenSRXXr1rUfy5Yts2umTZumXr16qXfv3urUqZOCg4P10Ucf2fNVqlTRmjVrVKVKFUVGRuqPf/yj+vXrp4kTJ9o14eHhWrt2rZKSktSyZUtNnTpVf/vb3xQdHV2hxwsAAIDKoVJfSb6SJZx9fHw0Z84czZkz56I1YWFh+uSTTy65nS5duujf//53qXsEAADAjadSX0kGAAAAXIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQbJgzZ47q168vHx8fRUREaMeOHa5uCQAAABWMkHyBZcuWadSoUXr11Vf11VdfqWXLloqOjlZmZqarWwMAAEAFIiRfIDExUUOGDNHAgQPVrFkzzZ07V1WrVtV7773n6tYAAABQgTxc3UBlkZeXp7S0NMXHx9tj7u7uioqKUmpqarH63Nxc5ebm2s+zs7MlSQ6H49o3W4KC3F9csl8A156r3ldcjfc14Mblqve1ov1alnXZWkLy//nxxx9VUFCgoKAgp/GgoCAdPHiwWH1CQoImTJhQbDw0NPSa9Qjgtylg1lOubgEAypWr
39fOnDmjgICAS9YQkssoPj5eo0aNsp8XFhbq1KlTqlWrltzc3FzYGW50DodDoaGhOn78uPz9/V3dDgBcNd7XUFEsy9KZM2cUEhJy2VpC8v+pXbu2qlSpooyMDKfxjIwMBQcHF6v39vaWt7e301hgYOC1bBFw4u/vz18mAG4ovK+hIlzuCnIRPrj3f7y8vNS6dWslJyfbY4WFhUpOTlZkZKQLOwMAAEBF40ryBUaNGqX+/furTZs2uvvuuzV9+nSdPXtWAwcOdHVrAAAAqECE5As8+uijOnnypMaNG6f09HS1atVK69atK/ZhPsCVvL299eqrrxa73QcArle8r6EycrOuZA0MAAAA4DeEe5IBAAAAAyEZAAAAMBCSAQAAAAMhGQAAADAQkoHrzJw5c1S/fn35+PgoIiJCO3bscHVLAFAmW7Zs0QMPPKCQkBC5ublp1apVrm4JsBGSgevIsmXLNGrUKL366qv66quv1LJlS0VHRyszM9PVrQFAqZ09e1YtW7bUnDlzXN0KUAxLwAHXkYiICLVt21azZ8+W9Ou3QoaGhmr48OF68cUXXdwdAJSdm5ubVq5cqZiYGFe3AkjiSjJw3cjLy1NaWpqioqLsMXd3d0VFRSk1NdWFnQEAcOMhJAPXiR9//FEFBQXFvgEyKChI6enpLuoKAIAbEyEZAAAAMBCSgetE7dq1VaVKFWVkZDiNZ2RkKDg42EVdAQBwYyIkA9cJLy8vtW7dWsnJyfZYYWGhkpOTFRkZ6cLOAAC48Xi4ugEAV27UqFHq37+/2rRpo7vvvlvTp0/X2bNnNXDgQFe3BgCllpOToyNHjtjPjx49ql27dqlmzZq65ZZbXNgZwBJwwHVn9uzZmjJlitLT09WqVSvNnDlTERERrm4LAEpt06ZN6tq1a7Hx/v37a/78+RXfEHABQjIAAABg4J5kAAAAwEBIBgAAAAyEZAAAAMBASAYAAAAMhGQAAADAQEgGAAAADIRkAAAAwEBIBgAAAAyEZABAMfPnz1dgYOBVb8fNzU2rVq266u0AQEUjJAPADWrAgAGKiYlxdRsAcF0iJAMAAAAGQjIA/AYlJiaqRYsW8vPzU2hoqP785z8rJyenWN2qVavUuHFj+fj4KDo6WsePH3ea//jjj3XXXXfJx8dHDRo00IQJE5Sfn19RhwEA1wwhGQB+g9zd3TVz5kzt379fCxYsUEpKisaMGeNU8/PPP2vSpElauHChtm7dqqysLMXGxtrz//rXv9SvXz89++yz+s9//qN3331X8+fP16RJkyr6cACg3LlZlmW5ugkAQPkbMGCAsrKyruiDcytWrNBTTz2lH3/8UdKvH9wbOHCgvvjiC0VEREiSDh48qKZNm2r79u26++67FRUVpW7duik+Pt7ezuLFizVmzBidOHFC0q8f3Fu5ciX3RgO47ni4ugEAQMXbsGGDEhISdPDgQTkcDuXn5+vcuXP6+eefVbVqVUmSh4eH2rZta7/mtttuU2BgoA4cOKC7775bu3fv1tatW52uHBcUFBTbDgBcjwjJAPAbc+zYMfXq1UtPP/20Jk2apJo1a+rzzz/XoEGDlJeXd8XhNicnRxMmTNDDDz9cbM7Hx6e82waACkVIBoDfmLS0NBUWFmrq1Klyd//1oynLly8vVpefn68vv/xSd999tyTp0KFDysrKUtOmTSVJd911lw4dOqRGjRpVXPMAUEEIyQBwA8vOztauXbucxmrXrq3z589r1qxZeuCBB7R161bNnTu32Gs9PT01fPhwzZw5Ux4eHho2bJjatWtnh+Zx48apV69euuWWW9SnTx+5u7tr9+7d2rdvn15//fWKODwAuGZY3QIAbmCbNm3SnXfe6fRYtGiREhMT9dZbb+n222/XkiVLlJCQUOy1VatW1QsvvKDHH39c7du3V7Vq1bRs2TJ7Pjo6WmvWrNH69evVtm1btWvXTtOmTVNYWFhFHiIAXBOsbgEAAAAYuJIMAAAAGAjJAAAAgIGQ
DAAAABgIyQAAAICBkAwAAAAYCMkAAACAgZAMAAAAGAjJAAAAgIGQDAAAABgIyQAAAICBkAwAAAAY/j8efSUp2XpE+gAAAABJRU5ErkJggg==\n"
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import re\n",
+ "import nltk\n",
+ "nltk.download('stopwords')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "TC5g2Y6zTdkG",
+ "outputId": "6692619c-1712-4c36-b6b5-95541a11665c"
+ },
+ "execution_count": 6,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
+ "[nltk_data] Unzipping corpora/stopwords.zip.\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem import PorterStemmer\n",
+ "ps = PorterStemmer()"
+ ],
+ "metadata": {
+ "id": "qhClmm5EUCCK"
+ },
+ "execution_count": 7,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Clean every email and collect the results in `corpus`:\n",
+ "#   1. replace everything that is not an ASCII letter with a space\n",
+ "#   2. lowercase and split on whitespace\n",
+ "#   3. drop English stop words and stem the remaining tokens\n",
+ "#   4. join the stems back into one space-separated string\n",
+ "\n",
+ "# Build the stop-word set once; the lookup used to call\n",
+ "# stopwords.words('english') again for every single token.\n",
+ "stop_words = set(stopwords.words('english'))\n",
+ "\n",
+ "# The range has to be A-Z: 'A-z' also spans the punctuation that sits\n",
+ "# between 'Z' and 'a' (such as ^ and _), so those characters were kept.\n",
+ "non_letters = re.compile('[^a-zA-Z]')\n",
+ "\n",
+ "corpus = []\n",
+ "for text in df['text']:\n",
+ "    review = non_letters.sub(' ', text).lower().split()\n",
+ "    review = [ps.stem(word) for word in review if word not in stop_words]\n",
+ "    corpus.append(' '.join(review))"
+ ],
+ "metadata": {
+ "id": "aIVAQXzhVDOs"
+ },
+ "execution_count": 8,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# **Create Bag of Words**"
+ ],
+ "metadata": {
+ "id": "ipsL5hyDWl3X"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "\n",
+ "# Count unigrams and bigrams, capping the vocabulary at 2500 features.\n",
+ "cv = CountVectorizer(ngram_range=(1, 2), max_features=2500)\n",
+ "# Fit on the cleaned corpus and expand the count matrix into a dense array.\n",
+ "X = cv.fit_transform(corpus).toarray()\n"
+ ],
+ "metadata": {
+ "id": "esUtMy6CWYdi"
+ },
+ "execution_count": 9,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "cv.vocabulary_"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "yPEaAou0aO3o",
+ "outputId": "07300953-4837-4413-9cb7-e170bd9ab405"
+ },
+ "execution_count": 10,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "{'user': np.int64(2369),\n",
+ " 'id': np.int64(1130),\n",
+ " 'origin': np.int64(1610),\n",
+ " 'messag': np.int64(1442),\n",
+ " 'bna': np.int64(257),\n",
+ " 'highlight': np.int64(1096),\n",
+ " 'sent': np.int64(2023),\n",
+ " 'thursday': np.int64(2262),\n",
+ " 'june': np.int64(1260),\n",
+ " 'escapenumb': np.int64(746),\n",
+ " 'pm': np.int64(1716),\n",
+ " 'subject': np.int64(2179),\n",
+ " 'inc': np.int64(1152),\n",
+ " 'daili': np.int64(528),\n",
+ " 'labor': np.int64(1286),\n",
+ " 'report': np.int64(1900),\n",
+ " 'tabl': np.int64(2211),\n",
+ " 'content': np.int64(465),\n",
+ " 'regist': np.int64(1874),\n",
+ " 'web': np.int64(2413),\n",
+ " 'subscrib': np.int64(2183),\n",
+ " 'access': np.int64(8),\n",
+ " 'full': np.int64(1019),\n",
+ " 'text': np.int64(2246),\n",
+ " 'articl': np.int64(140),\n",
+ " 'use': np.int64(2365),\n",
+ " 'url': np.int64(2359),\n",
+ " 'link': np.int64(1333),\n",
+ " 'suppli': np.int64(2200),\n",
+ " 'inform': np.int64(1169),\n",
+ " 'becom': np.int64(208),\n",
+ " 'sign': np.int64(2068),\n",
+ " 'free': np.int64(1009),\n",
+ " 'trial': np.int64(2313),\n",
+ " 'avail': np.int64(173),\n",
+ " 'http': np.int64(1123),\n",
+ " 'com': np.int64(400),\n",
+ " 'call': np.int64(300),\n",
+ " 'custom': np.int64(522),\n",
+ " 'relat': np.int64(1881),\n",
+ " 'mon': np.int64(1481),\n",
+ " 'fri': np.int64(1011),\n",
+ " 'et': np.int64(870),\n",
+ " 'decis': np.int64(561),\n",
+ " 'conduct': np.int64(437),\n",
+ " 'polici': np.int64(1719),\n",
+ " 'california': np.int64(299),\n",
+ " 'firm': np.int64(976),\n",
+ " 'two': np.int64(2327),\n",
+ " 'bar': np.int64(188),\n",
+ " 'languag': np.int64(1288),\n",
+ " 'restrict': np.int64(1920),\n",
+ " 'solicit': np.int64(2092),\n",
+ " 'distribut': np.int64(617),\n",
+ " 'constitut': np.int64(454),\n",
+ " 'practic': np.int64(1733),\n",
+ " 'district': np.int64(619),\n",
+ " 'rule': np.int64(1959),\n",
+ " 'nation': np.int64(1511),\n",
+ " 'board': np.int64(260),\n",
+ " 'held': np.int64(1085),\n",
+ " 'transport': np.int64(2305),\n",
+ " 'employe': np.int64(694),\n",
+ " 'right': np.int64(1937),\n",
+ " 'act': np.int64(16),\n",
+ " 'order': np.int64(1607),\n",
+ " 'second': np.int64(1990),\n",
+ " 'elect': np.int64(680),\n",
+ " 'determin': np.int64(594),\n",
+ " 'whether': np.int64(2432),\n",
+ " 'calif': np.int64(298),\n",
+ " 'want': np.int64(2403),\n",
+ " 'repres': np.int64(1903),\n",
+ " 'trade': np.int64(2294),\n",
+ " 'northern': np.int64(1551),\n",
+ " 'court': np.int64(506),\n",
+ " 'find': np.int64(973),\n",
+ " 'consid': np.int64(450),\n",
+ " 'new': np.int64(1528),\n",
+ " 'final': np.int64(968),\n",
+ " 'view': np.int64(2390),\n",
+ " 'actual': np.int64(21),\n",
+ " 'judg': np.int64(1255),\n",
+ " 'legal': np.int64(1314),\n",
+ " 'appli': np.int64(119),\n",
+ " 'work': np.int64(2463),\n",
+ " 'time': np.int64(2271),\n",
+ " 'place': np.int64(1696),\n",
+ " 'across': np.int64(15),\n",
+ " 'cannot': np.int64(310),\n",
+ " 'said': np.int64(1965),\n",
+ " 'union': np.int64(2348),\n",
+ " 'effort': np.int64(675),\n",
+ " 'protect': np.int64(1795),\n",
+ " 'activ': np.int64(20),\n",
+ " 'page': np.int64(1628),\n",
+ " 'aa': np.int64(0),\n",
+ " 'pub': np.int64(1803),\n",
+ " 'ip': np.int64(1219),\n",
+ " 'dlr': np.int64(621),\n",
+ " 'nsf': np.int64(1563),\n",
+ " 'escapelong': np.int64(742),\n",
+ " 'attack': np.int64(159),\n",
+ " 'former': np.int64(998),\n",
+ " 'servic': np.int64(2041),\n",
+ " 'worker': np.int64(2464),\n",
+ " 'power': np.int64(1729),\n",
+ " 'compani': np.int64(422),\n",
+ " 'fire': np.int64(975),\n",
+ " 'like': np.int64(1329),\n",
+ " 'stem': np.int64(2152),\n",
+ " 'head': np.int64(1079),\n",
+ " 'injuri': np.int64(1178),\n",
+ " 'unabl': np.int64(2340),\n",
+ " 'establish': np.int64(864),\n",
+ " 'american': np.int64(94),\n",
+ " 'state': np.int64(2132),\n",
+ " 'co': np.int64(395),\n",
+ " 'escapenumberth': np.int64(861),\n",
+ " 'fail': np.int64(927),\n",
+ " 'show': np.int64(2066),\n",
+ " 'qualifi': np.int64(1821),\n",
+ " 'individu': np.int64(1166),\n",
+ " 'given': np.int64(1046),\n",
+ " 'could': np.int64(497),\n",
+ " 'abil': np.int64(3),\n",
+ " 'answer': np.int64(109),\n",
+ " 'phone': np.int64(1684),\n",
+ " 'ga': np.int64(1028),\n",
+ " 'electr': np.int64(681),\n",
+ " 'emerg': np.int64(691),\n",
+ " 'summari': np.int64(2195),\n",
+ " 'also': np.int64(88),\n",
+ " 'direct': np.int64(604),\n",
+ " 'employ': np.int64(693),\n",
+ " 'job': np.int64(1241),\n",
+ " 'requir': np.int64(1907),\n",
+ " 'handl': np.int64(1074),\n",
+ " 'line': np.int64(1332),\n",
+ " 'signific': np.int64(2069),\n",
+ " 'public': np.int64(1806),\n",
+ " 'vote': np.int64(2397),\n",
+ " 'affili': np.int64(58),\n",
+ " 'assembl': np.int64(144),\n",
+ " 'unit': np.int64(2350),\n",
+ " 'arm': np.int64(133),\n",
+ " 'associ': np.int64(149),\n",
+ " 'goe': np.int64(1052),\n",
+ " 'plan': np.int64(1697),\n",
+ " 'member': np.int64(1435),\n",
+ " 'effect': np.int64(673),\n",
+ " 'juli': np.int64(1257),\n",
+ " 'howev': np.int64(1118),\n",
+ " 'appear': np.int64(118),\n",
+ " 'one': np.int64(1597),\n",
+ " 'may': np.int64(1412),\n",
+ " 'need': np.int64(1518),\n",
+ " 'clear': np.int64(388),\n",
+ " 'year': np.int64(2486),\n",
+ " 'hous': np.int64(1114),\n",
+ " 'schedul': np.int64(1980),\n",
+ " 'chang': np.int64(368),\n",
+ " 'would': np.int64(2468),\n",
+ " 'collect': np.int64(399),\n",
+ " 'bargain': np.int64(189),\n",
+ " 'within': np.int64(2452),\n",
+ " 'four': np.int64(1006),\n",
+ " 'condit': np.int64(436),\n",
+ " 'grant': np.int64(1058),\n",
+ " 'pass': np.int64(1648),\n",
+ " 'happen': np.int64(1075),\n",
+ " 'agreement': np.int64(71),\n",
+ " 'york': np.int64(2491),\n",
+ " 'citi': np.int64(383),\n",
+ " 'law': np.int64(1301),\n",
+ " 'case': np.int64(321),\n",
+ " 'first': np.int64(977),\n",
+ " 'feder': np.int64(946),\n",
+ " 'million': np.int64(1462),\n",
+ " 'damag': np.int64(530),\n",
+ " 'award': np.int64(178),\n",
+ " 'human': np.int64(1127),\n",
+ " 'post': np.int64(1727),\n",
+ " 'southern': np.int64(2105),\n",
+ " 'defens': np.int64(564),\n",
+ " 'take': np.int64(2213),\n",
+ " 'advantag': np.int64(36),\n",
+ " 'depart': np.int64(575),\n",
+ " 'process': np.int64(1771),\n",
+ " 'pressur': np.int64(1748),\n",
+ " 'late': np.int64(1295),\n",
+ " 'rais': np.int64(1830),\n",
+ " 'challeng': np.int64(366),\n",
+ " 'nomin': np.int64(1544),\n",
+ " 'candid': np.int64(309),\n",
+ " 'presid': np.int64(1745),\n",
+ " 'session': np.int64(2044),\n",
+ " 'intern': np.int64(1200),\n",
+ " 'jame': np.int64(1226),\n",
+ " 'tom': np.int64(2282),\n",
+ " 'top': np.int64(2286),\n",
+ " 'offic': np.int64(1587),\n",
+ " 'current': np.int64(519),\n",
+ " 'secretari': np.int64(1991),\n",
+ " 'run': np.int64(1960),\n",
+ " 'local': np.int64(1348),\n",
+ " 'occur': np.int64(1576),\n",
+ " 'receiv': np.int64(1848),\n",
+ " 'percent': np.int64(1665),\n",
+ " 'support': np.int64(2202),\n",
+ " 'name': np.int64(1509),\n",
+ " 'fall': np.int64(930),\n",
+ " 'among': np.int64(96),\n",
+ " 'voic': np.int64(2394),\n",
+ " 'variou': np.int64(2376),\n",
+ " 'amend': np.int64(92),\n",
+ " 'hundr': np.int64(1128),\n",
+ " 'campaign': np.int64(305),\n",
+ " 'confid': np.int64(440),\n",
+ " 'get': np.int64(1042),\n",
+ " 'necessari': np.int64(1517),\n",
+ " 'earlier': np.int64(650),\n",
+ " 'readi': np.int64(1841),\n",
+ " 'survey': np.int64(2205),\n",
+ " 'week': np.int64(2421),\n",
+ " 'launch': np.int64(1298),\n",
+ " 'safeti': np.int64(1964),\n",
+ " 'health': np.int64(1080),\n",
+ " 'administr': np.int64(29),\n",
+ " 'annual': np.int64(107),\n",
+ " 'gather': np.int64(1034),\n",
+ " 'data': np.int64(535),\n",
+ " 'better': np.int64(234),\n",
+ " 'target': np.int64(2217),\n",
+ " 'high': np.int64(1093),\n",
+ " 'agenc': np.int64(64),\n",
+ " 'includ': np.int64(1156),\n",
+ " 'construct': np.int64(457),\n",
+ " 'program': np.int64(1780),\n",
+ " 'improv': np.int64(1151),\n",
+ " 'identifi': np.int64(1136),\n",
+ " 'davi': np.int64(540),\n",
+ " 'say': np.int64(1976),\n",
+ " 'request': np.int64(1906),\n",
+ " 'calendar': np.int64(297),\n",
+ " 'base': np.int64(191),\n",
+ " 'januari': np.int64(1229),\n",
+ " 'februari': np.int64(945),\n",
+ " 'spokesman': np.int64(2116),\n",
+ " 'ask': np.int64(142),\n",
+ " 'figur': np.int64(965),\n",
+ " 'averag': np.int64(174),\n",
+ " 'hour': np.int64(1112),\n",
+ " 'site': np.int64(2078),\n",
+ " 'must': np.int64(1501),\n",
+ " 'return': np.int64(1928),\n",
+ " 'even': np.int64(877),\n",
+ " 'record': np.int64(1858),\n",
+ " 'today': np.int64(2278),\n",
+ " 'event': np.int64(879),\n",
+ " 'revis': np.int64(1934),\n",
+ " 'domest': np.int64(626),\n",
+ " 'product': np.int64(1773),\n",
+ " 'quarter': np.int64(1823),\n",
+ " 'releas': np.int64(1883),\n",
+ " 'commerc': np.int64(411),\n",
+ " 'news': np.int64(1530),\n",
+ " 'reflect': np.int64(1866),\n",
+ " 'market': np.int64(1400),\n",
+ " 'number': np.int64(1565),\n",
+ " 'involv': np.int64(1215),\n",
+ " 'higher': np.int64(1094),\n",
+ " 'five': np.int64(979),\n",
+ " 'month': np.int64(1488),\n",
+ " 'period': np.int64(1671),\n",
+ " 'total': np.int64(2288),\n",
+ " 'insur': np.int64(1185),\n",
+ " 'claim': np.int64(384),\n",
+ " 'file': np.int64(966),\n",
+ " 'fell': np.int64(951),\n",
+ " 'third': np.int64(2252),\n",
+ " 'season': np.int64(1988),\n",
+ " 'adjust': np.int64(28),\n",
+ " 'end': np.int64(699),\n",
+ " 'care': np.int64(318),\n",
+ " 'sever': np.int64(2049),\n",
+ " 'factor': np.int64(926),\n",
+ " 'combin': np.int64(408),\n",
+ " 'key': np.int64(1270),\n",
+ " 'accord': np.int64(9),\n",
+ " 'gener': np.int64(1038),\n",
+ " 'account': np.int64(10),\n",
+ " 'entertain': np.int64(725),\n",
+ " 'six': np.int64(2080),\n",
+ " 'world': np.int64(2465),\n",
+ " 'three': np.int64(2259),\n",
+ " 'contract': np.int64(470),\n",
+ " 'lead': np.int64(1304),\n",
+ " 'committe': np.int64(416),\n",
+ " 'expect': np.int64(900),\n",
+ " 'bill': np.int64(245),\n",
+ " 'allow': np.int64(84),\n",
+ " 'organ': np.int64(1609),\n",
+ " 'propos': np.int64(1792),\n",
+ " 'panel': np.int64(1633),\n",
+ " 'chairman': np.int64(365),\n",
+ " 'econom': np.int64(657),\n",
+ " 'outlook': np.int64(1618),\n",
+ " 'anderson': np.int64(101),\n",
+ " 'forecast': np.int64(992),\n",
+ " 'face': np.int64(922),\n",
+ " 'sinc': np.int64(2074),\n",
+ " 'earli': np.int64(649),\n",
+ " 'predict': np.int64(1735),\n",
+ " 'lower': np.int64(1368),\n",
+ " 'growth': np.int64(1067),\n",
+ " 'least': np.int64(1309),\n",
+ " 'senat': np.int64(2015),\n",
+ " 'democrat': np.int64(573),\n",
+ " 'legisl': np.int64(1315),\n",
+ " 'prospect': np.int64(1794),\n",
+ " 'close': np.int64(392),\n",
+ " 'busi': np.int64(287),\n",
+ " 'night': np.int64(1542),\n",
+ " 'help': np.int64(1087),\n",
+ " 'ad': np.int64(23),\n",
+ " 'confer': np.int64(438),\n",
+ " 'demand': np.int64(572),\n",
+ " 'declin': np.int64(562),\n",
+ " 'advertis': np.int64(37),\n",
+ " 'index': np.int64(1161),\n",
+ " 'point': np.int64(1718),\n",
+ " 'previou': np.int64(1751),\n",
+ " 'offici': np.int64(1590),\n",
+ " 'go': np.int64(1048),\n",
+ " 'forward': np.int64(1002),\n",
+ " 'way': np.int64(2411),\n",
+ " 'announc': np.int64(106),\n",
+ " 'studi': np.int64(2177),\n",
+ " 'feel': np.int64(949),\n",
+ " 'affect': np.int64(57),\n",
+ " 'perform': np.int64(1667),\n",
+ " 'manag': np.int64(1392),\n",
+ " 'promot': np.int64(1789),\n",
+ " 'transfer': np.int64(2302),\n",
+ " 'anoth': np.int64(108),\n",
+ " 'facil': np.int64(923),\n",
+ " 'situat': np.int64(2079),\n",
+ " 'treat': np.int64(2309),\n",
+ " 'favor': np.int64(936),\n",
+ " 'steel': np.int64(2149),\n",
+ " 'reach': np.int64(1837),\n",
+ " 'cover': np.int64(507),\n",
+ " 'letter': np.int64(1323),\n",
+ " 'bush': np.int64(286),\n",
+ " 'leadership': np.int64(1306),\n",
+ " 'reduct': np.int64(1863),\n",
+ " 'john': np.int64(1243),\n",
+ " 'cut': np.int64(523),\n",
+ " 'special': np.int64(2109),\n",
+ " 'last': np.int64(1292),\n",
+ " 'train': np.int64(2297),\n",
+ " 'publish': np.int64(1808),\n",
+ " 'affair': np.int64(56),\n",
+ " 'st': np.int64(2124),\n",
+ " 'washington': np.int64(2406),\n",
+ " 'dc': np.int64(545),\n",
+ " 'contact': np.int64(460),\n",
+ " 'copi': np.int64(481),\n",
+ " 'mail': np.int64(1380),\n",
+ " 'copyright': np.int64(483),\n",
+ " 'www': np.int64(2475),\n",
+ " 'corp': np.int64(487),\n",
+ " 'term': np.int64(2238),\n",
+ " 'licens': np.int64(1326),\n",
+ " 'prohibit': np.int64(1782),\n",
+ " 'origin messag': np.int64(1611),\n",
+ " 'sent thursday': np.int64(2027),\n",
+ " 'june escapenumb': np.int64(1261),\n",
+ " 'escapenumb escapenumb': np.int64(772),\n",
+ " 'escapenumb pm': np.int64(808),\n",
+ " 'bna com': np.int64(258),\n",
+ " 'aa escapenumb': np.int64(1),\n",
+ " 'escapenumb http': np.int64(787),\n",
+ " 'http pub': np.int64(1124),\n",
+ " 'pub bna': np.int64(1804),\n",
+ " 'com ip': np.int64(405),\n",
+ " 'ip bna': np.int64(1220),\n",
+ " 'bna dlr': np.int64(259),\n",
+ " 'dlr nsf': np.int64(622),\n",
+ " 'nsf id': np.int64(1564),\n",
+ " 'id escapelong': np.int64(1133),\n",
+ " 'ga electr': np.int64(1029),\n",
+ " 'page escapenumb': np.int64(1629),\n",
+ " 'juli escapenumb': np.int64(1258),\n",
+ " 'new york': np.int64(1529),\n",
+ " 'escapenumb million': np.int64(794),\n",
+ " 'escapenumb percent': np.int64(805),\n",
+ " 'year escapenumb': np.int64(2488),\n",
+ " 'total escapenumb': np.int64(2289),\n",
+ " 'week escapenumb': np.int64(2422),\n",
+ " 'escapenumb week': np.int64(824),\n",
+ " 'unit state': np.int64(2351),\n",
+ " 'escapenumb new': np.int64(800),\n",
+ " 'last year': np.int64(1294),\n",
+ " 'inc escapenumb': np.int64(1153),\n",
+ " 'dc escapenumb': np.int64(546),\n",
+ " 'escapenumb mail': np.int64(790),\n",
+ " 'go http': np.int64(1049),\n",
+ " 'http www': np.int64(1125),\n",
+ " 'copyright escapenumb': np.int64(484),\n",
+ " 'hi': np.int64(1092),\n",
+ " 'chri': np.int64(379),\n",
+ " 'roll': np.int64(1951),\n",
+ " 'jon': np.int64(1249),\n",
+ " 'select': np.int64(2010),\n",
+ " 'main': np.int64(1386),\n",
+ " 'download': np.int64(634),\n",
+ " 'screen': np.int64(1985),\n",
+ " 'bottom': np.int64(266),\n",
+ " 'export': np.int64(911),\n",
+ " 'click': np.int64(390),\n",
+ " 'give': np.int64(1045),\n",
+ " 'choos': np.int64(378),\n",
+ " 'save': np.int64(1974),\n",
+ " 'let': np.int64(1321),\n",
+ " 'know': np.int64(1281),\n",
+ " 'anyon': np.int64(113),\n",
+ " 'els': np.int64(684),\n",
+ " 'thank': np.int64(2247),\n",
+ " 'jay': np.int64(1233),\n",
+ " 'let know': np.int64(1322),\n",
+ " 'thoma': np.int64(2255),\n",
+ " 'paul': np.int64(1656),\n",
+ " 'friday': np.int64(1012),\n",
+ " 'mark': np.int64(1399),\n",
+ " 'east': np.int64(654),\n",
+ " 'follow': np.int64(988),\n",
+ " 'pjm': np.int64(1695),\n",
+ " 'price': np.int64(1753),\n",
+ " 'enron': np.int64(716),\n",
+ " 'asp': np.int64(143),\n",
+ " 'iso': np.int64(1223),\n",
+ " 'cooper': np.int64(479),\n",
+ " 'mid': np.int64(1453),\n",
+ " 'add': np.int64(25),\n",
+ " 'question': np.int64(1824),\n",
+ " 'sent friday': np.int64(2025),\n",
+ " 'web site': np.int64(2414),\n",
+ " 'corp enron': np.int64(489),\n",
+ " 'enron com': np.int64(717),\n",
+ " 'know question': np.int64(1282),\n",
+ " 'gerald': np.int64(1041),\n",
+ " 'spot': np.int64(2121),\n",
+ " 'el': np.int64(678),\n",
+ " 'paso': np.int64(1647),\n",
+ " 'ena': np.int64(695),\n",
+ " 'prefer': np.int64(1736),\n",
+ " 'review': np.int64(1932),\n",
+ " 'master': np.int64(1402),\n",
+ " 'send': np.int64(2016),\n",
+ " 'judi': np.int64(1256),\n",
+ " 'jan': np.int64(1227),\n",
+ " 'tuesday': np.int64(2319),\n",
+ " 'cc': np.int64(330),\n",
+ " 'harri': np.int64(1078),\n",
+ " 'quick': np.int64(1826),\n",
+ " 'respons': np.int64(1918),\n",
+ " 'attach': np.int64(156),\n",
+ " 'pleas': np.int64(1703),\n",
+ " 'pro': np.int64(1766),\n",
+ " 'fill': np.int64(967),\n",
+ " 'analyst': np.int64(100),\n",
+ " 'vacat': np.int64(2372),\n",
+ " 'turn': np.int64(2324),\n",
+ " 'back': np.int64(182),\n",
+ " 'comment': np.int64(410),\n",
+ " 'recommend': np.int64(1857),\n",
+ " 'merchant': np.int64(1441),\n",
+ " 'energi': np.int64(701),\n",
+ " 'mailto': np.int64(1385),\n",
+ " 'anyth': np.int64(115),\n",
+ " 'llc': np.int64(1342),\n",
+ " 'email': np.int64(686),\n",
+ " 'corpor': np.int64(492),\n",
+ " 'confidenti': np.int64(441),\n",
+ " 'intend': np.int64(1188),\n",
+ " 'sole': np.int64(2090),\n",
+ " 'entiti': np.int64(727),\n",
+ " 'address': np.int64(27),\n",
+ " 'error': np.int64(739),\n",
+ " 'notifi': np.int64(1558),\n",
+ " 'sender': np.int64(2019),\n",
+ " 'doc': np.int64(623),\n",
+ " 'el paso': np.int64(679),\n",
+ " 'sent tuesday': np.int64(2028),\n",
+ " 'pleas find': np.int64(1708),\n",
+ " 'call question': np.int64(302),\n",
+ " 'escapenumb origin': np.int64(803),\n",
+ " 'com sent': np.int64(406),\n",
+ " 'com subject': np.int64(407),\n",
+ " 'pleas let': np.int64(1709),\n",
+ " 'escapenumb email': np.int64(768),\n",
+ " 'sole use': np.int64(2091),\n",
+ " 'error pleas': np.int64(740),\n",
+ " 'physic': np.int64(1689),\n",
+ " 'prepar': np.int64(1741),\n",
+ " 'draft': np.int64(637),\n",
+ " 'conveni': np.int64(476),\n",
+ " 'person': np.int64(1673),\n",
+ " 'email address': np.int64(687),\n",
+ " 'fyi': np.int64(1027),\n",
+ " 'commun': np.int64(419),\n",
+ " 'america': np.int64(93),\n",
+ " 'regard': np.int64(1871),\n",
+ " 'someon': np.int64(2095),\n",
+ " 'stephani': np.int64(2155),\n",
+ " 'sorri': np.int64(2100),\n",
+ " 'tri': np.int64(2312),\n",
+ " 'per': np.int64(1662),\n",
+ " 'trader': np.int64(2296),\n",
+ " 'transact': np.int64(2301),\n",
+ " 'energi market': np.int64(702),\n",
+ " 'pg': np.int64(1679),\n",
+ " 'later': np.int64(1296),\n",
+ " 'transmiss': np.int64(2304),\n",
+ " 'northwest': np.int64(1552),\n",
+ " 'capac': np.int64(313),\n",
+ " 'cross': np.int64(514),\n",
+ " 'pipelin': np.int64(1693),\n",
+ " 'expand': np.int64(898),\n",
+ " 'western': np.int64(2430),\n",
+ " 'half': np.int64(1071),\n",
+ " 'along': np.int64(86),\n",
+ " 'open': np.int64(1601),\n",
+ " 'project': np.int64(1784),\n",
+ " 'expans': np.int64(899),\n",
+ " 'region': np.int64(1873),\n",
+ " 'level': np.int64(1324),\n",
+ " 'interest': np.int64(1197),\n",
+ " 'increas': np.int64(1159),\n",
+ " 'peter': np.int64(1678),\n",
+ " 'vice': np.int64(2387),\n",
+ " 'group': np.int64(1065),\n",
+ " 'capabl': np.int64(312),\n",
+ " 'express': np.int64(913),\n",
+ " 'begin': np.int64(209),\n",
+ " 'south': np.int64(2104),\n",
+ " 'west': np.int64(2429),\n",
+ " 'area': np.int64(131),\n",
+ " 'central': np.int64(341),\n",
+ " 'serv': np.int64(2039),\n",
+ " 'rout': np.int64(1957),\n",
+ " 'deliveri': np.int64(571),\n",
+ " 'look': np.int64(1355),\n",
+ " 'park': np.int64(1636),\n",
+ " 'basic': np.int64(196),\n",
+ " 'pacif': np.int64(1625),\n",
+ " 'larg': np.int64(1289),\n",
+ " 'william': np.int64(2440),\n",
+ " 'crisi': np.int64(512),\n",
+ " 'primari': np.int64(1755),\n",
+ " 'reli': np.int64(1886),\n",
+ " 'wood': np.int64(2460),\n",
+ " 'natur': np.int64(1512),\n",
+ " 'escapenumb third': np.int64(819),\n",
+ " 'vice presid': np.int64(2388),\n",
+ " 'natur ga': np.int64(1513),\n",
+ " 'desk': np.int64(591),\n",
+ " 'purchas': np.int64(1812),\n",
+ " 'space': np.int64(2107),\n",
+ " 'enrononlin': np.int64(721),\n",
+ " 'websit': np.int64(2415),\n",
+ " 'buy': np.int64(289),\n",
+ " 'offer': np.int64(1584),\n",
+ " 'us': np.int64(2360),\n",
+ " 'section': np.int64(1992),\n",
+ " 'refus': np.int64(1870),\n",
+ " 'keep': np.int64(1264),\n",
+ " 'extend': np.int64(916),\n",
+ " 'escapenumb month': np.int64(798),\n",
+ " 'escapenumb total': np.int64(822),\n",
+ " 'offer escapenumb': np.int64(1586),\n",
+ " 'would like': np.int64(2469),\n",
+ " 'section escapenumb': np.int64(1993),\n",
+ " 'taylor': np.int64(2222),\n",
+ " 'internet': np.int64(1201),\n",
+ " 'oil': np.int64(1593),\n",
+ " 'coupl': np.int64(504),\n",
+ " 'invoic': np.int64(1214),\n",
+ " 'refer': np.int64(1864),\n",
+ " 'steve': np.int64(2157),\n",
+ " 'tell': np.int64(2236),\n",
+ " 'cell': np.int64(338),\n",
+ " 'check': np.int64(372),\n",
+ " 'miss': np.int64(1471),\n",
+ " 'great': np.int64(1059),\n",
+ " 'escapenumb cell': np.int64(756),\n",
+ " 'note': np.int64(1553),\n",
+ " 'cash': np.int64(322),\n",
+ " 'balanc': np.int64(184),\n",
+ " 'smith': np.int64(2084),\n",
+ " 'asset': np.int64(146),\n",
+ " 'fix': np.int64(980),\n",
+ " 'incom': np.int64(1158),\n",
+ " 'portfolio': np.int64(1722),\n",
+ " 'fund': np.int64(1023),\n",
+ " 'discuss': np.int64(613),\n",
+ " 'pleas note': np.int64(1710),\n",
+ " 'eric': np.int64(738),\n",
+ " 'structur': np.int64(2175),\n",
+ " 'north': np.int64(1549),\n",
+ " 'enron north': np.int64(720),\n",
+ " 'north america': np.int64(1550),\n",
+ " 'com escapenumb': np.int64(404),\n",
+ " 'rod': np.int64(1948),\n",
+ " 'mani': np.int64(1393),\n",
+ " 'host': np.int64(1107),\n",
+ " 'rather': np.int64(1834),\n",
+ " 'guy': np.int64(1069),\n",
+ " 'feedback': np.int64(948),\n",
+ " 'step': np.int64(2154),\n",
+ " 'land': np.int64(1287),\n",
+ " 'partner': np.int64(1645),\n",
+ " 'terri': np.int64(2241),\n",
+ " 'donna': np.int64(628),\n",
+ " 'edison': np.int64(664),\n",
+ " 'meet': np.int64(1433),\n",
+ " 'minut': np.int64(1470),\n",
+ " 'execut': np.int64(896),\n",
+ " 'ceo': np.int64(342),\n",
+ " 'suit': np.int64(2192),\n",
+ " 'houston': np.int64(1115),\n",
+ " 'tx': np.int64(2328),\n",
+ " 'fax': np.int64(937),\n",
+ " 'xl': np.int64(2479),\n",
+ " 'suit escapenumb': np.int64(2193),\n",
+ " 'escapenumb houston': np.int64(786),\n",
+ " 'houston tx': np.int64(1117),\n",
+ " 'tx escapenumb': np.int64(2329),\n",
+ " 'escapenumb phone': np.int64(806),\n",
+ " 'phone escapenumb': np.int64(1685),\n",
+ " 'escapenumb fax': np.int64(782),\n",
+ " 'path': np.int64(1655),\n",
+ " 'monday': np.int64(1482),\n",
+ " 'deal': np.int64(550),\n",
+ " 'texa': np.int64(2244),\n",
+ " 'side': np.int64(2067),\n",
+ " 'much': np.int64(1497),\n",
+ " 'everyth': np.int64(885),\n",
+ " 'issu': np.int64(1224),\n",
+ " 'deliv': np.int64(570),\n",
+ " 'possibl': np.int64(1726),\n",
+ " 'idea': np.int64(1135),\n",
+ " 'look like': np.int64(1357),\n",
+ " 'sent monday': np.int64(2026),\n",
+ " 'deal escapenumb': np.int64(551),\n",
+ " 'januari escapenumb': np.int64(1230),\n",
+ " 'escapenumb deal': np.int64(764),\n",
+ " 'escapenumb receiv': np.int64(812),\n",
+ " 'posit': np.int64(1724),\n",
+ " 'network': np.int64(1526),\n",
+ " 'info': np.int64(1168),\n",
+ " 'locat': np.int64(1349),\n",
+ " 'xescapenumb': np.int64(2478),\n",
+ " 'mba': np.int64(1422),\n",
+ " 'home': np.int64(1105),\n",
+ " 'johnson': np.int64(1245),\n",
+ " 'wednesday': np.int64(2417),\n",
+ " 'allen': np.int64(82),\n",
+ " 'dave': np.int64(539),\n",
+ " 'fw': np.int64(1026),\n",
+ " 'surpris': np.int64(2204),\n",
+ " 'extens': np.int64(917),\n",
+ " 'sent wednesday': np.int64(2029),\n",
+ " 'subject fw': np.int64(2181),\n",
+ " 'pleas send': np.int64(1714),\n",
+ " 'aol': np.int64(116),\n",
+ " 'date': np.int64(537),\n",
+ " 'mime': np.int64(1464),\n",
+ " 'version': np.int64(2381),\n",
+ " 'type': np.int64(2330),\n",
+ " 'multipart': np.int64(1498),\n",
+ " 'mix': np.int64(1472),\n",
+ " 'boundari': np.int64(267),\n",
+ " 'mailer': np.int64(1384),\n",
+ " 'window': np.int64(2443),\n",
+ " 'escapenumberd': np.int64(840),\n",
+ " 'mx': np.int64(1506),\n",
+ " 'air': np.int64(74),\n",
+ " 'vescapenumb': np.int64(2383),\n",
+ " 'rescapenumb': np.int64(1908),\n",
+ " 'net': np.int64(1524),\n",
+ " 'unknown': np.int64(2353),\n",
+ " 'default': np.int64(563),\n",
+ " 'georg': np.int64(1039),\n",
+ " 'hotmail': np.int64(1110),\n",
+ " 'edu': np.int64(666),\n",
+ " 'msn': np.int64(1496),\n",
+ " 'cs': np.int64(517),\n",
+ " 'went': np.int64(2428),\n",
+ " 'long': np.int64(1352),\n",
+ " 'eye': np.int64(920),\n",
+ " 'bad': np.int64(183),\n",
+ " 'best': np.int64(232),\n",
+ " 'put': np.int64(1816),\n",
+ " 'solid': np.int64(2093),\n",
+ " 'wait': np.int64(2400),\n",
+ " 'room': np.int64(1953),\n",
+ " 'well': np.int64(2426),\n",
+ " 'thing': np.int64(2250),\n",
+ " 'good': np.int64(1053),\n",
+ " 'littl': np.int64(1340),\n",
+ " 'less': np.int64(1319),\n",
+ " 'approach': np.int64(123),\n",
+ " 'old': np.int64(1596),\n",
+ " 'friend': np.int64(1016),\n",
+ " 'told': np.int64(2281),\n",
+ " 'aid': np.int64(73),\n",
+ " 'gave': np.int64(1035),\n",
+ " 'left': np.int64(1313),\n",
+ " 'thought': np.int64(2257),\n",
+ " 'die': np.int64(597),\n",
+ " 'true': np.int64(2316),\n",
+ " 'god': np.int64(1051),\n",
+ " 'never': np.int64(1527),\n",
+ " 'argu': np.int64(132),\n",
+ " 'robert': np.int64(1946),\n",
+ " 'richard': np.int64(1935),\n",
+ " 'road': np.int64(1944),\n",
+ " 'win': np.int64(2441),\n",
+ " 'draw': np.int64(638),\n",
+ " 'simpli': np.int64(2073),\n",
+ " 'entir': np.int64(726),\n",
+ " 'past': np.int64(1651),\n",
+ " 'altern': np.int64(89),\n",
+ " 'directli': np.int64(606),\n",
+ " 'aol com': np.int64(117),\n",
+ " 'com escapelong': np.int64(403),\n",
+ " 'mime version': np.int64(1465),\n",
+ " 'version escapenumb': np.int64(2382),\n",
+ " 'escapenumb content': np.int64(760),\n",
+ " 'content type': np.int64(467),\n",
+ " 'type multipart': np.int64(2333),\n",
+ " 'escapenumb escapelong': np.int64(771),\n",
+ " 'escapelong escapelong': np.int64(744),\n",
+ " 'escapenumb window': np.int64(825),\n",
+ " 'escapelong escapenumb': np.int64(745),\n",
+ " 'id escapenumb': np.int64(1134),\n",
+ " 'escapenumb date': np.int64(762),\n",
+ " 'hotmail com': np.int64(1111),\n",
+ " 'type text': np.int64(2334),\n",
+ " 'fax escapenumb': np.int64(938),\n",
+ " 'pleas click': np.int64(1706),\n",
+ " 'investig': np.int64(1210),\n",
+ " 'submit': np.int64(2182),\n",
+ " 'pleas review': np.int64(1712),\n",
+ " 'pat': np.int64(1653),\n",
+ " 'suggest': np.int64(2191),\n",
+ " 'matter': np.int64(1411),\n",
+ " 'david': np.int64(541),\n",
+ " 'mr': np.int64(1494),\n",
+ " 'see': np.int64(2004),\n",
+ " 'statu': np.int64(2147),\n",
+ " 'concern': np.int64(435),\n",
+ " 'agent': np.int64(66),\n",
+ " 'hello': np.int64(1086),\n",
+ " 'everyon': np.int64(884),\n",
+ " 'ann': np.int64(105),\n",
+ " 'escapenumb thank': np.int64(818),\n",
+ " 'street': np.int64(2171),\n",
+ " 'roger': np.int64(1949),\n",
+ " 'enjoy': np.int64(713),\n",
+ " 'talk': np.int64(2215),\n",
+ " 'document': np.int64(624),\n",
+ " 'green': np.int64(1061),\n",
+ " 'light': np.int64(1328),\n",
+ " 'format': np.int64(997),\n",
+ " 'doug': np.int64(631),\n",
+ " 'jone': np.int64(1250),\n",
+ " 'na': np.int64(1508),\n",
+ " 'escapenumb smith': np.int64(816),\n",
+ " 'smith street': np.int64(2085),\n",
+ " 'houston texa': np.int64(1116),\n",
+ " 'texa escapenumb': np.int64(2245),\n",
+ " 'pleas see': np.int64(1713),\n",
+ " 'see attach': np.int64(2005),\n",
+ " 'sunday': np.int64(2198),\n",
+ " 'lee': np.int64(1312),\n",
+ " 'build': np.int64(284),\n",
+ " 'materi': np.int64(1405),\n",
+ " 'reason': np.int64(1846),\n",
+ " 'decid': np.int64(559),\n",
+ " 'recogn': np.int64(1856),\n",
+ " 'major': np.int64(1388),\n",
+ " 'depend': np.int64(576),\n",
+ " 'analysi': np.int64(99),\n",
+ " 'parti': np.int64(1639),\n",
+ " 'initi': np.int64(1177),\n",
+ " 'negoti': np.int64(1520),\n",
+ " 'procedur': np.int64(1769),\n",
+ " 'amount': np.int64(97),\n",
+ " 'progress': np.int64(1781),\n",
+ " 'made': np.int64(1374),\n",
+ " 'near': np.int64(1515),\n",
+ " 'liabil': np.int64(1325),\n",
+ " 'evid': np.int64(886),\n",
+ " 'prove': np.int64(1797),\n",
+ " 'respond': np.int64(1917),\n",
+ " 'tel': np.int64(2231),\n",
+ " 'join': np.int64(1246),\n",
+ " 'relev': np.int64(1884),\n",
+ " 'brought': np.int64(281),\n",
+ " 'titl': np.int64(2276),\n",
+ " 'differ': np.int64(599),\n",
+ " 'arbitr': np.int64(130),\n",
+ " 'part': np.int64(1637),\n",
+ " 'critic': np.int64(513),\n",
+ " 'congress': np.int64(447),\n",
+ " 'next': np.int64(1536),\n",
+ " 'secur': np.int64(1997),\n",
+ " 'adopt': np.int64(34),\n",
+ " 'independ': np.int64(1160),\n",
+ " 'medic': np.int64(1432),\n",
+ " 'rush': np.int64(1961),\n",
+ " 'regul': np.int64(1878),\n",
+ " 'reserv': np.int64(1910),\n",
+ " 'recov': np.int64(1859),\n",
+ " 'cost': np.int64(495),\n",
+ " 'paid': np.int64(1630),\n",
+ " 'deni': np.int64(574),\n",
+ " 'disput': np.int64(616),\n",
+ " 'brief': np.int64(275),\n",
+ " 'accept': np.int64(6),\n",
+ " 'ground': np.int64(1064),\n",
+ " 'economi': np.int64(658),\n",
+ " 'still': np.int64(2159),\n",
+ " 'yet': np.int64(2490),\n",
+ " 'remain': np.int64(1892),\n",
+ " 'particip': np.int64(1642),\n",
+ " 'believ': np.int64(214),\n",
+ " 'rate': np.int64(1833),\n",
+ " 'prior': np.int64(1758),\n",
+ " 'fulli': np.int64(1020),\n",
+ " 'countri': np.int64(503),\n",
+ " 'chicago': np.int64(373),\n",
+ " 'util': np.int64(2370),\n",
+ " 'day': np.int64(542),\n",
+ " 'resum': np.int64(1926),\n",
+ " 'money': np.int64(1486),\n",
+ " 'attempt': np.int64(160),\n",
+ " 'outsid': np.int64(1619),\n",
+ " 'contractor': np.int64(473),\n",
+ " 'normal': np.int64(1548),\n",
+ " 'plant': np.int64(1699),\n",
+ " 'oper': np.int64(1602),\n",
+ " 'maintain': np.int64(1387),\n",
+ " 'limit': np.int64(1330),\n",
+ " 'faith': np.int64(929),\n",
+ " 'longer': np.int64(1354),\n",
+ " 'environ': np.int64(729),\n",
+ " 'competit': np.int64(430),\n",
+ " 'stand': np.int64(2129),\n",
+ " 'enterpris': np.int64(724),\n",
+ " 'substanti': np.int64(2187),\n",
+ " 'found': np.int64(1005),\n",
+ " 'commit': np.int64(415),\n",
+ " 'continu': np.int64(469),\n",
+ " 'violat': np.int64(2391),\n",
+ " 'especi': np.int64(863),\n",
+ " 'specif': np.int64(2110),\n",
+ " 'stop': np.int64(2163),\n",
+ " 'relief': np.int64(1891),\n",
+ " 'compens': np.int64(429),\n",
+ " 'privat': np.int64(1761),\n",
+ " 'industri': np.int64(1167),\n",
+ " 'march': np.int64(1396),\n",
+ " 'delay': np.int64(567),\n",
+ " 'provis': np.int64(1802),\n",
+ " 'hear': np.int64(1081),\n",
+ " 'loss': np.int64(1360),\n",
+ " 'fear': np.int64(942),\n",
+ " 'uncertainti': np.int64(2341),\n",
+ " 'log': np.int64(1350),\n",
+ " 'alway': np.int64(91),\n",
+ " 'agre': np.int64(70),\n",
+ " 'commiss': np.int64(413),\n",
+ " 'cancel': np.int64(308),\n",
+ " 'staff': np.int64(2126),\n",
+ " 'interpret': np.int64(1202),\n",
+ " 'pertain': np.int64(1676),\n",
+ " 'assess': np.int64(145),\n",
+ " 'approv': np.int64(125),\n",
+ " 'set': np.int64(2045),\n",
+ " 'hotel': np.int64(1109),\n",
+ " 'defin': np.int64(565),\n",
+ " 'contribut': np.int64(474),\n",
+ " 'benefit': np.int64(217),\n",
+ " 'sponsor': np.int64(2117),\n",
+ " 'model': np.int64(1479),\n",
+ " 'tax': np.int64(2221),\n",
+ " 'relianc': np.int64(1889),\n",
+ " 'counti': np.int64(502),\n",
+ " 'govern': np.int64(1056),\n",
+ " 'minimum': np.int64(1468),\n",
+ " 'credit': np.int64(511),\n",
+ " 'without': np.int64(2457),\n",
+ " 'provid': np.int64(1798),\n",
+ " 'caus': np.int64(326),\n",
+ " 'polit': np.int64(1720),\n",
+ " 'financ': np.int64(971),\n",
+ " 'reform': np.int64(1867),\n",
+ " 'floor': np.int64(983),\n",
+ " 'start': np.int64(2131),\n",
+ " 'charg': np.int64(370),\n",
+ " 'make': np.int64(1389),\n",
+ " 'statement': np.int64(2133),\n",
+ " 'connect': np.int64(448),\n",
+ " 'novemb': np.int64(1561),\n",
+ " 'implement': np.int64(1148),\n",
+ " 'forc': np.int64(991),\n",
+ " 'sue': np.int64(2189),\n",
+ " 'despit': np.int64(592),\n",
+ " 'correct': np.int64(493),\n",
+ " 'stori': np.int64(2167),\n",
+ " 'describ': np.int64(587),\n",
+ " 'paragraph': np.int64(1635),\n",
+ " 'highli': np.int64(1095),\n",
+ " 'rose': np.int64(1954),\n",
+ " 'preliminari': np.int64(1738),\n",
+ " 'estim': np.int64(865),\n",
+ " 'educ': np.int64(667),\n",
+ " 'jim': np.int64(1239),\n",
+ " 'bd': np.int64(204),\n",
+ " 'brown': np.int64(282),\n",
+ " 'wi': np.int64(2436),\n",
+ " 'moor': np.int64(1490),\n",
+ " 'gov': np.int64(1055),\n",
+ " 'microsoft': np.int64(1452),\n",
+ " 'corp escapenumb': np.int64(490),\n",
+ " 'secur act': np.int64(1998),\n",
+ " 'third quarter': np.int64(2254),\n",
+ " 'power plant': np.int64(1731),\n",
+ " 'escapenumb hour': np.int64(785),\n",
+ " 'march escapenumb': np.int64(1397),\n",
+ " 'novemb escapenumb': np.int64(1562),\n",
+ " 'escapenumb per': np.int64(804),\n",
+ " 'cera': np.int64(343),\n",
+ " 'monthli': np.int64(1489),\n",
+ " 'rest': np.int64(1919),\n",
+ " 'summer': np.int64(2196),\n",
+ " 'winter': np.int64(2446),\n",
+ " 'print': np.int64(1757),\n",
+ " 'regulatori': np.int64(1879),\n",
+ " 'reduc': np.int64(1862),\n",
+ " 'reliabl': np.int64(1888),\n",
+ " 'peak': np.int64(1660),\n",
+ " 'ferc': np.int64(952),\n",
+ " 'strong': np.int64(2174),\n",
+ " 'consider': np.int64(451),\n",
+ " 'basi': np.int64(194),\n",
+ " 'eas': np.int64(652),\n",
+ " 'load': np.int64(1346),\n",
+ " 'bring': np.int64(276),\n",
+ " 'complet': np.int64(431),\n",
+ " 'categori': np.int64(324),\n",
+ " 'knowledg': np.int64(1283),\n",
+ " 'profil': np.int64(1778),\n",
+ " 'client': np.int64(391),\n",
+ " 'password': np.int64(1650),\n",
+ " 'electron': np.int64(682),\n",
+ " 'contain': np.int64(462),\n",
+ " 'research': np.int64(1909),\n",
+ " 'privileg': np.int64(1763),\n",
+ " 'disclosur': np.int64(609),\n",
+ " 'whole': np.int64(2434),\n",
+ " 'strictli': np.int64(2172),\n",
+ " 'last week': np.int64(1293),\n",
+ " 'ga price': np.int64(1030),\n",
+ " 'strictli prohibit': np.int64(2173),\n",
+ " 'brad': np.int64(269),\n",
+ " 'escapenumbera': np.int64(831),\n",
+ " 'descript': np.int64(588),\n",
+ " 'elizabeth': np.int64(683),\n",
+ " 'alreadi': np.int64(87),\n",
+ " 'standard': np.int64(2130),\n",
+ " 'wish': np.int64(2449),\n",
+ " 'own': np.int64(1623),\n",
+ " 'properti': np.int64(1790),\n",
+ " 'perhap': np.int64(1670),\n",
+ " 'hereto': np.int64(1089),\n",
+ " 'assumpt': np.int64(151),\n",
+ " 'watson': np.int64(2410),\n",
+ " 'design': np.int64(589),\n",
+ " 'escapenumber': np.int64(830),\n",
+ " 'speak': np.int64(2108),\n",
+ " 'pursu': np.int64(1814),\n",
+ " 'acquir': np.int64(13),\n",
+ " 'subsidiari': np.int64(2186),\n",
+ " 'except': np.int64(890),\n",
+ " ...}"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 10
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "y=df['label']"
+ ],
+ "metadata": {
+ "id": "lPyOikpYamj9"
+ },
+ "execution_count": 11,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "y"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 458
+ },
+ "id": "e1bQoUy7awqT",
+ "outputId": "a4e8f3c4-5e3d-49f5-dbda-292751cb4043"
+ },
+ "execution_count": 12,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0 0\n",
+ "1 0\n",
+ "2 0\n",
+ "3 0\n",
+ "4 0\n",
+ " ..\n",
+ "17410 0\n",
+ "17411 0\n",
+ "17412 0\n",
+ "17413 0\n",
+ "17414 0\n",
+ "Name: label, Length: 17415, dtype: int64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " label \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 17410 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 17411 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 17412 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 17413 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 17414 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
17415 rows × 1 columns
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 12
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "np.set_printoptions(edgeitems=30,linewidth =100000,\n",
+ " formatter = dict(float=lambda x: \"%.3g\" % x))\n",
+ "X"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "XQ94xcP9c7c4",
+ "outputId": "99ca082f-30b7-4c15-de6a-de8a18200e39"
+ },
+ "execution_count": 17,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([[ 3, 3, 0, 3, 0, 0, 0, 0, 2, 1, 2, 0, 0, 0, 0, 1, 4, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 2, 5, ..., 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 1, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..., 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 3, 3, 0, 1, 0, 0, 1, 0, 2, 2, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 3, ..., 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " ...,\n",
+ " [ 0, 0, 6, 2, 3, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 6, 0, 16, 1, 0, 0, 0, ..., 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 6, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..., 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 2, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..., 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 1, 2, 0, 1, 0, 1, 14, 2, 0, 0, 3, 0, 0, 3, 1, 11, 0, 5, 0, 0, 7, 0, 5, 2, 1, 0, 1, ..., 0, 10, 1, 0, 0, 6, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 18, 0, 0, 13, 3, 21, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..., 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 2, 1, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
+ " [ 2, 2, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 6, 0, 1, 0, 1, 0, 0, 0, 0, 1, 3, 0, 0, 2, ..., 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0]])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 17
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20)"
+ ],
+ "metadata": {
+ "id": "b3BPaHvRdgp1"
+ },
+ "execution_count": 18,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.naive_bayes import MultinomialNB"
+ ],
+ "metadata": {
+ "id": "gU6d2QB_ePWK"
+ },
+ "execution_count": 19,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "spam_detect_model = MultinomialNB().fit(X_train,y_train)"
+ ],
+ "metadata": {
+ "id": "218ZZYjReTpV"
+ },
+ "execution_count": 20,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "y_pred = spam_detect_model.predict(X_test)"
+ ],
+ "metadata": {
+ "id": "sdzxoiszejR7"
+ },
+ "execution_count": 21,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.metrics import accuracy_score,classification_report,confusion_matrix"
+ ],
+ "metadata": {
+ "id": "o2HQTtdwezNV"
+ },
+ "execution_count": 22,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "accuracy_score(y_test,y_pred)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "pZcH5t41fL38",
+ "outputId": "93f559d1-366a-44ab-d337-9fa4c90c9132"
+ },
+ "execution_count": 23,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0.8380706287683032"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 23
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(classification_report(y_test,y_pred))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "i1teODLnfVzD",
+ "outputId": "87f7827d-abb0-4e95-bb7b-dcd1216a65c9"
+ },
+ "execution_count": 24,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " 0 0.78 0.99 0.88 2002\n",
+ " 1 0.98 0.63 0.77 1481\n",
+ "\n",
+ " accuracy 0.84 3483\n",
+ " macro avg 0.88 0.81 0.82 3483\n",
+ "weighted avg 0.87 0.84 0.83 3483\n",
+ "\n"
+ ]
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/MLPproject.ipynb b/MLPproject.ipynb
new file mode 100644
index 0000000..8a7b0f2
--- /dev/null
+++ b/MLPproject.ipynb
@@ -0,0 +1,388 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "authorship_tag": "ABX9TyM/fQMHaYzh1HwHzpLLGyA9",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "wvsuiNVuMWa5"
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Load all necessary files\n",
+ "try:\n",
+ " booknow_booking_df = pd.read_csv('booknow_booking.csv')\n",
+ " cinepos_booking_df = pd.read_csv('cinePOS_booking.csv')\n",
+ " id_relation_df = pd.read_csv('movie_theater_id_relation.csv')\n",
+ " booknow_visits_df = pd.read_csv('booknow_visits.csv')\n",
+ " date_info_df = pd.read_csv('date_info.csv')\n",
+ " booknow_theaters_df = pd.read_csv('booknow_theaters.csv')\n",
+ "\n",
+ " print(\"All files loaded successfully.\")\n",
+ "except Exception as e:\n",
+ " print(f\"Error loading files: {e}\")\n",
+ " # Stop execution if files can't be loaded\n",
+ " raise e\n",
+ "\n",
+ "print(\"Starting data consolidation...\")\n",
+ "\n",
+ "# --- 1a: Clean booknow_theaters ---\n",
+ "# Drop rows where book_theater_id is null, as they cannot be linked\n",
+ "booknow_theaters_df.dropna(subset=['book_theater_id'], inplace=True)\n",
+ "# We'll ignore lat/lon and sparse theater_type/area for this model\n",
+ "booknow_theaters_df = booknow_theaters_df[['book_theater_id']]\n",
+ "\n",
+ "# --- 1b: Process booknow_booking (Online) ---\n",
+ "# Convert to datetime and get the date part\n",
+ "booknow_booking_df['show_datetime'] = pd.to_datetime(booknow_booking_df['show_datetime'])\n",
+ "booknow_booking_df['show_date'] = booknow_booking_df['show_datetime'].dt.strftime('%Y-%m-%d')\n",
+ "# Aggregate: sum tickets by theater and date\n",
+ "booknow_agg_df = booknow_booking_df.groupby(['book_theater_id', 'show_date'])['tickets_booked'].sum().reset_index()\n",
+ "booknow_agg_df.rename(columns={'tickets_booked': 'total_booknow_tickets'}, inplace=True)\n",
+ "\n",
+ "# --- 1c: Process cinePOS_booking (On-site) ---\n",
+ "# Convert to datetime and get the date part\n",
+ "cinepos_booking_df['show_datetime'] = pd.to_datetime(cinepos_booking_df['show_datetime'])\n",
+ "cinepos_booking_df['show_date'] = cinepos_booking_df['show_datetime'].dt.strftime('%Y-%m-%d')\n",
+ "# Aggregate: sum tickets by theater and date\n",
+ "cinepos_agg_df = cinepos_booking_df.groupby(['cine_theater_id', 'show_date'])['tickets_sold'].sum().reset_index()\n",
+ "cinepos_agg_df.rename(columns={'tickets_sold': 'total_cinepos_tickets'}, inplace=True)\n",
+ "\n",
+ "# --- 1d: Link cinePOS to booknow IDs ---\n",
+ "cinepos_linked_df = pd.merge(cinepos_agg_df, id_relation_df, on='cine_theater_id', how='inner')\n",
+ "# Re-aggregate in case multiple cinePOS IDs map to a single book_theater_id\n",
+ "cinepos_linked_agg_df = cinepos_linked_df.groupby(['book_theater_id', 'show_date'])['total_cinepos_tickets'].sum().reset_index()\n",
+ "\n",
+ "# --- 1e: Create Master DataFrame ---\n",
+ "# Start with the base visits data (our target)\n",
+ "master_df = booknow_visits_df.copy()\n",
+ "\n",
+ "# Merge calendar info\n",
+ "master_df = pd.merge(master_df, date_info_df, on='show_date', how='left')\n",
+ "\n",
+ "# Merge aggregated BookNow bookings\n",
+ "master_df = pd.merge(master_df, booknow_agg_df, on=['book_theater_id', 'show_date'], how='left')\n",
+ "\n",
+ "# Merge aggregated and linked CinePOS bookings\n",
+ "master_df = pd.merge(master_df, cinepos_linked_agg_df, on=['book_theater_id', 'show_date'], how='left')\n",
+ "\n",
+ "# --- 1f: Final Cleanup ---\n",
+ "# Fill booking NaNs with 0 (days with visits but no recorded online/POS bookings)\n",
+ "master_df['total_booknow_tickets'].fillna(0, inplace=True)\n",
+ "master_df['total_cinepos_tickets'].fillna(0, inplace=True)\n",
+ "\n",
+ "# Convert show_date to datetime object for sorting and feature engineering\n",
+ "master_df['show_date'] = pd.to_datetime(master_df['show_date'])\n",
+ "\n",
+ "print(\"--- Master DataFrame Created ---\")\n",
+ "print(master_df.head())\n",
+ "print(f\"\\nShape of master_df: {master_df.shape}\")\n",
+ "print(master_df.info())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Step 2: Feature Engineering & Model Validation"
+ ],
+ "metadata": {
+ "id": "UvSxegRaNEPO"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "from sklearn.metrics import mean_squared_error\n",
+ "\n",
+ "print(\"\\n--- Starting Step 2: Feature Engineering & Validation ---\")\n",
+ "\n",
+ "# --- 2a: Create Features ---\n",
+ "# Create 'total_tickets' feature\n",
+ "master_df['total_tickets'] = master_df['total_booknow_tickets'] + master_df['total_cinepos_tickets']\n",
+ "\n",
+ "# Date Features\n",
+ "master_df['day_of_month'] = master_df['show_date'].dt.day\n",
+ "master_df['month'] = master_df['show_date'].dt.month\n",
+ "master_df['year'] = master_df['show_date'].dt.year\n",
+ "master_df['day_of_year'] = master_df['show_date'].dt.dayofyear\n",
+ "master_df['is_weekend'] = master_df['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)\n",
+ "\n",
+ "# CRITICAL: Sort by theater and date\n",
+ "master_df = master_df.sort_values(by=['book_theater_id', 'show_date'])\n",
+ "\n",
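+ "# Lag & Rolling Features (shifts count rows within each theater, so they equal 7/14 days only if no dates are missing)\n",
+ "# NOTE(review): gb.shift(1) returns a plain Series, so the rolling(7) below is not grouped and its window can span adjacent theaters\n",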
+ "print(\"Creating lag and rolling features...\")\n",
+ "gb = master_df.groupby('book_theater_id')['audience_count']\n",
+ "master_df['audience_lag_7'] = gb.shift(7)\n",
+ "master_df['audience_lag_14'] = gb.shift(14)\n",
+ "master_df['audience_roll_mean_7'] = gb.shift(1).rolling(7, min_periods=1).mean()\n",
+ "\n",
+ "# --- 2b: Categorical Encoding ---\n",
+ "# We will use LabelEncoder for IDs and One-Hot Encoding for 'day_of_week'\n",
+ "le = LabelEncoder()\n",
+ "master_df['book_theater_id_encoded'] = le.fit_transform(master_df['book_theater_id'])\n",
+ "master_df = pd.get_dummies(master_df, columns=['day_of_week'], prefix='dow')\n",
+ "\n",
+ "# --- 2c: Clean Data ---\n",
+ "# Drop rows where lag features are NaN (at the start of each series); note that dropna() without subset drops rows with a NaN in any column\n",
+ "master_df_cleaned = master_df.dropna()\n",
+ "print(f\"Data shape after feature engineering and cleaning: {master_df_cleaned.shape}\")\n",
+ "\n",
+ "# --- 2d: Time-Series Split for Validation ---\n",
+ "target_col = 'audience_count'\n",
+ "# Exclude original IDs and date\n",
+ "features = [col for col in master_df_cleaned.columns if col not in [\n",
+ " 'audience_count', 'show_date', 'book_theater_id'\n",
+ "]]\n",
+ "\n",
+ "X = master_df_cleaned[features]\n",
+ "y = master_df_cleaned[target_col]\n",
+ "\n",
+ "# We will use the last 4 weeks for validation: rows dated on or after max_date - 28 days (29 calendar dates, both ends inclusive)\n",
+ "max_date = master_df_cleaned['show_date'].max()\n",
+ "split_date = max_date - pd.to_timedelta('28 days')\n",
+ "\n",
+ "train_mask = (master_df_cleaned['show_date'] < split_date)\n",
+ "valid_mask = (master_df_cleaned['show_date'] >= split_date)\n",
+ "\n",
+ "X_train, y_train = X[train_mask], y[train_mask]\n",
+ "X_valid, y_valid = X[valid_mask], y[valid_mask]\n",
+ "\n",
+ "print(f\"Training data shape: {X_train.shape}\")\n",
+ "print(f\"Validation data shape: {X_valid.shape}\")\n",
+ "\n",
+ "# --- 2e: Train and Validate Model ---\n",
+ "print(\"\\nTraining RandomForestRegressor for validation...\")\n",
+ "# Use a fast and powerful RandomForest\n",
+ "rf = RandomForestRegressor(\n",
+ " n_estimators=100,\n",
+ " random_state=42,\n",
+ " n_jobs=-1,\n",
+ " min_samples_leaf=5,\n",
+ " max_features=0.7\n",
+ ")\n",
+ "\n",
+ "rf.fit(X_train, y_train)\n",
+ "\n",
+ "# Evaluate\n",
+ "y_pred = rf.predict(X_valid)\n",
+ "rmse = np.sqrt(mean_squared_error(y_valid, y_pred))\n",
+ "print(f\"\\n--- Validation Complete ---\")\n",
+ "print(f\"Validation RMSE: {rmse:.4f}\")\n",
+ "print(\"This shows our model is predictive. Now we will build the final submission.\")"
+ ],
+ "metadata": {
+ "id": "XdhawvAYM_qs"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Step 3: Create Full Dataset for Submission"
+ ],
+ "metadata": {
+ "id": "aedKBYJ5NInq"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n",
+ "from itertools import product\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "\n",
+ "print(\"\\n--- Starting Step 3: Creating Full Train+Test Dataset ---\")\n",
+ "\n",
+ "# --- 3a: Reload original data ---\n",
+ "# We need the original files to build the full train+test set\n",
+ "booknow_visits_df = pd.read_csv('booknow_visits.csv')\n",
+ "date_info_df = pd.read_csv('date_info.csv')\n",
+ "\n",
+ "# --- 3b: Identify Test Period ---\n",
+ "booknow_visits_df['show_date'] = pd.to_datetime(booknow_visits_df['show_date'])\n",
+ "date_info_df['show_date'] = pd.to_datetime(date_info_df['show_date'])\n",
+ "max_train_date = booknow_visits_df['show_date'].max()\n",
+ "test_dates_df = date_info_df[date_info_df['show_date'] > max_train_date]\n",
+ "print(f\"Test period identified: {test_dates_df['show_date'].min().date()} to {test_dates_df['show_date'].max().date()}\")\n",
+ "\n",
+ "# --- 3c: Create Test Scaffolding ---\n",
+ "all_theater_ids = booknow_visits_df['book_theater_id'].unique()\n",
+ "test_scaffold_df = pd.DataFrame(product(all_theater_ids, test_dates_df['show_date']),\n",
+ " columns=['book_theater_id', 'show_date'])\n",
+ "print(f\"Test scaffold created with shape: {test_scaffold_df.shape}\")\n",
+ "\n",
+ "# --- 3d: Combine Train and Test ---\n",
+ "# `audience_count` will be NaN for the test pairs\n",
+ "full_data_df = pd.concat([booknow_visits_df, test_scaffold_df], sort=True)\n",
+ "full_data_df = full_data_df.sort_values(by=['book_theater_id', 'show_date']).reset_index(drop=True)\n",
+ "\n",
+ "# --- 3e: Re-run Feature Engineering on Full Dataset ---\n",
+ "# We re-use the aggregated DataFrames from Step 1\n",
+ "print(\"Merging all features into full dataset...\")\n",
+ "full_master_df = pd.merge(full_data_df, date_info_df, on='show_date', how='left')\n",
+ "full_master_df = pd.merge(full_master_df, booknow_agg_df, on=['book_theater_id', 'show_date'], how='left')\n",
+ "full_master_df = pd.merge(full_master_df, cinepos_linked_agg_df, on=['book_theater_id', 'show_date'], how='left')\n",
+ "\n",
+ "# Cleanup NaNs\n",
+ "full_master_df['total_booknow_tickets'].fillna(0, inplace=True)\n",
+ "full_master_df['total_cinepos_tickets'].fillna(0, inplace=True)\n",
+ "full_master_df['total_tickets'] = full_master_df['total_booknow_tickets'] + full_master_df['total_cinepos_tickets']\n",
+ "\n",
+ "# Date Features\n",
+ "full_master_df['day_of_month'] = full_master_df['show_date'].dt.day\n",
+ "full_master_df['month'] = full_master_df['show_date'].dt.month\n",
+ "full_master_df['year'] = full_master_df['show_date'].dt.year\n",
+ "full_master_df['day_of_year'] = full_master_df['show_date'].dt.dayofyear\n",
+ "full_master_df['is_weekend'] = full_master_df['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)\n",
+ "\n",
+ "# Lag & Rolling Features\n",
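+ "# Test rows have NaN audience_count, so each lag is filled from train data only for the first rows after the last known date\n",
+ "# (7 rows for audience_lag_7, 14 for audience_lag_14); later test rows keep NaN lags, which Step 4 fills with 0\n",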
+ "print(\"Creating lags on full dataset...\")\n",
+ "gb_full = full_master_df.groupby('book_theater_id')['audience_count']\n",
+ "full_master_df['audience_lag_7'] = gb_full.shift(7)\n",
+ "full_master_df['audience_lag_14'] = gb_full.shift(14)\n",
+ "full_master_df['audience_roll_mean_7'] = gb_full.shift(1).rolling(7, min_periods=1).mean()\n",
+ "\n",
+ "# Categorical Encoding\n",
+ "full_master_df['book_theater_id_encoded'] = le.transform(full_master_df['book_theater_id']) # Use the LE from Step 2\n",
+ "full_master_df = pd.get_dummies(full_master_df, columns=['day_of_week'], prefix='dow')\n",
+ "\n",
+ "print(\"--- Full Train+Test Dataset is Ready ---\")\n",
+ "print(full_master_df.info())"
+ ],
+ "metadata": {
+ "id": "s0H0ImjrNNRI"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Step 4: Final Model Training & Submission"
+ ],
+ "metadata": {
+ "id": "85nmUKffNP_e"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "\n",
+ "print(\"\\n--- Starting Step 4: Final Training & Submission ---\")\n",
+ "\n",
+ "# --- 4a: Split into Final Train and Test ---\n",
+ "# Training data is where 'audience_count' is known\n",
+ "train_final_df = full_master_df[full_master_df['audience_count'].notnull()]\n",
+ "# Test data is where 'audience_count' is unknown\n",
+ "test_final_df = full_master_df[full_master_df['audience_count'].isnull()]\n",
+ "\n",
+ "# Clean the final training data (drop initial NaNs)\n",
+ "train_final_df = train_final_df.dropna(subset=['audience_lag_7', 'audience_lag_14', 'audience_roll_mean_7'])\n",
+ "\n",
+ "print(f\"Final training data shape: {train_final_df.shape}\")\n",
+ "print(f\"Final test data shape: {test_final_df.shape}\")\n",
+ "\n",
+ "# --- 4b: Align Columns ---\n",
+ "# Get feature list from the training set\n",
+ "features = [col for col in train_final_df.columns if col not in [\n",
+ " 'audience_count', 'show_date', 'book_theater_id'\n",
+ "]]\n",
+ "\n",
+ "# Select the training feature matrix and target; the test set is aligned to the same feature columns below\n",
+ "X_train_final = train_final_df[features]\n",
+ "y_train_final = train_final_df[target_col]\n",
+ "\n",
+ "# Align test set columns\n",
+ "X_test_final = test_final_df.copy()\n",
+ "for col in features:\n",
+ " if col not in X_test_final.columns:\n",
+ " X_test_final[col] = 0\n",
+ "X_test_final = X_test_final[features] # Keep only feature columns in correct order\n",
+ "\n",
+ "# Handle any NaNs in test features (e.g., a theater with too little history for its lags, or lags that fall on test rows whose audience_count is unknown)\n",
+ "# For this problem, we'll fill with 0\n",
+ "X_test_final.fillna(0, inplace=True)\n",
+ "\n",
+ "# --- 4c: Train Final Model ---\n",
+ "print(\"Training final model on ALL available data...\")\n",
+ "rf_final = RandomForestRegressor(\n",
+ " n_estimators=100,\n",
+ " random_state=42,\n",
+ " n_jobs=-1,\n",
+ " min_samples_leaf=5,\n",
+ " max_features=0.7\n",
+ ")\n",
+ "rf_final.fit(X_train_final, y_train_final)\n",
+ "print(\"Final model trained.\")\n",
+ "\n",
+ "# --- 4d: Make Predictions ---\n",
+ "print(\"Making final predictions...\")\n",
+ "predictions = rf_final.predict(X_test_final)\n",
+ "\n",
+ "# --- 4e: Format Submission File ---\n",
+ "submission_df = test_final_df[['book_theater_id', 'show_date']].copy()\n",
+ "submission_df['audience_count'] = predictions\n",
+ "\n",
+ "# Format the ID: book_theater_id + show_date\n",
+ "submission_df['show_date'] = submission_df['show_date'].dt.strftime('%Y-%m-%d')\n",
+ "submission_df['ID'] = submission_df['book_theater_id'] + '_' + submission_df['show_date']\n",
+ "\n",
+ "# Ensure predictions are non-negative and integers\n",
+ "submission_df['audience_count'] = np.round(submission_df['audience_count']).astype(int)\n",
+ "submission_df.loc[submission_df['audience_count'] < 0, 'audience_count'] = 0\n",
+ "\n",
+ "# Select final columns\n",
+ "final_submission = submission_df[['ID', 'audience_count']]\n",
+ "\n",
+ "# Save the file\n",
+ "final_submission.to_csv('submission.csv', index=False)\n",
+ "\n",
+ "print(\"\\n--- Submission File Created! ---\")\n",
+ "print(final_submission.head())\n",
+ "print(f\"File 'submission.csv' saved with {len(final_submission)} predictions.\")"
+ ],
+ "metadata": {
+ "id": "vxFSnZKTNiZK"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/Welcome_to_Colab.ipynb b/Welcome_to_Colab.ipynb
new file mode 100644
index 0000000..c1d21c4
--- /dev/null
+++ b/Welcome_to_Colab.ipynb
@@ -0,0 +1,655 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# GA_5_MLP"
+ ],
+ "metadata": {
+ "id": "v2RMPpA9fg29"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "# Step 2: Upload the dataset\n",
+ "from google.colab import files\n",
+ "uploaded = files.upload()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 73
+ },
+ "id": "GcizXku4X4wC",
+ "outputId": "da4ce9b3-6868-46e1-8ab8-59b8fa65d3ba"
+ },
+ "execution_count": 5,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " Upload widget is only available when the cell has been executed in the\n",
+ " current browser session. Please rerun this cell to enable.\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Saving GA_5_dataset.csv to GA_5_dataset.csv\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df = pd.read_csv('GA_5_dataset.csv') # Make sure the file name is correct\n",
+ "\n",
+ "# Step 4: Separate features (X) and target (y)\n",
+ "X = df.drop(columns=['Credit_Limit']) # Features\n",
+ "y = df['Credit_Limit'] # Target\n",
+ "\n",
+ "# Step 5: Split the data (70% train, 30% test)\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y,\n",
+ " test_size=0.3,\n",
+ " random_state=42)\n",
+ "\n",
+ "# Step 6: Check the shapes\n",
+ "print(\"Shape of X_train:\", X_train.shape)\n",
+ "print(\"Shape of X_test :\", X_test.shape)\n",
+ "print(\"Shape of y_train:\", y_train.shape)\n",
+ "print(\"Shape of y_test :\", y_test.shape)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "u1DmB_u6YT6u",
+ "outputId": "ef1206b7-6f61-488e-db54-09237e10d8b9"
+ },
+ "execution_count": 6,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Shape of X_train: (4200, 16)\n",
+ "Shape of X_test : (1800, 16)\n",
+ "Shape of y_train: (4200,)\n",
+ "Shape of y_test : (1800,)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.metrics import r2_score\n",
+ "\n",
+ "# Initialize the model\n",
+ "model = LinearRegression(fit_intercept=False)\n",
+ "\n",
+ "# Train the model on training data\n",
+ "model.fit(X_train, y_train)\n",
+ "\n",
+ "# Predict on test data\n",
+ "y_pred = model.predict(X_test)\n",
+ "\n",
+ "# Step 8: Calculate and print R² score\n",
+ "r2 = r2_score(y_test, y_pred)\n",
+ "print(\"R² Score on Test Set:\", r2)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "0BK61pSnZC_Z",
+ "outputId": "7951a2ed-e47c-4e39-e5b6-7d14216eeb8d"
+ },
+ "execution_count": 7,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "R² Score on Test Set: -0.41121711792312987\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# Step 9: Get the model coefficients\n",
+ "coefficients = model.coef_\n",
+ "\n",
+ "# Step 10: Find index of the highest absolute coefficient\n",
+ "index_max_coeff = np.argmax(np.abs(coefficients))\n",
+ "\n",
+ "print(\"Index of feature with highest absolute coefficient value:\", index_max_coeff)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "OuZr9EawZtgL",
+ "outputId": "faef861a-95c0-4a9d-d322-be226ac16285"
+ },
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Index of feature with highest absolute coefficient value: 5\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.linear_model import Ridge\n",
+ "\n",
+ "# Step 11: Train Ridge Regression model\n",
+ "ridge_model = Ridge(solver='sag', tol=0.0005, random_state=42)\n",
+ "\n",
+ "# Fit the model on training data\n",
+ "ridge_model.fit(X_train, y_train)\n",
+ "\n",
+ "# Predict on test data\n",
+ "y_pred_ridge = ridge_model.predict(X_test)\n",
+ "\n",
+ "# Step 12: Compute R² score\n",
+ "r2_ridge = r2_score(y_test, y_pred_ridge)\n",
+ "print(\"R² Score of Ridge Regression on Test Set:\", r2_ridge)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "7WL6ZDkdbDtT",
+ "outputId": "09a378ac-464e-4dd4-e14f-91a32d3f5ed6"
+ },
+ "execution_count": 9,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "R² Score of Ridge Regression on Test Set: 0.5031632306039973\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(\"Intercept of Ridge Regression model:\", ridge_model.intercept_)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "wlxpDa3FbgH9",
+ "outputId": "b46a973e-5dad-4c6e-99d5-4a9cfffadeec"
+ },
+ "execution_count": 10,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Intercept of Ridge Regression model: 8638.307615757858\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.linear_model import Lasso\n",
+ "\n",
+ "# Step: Train Lasso Regression model\n",
+ "lasso_model = Lasso(alpha=100, random_state=42)\n",
+ "\n",
+ "# Fit the model on training data\n",
+ "lasso_model.fit(X_train, y_train)\n",
+ "\n",
+ "# Predict on test data\n",
+ "y_pred_lasso = lasso_model.predict(X_test)\n",
+ "\n",
+ "# Compute R² score\n",
+ "r2_lasso = r2_score(y_test, y_pred_lasso)\n",
+ "print(\"R² Score of Lasso Regression on Test Set:\", r2_lasso)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "3NLyRn2BbgEh",
+ "outputId": "c71d523b-15a6-42f8-a903-e82eb453611c"
+ },
+ "execution_count": 11,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "R² Score of Lasso Regression on Test Set: 0.5013545795541585\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# Get coefficients from the trained Lasso model\n",
+ "lasso_coeffs = lasso_model.coef_\n",
+ "\n",
+ "# Count how many are in the range [-1, 1]\n",
+ "count_in_range = np.sum((lasso_coeffs >= -1) & (lasso_coeffs <= 1))\n",
+ "\n",
+ "print(\"Number of coefficients in the range [-1, 1]:\", count_in_range)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "TVC_y_B3cT4k",
+ "outputId": "72b42fba-7fdf-4d59-f0fe-6e1825c02559"
+ },
+ "execution_count": 12,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Number of coefficients in the range [-1, 1]: 9\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.neighbors import KNeighborsRegressor\n",
+ "from sklearn.metrics import mean_squared_error\n",
+ "\n",
+ "# Step: Train KNeighborsRegressor\n",
+ "knn_model = KNeighborsRegressor(n_neighbors=10, p=1)\n",
+ "\n",
+ "# Fit the model on training data\n",
+ "knn_model.fit(X_train, y_train)\n",
+ "\n",
+ "# Predict on test data\n",
+ "y_pred_knn = knn_model.predict(X_test)\n",
+ "\n",
+ "# Compute RMSE\n",
+ "rmse_knn = np.sqrt(mean_squared_error(y_test, y_pred_knn))\n",
+ "print(\"Root Mean Squared Error (RMSE) of KNN on Test Set:\", rmse_knn)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "X5jo--1PcvYw",
+ "outputId": "525f31ae-cc7a-47ab-eb47-146a2089f249"
+ },
+ "execution_count": 13,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Root Mean Squared Error (RMSE) of KNN on Test Set: 6707.055787083381\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.tree import DecisionTreeRegressor\n",
+ "from sklearn.metrics import mean_squared_error\n",
+ "\n",
+ "# Step: Train Decision Tree Regressor\n",
+ "tree_model = DecisionTreeRegressor(\n",
+ " max_depth=10,\n",
+ " min_samples_split=6,\n",
+ " min_samples_leaf=6,\n",
+ " random_state=42\n",
+ ")\n",
+ "\n",
+ "# Fit the model on training data\n",
+ "tree_model.fit(X_train, y_train)\n",
+ "\n",
+ "# Predict on test data\n",
+ "y_pred_tree = tree_model.predict(X_test)\n",
+ "\n",
+ "# Compute RMSE\n",
+ "rmse_tree = np.sqrt(mean_squared_error(y_test, y_pred_tree))\n",
+ "print(\"Root Mean Squared Error (RMSE) of Decision Tree on Test Set:\", rmse_tree)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "GcgGvaEQdcFY",
+ "outputId": "df107910-763a-4fa7-aa52-9edc4190729c"
+ },
+ "execution_count": 14,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Root Mean Squared Error (RMSE) of Decision Tree on Test Set: 6740.833851583081\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.ensemble import AdaBoostRegressor\n",
+ "from sklearn.model_selection import GridSearchCV\n",
+ "from sklearn.metrics import r2_score\n",
+ "\n",
+ "# Step: Define parameter grid\n",
+ "param_grid = {\n",
+ " 'n_estimators': [10, 50, 100, 200, 500],\n",
+ " 'learning_rate': [0.1, 0.5, 1, 2]\n",
+ "}\n",
+ "\n",
+ "# Step: Initialize AdaBoostRegressor\n",
+ "ada = AdaBoostRegressor(random_state=42)\n",
+ "\n",
+ "# Step: Apply GridSearchCV\n",
+ "grid_search = GridSearchCV(estimator=ada,\n",
+ " param_grid=param_grid,\n",
+ " cv=4,\n",
+ " scoring='r2',\n",
+ " n_jobs=-1)\n",
+ "\n",
+ "# Fit on training data\n",
+ "grid_search.fit(X_train, y_train)\n",
+ "\n",
+ "# Get the best model\n",
+ "best_ada_model = grid_search.best_estimator_\n",
+ "\n",
+ "# Predict on test data\n",
+ "y_pred_ada = best_ada_model.predict(X_test)\n",
+ "\n",
+ "# Compute R² score\n",
+ "r2_ada = r2_score(y_test, y_pred_ada)\n",
+ "\n",
+ "# Display results\n",
+ "print(\"Best Parameters:\", grid_search.best_params_)\n",
+ "print(\"R² Score of Best AdaBoost Model on Test Set:\", r2_ada)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "7nZzB46geoNF",
+ "outputId": "497cb68d-51b9-488d-a147-c21cbb119b8f"
+ },
+ "execution_count": 15,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Best Parameters: {'learning_rate': 0.1, 'n_estimators': 10}\n",
+ "R² Score of Best AdaBoost Model on Test Set: 0.5400284992718735\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(\"Best n_estimators:\", grid_search.best_params_['n_estimators'])\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "3zu1AJ7UfXJD",
+ "outputId": "a39a3a41-be25-42a3-e0cd-3865bcde9c4f"
+ },
+ "execution_count": 16,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Best n_estimators: 10\n"
+ ]
+ }
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "name": "Welcome to Colab",
+ "toc_visible": true,
+ "provenance": [],
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/chapter_appendix-tools-for-deep-learning/jupyter.ipynb b/chapter_appendix-tools-for-deep-learning/jupyter.ipynb
new file mode 100644
index 0000000..d4b2714
--- /dev/null
+++ b/chapter_appendix-tools-for-deep-learning/jupyter.ipynb
@@ -0,0 +1,1753 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Exploratory Data Analysis using Python"
+ ],
+ "metadata": {
+ "id": "sYAuY4XefN_D"
+ },
+ "id": "sYAuY4XefN_D"
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd"
+ ],
+ "metadata": {
+ "id": "0vB2tXoDfhD6"
+ },
+ "id": "0vB2tXoDfhD6",
+ "execution_count": 1,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df = pd.read_csv(\"/content/customer_shopping_behavior.csv\")"
+ ],
+ "metadata": {
+ "id": "zxQ7epxjf7S1"
+ },
+ "id": "zxQ7epxjf7S1",
+ "execution_count": 2,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 313
+ },
+ "id": "uZe77-EMgE4T",
+ "outputId": "1952fd3e-a5a9-47f8-fdaf-aa08913a4308"
+ },
+ "id": "uZe77-EMgE4T",
+ "execution_count": 3,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Customer ID Age Gender Item Purchased Category Purchase Amount (USD) \\\n",
+ "0 1 55 Male Blouse Clothing 53 \n",
+ "1 2 19 Male Sweater Clothing 64 \n",
+ "2 3 50 Male Jeans Clothing 73 \n",
+ "3 4 21 Male Sandals Footwear 90 \n",
+ "4 5 45 Male Blouse Clothing 49 \n",
+ "\n",
+ " Location Size Color Season Review Rating Subscription Status \\\n",
+ "0 Kentucky L Gray Winter 3.1 Yes \n",
+ "1 Maine L Maroon Winter 3.1 Yes \n",
+ "2 Massachusetts S Maroon Spring 3.1 Yes \n",
+ "3 Rhode Island M Maroon Spring 3.5 Yes \n",
+ "4 Oregon M Turquoise Spring 2.7 Yes \n",
+ "\n",
+ " Shipping Type Discount Applied Promo Code Used Previous Purchases \\\n",
+ "0 Express Yes Yes 14 \n",
+ "1 Express Yes Yes 2 \n",
+ "2 Free Shipping Yes Yes 23 \n",
+ "3 Next Day Air Yes Yes 49 \n",
+ "4 Free Shipping Yes Yes 31 \n",
+ "\n",
+ " Payment Method Frequency of Purchases \n",
+ "0 Venmo Fortnightly \n",
+ "1 Cash Fortnightly \n",
+ "2 Credit Card Weekly \n",
+ "3 PayPal Weekly \n",
+ "4 PayPal Annually "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Customer ID \n",
+ " Age \n",
+ " Gender \n",
+ " Item Purchased \n",
+ " Category \n",
+ " Purchase Amount (USD) \n",
+ " Location \n",
+ " Size \n",
+ " Color \n",
+ " Season \n",
+ " Review Rating \n",
+ " Subscription Status \n",
+ " Shipping Type \n",
+ " Discount Applied \n",
+ " Promo Code Used \n",
+ " Previous Purchases \n",
+ " Payment Method \n",
+ " Frequency of Purchases \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 55 \n",
+ " Male \n",
+ " Blouse \n",
+ " Clothing \n",
+ " 53 \n",
+ " Kentucky \n",
+ " L \n",
+ " Gray \n",
+ " Winter \n",
+ " 3.1 \n",
+ " Yes \n",
+ " Express \n",
+ " Yes \n",
+ " Yes \n",
+ " 14 \n",
+ " Venmo \n",
+ " Fortnightly \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 2 \n",
+ " 19 \n",
+ " Male \n",
+ " Sweater \n",
+ " Clothing \n",
+ " 64 \n",
+ " Maine \n",
+ " L \n",
+ " Maroon \n",
+ " Winter \n",
+ " 3.1 \n",
+ " Yes \n",
+ " Express \n",
+ " Yes \n",
+ " Yes \n",
+ " 2 \n",
+ " Cash \n",
+ " Fortnightly \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 3 \n",
+ " 50 \n",
+ " Male \n",
+ " Jeans \n",
+ " Clothing \n",
+ " 73 \n",
+ " Massachusetts \n",
+ " S \n",
+ " Maroon \n",
+ " Spring \n",
+ " 3.1 \n",
+ " Yes \n",
+ " Free Shipping \n",
+ " Yes \n",
+ " Yes \n",
+ " 23 \n",
+ " Credit Card \n",
+ " Weekly \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 4 \n",
+ " 21 \n",
+ " Male \n",
+ " Sandals \n",
+ " Footwear \n",
+ " 90 \n",
+ " Rhode Island \n",
+ " M \n",
+ " Maroon \n",
+ " Spring \n",
+ " 3.5 \n",
+ " Yes \n",
+ " Next Day Air \n",
+ " Yes \n",
+ " Yes \n",
+ " 49 \n",
+ " PayPal \n",
+ " Weekly \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 5 \n",
+ " 45 \n",
+ " Male \n",
+ " Blouse \n",
+ " Clothing \n",
+ " 49 \n",
+ " Oregon \n",
+ " M \n",
+ " Turquoise \n",
+ " Spring \n",
+ " 2.7 \n",
+ " Yes \n",
+ " Free Shipping \n",
+ " Yes \n",
+ " Yes \n",
+ " 31 \n",
+ " PayPal \n",
+ " Annually \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "df",
+ "summary": "{\n \"name\": \"df\",\n \"rows\": 3900,\n \"fields\": [\n {\n \"column\": \"Customer ID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1125,\n \"min\": 1,\n \"max\": 3900,\n \"num_unique_values\": 3900,\n \"samples\": [\n 840,\n 1718,\n 322\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 15,\n \"min\": 18,\n \"max\": 70,\n \"num_unique_values\": 53,\n \"samples\": [\n 56,\n 24,\n 51\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Gender\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Female\",\n \"Male\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Item Purchased\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 25,\n \"samples\": [\n \"Handbag\",\n \"Jewelry\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Category\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Footwear\",\n \"Accessories\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Purchase Amount (USD)\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 23,\n \"min\": 20,\n \"max\": 100,\n \"num_unique_values\": 81,\n \"samples\": [\n 60,\n 53\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Location\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 50,\n \"samples\": [\n \"New Hampshire\",\n \"Connecticut\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Size\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"S\",\n \"XL\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Color\",\n \"properties\": {\n \"dtype\": 
\"category\",\n \"num_unique_values\": 25,\n \"samples\": [\n \"Olive\",\n \"Red\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Season\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Spring\",\n \"Fall\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Review Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.7169829842073647,\n \"min\": 2.5,\n \"max\": 5.0,\n \"num_unique_values\": 26,\n \"samples\": [\n 4.9,\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Subscription Status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"No\",\n \"Yes\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Shipping Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Express\",\n \"Free Shipping\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Discount Applied\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"No\",\n \"Yes\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Promo Code Used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"No\",\n \"Yes\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Previous Purchases\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14,\n \"min\": 1,\n \"max\": 50,\n \"num_unique_values\": 50,\n \"samples\": [\n 36,\n 47\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Payment Method\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Venmo\",\n \"Cash\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n 
\"column\": \"Frequency of Purchases\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"Fortnightly\",\n \"Weekly\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 3
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.info()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "fYjWO4C7gcWH",
+ "outputId": "a2b1c2e1-7c09-4f07-f559-c38b4774d7e0"
+ },
+ "id": "fYjWO4C7gcWH",
+ "execution_count": 4,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "RangeIndex: 3900 entries, 0 to 3899\n",
+ "Data columns (total 18 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Customer ID 3900 non-null int64 \n",
+ " 1 Age 3900 non-null int64 \n",
+ " 2 Gender 3900 non-null object \n",
+ " 3 Item Purchased 3900 non-null object \n",
+ " 4 Category 3900 non-null object \n",
+ " 5 Purchase Amount (USD) 3900 non-null int64 \n",
+ " 6 Location 3900 non-null object \n",
+ " 7 Size 3900 non-null object \n",
+ " 8 Color 3900 non-null object \n",
+ " 9 Season 3900 non-null object \n",
+ " 10 Review Rating 3863 non-null float64\n",
+ " 11 Subscription Status 3900 non-null object \n",
+ " 12 Shipping Type 3900 non-null object \n",
+ " 13 Discount Applied 3900 non-null object \n",
+ " 14 Promo Code Used 3900 non-null object \n",
+ " 15 Previous Purchases 3900 non-null int64 \n",
+ " 16 Payment Method 3900 non-null object \n",
+ " 17 Frequency of Purchases 3900 non-null object \n",
+ "dtypes: float64(1), int64(4), object(13)\n",
+ "memory usage: 548.6+ KB\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.describe(include='all')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 466
+ },
+ "id": "3SaaS_RTge3A",
+ "outputId": "141ef1f3-8f27-49d4-85af-c0692ad5161c"
+ },
+ "id": "3SaaS_RTge3A",
+ "execution_count": 7,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Customer ID Age Gender Item Purchased Category \\\n",
+ "count 3900.000000 3900.000000 3900 3900 3900 \n",
+ "unique NaN NaN 2 25 4 \n",
+ "top NaN NaN Male Blouse Clothing \n",
+ "freq NaN NaN 2652 171 1737 \n",
+ "mean 1950.500000 44.068462 NaN NaN NaN \n",
+ "std 1125.977353 15.207589 NaN NaN NaN \n",
+ "min 1.000000 18.000000 NaN NaN NaN \n",
+ "25% 975.750000 31.000000 NaN NaN NaN \n",
+ "50% 1950.500000 44.000000 NaN NaN NaN \n",
+ "75% 2925.250000 57.000000 NaN NaN NaN \n",
+ "max 3900.000000 70.000000 NaN NaN NaN \n",
+ "\n",
+ " Purchase Amount (USD) Location Size Color Season Review Rating \\\n",
+ "count 3900.000000 3900 3900 3900 3900 3863.000000 \n",
+ "unique NaN 50 4 25 4 NaN \n",
+ "top NaN Montana M Olive Spring NaN \n",
+ "freq NaN 96 1755 177 999 NaN \n",
+ "mean 59.764359 NaN NaN NaN NaN 3.750065 \n",
+ "std 23.685392 NaN NaN NaN NaN 0.716983 \n",
+ "min 20.000000 NaN NaN NaN NaN 2.500000 \n",
+ "25% 39.000000 NaN NaN NaN NaN 3.100000 \n",
+ "50% 60.000000 NaN NaN NaN NaN 3.800000 \n",
+ "75% 81.000000 NaN NaN NaN NaN 4.400000 \n",
+ "max 100.000000 NaN NaN NaN NaN 5.000000 \n",
+ "\n",
+ " Subscription Status Shipping Type Discount Applied Promo Code Used \\\n",
+ "count 3900 3900 3900 3900 \n",
+ "unique 2 6 2 2 \n",
+ "top No Free Shipping No No \n",
+ "freq 2847 675 2223 2223 \n",
+ "mean NaN NaN NaN NaN \n",
+ "std NaN NaN NaN NaN \n",
+ "min NaN NaN NaN NaN \n",
+ "25% NaN NaN NaN NaN \n",
+ "50% NaN NaN NaN NaN \n",
+ "75% NaN NaN NaN NaN \n",
+ "max NaN NaN NaN NaN \n",
+ "\n",
+ " Previous Purchases Payment Method Frequency of Purchases \n",
+ "count 3900.000000 3900 3900 \n",
+ "unique NaN 6 7 \n",
+ "top NaN PayPal Every 3 Months \n",
+ "freq NaN 677 584 \n",
+ "mean 25.351538 NaN NaN \n",
+ "std 14.447125 NaN NaN \n",
+ "min 1.000000 NaN NaN \n",
+ "25% 13.000000 NaN NaN \n",
+ "50% 25.000000 NaN NaN \n",
+ "75% 38.000000 NaN NaN \n",
+ "max 50.000000 NaN NaN "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Customer ID \n",
+ " Age \n",
+ " Gender \n",
+ " Item Purchased \n",
+ " Category \n",
+ " Purchase Amount (USD) \n",
+ " Location \n",
+ " Size \n",
+ " Color \n",
+ " Season \n",
+ " Review Rating \n",
+ " Subscription Status \n",
+ " Shipping Type \n",
+ " Discount Applied \n",
+ " Promo Code Used \n",
+ " Previous Purchases \n",
+ " Payment Method \n",
+ " Frequency of Purchases \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " count \n",
+ " 3900.000000 \n",
+ " 3900.000000 \n",
+ " 3900 \n",
+ " 3900 \n",
+ " 3900 \n",
+ " 3900.000000 \n",
+ " 3900 \n",
+ " 3900 \n",
+ " 3900 \n",
+ " 3900 \n",
+ " 3863.000000 \n",
+ " 3900 \n",
+ " 3900 \n",
+ " 3900 \n",
+ " 3900 \n",
+ " 3900.000000 \n",
+ " 3900 \n",
+ " 3900 \n",
+ " \n",
+ " \n",
+ " unique \n",
+ " NaN \n",
+ " NaN \n",
+ " 2 \n",
+ " 25 \n",
+ " 4 \n",
+ " NaN \n",
+ " 50 \n",
+ " 4 \n",
+ " 25 \n",
+ " 4 \n",
+ " NaN \n",
+ " 2 \n",
+ " 6 \n",
+ " 2 \n",
+ " 2 \n",
+ " NaN \n",
+ " 6 \n",
+ " 7 \n",
+ " \n",
+ " \n",
+ " top \n",
+ " NaN \n",
+ " NaN \n",
+ " Male \n",
+ " Blouse \n",
+ " Clothing \n",
+ " NaN \n",
+ " Montana \n",
+ " M \n",
+ " Olive \n",
+ " Spring \n",
+ " NaN \n",
+ " No \n",
+ " Free Shipping \n",
+ " No \n",
+ " No \n",
+ " NaN \n",
+ " PayPal \n",
+ " Every 3 Months \n",
+ " \n",
+ " \n",
+ " freq \n",
+ " NaN \n",
+ " NaN \n",
+ " 2652 \n",
+ " 171 \n",
+ " 1737 \n",
+ " NaN \n",
+ " 96 \n",
+ " 1755 \n",
+ " 177 \n",
+ " 999 \n",
+ " NaN \n",
+ " 2847 \n",
+ " 675 \n",
+ " 2223 \n",
+ " 2223 \n",
+ " NaN \n",
+ " 677 \n",
+ " 584 \n",
+ " \n",
+ " \n",
+ " mean \n",
+ " 1950.500000 \n",
+ " 44.068462 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 59.764359 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 3.750065 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 25.351538 \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " std \n",
+ " 1125.977353 \n",
+ " 15.207589 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 23.685392 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.716983 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 14.447125 \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " min \n",
+ " 1.000000 \n",
+ " 18.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 20.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 2.500000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 1.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " 25% \n",
+ " 975.750000 \n",
+ " 31.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 39.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 3.100000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 13.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " 50% \n",
+ " 1950.500000 \n",
+ " 44.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 60.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 3.800000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 25.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " 75% \n",
+ " 2925.250000 \n",
+ " 57.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 81.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 4.400000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 38.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " max \n",
+ " 3900.000000 \n",
+ " 70.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 100.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 5.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " 50.000000 \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"df\",\n \"rows\": 11,\n \"fields\": [\n {\n \"column\": \"Customer ID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1405.274081775269,\n \"min\": 1.0,\n \"max\": 3900.0,\n \"num_unique_values\": 6,\n \"samples\": [\n 3900.0,\n 1950.5,\n 2925.25\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1364.878318051572,\n \"min\": 15.20758912716238,\n \"max\": 3900.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 44.06846153846154,\n 44.0,\n 3900.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Gender\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 2,\n \"2652\",\n \"3900\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Item Purchased\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 25,\n \"171\",\n \"3900\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Category\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 4,\n \"1737\",\n \"3900\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Purchase Amount (USD)\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1359.7647934740537,\n \"min\": 20.0,\n \"max\": 3900.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 59.76435897435898,\n 60.0,\n 3900.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Location\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 50,\n \"96\",\n \"3900\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Size\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 4,\n \"1755\",\n \"3900\"\n ],\n 
\"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Color\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 25,\n \"177\",\n \"3900\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Season\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 4,\n \"999\",\n \"3900\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Review Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1364.602207410873,\n \"min\": 0.7169829842073647,\n \"max\": 3863.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 3.750064716541548,\n 3.8,\n 3863.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Subscription Status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 2,\n \"2847\",\n \"3900\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Shipping Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 6,\n \"675\",\n \"3900\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Discount Applied\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 2,\n \"2223\",\n \"3900\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Promo Code Used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 2,\n \"2223\",\n \"3900\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Previous Purchases\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1370.5178374100155,\n \"min\": 1.0,\n \"max\": 3900.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 25.35153846153846,\n 25.0,\n 3900.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n 
\"column\": \"Payment Method\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 6,\n \"677\",\n \"3900\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Frequency of Purchases\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n 7,\n \"584\",\n \"3900\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 648
+ },
+ "id": "F8QhPcI2indu",
+ "outputId": "240677f0-bb6b-4e1f-bd6b-a6843353a655"
+ },
+ "id": "F8QhPcI2indu",
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Customer ID 0\n",
+ "Age 0\n",
+ "Gender 0\n",
+ "Item Purchased 0\n",
+ "Category 0\n",
+ "Purchase Amount (USD) 0\n",
+ "Location 0\n",
+ "Size 0\n",
+ "Color 0\n",
+ "Season 0\n",
+ "Review Rating 37\n",
+ "Subscription Status 0\n",
+ "Shipping Type 0\n",
+ "Discount Applied 0\n",
+ "Promo Code Used 0\n",
+ "Previous Purchases 0\n",
+ "Payment Method 0\n",
+ "Frequency of Purchases 0\n",
+ "dtype: int64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Customer ID \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Age \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Gender \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Item Purchased \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Category \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Purchase Amount (USD) \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Location \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Size \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Color \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Season \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Review Rating \n",
+ " 37 \n",
+ " \n",
+ " \n",
+ " Subscription Status \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Shipping Type \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Discount Applied \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Promo Code Used \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Previous Purchases \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Payment Method \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Frequency of Purchases \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "There are 37 null values in the Review Rating column. We can either remove these rows or fill the missing values; filling them with the mean or the median is the better option."
+ ],
+ "metadata": {
+ "id": "GTeriRgHjE89"
+ },
+ "id": "GTeriRgHjE89"
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "The next question is whether to fill with the mean or the median. The mean is affected by outliers while the median is robust to them, so we choose the median over the mean."
+ ],
+ "metadata": {
+ "id": "PD-kZrZfjeHh"
+ },
+ "id": "PD-kZrZfjeHh"
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Review Rating'] = df.groupby('Category')['Review Rating'].transform(lambda x:x.fillna(x.median()))"
+ ],
+ "metadata": {
+ "id": "cNaRp7DviyLE"
+ },
+ "id": "cNaRp7DviyLE",
+ "execution_count": 9,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Here we replace the null values with the median of each category instead of filling them with the global median."
+ ],
+ "metadata": {
+ "id": "ubN24NUTlwnq"
+ },
+ "id": "ubN24NUTlwnq"
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 648
+ },
+ "id": "OyfjphiwmCbH",
+ "outputId": "5abbaf4a-1d5c-48cd-fae8-01d62fec8e08"
+ },
+ "id": "OyfjphiwmCbH",
+ "execution_count": 12,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Customer ID 0\n",
+ "Age 0\n",
+ "Gender 0\n",
+ "Item Purchased 0\n",
+ "Category 0\n",
+ "Purchase Amount (USD) 0\n",
+ "Location 0\n",
+ "Size 0\n",
+ "Color 0\n",
+ "Season 0\n",
+ "Review Rating 0\n",
+ "Subscription Status 0\n",
+ "Shipping Type 0\n",
+ "Discount Applied 0\n",
+ "Promo Code Used 0\n",
+ "Previous Purchases 0\n",
+ "Payment Method 0\n",
+ "Frequency of Purchases 0\n",
+ "dtype: int64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Customer ID \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Age \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Gender \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Item Purchased \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Category \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Purchase Amount (USD) \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Location \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Size \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Color \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Season \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Review Rating \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Subscription Status \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Shipping Type \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Discount Applied \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Promo Code Used \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Previous Purchases \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Payment Method \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " Frequency of Purchases \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 12
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "We convert the name of each column to snake case so that the columns are easy to reference in our SQL queries."
+ ],
+ "metadata": {
+ "id": "OZxQxfDXmdVP"
+ },
+ "id": "OZxQxfDXmdVP"
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.columns= df.columns.str.lower()"
+ ],
+ "metadata": {
+ "id": "IF0fbab0mY0I"
+ },
+ "id": "IF0fbab0mY0I",
+ "execution_count": 13,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.columns = df.columns.str.replace(' ','_')"
+ ],
+ "metadata": {
+ "id": "Tgm5yN_rm5rr"
+ },
+ "id": "Tgm5yN_rm5rr",
+ "execution_count": 14,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.columns"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "5DJowruTnQ0f",
+ "outputId": "2c50fe42-f04a-4e98-d76f-8201d710bd53"
+ },
+ "id": "5DJowruTnQ0f",
+ "execution_count": 15,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Index(['customer_id', 'age', 'gender', 'item_purchased', 'category',\n",
+ " 'purchase_amount_(usd)', 'location', 'size', 'color', 'season',\n",
+ " 'review_rating', 'subscription_status', 'shipping_type',\n",
+ " 'discount_applied', 'promo_code_used', 'previous_purchases',\n",
+ " 'payment_method', 'frequency_of_purchases'],\n",
+ " dtype='object')"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 15
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df = df.rename(columns={'purchase_amount_(usd)':'purchase_amount'})"
+ ],
+ "metadata": {
+ "id": "TuAiHzfNnZiO"
+ },
+ "id": "TuAiHzfNnZiO",
+ "execution_count": 16,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.columns"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "5feaMJnEn8-2",
+ "outputId": "e8fd7b9d-0a18-424c-d12f-2360dca1f0e8"
+ },
+ "id": "5feaMJnEn8-2",
+ "execution_count": 17,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Index(['customer_id', 'age', 'gender', 'item_purchased', 'category',\n",
+ " 'purchase_amount', 'location', 'size', 'color', 'season',\n",
+ " 'review_rating', 'subscription_status', 'shipping_type',\n",
+ " 'discount_applied', 'promo_code_used', 'previous_purchases',\n",
+ " 'payment_method', 'frequency_of_purchases'],\n",
+ " dtype='object')"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 17
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# create the age_group column by splitting age into four quantile bins\n",
+ "labels = ['Young Adult','Adult','Middle aged','Senior']\n",
+ "df['age_group'] = pd.qcut(df['age'],q=4,labels= labels)"
+ ],
+ "metadata": {
+ "id": "_7OzOCVMoGgq"
+ },
+ "id": "_7OzOCVMoGgq",
+ "execution_count": 19,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df[['age','age_group']].head(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 363
+ },
+ "id": "0G1E_mAfp6mt",
+ "outputId": "4654bb9e-0635-436c-acac-56216071f8f3"
+ },
+ "id": "0G1E_mAfp6mt",
+ "execution_count": 21,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " age age_group\n",
+ "0 55 Middle aged\n",
+ "1 19 Young Adult\n",
+ "2 50 Middle aged\n",
+ "3 21 Young Adult\n",
+ "4 45 Middle aged\n",
+ "5 46 Middle aged\n",
+ "6 63 Senior\n",
+ "7 27 Young Adult\n",
+ "8 26 Young Adult\n",
+ "9 57 Middle aged"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " age \n",
+ " age_group \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 55 \n",
+ " Middle aged \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 19 \n",
+ " Young Adult \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 50 \n",
+ " Middle aged \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 21 \n",
+ " Young Adult \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 45 \n",
+ " Middle aged \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " 46 \n",
+ " Middle aged \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " 63 \n",
+ " Senior \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " 27 \n",
+ " Young Adult \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " 26 \n",
+ " Young Adult \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " 57 \n",
+ " Middle aged \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"df[['age','age_group']]\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 16,\n \"min\": 19,\n \"max\": 63,\n \"num_unique_values\": 10,\n \"samples\": [\n 26,\n 19,\n 46\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"age_group\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Middle aged\",\n \"Young Adult\",\n \"Senior\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 21
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "I created this age_group column to understand customers' purchasing behaviour based on their age group."
+ ],
+ "metadata": {
+ "id": "UGbYFgeBqQXS"
+ },
+ "id": "UGbYFgeBqQXS"
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# create column purchase_frequency_days"
+ ],
+ "metadata": {
+ "id": "W6cS9UBzq71f"
+ },
+ "id": "W6cS9UBzq71f",
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Approximate number of days between purchases for each frequency label.\n",
+ "frequency_mapping = {\n",
+ "    'Fortnightly': 14,\n",
+ "    'Weekly': 7,\n",
+ "    'Monthly': 30,\n",
+ "    'Quarterly': 90,\n",
+ "    'Bi-Weekly': 14,\n",
+ "    'Annually': 365,\n",
+ "    'Every 3 Months': 90,\n",
+ "}\n",
+ "df['purchase_frequency_days'] = df['frequency_of_purchases'].map(frequency_mapping)"
+ ],
+ "metadata": {
+ "id": "wlvsoLH9rLVq"
+ },
+ "id": "wlvsoLH9rLVq",
+ "execution_count": 26,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df[['frequency_of_purchases','purchase_frequency_days']].head(10)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 363
+ },
+ "id": "lVwmDlzytmHA",
+ "outputId": "09ac7514-5600-48e8-adb3-5bdbe055c605"
+ },
+ "id": "lVwmDlzytmHA",
+ "execution_count": 27,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " frequency_of_purchases purchase_frequency_days\n",
+ "0 Fortnightly 14.0\n",
+ "1 Fortnightly 14.0\n",
+ "2 Weekly 7.0\n",
+ "3 Weekly 7.0\n",
+ "4 Annually 365.0\n",
+ "5 Weekly 7.0\n",
+ "6 Quarterly 90.0\n",
+ "7 Weekly 7.0\n",
+ "8 Annually 365.0\n",
+ "9 Quarterly 90.0"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " frequency_of_purchases \n",
+ " purchase_frequency_days \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Fortnightly \n",
+ " 14.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Fortnightly \n",
+ " 14.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Weekly \n",
+ " 7.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Weekly \n",
+ " 7.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Annually \n",
+ " 365.0 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " Weekly \n",
+ " 7.0 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " Quarterly \n",
+ " 90.0 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " Weekly \n",
+ " 7.0 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " Annually \n",
+ " 365.0 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " Quarterly \n",
+ " 90.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"df[['frequency_of_purchases','purchase_frequency_days']]\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"frequency_of_purchases\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Weekly\",\n \"Quarterly\",\n \"Fortnightly\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"purchase_frequency_days\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 145.2669573195807,\n \"min\": 7.0,\n \"max\": 365.0,\n \"num_unique_values\": 4,\n \"samples\": [\n 7.0,\n 90.0,\n 14.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 27
+ }
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "required_libs": [],
+ "colab": {
+ "provenance": [],
+ "include_colab_link": true
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/email.json b/email.json
new file mode 100644
index 0000000..4aa7711
--- /dev/null
+++ b/email.json
@@ -0,0 +1,3 @@
+{
+ "email": "23f3002893@ds.study.iitm.ac.in"
+}
diff --git a/problem_1.md b/problem_1.md
new file mode 100644
index 0000000..78f6cb1
--- /dev/null
+++ b/problem_1.md
@@ -0,0 +1,161 @@
+---
+title: Data type operations
+---
+
+# Problem Statement
+
+Implement the function data_type_operations(data: dict) -> dict that takes a dictionary containing keys 'int', 'float', 'list', and 'set'. Perform the following operations and return the modified dictionary:
+
+- For 'int', add 5.
+- For 'float', multiply by 1.5.
+- For 'list', append the length of the list (measured before appending).
+- For 'set', add the square of the size of the set (measured before adding).
+
+
+**Example**
+```
+data = {
+ 'int': 10,
+ 'float': 4.0,
+ 'list': [1, 2, 3],
+ 'set': {1, 2}
+}
+Result: {
+ 'int': 15,
+ 'float': 6.0,
+ 'list': [1, 2, 3, 3],
+ 'set': {1, 2, 4}
+}
+```
+
+# Solution
+
+```py3 test.py -r 'python test.py'
+
+def data_type_operations(data: dict) -> dict:
+    '''
+    Update the 'int', 'float', 'list' and 'set' entries of a dictionary in place.
+
+    Arguments:
+        data: dict - Dictionary that holds the four keys named above.
+    Return: dict - The same dictionary object, with every entry updated.
+    '''
+    ...
+
+    data['int'] = data['int'] + 5
+    data['float'] = data['float'] * 1.5
+    items = data['list']
+    items.append(len(items))          # length is taken before the append
+    members = data['set']
+    members.add(len(members) ** 2)    # size is taken before the add
+    return data
+
+
+
+
+{% include '../function_type_and_modify_check_suffix.py.jinja' %}
+
+
+```
+
+# Public Test Cases
+
+## Input 1
+
+```
+data = {
+ 'int': 3,
+ 'float': 2.0,
+ 'list': [10, 20],
+ 'set': {2}
+}
+is_equal(
+ data_type_operations(data),
+ {
+ 'int': 8,
+ 'float': 3.0,
+ 'list': [10, 20, 2],
+ 'set': {2, 1}
+ }
+)
+```
+
+## Output 1
+
+```
+{
+ 'int': 8,
+ 'float': 3.0,
+ 'list': [10, 20, 2],
+ 'set': {2, 1}
+}
+```
+
+# Private Test Cases
+
+## Input 1
+
+```
+data = {
+ 'int': 0,
+ 'float': 0.0,
+ 'list': [],
+ 'set': set()
+}
+is_equal(
+ data_type_operations(data),
+ {
+ 'int': 5,
+ 'float': 0.0,
+ 'list': [0],
+ 'set': {0}
+ }
+)
+
+```
+
+## Output 1
+
+```
+{
+ 'int': 5,
+ 'float': 0.0,
+ 'list': [0],
+ 'set': {0}
+}
+
+```
+
+## Input 2
+
+```
+data = {
+ 'int': -5,
+ 'float': 10.0,
+ 'list': [1, 2, 3, 4],
+ 'set': {1, 3, 5}
+}
+is_equal(
+ data_type_operations(data),
+ {
+ 'int': 0,
+ 'float': 15.0,
+ 'list': [1, 2, 3, 4, 4],
+ 'set': {1, 3, 5, 9}
+ }
+)
+
+```
+
+## Output 2
+
+```
+{
+ 'int': 0,
+ 'float': 15.0,
+ 'list': [1, 2, 3, 4, 4],
+ 'set': {1, 3, 5, 9}
+}
+
+
+```
diff --git a/problem_2.md b/problem_2.md
new file mode 100644
index 0000000..25dcc70
--- /dev/null
+++ b/problem_2.md
@@ -0,0 +1,104 @@
+---
+title: Top Student
+---
+
+# Problem Statement
+
+Implement the function top_student(students: list) -> dict that takes a list of dictionaries where each dictionary contains 'name' and 'marks' of a student. Return a dictionary with the following keys:
+
+- 'average': the average marks rounded to 2 decimal places.
+- 'topper': the name of the student with the highest marks.
+
+**Example**
+```
+students = [
+    {"name": "lucky", "marks": 85},
+    {"name": "Rocky", "marks": 92},
+    {"name": "Chocky", "marks": 78}
+]
+top_student(students) # Output: {"average": 85.0, "topper": "Rocky"}
+```
+
+# Solution
+
+```py3 test.py -r 'python test.py'
+
+def top_student(students: list) -> dict:
+    '''
+    Find the average marks and the student with the highest marks.
+    Arguments:
+        students: list - a non-empty list of dictionaries containing 'name' and 'marks'.
+    Return: dict - 'average' (mean marks rounded to 2 places) and 'topper' (name of the top scorer).
+    '''
+    ...
+
+    average = round(sum(s['marks'] for s in students) / len(students), 2)
+    topper = max(students, key=lambda x: x['marks'])['name']  # the first student wins a tie
+    return {'average': average, 'topper': topper}
+
+
+
+{% include '../function_type_and_modify_check_suffix.py.jinja' %}
+
+```
+
+# Public Test Cases
+
+## Input 1
+
+```
+students = [
+    {"name": "Johny", "marks": 70},
+    {"name": "Dony", "marks": 90},
+    {"name": "Smithy", "marks": 80}
+]
+is_equal(
+    top_student(students),
+    {"average": 80.0, "topper": "Dony"}
+)
+```
+
+## Output 1
+
+```
+{"average": 80.0, "topper": "Dony"}
+```
+
+# Private Test Cases
+
+## Input 1
+
+```
+students = [
+    {"name": "Ammu", "marks": 88},
+    {"name": "Evuram", "marks": 92}
+]
+is_equal(
+    top_student(students),
+    {"average": 90.0, "topper": "Evuram"}
+)
+```
+
+## Output 1
+
+```
+{"average": 90.0, "topper": "Evuram"}
+```
+
+## Input 2
+
+```
+students = [
+    {"name": "Manvendra", "marks": 70},
+    {"name": "Lokendra", "marks": 86}
+]
+is_equal(
+    top_student(students),
+    {"average": 78.0, "topper": "Lokendra"}
+)
+```
+
+## Output 2
+```
+{"average": 78.0, "topper": "Lokendra"}
+```
diff --git a/problem_3.md b/problem_3.md
new file mode 100644
index 0000000..8ea5e0a
--- /dev/null
+++ b/problem_3.md
@@ -0,0 +1,85 @@
+---
+title: City Temperatures
+---
+
+# Problem Statement
+
+Write a program that reads city temperatures from stdin. The input contains lines of the format `<city> <temperature>`. Print the average temperature (rounded to two decimal places) and the name of the city with the highest temperature (no two cities share the highest temperature). Input ends when an empty line is encountered.
+
+**Example**
+```
+Input:
+Delhi 30
+Mumbai 35
+Chennai 33
+
+Output:
+Average Temperature: 32.67
+City with Highest Temperature: Mumbai
+```
+
+# Solution
+
+```py3 test.py -r 'python test.py'
+
+import sys
+
+def main():
+    '''
+    Print the average temperature and the city with the highest temperature.
+
+    Input: stdin lines of the form '<city> <temperature>'; reading stops at the first empty line.
+    Output: the average formatted with two decimals, then the name of the hottest city.
+    '''
+    ...
+
+    readings = []
+    for line in map(str.strip, sys.stdin):
+        if not line:
+            break  # an empty line ends the input, as the statement requires
+        city, temp = line.rsplit(maxsplit=1)  # the temperature is the last field
+        readings.append((city, float(temp)))
+
+    avg_temp = sum(temp for _, temp in readings) / len(readings)
+    print(f"Average Temperature: {avg_temp:.2f}")  # always two decimals, e.g. 24.00
+    print(f"City with Highest Temperature: {max(readings, key=lambda r: r[1])[0]}")
+
+
+
+{% include '../function_type_and_modify_check_suffix.py.jinja' %}
+
+```
+
+# Public Test Cases
+
+## Input 1
+
+```
+NYC 20
+London 25
+Berlin 22
+```
+
+## Output 1
+
+```
+Average Temperature: 22.33
+City with Highest Temperature: London
+```
+
+# Private Test Cases
+
+## Input 1
+
+```
+Delhi 27
+Bhopal 24
+Indore 21
+```
+
+## Output 1
+
+```
+Average Temperature: 24.00
+City with Highest Temperature: Delhi
+```
diff --git a/problem_4.md b/problem_4.md
new file mode 100644
index 0000000..e6cac6f
--- /dev/null
+++ b/problem_4.md
@@ -0,0 +1,135 @@
+---
+title: "Problem-Solving: Expense Tracker"
+---
+
+# Problem Statement
+
+Design an expense tracker application that allows users to:
+
+1. **Add an expense**: Record an expense with a category and amount.
+2. **Remove an expense**: Remove a specific expense by its ID.
+3. **Get total expenses**: Return the total amount of expenses.
+
+You need to implement the following functions:
+
+- `add_expense(expenses: list, category: str, amount: float) -> list`: Adds a new expense with a unique ID and returns the updated list of expenses.
+- `remove_expense(expenses: list, expense_id: int) -> list`: Removes an expense by its ID.
+- `total_expenses(expenses: list) -> float`: Returns the total amount of all expenses.
+
+**Example**
+```
+expenses = []
+expenses = add_expense(expenses, "Food", 50.0)
+expenses = add_expense(expenses, "Transport", 30.0)
+expenses = remove_expense(expenses, 1)
+print(total_expenses(expenses)) # Output: 30.0
+```
+
+# Solution
+
+```py3 test.py -r 'python test.py'
+
+def add_expense(expenses: list, category: str, amount: float) -> list:
+    '''
+    Append a new expense whose ID is not used by any current expense.
+
+    Arguments:
+        expenses: list - List of existing expenses (extended in place).
+        category: str - Category of the expense.
+        amount: float - Amount of the expense.
+
+    Return: list - The same list, now including the new expense.
+    '''
+    ...
+
+    # len(expenses) + 1 would repeat an ID once an earlier expense has been removed.
+    expense_id = max((expense["id"] for expense in expenses), default=0) + 1
+    expenses.append({"id": expense_id, "category": category, "amount": amount})
+    return expenses
+
+def remove_expense(expenses: list, expense_id: int) -> list:
+    '''
+    Build a copy of the expense list without the entries that carry the given ID.
+
+    Arguments:
+        expenses: list - List of existing expenses (left unmodified).
+        expense_id: int - ID of the expense to remove.
+
+    Return: list - New list with every expense whose "id" differs from expense_id.
+    '''
+    ...
+
+    remaining = filter(lambda entry: entry["id"] != expense_id, expenses)
+    return list(remaining)
+
+def total_expenses(expenses: list) -> float:
+    '''
+    Sum the "amount" field over every recorded expense.
+
+    Arguments:
+        expenses: list - List of existing expenses.
+    Return: float - Combined amount; an empty list gives 0.
+    '''
+    ...
+
+    amounts = [entry["amount"] for entry in expenses]
+    return sum(amounts)
+
+
+
+{% include '../function_type_and_modify_check_suffix.py.jinja' %}
+
+```
+
+# Public Test Cases
+
+## Input 1
+
+```
+expenses = []
+expenses = add_expense(expenses, "Food", 50.0)
+expenses = add_expense(expenses, "Travel", 100.0)
+is_equal(
+ total_expenses(expenses),
+ 150.0
+)
+expenses = remove_expense(expenses, 1)
+is_equal(
+ total_expenses(expenses),
+ 100.0
+)
+```
+
+## Output 1
+
+```
+150.0
+100.0
+```
+
+# Private Test Cases
+
+## Input 1
+
+```
+expenses = []
+expenses = add_expense(expenses, "Utilities", 80.0)
+expenses = add_expense(expenses, "Groceries", 120.0)
+expenses = add_expense(expenses, "Entertainment", 60.0)
+is_equal(
+ total_expenses(expenses),
+ 260.0
+)
+expenses = remove_expense(expenses, 2)
+is_equal(
+ total_expenses(expenses),
+ 140.0
+)
+```
+
+## Output 1
+
+```
+260.0
+140.0
+```