diff --git a/model_practice/TitanicSurviverPredict.ipynb b/model_practice/TitanicSurviverPredict.ipynb index 60d6851..fb989f9 100644 --- a/model_practice/TitanicSurviverPredict.ipynb +++ b/model_practice/TitanicSurviverPredict.ipynb @@ -32,7 +32,7 @@ "metadata": { "id": "v96JX7Flle9E", "colab_type": "code", - "outputId": "f28b1aca-9269-4b53-d601-22c7e39d61af", + "outputId": "97fb4732-4afd-4489-b9fb-b89935ba71ed", "colab": { "base_uri": "https://localhost:8080/", "height": 255 @@ -42,7 +42,7 @@ "train_data = pd.read_csv('drive/My Drive/train_data/titanic_train.csv')\n", "train_data.head()" ], - "execution_count": 8, + "execution_count": 2, "outputs": [ { "output_type": "execute_result", @@ -174,7 +174,7 @@ "metadata": { "tags": [] }, - "execution_count": 8 + "execution_count": 2 } ] }, @@ -183,7 +183,7 @@ "metadata": { "id": "DzJlvYPl78aL", "colab_type": "code", - "outputId": "f3ba4960-e50a-429b-95a9-bdb1ccfe156a", + "outputId": "4a00e5a2-bc66-4178-c905-8e437fc956d8", "colab": { "base_uri": "https://localhost:8080/", "height": 238 @@ -192,7 +192,7 @@ "source": [ "train_data.isnull().sum()" ], - "execution_count": 9, + "execution_count": 3, "outputs": [ { "output_type": "execute_result", @@ -216,7 +216,7 @@ "metadata": { "tags": [] }, - "execution_count": 9 + "execution_count": 3 } ] }, @@ -225,7 +225,7 @@ "metadata": { "id": "1zILNZW08BSP", "colab_type": "code", - "outputId": "64cd24e0-1622-42f5-c118-87dbc2c2bfe7", + "outputId": "26ae2e62-0a67-408d-df55-f012e411bd63", "colab": { "base_uri": "https://localhost:8080/", "height": 255 @@ -237,7 +237,7 @@ "train_data['Sex'] = train_data['Sex'].map(sex_mapping)\n", "train_data.head()" ], - "execution_count": 10, + "execution_count": 4, "outputs": [ { "output_type": "execute_result", @@ -369,7 +369,422 @@ "metadata": { "tags": [] }, - "execution_count": 10 + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FmmMODl4IS8W", + "colab_type": "code", + "outputId": 
"3597bb8b-b6fc-4aab-c320-34efab5e385c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 238 + } + }, + "source": [ + "# guess Age by Sex and Pclass for no Age \n", + "guess_ages = np.zeros((2,3))\n", + "\n", + "for i in range(0, 2):\n", + " for j in range(0, 3):\n", + " guess_df = train_data[(train_data['Sex'] == i) & (train_data['Pclass'] == j+1)]['Age'].dropna()\n", + " age_guess = guess_df.median()\n", + " guess_ages[i, j] = int( age_guess/0.5 + 0.5 ) * 0.5\n", + "\n", + "for i in range(0, 2):\n", + " for j in range(0, 3):\n", + " # fill guess_ages\n", + " train_data.loc[ (train_data.Age.isnull()) & (train_data.Sex == i) & (train_data.Pclass == j+1), 'Age'] = guess_ages[i, j]\n", + "\n", + "train_data['Age'] = train_data['Age'].astype(int)\n", + "train_data.head()" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harris12210A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...03810PC 1759971.2833C85C
2313Heikkinen, Miss. Laina02600STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)0351011380353.1000C123S
4503Allen, Mr. William Henry135003734508.0500NaNS
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass ... Fare Cabin Embarked\n", + "0 1 0 3 ... 7.2500 NaN S\n", + "1 2 1 1 ... 71.2833 C85 C\n", + "2 3 1 3 ... 7.9250 NaN S\n", + "3 4 1 1 ... 53.1000 C123 S\n", + "4 5 0 3 ... 8.0500 NaN S\n", + "\n", + "[5 rows x 12 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZDdBkC88Xqg1", + "colab_type": "code", + "outputId": "039f55c9-bd05-4f75-af2c-e9305bf67b3f", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + } + }, + "source": [ + "train_data['AgeBand'] = pd.cut(train_data['Age'], 5)\n", + "train_data[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)" + ], + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeBandSurvived
0(-0.08, 16.0]0.550000
1(16.0, 32.0]0.337374
2(32.0, 48.0]0.412037
3(48.0, 64.0]0.434783
4(64.0, 80.0]0.090909
\n", + "
" + ], + "text/plain": [ + " AgeBand Survived\n", + "0 (-0.08, 16.0] 0.550000\n", + "1 (16.0, 32.0] 0.337374\n", + "2 (32.0, 48.0] 0.412037\n", + "3 (48.0, 64.0] 0.434783\n", + "4 (64.0, 80.0] 0.090909" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Pj_U1OgZUj3e", + "colab_type": "code", + "outputId": "93d1ca92-513d-49db-bd9e-d69712df0ecb", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 289 + } + }, + "source": [ + "train_data.loc[ train_data['Age'] <= 16, 'Age'] = 0\n", + "train_data.loc[ (train_data['Age'] > 16) & (train_data['Age'] <= 32), 'Age'] = 1\n", + "train_data.loc[ (train_data['Age'] > 32) & (train_data['Age'] <= 48), 'Age'] = 2\n", + "train_data.loc[ (train_data['Age'] > 48) & (train_data['Age'] <= 64), 'Age'] = 3\n", + "train_data.loc[ train_data['Age'] > 64, 'Age'] = 4 # last AgeBand (64, 80] -> ordinal 4\n", + "train_data.head()" + ], + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedAgeBand
0103Braund, Mr. Owen Harris1110A/5 211717.2500NaNS(16.0, 32.0]
1211Cumings, Mrs. John Bradley (Florence Briggs Th...0210PC 1759971.2833C85C(32.0, 48.0]
2313Heikkinen, Miss. Laina0100STON/O2. 31012827.9250NaNS(16.0, 32.0]
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)021011380353.1000C123S(32.0, 48.0]
4503Allen, Mr. William Henry12003734508.0500NaNS(32.0, 48.0]
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass ... Cabin Embarked AgeBand\n", + "0 1 0 3 ... NaN S (16.0, 32.0]\n", + "1 2 1 1 ... C85 C (32.0, 48.0]\n", + "2 3 1 3 ... NaN S (16.0, 32.0]\n", + "3 4 1 1 ... C123 S (32.0, 48.0]\n", + "4 5 0 3 ... NaN S (32.0, 48.0]\n", + "\n", + "[5 rows x 13 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 7 } ] }, @@ -378,24 +793,30 @@ "metadata": { "id": "vp49nn2usonr", "colab_type": "code", - "outputId": "4c32d3f9-8892-460e-9c04-6e70de6cbe65", + "outputId": "51b07567-bb9d-4c41-a1e0-513ef8d8cc42", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ - "X = pd.DataFrame(train_data[['Pclass', 'Sex']])\n", + "feature_label = [\n", + " 'Pclass',\n", + " 'Sex',\n", + " 'Age',\n", + " 'SibSp',\n", + "]\n", + "X = pd.DataFrame(train_data[feature_label])\n", "y = pd.DataFrame(train_data.iloc[:, 1]) # Survived\n", "print(X.shape)\n", "print(y.shape)" ], - "execution_count": 11, + "execution_count": 33, "outputs": [ { "output_type": "stream", "text": [ - "(891, 2)\n", + "(891, 4)\n", "(891, 1)\n" ], "name": "stdout" @@ -407,7 +828,7 @@ "metadata": { "id": "avgsvmL9t_sn", "colab_type": "code", - "outputId": "25f45bff-2a9e-4111-a9d2-8ae4f9eee4e9", + "outputId": "e8a5d053-b99b-453b-9c90-5afcc1f15216", "colab": { "base_uri": "https://localhost:8080/", "height": 51 @@ -419,13 +840,13 @@ "print(X_train.shape)\n", "print(X_test.shape)" ], - "execution_count": 15, + "execution_count": 34, "outputs": [ { "output_type": "stream", "text": [ - "(623, 2)\n", - "(268, 2)\n" + "(623, 4)\n", + "(268, 4)\n" ], "name": "stdout" } @@ -436,7 +857,7 @@ "metadata": { "id": "kbX47pzmu5D-", "colab_type": "code", - "outputId": "d7d1218e-a7c7-4360-f24d-1e6522336dc9", + "outputId": "2152512b-4c00-45a4-fc0a-c758eb56e126", "colab": { "base_uri": "https://localhost:8080/", "height": 156 @@ -448,7 +869,7 @@ "model = LogisticRegression(random_state=FIXED_RESULT)\n", "model.fit(X_train, y_train)" ], - 
"execution_count": 20, + "execution_count": 35, "outputs": [ { "output_type": "stream", @@ -472,7 +893,7 @@ "metadata": { "tags": [] }, - "execution_count": 20 + "execution_count": 35 } ] }, @@ -481,39 +902,26 @@ "metadata": { "id": "WFqxHH6NvQ50", "colab_type": "code", + "outputId": "bb33c855-a196-41bd-dd92-55bf60e4f25b", "colab": { "base_uri": "https://localhost:8080/", "height": 34 - }, - "outputId": "1fac9b1d-c84b-42e2-f011-0ae87a25fa60" + } }, "source": [ "# prediction\n", - "print(model.score(X_train, y_train))" + "print(model.score(X_test, y_test))" ], - "execution_count": 21, + "execution_count": 36, "outputs": [ { "output_type": "stream", "text": [ - "0.8009630818619583\n" + "0.7611940298507462\n" ], "name": "stdout" } ] - }, - { - "cell_type": "code", - "metadata": { - "id": "v_89Rw7G7uc-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "" - ], - "execution_count": 0, - "outputs": [] } ] } \ No newline at end of file diff --git a/model_practice/titanicsurviverpredict.py b/model_practice/titanicsurviverpredict.py index cdd8876..8dd7397 100644 --- a/model_practice/titanicsurviverpredict.py +++ b/model_practice/titanicsurviverpredict.py @@ -21,7 +21,40 @@ train_data['Sex'] = train_data['Sex'].map(sex_mapping) train_data.head() -X = pd.DataFrame(train_data[['Pclass', 'Sex']]) +# guess Age by Sex and Pclass for no Age +guess_ages = np.zeros((2,3)) + +for i in range(0, 2): + for j in range(0, 3): + guess_df = train_data[(train_data['Sex'] == i) & (train_data['Pclass'] == j+1)]['Age'].dropna() + age_guess = guess_df.median() + guess_ages[i, j] = int( age_guess/0.5 + 0.5 ) * 0.5 + +for i in range(0, 2): + for j in range(0, 3): + # fill guess_ages + train_data.loc[ (train_data.Age.isnull()) & (train_data.Sex == i) & (train_data.Pclass == j+1), 'Age'] = guess_ages[i, j] + +train_data['Age'] = train_data['Age'].astype(int) +train_data.head() + +train_data['AgeBand'] = pd.cut(train_data['Age'], 5) +train_data[['AgeBand', 'Survived']].groupby(['AgeBand'], 
as_index=False).mean().sort_values(by='AgeBand', ascending=True) + +train_data.loc[ train_data['Age'] <= 16, 'Age'] = 0 +train_data.loc[ (train_data['Age'] > 16) & (train_data['Age'] <= 32), 'Age'] = 1 +train_data.loc[ (train_data['Age'] > 32) & (train_data['Age'] <= 48), 'Age'] = 2 +train_data.loc[ (train_data['Age'] > 48) & (train_data['Age'] <= 64), 'Age'] = 3 +train_data.loc[ train_data['Age'] > 64, 'Age'] = 4 # last AgeBand (64, 80] -> ordinal 4 +train_data.head() + +feature_label = [ + 'Pclass', + 'Sex', + 'Age', + 'SibSp', +] +X = pd.DataFrame(train_data[feature_label]) y = pd.DataFrame(train_data.iloc[:, 1]) # Survived print(X.shape) print(y.shape) @@ -37,5 +70,4 @@ model.fit(X_train, y_train) # prediction -print(model.score(X_train, y_train)) - +print(model.score(X_test, y_test)) \ No newline at end of file