WayScience · roshankern · Aug 30, 2023 · Aug 24, 2023 · Aug 24, 2023 · Aug 24, 2023
diff --git a/0.download_data/README.md b/0.download_data/README.md
@@ -14,6 +14,34 @@ The version of mitocheck_data used is specified by the hash corresponding to a c
 The current hash being used is `e1f86cd007657f8247310b78df92891b22e51621` which corresponds to [mitocheck_data/e1f86cd](https://github.com/WayScience/mitocheck_data/tree/e1f86cd007657f8247310b78df92891b22e51621).
 The `hash` variable can be set in [download_data.ipynb](download_data.ipynb) to change which version of mitocheck_data is being accessed.
 
+### Data Preview
+
+The labeled dataset includes CellProfiler (CP) and DeepProfiler (DP) features as well as metadata (location, perturbation, etc.) for cells from the original MitoCheck project.
+The breakdown of cell counts by phenotypic class (as labeled manually by MitoCheck) is as follows:
+
+| Phenotypic Class    | Cell Count |
+|---------------------|-------|
+| Interphase          | 420   |
+| Polylobed           | 367   |
+| Prometaphase        | 345   |
+| OutOfFocus          | 304   |
+| Apoptosis           | 273   |
+| Binuclear           | 184   |
+| MetaphaseAlignment  | 175   |
+| SmallIrregular      | 164   |
+| Hole                | 114   |
+| Elongated           | 110   |
+| ADCCM               | 95    |
+| Anaphase            | 84    |
+| Large               | 79    |
+| Grape               | 74    |
+| Metaphase           | 74    |
+| Folded              | 54    |
+
+**Note**: The `get_features_data()` function (defined in [split_utils.py](../utils/split_utils.py)), which is used to load the labeled cell dataset, excludes cells from the `Folded` phenotypic class.
+In our testing, the low representation of `Folded` cells leads to very low classification accuracy for this class (only tested with multi-class models).
+Thus, we opt to exclude these cells from all training and testing.
+
 ## Step 1: Download Data
 
 Use the commands below to download labeled training dataset:

diff --git a/3.evaluate_model/class_PR_curves.ipynb b/3.evaluate_model/class_PR_curves.ipynb
@@ -24,7 +24,7 @@
     "sys.path.append(\"../utils\")\n",
     "from split_utils import get_features_data\n",
     "from train_utils import get_dataset\n",
-    "from evaluate_utils import class_PR_curves, class_PR_curves_SCM\n"
+    "from evaluate_utils import class_PR_curves, class_PR_curves_SCM"
    ]
   },
   {
@@ -44,7 +44,7 @@
     "data_split_path = pathlib.Path(\"../1.split_data/indexes/data_split_indexes.tsv\")\n",
     "data_split_indexes = pd.read_csv(data_split_path, sep=\"\\t\", index_col=0)\n",
     "features_dataframe_path = pathlib.Path(\"../0.download_data/data/labeled_data.csv.gz\")\n",
-    "features_dataframe = get_features_data(features_dataframe_path)"
+    "features_dataframe = get_features_data(features_dataframe_path)\n"
    ]
   },
   {
@@ -329,7 +329,7 @@
     "        PR_data[\"feature_type\"] = feature_type\n",
     "\n",
     "        # add this score data to the tidy scores compiling list\n",
-    "        compiled_class_PR_curves.append(PR_data)"
+    "        compiled_class_PR_curves.append(PR_data)\n"
    ]
   },
   {
@@ -545,14 +545,14 @@
     "compiled_class_PR_curves.to_csv(compiled_PR_data_save_path, sep=\"\\t\")\n",
     "\n",
     "# preview tidy data\n",
-    "compiled_class_PR_curves"
+    "compiled_class_PR_curves\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Evaluate each model on each dataset (multiclass)\n"
+    "### Evaluate each model on each dataset (single class)\n"
    ]
   },
   {
@@ -618,7 +618,6 @@
     "    for feature_type, evaluation_type, phenotypic_class in itertools.product(\n",
     "        feature_types, evaluation_types, phenotypic_classes\n",
     "    ):\n",
-    "\n",
     "        # load single class model for this combination of model type, feature type, and phenotypic class\n",
     "        single_class_model_path = pathlib.Path(\n",
     "            f\"{single_class_models_dir}/{phenotypic_class}_models/{model_type}__{feature_type}.joblib\"\n",
@@ -658,7 +657,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Save PR curves from each evaluation (multiclass)\n"
+    "### Save PR curves from each evaluation (single class)\n"
    ]
   },
   {
@@ -865,7 +864,7 @@
     "compiled_SCM_PR_data.to_csv(compiled_PR_data_save_path, sep=\"\\t\")\n",
     "\n",
     "# preview tidy data\n",
-    "compiled_SCM_PR_data"
+    "compiled_SCM_PR_data\n"
    ]
   }
  ],

diff --git a/3.evaluate_model/get_LOIO_probabilities.ipynb b/3.evaluate_model/get_LOIO_probabilities.ipynb
@@ -29,7 +29,7 @@
     "sys.path.append(\"../utils\")\n",
     "from split_utils import get_features_data\n",
     "from train_utils import get_X_y_data\n",
-    "from evaluate_utils import get_SCM_model_data\n"
+    "from evaluate_utils import get_SCM_model_data"
    ]
   },
   {
@@ -286,7 +286,7 @@
     "\n",
     "# preview labeled data\n",
     "print(labeled_data.shape)\n",
-    "labeled_data.head(5)\n"
+    "labeled_data.head(5)"
    ]
   },
   {
@@ -305,14 +305,14 @@
    "source": [
     "# see number of images to\n",
     "num_images = labeled_data[\"Metadata_DNA\"].unique().shape[0]\n",
-    "print(f\"There are {num_images} images to perform LOIO evaluation on per model.\")"
+    "print(f\"There are {num_images} images to perform LOIO evaluation on per model.\")\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Get LOIO probabilities\n"
+    "### Get LOIO probabilities (multi class models)\n"
    ]
   },
   {
@@ -406,14 +406,14 @@
     "        test_cells_wide_data = pd.concat([metadata_dataframe, probas_dataframe], axis=1)\n",
     "\n",
     "        # add tidy long data to compiled data\n",
-    "        compiled_LOIO_wide_data.append(test_cells_wide_data)"
+    "        compiled_LOIO_wide_data.append(test_cells_wide_data)\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Format and save LOIO probabilities\n"
+    "### Format and save LOIO probabilities (multi class models)\n"
    ]
   },
   {
@@ -657,7 +657,7 @@
     "compiled_LOIO_tidy_long_data.to_csv(compiled_LOIO_save_path, sep=\"\\t\")\n",
     "\n",
     "# preview tidy long data\n",
-    "compiled_LOIO_tidy_long_data"
+    "compiled_LOIO_tidy_long_data\n"
    ]
   },
   {
@@ -819,14 +819,14 @@
     "        test_cells_wide_data = pd.concat([metadata_dataframe, probas_dataframe], axis=1)\n",
     "\n",
     "        # add tidy long data to compiled data\n",
-    "        compiled_LOIO_wide_data.append(test_cells_wide_data)"
+    "        compiled_LOIO_wide_data.append(test_cells_wide_data)\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Format and save LOIO probabilities\n"
+    "### Format and save LOIO probabilities (single class models)\n"
    ]
   },
   {
@@ -1082,7 +1082,7 @@
     "compiled_LOIO_tidy_long_data.to_csv(compiled_LOIO_save_path, sep=\"\\t\")\n",
     "\n",
     "# preview tidy long data\n",
-    "compiled_LOIO_tidy_long_data\n"
+    "compiled_LOIO_tidy_long_data"
    ]
   }
  ],

diff --git a/3.evaluate_model/get_model_predictions.ipynb b/3.evaluate_model/get_model_predictions.ipynb
@@ -25,7 +25,7 @@
     "sys.path.append(\"../utils\")\n",
     "from split_utils import get_features_data\n",
     "from train_utils import get_dataset, get_X_y_data\n",
-    "from evaluate_utils import get_SCM_model_data\n"
+    "from evaluate_utils import get_SCM_model_data"
    ]
   },
   {
@@ -45,14 +45,14 @@
     "data_split_path = pathlib.Path(\"../1.split_data/indexes/data_split_indexes.tsv\")\n",
     "data_split_indexes = pd.read_csv(data_split_path, sep=\"\\t\", index_col=0)\n",
     "features_dataframe_path = pathlib.Path(\"../0.download_data/data/labeled_data.csv.gz\")\n",
-    "features_dataframe = get_features_data(features_dataframe_path)"
+    "features_dataframe = get_features_data(features_dataframe_path)\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Get Each Model Predictions on Each Dataset (Multi Class Models)\n"
+    "### Get Each Model Predictions on Each Dataset (multi class models)\n"
    ]
   },
   {
@@ -125,14 +125,14 @@
     "            }\n",
     "        )\n",
     "\n",
-    "        compiled_predictions.append(predictions_df)\n"
+    "        compiled_predictions.append(predictions_df)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Compile and Save Predictions\n"
+    "### Compile and Save Predictions (multi class models)\n"
    ]
   },
   {
@@ -321,14 +321,14 @@
     "compiled_predictions.to_csv(compiled_predictions_save_path, sep=\"\\t\")\n",
     "\n",
     "# preview compiled predictions\n",
-    "compiled_predictions"
+    "compiled_predictions\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Get Each Model Predictions on Each Dataset (Single Class Models)\n"
+    "### Get Each Model Predictions on Each Dataset (single class models)\n"
    ]
   },
   {
@@ -577,14 +577,14 @@
     "        }\n",
     "    )\n",
     "\n",
-    "    compiled_predictions.append(predictions_df)"
+    "    compiled_predictions.append(predictions_df)\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Compile and Save Predictions\n"
+    "### Compile and Save Predictions (single class models)\n"
    ]
   },
   {
@@ -775,7 +775,7 @@
     "compiled_predictions.to_csv(compiled_predictions_save_path, sep=\"\\t\")\n",
     "\n",
     "# preview compiled predictions\n",
-    "compiled_predictions"
+    "compiled_predictions\n"
    ]
   }
  ],

diff --git a/3.evaluate_model/scripts/nbconverted/class_PR_curves.py b/3.evaluate_model/scripts/nbconverted/class_PR_curves.py
@@ -111,7 +111,7 @@
 compiled_class_PR_curves
 
 
-# ### Evaluate each model on each dataset (multiclass)
+# ### Evaluate each model on each dataset (single class)
 # 
 
 # In[5]:
@@ -188,7 +188,7 @@
     plt.show()
 
 
-# ### Save PR curves from each evaluation (multiclass)
+# ### Save PR curves from each evaluation (single class)
 # 
 
 # In[6]:
@@ -212,3 +212,4 @@
 
 # preview tidy data
 compiled_SCM_PR_data
+
diff --git a/3.evaluate_model/scripts/nbconverted/get_LOIO_probabilities.py b/3.evaluate_model/scripts/nbconverted/get_LOIO_probabilities.py
@@ -49,7 +49,7 @@
 print(f"There are {num_images} images to perform LOIO evaluation on per model.")
 
 
-# ### Get LOIO probabilities
+# ### Get LOIO probabilities (multi class models)
 # 
 
 # In[4]:
@@ -133,7 +133,7 @@
         compiled_LOIO_wide_data.append(test_cells_wide_data)
 
 
-# ### Format and save LOIO probabilities
+# ### Format and save LOIO probabilities (multi class models)
 # 
 
 # In[5]:
@@ -274,7 +274,7 @@
         compiled_LOIO_wide_data.append(test_cells_wide_data)
 
 
-# ### Format and save LOIO probabilities
+# ### Format and save LOIO probabilities (single class models)
 # 
 
 # In[7]: