diff --git a/0.download_data/README.md b/0.download_data/README.md index bece3893..bc593ace 100644 --- a/0.download_data/README.md +++ b/0.download_data/README.md @@ -14,6 +14,34 @@ The version of mitocheck_data used is specified by the hash corresponding to a c The current hash being used is `e1f86cd007657f8247310b78df92891b22e51621` which corresponds to [mitocheck_data/e1f86cd](https://github.com/WayScience/mitocheck_data/tree/e1f86cd007657f8247310b78df92891b22e51621). The `hash` variable can be set in [download_data.ipynb](download_data.ipynb) to change which version of mitocheck_data is being accessed. +### Data Preview + +The labeled dataset includes CellProfiler (CP) and DeepProfiler (DP) features as well as metadata (location, perturbation, etc) for cells from the original MitoCheck project. +The breakdown of cell counts by phenotypic class (as labeled manually by MitoCheck) is as follows: + +| Phenotypic Class | Cell Count | +|---------------------|-------| +| Interphase | 420 | +| Polylobed | 367 | +| Prometaphase | 345 | +| OutOfFocus | 304 | +| Apoptosis | 273 | +| Binuclear | 184 | +| MetaphaseAlignment | 175 | +| SmallIrregular | 164 | +| Hole | 114 | +| Elongated | 110 | +| ADCCM | 95 | +| Anaphase | 84 | +| Large | 79 | +| Grape | 74 | +| Metaphase | 74 | +| Folded | 54 | + +**Note**: The `get_features_data()` function (defined in [split_utils.py](../utils/split_utils.py)) used to load the labeled cell dataset excludes cells from the `Folded` phenotypic class when loading the labeled cells. +In our testing, the low representation of `Folded` cells leads to significantly low classification accuracy for this class (only tested with multi-class models). +Thus, we opt to exclude these cells from all training and testing. 
+ ## Step 1: Download Data Use the commands below to download labeled training dataset: diff --git a/3.evaluate_model/class_PR_curves.ipynb b/3.evaluate_model/class_PR_curves.ipynb index 6c44c722..3f2284ab 100644 --- a/3.evaluate_model/class_PR_curves.ipynb +++ b/3.evaluate_model/class_PR_curves.ipynb @@ -24,7 +24,7 @@ "sys.path.append(\"../utils\")\n", "from split_utils import get_features_data\n", "from train_utils import get_dataset\n", - "from evaluate_utils import class_PR_curves, class_PR_curves_SCM\n" + "from evaluate_utils import class_PR_curves, class_PR_curves_SCM" ] }, { @@ -44,7 +44,7 @@ "data_split_path = pathlib.Path(\"../1.split_data/indexes/data_split_indexes.tsv\")\n", "data_split_indexes = pd.read_csv(data_split_path, sep=\"\\t\", index_col=0)\n", "features_dataframe_path = pathlib.Path(\"../0.download_data/data/labeled_data.csv.gz\")\n", - "features_dataframe = get_features_data(features_dataframe_path)" + "features_dataframe = get_features_data(features_dataframe_path)\n" ] }, { @@ -329,7 +329,7 @@ " PR_data[\"feature_type\"] = feature_type\n", "\n", " # add this score data to the tidy scores compiling list\n", - " compiled_class_PR_curves.append(PR_data)" + " compiled_class_PR_curves.append(PR_data)\n" ] }, { @@ -545,14 +545,14 @@ "compiled_class_PR_curves.to_csv(compiled_PR_data_save_path, sep=\"\\t\")\n", "\n", "# preview tidy data\n", - "compiled_class_PR_curves" + "compiled_class_PR_curves\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Evaluate each model on each dataset (multiclass)\n" + "### Evaluate each model on each dataset (single class)\n" ] }, { @@ -618,7 +618,6 @@ " for feature_type, evaluation_type, phenotypic_class in itertools.product(\n", " feature_types, evaluation_types, phenotypic_classes\n", " ):\n", - "\n", " # load single class model for this combination of model type, feature type, and phenotypic class\n", " single_class_model_path = pathlib.Path(\n", " 
f\"{single_class_models_dir}/{phenotypic_class}_models/{model_type}__{feature_type}.joblib\"\n", @@ -658,7 +657,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Save PR curves from each evaluation (multiclass)\n" + "### Save PR curves from each evaluation (single class)\n" ] }, { @@ -865,7 +864,7 @@ "compiled_SCM_PR_data.to_csv(compiled_PR_data_save_path, sep=\"\\t\")\n", "\n", "# preview tidy data\n", - "compiled_SCM_PR_data" + "compiled_SCM_PR_data\n" ] } ], diff --git a/3.evaluate_model/get_LOIO_probabilities.ipynb b/3.evaluate_model/get_LOIO_probabilities.ipynb index c83cfc15..5baf9f5f 100644 --- a/3.evaluate_model/get_LOIO_probabilities.ipynb +++ b/3.evaluate_model/get_LOIO_probabilities.ipynb @@ -29,7 +29,7 @@ "sys.path.append(\"../utils\")\n", "from split_utils import get_features_data\n", "from train_utils import get_X_y_data\n", - "from evaluate_utils import get_SCM_model_data\n" + "from evaluate_utils import get_SCM_model_data" ] }, { @@ -286,7 +286,7 @@ "\n", "# preview labeled data\n", "print(labeled_data.shape)\n", - "labeled_data.head(5)\n" + "labeled_data.head(5)" ] }, { @@ -305,14 +305,14 @@ "source": [ "# see number of images to\n", "num_images = labeled_data[\"Metadata_DNA\"].unique().shape[0]\n", - "print(f\"There are {num_images} images to perform LOIO evaluation on per model.\")" + "print(f\"There are {num_images} images to perform LOIO evaluation on per model.\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Get LOIO probabilities\n" + "### Get LOIO probabilities (multi class models)\n" ] }, { @@ -406,14 +406,14 @@ " test_cells_wide_data = pd.concat([metadata_dataframe, probas_dataframe], axis=1)\n", "\n", " # add tidy long data to compiled data\n", - " compiled_LOIO_wide_data.append(test_cells_wide_data)" + " compiled_LOIO_wide_data.append(test_cells_wide_data)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Format and save LOIO probabilities\n" + "### Format and save LOIO 
probabilities (multi class models)\n" ] }, { @@ -657,7 +657,7 @@ "compiled_LOIO_tidy_long_data.to_csv(compiled_LOIO_save_path, sep=\"\\t\")\n", "\n", "# preview tidy long data\n", - "compiled_LOIO_tidy_long_data" + "compiled_LOIO_tidy_long_data\n" ] }, { @@ -819,14 +819,14 @@ " test_cells_wide_data = pd.concat([metadata_dataframe, probas_dataframe], axis=1)\n", "\n", " # add tidy long data to compiled data\n", - " compiled_LOIO_wide_data.append(test_cells_wide_data)" + " compiled_LOIO_wide_data.append(test_cells_wide_data)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Format and save LOIO probabilities\n" + "### Format and save LOIO probabilities (single class models)\n" ] }, { @@ -1082,7 +1082,7 @@ "compiled_LOIO_tidy_long_data.to_csv(compiled_LOIO_save_path, sep=\"\\t\")\n", "\n", "# preview tidy long data\n", - "compiled_LOIO_tidy_long_data\n" + "compiled_LOIO_tidy_long_data" ] } ], diff --git a/3.evaluate_model/get_model_predictions.ipynb b/3.evaluate_model/get_model_predictions.ipynb index 675f6d34..e1a3535e 100644 --- a/3.evaluate_model/get_model_predictions.ipynb +++ b/3.evaluate_model/get_model_predictions.ipynb @@ -25,7 +25,7 @@ "sys.path.append(\"../utils\")\n", "from split_utils import get_features_data\n", "from train_utils import get_dataset, get_X_y_data\n", - "from evaluate_utils import get_SCM_model_data\n" + "from evaluate_utils import get_SCM_model_data" ] }, { @@ -45,14 +45,14 @@ "data_split_path = pathlib.Path(\"../1.split_data/indexes/data_split_indexes.tsv\")\n", "data_split_indexes = pd.read_csv(data_split_path, sep=\"\\t\", index_col=0)\n", "features_dataframe_path = pathlib.Path(\"../0.download_data/data/labeled_data.csv.gz\")\n", - "features_dataframe = get_features_data(features_dataframe_path)" + "features_dataframe = get_features_data(features_dataframe_path)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Get Each Model Predictions on Each Dataset (Multi Class Models)\n" + "### Get Each 
Model Predictions on Each Dataset (multi class models)\n" ] }, { @@ -125,14 +125,14 @@ " }\n", " )\n", "\n", - " compiled_predictions.append(predictions_df)\n" + " compiled_predictions.append(predictions_df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Compile and Save Predictions\n" + "### Compile and Save Predictions (multi class models)\n" ] }, { @@ -321,14 +321,14 @@ "compiled_predictions.to_csv(compiled_predictions_save_path, sep=\"\\t\")\n", "\n", "# preview compiled predictions\n", - "compiled_predictions" + "compiled_predictions\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Get Each Model Predictions on Each Dataset (Single Class Models)\n" + "### Get Each Model Predictions on Each Dataset (single class models)\n" ] }, { @@ -577,14 +577,14 @@ " }\n", " )\n", "\n", - " compiled_predictions.append(predictions_df)" + " compiled_predictions.append(predictions_df)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Compile and Save Predictions\n" + "### Compile and Save Predictions (single class models)\n" ] }, { @@ -775,7 +775,7 @@ "compiled_predictions.to_csv(compiled_predictions_save_path, sep=\"\\t\")\n", "\n", "# preview compiled predictions\n", - "compiled_predictions" + "compiled_predictions\n" ] } ], diff --git a/3.evaluate_model/scripts/nbconverted/class_PR_curves.py b/3.evaluate_model/scripts/nbconverted/class_PR_curves.py index 1fd6e871..ec3dc3ac 100644 --- a/3.evaluate_model/scripts/nbconverted/class_PR_curves.py +++ b/3.evaluate_model/scripts/nbconverted/class_PR_curves.py @@ -111,7 +111,7 @@ compiled_class_PR_curves -# ### Evaluate each model on each dataset (multiclass) +# ### Evaluate each model on each dataset (single class) # # In[5]: @@ -188,7 +188,7 @@ plt.show() -# ### Save PR curves from each evaluation (multiclass) +# ### Save PR curves from each evaluation (single class) # # In[6]: @@ -212,3 +212,4 @@ # preview tidy data compiled_SCM_PR_data + diff --git 
a/3.evaluate_model/scripts/nbconverted/get_LOIO_probabilities.py b/3.evaluate_model/scripts/nbconverted/get_LOIO_probabilities.py index 791b8e94..9173b907 100644 --- a/3.evaluate_model/scripts/nbconverted/get_LOIO_probabilities.py +++ b/3.evaluate_model/scripts/nbconverted/get_LOIO_probabilities.py @@ -49,7 +49,7 @@ print(f"There are {num_images} images to perform LOIO evaluation on per model.") -# ### Get LOIO probabilities +# ### Get LOIO probabilities (multi class models) # # In[4]: @@ -133,7 +133,7 @@ compiled_LOIO_wide_data.append(test_cells_wide_data) -# ### Format and save LOIO probabilities +# ### Format and save LOIO probabilities (multi class models) # # In[5]: @@ -274,7 +274,7 @@ compiled_LOIO_wide_data.append(test_cells_wide_data) -# ### Format and save LOIO probabilities +# ### Format and save LOIO probabilities (single class models) # # In[7]: diff --git a/3.evaluate_model/scripts/nbconverted/get_model_predictions.py b/3.evaluate_model/scripts/nbconverted/get_model_predictions.py index 80b44e49..a5d68818 100644 --- a/3.evaluate_model/scripts/nbconverted/get_model_predictions.py +++ b/3.evaluate_model/scripts/nbconverted/get_model_predictions.py @@ -35,7 +35,7 @@ features_dataframe = get_features_data(features_dataframe_path) -# ### Get Each Model Predictions on Each Dataset (Multi Class Models) +# ### Get Each Model Predictions on Each Dataset (multi class models) # # In[3]: @@ -89,7 +89,7 @@ compiled_predictions.append(predictions_df) -# ### Compile and Save Predictions +# ### Compile and Save Predictions (multi class models) # # In[4]: @@ -109,7 +109,8 @@ compiled_predictions -# ### Get Each Model Predictions on Each Dataset (Single Class Models) +# ### Get Each Model Predictions on Each Dataset (single class models) +# # In[5]: @@ -121,7 +122,10 @@ compiled_predictions = [] # define combinations to test over -model_types = ["final", "shuffled_baseline"] # only perform LOIO with hyper params from final models so skip shuffled_baseline models 
+model_types = [ +    "final", +    "shuffled_baseline", +] # get predictions from both the final and shuffled baseline single class models feature_types = ["CP", "DP", "CP_and_DP"] evaluation_types = ["train", "test"] phenotypic_classes = features_dataframe["Mitocheck_Phenotypic_Class"].unique() @@ -135,20 +139,20 @@ f"{models_dir}/{phenotypic_class}_models/{model_type}__{feature_type}.joblib" ) model = load(single_class_model_path) - + print( - f"Getting predictions for {phenotypic_class} model: {model_type}, trained with features: {feature_type}, on dataset: {evaluation_type}" - ) - + f"Getting predictions for {phenotypic_class} model: {model_type}, trained with features: {feature_type}, on dataset: {evaluation_type}" + ) + # load dataset (train, test, etc) data = get_SCM_model_data(features_dataframe, phenotypic_class, evaluation_type) - + # get features and labels dataframe X, y = get_X_y_data(data, feature_type) - + # get predictions from model y_pred = model.predict(X) - + # create dataframe with dataset index of cell being predicted, # predicted phenotypic class, # true phenotypic class, @@ -167,7 +171,8 @@ compiled_predictions.append(predictions_df) -# ### Compile and Save Predictions +# ### Compile and Save Predictions (single class models) +# # In[6]: @@ -176,7 +181,9 @@ compiled_predictions = pd.concat(compiled_predictions).reset_index(drop=True) # specify save path -compiled_predictions_save_path = pathlib.Path("predictions/compiled_SCM_predictions.tsv") +compiled_predictions_save_path = pathlib.Path( +    "predictions/compiled_SCM_predictions.tsv" +) compiled_predictions_save_path.parent.mkdir(parents=True, exist_ok=True) # save data as tsv diff --git a/utils/split_utils.py b/utils/split_utils.py index 2dbe584a..6cce2e6f 100644 --- a/utils/split_utils.py +++ b/utils/split_utils.py @@ -13,14 +13,14 @@ def get_features_data(load_path: pathlib.Path) -> pd.DataFrame: """get features data from csv at load path Args: - load_path (pathlib.Path): path to 
training data csv + load_path (pathlib.Path): path to labeled data csv Returns: - pd.DataFrame: training dataframe + pd.DataFrame: labeled cells dataframe """ # read dataset into pandas dataframe features_data = pd.read_csv(load_path, index_col=0) - # remove fold class that has low representation + # exclude the Folded class: its low representation leads to significantly lower classification accuracy features_data = features_data[ features_data["Mitocheck_Phenotypic_Class"] != "Folded" ]