From 52f920e1088cdbca94ff8a625adc6aa5caf9353c Mon Sep 17 00:00:00 2001 From: mbuttner Date: Tue, 2 Aug 2022 18:21:09 +0200 Subject: [PATCH 1/3] :memo: add preprocessing notebook --- docs/tutorials/preprocessing.ipynb | 262 +++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 docs/tutorials/preprocessing.ipynb diff --git a/docs/tutorials/preprocessing.ipynb b/docs/tutorials/preprocessing.ipynb new file mode 100644 index 0000000..25c8da2 --- /dev/null +++ b/docs/tutorials/preprocessing.ipynb @@ -0,0 +1,262 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocess flow data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we load an FCS file into the anndata format, move the forward scatter (FSC) and side scatter (SSC) information to the `.obs` section of the anndata file and perform compensation on the data. Next, we apply different types of normalisation to the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import readfcs\n", + "import pytometry as pm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read data from `readfcs` package example." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from urllib.request import urlretrieve\n", + "\n", + "path_data, _ = urlretrieve(readfcs.datasets.example(), \"example.fcs\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "adata = pm.io.read_fcs(path_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reduce features " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We split the data matrix into the marker intensity part and the FSC/SSC part. Moreover, we move all height-related features to the `.obs` part of the anndata file. Notably, the function `split_signal` checks whether a feature name is either FSC/SSC or ends with `-A` for area-related features or `-H` for height-related features. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pm.pp.split_signal(adata)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us check the `var_names` of the features and the channel names. In this example, the channel names have been cleaned such that none of the markers have the `-A` or `-H` suffix. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "adata.var" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us modify the feature column `signal_type` manually." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "adata.var[\"signal_type\"] = adata.var[\"signal_type\"].cat.add_categories([\"area\"])\n", + "adata.var[\"signal_type\"][3:] = \"area\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "adata.var" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Repeat to split the data matrix." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pm.pp.split_signal(adata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "adata" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time, we did not get the warning that all features are returned. Indeed, the data matrix was reduced by three features (`FSC-A`, `FSC-H` and `SSC-A`). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compensation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we compensate the data using the compensation matrix that is included in the FCS file header. Alternatively, one may provide a custom compensation matrix." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pm.pp.compensate(adata)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Normalize data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the next step, we normalize the data. By default, normalization is an inplace operation, i.e. we only create a new anndata object, if we set the argument `copy=True`. 
We demonstrate three different normalization methods that are built into `pytometry`:\n", + "* arcsinh \n", + "* logicle \n", + "* bi-exponential" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "adata_arcsinh = pm.tl.normalize_arcsinh(adata, cofactor=150, copy=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "adata_logicle = pm.tl.normalize_logicle(adata, copy=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "adata_biex = pm.tl.normalize_biExp(adata, copy=True)" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "48c3c4927e81daf79217bae0bb1c93e3ab00a11990990ff2e155253980f357b0" + }, + "kernelspec": { + "display_name": "Python 3.9.7 ('pyto_dev')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 13d97ad0c1ac714901a481a7a72547caa6336a3b Mon Sep 17 00:00:00 2001 From: mbuttner Date: Tue, 2 Aug 2022 18:30:50 +0200 Subject: [PATCH 2/3] :bug: change warning type --- pytometry/preprocessing/_process_data.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pytometry/preprocessing/_process_data.py b/pytometry/preprocessing/_process_data.py index de51cfe..aeed92b 100644 --- a/pytometry/preprocessing/_process_data.py +++ b/pytometry/preprocessing/_process_data.py @@ -173,10 +173,8 @@ def compensate( # check for nan values nan_val = np.isnan(adata.X[:, indexes]).sum() if nan_val > 0: - raise Warning( - f"{nan_val} NaN values found after compensation. Please adjust" - " compensation matrix." 
- ) + assert f"{nan_val} NaN values found after compensation. Please adjust " + "compensation matrix." return adata if copy else None From e13b3ef893ba46b7a660618a0006b230efe6c2a4 Mon Sep 17 00:00:00 2001 From: mbuttner Date: Tue, 2 Aug 2022 18:36:09 +0200 Subject: [PATCH 3/3] :memo: add nb to tutorials page --- .gitignore | 3 +++ docs/tutorials/index.md | 1 + 2 files changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index c5003e4..a52b104 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,6 @@ _build docs/pytometry.* lamin_sphinx docs/conf.py + +# data +docs/tutorials/*.fcs diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index b09bc17..e3f4053 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -11,4 +11,5 @@ This makes it both easy for the user to understand the documentation, and for th quickstart read_fcs +preprocessing ```