From ec2fac0621029fa83a6d5f5aa3e3f49ea181de01 Mon Sep 17 00:00:00 2001 From: huidongchen Date: Wed, 27 Oct 2021 14:13:51 -0400 Subject: [PATCH 1/2] update .tl.discretize() and .pl.discretize() --- simba/plotting/_plot.py | 67 +++++++++++++++--------------------- simba/tools/_general.py | 76 +++++++++++++++++++++++++++-------------- 2 files changed, 79 insertions(+), 64 deletions(-) diff --git a/simba/plotting/_plot.py b/simba/plotting/_plot.py index c7e8016..614858a 100755 --- a/simba/plotting/_plot.py +++ b/simba/plotting/_plot.py @@ -830,7 +830,8 @@ def _scatterplot2d(df, # fig_legend_order: `dict`,optional (default: None) # Specified order for the appearance of the annotation keys. # Only valid for categorical/string variable -# e.g. fig_legend_order = {'ann1':['a','b','c'],'ann2':['aa','bb','cc']} +# e.g. fig_legend_order = {'ann1':['a','b','c'], +# 'ann2':['aa','bb','cc']} # fig_legend_ncol: `int`, optional (default: 1) # The number of columns that the legend has. # vmin,vmax: `float`, optional (default: None) @@ -888,7 +889,8 @@ def _scatterplot2d(df, # if(len(list_hue) < fig_ncol): # fig_ncol = len(list_hue) # fig_nrow = int(np.ceil(len(list_hue)/fig_ncol)) -# fig = plt.figure(figsize=(fig_size[0]*fig_ncol*1.05, fig_size[1]*fig_nrow)) +# fig = plt.figure(figsize=(fig_size[0]*fig_ncol*1.05, +# fig_size[1]*fig_nrow)) # for hue in list_hue: # if hue in hue_palette.keys(): # palette = hue_palette[hue] @@ -1129,10 +1131,7 @@ def umap(adata, def discretize(adata, - layer=None, - kde=False, - bins=20, - fig_size=(5, 8), + fig_size=(6, 6), pad=1.08, w_pad=None, h_pad=None, @@ -1146,15 +1145,6 @@ def discretize(adata, ---------- adata : `Anndata` Annotated data matrix. - layer : `str`, optional (default: None) - Layer to use for original histogram plot. - If None, ``adata.X`` will be used. - bins : `int`, optional (default: 20) - The number of equal-width bins in the given range - for original histogram plot. - kde : `bool`, optional (default: True) - If True, compute a kernel density estimate to smooth the distribution - and show on the plot pad: `float`, optional (default: 1.08) Padding between the figure edge and the edges of subplots, as a fraction of the font size. @@ -1171,39 +1161,38 @@ def discretize(adata, if `save_fig` is True, specify figure name. **kwargs: `dict`, optional Other keyword arguments are passed through to ``sns.histplot`` - Returns ------- None """ - if fig_size is None: - fig_size = mpl.rcParams['figure.figsize'] - if save_fig is None: - save_fig = settings.save_fig - if fig_path is None: - fig_path = os.path.join(settings.workdir, 'figures') +# if fig_size is None: +# fig_size = mpl.rcParams['figure.figsize'] +# if save_fig is None: +# save_fig = settings.save_fig +# if fig_path is None: +# fig_path = os.path.join(settings.workdir, 'figures') + + assert 'disc' in adata.uns_keys(), \ + "please run `si.tl.discretize()` first" + + hist_edges = adata.uns['disc']['hist_edges'] + hist_count = adata.uns['disc']['hist_count'] + bin_edges = adata.uns['disc']['bin_edges'] + bin_count = adata.uns['disc']['bin_count'] - if layer is None: - X = adata.X.copy() - else: - X = adata.layers[layer].copy() - nonzero_disc = adata.uns['disc']['disc_ori'].data - bin_edges = adata.uns['disc']['bin_edges'][0] - print(bin_edges) fig, ax = plt.subplots(2, 1, figsize=fig_size) - _ = sns.histplot(ax=ax[0], - x=X.data, - kde=kde, - bins=bins, - **kwargs) - _ = sns.histplot(ax=ax[1], - x=nonzero_disc, - kde=False, - bins=bin_edges, - **kwargs) + _ = ax[0].hist(hist_edges[:-1], + hist_edges, + weights=hist_count, + linewidth=0) + _ = ax[1].hist(bin_edges[:-1], + bin_edges, + weights=bin_count) ax[0].set_xlabel('Non-zero values') + ax[0].set_ylabel('Count') ax[0].set_title('Original') ax[1].set_xlabel('Non-zero values') + ax[1].set_ylabel('Count') ax[1].set_title('Discretized') plt.tight_layout(pad=pad, h_pad=h_pad, w_pad=w_pad) if(save_fig): diff --git a/simba/tools/_general.py b/simba/tools/_general.py index 85b0566..87e505e 100755 --- a/simba/tools/_general.py +++ b/simba/tools/_general.py @@ -1,43 +1,69 @@ """General-purpose tools""" -from sklearn.preprocessing import KBinsDiscretizer +import numpy as np +from sklearn.cluster import KMeans def discretize(adata, layer=None, - n_bins=3, - encode='ordinal', - strategy='kmeans', - dtype=None): - """Discretize continous features + n_bins=5, + max_bins=100): + """Discretize continous values + Parameters ---------- adata: AnnData Annotated data matrix. + layer: `str`, optional (default: None) + The layer used to perform discretization + n_bins: `int`, optional (default: 5) + The number of bins to produce. + It must be smaller than `max_bins`. + max_bins: `int`, optional (default: 100) + The number of bins used to the initial approximation. Returns ------- updates `adata` with the following fields. - X: `numpy.ndarray` (`adata.X`) - Store #observations × #var_genes logarithmized data matrix. + `.layer['disc']` : `array_like` + Discretized values. + `.uns['disc']` : `dict` + `bin_edges`: The edges of each bin. + `bin_count`: The number of values in each bin. + `hist_edges`: The edges of each bin \ + in the initial approximation. + `hist_count`: The number of values in each bin \ + for the initial approximation. """ if layer is None: - X = adata.X.copy() + X = adata.X else: - X = adata.layers[layer].copy() - est = KBinsDiscretizer(n_bins=n_bins, - encode=encode, - strategy=strategy, - dtype=dtype) - nonzero_cont = X.data.copy() - nonzero_id = est.fit_transform(nonzero_cont.reshape(-1, 1)) - nonzero_disc = est.inverse_transform(nonzero_id).reshape(-1, ) - - adata.layers['disc'] = adata.layers['raw'].copy() - adata.layers['disc'].data = (nonzero_id+1).reshape(-1,) - - # discretized data transformed back to original feature space + X = adata.layers[layer] + nonzero_cont = X.data + + hist_count, hist_edges = np.histogram( + nonzero_cont, + bins=max_bins, + density=False) + hist_centroids = (hist_edges[0:-1] + hist_edges[1:])/2 + + kmeans = KMeans(n_clusters=n_bins, random_state=2021).fit( + hist_centroids.reshape(-1, 1), + sample_weight=hist_count) + cluster_centers = np.sort(kmeans.cluster_centers_.flatten()) + + padding = (hist_edges[-1] - hist_edges[0])/(max_bins*10) + bin_edges = np.array( + [hist_edges[0]-padding] + + list((cluster_centers[0:-1] + cluster_centers[1:])/2) + + [hist_edges[-1]+padding]) + nonzero_disc = np.digitize(nonzero_cont, bin_edges).reshape(-1,) + bin_count = np.unique(nonzero_disc, return_counts=True)[1] + + adata.layers['disc'] = X.copy() + adata.layers['disc'].data = nonzero_disc adata.uns['disc'] = dict() - adata.uns['disc']['disc_ori'] = adata.layers['raw'].copy() - adata.uns['disc']['disc_ori'].data = nonzero_disc.reshape(-1,) - adata.uns['disc']['bin_edges'] = est.bin_edges_ + adata.uns['disc']['bin_edges'] = bin_edges + adata.uns['disc']['bin_count'] = bin_count + adata.uns['disc']['hist_edges'] = hist_edges + adata.uns['disc']['hist_count'] = hist_count From 825b0bc1c080d7e450e2f3e36d08e0c1810c2b94 Mon Sep 17 00:00:00 2001 From: huidongchen Date: Wed, 27 Oct 2021 15:49:22 -0400 Subject: [PATCH 2/2] update .tl.discretize() and .pl.discretize() --- simba/plotting/_plot.py | 21 ++++++++++++--------- simba/tools/_general.py | 6 ++++-- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/simba/plotting/_plot.py b/simba/plotting/_plot.py index 614858a..92068a2 100755 --- a/simba/plotting/_plot.py +++ b/simba/plotting/_plot.py @@ -1160,17 +1160,18 @@ def discretize(adata, fig_name: `str`, optional (default: 'plot_discretize.pdf') if `save_fig` is True, specify figure name. **kwargs: `dict`, optional - Other keyword arguments are passed through to ``sns.histplot`` + Other keyword arguments are passed through to ``plt.hist()`` + Returns ------- None """ -# if fig_size is None: -# fig_size = mpl.rcParams['figure.figsize'] -# if save_fig is None: -# save_fig = settings.save_fig -# if fig_path is None: -# fig_path = os.path.join(settings.workdir, 'figures') + if fig_size is None: + fig_size = mpl.rcParams['figure.figsize'] + if save_fig is None: + save_fig = settings.save_fig + if fig_path is None: + fig_path = os.path.join(settings.workdir, 'figures') assert 'disc' in adata.uns_keys(), \ "please run `si.tl.discretize()` first" @@ -1184,10 +1185,12 @@ def discretize(adata, _ = ax[0].hist(hist_edges[:-1], hist_edges, weights=hist_count, - linewidth=0) + linewidth=0, + **kwargs) _ = ax[1].hist(bin_edges[:-1], bin_edges, - weights=bin_count) + weights=bin_count, + **kwargs) ax[0].set_xlabel('Non-zero values') ax[0].set_ylabel('Count') ax[0].set_title('Original') diff --git a/simba/tools/_general.py b/simba/tools/_general.py index 87e505e..8dbd924 100755 --- a/simba/tools/_general.py +++ b/simba/tools/_general.py @@ -20,11 +20,13 @@ def discretize(adata, The number of bins to produce. It must be smaller than `max_bins`. max_bins: `int`, optional (default: 100) - The number of bins used to the initial approximation. + The number of bins used in the initial approximation. + i.e. the number of bins to cluster. Returns ------- - updates `adata` with the following fields. + updates `adata` with the following fields + `.layer['disc']` : `array_like` Discretized values. `.uns['disc']` : `dict`