diff --git a/simba/plotting/_plot.py b/simba/plotting/_plot.py index c7e8016..92068a2 100755 --- a/simba/plotting/_plot.py +++ b/simba/plotting/_plot.py @@ -830,7 +830,8 @@ def _scatterplot2d(df, # fig_legend_order: `dict`,optional (default: None) # Specified order for the appearance of the annotation keys. # Only valid for categorical/string variable -# e.g. fig_legend_order = {'ann1':['a','b','c'],'ann2':['aa','bb','cc']} +# e.g. fig_legend_order = {'ann1':['a','b','c'], +# 'ann2':['aa','bb','cc']} # fig_legend_ncol: `int`, optional (default: 1) # The number of columns that the legend has. # vmin,vmax: `float`, optional (default: None) @@ -888,7 +889,8 @@ def _scatterplot2d(df, # if(len(list_hue) < fig_ncol): # fig_ncol = len(list_hue) # fig_nrow = int(np.ceil(len(list_hue)/fig_ncol)) -# fig = plt.figure(figsize=(fig_size[0]*fig_ncol*1.05, fig_size[1]*fig_nrow)) +# fig = plt.figure(figsize=(fig_size[0]*fig_ncol*1.05, +# fig_size[1]*fig_nrow)) # for hue in list_hue: # if hue in hue_palette.keys(): # palette = hue_palette[hue] @@ -1129,10 +1131,7 @@ def umap(adata, def discretize(adata, - layer=None, - kde=False, - bins=20, - fig_size=(5, 8), + fig_size=(6, 6), pad=1.08, w_pad=None, h_pad=None, @@ -1146,15 +1145,6 @@ def discretize(adata, ---------- adata : `Anndata` Annotated data matrix. - layer : `str`, optional (default: None) - Layer to use for original histogram plot. - If None, ``adata.X`` will be used. - bins : `int`, optional (default: 20) - The number of equal-width bins in the given range - for original histogram plot. - kde : `bool`, optional (default: True) - If True, compute a kernel density estimate to smooth the distribution - and show on the plot pad: `float`, optional (default: 1.08) Padding between the figure edge and the edges of subplots, as a fraction of the font size. @@ -1170,7 +1160,7 @@ def discretize(adata, fig_name: `str`, optional (default: 'plot_discretize.pdf') if `save_fig` is True, specify figure name. **kwargs: `dict`, optional - Other keyword arguments are passed through to ``sns.histplot`` + Other keyword arguments are passed through to ``plt.hist()`` Returns ------- @@ -1183,27 +1173,29 @@ def discretize(adata, if fig_path is None: fig_path = os.path.join(settings.workdir, 'figures') - if layer is None: - X = adata.X.copy() - else: - X = adata.layers[layer].copy() - nonzero_disc = adata.uns['disc']['disc_ori'].data - bin_edges = adata.uns['disc']['bin_edges'][0] - print(bin_edges) + assert 'disc' in adata.uns_keys(), \ + "please run `si.tl.discretize()` first" + + hist_edges = adata.uns['disc']['hist_edges'] + hist_count = adata.uns['disc']['hist_count'] + bin_edges = adata.uns['disc']['bin_edges'] + bin_count = adata.uns['disc']['bin_count'] + fig, ax = plt.subplots(2, 1, figsize=fig_size) - _ = sns.histplot(ax=ax[0], - x=X.data, - kde=kde, - bins=bins, - **kwargs) - _ = sns.histplot(ax=ax[1], - x=nonzero_disc, - kde=False, - bins=bin_edges, - **kwargs) + _ = ax[0].hist(hist_edges[:-1], + hist_edges, + weights=hist_count, + linewidth=0, + **kwargs) + _ = ax[1].hist(bin_edges[:-1], + bin_edges, + weights=bin_count, + **kwargs) ax[0].set_xlabel('Non-zero values') + ax[0].set_ylabel('Count') ax[0].set_title('Original') ax[1].set_xlabel('Non-zero values') + ax[1].set_ylabel('Count') ax[1].set_title('Discretized') plt.tight_layout(pad=pad, h_pad=h_pad, w_pad=w_pad) if(save_fig): diff --git a/simba/tools/_general.py b/simba/tools/_general.py index 85b0566..8dbd924 100755 --- a/simba/tools/_general.py +++ b/simba/tools/_general.py @@ -1,43 +1,71 @@ """General-purpose tools""" -from sklearn.preprocessing import KBinsDiscretizer +import numpy as np +from sklearn.cluster import KMeans def discretize(adata, layer=None, - n_bins=3, - encode='ordinal', - strategy='kmeans', - dtype=None): - """Discretize continous features + n_bins=5, + max_bins=100): + """Discretize continous values + Parameters ---------- adata: AnnData Annotated data matrix. + layer: `str`, optional (default: None) + The layer used to perform discretization + n_bins: `int`, optional (default: 5) + The number of bins to produce. + It must be smaller than `max_bins`. + max_bins: `int`, optional (default: 100) + The number of bins used in the initial approximation. + i.e. the number of bins to cluster. Returns ------- - updates `adata` with the following fields. - X: `numpy.ndarray` (`adata.X`) - Store #observations × #var_genes logarithmized data matrix. + updates `adata` with the following fields + + `.layer['disc']` : `array_like` + Discretized values. + `.uns['disc']` : `dict` + `bin_edges`: The edges of each bin. + `bin_count`: The number of values in each bin. + `hist_edges`: The edges of each bin \ + in the initial approximation. + `hist_count`: The number of values in each bin \ + for the initial approximation. """ if layer is None: - X = adata.X.copy() + X = adata.X else: - X = adata.layers[layer].copy() - est = KBinsDiscretizer(n_bins=n_bins, - encode=encode, - strategy=strategy, - dtype=dtype) - nonzero_cont = X.data.copy() - nonzero_id = est.fit_transform(nonzero_cont.reshape(-1, 1)) - nonzero_disc = est.inverse_transform(nonzero_id).reshape(-1, ) - - adata.layers['disc'] = adata.layers['raw'].copy() - adata.layers['disc'].data = (nonzero_id+1).reshape(-1,) - - # discretized data transformed back to original feature space + X = adata.layers[layer] + nonzero_cont = X.data + + hist_count, hist_edges = np.histogram( + nonzero_cont, + bins=max_bins, + density=False) + hist_centroids = (hist_edges[0:-1] + hist_edges[1:])/2 + + kmeans = KMeans(n_clusters=n_bins, random_state=2021).fit( + hist_centroids.reshape(-1, 1), + sample_weight=hist_count) + cluster_centers = np.sort(kmeans.cluster_centers_.flatten()) + + padding = (hist_edges[-1] - hist_edges[0])/(max_bins*10) + bin_edges = np.array( + [hist_edges[0]-padding] + + list((cluster_centers[0:-1] + cluster_centers[1:])/2) + + [hist_edges[-1]+padding]) + nonzero_disc = np.digitize(nonzero_cont, bin_edges).reshape(-1,) + bin_count = np.unique(nonzero_disc, return_counts=True)[1] + + adata.layers['disc'] = X.copy() + adata.layers['disc'].data = nonzero_disc adata.uns['disc'] = dict() - adata.uns['disc']['disc_ori'] = adata.layers['raw'].copy() - adata.uns['disc']['disc_ori'].data = nonzero_disc.reshape(-1,) - adata.uns['disc']['bin_edges'] = est.bin_edges_ + adata.uns['disc']['bin_edges'] = bin_edges + adata.uns['disc']['bin_count'] = bin_count + adata.uns['disc']['hist_edges'] = hist_edges + adata.uns['disc']['hist_count'] = hist_count