Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 25 additions & 33 deletions simba/plotting/_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -830,7 +830,8 @@ def _scatterplot2d(df,
# fig_legend_order: `dict`,optional (default: None)
# Specified order for the appearance of the annotation keys.
# Only valid for categorical/string variable
# e.g. fig_legend_order = {'ann1':['a','b','c'],'ann2':['aa','bb','cc']}
# e.g. fig_legend_order = {'ann1':['a','b','c'],
# 'ann2':['aa','bb','cc']}
# fig_legend_ncol: `int`, optional (default: 1)
# The number of columns that the legend has.
# vmin,vmax: `float`, optional (default: None)
Expand Down Expand Up @@ -888,7 +889,8 @@ def _scatterplot2d(df,
# if(len(list_hue) < fig_ncol):
# fig_ncol = len(list_hue)
# fig_nrow = int(np.ceil(len(list_hue)/fig_ncol))
# fig = plt.figure(figsize=(fig_size[0]*fig_ncol*1.05, fig_size[1]*fig_nrow))
# fig = plt.figure(figsize=(fig_size[0]*fig_ncol*1.05,
# fig_size[1]*fig_nrow))
# for hue in list_hue:
# if hue in hue_palette.keys():
# palette = hue_palette[hue]
Expand Down Expand Up @@ -1129,10 +1131,7 @@ def umap(adata,


def discretize(adata,
layer=None,
kde=False,
bins=20,
fig_size=(5, 8),
fig_size=(6, 6),
pad=1.08,
w_pad=None,
h_pad=None,
Expand All @@ -1146,15 +1145,6 @@ def discretize(adata,
----------
adata : `Anndata`
Annotated data matrix.
layer : `str`, optional (default: None)
Layer to use for original histogram plot.
If None, ``adata.X`` will be used.
bins : `int`, optional (default: 20)
The number of equal-width bins in the given range
for original histogram plot.
kde : `bool`, optional (default: True)
If True, compute a kernel density estimate to smooth the distribution
and show on the plot
pad: `float`, optional (default: 1.08)
Padding between the figure edge and the edges of subplots,
as a fraction of the font size.
Expand All @@ -1170,7 +1160,7 @@ def discretize(adata,
fig_name: `str`, optional (default: 'plot_discretize.pdf')
if `save_fig` is True, specify figure name.
**kwargs: `dict`, optional
Other keyword arguments are passed through to ``sns.histplot``
Other keyword arguments are passed through to ``plt.hist()``

Returns
-------
Expand All @@ -1183,27 +1173,29 @@ def discretize(adata,
if fig_path is None:
fig_path = os.path.join(settings.workdir, 'figures')

if layer is None:
X = adata.X.copy()
else:
X = adata.layers[layer].copy()
nonzero_disc = adata.uns['disc']['disc_ori'].data
bin_edges = adata.uns['disc']['bin_edges'][0]
print(bin_edges)
assert 'disc' in adata.uns_keys(), \
"please run `si.tl.discretize()` first"

hist_edges = adata.uns['disc']['hist_edges']
hist_count = adata.uns['disc']['hist_count']
bin_edges = adata.uns['disc']['bin_edges']
bin_count = adata.uns['disc']['bin_count']

fig, ax = plt.subplots(2, 1, figsize=fig_size)
_ = sns.histplot(ax=ax[0],
x=X.data,
kde=kde,
bins=bins,
**kwargs)
_ = sns.histplot(ax=ax[1],
x=nonzero_disc,
kde=False,
bins=bin_edges,
**kwargs)
_ = ax[0].hist(hist_edges[:-1],
hist_edges,
weights=hist_count,
linewidth=0,
**kwargs)
_ = ax[1].hist(bin_edges[:-1],
bin_edges,
weights=bin_count,
**kwargs)
ax[0].set_xlabel('Non-zero values')
ax[0].set_ylabel('Count')
ax[0].set_title('Original')
ax[1].set_xlabel('Non-zero values')
ax[1].set_ylabel('Count')
ax[1].set_title('Discretized')
plt.tight_layout(pad=pad, h_pad=h_pad, w_pad=w_pad)
if(save_fig):
Expand Down
80 changes: 54 additions & 26 deletions simba/tools/_general.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,71 @@
"""General-purpose tools"""

from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
from sklearn.cluster import KMeans


def discretize(adata,
layer=None,
n_bins=3,
encode='ordinal',
strategy='kmeans',
dtype=None):
"""Discretize continous features
n_bins=5,
max_bins=100):
"""Discretize continous values

Parameters
----------
adata: AnnData
Annotated data matrix.
layer: `str`, optional (default: None)
The layer used to perform discretization
n_bins: `int`, optional (default: 5)
The number of bins to produce.
It must be smaller than `max_bins`.
max_bins: `int`, optional (default: 100)
The number of bins used in the initial approximation.
i.e. the number of bins to cluster.

Returns
-------
updates `adata` with the following fields.
X: `numpy.ndarray` (`adata.X`)
Store #observations × #var_genes logarithmized data matrix.
updates `adata` with the following fields

`.layer['disc']` : `array_like`
Discretized values.
`.uns['disc']` : `dict`
`bin_edges`: The edges of each bin.
`bin_count`: The number of values in each bin.
`hist_edges`: The edges of each bin \
in the initial approximation.
`hist_count`: The number of values in each bin \
for the initial approximation.
"""
if layer is None:
X = adata.X.copy()
X = adata.X
else:
X = adata.layers[layer].copy()
est = KBinsDiscretizer(n_bins=n_bins,
encode=encode,
strategy=strategy,
dtype=dtype)
nonzero_cont = X.data.copy()
nonzero_id = est.fit_transform(nonzero_cont.reshape(-1, 1))
nonzero_disc = est.inverse_transform(nonzero_id).reshape(-1, )

adata.layers['disc'] = adata.layers['raw'].copy()
adata.layers['disc'].data = (nonzero_id+1).reshape(-1,)

# discretized data transformed back to original feature space
X = adata.layers[layer]
nonzero_cont = X.data

hist_count, hist_edges = np.histogram(
nonzero_cont,
bins=max_bins,
density=False)
hist_centroids = (hist_edges[0:-1] + hist_edges[1:])/2

kmeans = KMeans(n_clusters=n_bins, random_state=2021).fit(
hist_centroids.reshape(-1, 1),
sample_weight=hist_count)
cluster_centers = np.sort(kmeans.cluster_centers_.flatten())

padding = (hist_edges[-1] - hist_edges[0])/(max_bins*10)
bin_edges = np.array(
[hist_edges[0]-padding] +
list((cluster_centers[0:-1] + cluster_centers[1:])/2) +
[hist_edges[-1]+padding])
nonzero_disc = np.digitize(nonzero_cont, bin_edges).reshape(-1,)
bin_count = np.unique(nonzero_disc, return_counts=True)[1]

adata.layers['disc'] = X.copy()
adata.layers['disc'].data = nonzero_disc
adata.uns['disc'] = dict()
adata.uns['disc']['disc_ori'] = adata.layers['raw'].copy()
adata.uns['disc']['disc_ori'].data = nonzero_disc.reshape(-1,)
adata.uns['disc']['bin_edges'] = est.bin_edges_
adata.uns['disc']['bin_edges'] = bin_edges
adata.uns['disc']['bin_count'] = bin_count
adata.uns['disc']['hist_edges'] = hist_edges
adata.uns['disc']['hist_count'] = hist_count