From ec2fac0621029fa83a6d5f5aa3e3f49ea181de01 Mon Sep 17 00:00:00 2001
From: huidongchen <hd7chen@gmail.com>
Date: Wed, 27 Oct 2021 14:13:51 -0400
Subject: [PATCH 1/2] update .tl.discretize() and .pl.discretize()

---
 simba/plotting/_plot.py | 67 +++++++++++++++---------------------
 simba/tools/_general.py | 76 +++++++++++++++++++++++++++--------------
 2 files changed, 79 insertions(+), 64 deletions(-)

diff --git a/simba/plotting/_plot.py b/simba/plotting/_plot.py
index c7e8016..614858a 100755
--- a/simba/plotting/_plot.py
+++ b/simba/plotting/_plot.py
@@ -830,7 +830,8 @@ def _scatterplot2d(df,
 #     fig_legend_order: `dict`,optional (default: None)
 #         Specified order for the appearance of the annotation keys.
 #         Only valid for categorical/string variable
-#         e.g. fig_legend_order = {'ann1':['a','b','c'],'ann2':['aa','bb','cc']}
+#         e.g. fig_legend_order = {'ann1':['a','b','c'],
+#                                  'ann2':['aa','bb','cc']}
 #     fig_legend_ncol: `int`, optional (default: 1)
 #         The number of columns that the legend has.
 #     vmin,vmax: `float`, optional (default: None)
@@ -888,7 +889,8 @@ def _scatterplot2d(df,
 #     if(len(list_hue) < fig_ncol):
 #         fig_ncol = len(list_hue)
 #     fig_nrow = int(np.ceil(len(list_hue)/fig_ncol))
-#     fig = plt.figure(figsize=(fig_size[0]*fig_ncol*1.05, fig_size[1]*fig_nrow))
+#     fig = plt.figure(figsize=(fig_size[0]*fig_ncol*1.05,
+#                      fig_size[1]*fig_nrow))
 #     for hue in list_hue:
 #         if hue in hue_palette.keys():
 #             palette = hue_palette[hue]
@@ -1129,10 +1131,7 @@ def umap(adata,
 
 
 def discretize(adata,
-               layer=None,
-               kde=False,
-               bins=20,
-               fig_size=(5, 8),
+               fig_size=(6, 6),
                pad=1.08,
                w_pad=None,
                h_pad=None,
@@ -1146,15 +1145,6 @@ def discretize(adata,
     ----------
     adata : `Anndata`
         Annotated data matrix.
-    layer : `str`, optional (default: None)
-        Layer to use for original histogram plot.
-        If None, ``adata.X`` will be used.
-    bins : `int`, optional (default: 20)
-        The number of equal-width bins in the given range
-        for original histogram plot.
-    kde : `bool`, optional (default: True)
-        If True, compute a kernel density estimate to smooth the distribution
-        and show on the plot
     pad: `float`, optional (default: 1.08)
         Padding between the figure edge and the edges of subplots,
         as a fraction of the font size.
@@ -1171,39 +1161,38 @@ def discretize(adata,
         if `save_fig` is True, specify figure name.
     **kwargs: `dict`, optional
         Other keyword arguments are passed through to ``sns.histplot``
-
     Returns
     -------
     None
     """
-    if fig_size is None:
-        fig_size = mpl.rcParams['figure.figsize']
-    if save_fig is None:
-        save_fig = settings.save_fig
-    if fig_path is None:
-        fig_path = os.path.join(settings.workdir, 'figures')
+#     if fig_size is None:
+#         fig_size = mpl.rcParams['figure.figsize']
+#     if save_fig is None:
+#         save_fig = settings.save_fig
+#     if fig_path is None:
+#         fig_path = os.path.join(settings.workdir, 'figures')
+
+    assert 'disc' in adata.uns_keys(), \
+        "please run `si.tl.discretize()` first"
+
+    hist_edges = adata.uns['disc']['hist_edges']
+    hist_count = adata.uns['disc']['hist_count']
+    bin_edges = adata.uns['disc']['bin_edges']
+    bin_count = adata.uns['disc']['bin_count']
 
-    if layer is None:
-        X = adata.X.copy()
-    else:
-        X = adata.layers[layer].copy()
-    nonzero_disc = adata.uns['disc']['disc_ori'].data
-    bin_edges = adata.uns['disc']['bin_edges'][0]
-    print(bin_edges)
     fig, ax = plt.subplots(2, 1, figsize=fig_size)
-    _ = sns.histplot(ax=ax[0],
-                     x=X.data,
-                     kde=kde,
-                     bins=bins,
-                     **kwargs)
-    _ = sns.histplot(ax=ax[1],
-                     x=nonzero_disc,
-                     kde=False,
-                     bins=bin_edges,
-                     **kwargs)
+    _ = ax[0].hist(hist_edges[:-1],
+                   hist_edges,
+                   weights=hist_count,
+                   linewidth=0)
+    _ = ax[1].hist(bin_edges[:-1],
+                   bin_edges,
+                   weights=bin_count)
     ax[0].set_xlabel('Non-zero values')
+    ax[0].set_ylabel('Count')
     ax[0].set_title('Original')
     ax[1].set_xlabel('Non-zero values')
+    ax[1].set_ylabel('Count')
     ax[1].set_title('Discretized')
     plt.tight_layout(pad=pad, h_pad=h_pad, w_pad=w_pad)
     if(save_fig):
diff --git a/simba/tools/_general.py b/simba/tools/_general.py
index 85b0566..87e505e 100755
--- a/simba/tools/_general.py
+++ b/simba/tools/_general.py
@@ -1,43 +1,69 @@
 """General-purpose tools"""
 
-from sklearn.preprocessing import KBinsDiscretizer
+import numpy as np
+from sklearn.cluster import KMeans
 
 
 def discretize(adata,
                layer=None,
-               n_bins=3,
-               encode='ordinal',
-               strategy='kmeans',
-               dtype=None):
-    """Discretize continous features
+               n_bins=5,
+               max_bins=100):
+    """Discretize continuous values
+
     Parameters
     ----------
     adata: AnnData
         Annotated data matrix.
+    layer: `str`, optional (default: None)
+        The layer to discretize. If None, `adata.X` is used.
+    n_bins: `int`, optional (default: 5)
+        The number of bins to produce.
+        It must be smaller than `max_bins`.
+    max_bins: `int`, optional (default: 100)
+        The number of bins used to the initial approximation.
 
     Returns
     -------
     updates `adata` with the following fields.
-    X: `numpy.ndarray` (`adata.X`)
-        Store #observations × #var_genes logarithmized data matrix.
+    `.layer['disc']` : `array_like`
+        Discretized values.
+    `.uns['disc']` : `dict`
+        `bin_edges`: The edges of each bin.
+        `bin_count`: The number of values in each bin.
+        `hist_edges`: The edges of each bin \
+                      in the initial approximation.
+        `hist_count`: The number of values in each bin \
+                      for the initial approximation.
     """
     if layer is None:
-        X = adata.X.copy()
+        X = adata.X
     else:
-        X = adata.layers[layer].copy()
-    est = KBinsDiscretizer(n_bins=n_bins,
-                           encode=encode,
-                           strategy=strategy,
-                           dtype=dtype)
-    nonzero_cont = X.data.copy()
-    nonzero_id = est.fit_transform(nonzero_cont.reshape(-1, 1))
-    nonzero_disc = est.inverse_transform(nonzero_id).reshape(-1, )
-
-    adata.layers['disc'] = adata.layers['raw'].copy()
-    adata.layers['disc'].data = (nonzero_id+1).reshape(-1,)
-
-    # discretized data transformed back to original feature space
+        X = adata.layers[layer]
+    nonzero_cont = X.data
+
+    hist_count, hist_edges = np.histogram(
+        nonzero_cont,
+        bins=max_bins,
+        density=False)
+    hist_centroids = (hist_edges[0:-1] + hist_edges[1:])/2
+
+    kmeans = KMeans(n_clusters=n_bins, random_state=2021).fit(
+        hist_centroids.reshape(-1, 1),
+        sample_weight=hist_count)
+    cluster_centers = np.sort(kmeans.cluster_centers_.flatten())
+
+    padding = (hist_edges[-1] - hist_edges[0])/(max_bins*10)
+    bin_edges = np.array(
+        [hist_edges[0]-padding] +
+        list((cluster_centers[0:-1] + cluster_centers[1:])/2) +
+        [hist_edges[-1]+padding])
+    nonzero_disc = np.digitize(nonzero_cont, bin_edges).reshape(-1,)
+    bin_count = np.unique(nonzero_disc, return_counts=True)[1]
+
+    adata.layers['disc'] = X.copy()
+    adata.layers['disc'].data = nonzero_disc
     adata.uns['disc'] = dict()
-    adata.uns['disc']['disc_ori'] = adata.layers['raw'].copy()
-    adata.uns['disc']['disc_ori'].data = nonzero_disc.reshape(-1,)
-    adata.uns['disc']['bin_edges'] = est.bin_edges_
+    adata.uns['disc']['bin_edges'] = bin_edges
+    adata.uns['disc']['bin_count'] = bin_count
+    adata.uns['disc']['hist_edges'] = hist_edges
+    adata.uns['disc']['hist_count'] = hist_count

From 825b0bc1c080d7e450e2f3e36d08e0c1810c2b94 Mon Sep 17 00:00:00 2001
From: huidongchen <hd7chen@gmail.com>
Date: Wed, 27 Oct 2021 15:49:22 -0400
Subject: [PATCH 2/2] update .tl.discretize() and .pl.discretize()

---
 simba/plotting/_plot.py | 21 ++++++++++++---------
 simba/tools/_general.py |  6 ++++--
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/simba/plotting/_plot.py b/simba/plotting/_plot.py
index 614858a..92068a2 100755
--- a/simba/plotting/_plot.py
+++ b/simba/plotting/_plot.py
@@ -1160,17 +1160,18 @@ def discretize(adata,
     fig_name: `str`, optional (default: 'plot_discretize.pdf')
         if `save_fig` is True, specify figure name.
     **kwargs: `dict`, optional
-        Other keyword arguments are passed through to ``sns.histplot``
+        Other keyword arguments are passed through to ``plt.hist()``
+
     Returns
     -------
     None
     """
-#     if fig_size is None:
-#         fig_size = mpl.rcParams['figure.figsize']
-#     if save_fig is None:
-#         save_fig = settings.save_fig
-#     if fig_path is None:
-#         fig_path = os.path.join(settings.workdir, 'figures')
+    if fig_size is None:
+        fig_size = mpl.rcParams['figure.figsize']
+    if save_fig is None:
+        save_fig = settings.save_fig
+    if fig_path is None:
+        fig_path = os.path.join(settings.workdir, 'figures')
 
     assert 'disc' in adata.uns_keys(), \
         "please run `si.tl.discretize()` first"
@@ -1184,10 +1185,12 @@ def discretize(adata,
     _ = ax[0].hist(hist_edges[:-1],
                    hist_edges,
                    weights=hist_count,
-                   linewidth=0)
+                   linewidth=0,
+                   **kwargs)
     _ = ax[1].hist(bin_edges[:-1],
                    bin_edges,
-                   weights=bin_count)
+                   weights=bin_count,
+                   **kwargs)
     ax[0].set_xlabel('Non-zero values')
     ax[0].set_ylabel('Count')
     ax[0].set_title('Original')
diff --git a/simba/tools/_general.py b/simba/tools/_general.py
index 87e505e..8dbd924 100755
--- a/simba/tools/_general.py
+++ b/simba/tools/_general.py
@@ -20,11 +20,13 @@ def discretize(adata,
         The number of bins to produce.
         It must be smaller than `max_bins`.
     max_bins: `int`, optional (default: 100)
-        The number of bins used to the initial approximation.
+        The number of bins used in the initial approximation.
+        i.e. the number of bins to cluster.
 
     Returns
     -------
-    updates `adata` with the following fields.
+    updates `adata` with the following fields
+
     `.layer['disc']` : `array_like`
         Discretized values.
     `.uns['disc']` : `dict`