Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
181ea14
adding tool function to compute dendrogram
fidelram Jan 11, 2019
5791c8b
added dendrogram to list of tools
fidelram Jan 11, 2019
c8415ad
updated dendrogram plotting for anndata visualizations
fidelram Jan 11, 2019
80bf4f3
add `tl.dendrogram` to doc
fidelram Jan 14, 2019
797bdda
added 'pl.correlation'
fidelram Jan 14, 2019
64ece62
updated dendrogram description in docs
fidelram Jan 14, 2019
b542423
modified default key to 'dendrogram_' + groupby. Added correlation ma…
fidelram Jan 14, 2019
d5f4abc
added option to plot the correlation (sc.pl.correlation) that makes a…
fidelram Jan 14, 2019
4b1f799
added `sc.tl.filter_rank_genes_groups`, which allows to filter maker …
fidelram Jan 14, 2019
ceac9f3
update test images
fidelram Jan 14, 2019
8160869
Fix wrong color ordering or categories in `heatmap` and `tracksplot`
fidelram Jan 14, 2019
45f1812
fix typo
fidelram Jan 15, 2019
26f741a
fix dendrogram position in correlation plot
fidelram Jan 15, 2019
044a4d2
Merge branch 'master' into tl.dendrogram
fidelram Feb 1, 2019
90af672
merge
fidelram Feb 1, 2019
694cafe
Merge branch 'tl.dendrogram' of https://github.com/fidelram/scanpy in…
fidelram Feb 1, 2019
ad48aae
rolling bank __init__.py changes
fidelram Feb 1, 2019
f851a7c
incorporated comments from @falexwolf
fidelram Feb 1, 2019
9137756
increase tolerance to image tests
fidelram Feb 3, 2019
385af65
added message if dendrogram values do not match and fix problem with …
fidelram Feb 5, 2019
c5e33ae
as for scatterplots(#385 #430) add gene_symbols for other plotting fu…
fidelram Feb 5, 2019
92d3529
increase tolerance to pass test
fidelram Feb 5, 2019
c31c0ae
fix error related to stacked_violin and colors
fidelram Feb 7, 2019
6e69bca
Merge branch 'master' into tl.dendrogram
fidelram Feb 12, 2019
1e8042c
increase matplotlib version in requirements to address #480
fidelram Feb 14, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
anndata>=0.6.15
matplotlib>=2.2
matplotlib>=3.0.0
pandas>=0.21
scipy
seaborn
Expand Down
2 changes: 2 additions & 0 deletions scanpy/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@
tl.louvain
tl.dpt
tl.paga
tl.dendrogram

Marker genes
~~~~~~~~~~~~
Expand All @@ -151,6 +152,7 @@
:toctree: .

tl.rank_genes_groups
tl.filter_rank_genes_groups

Gene scores, Cell cycle
~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
4 changes: 2 additions & 2 deletions scanpy/plotting/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from ._anndata import scatter, violin, ranking, clustermap, stacked_violin, heatmap, dotplot, matrixplot, tracksplot
from ._anndata import scatter, violin, ranking, clustermap, stacked_violin, heatmap, dotplot, matrixplot, tracksplot, dendrogram, correlation_matrix

from ._preprocessing import filter_genes_dispersion, highly_variable_genes

Expand All @@ -11,7 +11,7 @@
from ._tools import sim

from ._rcmod import set_rcParams_scanpy, set_rcParams_defaults
from . import palettes
from . import palettes

from ._utils import matrix
from ._utils import timeseries, timeseries_subplot, timeseries_as_heatmap
Expand Down
436 changes: 340 additions & 96 deletions scanpy/plotting/_anndata.py

Large diffs are not rendered by default.

21 changes: 12 additions & 9 deletions scanpy/plotting/_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
Keys for annotations of observations/cells or variables/genes, e.g.,
`'ann1'` or `['ann1', 'ann2']`.
gene_symbols : string, optional (default: `None`)
Key for field in .var that stores gene symbols if you do not want to use
.var_names.
Column name in `.var` DataFrame that stores gene symbols. By default `var_names`
refer to the index column of the `.var` DataFrame. Setting this option allows
alternative names to be used.
use_raw : `bool`, optional (default: `None`)
Use `.raw` attribute of `adata` for coloring with gene expression. If
`None`, uses `.raw` if present.\
Expand Down Expand Up @@ -108,13 +109,15 @@
figsize : (`float`, `float`), optional (default: `None`)
Figure size when multi_panel = True. Otherwise the rcParams['figure.figsize'] value is used.
Format is (width, height)
dendrogram: `bool` If True, hierarchical clustering between the `groupby` categories is
computed and a dendrogram is plotted. `groupby` categories are reordered according to
the dendrogram order. If groups of `var_names` (see next arguments) are set and those groups correspond
to the `groupby` categories, those groups are also reordered. The 'pearson' method
is used to compute the pairwise correlation between categories using all var_names in
`raw` if `use_raw` is None, otherwise all adata.var_names are used. The linkage method
used is `complete`.
dendrogram: `bool` or `str`, optional (default: `False`)
If True or a valid dendrogram key, a dendrogram based on the hierarchical clustering
between the `groupby` categories is added. The dendrogram information is computed
using :func:`scanpy.tl.dendrogram`. If `tl.dendrogram` has not been called previously,
the function is called with default parameters.
gene_symbols : string, optional (default: `None`)
Column name in `.var` DataFrame that stores gene symbols. By default `var_names`
refer to the index column of the `.var` DataFrame. Setting this option allows
alternative names to be used.
var_group_positions : list of `tuples`.
Use this parameter to highlight groups of `var_names`.
This will draw a 'bracket' or a color block between the given start and end positions. If the
Expand Down
24 changes: 17 additions & 7 deletions scanpy/plotting/_tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,13 +285,23 @@ def _rank_genes_groups_plot(adata, plot_type='heatmap', groups=None,
group_names = (adata.uns[key]['names'].dtype.names
if groups is None else groups)

# make a list of tuples containing the index for the start gene and the
# end gene that should be labelled
group_positions = [(x, x + n_genes - 1) for x in range(0, n_genes * len(group_names), n_genes)]

# sum(list, []) is used to flatten the gene list
gene_names = sum([list(adata.uns[key]['names'][x][:n_genes]) for x in group_names], [])

gene_names = []
start = 0
group_positions = []
group_names_valid = []
for group in group_names:
# get all genes that are 'not-nan'
genes_list = [gene for gene in adata.uns[key]['names'][group] if not pd.isnull(gene)][:n_genes]
if len(genes_list) == 0:
logg.warn("No genes found for group {}".format(group))
continue
gene_names.extend(genes_list)
end = start + len(genes_list)
group_positions.append((start, end -1))
group_names_valid.append(group)
start = end

group_names = group_names_valid
if plot_type == 'dotplot':
from .._anndata import dotplot
dotplot(adata, gene_names, groupby, var_group_labels=group_names,
Expand Down
4 changes: 4 additions & 0 deletions scanpy/plotting/_tools/scatterplots.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,10 @@ def _get_color_values(adata, value_to_plot, groups=None, palette=None, use_raw=F
else:
color_vector = adata.obs[value_to_plot]
elif gene_symbols in adata.var.columns:
if value_to_plot not in adata.var[gene_symbols].values:
logg.error("Gene symbol {!r} not found in given gene_symbols "
"column: {!r}".format(value_to_plot, gene_symbols))
return
gene_id = adata.var[adata.var[gene_symbols] == value_to_plot].index[0]
if use_raw:
color_vector = adata.raw[:, gene_id].X
Expand Down
Binary file added scanpy/tests/_images/correlation.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added scanpy/tests/_images/dendrogram.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_dotplot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_dotplot3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_dotplot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_heatmap.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_heatmap_swap_axes.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_matrixplot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_matrixplot_swap_axes.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_stacked_violin.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_tracksplot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_umap_with_edges.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
45 changes: 43 additions & 2 deletions scanpy/tests/test_plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,18 @@ def test_violin():
save_and_compare_images('master_violin_multi_panel', tolerance=40)


def test_dendrogram():
    """Plot the `bulk_labels` dendrogram and compare it with the reference image."""
    adata = sc.datasets.pbmc68k_reduced()
    # NOTE(review): sibling plotting tests pass `show=False`; this call does
    # not -- confirm whether that is intentional.
    sc.pl.dendrogram(adata, 'bulk_labels')
    save_and_compare_images('dendrogram', tolerance=10)


def test_correlation():
    """Plot the `bulk_labels` correlation matrix and compare it with the reference image."""
    adata = sc.datasets.pbmc68k_reduced()
    # NOTE(review): sibling plotting tests pass `show=False`; this call does
    # not -- confirm whether that is intentional.
    sc.pl.correlation_matrix(adata, 'bulk_labels')
    save_and_compare_images('correlation', tolerance=15)


def test_rank_genes_groups():
pbmc = sc.datasets.pbmc68k_reduced()
tolerance = 15
Expand All @@ -143,7 +155,7 @@ def test_rank_genes_groups():

# test ranked genes using stacked violin plots
sc.pl.rank_genes_groups_stacked_violin(pbmc, n_genes=3, show=False)
save_and_compare_images('master_ranked_genes_stacked_violin', tolerance=tolerance)
save_and_compare_images('master_ranked_genes_stacked_violin', tolerance=20)

# test ranked genes using dotplot
sc.pl.rank_genes_groups_dotplot(pbmc, n_genes=4, show=False)
Expand Down Expand Up @@ -171,6 +183,35 @@ def test_rank_genes_groups():
# save_and_compare_images('master_ranked_genes_stacked_violin', tolerance=tolerance)


def test_rank_genes_symbols():
    """Check that the grouped plots accept names from a `gene_symbols` column.

    A synthetic `symbols` column is added to `.var` and every plot is
    requested through those symbols instead of the `.var` index.
    """
    def as_symbol(name):
        return "symbol_{}".format(name)

    data = sc.datasets.krumsiek11()

    # add a 'symbols' column holding an alternative name for every variable
    data.var['symbols'] = data.var.index.map(as_symbol)
    symbol_names = [as_symbol(name) for name in data.var_names]

    sc.pl.heatmap(data, symbol_names, 'cell_type', use_raw=False, show=False,
                  dendrogram=True, gene_symbols='symbols')
    save_and_compare_images('master_heatmap_gene_symbols')

    sc.pl.dotplot(data, symbol_names, 'cell_type', use_raw=False,
                  dendrogram=True, show=False, gene_symbols='symbols')
    save_and_compare_images('master_dotplot_gene_symbols', tolerance=15)

    sc.pl.matrixplot(data, symbol_names, 'cell_type', use_raw=False,
                     dendrogram=True, show=False, gene_symbols='symbols')
    save_and_compare_images('master_matrixplot_gene_symbols', tolerance=15)

    sc.pl.stacked_violin(data, symbol_names, 'cell_type', use_raw=False,
                         color='blue', show=False, gene_symbols='symbols')
    save_and_compare_images('master_stacked_violin_gene_symbols', tolerance=20)

    # NOTE(review): unlike the calls above, no `show=False` is passed here --
    # confirm whether that is intentional.
    sc.pl.tracksplot(data, symbol_names, 'cell_type', dendrogram=True,
                     use_raw=False, gene_symbols='symbols')
    save_and_compare_images('master_tracksplot_gene_symbols')


def test_scatterplots():

pbmc = sc.datasets.pbmc68k_reduced()
Expand Down Expand Up @@ -209,7 +250,7 @@ def test_scatterplots():
# test edges = True
sc.pp.neighbors(pbmc)
sc.pl.umap(pbmc, color='louvain', edges=True, edges_width=0.1, s=50, show=False)
save_and_compare_images('master_umap_with_edges', tolerance=20)
save_and_compare_images('master_umap_with_edges', tolerance=35)

# test diffmap
# sc.tl.diffmap(pbmc)
Expand Down
3 changes: 2 additions & 1 deletion scanpy/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from ._draw_graph import draw_graph

from ._paga import paga, paga_degrees, paga_expression_entropies, paga_compare_paths
from ._rank_genes_groups import rank_genes_groups
from ._rank_genes_groups import rank_genes_groups, filter_rank_genes_groups
from ._dpt import dpt
from ._leiden import leiden
from ._louvain import louvain
from ._sim import sim
from ._score_genes import score_genes, score_genes_cell_cycle
from ._dendrogram import dendrogram
117 changes: 117 additions & 0 deletions scanpy/tools/_dendrogram.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""
Computes a dendrogram based on a given categorical observation.
"""

from typing import Optional, List
import pandas as pd
from anndata import AnnData
from pandas.api.types import is_categorical_dtype

from .. utils import doc_params
from .. import logging as logg
from ..tools._utils import choose_representation, doc_use_rep, doc_n_pcs


@doc_params(n_pcs=doc_n_pcs, use_rep=doc_use_rep)
def dendrogram(adata: AnnData, groupby: str,
               n_pcs: Optional[int]=None,
               use_rep: Optional[str]=None,
               var_names: Optional[List[str]]=None,
               use_raw: Optional[bool]=None,
               cor_method: Optional[str]='pearson',
               linkage_method: Optional[str]='complete',
               key_added: Optional[str]=None) -> None:
    # The docstring deliberately starts on the line after the opening quotes:
    # it is passed through `doc_params`, so the `n_pcs` / `use_rep`
    # placeholders must stay at the docstring's base indentation.
    """
    Computes a hierarchical clustering for the given `groupby` categories.

    By default the PCA components are used unless .X has less than 50 variables.

    Alternatively, a list of var_names (e.g. genes) can be given.

    Average values of either var_names or components are used to compute a correlation matrix.

    The hierarchical clustering can be visualized using `sc.pl.dendrogram` or multiple other
    visualizations that can include a dendrogram: `matrixplot`, `heatmap`, `dotplot` and `stacked_violin`

    .. note::
        The computation of the hierarchical clustering is based on predefined groups and not
        per cell. The correlation matrix is computed using by default pearson but other methods
        are available.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix
    groupby : `str`
        Key of a categorical column in `adata.obs`. The categories of this column are
        the groups that get clustered.
    {n_pcs}
    {use_rep}
    var_names : `list of str` (default: None)
        List of var_names to use for computing the hierarchical clustering. If `var_names` is given,
        then `use_rep` and `n_pcs` are ignored.
    use_raw : `bool`, optional (default: None)
        Only when `var_names` is not None. Use `raw` attribute of `adata` if present.
    cor_method : `str`, optional (default: `"pearson"`)
        correlation method to use. Options are 'pearson', 'kendall', and 'spearman'
    linkage_method : `str`, optional (default: `"complete"`)
        linkage method to use. See https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
        for more information.
    key_added : `str`, optional (default: `None`)
        By default, the dendrogram information is added to `.uns['dendrogram_' + groupby]`. Notice
        that the `groupby` information is added to the dendrogram.

    Returns
    -------
    `None`. `adata.uns['dendrogram_' + groupby]` (or `adata.uns[key_added]` if `key_added` is
    given) is set to a dict with the keys `'linkage'`, `'groupby'`, `'use_rep'`, `'cor_method'`,
    `'linkage_method'`, `'categories_idx_ordered'`, `'dendrogram_info'` and `'correlation_matrix'`.

    Raises
    ------
    ValueError
        If `groupby` is not a key of `adata.obs`, if that column is not categorical, or if
        `var_names` is given with `use_raw=True` while `adata.raw` is not set.

    Examples
    --------

    >>> adata = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.dendrogram(adata, groupby='bulk_labels')
    >>> sc.pl.dendrogram(adata, 'bulk_labels')
    >>> sc.pl.dotplot(adata, ['C1QA', 'PSAP', 'CD79A', 'CD79B', 'CST3', 'LYZ'],
    ...               groupby='bulk_labels', dendrogram=True)
    """
    if groupby not in adata.obs_keys():
        raise ValueError('groupby has to be a valid observation. Given value: {}, '
                         'valid observations: {}'.format(groupby, adata.obs_keys()))
    if not is_categorical_dtype(adata.obs[groupby]):
        # non-categorical columns are rejected; no automatic binning is done here
        raise ValueError('groupby has to be a categorical observation. Given value: {}, '
                         'Column type: {}'.format(groupby, adata.obs[groupby].dtype))

    if var_names is None:
        # one row per observation, indexed by its `groupby` category
        rep_df = pd.DataFrame(choose_representation(adata, use_rep=use_rep, n_pcs=n_pcs))
        rep_df.set_index(adata.obs[groupby], inplace=True)
        categories = rep_df.index.categories
    else:
        if use_raw is None and adata.raw is not None:
            use_raw = True
        if use_raw and adata.raw is None:
            raise ValueError('use_raw=True was given but `adata.raw` is not set.')
        from ..plotting._anndata import _prepare_dataframe
        # Restrict the data to the requested `var_names`; previously the full
        # list of variables was passed, so the argument had no effect.
        categories, rep_df = _prepare_dataframe(adata, var_names, groupby, use_raw)

    if key_added is None:
        key_added = 'dendrogram_' + groupby

    logg.info('Storing dendrogram info using `.uns[{!r}]`'.format(key_added))
    # aggregate values within categories using 'mean'
    mean_df = rep_df.groupby(level=0).mean()

    import scipy.cluster.hierarchy as sch

    # categories become columns after the transpose, so `corr` is category-vs-category
    corr_matrix = mean_df.T.corr(method=cor_method)
    z_var = sch.linkage(corr_matrix, method=linkage_method)
    # no_plot=True: only the layout information is needed, nothing is drawn
    dendro_info = sch.dendrogram(z_var, labels=categories, no_plot=True)

    # order of groupby categories
    categories_idx_ordered = dendro_info['leaves']

    adata.uns[key_added] = {'linkage': z_var,
                            'groupby': groupby,
                            'use_rep': use_rep,
                            'cor_method': cor_method,
                            'linkage_method': linkage_method,
                            'categories_idx_ordered': categories_idx_ordered,
                            'dendrogram_info': dendro_info,
                            'correlation_matrix': corr_matrix.values}
Loading