-
Notifications
You must be signed in to change notification settings - Fork 0
Feature/filtering qc metrics highly variable genes #8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
TRextabat
merged 11 commits into
develop
from
feature/filtering-qc_metrics-highly_variable_genes
Apr 9, 2025
Merged
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
04ef841
preprocess filtering, qc_metrics, highly_variable_genes kısmı güncell…
Oykupnrbs b79bb4b
Add files via upload
Ekin-hub-code c24786e
preproccesing2-demo.py
Ekin-hub-code 53d06a4
Add files via upload
Ekin-hub-code 2608e60
preproccesing2-demo.py
Ekin-hub-code aec0bb9
preprocessing
Ekin-hub-code 8b19703
preprocessing demo
Oykupnrbs 7b5dbd9
Preprocessing demo
Oykupnrbs 6971d77
Add modular preprocessing functions: filtering, QC metrics, and HVG s…
Oykupnrbs 887a0e8
Add preprocessing module
Oykupnrbs ca37da3
Resolved FileNotFoundError caused by missing dataset file
Oykupnrbs File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Binary file not shown.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,57 @@ | ||
| import scanpy as sc | ||
| import numpy as np | ||
| import pandas as pd | ||
| from preprocessing import filter_data, calculate_qc_metrics, select_highly_variable_genes | ||
|
|
||
| def load_file_demo(file_path): | ||
| adata = sc.read_h5ad(file_path) | ||
| adata.write("adata_demo.h5ad") | ||
| print("Data saved as adata_demo.h5ad") | ||
|
|
||
| file_path = "C:/Users/pc/OneDrive/Masaüstü/bioinformatic/Hw3covid_Data_AllCells.h5ad" | ||
| adata = load_file_demo(file_path) | ||
|
|
||
| # Step 1: Create example data (random counts matrix) | ||
| np.random.seed(42) # For reproducibility | ||
|
|
||
| n_cells = 1000 # Number of cells | ||
| n_genes = 5000 # Number of genes | ||
|
|
||
| # Create random gene expression data (Poisson distribution) | ||
| counts = np.random.poisson(lam=1.0, size=(n_cells, n_genes)) # Shape: (n_cells, n_genes) | ||
|
|
||
| # Create an AnnData object | ||
| adata = sc.AnnData(counts) | ||
|
|
||
| # Assign gene and cell names | ||
| adata.var_names = [f"Gene_{i}" for i in range(n_genes)] # Gene names: Gene_0, Gene_1, ..., Gene_4999 | ||
| adata.obs_names = [f"Cell_{i}" for i in range(n_cells)] # Cell names: Cell_0, Cell_1, ..., Cell_999 | ||
|
|
||
| # Before adding 'MT-', note that var_names is not a list but an Index object. | ||
| # So we need to convert var_names to a list before making changes. | ||
| var_names_list = adata.var_names.tolist() | ||
|
|
||
| # Add some mitochondrial genes for demonstration (every 10th gene is mitochondrial) | ||
| for i in range(0, n_genes, 10): | ||
| var_names_list[i] = f"MT-{var_names_list[i]}" # Rename every 10th gene to start with "MT-" | ||
|
|
||
| # We assign the modified list back as an Index object | ||
| adata.var_names = pd.Index(var_names_list) | ||
|
|
||
| # Step 2: Apply filter_data to filter cells and genes | ||
| adata_filtered = filter_data(adata, min_genes=200, min_cells=3) | ||
| print(f"Filtered AnnData: {adata_filtered.shape} (cells, genes)") | ||
|
|
||
| # Step 3: Calculate QC metrics | ||
| adata_qc = calculate_qc_metrics(adata_filtered) | ||
| print("\nQC Metrics (first few rows of obs):") | ||
| print(adata_qc.obs.head()) # View the QC metrics such as 'pct_counts_mt' (percentage of mitochondrial genes) | ||
|
|
||
| # Step 4: Select highly variable genes | ||
| adata_hvg = select_highly_variable_genes(adata_qc, min_mean=0.0125, max_mean=3, min_disp=0.5) | ||
| print("\nHighly Variable Genes (first few rows of 'highly_variable' column):") | ||
| print(adata_hvg.var[['highly_variable']].head()) # Display which genes are highly variable | ||
|
|
||
| # Step 5: Count how many genes are highly variable | ||
| num_highly_variable_genes = adata_hvg.var['highly_variable'].sum() | ||
| print(f"\nNumber of highly variable genes selected: {num_highly_variable_genes}") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| import scanpy as sc | ||
| import pandas as pd | ||
| from anndata import AnnData | ||
|
|
||
| def filter_data( | ||
| adata: AnnData, | ||
| min_genes: int = 200, | ||
| min_cells: int = 3 | ||
| ) -> AnnData: | ||
| """ | ||
| Filters the dataset. | ||
|
|
||
| Args: | ||
| adata (AnnData): The single-cell dataset. | ||
| min_genes (int): Minimum number of genes expressed per cell. Default is 200. | ||
| min_cells (int): Minimum number of cells expressing a gene. Default is 3. | ||
|
|
||
| Returns: | ||
| AnnData: Filtered dataset. | ||
| """ | ||
| sc.pp.filter_cells(adata, min_genes=min_genes) | ||
| sc.pp.filter_genes(adata, min_cells=min_cells) | ||
| return adata | ||
|
|
||
| def calculate_qc_metrics(adata: AnnData) -> AnnData: | ||
| """ | ||
| Computes quality control (QC) metrics and adds them to the dataset. | ||
| Checks for mitochondrial genes with both uppercase and lowercase 'mt-' prefix. | ||
|
|
||
| Args: | ||
| adata (AnnData): The filtered dataset. | ||
|
|
||
| Returns: | ||
| AnnData: Dataset with QC metrics added. | ||
| """ | ||
| if adata.var_names.str.startswith("MT-").any(): | ||
| adata.var["mt"] = adata.var_names.str.startswith("MT-") | ||
| elif adata.var_names.str.startswith("mt-").any(): | ||
| adata.var["mt"] = adata.var_names.str.startswith("mt-") | ||
| else: | ||
| # In case neither is found, check in a case-insensitive way just to be sure | ||
| adata.var["mt"] = adata.var_names.str.upper().str.startswith("MT-") | ||
|
|
||
| sc.pp.calculate_qc_metrics( | ||
| adata, | ||
| qc_vars=["mt"], | ||
| percent_top=None, | ||
| log1p=False, | ||
| inplace=True | ||
| ) | ||
| return adata | ||
|
|
||
| def select_highly_variable_genes( | ||
| adata: AnnData, | ||
| min_mean: float = 0.0125, | ||
| max_mean: int = 3, | ||
| min_disp: float = 0.5 | ||
| ) -> AnnData: | ||
| """ | ||
| Identifies highly variable genes in the dataset. | ||
|
|
||
| Args: | ||
| adata (AnnData): The dataset with QC metrics. | ||
| min_mean (float): Minimum mean expression threshold. Default is 0.0125. | ||
| max_mean (float): Maximum mean expression threshold. Default is 3. | ||
| min_disp (float): Minimum dispersion threshold. Default is 0.5. | ||
|
|
||
| Returns: | ||
| AnnData: Dataset with highly variable gene information. | ||
| """ | ||
| sc.pp.highly_variable_genes( | ||
| adata, | ||
| min_mean=min_mean, | ||
| max_mean=max_mean, | ||
| min_disp=min_disp | ||
| ) | ||
| return adata | ||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you could add mt and MT as argument in future maybe we want to avoid mt filtration