Skip to content
Binary file added src/modules/data/pbmc3k_raw.h5ad
Binary file not shown.
57 changes: 57 additions & 0 deletions src/modules/preproccesing2-demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import scanpy as sc
import numpy as np
import pandas as pd
from preprocessing import filter_data, calculate_qc_metrics, select_highly_variable_genes

def load_file_demo(file_path):
adata = sc.read_h5ad(file_path)
adata.write("adata_demo.h5ad")
print("Data saved as adata_demo.h5ad")

file_path = "C:/Users/pc/OneDrive/Masaüstü/bioinformatic/Hw3covid_Data_AllCells.h5ad"
adata = load_file_demo(file_path)

# Step 1: Create example data (random counts matrix)
np.random.seed(42) # For reproducibility

n_cells = 1000 # Number of cells
n_genes = 5000 # Number of genes

# Create random gene expression data (Poisson distribution)
counts = np.random.poisson(lam=1.0, size=(n_cells, n_genes)) # Shape: (n_cells, n_genes)

# Create an AnnData object
adata = sc.AnnData(counts)

# Assign gene and cell names
adata.var_names = [f"Gene_{i}" for i in range(n_genes)] # Gene names: Gene_0, Gene_1, ..., Gene_4999
adata.obs_names = [f"Cell_{i}" for i in range(n_cells)] # Cell names: Cell_0, Cell_1, ..., Cell_999

# Before adding 'MT-', note that var_names is not a list but an Index object.
# So we need to convert var_names to a list before making changes.
var_names_list = adata.var_names.tolist()

# Add some mitochondrial genes for demonstration (every 10th gene is mitochondrial)
for i in range(0, n_genes, 10):
var_names_list[i] = f"MT-{var_names_list[i]}" # Rename every 10th gene to start with "MT-"

# We assign the modified list back as an Index object
adata.var_names = pd.Index(var_names_list)

# Step 2: Apply filter_data to filter cells and genes
adata_filtered = filter_data(adata, min_genes=200, min_cells=3)
print(f"Filtered AnnData: {adata_filtered.shape} (cells, genes)")

# Step 3: Calculate QC metrics
adata_qc = calculate_qc_metrics(adata_filtered)
print("\nQC Metrics (first few rows of obs):")
print(adata_qc.obs.head()) # View the QC metrics such as 'pct_counts_mt' (percentage of mitochondrial genes)

# Step 4: Select highly variable genes
adata_hvg = select_highly_variable_genes(adata_qc, min_mean=0.0125, max_mean=3, min_disp=0.5)
print("\nHighly Variable Genes (first few rows of 'highly_variable' column):")
print(adata_hvg.var[['highly_variable']].head()) # Display which genes are highly variable

# Step 5: Count how many genes are highly variable
num_highly_variable_genes = adata_hvg.var['highly_variable'].sum()
print(f"\nNumber of highly variable genes selected: {num_highly_variable_genes}")
77 changes: 77 additions & 0 deletions src/modules/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import scanpy as sc
import pandas as pd
from anndata import AnnData

def filter_data(
adata: AnnData,
min_genes: int = 200,
min_cells: int = 3
) -> AnnData:
"""
Filters the dataset.

Args:
adata (AnnData): The single-cell dataset.
min_genes (int): Minimum number of genes expressed per cell. Default is 200.
min_cells (int): Minimum number of cells expressing a gene. Default is 3.

Returns:
AnnData: Filtered dataset.
"""
sc.pp.filter_cells(adata, min_genes=min_genes)
sc.pp.filter_genes(adata, min_cells=min_cells)
return adata

def calculate_qc_metrics(adata: AnnData) -> AnnData:
"""
Computes quality control (QC) metrics and adds them to the dataset.
Checks for mitochondrial genes with both uppercase and lowercase 'mt-' prefix.

Args:
adata (AnnData): The filtered dataset.

Returns:
AnnData: Dataset with QC metrics added.
"""
if adata.var_names.str.startswith("MT-").any():
adata.var["mt"] = adata.var_names.str.startswith("MT-")
elif adata.var_names.str.startswith("mt-").any():
adata.var["mt"] = adata.var_names.str.startswith("mt-")
else:
# In case neither is found, check in a case-insensitive way just to be sure
adata.var["mt"] = adata.var_names.str.upper().str.startswith("MT-")

sc.pp.calculate_qc_metrics(
adata,
qc_vars=["mt"],
percent_top=None,
log1p=False,
inplace=True
)
Comment on lines +25 to +50
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you could add mt and MT as argument in future maybe we want to avoid mt filtration

return adata

def select_highly_variable_genes(
adata: AnnData,
min_mean: float = 0.0125,
max_mean: int = 3,
min_disp: float = 0.5
) -> AnnData:
"""
Identifies highly variable genes in the dataset.

Args:
adata (AnnData): The dataset with QC metrics.
min_mean (float): Minimum mean expression threshold. Default is 0.0125.
max_mean (float): Maximum mean expression threshold. Default is 3.
min_disp (float): Minimum dispersion threshold. Default is 0.5.

Returns:
AnnData: Dataset with highly variable gene information.
"""
sc.pp.highly_variable_genes(
adata,
min_mean=min_mean,
max_mean=max_mean,
min_disp=min_disp
)
return adata