diff --git a/.gitignore b/.gitignore
index d65d045e..bcff8eac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,3 +29,4 @@ run_app.bat
 python*
 gdpr_consent/node_modules/
 *~
+CLAUDE.md
diff --git a/README.md b/README.md
index 9f955c83..e70b56ce 100644
--- a/README.md
+++ b/README.md
@@ -57,3 +57,4 @@ After it has been built you can run the image with:
 `docker run -p 8501:8501 flashapp:latest`
 
 Navigate to `http://localhost:8501` in your browser.
+.
\ No newline at end of file
diff --git a/content/FLASHDeconv/FLASHDeconvLayoutManager.py b/content/FLASHDeconv/FLASHDeconvLayoutManager.py
index a86164f5..1e7c2375 100644
--- a/content/FLASHDeconv/FLASHDeconvLayoutManager.py
+++ b/content/FLASHDeconv/FLASHDeconvLayoutManager.py
@@ -15,8 +15,10 @@
     'Deconvolved spectrum (Scan table needed)',
     'Raw spectrum (Scan table needed)',
     'Mass table (Scan table needed)',
+    'Feature table',
     '3D S/N plot (Mass table needed)',
-    'Score Distribution Plot'
+    'Score Distribution Plot',
+    'TIC Chromatogram',
     # "Sequence view" and "Internal fragment map" is added when "input_sequence" is submitted
 ]
@@ -29,8 +31,10 @@
     'deconv_spectrum',
     'anno_spectrum',
     'mass_table',
+    'feature_table',
     '3D_SN_plot',
     'fdr_plot',
+    'tic_chromatogram',
     # "sequence view" and "internal fragment map" added when "input_sequence" is submitted
 ]
diff --git a/openms-streamlit-vue-component b/openms-streamlit-vue-component
index fad86f46..93b8a7d1 160000
--- a/openms-streamlit-vue-component
+++ b/openms-streamlit-vue-component
@@ -1 +1 @@
-Subproject commit fad86f46c3aa788387ec5201fc5cceda5db30fd9
+Subproject commit 93b8a7d1e80b6427ad56420df390a5d6bbe218d0
diff --git a/src/parse/deconv.py b/src/parse/deconv.py
index aaa498b1..e7731ebf 100644
--- a/src/parse/deconv.py
+++ b/src/parse/deconv.py
@@ -8,7 +8,7 @@ def parseDeconv(
     file_manager, dataset_id, out_deconv_mzML, anno_annotated_mzML,
-    spec1_tsv=None, spec2_tsv=None, logger=None
+    spec1_tsv, spec2_tsv=None, logger=None
 ):
     logger.log("Progress of 'processing FLASHDeconv results':", level=2)
     logger.log("0.0 %", level=2)
@@ -21,7 +21,88 @@
     file_manager.store_data(dataset_id, 'deconv_dfs', deconv_df)
     del deconv_df
     del anno_df
+
+    spec1_df = pd.read_csv(
+        spec1_tsv, sep='\t', usecols=[
+            'FeatureIndex', 'MonoisotopicMass', 'SumIntensity', 'RetentionTime',
+            'ScanNum'
+        ]
+    )
+    spec1_df.loc[:, 'Level'] = 1
+    file_manager.store_data(dataset_id, 'spec1_df', spec1_df)
+    spec2_df = pd.read_csv(
+        spec2_tsv, sep='\t', usecols=[
+            'FeatureIndex', 'MonoisotopicMass', 'SumIntensity', 'RetentionTime',
+            'ScanNum'
+        ]
+    )
+    spec2_df.loc[:, 'Level'] = 2
+    file_manager.store_data(dataset_id, 'spec2_df', spec2_df)
+    del spec1_df
+    del spec2_df
+    features = file_manager.get_results(
+        dataset_id, ['spec1_df', 'spec2_df'], use_polars=True
+    )
+    # Build the base once
+    base = pl.concat([features["spec1_df"], features["spec2_df"]])
+
+    # Sort first so indices reflect first appearance order in the data
+    sorted_base = base.sort("RetentionTime")
+
+    # Create a ScanNum -> ScanIndex mapping in order of first occurrence
+    scan_index_map = (
+        sorted_base
+        .select("ScanNum")
+        .unique(maintain_order=True)
+        .with_row_count("ScanIndex")
+    )
+
+    # Build dataframe
+    features = (
+        sorted_base
+        # needed for MassIndex; global index after sort
+        .with_row_count("RowID")
+        .with_columns(
+            # per-ScanNum 0-based MassIndex using RowID
+            (pl.col("RowID") - pl.col("RowID").min().over("ScanNum")).alias("MassIndex"),
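+            # e.g. the rows of one scan carrying RowID 17, 18, 19 get MassIndex 0, 1, 2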
+            # Retention time in seconds to comply with other data structures
+            (pl.col("RetentionTime") * 60).alias("RetentionTime"),
+        )
+        # Attach scan index
+        .join(scan_index_map, on="ScanNum", how="left")
+        # For now we only consider features at MS1 level
+        .filter(pl.col("Level") == 1)
+        # Drop helper columns
+        .drop(["Level", "RowID"])
+    )
+    file_manager.store_data(dataset_id, 'feature_dfs', features)
+
+    # Create aggregated feature table for display
+    # Group by FeatureIndex and compute summary statistics
+    feature_table = (
+        features
+        .filter(pl.col('FeatureIndex').is_not_null() & (pl.col('FeatureIndex') >= 0))
+        .group_by('FeatureIndex')
+        .agg([
+            pl.col('MonoisotopicMass').mean().alias('MonoMass'),
+            pl.col('SumIntensity').sum().alias('TotalIntensity'),
+            pl.col('SumIntensity').max().alias('ApexIntensity'),
+            pl.col('RetentionTime').min().alias('RTStart'),
+            pl.col('RetentionTime').max().alias('RTEnd'),
+            pl.len().alias('NumScans'),
+            # Get the scan index at apex (max intensity)
+            pl.col('ScanIndex').sort_by('SumIntensity', descending=True).first().alias('ApexScanIndex'),
+            # Get the mass index at apex
+            pl.col('MassIndex').sort_by('SumIntensity', descending=True).first().alias('ApexMassIndex'),
+        ])
+        .with_columns([
+            (pl.col('RTEnd') - pl.col('RTStart')).alias('RTDuration'),
+        ])
+        .sort('FeatureIndex')
+    )
+    file_manager.store_data(dataset_id, 'feature_table', feature_table)
+
     # Immediately reload as polars LazyFrames for efficient processing
     results = file_manager.get_results(dataset_id, ['anno_dfs', 'deconv_dfs'], use_polars=True)
     pl_anno = results['anno_dfs']
@@ -45,7 +126,7 @@
     )
 
     # Collect here as this is the data we are operating on
-    relevant_heatmap_lazy = relevant_heatmap_lazy.collect().lazy()
+    relevant_heatmap_lazy = relevant_heatmap_lazy.collect(streaming=True).lazy()
 
     # Get count for compression level calculation
     heatmap_count = relevant_heatmap_lazy.select(pl.len()).collect().item()
@@ -69,6 +150,32 @@
             dataset_id, f'ms{ms_level}_{descriptor}_heatmap_{size}', current_heatmap_lazy
         )
+
+    # Create TIC (total ion current) table
+    ms1_heatmap = file_manager.get_results(
+        dataset_id, ['ms1_raw_heatmap'], use_polars=True
+    )['ms1_raw_heatmap']
+    ms1_heatmap = ms1_heatmap.with_columns(pl.lit(1).alias('level'))
+    ms1_heatmap = ms1_heatmap.drop(['mass', 'mass_idx'])
+    ms2_heatmap = file_manager.get_results(
+        dataset_id, ['ms2_raw_heatmap'], use_polars=True
+    )['ms2_raw_heatmap']
+    ms2_heatmap = ms2_heatmap.with_columns(pl.lit(2).alias('level'))
+    ms2_heatmap = ms2_heatmap.drop(['mass', 'mass_idx'])
+    tic_data = pl.concat([ms1_heatmap, ms2_heatmap], how='vertical')
+    tic_data = (
+        tic_data.group_by('scan_idx')
+        .agg([
+            pl.col('rt').first().alias('rt'),
+            pl.col('level').first().alias('level'),
+            pl.col('intensity').sum().alias('tic'),
+        ])
+    )
+    tic_data = tic_data.sort('scan_idx')
+    file_manager.store_data(dataset_id, 'tic', tic_data)
+
     logger.log("20.0 %", level=2)
@@ -126,8 +233,66 @@
             pl.col('snr').alias('SNR'),
             pl.col('qscore').alias('QScore')
         ])
+    )
+
+    # Add FeatureIndex arrays to mass_table
+    features = file_manager.get_results(dataset_id, ['feature_dfs'], use_polars=True)['feature_dfs']
+
+    # Replace null FeatureIndex values with -1
+    features = features.with_columns([
+        pl.when(pl.col('FeatureIndex').is_null())
+        .then(pl.lit(-1))
+        .otherwise(pl.col('FeatureIndex'))
+        .alias('FeatureIndex')
+    ])
+
+    # Group by ScanIndex and create arrays of FeatureIndex ordered by MassIndex
+    feature_arrays = (
+        features
+        .sort(['ScanIndex', 'MassIndex'])
+        .group_by('ScanIndex')
+        .agg([
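+            # Polars .agg() preserves within-group row order, so after the
+            # .sort() above each collected list should be ordered by MassIndex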
+            pl.col('FeatureIndex').alias('FeatureIndices')
+        ])
+    )
+
+    # Get scan info with MSLevel and number of masses for creating -1 arrays
+    scan_info = (
+        pl_deconv_indexed
+        .select([
+            pl.col('index'),
+            pl.col('Scan'),
+            pl.col('MSLevel'),
+            pl.col('mzarray').list.len().alias('num_masses')
+        ])
+    )
+
+    # Join feature arrays with scan info and create FeatureIndex column
+    scans_with_features = (
+        scan_info
+        .join(feature_arrays, left_on='index', right_on='ScanIndex', how='left')
+        .with_columns([
+            # For MS2 scans create array of -1s
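+            # (feature_dfs keeps only Level == 1 rows, so MS2 scans can never
+            # match a real feature index)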
+            pl.when(pl.col('MSLevel') == 2)
+            .then(
+                pl.col('num_masses').map_elements(
+                    lambda n: [-1] * n,
+                    return_dtype=pl.List(pl.Int64)
+                )
+            )
+            .otherwise(pl.col('FeatureIndices'))
+            .alias('FeatureIndex')
+        ])
+        .select(['index', 'FeatureIndex'])
+    )
+
+    # Add FeatureIndex to mass_table
+    mass_table_lazy = (
+        mass_table_lazy
+        .join(scans_with_features, on='index', how='left')
         .sort("index")
     )
+
     file_manager.store_data(dataset_id, 'mass_table', mass_table_lazy)
 
     logger.log("50.0 %", level=2)
diff --git a/src/render/components.py b/src/render/components.py
index 2469c1de..9df3f58f 100644
--- a/src/render/components.py
+++ b/src/render/components.py
@@ -6,7 +6,7 @@
 # Create a _RELEASE constant. We'll set this to False while we're developing
 # the component, and True when we're ready to package and distribute it.
-_RELEASE = True
+_RELEASE = False
 
 _component_func = None
@@ -59,6 +59,9 @@ def __init__(self, table_type):
         elif table_type == 'TagTable':
             self.title = 'Tag Table'
             self.componentName = "TabulatorTagTable"
+        elif table_type == 'FeatureTable':
+            self.title = 'Feature Table'
+            self.componentName = "TabulatorFeatureTable"
 
 
 class PlotlyLineplot:
@@ -99,3 +102,9 @@ class FLASHQuant:
     def __init__(self):
         self.title = 'QuantVis'
         self.componentName = 'FLASHQuantView'
+
+
+class Chromatogram:
+    def __init__(self):
+        self.title = 'TIC'
+        self.componentName = 'TICChromatogram'
diff --git a/src/render/compression.py b/src/render/compression.py
index 316e5d2c..4590b288 100644
--- a/src/render/compression.py
+++ b/src/render/compression.py
@@ -50,7 +50,7 @@ def downsample_heatmap(data, max_datapoints=20000, rt_bins=400, mz_bins=50, logg
     )
 
     # We need to collect here because scipy requires numpy arrays
-    sorted_data = sorted_data.collect()
+    sorted_data = sorted_data.collect(streaming=True)
 
     # Count peaks
     total_count = sorted_data.select(pl.count()).item()
diff --git a/src/render/initialize.py b/src/render/initialize.py
index be693aef..1d05ee22 100644
--- a/src/render/initialize.py
+++ b/src/render/initialize.py
@@ -3,7 +3,7 @@
 from src.render.components import (
     PlotlyHeatmap, PlotlyLineplot, PlotlyLineplotTagger, Plotly3Dplot,
     Tabulator, SequenceView, InternalFragmentMap, FlashViewerComponent,
-    FDRPlotly, FLASHQuant
+    FDRPlotly, FLASHQuant, Chromatogram
 )
 from src.render.compression import compute_compression_levels
@@ -33,6 +33,13 @@ def initialize_data(comp_name, selected_data, file_manager, tool):
             data_to_send['deconv_heatmap_df'] = cached_compression_levels[0]
             additional_data['deconv_heatmap_df'] = cached_compression_levels
 
+        # Get feature annotations
+        feature_data = file_manager.get_results(
+            selected_data, ['feature_dfs'], use_polars=True
+        )['feature_dfs']
+        data_to_send['feature_data'] = feature_data
+
         component_arguments = PlotlyHeatmap(title="Deconvolved MS1 Heatmap")
 
     elif comp_name == 'ms2_deconv_heat_map':
@@ -172,6 +179,31 @@
         data = file_manager.get_results(selected_data, ['quant_dfs'])
         data_to_send['quant_data'] = data['quant_dfs']
         component_arguments = FLASHQuant()
+    elif comp_name == 'tic_chromatogram':
+        data = file_manager.get_results(selected_data, ['tic', 'feature_table', 'feature_dfs'])
+        data_to_send['tic'] = data['tic']
+        data_to_send['feature_table'] = data.get('feature_table')
+        # feature_dfs contains per-scan intensity data for each feature
+        feature_dfs = data.get('feature_dfs')
+        if feature_dfs is not None:
+            # Convert the frame to a list of dicts for JSON serialization
+            if hasattr(feature_dfs, 'collect'):
+                # It's a Polars LazyFrame
+                df = feature_dfs.collect()
+            elif hasattr(feature_dfs, 'to_dicts'):
+                # It's a Polars DataFrame
+                df = feature_dfs
+            else:
+                # It's a pandas DataFrame; convert to Polars for consistent handling
+                df = pl.from_pandas(feature_dfs)
+            # Select only the needed columns and drop nulls to ensure clean JSON
+            df = df.select(['FeatureIndex', 'RetentionTime', 'SumIntensity']).drop_nulls()
+            data_to_send['feature_dfs'] = df.to_dicts()
+        component_arguments = Chromatogram()
+    elif comp_name == 'feature_table':
+        data = file_manager.get_results(selected_data, ['feature_table'])
+        data_to_send['feature_table'] = data['feature_table']
+        component_arguments = Tabulator('FeatureTable')
 
     components = [[FlashViewerComponent(component_arguments)]]
diff --git a/src/render/update.py b/src/render/update.py
index 3d940eda..b0d5215c 100644
--- a/src/render/update.py
+++ b/src/render/update.py
@@ -181,4 +181,29 @@ def filter_data(data, out_components, selection_store, additional_data, tool):
             ][selection_store['proteinIndex']]
         }
 
+        # Feature-level information for the selected heatmap point
+        if component == 'Deconvolved MS1 Heatmap':
+            if ('scanIndex' in selection_store) and ('massIndex' in selection_store):
+                feature_data = data['feature_data']
+                feature_info = feature_data.filter(
+                    (pl.col("ScanIndex") == selection_store['scanIndex'])
+                    & (pl.col("MassIndex") == selection_store['massIndex'])
+                )
+                mass_row = feature_info.collect(streaming=True)
+                if mass_row.height == 0:
+                    data['feature_data'] = pd.DataFrame()
+                else:
+                    idx = mass_row.row(0, named=True)['FeatureIndex']
+                    if idx is None:
+                        data['feature_data'] = pd.DataFrame()
+                    else:
+                        feature_data = (
+                            feature_data
+                            .filter(pl.col("FeatureIndex") == idx)
+                            .sort("RetentionTime")
+                        )
+                        data['feature_data'] = feature_data.collect(streaming=True)
+            else:
+                data['feature_data'] = pd.DataFrame()
+
     return data
\ No newline at end of file