-
Notifications
You must be signed in to change notification settings - Fork 0
Add TicChromatogram #58
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
17c41c2
5420ac8
d1ab85d
d84b504
ac399de
68a7615
e567dc5
2abab9c
cb16cd5
aee5eb7
0bf00bd
f637348
4ffbbd9
fb0e620
3a727dd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,3 +29,4 @@ run_app.bat | |
| python* | ||
| gdpr_consent/node_modules/ | ||
| *~ | ||
| CLAUDE.md | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,7 +8,7 @@ | |
|
|
||
| def parseDeconv( | ||
| file_manager, dataset_id, out_deconv_mzML, anno_annotated_mzML, | ||
| spec1_tsv=None, spec2_tsv=None, logger=None | ||
| spec1_tsv, spec2_tsv=None, logger=None | ||
| ): | ||
| logger.log("Progress of 'processing FLASHDeconv results':", level=2) | ||
| logger.log("0.0 %", level=2) | ||
|
|
@@ -21,7 +21,88 @@ def parseDeconv( | |
| file_manager.store_data(dataset_id, 'deconv_dfs', deconv_df) | ||
| del deconv_df | ||
| del anno_df | ||
|
|
||
| spec1_df = pd.read_csv( | ||
| spec1_tsv, sep='\t', usecols=[ | ||
| 'FeatureIndex', 'MonoisotopicMass', 'SumIntensity', 'RetentionTime', | ||
| 'ScanNum' | ||
| ] | ||
| ) | ||
| spec1_df.loc[:,'Level'] = 1 | ||
| file_manager.store_data(dataset_id, 'spec1_df', spec1_df) | ||
| spec2_df = pd.read_csv( | ||
| spec2_tsv, sep='\t', usecols=[ | ||
| 'FeatureIndex', 'MonoisotopicMass', 'SumIntensity', 'RetentionTime', | ||
| 'ScanNum' | ||
| ] | ||
| ) | ||
| spec2_df.loc[:,'Level'] = 2 | ||
| file_manager.store_data(dataset_id, 'spec2_df', spec2_df) | ||
| del spec1_df | ||
| del spec2_df | ||
|
|
||
| features = file_manager.get_results( | ||
| dataset_id, ['spec1_df', 'spec2_df'], use_polars=True | ||
| ) | ||
| # Build the base once | ||
| base = pl.concat([features["spec1_df"], features["spec2_df"]]) | ||
|
|
||
| # Sort first so indices reflect first appearance order in the data | ||
| sorted_base = base.sort("RetentionTime") | ||
|
|
||
| # Create a ScanNum -> ScanIndex mapping in order of first occurrence | ||
| scan_index_map = ( | ||
| sorted_base | ||
| .select("ScanNum") | ||
| .unique(maintain_order=True) | ||
| .with_row_count("ScanIndex") | ||
| ) | ||
|
|
||
| # Build dataframe | ||
| features = ( | ||
| sorted_base | ||
| # needed for MassIndex; global index after sort | ||
| .with_row_count("RowID") | ||
| .with_columns( | ||
| # per-ScanNum 0-based MassIndex using RowID | ||
| (pl.col("RowID") - pl.col("RowID").min().over("ScanNum")).alias("MassIndex"), | ||
| # Retention time in seconds to comply with other datastructures | ||
| (pl.col("RetentionTime") * 60).alias("RetentionTime"), | ||
| ) | ||
| # Attach scan index | ||
| .join(scan_index_map, on="ScanNum", how="left") | ||
| # For now we only consider features at ms1 level | ||
| .filter(pl.col("Level") == 1) | ||
| # Drop helper columns | ||
| .drop(["Level", "RowID"]) | ||
| ) | ||
| file_manager.store_data(dataset_id, 'feature_dfs', features) | ||
|
Comment on lines
+44
to
+79
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: rg "spec2_df" --type py -C 3Repository: OpenMS/FLASHApp Length of output: 174 🏁 Script executed: rg "spec2_tsv" --type py -C 3Repository: OpenMS/FLASHApp Length of output: 6886 🏁 Script executed: rg "file_manager.get_results\|file_manager.store_data" --type py -C 2 -A 2Repository: OpenMS/FLASHApp Length of output: 41 🏁 Script executed: cat -n src/parse/deconv.py | head -100Repository: OpenMS/FLASHApp Length of output: 4566 🏁 Script executed: rg "store_data.*spec2_df|spec2_df.*store" --type py -C 3Repository: OpenMS/FLASHApp Length of output: 347 spec2_df unconditional read will crash when spec2_tsv is None. Lines 33–40 read spec2_tsv without checking if it's 🤖 Prompt for AI Agents |
||
|
|
||
| # Create aggregated feature table for display | ||
| # Group by FeatureIndex and compute summary statistics | ||
| feature_table = ( | ||
| features | ||
| .filter(pl.col('FeatureIndex').is_not_null() & (pl.col('FeatureIndex') >= 0)) | ||
| .group_by('FeatureIndex') | ||
| .agg([ | ||
| pl.col('MonoisotopicMass').mean().alias('MonoMass'), | ||
| pl.col('SumIntensity').sum().alias('TotalIntensity'), | ||
| pl.col('SumIntensity').max().alias('ApexIntensity'), | ||
| pl.col('RetentionTime').min().alias('RTStart'), | ||
| pl.col('RetentionTime').max().alias('RTEnd'), | ||
| pl.len().alias('NumScans'), | ||
| # Get the scan index at apex (max intensity) | ||
| pl.col('ScanIndex').sort_by('SumIntensity', descending=True).first().alias('ApexScanIndex'), | ||
| # Get the mass index at apex | ||
| pl.col('MassIndex').sort_by('SumIntensity', descending=True).first().alias('ApexMassIndex'), | ||
| ]) | ||
| .with_columns([ | ||
| (pl.col('RTEnd') - pl.col('RTStart')).alias('RTDuration'), | ||
| ]) | ||
| .sort('FeatureIndex') | ||
| ) | ||
| file_manager.store_data(dataset_id, 'feature_table', feature_table) | ||
|
|
||
| # Immediately reload as polars LazyFrames for efficient processing | ||
| results = file_manager.get_results(dataset_id, ['anno_dfs', 'deconv_dfs'], use_polars=True) | ||
| pl_anno = results['anno_dfs'] | ||
|
|
@@ -45,7 +126,7 @@ def parseDeconv( | |
| ) | ||
|
|
||
| # Collect here as this is the data we are operating on | ||
| relevant_heatmap_lazy = relevant_heatmap_lazy.collect().lazy() | ||
| relevant_heatmap_lazy = relevant_heatmap_lazy.collect(streaming=True).lazy() | ||
|
|
||
| # Get count for compression level calculation | ||
| heatmap_count = relevant_heatmap_lazy.select(pl.len()).collect().item() | ||
|
|
@@ -69,6 +150,32 @@ def parseDeconv( | |
| dataset_id, f'ms{ms_level}_{descriptor}_heatmap_{size}', | ||
| current_heatmap_lazy | ||
| ) | ||
|
|
||
| # Create TIC table | ||
| ms1_heatmap = file_manager.get_results( | ||
| dataset_id, ['ms1_raw_heatmap'], use_polars=True | ||
| )['ms1_raw_heatmap'] | ||
| ms1_heatmap = ms1_heatmap.with_columns(pl.lit(1).alias('level')) | ||
| ms1_heatmap = ms1_heatmap.drop(['mass', 'mass_idx']) | ||
| ms2_heatmap = file_manager.get_results( | ||
| dataset_id, ['ms2_raw_heatmap'], use_polars=True | ||
| )['ms2_raw_heatmap'] | ||
| ms2_heatmap = ms2_heatmap.with_columns(pl.lit(2).alias('level')) | ||
| ms2_heatmap = ms2_heatmap.drop(['mass', 'mass_idx']) | ||
| tic_data = pl.concat([ms1_heatmap, ms2_heatmap], how='vertical') | ||
| tic_data = ( | ||
| tic_data.group_by('scan_idx') | ||
| .agg([ | ||
| pl.col('rt').first().alias('rt'), | ||
| pl.col('level').first().alias('level'), | ||
| pl.col('intensity').sum().alias('tic'), | ||
| ]) | ||
| ) | ||
| tic_data = tic_data.sort("scan_idx", descending=False) | ||
| file_manager.store_data(dataset_id, 'tic', tic_data) | ||
|
|
||
|
|
||
|
|
||
|
|
||
| logger.log("20.0 %", level=2) | ||
|
|
||
|
|
@@ -126,8 +233,66 @@ def parseDeconv( | |
| pl.col('snr').alias('SNR'), | ||
| pl.col('qscore').alias('QScore') | ||
| ]) | ||
| ) | ||
|
|
||
| # Add FeatureIndex arrays to mass_table | ||
| features = file_manager.get_results(dataset_id, ['feature_dfs'], use_polars=True)['feature_dfs'] | ||
|
|
||
| # Handle NaN FeatureIndex values by replacing with -1 | ||
| features = features.with_columns([ | ||
| pl.when(pl.col('FeatureIndex').is_null()) | ||
| .then(pl.lit(-1)) | ||
| .otherwise(pl.col('FeatureIndex')) | ||
| .alias('FeatureIndex') | ||
| ]) | ||
|
|
||
| # Group by ScanNum and create arrays of FeatureIndex ordered by MassIndex | ||
| feature_arrays = ( | ||
| features | ||
| .sort(['ScanIndex', 'MassIndex']) | ||
| .group_by('ScanIndex') | ||
| .agg([ | ||
| pl.col('FeatureIndex').alias('FeatureIndices') | ||
| ]) | ||
| ) | ||
|
|
||
| # Get scan info with MSLevel and number of masses for creating -1 arrays | ||
| scan_info = ( | ||
| pl_deconv_indexed | ||
| .select([ | ||
| pl.col('index'), | ||
| pl.col('Scan'), | ||
| pl.col('MSLevel'), | ||
| pl.col('mzarray').list.len().alias('num_masses') | ||
| ]) | ||
| ) | ||
|
|
||
| # Join feature arrays with scan info and create FeatureIndex column | ||
| scans_with_features = ( | ||
| scan_info | ||
| .join(feature_arrays, left_on='index', right_on='ScanIndex', how='left') | ||
| .with_columns([ | ||
| # For MS2 scans create array of -1s | ||
| pl.when(pl.col('MSLevel') == 2) | ||
| .then( | ||
| pl.col('num_masses').map_elements( | ||
| lambda n: [-1] * n, | ||
| return_dtype=pl.List(pl.Int64) | ||
| ) | ||
| ) | ||
| .otherwise(pl.col('FeatureIndices')) | ||
| .alias('FeatureIndex') | ||
| ]) | ||
| .select(['index', 'FeatureIndex']) | ||
| ) | ||
|
|
||
| # Add FeatureIndex to mass_table | ||
| mass_table_lazy = ( | ||
| mass_table_lazy | ||
| .join(scans_with_features, on='index', how='left') | ||
| .sort("index") | ||
| ) | ||
|
|
||
| file_manager.store_data(dataset_id, 'mass_table', mass_table_lazy) | ||
|
|
||
| logger.log("50.0 %", level=2) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -181,4 +181,29 @@ def filter_data(data, out_components, selection_store, additional_data, tool): | |
| ][selection_store['proteinIndex']] | ||
| } | ||
|
|
||
| # Feature Level Information | ||
| if (component == 'Deconvolved MS1 Heatmap'): | ||
| if ('scanIndex' in selection_store) and ('massIndex' in selection_store): | ||
| feature_data = data['feature_data'] | ||
| feature_info = feature_data.filter( | ||
| (pl.col("ScanIndex") == selection_store['scanIndex']) | ||
| & (pl.col("MassIndex") == selection_store['massIndex']) | ||
| ) | ||
| mass_row = feature_info.collect(streaming=True) | ||
| if mass_row.height == 0: | ||
| data['feature_data'] = pd.DataFrame() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fix type inconsistency in empty DataFrame assignments. The empty/missing cases assign `pd.DataFrame()` while the populated path assigns a Polars DataFrame from `collect(streaming=True)`. Apply this diff to use Polars consistently: mass_row = feature_info.collect(streaming=True)
if mass_row.height == 0:
- data['feature_data'] = pd.DataFrame()
+ data['feature_data'] = pl.DataFrame()
else:
idx = mass_row.row(0, named=True)['FeatureIndex']
if idx is None:
- data['feature_data'] = pd.DataFrame()
+ data['feature_data'] = pl.DataFrame()
else:
feature_data = (
feature_data
.filter(pl.col("FeatureIndex") == idx)
.sort("RetentionTime")
)
data['feature_data'] = feature_data.collect(streaming=True)
else:
- data['feature_data'] = pd.DataFrame()
+ data['feature_data'] = pl.DataFrame()Also applies to: 198-198, 207-207 🤖 Prompt for AI Agents |
||
| else: | ||
| idx = mass_row.row(0, named=True)['FeatureIndex'] | ||
| if idx is None: | ||
| data['feature_data'] = pd.DataFrame() | ||
| else: | ||
| feature_data = ( | ||
| feature_data | ||
| .filter(pl.col("FeatureIndex") == idx) | ||
| .sort("RetentionTime") | ||
| ) | ||
| data['feature_data'] = feature_data.collect(streaming=True) | ||
| else: | ||
| data['feature_data'] = pd.DataFrame() | ||
|
|
||
| return data | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Handle optional spec2_tsv parameter.
Handle optional spec2_tsv parameter.
While `spec1_tsv` is now required, `spec2_tsv` remains optional (default `None`). However, lines 33–38 attempt to read `spec2_tsv` without checking if it's `None`, which will cause a crash. Apply this diff to handle the optional parameter:
spec1_df.loc[:,'Level'] = 1 file_manager.store_data(dataset_id, 'spec1_df', spec1_df) - spec2_df = pd.read_csv( - spec2_tsv, sep='\t', usecols=[ - 'FeatureIndex', 'MonoisotopicMass', 'SumIntensity', 'RetentionTime', - 'ScanNum' - ] - ) - spec2_df.loc[:,'Level'] = 2 - file_manager.store_data(dataset_id, 'spec2_df', spec2_df) - del spec1_df - del spec2_df + if spec2_tsv is not None: + spec2_df = pd.read_csv( + spec2_tsv, sep='\t', usecols=[ + 'FeatureIndex', 'MonoisotopicMass', 'SumIntensity', 'RetentionTime', + 'ScanNum' + ] + ) + spec2_df.loc[:,'Level'] = 2 + file_manager.store_data(dataset_id, 'spec2_df', spec2_df) + del spec2_df + else: + # Store empty dataframe to maintain consistent interface + spec2_df = pd.DataFrame(columns=['FeatureIndex', 'MonoisotopicMass', 'SumIntensity', 'RetentionTime', 'ScanNum', 'Level']) + file_manager.store_data(dataset_id, 'spec2_df', spec2_df) + del spec2_df + del spec1_dfAlso applies to: 33-40
🤖 Prompt for AI Agents