diff --git a/MainClass.cs b/MainClass.cs
index 6ae27e1..99ed51a 100644
--- a/MainClass.cs
+++ b/MainClass.cs
@@ -768,6 +768,9 @@ private static void RegularParametersParsing(string[] args)
if (parseInput.OutputFormat == OutputFormat.IndexMzML) parseInput.OutputFormat = OutputFormat.MzML;
}
+ // Switch off gzip compression for Parquet
+ if (parseInput.OutputFormat == OutputFormat.Parquet) parseInput.Gzip = false;
+
parseInput.MaxLevel = parseInput.MsLevel.Max();
if (parseInput.S3Url != null && parseInput.S3AccessKeyId != null &&
diff --git a/ThermoRawFileParserTest/ThermoRawFileParserTest.csproj b/ThermoRawFileParserTest/ThermoRawFileParserTest.csproj
index 7bb3f14..b1715cd 100644
--- a/ThermoRawFileParserTest/ThermoRawFileParserTest.csproj
+++ b/ThermoRawFileParserTest/ThermoRawFileParserTest.csproj
@@ -32,6 +32,7 @@
all
runtime; build; native; contentfiles; analyzers; buildtransitive
+
diff --git a/ThermoRawFileParserTest/WriterTests.cs b/ThermoRawFileParserTest/WriterTests.cs
index 53e89b0..a28dda8 100644
--- a/ThermoRawFileParserTest/WriterTests.cs
+++ b/ThermoRawFileParserTest/WriterTests.cs
@@ -5,6 +5,7 @@
using System.Xml.Serialization;
using IO.Mgf;
using NUnit.Framework;
+using Parquet;
using ThermoRawFileParser;
using ThermoRawFileParser.Writer.MzML;
@@ -281,5 +282,58 @@ public void TestMzML_MS2()
Assert.That(testMzMl.run.chromatogramList.chromatogram[0].defaultArrayLength, Is.EqualTo(95));
}
+
+ [Test]
+ public void TestParquetCentroid()
+ {
+ // Get temp path for writing the test mzML
+ var tempFilePath = Path.GetTempPath();
+
+ var testRawFile = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Data/small.RAW");
+ var parseInput = new ParseInput(testRawFile, null, tempFilePath, OutputFormat.Parquet);
+
+ RawFileParser.Parse(parseInput);
+
+ // Actual test
+ var parquetFilePath = Path.Combine(tempFilePath, "small.mzparquet");
+
+ using (var parquetReader = ParquetReader.CreateAsync(parquetFilePath).Result)
+ {
+ var groupReader = parquetReader.OpenRowGroupReader(0);
+ var schema = parquetReader.Schema;
+ var scanColumn = groupReader.ReadColumnAsync(schema.FindDataField("scan")).Result;
+
+ Assert.That(scanColumn.NumValues, Is.EqualTo(48520));
+ Assert.That(scanColumn.Statistics.DistinctCount, Is.EqualTo(48));
+ Assert.That((from int p in scanColumn.Data where p == 22 select p).Count(), Is.EqualTo(1632));
+ }
+ }
+
+ [Test]
+ public void TestParquetProfile()
+ {
+ // Get temp path for writing the test mzML
+ var tempFilePath = Path.GetTempPath();
+
+ var testRawFile = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Data/small.RAW");
+ var parseInput = new ParseInput(testRawFile, null, tempFilePath, OutputFormat.Parquet);
+ parseInput.NoPeakPicking = new HashSet { 1, 2 };
+
+ RawFileParser.Parse(parseInput);
+
+ // Actual test
+ var parquetFilePath = Path.Combine(tempFilePath, "small.mzparquet");
+
+ using (var parquetReader = ParquetReader.CreateAsync(parquetFilePath).Result)
+ {
+ var groupReader = parquetReader.OpenRowGroupReader(0);
+ var schema = parquetReader.Schema;
+ var scanColumn = groupReader.ReadColumnAsync(schema.FindDataField("scan")).Result;
+
+ Assert.That(scanColumn.NumValues, Is.EqualTo(305213));
+ Assert.That(scanColumn.Statistics.DistinctCount, Is.EqualTo(48));
+ Assert.That((from int p in scanColumn.Data where p == 22 select p).Count(), Is.EqualTo(17758));
+ }
+ }
}
}
\ No newline at end of file
diff --git a/Writer/MgfSpectrumWriter.cs b/Writer/MgfSpectrumWriter.cs
index a6449bf..bf282af 100644
--- a/Writer/MgfSpectrumWriter.cs
+++ b/Writer/MgfSpectrumWriter.cs
@@ -1,9 +1,6 @@
using System;
-using System.Collections.Generic;
using System.Globalization;
-using System.Linq;
using System.Reflection;
-using System.Text.RegularExpressions;
using log4net;
using ThermoFisher.CommonCore.Data.Business;
using ThermoFisher.CommonCore.Data.FilterEnums;
@@ -19,15 +16,9 @@ public class MgfSpectrumWriter : SpectrumWriter
private const string PositivePolarity = "+";
private const string NegativePolarity = "-";
- // Filter string
- private readonly Regex _filterStringIsolationMzPattern = new Regex(@"ms\d+ (.+?) \[");
-
// Precursor scan number for MSn scans
private int _precursorScanNumber;
- // Precursor scan number (value) and isolation m/z (key) for reference in the precursor element of an MSn spectrum
- private readonly Dictionary _precursorScanNumbers = new Dictionary();
-
public MgfSpectrumWriter(ParseInput parseInput) : base(parseInput)
{
ParseInput.MsLevel.Remove(1); // MS1 spectra are not supposed to be in MGF
@@ -126,23 +117,7 @@ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastSc
}
else //try getting it from the scan filter
{
- var parts = Regex.Split(result.Groups[1].Value, " ");
-
- //find the position of the first (from the end) precursor with a different mass
- //to account for possible supplementary activations written in the filter
- var lastIonMass = parts.Last().Split('@').First();
- int last = parts.Length;
- while (last > 0 &&
- parts[last - 1].Split('@').First() == lastIonMass)
- {
- last--;
- }
-
- string parentFilter = String.Join(" ", parts.Take(last));
- if (_precursorScanNumbers.ContainsKey(parentFilter))
- {
- _precursorScanNumber = _precursorScanNumbers[parentFilter];
- }
+ _precursorScanNumber = GetParentFromScanString(result.Groups[1].Value);
}
if (_precursorScanNumber > 0)
@@ -151,7 +126,8 @@ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastSc
}
else
{
- Log.Error($"Failed finding precursor for {scanNumber}");
+ Log.Error($"Cannot find precursor scan for scan# {scanNumber}");
+ _precursorTree[-2] = new PrecursorInfo(0, msLevel, FindLastReaction(scanEvent, msLevel), null);
ParseInput.NewError();
}
}
diff --git a/Writer/MzMlSpectrumWriter.cs b/Writer/MzMlSpectrumWriter.cs
index 7868b39..81677de 100644
--- a/Writer/MzMlSpectrumWriter.cs
+++ b/Writer/MzMlSpectrumWriter.cs
@@ -27,8 +27,6 @@ public class MzMlSpectrumWriter : SpectrumWriter
private static readonly ILog Log =
LogManager.GetLogger(MethodBase.GetCurrentMethod().DeclaringType);
- private readonly Regex _filterStringIsolationMzPattern = new Regex(@"ms\d+ (.+?) \[");
-
// Tune version < 3 produces multiple trailer entry like "SPS Mass [number]"
private readonly Regex _spSentry = new Regex(@"SPS Mass\s+\d+:");
@@ -45,12 +43,6 @@ public class MzMlSpectrumWriter : SpectrumWriter
private readonly Dictionary _ionizationTypes =
new Dictionary();
- // Precursor scan number (value) and isolation m/z (key) for reference in the precursor element of an MSn spectrum
- private readonly Dictionary _precursorScanNumbers = new Dictionary();
-
- //Precursor information for scans
- private Dictionary _precursorTree = new Dictionary();
-
private const string SourceFileId = "RAW1";
private readonly XmlSerializerFactory _factory = new XmlSerializerFactory();
private const string Ns = "http://psi.hupo.org/ms/mzml";
@@ -68,8 +60,6 @@ public MzMlSpectrumWriter(ParseInput parseInput) : base(parseInput)
_mzMlNamespace.Add(string.Empty, "http://psi.hupo.org/ms/mzml");
_doIndexing = ParseInput.OutputFormat == OutputFormat.IndexMzML;
_osOffset = Environment.NewLine == "\n" ? 0 : 1;
- _precursorScanNumbers[""] = -1;
- _precursorTree[-1] = new PrecursorInfo();
}
///
@@ -639,7 +629,6 @@ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastSc
_writer.WriteValue(BitConverter.ToString(hash).Replace("-", "").ToLowerInvariant());
_writer.WriteEndElement(); // fileChecksum
-
_writer.WriteEndElement(); // indexedmzML
}
@@ -652,21 +641,6 @@ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastSc
Writer.Flush();
Writer.Close();
-
- //This section is not necessary?
- /*if (_doIndexing)
- {
- try
- {
- cryptoStream.Flush();
- cryptoStream.Close();
- }
- catch (System.ObjectDisposedException e)
- {
- // Cannot access a closed file. CryptoStream was already closed when closing _writer
- Log.Warn($"Warning: {e.Message}");
- }
- }*/
}
// In case of indexed mzML, change the extension from xml to mzML and check for the gzip option
@@ -1286,7 +1260,7 @@ private SpectrumType ConstructMSSpectrum(int scanNumber)
int? charge = trailerData.AsPositiveInt("Charge State:");
double? monoisotopicMz = trailerData.AsDouble("Monoisotopic M/Z:");
double? ionInjectionTime = trailerData.AsDouble("Ion Injection Time (ms):");
- double? isolationWidth = trailerData.AsDouble("MS" + (int) scanFilter.MSOrder + " Isolation Width:");
+ double? isolationWidth = trailerData.AsDouble("MS" + msLevel + " Isolation Width:");
double? FAIMSCV = null;
if (trailerData.AsBool("FAIMS Voltage On:").GetValueOrDefault(false))
FAIMSCV = trailerData.AsDouble("FAIMS CV:");
@@ -1374,6 +1348,7 @@ private SpectrumType ConstructMSSpectrum(int scanNumber)
{
Log.Warn($"Cannot find precursor scan for scan# {scanNumber}");
_precursorTree[-2] = new PrecursorInfo(0, msLevel, FindLastReaction(scanEvent, msLevel), new PrecursorType[0]);
+ ParseInput.NewWarn();
}
try
@@ -1938,46 +1913,6 @@ private SpectrumType ConstructMSSpectrum(int scanNumber)
return spectrum;
}
-
- private int FindLastReaction(IScanEvent scanEvent, int msLevel)
- {
- int lastReactionIndex = msLevel - 2;
-
- //iteratively trying find the last available index for reaction
- while(true)
- {
- try
- {
- scanEvent.GetReaction(lastReactionIndex + 1);
- }
- catch (ArgumentOutOfRangeException)
- {
- //stop trying
- break;
- }
-
- lastReactionIndex++;
- }
-
- //supplemental activation flag is on -> one of the levels (not necissirily the last one) used supplemental activation
- //check last two activations
- if (scanEvent.SupplementalActivation == TriState.On)
- {
- var lastActivation = scanEvent.GetReaction(lastReactionIndex).ActivationType;
- var beforeLastActivation = scanEvent.GetReaction(lastReactionIndex - 1).ActivationType;
-
- if ((beforeLastActivation == ActivationType.ElectronTransferDissociation || beforeLastActivation == ActivationType.ElectronCaptureDissociation) &&
- (lastActivation == ActivationType.CollisionInducedDissociation || lastActivation == ActivationType.HigherEnergyCollisionalDissociation))
- return lastReactionIndex - 1; //ETD or ECD followed by HCD or CID -> supplemental activation in the last level (move the last reaction one step back)
- else
- return lastReactionIndex;
- }
- else //just use the last one
- {
- return lastReactionIndex;
- }
- }
-
private SpectrumType ConstructPDASpectrum(int scanNumber, int instrumentNumber)
{
// Get each scan from the RAW file
@@ -2558,29 +2493,6 @@ private PrecursorListType ConstructPrecursorList(int precursorScanNumber, IScanE
}
- private int GetParentFromScanString(string scanString)
- {
- var parts = Regex.Split(scanString, " ");
-
- //find the position of the first (from the end) precursor with a different mass
- //to account for possible supplementary activations written in the filter
- var lastIonMass = parts.Last().Split('@').First();
- int last = parts.Length;
- while (last > 0 &&
- parts[last - 1].Split('@').First() == lastIonMass)
- {
- last--;
- }
-
- string parentFilter = String.Join(" ", parts.Take(last));
- if (_precursorScanNumbers.ContainsKey(parentFilter))
- {
- return _precursorScanNumbers[parentFilter];
- }
-
- return -2; //unsuccessful parsing
- }
-
///
/// Populate the scan list element. Full version used for mass spectra,
/// having Scan Event, scan Filter etc
diff --git a/Writer/ParquetSpectrumWriter.cs b/Writer/ParquetSpectrumWriter.cs
index 2703286..438aa37 100644
--- a/Writer/ParquetSpectrumWriter.cs
+++ b/Writer/ParquetSpectrumWriter.cs
@@ -1,257 +1,357 @@
using System;
using System.Collections.Generic;
-using System.IO;
-using System.Linq;
using System.Reflection;
using log4net;
-using Parquet;
-using Parquet.Data;
-using ThermoFisher.CommonCore.Data;
+using Parquet.Serialization;
using ThermoFisher.CommonCore.Data.Business;
using ThermoFisher.CommonCore.Data.FilterEnums;
using ThermoFisher.CommonCore.Data.Interfaces;
-using ThermoRawFileParser.Writer.MzML;
namespace ThermoRawFileParser.Writer
{
+ struct MzParquet
+ {
+ public uint scan;
+ public uint level;
+ public float rt;
+ public float mz;
+ public float intensity;
+ public float? ion_mobility;
+ public float? isolation_lower;
+ public float? isolation_upper;
+ public int? precursor_scan;
+ public float? precursor_mz;
+ public uint? precursor_charge;
+ }
+
+ struct PrecursorData
+ {
+ public float? mz;
+ public float? isolation_lower;
+ public float? isolation_upper;
+ }
+
public class ParquetSpectrumWriter : SpectrumWriter
{
private static readonly ILog Log =
LogManager.GetLogger(MethodBase.GetCurrentMethod().DeclaringType);
- private IRawDataPlus _rawFile;
-
- // Dictionary to keep track of the different mass analyzers (key: Thermo MassAnalyzerType; value: the reference string)
- private readonly Dictionary _massAnalyzers =
- new Dictionary();
-
- private readonly Dictionary _ionizationTypes =
- new Dictionary();
+ private const int ParquetSliceSize = 1_048_576;
public ParquetSpectrumWriter(ParseInput parseInput) : base(parseInput)
{
+ //nothing to do here
}
- public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastScanNumber)
+ public override void Write(IRawDataPlus raw, int firstScanNumber, int lastScanNumber)
{
- _rawFile = rawFile;
- if (rawFile.HasMsData)
+ if (!raw.HasMsData)
{
- List pScans = new List();
- WritePScans(ParseInput.OutputDirectory, rawFile.FileName, rawFile, pScans);
+ throw new RawFileParserException("No MS data in RAW file, no output will be produced");
}
- else
+
+ ConfigureWriter(".mzparquet");
+
+ ParquetSerializerOptions opts = new ParquetSerializerOptions();
+ opts.CompressionLevel = System.IO.Compression.CompressionLevel.Fastest;
+ opts.CompressionMethod = Parquet.CompressionMethod.Zstd;
+
+ var data = new List();
+
+ var lastScanProgress = 0;
+
+ Log.Info(String.Format("Processing {0} MS scans", +(1 + lastScanNumber - firstScanNumber)));
+
+ for (var scanNumber = firstScanNumber; scanNumber <= lastScanNumber; scanNumber++)
{
- throw new RawFileParserException("No MS data in RAW file, no output will be produced");
+ if (ParseInput.LogFormat == LogFormat.DEFAULT)
+ {
+ var scanProgress = (int)((double)scanNumber / (lastScanNumber - firstScanNumber + 1) * 100);
+ if (scanProgress % ProgressPercentageStep == 0)
+ {
+ if (scanProgress != lastScanProgress)
+ {
+ Console.Write("" + scanProgress + "% ");
+ lastScanProgress = scanProgress;
+ }
+ }
+ }
+
+ try
+ {
+ int level = (int)raw.GetScanEventForScanNumber(scanNumber).MSOrder; //applying MS level filter
+ if (level <= ParseInput.MaxLevel) // Primary MS level filter
+ {
+ var scanData = ReadScan(raw, scanNumber);
+ if (scanData != null && ParseInput.MsLevel.Contains(level)) // Final MS level filter
+ data.AddRange(scanData);
+ }
+
+ }
+ catch (Exception ex)
+ {
+ Log.Error($"Scan #{scanNumber} cannot be processed because of the following exception: {ex.Message}\n{ex.StackTrace}");
+ ParseInput.NewError();
+ }
+
+ // If we have enough ions to write a row group, do so
+ // - some row groups might have more than this number of ions
+ // but this ensures that all ions from a single scan are always
+ // present in the same row group (critical property of mzparquet)
+ if (data.Count >= ParquetSliceSize)
+ {
+ var task = ParquetSerializer.SerializeAsync(data, Writer.BaseStream, opts);
+ task.Wait();
+ opts.Append = true;
+ data.Clear();
+ Log.Debug("Writing next row group");
+ }
+ }
+
+ // serialize any remaining ions into the final row group
+ if (data.Count > 0)
+ {
+ var task = ParquetSerializer.SerializeAsync(data, Writer.BaseStream, opts);
+ task.Wait();
+ Log.Debug("Writing final row group");
+ }
+
+ if (ParseInput.LogFormat == LogFormat.DEFAULT) //Add new line after progress bar
+ {
+ Console.WriteLine();
}
+
+ // Release the OS file handle
+ Writer.Flush();
+ Writer.Close();
}
- private static void WritePScans(string outputDirectory, string fileName,
- IRawDataPlus raw,
- List scans)
+ private List ReadScan(IRawDataPlus raw, int scanNumber)
{
- var enumerator = raw.GetFilteredScanEnumerator(" ");
+ var scanFilter = raw.GetFilterForScanNumber(scanNumber);
- foreach (var scanNumber in enumerator
- ) // note in my tests serial is faster than Parallel.Foreach() (this involves disk access, so it makes sense)
+ // Get the scan event for this scan number
+ var scanEvent = raw.GetScanEventForScanNumber(scanNumber);
+
+ // Get scan ms level
+ var msLevel = (int)scanFilter.MSOrder;
+
+ // Get Scan
+ var scan = Scan.FromFile(raw, scanNumber);
+ ScanTrailer trailerData;
+
+ try
+ {
+ trailerData = new ScanTrailer(raw.GetTrailerExtraInformation(scanNumber));
+ }
+ catch (Exception ex)
+ {
+ Log.WarnFormat("Cannot load trailer infromation for scan {0} due to following exception\n{1}", scanNumber, ex.Message);
+ ParseInput.NewWarn();
+ trailerData = new ScanTrailer();
+ }
+
+ int? trailer_charge = trailerData.AsPositiveInt("Charge State:");
+ double? trailer_mz = trailerData.AsDouble("Monoisotopic M/Z:");
+ double? trailer_isolationWidth = trailerData.AsDouble("MS" + msLevel + " Isolation Width:");
+ double? FAIMSCV = null;
+ if (trailerData.AsBool("FAIMS Voltage On:").GetValueOrDefault(false))
+ FAIMSCV = trailerData.AsDouble("FAIMS CV:");
+
+ double rt = raw.RetentionTimeFromScanNumber(scanNumber);
+ int precursor_scan = 0;
+ PrecursorData precursor_data = new PrecursorData
{
- //trailer information is extracted via index
- var trailers = raw.GetTrailerExtraValues(scanNumber);
- var trailerLabels = raw.GetTrailerExtraInformation(scanNumber);
- object chargeState = 0;
- for (int i = 0; i < trailerLabels.Labels.Length; i++)
+ isolation_lower = null,
+ isolation_upper = null,
+ mz = null
+
+ };
+
+ if (msLevel == 1)
+ {
+ // Keep track of scan number for precursor reference
+ _precursorScanNumbers[""] = scanNumber;
+ _precursorTree[scanNumber] = new PrecursorInfo();
+ }
+ else if (msLevel > 1)
+ {
+ // Keep track of scan number and isolation m/z for precursor reference
+ var result = _filterStringIsolationMzPattern.Match(scanEvent.ToString());
+ if (result.Success)
{
- if (trailerLabels.Labels[i] == "Charge State:")
+ if (_precursorScanNumbers.ContainsKey(result.Groups[1].Value))
{
- chargeState = raw.GetTrailerExtraValue(scanNumber, i);
- break;
+ _precursorScanNumbers.Remove(result.Groups[1].Value);
}
- }
-
- var scanFilter = raw.GetFilterForScanNumber(scanNumber);
- var scanStats = raw.GetScanStatsForScanNumber(scanNumber);
- CentroidStream centroidStream = new CentroidStream();
+ _precursorScanNumbers.Add(result.Groups[1].Value, scanNumber);
+ }
- //check for FT mass analyzer data
- if (scanFilter.MassAnalyzer == MassAnalyzerType.MassAnalyzerFTMS)
+ //update precursor scan if it is provided in trailer data
+ var trailerMasterScan = trailerData.AsPositiveInt("Master Scan Number:");
+ if (trailerMasterScan.HasValue)
{
- centroidStream = raw.GetCentroidStream(scanNumber, false);
+ precursor_scan = trailerMasterScan.Value;
+ }
+ else //try getting it from the scan filter
+ {
+ precursor_scan = GetParentFromScanString(result.Groups[1].Value);
}
- //check for IT mass analyzer data
- if (scanFilter.MassAnalyzer == MassAnalyzerType.MassAnalyzerITMS)
+ //finding precursor scan failed
+ if (precursor_scan == -2)
{
- var scanData = raw.GetSimplifiedScan(scanNumber);
- centroidStream.Masses = scanData.Masses;
- centroidStream.Intensities = scanData.Intensities;
+ Log.Warn($"Cannot find precursor scan for scan# {scanNumber}");
+ _precursorTree[-2] = new PrecursorInfo(0, msLevel, FindLastReaction(scanEvent, msLevel), null);
}
- var msOrder = raw.GetScanEventForScanNumber(scanNumber).MSOrder;
+ //Parsing the last reaction
+ try
+ {
+ try //since there is no direct way to get the number of reactions available, it is necessary to try and fail
+ {
+ scanEvent.GetReaction(_precursorTree[precursor_scan].ReactionCount);
+ }
+ catch (ArgumentOutOfRangeException ex)
+ {
+ Log.Debug($"Using Tribrid decision tree fix for scan# {scanNumber}");
+ //Is it a decision tree scheduled scan on tribrid?
+ if (msLevel == _precursorTree[precursor_scan].MSLevel)
+ {
+ precursor_scan = GetParentFromScanString(result.Groups[1].Value);
+ }
+ else
+ {
+ throw new RawFileParserException(
+ $"Tribrid decision tree fix failed - cannot get reaction# {_precursorTree[precursor_scan].ReactionCount} from {scanEvent.ToString()}",
+ ex);
+ }
+ }
+
+ // Get Precursor m/z and isolation window borders
+ precursor_data = GetPrecursorData(precursor_scan, scanEvent, trailer_mz, trailer_isolationWidth, out var reactionCount);
- if (msOrder == MSOrderType.Ms)
+ //save precursor information for later reference
+ _precursorTree[scanNumber] = new PrecursorInfo(precursor_scan, msLevel, reactionCount, null);
+ }
+ catch (Exception e)
{
- var pscan = GetPScan(scanStats, centroidStream, fileName, Convert.ToInt32(chargeState));
- scans.Add(pscan);
+ var extra = (e.InnerException is null) ? "" : $"\n{e.InnerException.StackTrace}";
+
+ Log.Warn($"Could not get precursor data for scan# {scanNumber} - precursor information for this and dependent scans will be empty\nException details:{e.Message}\n{e.StackTrace}\n{extra}");
+ ParseInput.NewWarn();
+
+ _precursorTree[scanNumber] = new PrecursorInfo(precursor_scan, 1, 0, null);
}
- if (msOrder == MSOrderType.Ms2)
+ }
+
+ double[] masses;
+ double[] intensities;
+
+ if (!ParseInput.NoPeakPicking.Contains(msLevel))
+ {
+ // Check if the scan has a centroid stream
+ if (scan.HasCentroidStream)
{
- var precursorMz = raw.GetScanEventForScanNumber(scanNumber).GetReaction(0).PrecursorMass;
- var pscan = GetPScan(scanStats, centroidStream, fileName, precursorMz,
- Convert.ToInt32(chargeState));
- scans.Add(pscan);
+ masses = scan.CentroidScan.Masses;
+ intensities = scan.CentroidScan.Intensities;
+ }
+ else // otherwise take the segmented (low res) scan
+ {
+ // If the spectrum is profile perform centroiding
+ var segmentedScan = scanEvent.ScanData == ScanDataType.Profile
+ ? Scan.ToCentroid(scan).SegmentedScan
+ : scan.SegmentedScan;
+
+ masses = segmentedScan.Positions;
+ intensities = segmentedScan.Intensities;
}
+ }
+ else // use the segmented data as is
+ {
+ masses = scan.SegmentedScan.Positions;
+ intensities = scan.SegmentedScan.Intensities;
+ }
- var t = raw.GetTrailerExtraValues(scanNumber);
+ List scanData = new List(masses.Length);
+ // Add a row to parquet file for every m/z value in this scan
+ for (int i = 0; i < masses.Length; i++)
+ {
+ MzParquet m;
+ m.rt = (float)rt;
+ m.scan = (uint)scanNumber;
+ m.level = (uint)msLevel;
+ m.intensity = (float)intensities[i];
+ m.mz = (float)masses[i];
+ m.isolation_lower = precursor_data.isolation_lower;
+ m.isolation_upper = precursor_data.isolation_upper;
+ m.precursor_scan = precursor_scan;
+ m.precursor_mz = precursor_data.mz;
+ m.precursor_charge = (uint?)trailer_charge;
+ m.ion_mobility = (float?)FAIMSCV;
+ scanData.Add(m);
}
- WriteScans(outputDirectory, scans, fileName);
+ return scanData;
}
- private static PScan GetPScan(ScanStatistics scanStats, CentroidStream centroidStream,
- string fileName, double? precursorMz = null, int? precursorCharge = null)
+ private PrecursorData GetPrecursorData(int precursorScanNumber, IScanEventBase scanEvent,
+ double? monoisotopicMz, double? isolationWidth, out int reactionCount)
{
- var scan = new PScan
+ double? isolation_lower = null;
+ double? isolation_upper = null;
+
+ // Get precursors from earlier levels
+ var prevPrecursors = _precursorTree[precursorScanNumber];
+ reactionCount = prevPrecursors.ReactionCount;
+
+ var reaction = scanEvent.GetReaction(reactionCount);
+
+ //if isolation width was not found in the trailer, try to get one from the reaction
+ if (isolationWidth == null) isolationWidth = reaction.IsolationWidth;
+ if (isolationWidth < 0) isolationWidth = null;
+
+ // Selected ion MZ
+ var selectedIonMz = CalculateSelectedIonMz(reaction, monoisotopicMz, isolationWidth);
+
+ if (isolationWidth != null)
+ {
+ var offset = isolationWidth.Value / 2 + reaction.IsolationWidthOffset;
+ isolation_lower = reaction.PrecursorMass - isolationWidth.Value + offset;
+ isolation_upper = reaction.PrecursorMass + offset;
+ }
+
+ // Activation only to keep track of the reactions
+ //increase reaction count
+ reactionCount++;
+
+ //Sometimes the property of supplemental activation is not set (Tune v4 on Tribrid),
+ //or is On if *at least* one of the levels had SA (i.e. not necissirily the last one), thus we need to try (and posibly fail)
+ try
+ {
+ reaction = scanEvent.GetReaction(reactionCount);
+
+ if (reaction != null)
+ {
+ //increase reaction count after successful parsing
+ reactionCount++;
+ }
+ }
+ catch (IndexOutOfRangeException)
{
- FileName = fileName,
- BasePeakMass = scanStats.BasePeakMass,
- ScanType = scanStats.ScanType,
- BasePeakIntensity = scanStats.BasePeakIntensity,
- PacketType = scanStats.PacketType,
- ScanNumber = scanStats.ScanNumber,
- RetentionTime = scanStats.StartTime,
- Masses = centroidStream.Masses.ToArray(),
- Intensities = centroidStream.Intensities.ToArray(),
- LowMass = scanStats.LowMass,
- HighMass = scanStats.HighMass,
- TIC = scanStats.TIC,
- FileId = fileName,
- PrecursorMz = precursorMz,
- PrecursorCharge = precursorCharge,
- MsOrder = 1
+ // If we failed do nothing
+ }
+
+ return new PrecursorData
+ {
+ mz = (float?)selectedIonMz,
+ isolation_lower = (float?)isolation_lower,
+ isolation_upper = (float?)isolation_upper
};
- return scan;
- }
- public static void WriteScans(string outputDirectory, List scans, string sourceRawFileName)
- {
- throw new NotImplementedException();
-
- //Needs refactoring since Parquet.NET API changed
-
- //var output = outputDirectory + "//" + Path.GetFileNameWithoutExtension(sourceRawFileName);
-
- //var ds = new DataSet(new DataField("BasePeakIntensity"),
- // new DataField("BasePeakMass"),
- // new DataField("Baselines"),
- // new DataField("Charges"),
- // new DataField("FileId"),
- // new DataField("FileName"),
- // new DataField("HighMass"),
- // new DataField("Intensities"),
- // new DataField("LowMass"),
- // new DataField("Masses"),
- // new DataField("MsOrder"),
- // new DataField("Noises"),
- // new DataField("PacketType"),
- // new DataField("PrecursorCharge"),
- // new DataField("PrecursorMass"),
- // new DataField("Resolutions"),
- // new DataField("RetentionTime"),
- // new DataField("ScanNumber"),
- // new DataField("ScanType"),
- // new DataField("TIC"));
-
- //foreach (var scan in scans)
- //{
- // //we can't store null values?
- // double[] dummyVal = new double[1];
- // if (scan.Noises == null)
- // {
- // scan.Noises = dummyVal;
- // }
-
- // if (scan.Charges == null)
- // {
- // scan.Charges = dummyVal;
- // }
-
- // if (scan.Baselines == null)
- // {
- // scan.Baselines = dummyVal;
- // }
-
- // if (scan.Resolutions == null)
- // {
- // scan.Resolutions = dummyVal;
- // }
-
- // if (scan.PrecursorMz == null)
- // {
- // scan.PrecursorMz = 0;
- // scan.PrecursorCharge = 0;
- // }
-
- // ds.Add(scan.BasePeakIntensity,
- // scan.BasePeakMass,
- // scan.Baselines,
- // scan.Charges,
- // scan.FileId,
- // scan.FileName,
- // scan.HighMass,
- // scan.Intensities,
- // scan.LowMass,
- // scan.Masses,
- // scan.MsOrder,
- // scan.Noises,
- // scan.PacketType,
- // scan.PrecursorCharge,
- // scan.PrecursorMz,
- // scan.Resolutions,
- // scan.RetentionTime,
- // scan.ScanNumber,
- // scan.ScanType,
- // scan.TIC);
- //}
-
- //using (Stream fileStream = File.OpenWrite(output + ".parquet"))
- //{
- // using (var writer = new ParquetWriter(fileStream))
- // {
- // writer.Write(ds);
- // }
- //}
}
}
- /// PSCan meaing Parsec Scan (because Commoncore has a Scan class)
- ///
- public class PScan
- {
- ///
- /// Unique ID per file (foreign key in data store)
- ///
- public string FileId { get; set; }
-
- public string FileName { get; set; }
- public double BasePeakIntensity { get; set; }
- public double BasePeakMass { get; set; }
- public double[] Baselines { get; set; }
- public double[] Charges { get; set; }
- public double HighMass { get; set; }
- public double[] Intensities { get; set; }
- public double LowMass { get; set; }
- public double[] Masses { get; set; }
- public double[] Noises { get; set; }
- public int PacketType { get; set; } // : 20,
- public double RetentionTime { get; set; }
- public double[] Resolutions { get; set; }
- public int ScanNumber { get; set; }
- public string ScanType { get; set; } // : FTMS + c ESI d Full ms2 335.9267@hcd30.00 [130.0000-346.0000],
- public double TIC { get; set; }
- public int MsOrder { get; set; }
- public double? PrecursorMz { get; set; }
- public int? PrecursorCharge { get; set; }
- }
}
\ No newline at end of file
diff --git a/Writer/SpectrumWriter.cs b/Writer/SpectrumWriter.cs
index 3b1d02b..8015a4d 100644
--- a/Writer/SpectrumWriter.cs
+++ b/Writer/SpectrumWriter.cs
@@ -1,13 +1,16 @@
using System;
+using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Reflection;
+using System.Text.RegularExpressions;
using log4net;
using ThermoFisher.CommonCore.Data;
using ThermoFisher.CommonCore.Data.Business;
using ThermoFisher.CommonCore.Data.FilterEnums;
using ThermoFisher.CommonCore.Data.Interfaces;
using ThermoRawFileParser.Util;
+using System.Linq;
namespace ThermoRawFileParser.Writer
{
@@ -42,6 +45,15 @@ public abstract class SpectrumWriter : ISpectrumWriter
///
private static LimitedSizeDictionary precursorCache;
+ // Precursor scan number (value) and isolation m/z (key) for reference in the precursor element of an MSn spectrum
+ private protected readonly Dictionary _precursorScanNumbers = new Dictionary();
+
+ //Precursor information for scans
+ private protected Dictionary _precursorTree = new Dictionary();
+
+ // Filter string regex to extract an isoaltion entry
+ private protected readonly Regex _filterStringIsolationMzPattern = new Regex(@"ms\d+ (.+?) \[");
+
///
/// Constructor.
///
@@ -50,6 +62,8 @@ protected SpectrumWriter(ParseInput parseInput)
{
ParseInput = parseInput;
precursorCache = new LimitedSizeDictionary(10);
+ _precursorScanNumbers[""] = -1;
+ _precursorTree[-1] = new PrecursorInfo();
}
///
@@ -68,42 +82,27 @@ protected void ConfigureWriter(string extension)
return;
}
- if (ParseInput.OutputFile == null)
+ var fileName = NormalizeFileName(ParseInput.OutputFile, extension, ParseInput.Gzip);
+ if (ParseInput.OutputFormat == OutputFormat.Parquet)
{
- var fullExtension = ParseInput.Gzip ? extension + ".gz" : extension;
- if (!ParseInput.Gzip || ParseInput.OutputFormat == OutputFormat.IndexMzML)
- {
- Writer = File.CreateText(ParseInput.OutputDirectory + "//" +
- ParseInput.RawFileNameWithoutExtension +
- extension);
- }
- else
- {
- var fileStream = File.Create(ParseInput.OutputDirectory + "//" +
- ParseInput.RawFileNameWithoutExtension + fullExtension);
- var compress = new GZipStream(fileStream, CompressionMode.Compress);
- Writer = new StreamWriter(compress);
- }
+ Writer = new StreamWriter(File.Create(fileName));
+ }
+ else if (!ParseInput.Gzip || ParseInput.OutputFormat == OutputFormat.IndexMzML)
+ {
+ Writer = File.CreateText(fileName);
}
else
{
- var fileName = NormalizeFileName(ParseInput.OutputFile, extension, ParseInput.Gzip);
- if (!ParseInput.Gzip || ParseInput.OutputFormat == OutputFormat.IndexMzML)
- {
- Writer = File.CreateText(fileName);
- }
- else
- {
- var fileStream = File.Create(fileName);
- var compress = new GZipStream(fileStream, CompressionMode.Compress);
- Writer = new StreamWriter(compress);
- }
+ var fileStream = File.Create(fileName);
+ var compress = new GZipStream(fileStream, CompressionMode.Compress);
+ Writer = new StreamWriter(compress);
}
+
}
private string NormalizeFileName(string outputFile, string extension, bool gzip)
{
- string result = outputFile;
+ string result = outputFile == null ? Path.Combine(ParseInput.OutputDirectory, ParseInput.RawFileNameWithoutExtension) : outputFile;
string tail = "";
string[] extensions;
@@ -267,5 +266,67 @@ public static IReaction GetReaction(IScanEvent scanEvent, int scanNumber)
return precursorIntensity;
}
+
+ private protected int GetParentFromScanString(string scanString)
+ {
+ var parts = Regex.Split(scanString, " ");
+
+ //find the position of the first (from the end) precursor with a different mass
+ //to account for possible supplementary activations written in the filter
+ var lastIonMass = parts.Last().Split('@').First();
+ int last = parts.Length;
+ while (last > 0 &&
+ parts[last - 1].Split('@').First() == lastIonMass)
+ {
+ last--;
+ }
+
+ string parentFilter = String.Join(" ", parts.Take(last));
+ if (_precursorScanNumbers.ContainsKey(parentFilter))
+ {
+ return _precursorScanNumbers[parentFilter];
+ }
+
+ return -2; //unsuccessful parsing
+ }
+
+ private protected int FindLastReaction(IScanEvent scanEvent, int msLevel)
+ {
+ int lastReactionIndex = msLevel - 2;
+
+ //iteratively trying find the last available index for reaction
+ while (true)
+ {
+ try
+ {
+ scanEvent.GetReaction(lastReactionIndex + 1);
+ }
+ catch (ArgumentOutOfRangeException)
+ {
+ //stop trying
+ break;
+ }
+
+ lastReactionIndex++;
+ }
+
+ //supplemental activation flag is on -> one of the levels (not necissirily the last one) used supplemental activation
+ //check last two activations
+ if (scanEvent.SupplementalActivation == TriState.On)
+ {
+ var lastActivation = scanEvent.GetReaction(lastReactionIndex).ActivationType;
+ var beforeLastActivation = scanEvent.GetReaction(lastReactionIndex - 1).ActivationType;
+
+ if ((beforeLastActivation == ActivationType.ElectronTransferDissociation || beforeLastActivation == ActivationType.ElectronCaptureDissociation) &&
+ (lastActivation == ActivationType.CollisionInducedDissociation || lastActivation == ActivationType.HigherEnergyCollisionalDissociation))
+ return lastReactionIndex - 1; //ETD or ECD followed by HCD or CID -> supplemental activation in the last level (move the last reaction one step back)
+ else
+ return lastReactionIndex;
+ }
+ else //just use the last one
+ {
+ return lastReactionIndex;
+ }
+ }
}
}
\ No newline at end of file