Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions MainClass.cs
Original file line number Diff line number Diff line change
Expand Up @@ -768,6 +768,9 @@ private static void RegularParametersParsing(string[] args)
if (parseInput.OutputFormat == OutputFormat.IndexMzML) parseInput.OutputFormat = OutputFormat.MzML;
}

// Switch off gzip compression for Parquet
if (parseInput.OutputFormat == OutputFormat.Parquet) parseInput.Gzip = false;

parseInput.MaxLevel = parseInput.MsLevel.Max();

if (parseInput.S3Url != null && parseInput.S3AccessKeyId != null &&
Expand Down
1 change: 1 addition & 0 deletions ThermoRawFileParserTest/ThermoRawFileParserTest.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="Parquet.Net" Version="5.0.1" />
</ItemGroup>

<ItemGroup>
Expand Down
54 changes: 54 additions & 0 deletions ThermoRawFileParserTest/WriterTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Xml.Serialization;
using IO.Mgf;
using NUnit.Framework;
using Parquet;
using ThermoRawFileParser;
using ThermoRawFileParser.Writer.MzML;

Expand Down Expand Up @@ -281,5 +282,58 @@ public void TestMzML_MS2()

Assert.That(testMzMl.run.chromatogramList.chromatogram[0].defaultArrayLength, Is.EqualTo(95));
}

/// <summary>
/// Converts the bundled <c>Data/small.RAW</c> file to Parquet using the default
/// <see cref="ParseInput"/> settings and verifies the "scan" column of the first row group.
/// </summary>
[Test]
public async System.Threading.Tasks.Task TestParquetCentroid()
{
    // Get temp path for writing the test Parquet file
    var tempFilePath = Path.GetTempPath();

    var testRawFile = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Data/small.RAW");
    var parseInput = new ParseInput(testRawFile, null, tempFilePath, OutputFormat.Parquet);

    RawFileParser.Parse(parseInput);

    // Actual test
    var parquetFilePath = Path.Combine(tempFilePath, "small.mzparquet");

    // Await the reader instead of blocking on .Result, so a failure is reported as the
    // original exception rather than an AggregateException wrapper.
    using (var parquetReader = await ParquetReader.CreateAsync(parquetFilePath))
    {
        var groupReader = parquetReader.OpenRowGroupReader(0);
        var schema = parquetReader.Schema;
        var scanColumn = await groupReader.ReadColumnAsync(schema.FindDataField("scan"));

        // Total number of values in the "scan" column
        Assert.That(scanColumn.NumValues, Is.EqualTo(48520));
        // Number of distinct scan values recorded in the column statistics
        Assert.That(scanColumn.Statistics.DistinctCount, Is.EqualTo(48));
        // Number of rows whose scan value equals 22
        Assert.That((from int p in scanColumn.Data where p == 22 select p).Count(), Is.EqualTo(1632));
    }
}

/// <summary>
/// Converts the bundled <c>Data/small.RAW</c> file to Parquet with
/// <c>NoPeakPicking</c> set to { 1, 2 } and verifies the "scan" column of the first row group.
/// </summary>
[Test]
public async System.Threading.Tasks.Task TestParquetProfile()
{
    // Get temp path for writing the test Parquet file
    var tempFilePath = Path.GetTempPath();

    var testRawFile = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Data/small.RAW");
    var parseInput = new ParseInput(testRawFile, null, tempFilePath, OutputFormat.Parquet);
    // NOTE(review): the values are presumably MS levels for which peak picking is skipped - confirm
    parseInput.NoPeakPicking = new HashSet<int> { 1, 2 };

    RawFileParser.Parse(parseInput);

    // Actual test
    var parquetFilePath = Path.Combine(tempFilePath, "small.mzparquet");

    // Await the reader instead of blocking on .Result, so a failure is reported as the
    // original exception rather than an AggregateException wrapper.
    using (var parquetReader = await ParquetReader.CreateAsync(parquetFilePath))
    {
        var groupReader = parquetReader.OpenRowGroupReader(0);
        var schema = parquetReader.Schema;
        var scanColumn = await groupReader.ReadColumnAsync(schema.FindDataField("scan"));

        // Total number of values in the "scan" column
        Assert.That(scanColumn.NumValues, Is.EqualTo(305213));
        // Number of distinct scan values recorded in the column statistics
        Assert.That(scanColumn.Statistics.DistinctCount, Is.EqualTo(48));
        // Number of rows whose scan value equals 22
        Assert.That((from int p in scanColumn.Data where p == 22 select p).Count(), Is.EqualTo(17758));
    }
}
}
}
30 changes: 3 additions & 27 deletions Writer/MgfSpectrumWriter.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Reflection;
using System.Text.RegularExpressions;
using log4net;
using ThermoFisher.CommonCore.Data.Business;
using ThermoFisher.CommonCore.Data.FilterEnums;
Expand All @@ -19,15 +16,9 @@ public class MgfSpectrumWriter : SpectrumWriter
private const string PositivePolarity = "+";
private const string NegativePolarity = "-";

// Filter string
private readonly Regex _filterStringIsolationMzPattern = new Regex(@"ms\d+ (.+?) \[");

// Precursor scan number for MSn scans
private int _precursorScanNumber;

// Precursor scan number (value) and isolation m/z (key) for reference in the precursor element of an MSn spectrum
private readonly Dictionary<string, int> _precursorScanNumbers = new Dictionary<string, int>();

public MgfSpectrumWriter(ParseInput parseInput) : base(parseInput)
{
ParseInput.MsLevel.Remove(1); // MS1 spectra are not supposed to be in MGF
Expand Down Expand Up @@ -126,23 +117,7 @@ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastSc
}
else //try getting it from the scan filter
{
var parts = Regex.Split(result.Groups[1].Value, " ");

//find the position of the first (from the end) precursor with a different mass
//to account for possible supplementary activations written in the filter
var lastIonMass = parts.Last().Split('@').First();
int last = parts.Length;
while (last > 0 &&
parts[last - 1].Split('@').First() == lastIonMass)
{
last--;
}

string parentFilter = String.Join(" ", parts.Take(last));
if (_precursorScanNumbers.ContainsKey(parentFilter))
{
_precursorScanNumber = _precursorScanNumbers[parentFilter];
}
_precursorScanNumber = GetParentFromScanString(result.Groups[1].Value);
}

if (_precursorScanNumber > 0)
Expand All @@ -151,7 +126,8 @@ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastSc
}
else
{
Log.Error($"Failed finding precursor for {scanNumber}");
Log.Error($"Cannot find precursor scan for scan# {scanNumber}");
_precursorTree[-2] = new PrecursorInfo(0, msLevel, FindLastReaction(scanEvent, msLevel), null);
ParseInput.NewError();
}
}
Expand Down
92 changes: 2 additions & 90 deletions Writer/MzMlSpectrumWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ public class MzMlSpectrumWriter : SpectrumWriter
private static readonly ILog Log =
LogManager.GetLogger(MethodBase.GetCurrentMethod().DeclaringType);

private readonly Regex _filterStringIsolationMzPattern = new Regex(@"ms\d+ (.+?) \[");

// Tune version < 3 produces multiple trailer entries like "SPS Mass [number]"
private readonly Regex _spSentry = new Regex(@"SPS Mass\s+\d+:");

Expand All @@ -45,12 +43,6 @@ public class MzMlSpectrumWriter : SpectrumWriter
private readonly Dictionary<IonizationModeType, CVParamType> _ionizationTypes =
new Dictionary<IonizationModeType, CVParamType>();

// Precursor scan number (value) and isolation m/z (key) for reference in the precursor element of an MSn spectrum
private readonly Dictionary<string, int> _precursorScanNumbers = new Dictionary<string, int>();

//Precursor information for scans
private Dictionary<int, PrecursorInfo> _precursorTree = new Dictionary<int, PrecursorInfo>();

private const string SourceFileId = "RAW1";
private readonly XmlSerializerFactory _factory = new XmlSerializerFactory();
private const string Ns = "http://psi.hupo.org/ms/mzml";
Expand All @@ -68,8 +60,6 @@ public MzMlSpectrumWriter(ParseInput parseInput) : base(parseInput)
_mzMlNamespace.Add(string.Empty, "http://psi.hupo.org/ms/mzml");
_doIndexing = ParseInput.OutputFormat == OutputFormat.IndexMzML;
_osOffset = Environment.NewLine == "\n" ? 0 : 1;
_precursorScanNumbers[""] = -1;
_precursorTree[-1] = new PrecursorInfo();
}

/// <inheritdoc />
Expand Down Expand Up @@ -639,7 +629,6 @@ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastSc

_writer.WriteValue(BitConverter.ToString(hash).Replace("-", "").ToLowerInvariant());
_writer.WriteEndElement(); // fileChecksum

_writer.WriteEndElement(); // indexedmzML
}

Expand All @@ -652,21 +641,6 @@ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastSc

Writer.Flush();
Writer.Close();

//This section is not necessary?
/*if (_doIndexing)
{
try
{
cryptoStream.Flush();
cryptoStream.Close();
}
catch (System.ObjectDisposedException e)
{
// Cannot access a closed file. CryptoStream was already closed when closing _writer
Log.Warn($"Warning: {e.Message}");
}
}*/
}

// In case of indexed mzML, change the extension from xml to mzML and check for the gzip option
Expand Down Expand Up @@ -1286,7 +1260,7 @@ private SpectrumType ConstructMSSpectrum(int scanNumber)
int? charge = trailerData.AsPositiveInt("Charge State:");
double? monoisotopicMz = trailerData.AsDouble("Monoisotopic M/Z:");
double? ionInjectionTime = trailerData.AsDouble("Ion Injection Time (ms):");
double? isolationWidth = trailerData.AsDouble("MS" + (int) scanFilter.MSOrder + " Isolation Width:");
double? isolationWidth = trailerData.AsDouble("MS" + msLevel + " Isolation Width:");
double? FAIMSCV = null;
if (trailerData.AsBool("FAIMS Voltage On:").GetValueOrDefault(false))
FAIMSCV = trailerData.AsDouble("FAIMS CV:");
Expand Down Expand Up @@ -1374,6 +1348,7 @@ private SpectrumType ConstructMSSpectrum(int scanNumber)
{
Log.Warn($"Cannot find precursor scan for scan# {scanNumber}");
_precursorTree[-2] = new PrecursorInfo(0, msLevel, FindLastReaction(scanEvent, msLevel), new PrecursorType[0]);
ParseInput.NewWarn();
}

try
Expand Down Expand Up @@ -1938,46 +1913,6 @@ private SpectrumType ConstructMSSpectrum(int scanNumber)

return spectrum;
}

/// <summary>
/// Finds the index of the reaction to use as the last one for a scan event.
/// </summary>
/// <param name="scanEvent">Scan event whose reactions are probed.</param>
/// <param name="msLevel">MS level of the scan; probing starts at index <c>msLevel - 2</c>.</param>
/// <returns>
/// The highest reaction index that <c>GetReaction</c> accepts (starting from <c>msLevel - 2</c>),
/// moved one step back when the last two reactions are ETD/ECD followed by CID/HCD
/// and the supplemental activation flag is on.
/// </returns>
private int FindLastReaction(IScanEvent scanEvent, int msLevel)
{
    int lastReactionIndex = msLevel - 2;

    // Iteratively probe for the last available reaction index;
    // GetReaction signals the end of the list by throwing.
    while (true)
    {
        try
        {
            scanEvent.GetReaction(lastReactionIndex + 1);
        }
        catch (ArgumentOutOfRangeException)
        {
            // stop trying
            break;
        }

        lastReactionIndex++;
    }

    // Supplemental activation flag is on -> one of the levels (not necessarily the last one)
    // used supplemental activation, so check the last two activations.
    // At least two reactions are required; otherwise there is no "before last" reaction
    // to inspect and indexing it would fail.
    if (scanEvent.SupplementalActivation == TriState.On && lastReactionIndex > 0)
    {
        var lastActivation = scanEvent.GetReaction(lastReactionIndex).ActivationType;
        var beforeLastActivation = scanEvent.GetReaction(lastReactionIndex - 1).ActivationType;

        bool electronBasedFirst =
            beforeLastActivation == ActivationType.ElectronTransferDissociation ||
            beforeLastActivation == ActivationType.ElectronCaptureDissociation;
        bool collisionalSecond =
            lastActivation == ActivationType.CollisionInducedDissociation ||
            lastActivation == ActivationType.HigherEnergyCollisionalDissociation;

        // ETD or ECD followed by HCD or CID -> supplemental activation in the last level
        // (move the last reaction one step back)
        if (electronBasedFirst && collisionalSecond)
        {
            return lastReactionIndex - 1;
        }
    }

    // Otherwise just use the last one
    return lastReactionIndex;
}

private SpectrumType ConstructPDASpectrum(int scanNumber, int instrumentNumber)
{
// Get each scan from the RAW file
Expand Down Expand Up @@ -2558,29 +2493,6 @@ private PrecursorListType ConstructPrecursorList(int precursorScanNumber, IScanE

}

/// <summary>
/// Looks up the precursor scan number for an MSn scan from the precursor part of its filter string.
/// </summary>
/// <param name="scanString">Space-separated precursor entries; the part before '@' in each entry is compared as the ion mass.</param>
/// <returns>
/// The scan number registered in <c>_precursorScanNumbers</c> for the parent filter,
/// or -2 when no matching parent filter is registered.
/// </returns>
private int GetParentFromScanString(string scanString)
{
    var parts = Regex.Split(scanString, " ");

    // Find the position of the first (from the end) precursor with a different mass
    // to account for possible supplementary activations written in the filter
    var lastIonMass = parts.Last().Split('@').First();
    int last = parts.Length;
    while (last > 0 &&
           parts[last - 1].Split('@').First() == lastIonMass)
    {
        last--;
    }

    // The remaining leading entries form the key under which the parent scan was registered
    string parentFilter = String.Join(" ", parts.Take(last));

    // Single lookup instead of ContainsKey followed by the indexer
    if (_precursorScanNumbers.TryGetValue(parentFilter, out var parentScanNumber))
    {
        return parentScanNumber;
    }

    return -2; //unsuccessful parsing
}

/// <summary>
/// Populate the scan list element. Full version used for mass spectra,
/// having Scan Event, scan Filter etc
Expand Down
Loading