From d084f5782dd53a631283bdcf1f7f7397215c726b Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 15 May 2024 13:15:46 -0700 Subject: [PATCH 01/47] Update parameter name --- singlecell/resources/chunks/AvgExpression.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singlecell/resources/chunks/AvgExpression.R b/singlecell/resources/chunks/AvgExpression.R index b1086e72e..78426ff12 100644 --- a/singlecell/resources/chunks/AvgExpression.R +++ b/singlecell/resources/chunks/AvgExpression.R @@ -32,7 +32,7 @@ GenerateAveragedData <- function(seuratObj, groupFields, addMetadata) { } } - a <- CellMembrane::PseudobulkSeurat(seuratObj, groupFields = groupFields, assays = assayName, additionalFieldsToAggregate = additionalFieldsToAggregate, nCountRnaStratification = nCountRnaStratification) + a <- CellMembrane::PseudobulkSeurat(seuratObj, groupFields = groupFields, assayToAggregate = assayName, additionalFieldsToAggregate = additionalFieldsToAggregate, nCountRnaStratification = nCountRnaStratification) if (addMetadata) { a <- Rdiscvr::QueryAndApplyMetadataUsingCDNA(a) From 12cc0118533efb6149801e17da40421ad0631367 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 22 May 2024 09:46:54 -0700 Subject: [PATCH 02/47] Ensure savedSeuratObjects.txt file has unique name per job --- .../pipeline/AbstractSingleCellPipelineStep.java | 8 +++++++- singlecell/resources/chunks/Functions.R | 15 ++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/singlecell/api-src/org/labkey/api/singlecell/pipeline/AbstractSingleCellPipelineStep.java b/singlecell/api-src/org/labkey/api/singlecell/pipeline/AbstractSingleCellPipelineStep.java index 8e101a5c6..09b7f64fb 100644 --- a/singlecell/api-src/org/labkey/api/singlecell/pipeline/AbstractSingleCellPipelineStep.java +++ b/singlecell/api-src/org/labkey/api/singlecell/pipeline/AbstractSingleCellPipelineStep.java @@ -43,12 +43,17 @@ public AbstractSingleCellPipelineStep(PipelineStepProvider provider, Pipeline super(provider, ctx); } + private String getSavedSeuratFileName(String outputPrefix) + { + return outputPrefix + ".savedSeuratObjects.txt"; + } + @Override public Output execute(SequenceOutputHandler.JobContext ctx, List inputObjects, String outputPrefix) throws PipelineJobException { SingleCellOutput output = new SingleCellOutput(); - File tracker = new File(ctx.getOutputDir(), "savedSeuratObjects.txt"); + File tracker = new File(ctx.getOutputDir(), getSavedSeuratFileName(outputPrefix)); if (tracker.exists()) { ctx.getLogger().debug("deleting pre-existing file: " + tracker.getName()); @@ -481,6 +486,7 @@ protected Chunk createParamChunk(SequenceOutputHandler.JobContext ctx, List Date: Thu, 23 May 2024 08:33:31 -0700 Subject: [PATCH 03/47] Fix typo --- .../api/singlecell/pipeline/AbstractSingleCellPipelineStep.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singlecell/api-src/org/labkey/api/singlecell/pipeline/AbstractSingleCellPipelineStep.java b/singlecell/api-src/org/labkey/api/singlecell/pipeline/AbstractSingleCellPipelineStep.java index 09b7f64fb..d97f01de3 100644 --- a/singlecell/api-src/org/labkey/api/singlecell/pipeline/AbstractSingleCellPipelineStep.java +++ b/singlecell/api-src/org/labkey/api/singlecell/pipeline/AbstractSingleCellPipelineStep.java @@ -486,7 +486,7 @@ protected Chunk createParamChunk(SequenceOutputHandler.JobContext ctx, List Date: Tue, 28 May 2024 20:46:01 -0700 Subject: [PATCH 04/47] Add link to single cell management page --- singlecell/resources/views/singleCellDataManagement.html | 3 
+++ 1 file changed, 3 insertions(+) diff --git a/singlecell/resources/views/singleCellDataManagement.html b/singlecell/resources/views/singleCellDataManagement.html index e6bdedb9d..f6d6b4349 100644 --- a/singlecell/resources/views/singleCellDataManagement.html +++ b/singlecell/resources/views/singleCellDataManagement.html @@ -36,6 +36,9 @@ },{ name: 'VLoupe Files Needing TCR Import', url: LABKEY.ActionURL.buildURL('query', 'executeQuery.view', null, {schemaName: 'sequenceanalysis', queryName: 'outputfiles', 'query.readset/status~isblank': null, 'query.readset/numTcrResults~eq': 0, 'query.category~eq': '10x VLoupe', 'query.sort': 'readset/name'}) + },{ + name: 'Stim Data Needing Activation Data', + url: LABKEY.ActionURL.buildURL('query', 'executeQuery.view', null, {schemaName: 'sequenceanalysis', queryName: 'outputfiles', 'query.readset/assaytypes~contains': 'stim', 'query.readset/numTcrResults~eq': 0, 'query.category~eq': 'Seurat Object Prototype', 'query.readset/outputFileTypes~doesnotcontain': 'TCell Activation', 'query.sort': 'readset/name'}) },{ name: 'Loupe Files Missing Prototype', url: LABKEY.ActionURL.buildURL('query', 'executeQuery.view', null, {schemaName: 'sequenceanalysis', queryName: 'outputfiles', 'query.category~eq': '10x Loupe File', 'query.readset/status~isblank': null, 'query.readset/cDNA/rowid~isnonblank': null, 'query.readset/outputFileTypes~doesnotcontain': 'Prototype', 'query.sort': 'readset/name', 'query.maxRows': 250}) From 1549b6293d51bc8fe3445b078db62ac60cf1d975 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 29 May 2024 13:49:22 -0700 Subject: [PATCH 05/47] Add support for useLeiden --- singlecell/resources/chunks/FindClustersAndDimRedux.R | 2 +- .../pipeline/singlecell/FindClustersAndDimRedux.java | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/singlecell/resources/chunks/FindClustersAndDimRedux.R b/singlecell/resources/chunks/FindClustersAndDimRedux.R index b159fd756..f002dd905 100644 --- a/singlecell/resources/chunks/FindClustersAndDimRedux.R +++ b/singlecell/resources/chunks/FindClustersAndDimRedux.R @@ -2,7 +2,7 @@ for (datasetId in names(seuratObjects)) { printName(datasetId) seuratObj <- readSeuratRDS(seuratObjects[[datasetId]]) - seuratObj <- CellMembrane::FindClustersAndDimRedux(seuratObj, minDimsToUse = minDimsToUse) + seuratObj <- CellMembrane::FindClustersAndDimRedux(seuratObj, minDimsToUse = minDimsToUse, useLeiden = useLeiden) saveData(seuratObj, datasetId) diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/FindClustersAndDimRedux.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/FindClustersAndDimRedux.java index a346e3c87..81f6edb73 100644 --- a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/FindClustersAndDimRedux.java +++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/FindClustersAndDimRedux.java @@ -24,8 +24,10 @@ public Provider() SeuratToolParameter.create("minDimsToUse", "Min. 
PCs to Use", "The minimum number of PCs to use", "ldk-integerfield", new JSONObject() {{ put("minValue", 0); - }}, 15) + }}, 15), + SeuratToolParameter.create("useLeiden", "Use Leiden Clustering", "If true, FindClusters() will use algorith=4 (leiden), as opposed to the default (louvain)", "checkbox", new JSONObject(){{ + }}, false) ), null, null); } From ee4fe706b9eba91e63363cb12a3bce6f3579fe76 Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 3 Jun 2024 14:34:17 -0700 Subject: [PATCH 06/47] Update default for requireRiraImmune --- .../singlecell/pipeline/singlecell/CheckExpectations.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/CheckExpectations.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/CheckExpectations.java index fcffcac5a..377bec741 100644 --- a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/CheckExpectations.java +++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/CheckExpectations.java @@ -34,7 +34,7 @@ public Provider() SeuratToolParameter.create("requireSaturation", "Require Per-Cell Saturation", "If this dataset uses TCR sequencing, these data are required", "checkbox", null, true), SeuratToolParameter.create("requireSingleR", "Require SingleR", "If checked, SingleR calls, including singleRConsensus are required to pass", "checkbox", null, true), SeuratToolParameter.create("requireScGate", "Require scGate", "If checked, scGateConsensus calls are required to pass", "checkbox", null, true), - SeuratToolParameter.create("requireRiraImmune", "Require RIRA Immune V2", "If checked, RIRA_Immune_v2 calls (field RIRA_Immune_v2.cellclass) are required to pass", "checkbox", null, true) + SeuratToolParameter.create("requireRiraImmune", "Require RIRA Immune V2", "If checked, RIRA_Immune_v2 calls (field RIRA_Immune_v2.cellclass) are required to pass", "checkbox", null, false) ), null, null); } From b667a9a051758a47c600deb0c3bf0241bfd83e1c Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 5 Jun 2024 12:16:40 -0700 Subject: [PATCH 07/47] Update nimble arguments --- .../singlecell/run/NimbleAlignmentStep.java | 3 --- .../org/labkey/singlecell/run/NimbleHelper.java | 16 ---------------- 2 files changed, 19 deletions(-) diff --git a/singlecell/src/org/labkey/singlecell/run/NimbleAlignmentStep.java b/singlecell/src/org/labkey/singlecell/run/NimbleAlignmentStep.java index 6f12b12bb..ba5044721 100644 --- a/singlecell/src/org/labkey/singlecell/run/NimbleAlignmentStep.java +++ b/singlecell/src/org/labkey/singlecell/run/NimbleAlignmentStep.java @@ -57,9 +57,6 @@ public static List getToolParameters() put("initialValues", "unstranded"); put("delimiter", ";"); }}, null), - ToolParameterDescriptor.create(ALIGN_OUTPUT, "Create Alignment/Debug Output", "If checked, an alignment-level summary TSV will be created", "checkbox", new JSONObject(){{ - put("checked", true); - }}, true), ToolParameterDescriptor.create(MAX_HITS_TO_REPORT, "Max Hits To Report", "If a given hit has more than this number of references, it is discarded", "ldk-integerfield", new JSONObject(){{ put("minValue", 0); }}, 4) diff --git a/singlecell/src/org/labkey/singlecell/run/NimbleHelper.java b/singlecell/src/org/labkey/singlecell/run/NimbleHelper.java index 042960cc4..8c91512bd 100644 --- a/singlecell/src/org/labkey/singlecell/run/NimbleHelper.java +++ b/singlecell/src/org/labkey/singlecell/run/NimbleHelper.java @@ -43,7 +43,6 @@ import java.util.Map; import java.util.stream.Collectors; -import static 
org.labkey.singlecell.run.NimbleAlignmentStep.ALIGN_OUTPUT; import static org.labkey.singlecell.run.NimbleAlignmentStep.MAX_HITS_TO_REPORT; import static org.labkey.singlecell.run.NimbleAlignmentStep.REF_GENOMES; import static org.labkey.singlecell.run.NimbleAlignmentStep.STRANDEDNESS; @@ -425,14 +424,6 @@ private Map doAlignment(List genomes, List doAlignment(List genomes, List Date: Fri, 7 Jun 2024 10:42:23 -0700 Subject: [PATCH 08/47] Add Classify_Myeloid to RIRA step --- singlecell/resources/chunks/RunRiraClassification.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/singlecell/resources/chunks/RunRiraClassification.R b/singlecell/resources/chunks/RunRiraClassification.R index 106bf8426..986208e64 100644 --- a/singlecell/resources/chunks/RunRiraClassification.R +++ b/singlecell/resources/chunks/RunRiraClassification.R @@ -4,8 +4,9 @@ for (datasetId in names(seuratObjects)) { seuratObj <- RIRA::Classify_ImmuneCells(seuratObj, maxBatchSize = maxBatchSize, retainProbabilityMatrix = retainProbabilityMatrix) seuratObj <- RIRA::Classify_TNK(seuratObj, maxBatchSize = maxBatchSize, retainProbabilityMatrix = retainProbabilityMatrix) - seuratObj$RIRA_TNK_v2.predicted_labels[seuratObj$RIRA_Immune_v2.majority_voting != 'T_NK'] <- 'Other' + seuratObj <- RIRA::Classify_Myeloid(seuratObj, maxBatchSize = maxBatchSize, retainProbabilityMatrix = retainProbabilityMatrix) + saveData(seuratObj, datasetId) } \ No newline at end of file From e776fd2e96a960a618b8e5c5ea1cf4fa0e9bb1da Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 10 Jun 2024 07:44:58 -0700 Subject: [PATCH 09/47] Support GATK SVAnnotateStep --- .../SequenceAnalysisModule.java | 2 + .../run/alignment/StarWrapper.java | 2 +- .../run/util/SVAnnotateStep.java | 104 ++++++++++++++++++ .../run/variant/SelectSNVsStep.java | 2 +- 4 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/SVAnnotateStep.java diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java index 1718c9433..d9f027b02 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java @@ -113,6 +113,7 @@ import org.labkey.sequenceanalysis.run.util.FastqcRunner; import org.labkey.sequenceanalysis.run.util.GenomicsDBAppendHandler; import org.labkey.sequenceanalysis.run.util.GenomicsDBImportHandler; +import org.labkey.sequenceanalysis.run.util.SVAnnotateStep; import org.labkey.sequenceanalysis.run.variant.*; import org.labkey.sequenceanalysis.util.Barcoder; import org.labkey.sequenceanalysis.util.ChainFileValidator; @@ -300,6 +301,7 @@ public static void registerPipelineSteps() SequencePipelineService.get().registerPipelineStep(new MendelianViolationReportStep.Provider()); SequencePipelineService.get().registerPipelineStep(new SummarizeGenotypeQualityStep.Provider()); SequencePipelineService.get().registerPipelineStep(new BcftoolsFillTagsStep.Provider()); + SequencePipelineService.get().registerPipelineStep(new SVAnnotateStep.Provider()); //handlers SequenceAnalysisService.get().registerFileHandler(new LiftoverHandler()); diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/StarWrapper.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/StarWrapper.java index d5a0cd1f2..38c635908 100644 --- 
a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/StarWrapper.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/StarWrapper.java @@ -53,7 +53,7 @@ public StarWrapper(@Nullable Logger logger) public static class StarAlignmentStep extends AbstractAlignmentPipelineStep implements AlignmentStep { - public StarAlignmentStep(AlignmentStepProvider provider, PipelineContext ctx) + public StarAlignmentStep(AlignmentStepProvider provider, PipelineContext ctx) { super(provider, ctx, new StarWrapper(ctx.getLogger())); } diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/SVAnnotateStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/SVAnnotateStep.java new file mode 100644 index 000000000..cac32155a --- /dev/null +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/SVAnnotateStep.java @@ -0,0 +1,104 @@ +package org.labkey.sequenceanalysis.run.util; + +import htsjdk.samtools.util.Interval; +import org.apache.logging.log4j.Logger; +import org.json.JSONObject; +import org.labkey.api.pipeline.PipelineJobException; +import org.labkey.api.sequenceanalysis.pipeline.AbstractVariantProcessingStepProvider; +import org.labkey.api.sequenceanalysis.pipeline.PipelineContext; +import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider; +import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; +import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor; +import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep; +import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStepOutputImpl; +import org.labkey.api.sequenceanalysis.run.AbstractCommandPipelineStep; +import org.labkey.api.sequenceanalysis.run.AbstractGatk4Wrapper; +import org.labkey.api.util.PageFlowUtil; +import org.labkey.sequenceanalysis.pipeline.SequenceTaskHelper; + +import javax.annotation.Nullable; +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class SVAnnotateStep extends AbstractCommandPipelineStep implements VariantProcessingStep +{ + public static final String GENE_PARAM = "gene_file"; + + public SVAnnotateStep(PipelineStepProvider provider, PipelineContext ctx) + { + super(provider, ctx, new SNAnnotateWrapper(ctx.getLogger())); + } + + public static class Provider extends AbstractVariantProcessingStepProvider + { + public Provider() + { + super("SVAnnotateStep", "GATK SVAnnotate", "GATK", "This will run GATK's SVAnnotate to classify SVs by impact", Arrays.asList( + ToolParameterDescriptor.createExpDataParam(GENE_PARAM, "Gene File", "This is the ID of a GTF or GFF3 file containing genes from this genome.", "sequenceanalysis-genomefileselectorfield", new JSONObject() + {{ + put("extensions", Arrays.asList("gtf")); + put("width", 400); + put("allowBlank", false); + }}, null) + ), PageFlowUtil.set("sequenceanalysis/field/GenomeFileSelectorField.js"), ""); + + } + + @Override + public SVAnnotateStep create(PipelineContext ctx) + { + return new SVAnnotateStep(this, ctx); + } + } + + @Override + public Output processVariants(File inputVCF, File outputDirectory, ReferenceGenome genome, @Nullable List intervals) throws PipelineJobException + { + VariantProcessingStepOutputImpl output = new VariantProcessingStepOutputImpl(); + + output.addInput(inputVCF, "Input VCF"); + output.addInput(genome.getWorkingFastaFile(), "Reference Genome"); + + List args = new ArrayList<>(getWrapper().getBaseArgs("SVAnnotate")); + args.add("-V"); + args.add(inputVCF.getPath()); + + 
if (intervals != null)
+        {
+            intervals.forEach(interval -> {
+                args.add("-L");
+                args.add(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
+            });
+        }
+
+        Integer geneFileId = getProvider().getParameterByName(GENE_PARAM).extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Integer.class);
+        File geneFile = getPipelineCtx().getSequenceSupport().getCachedData(geneFileId);
+        if (!geneFile.exists())
+        {
+            throw new PipelineJobException("Unable to find file: " + geneFile.getPath());
+        }
+        args.add("--protein-coding-gtf");
+        args.add(geneFile.getPath());
+
+        File outputVcf = new File(outputDirectory, SequenceTaskHelper.getUnzippedBaseName(inputVCF) + ".svannotate.vcf.gz");
+        args.add("-O");
+        args.add(outputVcf.getPath());
+
+        getWrapper().execute(args);
+        if (!outputVcf.exists())
+        {
+            throw new PipelineJobException("output not found: " + outputVcf);
+        }
+
+        output.setVcf(outputVcf);
+
+        return output;
+    }
+
+    public static class SNAnnotateWrapper extends AbstractGatk4Wrapper
+    {
+        public SNAnnotateWrapper(@Nullable Logger logger)
+        {
+            super(logger);
+        }
+    }
+}
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SelectSNVsStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SelectSNVsStep.java
index ea057d26d..a94a218b7 100644
--- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SelectSNVsStep.java
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SelectSNVsStep.java
@@ -33,7 +33,7 @@ public class SelectSNVsStep extends AbstractCommandPipelineStep provider, PipelineContext ctx)
     {
         super(provider, ctx, new SelectVariantsWrapper(ctx.getLogger()));
     }

From 0aaf6e861f1ad0af958cc5e89eb79bf07f50922c Mon Sep 17 00:00:00 2001
From: bbimber
Date: Tue, 11 Jun 2024 18:33:01 -0700
Subject: [PATCH 10/47] Drop unused bissnp code

---
 .../pipeline_code/sequence_tools_install.sh | 24 -------------------
 1 file changed, 24 deletions(-)

diff --git a/SequenceAnalysis/pipeline_code/sequence_tools_install.sh b/SequenceAnalysis/pipeline_code/sequence_tools_install.sh
index feef079c7..558d131c7 100755
--- a/SequenceAnalysis/pipeline_code/sequence_tools_install.sh
+++ b/SequenceAnalysis/pipeline_code/sequence_tools_install.sh
@@ -335,30 +335,6 @@ else
 fi
 
-#
-# BisSNP
-#
-echo ""
-echo ""
-echo "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
-echo "Install BisSNP"
-echo ""
-cd $LKSRC_DIR
-
-if [[ ! -e ${LKTOOLS_DIR}/BisSNP.jar || ! 
-z $FORCE_REINSTALL ]]; -then - echo "Cleaning up previous installs" - rm -Rf BisSNP* - rm -Rf $LKTOOLS_DIR/BisSNP.jar - - wget $WGET_OPTS https://downloads.sourceforge.net/project/bissnp/BisSNP-0.82.2/BisSNP-0.82.2.jar - - install ./BisSNP-0.82.2.jar $LKTOOLS_DIR/BisSNP.jar -else - echo "Already installed" -fi - - # #mosaik # From cfc31758af4756acb125c3976fa9627057672cbe Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 11 Jun 2024 18:34:21 -0700 Subject: [PATCH 11/47] Drop unused DockerHTCondorExecutionEngine code --- .../web/cluster/htcondor/pipelineConfig.xml | 21 ---- .../pipeline/AbstractClusterEngineConfig.java | 2 +- .../DockerHTCondorExecutionEngineConfig.java | 100 ------------------ 3 files changed, 1 insertion(+), 122 deletions(-) delete mode 100644 cluster/src/org/labkey/cluster/pipeline/DockerHTCondorExecutionEngineConfig.java diff --git a/cluster/resources/web/cluster/htcondor/pipelineConfig.xml b/cluster/resources/web/cluster/htcondor/pipelineConfig.xml index 44d6d328f..df0c3a663 100644 --- a/cluster/resources/web/cluster/htcondor/pipelineConfig.xml +++ b/cluster/resources/web/cluster/htcondor/pipelineConfig.xml @@ -89,27 +89,6 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/cluster/src/org/labkey/cluster/pipeline/AbstractClusterEngineConfig.java b/cluster/src/org/labkey/cluster/pipeline/AbstractClusterEngineConfig.java index 2ca6bd501..7d6b38fd0 100644 --- a/cluster/src/org/labkey/cluster/pipeline/AbstractClusterEngineConfig.java +++ b/cluster/src/org/labkey/cluster/pipeline/AbstractClusterEngineConfig.java @@ -109,7 +109,7 @@ protected List getFinalJavaOpts(PipelineJob job, RemoteExecutionEngine e return javaOpts; } - public List getJobArgs(File localPipelineDir, File localSerializedJobXmlFile, PipelineJob job, RemoteExecutionEngine engine) + public List getJobArgs(File localPipelineDir, File localSerializedJobXmlFile, PipelineJob job, RemoteExecutionEngine engine) { List ret = new ArrayList<>(); ret.addAll(getFinalJavaOpts(job, engine)); diff --git a/cluster/src/org/labkey/cluster/pipeline/DockerHTCondorExecutionEngineConfig.java b/cluster/src/org/labkey/cluster/pipeline/DockerHTCondorExecutionEngineConfig.java deleted file mode 100644 index 6293818ac..000000000 --- a/cluster/src/org/labkey/cluster/pipeline/DockerHTCondorExecutionEngineConfig.java +++ /dev/null @@ -1,100 +0,0 @@ -package org.labkey.cluster.pipeline; - -import org.labkey.api.pipeline.PipelineJob; -import org.labkey.api.pipeline.RemoteExecutionEngine; - -import java.io.File; -import java.util.ArrayList; -import java.util.List; - -/** - * Created by bimber on 10/31/2015. 
- */ -public class DockerHTCondorExecutionEngineConfig extends HTCondorExecutionEngineConfig -{ - private String _dockerImageName = "bbimber/discvr-seq"; - private String _configDir = ""; - protected String _activeMqHost = ""; - - public DockerHTCondorExecutionEngineConfig() - { - _remoteExecutable = "docker"; - _labKeyDir = "/labkey"; - } - - @Override - public List getExtraSubmitLines() - { - List ret = super.getExtraSubmitLines(); - ret.add("requirements = (TARGET.IsDockerComputeNode =?= True)"); - - return ret; - } - - @Override - public List getJobArgs(File localPipelineDir, File localSerializedJobFile, PipelineJob job, RemoteExecutionEngine engine) - { - List ret = new ArrayList<>(); - ret.add("run"); - ret.add("--rm=true"); - //TODO: add flag to force rebuild of image - //ret.add("-e"); - //ret.add("ACTIVEMQ_HOST=X"); - //TODO: mount whole file root - ret.add("-v"); - ret.add(getClusterPath(localPipelineDir) + ":/data"); - ret.add("-v"); - ret.add("/mnt/scratch:/work"); - ret.add("-v"); - ret.add(_configDir + ":/labkey/config"); - - ret.add("--add-host=activeMqServer:" + _activeMqHost); - //TODO: add -env for CPUs, memory? - ret.add(_dockerImageName); - ret.add("java"); - ret.addAll(getFinalJavaOpts(job, engine)); - - //TODO: support as config param - //ret.add("-Xdebug"); - //ret.add("-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5005"); - ret.add("-cp"); - ret.add("/labkey/labkeyBootstrap.jar"); - ret.add("org.labkey.bootstrap.ClusterBootstrap"); - ret.add("-modulesdir=" + "/labkey/modules"); - ret.add("-webappdir=" + "/labkey/labkeywebapp"); - ret.add("-configdir=" + "/labkey/config"); - ret.add(getClusterPath(localSerializedJobFile, true)); - - return ret; - } - - public String getDockerImageName() - { - return _dockerImageName; - } - - public void setDockerImageName(String dockerImageName) - { - _dockerImageName = dockerImageName; - } - - public String getConfigDir() - { - return _configDir; - } - - public void setConfigDir(String configDir) - { - _configDir = configDir; - } - - public String getActiveMqHost() - { - return _activeMqHost; - } - - public void setActiveMqHost(String activeMqHost) - { - _activeMqHost = activeMqHost; - } -} From d177bc45a95ec388576e19d4c27497c4a27695c9 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 12 Jun 2024 09:18:08 -0700 Subject: [PATCH 12/47] Correct parameter label --- .../org/labkey/singlecell/pipeline/singlecell/RunSingleR.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/RunSingleR.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/RunSingleR.java index 7458be6a0..6ca369a04 100644 --- a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/RunSingleR.java +++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/RunSingleR.java @@ -26,7 +26,7 @@ public Provider() SeuratToolParameter.create("nThreads", "# Threads", "If provided, this value will be passed to BiocParallel::MulticoreParam().", "ldk-integerfield", new JSONObject(){{ put("minValue", 0); }}, null), - SeuratToolParameter.create("singleRSpecies", "Tests To Use", "If human, hpca, blueprint, dice, monaco, and immgen will be used. If mouse, MouseRNAseqData will be used.", "ldk-simplecombo", new JSONObject() + SeuratToolParameter.create("singleRSpecies", "Species", "If human, hpca, blueprint, dice, monaco, and immgen will be used. 
If mouse, MouseRNAseqData will be used.", "ldk-simplecombo", new JSONObject() {{ put("multiSelect", false); put("allowBlank", false); From 043aa41b9ba2ae7728ba63d1f32792873c8cfe87 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 12 Jun 2024 10:50:35 -0700 Subject: [PATCH 13/47] Support bcftools as optional liftover tool --- .../SequenceAnalysis/window/LiftoverWindow.js | 7 +- .../analysis/LiftoverHandler.java | 29 ++++-- .../run/util/LiftoverBcfToolsWrapper.java | 95 +++++++++++++++++++ 3 files changed, 124 insertions(+), 7 deletions(-) create mode 100644 SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/LiftoverBcfToolsWrapper.java diff --git a/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js b/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js index 290b58322..fba9f476e 100644 --- a/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js +++ b/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js @@ -104,7 +104,7 @@ Ext4.define('SequenceAnalysis.window.LiftoverWindow', { maxValue: 1.0, value: 0.95, fieldLabel: 'Min Percent Match', - helpPopup: 'In order to lift to the target genome, the feature must have at least this percent match. Lower this value to be more permissive; however, this risks incorrect liftovers', + helpPopup: 'In order to lift to the target genome, the feature must have at least this percent match. Lower this value to be more permissive; however, this risks incorrect liftovers. This is ignored if using bcftools.', itemId: 'pctField' },{ xtype: 'checkbox', @@ -112,6 +112,11 @@ Ext4.define('SequenceAnalysis.window.LiftoverWindow', { checked: false, helpPopup: 'If checked, no genotypes will be written to the output file (applies to VCFs only). This can be useful (and necessary) when lifting VCFs with extremely high sample number.', fieldLabel: 'Drop Genotypes' + },{ + xtype: 'checkbox', + itemId: 'useBcfTools', + checked: false, + fieldLabel: 'Use bcftools' }].concat(SequenceAnalysis.window.OutputHandlerWindow.getCfgForToolParameters(this.toolParameters)), buttons: [{ text: 'Submit', diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java index f7af4b587..600780cb7 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java @@ -28,6 +28,7 @@ import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport; import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler; +import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep; import org.labkey.api.sequenceanalysis.run.SelectVariantsWrapper; import org.labkey.api.util.FileType; import org.labkey.api.util.FileUtil; @@ -35,6 +36,7 @@ import org.labkey.api.writer.PrintWriters; import org.labkey.sequenceanalysis.SequenceAnalysisModule; import org.labkey.sequenceanalysis.pipeline.ProcessVariantsHandler; +import org.labkey.sequenceanalysis.run.util.LiftoverBcfToolsWrapper; import org.labkey.sequenceanalysis.run.util.LiftoverVcfWrapper; import org.labkey.sequenceanalysis.util.SequenceUtil; @@ -49,7 +51,7 @@ /** * Created by bimber on 8/26/2014. 
*/ -public class LiftoverHandler implements SequenceOutputHandler +public class LiftoverHandler implements SequenceOutputHandler, VariantProcessingStep.SupportsScatterGather { private final FileType _bedFileType = new FileType(".bed", false); //private FileType _gffFileType = new FileType("gff", false); @@ -60,6 +62,12 @@ public LiftoverHandler() } + @Override + public boolean doSortAfterMerge() + { + return true; + } + @Override public String getName() { @@ -167,8 +175,9 @@ public void processFilesRemote(List inputFiles, JobContext c JSONObject params = ctx.getParams(); boolean dropGenotypes = params.optBoolean("dropGenotypes", false); + boolean useBcfTools = params.optBoolean("useBcfTools", false); - Integer chainFileId = params.getInt("chainFileId"); + int chainFileId = params.getInt("chainFileId"); File chainFile = ctx.getSequenceSupport().getCachedData(chainFileId); int targetGenomeId = params.getInt("targetGenomeId"); @@ -217,7 +226,7 @@ else if (_vcfFileType.isType(f.getFile())) { ReferenceGenome targetGenome = ctx.getSequenceSupport().getCachedGenome(targetGenomeId); ReferenceGenome sourceGenome = ctx.getSequenceSupport().getCachedGenome(f.getLibrary_id()); - liftOverVcf(ctx, targetGenome, sourceGenome, chainFile, f.getFile(), lifted, unmappedOutput, job, pct, dropGenotypes); + liftOverVcf(ctx, targetGenome, sourceGenome, chainFile, f.getFile(), lifted, unmappedOutput, job, pct, dropGenotypes, useBcfTools); } } catch (Exception e) @@ -293,7 +302,7 @@ else if (!SequenceUtil.hasLineCount(unmappedOutput)) } } - public void liftOverVcf(JobContext ctx, ReferenceGenome targetGenome, ReferenceGenome sourceGenome, File chain, File input, File output, @Nullable File unmappedOutput, PipelineJob job, double pct, boolean dropGenotypes) throws IOException, PipelineJobException + public void liftOverVcf(JobContext ctx, ReferenceGenome targetGenome, ReferenceGenome sourceGenome, File chain, File input, File output, @Nullable File unmappedOutput, PipelineJob job, double pct, boolean dropGenotypes, boolean useBcfTools) throws IOException, PipelineJobException { File currentVCF = input; if (dropGenotypes) @@ -315,8 +324,16 @@ public void liftOverVcf(JobContext ctx, ReferenceGenome targetGenome, ReferenceG ctx.getFileManager().addIntermediateFile(new File(outputFile.getPath() + ".tbi")); } - LiftoverVcfWrapper wrapper = new LiftoverVcfWrapper(job.getLogger()); - wrapper.doLiftover(currentVCF, chain, targetGenome.getWorkingFastaFile(), unmappedOutput, output, pct); + if (useBcfTools) + { + LiftoverBcfToolsWrapper wrapper = new LiftoverBcfToolsWrapper(job.getLogger()); + wrapper.doLiftover(currentVCF, chain, sourceGenome.getWorkingFastaFile(), targetGenome.getWorkingFastaFile(), unmappedOutput, output); + } + else + { + LiftoverVcfWrapper wrapper = new LiftoverVcfWrapper(job.getLogger()); + wrapper.doLiftover(currentVCF, chain, targetGenome.getWorkingFastaFile(), unmappedOutput, output, pct); + } Long mapped = null; if (output.exists()) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/LiftoverBcfToolsWrapper.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/LiftoverBcfToolsWrapper.java new file mode 100644 index 000000000..1a7c575ca --- /dev/null +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/LiftoverBcfToolsWrapper.java @@ -0,0 +1,95 @@ +package org.labkey.sequenceanalysis.run.util; + +import org.apache.logging.log4j.Logger; +import org.jetbrains.annotations.Nullable; +import org.labkey.api.pipeline.PipelineJobException; +import 
org.labkey.api.sequenceanalysis.SequenceAnalysisService; +import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; +import org.labkey.api.sequenceanalysis.run.PicardWrapper; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Created by bimber on 3/24/2016. + */ +public class LiftoverBcfToolsWrapper extends PicardWrapper +{ + public LiftoverBcfToolsWrapper(@Nullable Logger logger) + { + super(logger); + } + + public void doLiftover(File inputVcf, File chainFile, File sourceGenomeFasta, File targetGenomeFasta, @Nullable File rejectVcf, File outputVcf) throws PipelineJobException + { + getLogger().info("Liftover VCF (bcftools): " + inputVcf.getPath()); + + List params = new ArrayList<>(); + params.add(SequencePipelineService.get().getExeForPackage("BCFTOOLS", "bcftools").getPath()); + params.add("+liftover"); + + params.add("--no-version"); + params.add("-Oz"); + + Integer threads = SequencePipelineService.get().getMaxThreads(getLogger()); + if (threads != null) + { + params.add("--threads"); + params.add(threads.toString()); + } + + params.add("-o"); + params.add(outputVcf.getPath()); + + params.add(inputVcf.getPath()); + params.add("--"); + + params.add("-s"); + params.add(sourceGenomeFasta.getPath()); + + params.add("-f"); + params.add(targetGenomeFasta.getPath()); + + params.add("-c"); + params.add(chainFile.getPath()); + + params.add("--write-src"); + params.add("--fix-tags"); + + if (rejectVcf != null) + { + params.add("--reject"); + params.add(rejectVcf.getPath()); + + params.add("--reject-type"); + params.add("z"); + } + + execute(params); + + if (!outputVcf.exists()) + { + throw new PipelineJobException("Output file could not be found: " + outputVcf.getPath()); + } + + if (rejectVcf != null && rejectVcf.exists()) + { + try + { + SequenceAnalysisService.get().ensureVcfIndex(rejectVcf, getLogger()); + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + } + } + + @Override + protected String getToolName() + { + return "LiftoverVcf"; + } +} From fce1f1f881e900f3d3e394ec127fc31e959d84f2 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 12 Jun 2024 13:14:46 -0700 Subject: [PATCH 14/47] Inital support for paraGRAPH --- .../SequenceAnalysisModule.java | 2 + .../run/alignment/ParagraphStep.java | 172 ++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java index d9f027b02..1830fbf8b 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java @@ -77,6 +77,7 @@ import org.labkey.sequenceanalysis.run.alignment.BowtieWrapper; import org.labkey.sequenceanalysis.run.alignment.GSnapWrapper; import org.labkey.sequenceanalysis.run.alignment.MosaikWrapper; +import org.labkey.sequenceanalysis.run.alignment.ParagraphStep; import org.labkey.sequenceanalysis.run.alignment.Pbmm2Wrapper; import org.labkey.sequenceanalysis.run.alignment.StarWrapper; import org.labkey.sequenceanalysis.run.alignment.VulcanWrapper; @@ -336,6 +337,7 @@ public static void registerPipelineSteps() SequenceAnalysisService.get().registerFileHandler(new PbsvJointCallingHandler()); SequenceAnalysisService.get().registerFileHandler(new DeepVariantHandler()); 
SequenceAnalysisService.get().registerFileHandler(new GLNexusHandler());
+        SequenceAnalysisService.get().registerFileHandler(new ParagraphStep());
 
         SequenceAnalysisService.get().registerReadsetHandler(new MultiQCHandler());
         SequenceAnalysisService.get().registerReadsetHandler(new RestoreSraDataHandler());
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java
new file mode 100644
index 000000000..76780d215
--- /dev/null
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java
@@ -0,0 +1,172 @@
+package org.labkey.sequenceanalysis.run.alignment;
+
+import org.json.JSONObject;
+import org.labkey.api.module.ModuleLoader;
+import org.labkey.api.pipeline.PipelineJob;
+import org.labkey.api.pipeline.PipelineJobException;
+import org.labkey.api.pipeline.RecordedAction;
+import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
+import org.labkey.api.sequenceanalysis.SequenceOutputFile;
+import org.labkey.api.sequenceanalysis.pipeline.AbstractParameterizedOutputHandler;
+import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport;
+import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
+import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
+import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
+import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper;
+import org.labkey.api.util.FileUtil;
+import org.labkey.sequenceanalysis.SequenceAnalysisModule;
+import org.labkey.sequenceanalysis.run.variant.DepthOfCoverageHandler;
+import org.labkey.sequenceanalysis.util.SequenceUtil;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class ParagraphStep extends AbstractParameterizedOutputHandler
+{
+    public ParagraphStep()
+    {
+        super(ModuleLoader.getInstance().getModule(SequenceAnalysisModule.class), "Paragraph SV Genotyping", "This will run paraGRAPH on one or more BAM files to genotype SVs", null, Arrays.asList(
+                ToolParameterDescriptor.createExpDataParam("svVCF", "Input VCF", "This is the DataId of the VCF containing the SVs to genotype", "ldk-expdatafield", new JSONObject()
+                {{
+                    put("allowBlank", false);
+                }}, null)
+        ));
+    }
+
+    @Override
+    public boolean canProcess(SequenceOutputFile o)
+    {
+        return o.getFile() != null && o.getFile().exists() && SequenceUtil.FILETYPE.bamOrCram.getFileType().isType(o.getFile());
+    }
+
+    @Override
+    public boolean doRunRemote()
+    {
+        return true;
+    }
+
+    @Override
+    public boolean doRunLocal()
+    {
+        return false;
+    }
+
+    @Override
+    public SequenceOutputProcessor getProcessor()
+    {
+        return new Processor();
+    }
+
+    public static class Processor implements SequenceOutputProcessor
+    {
+        @Override
+        public void processFilesOnWebserver(PipelineJob job, SequenceAnalysisJobSupport support, List inputFiles, JSONObject params, File outputDir, List actions, List outputsToCreate) throws UnsupportedOperationException, PipelineJobException
+        {
+
+        }
+
+        @Override
+        public void processFilesRemote(List inputFiles, JobContext ctx) throws UnsupportedOperationException, PipelineJobException
+        {
+            File inputVCF = ctx.getSequenceSupport().getCachedData(ctx.getParams().getInt("svVCF"));
+            if (!inputVCF.exists())
+            {
+                throw new PipelineJobException("Unable to find file: " + inputVCF.getPath());
+            }
+
+            for (SequenceOutputFile so : inputFiles)
+            
{ + List depthArgs = new ArrayList<>(); + depthArgs.add("idxdepth"); + depthArgs.add("-d"); + depthArgs.add(so.getFile().getPath()); + + File coverageFile = new File(ctx.getWorkingDirectory(), "coverage.txt"); + depthArgs.add("-o"); + depthArgs.add(coverageFile.getPath()); + + depthArgs.add("-r"); + depthArgs.add(ctx.getSequenceSupport().getCachedGenome(so.getLibrary_id()).getWorkingFastaFile().getPath()); + + new SimpleScriptWrapper(ctx.getLogger()).execute(depthArgs); + + if (!coverageFile.exists()) + { + throw new PipelineJobException("Missing file: " + coverageFile.getPath()); + } + + // Should produce a simple text file: + // id path depth read length + // TNPRC-IB18 ../IB18.cram 29.77 150 + + List paragraphArgs = new ArrayList<>(); + paragraphArgs.add("multigrmpy.py"); + paragraphArgs.add("--verbose"); + + File paragraphOut = new File(ctx.getWorkingDirectory(), FileUtil.getBaseName(so.getFile()) + ".paragraph.txt"); + paragraphArgs.add("-o"); + paragraphArgs.add(paragraphOut.getPath()); + + int svVcfId = ctx.getParams().optInt("svVCF"); + if (svVcfId == 0) + { + throw new PipelineJobException("Missing svVCF ID"); + } + + File svVcf = ctx.getSequenceSupport().getCachedData(svVcfId); + if (svVcf == null) + { + throw new PipelineJobException("File not found for ID: " + svVcfId); + } + else if (!svVcf.exists()) + { + throw new PipelineJobException("Missing file: " + svVcf.getPath()); + } + + paragraphArgs.add("-i"); + paragraphArgs.add(svVcf.getPath()); + + paragraphArgs.add("-m"); + paragraphArgs.add(coverageFile.getPath()); + + paragraphArgs.add("-r"); + paragraphArgs.add(ctx.getSequenceSupport().getCachedGenome(so.getLibrary_id()).getWorkingFastaFile().getPath()); + + paragraphArgs.add("--scratch-dir"); + paragraphArgs.add(SequencePipelineService.get().getJavaTempDir()); + + Integer threads = SequencePipelineService.get().getMaxThreads(ctx.getLogger()); + if (threads != null) + { + paragraphArgs.add("--threads"); + paragraphArgs.add(threads.toString()); + } + + paragraphArgs.add("--logfile"); + paragraphArgs.add(new File(ctx.getWorkingDirectory(), "paragraph.log").getPath()); + + new SimpleScriptWrapper(ctx.getLogger()).execute(paragraphArgs); + + File genotypes = new File(ctx.getWorkingDirectory(), "genotypes.vcf.gz"); + if (!genotypes.exists()) + { + throw new PipelineJobException("Missing file: " + genotypes.getPath()); + } + + try + { + SequenceAnalysisService.get().ensureVcfIndex(genotypes, ctx.getLogger()); + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + + ctx.getFileManager().addSequenceOutput(genotypes, "paraGRAPH Genotypes: " + so.getName(), "paraGRAPH Genoypes", so.getReadset(), null, so.getLibrary_id(), "Input VCF: " + svVcf.getName() + " (" + svVcfId + ")"); + } + } + } +} \ No newline at end of file From 53980d781ccfd18cd6167ff5118b8b8d1837fb38 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 12 Jun 2024 14:39:03 -0700 Subject: [PATCH 15/47] Update library type terms --- .../resources/web/singlecell/panel/LibraryExportPanel.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/singlecell/resources/web/singlecell/panel/LibraryExportPanel.js b/singlecell/resources/web/singlecell/panel/LibraryExportPanel.js index 4978a61db..7bc335470 100644 --- a/singlecell/resources/web/singlecell/panel/LibraryExportPanel.js +++ b/singlecell/resources/web/singlecell/panel/LibraryExportPanel.js @@ -935,17 +935,17 @@ Ext4.define('SingleCell.panel.LibraryExportPanel', { data.push(comment || 'Please QC individually and pool in equal amounts per 
lane'); } else if (instrument === 'Novogene-New') { - let libraryType = 'Premade-10X Single Cell Transcriptome Library'; + let libraryType = 'UNKNOWN'; switch (suffix) { case 'GEX': - libraryType = 'Premade-10X Single Cell Transcriptome Library'; + libraryType = 'Premade-10X 5 prime Single Cell Transcriptome Library'; break; case 'TCR': libraryType = 'Premade-10X VDJ Library'; break; case 'HTO': case 'CITE': - libraryType = 'Premade-10X Feature Barcode Library'; + libraryType = 'Premade-10X 5 prime Feature Barcode Library'; break; default: console.error('Unknown suffix: ' + suffix); From fa22d6c46d2dc8de0e18afe6c5d8172d33b52214 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 12 Jun 2024 18:23:38 -0700 Subject: [PATCH 16/47] Add step to re-process Seurat prototypes --- .../resources/chunks/UpdateSeuratPrototype.R | 41 +++++ .../labkey/singlecell/SingleCellModule.java | 1 + .../singlecell/UpdateSeuratPrototype.java | 146 ++++++++++++++++++ 3 files changed, 188 insertions(+) create mode 100644 singlecell/resources/chunks/UpdateSeuratPrototype.R create mode 100644 singlecell/src/org/labkey/singlecell/pipeline/singlecell/UpdateSeuratPrototype.java diff --git a/singlecell/resources/chunks/UpdateSeuratPrototype.R b/singlecell/resources/chunks/UpdateSeuratPrototype.R new file mode 100644 index 000000000..e9af18a97 --- /dev/null +++ b/singlecell/resources/chunks/UpdateSeuratPrototype.R @@ -0,0 +1,41 @@ +if (!file.exists('/homeDir/.netrc')) { + print(list.files('/homeDir')) + stop('Unable to find file: /homeDir/.netrc') +} + +invisible(Rlabkey::labkey.setCurlOptions(NETRC_FILE = '/homeDir/.netrc')) +Rdiscvr::SetLabKeyDefaults(baseUrl = serverBaseUrl, defaultFolder = defaultLabKeyFolder) + +for (datasetId in names(seuratObjects)) { + printName(datasetId) + seuratObj <- readSeuratRDS(seuratObjects[[datasetId]]) + + if (reapplyMetadata) { + seuratObj <- Rdiscvr::QueryAndApplyCdnaMetadata(seuratObj) + } + + if (runRira) { + seuratObj <- RIRA::Classify_ImmuneCells(seuratObj, maxBatchSize = maxBatchSize, retainProbabilityMatrix = retainProbabilityMatrix) + seuratObj <- RIRA::Classify_TNK(seuratObj, maxBatchSize = maxBatchSize, retainProbabilityMatrix = retainProbabilityMatrix) + seuratObj <- RIRA::Classify_Myeloid(seuratObj, maxBatchSize = maxBatchSize, retainProbabilityMatrix = retainProbabilityMatrix) + } + + if (applyTCR) { + seuratObj <- Rdiscvr::DownloadAndAppendTcrClonotypes(seuratObj, allowMissing = allowMissingTcr) + } + + if (runTNKClassification) { + # ClassifyTNKByExpression will fail without this, so ignore allowMissingTcr + if (!'HasCDR3Data' %in% names(seuratObj@meta.data)) { + seuratObj <- Rdiscvr::DownloadAndAppendTcrClonotypes(seuratObj) + } + + seuratObj <- Rdiscvr::ClassifyTNKByExpression(seuratObj) + } + + saveData(seuratObj, datasetId) + + # Cleanup + rm(seuratObj) + gc() +} \ No newline at end of file diff --git a/singlecell/src/org/labkey/singlecell/SingleCellModule.java b/singlecell/src/org/labkey/singlecell/SingleCellModule.java index 8dcf1e4cd..18b925e38 100644 --- a/singlecell/src/org/labkey/singlecell/SingleCellModule.java +++ b/singlecell/src/org/labkey/singlecell/SingleCellModule.java @@ -226,6 +226,7 @@ public static void registerPipelineSteps() SequencePipelineService.get().registerPipelineStep(new RunCsCore.Provider()); SequencePipelineService.get().registerPipelineStep(new CustomGSEA.Provider()); SequencePipelineService.get().registerPipelineStep(new StudyMetadata.Provider()); + SequencePipelineService.get().registerPipelineStep(new UpdateSeuratPrototype.Provider()); 
SequenceAnalysisService.get().registerReadsetListener(new SingleCellReadsetListener());
     }
diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/UpdateSeuratPrototype.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/UpdateSeuratPrototype.java
new file mode 100644
index 000000000..accf676c7
--- /dev/null
+++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/UpdateSeuratPrototype.java
@@ -0,0 +1,146 @@
+package org.labkey.singlecell.pipeline.singlecell;
+
+import org.apache.commons.io.FileUtils;
+import org.json.JSONObject;
+import org.labkey.api.pipeline.PipelineJobException;
+import org.labkey.api.sequenceanalysis.SequenceOutputFile;
+import org.labkey.api.sequenceanalysis.model.Readset;
+import org.labkey.api.sequenceanalysis.pipeline.AbstractPipelineStepProvider;
+import org.labkey.api.sequenceanalysis.pipeline.PipelineContext;
+import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
+import org.labkey.api.singlecell.pipeline.SeuratToolParameter;
+import org.labkey.api.singlecell.pipeline.SingleCellStep;
+import org.labkey.api.util.FileUtil;
+import org.labkey.api.writer.PrintWriters;
+import org.labkey.singlecell.analysis.AbstractSingleCellHandler;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.file.Files;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.labkey.singlecell.analysis.AbstractSingleCellHandler.SEURAT_PROTOTYPE;
+
+public class UpdateSeuratPrototype extends AbstractRDiscvrStep
+{
+    public UpdateSeuratPrototype(PipelineContext ctx, UpdateSeuratPrototype.Provider provider)
+    {
+        super(provider, ctx);
+    }
+
+    public static class Provider extends AbstractPipelineStepProvider
+    {
+        public Provider()
+        {
+            super("UpdateSeuratPrototype", "Update Seurat Prototype", "CellMembrane/Rdiscvr", "This will re-process an existing seurat prototype object and overwrite the original", Arrays.asList(
+                    SeuratToolParameter.create("reapplyMetadata", "Reapply Metadata", "If checked, metadata will be re-applied", "checkbox", null, true),
+                    SeuratToolParameter.create("runRira", "Run RIRA", "If checked, RIRA classification will be re-run", "checkbox", null, true),
+                    SeuratToolParameter.create("runTNKClassification", "Run T/NK Classification", "If checked, T/NK expression-based classification will be re-run", "checkbox", null, true),
+                    SeuratToolParameter.create("applyTCR", "Append TCR Data", "If checked, TCR data will be applied. This will fail if any sample lacks TCR data, unless 'Allow Missing TCR Data' is checked", "checkbox", null, true),
+                    SeuratToolParameter.create("allowMissingTcr", "Allow Missing TCR Data", "Unless checked, an error will be thrown if any sample lacks TCR data", "checkbox", new JSONObject()
+                    {{
+                        put("checked", false);
+                    }}, false),
+                    SeuratToolParameter.create("keepOriginal", "Keep Copy of Original File", "If checked, the original file will be copied with the file extension '.orig'", "checkbox", new JSONObject()
+                    {{
+                        put("checked", false);
+                    }}, false)
+            ), null, null);
+        }
+
+        @Override
+        public UpdateSeuratPrototype create(PipelineContext ctx)
+        {
+            return new UpdateSeuratPrototype(ctx, this);
+        }
+    }
+
+    @Override
+    public void init(SequenceOutputHandler.JobContext ctx, List inputFiles) throws PipelineJobException
+    {
+        if (inputFiles.size() > 1)
+        {
+            throw new PipelineJobException("Seurat prototype step expects this job to have a single input. 
Consider selecting the option to run jobs individually instead of merged"); + } + + if (inputFiles.get(0).getReadset() == null) + { + throw new PipelineJobException("Seurat prototype step expects all inputs to have a readset ID."); + } + + if (!SEURAT_PROTOTYPE.equals(inputFiles.get(0).getCategory())) + { + throw new PipelineJobException("Expected the input to be a seurat prototype, found: " + inputFiles.get(0).getCategory()); + } + + if (ctx.getSequenceSupport().getCachedGenomes().size() > 1) + { + throw new PipelineJobException("Expected seurat prototype step to use a single genome"); + } + + Readset rs = ctx.getSequenceSupport().getCachedReadset(inputFiles.get(0).getReadset()); + if (!ctx.getJob().getContainer().getId().equalsIgnoreCase(rs.getContainer())) + { + throw new PipelineJobException("Seurat prototype jobs must be submitted to the same folder as the source readset"); + } + } + + @Override + public Output execute(SequenceOutputHandler.JobContext ctx, List inputObjects, String outputPrefix) throws PipelineJobException + { + Output output = super.execute(ctx, inputObjects, outputPrefix); + + if (ctx.getSequenceSupport().getCachedGenomes().size() > 1) + { + throw new PipelineJobException("Expected seurat prototype step to use a single genome"); + } + + if (output.getSeuratObjects().size() != 1) + { + throw new PipelineJobException("Expected a single output object, found: " + output.getSeuratObjects().size()); + } + + SeuratObjectWrapper inputRDS = inputObjects.get(0); + SeuratObjectWrapper wrapper = output.getSeuratObjects().get(0); + if (wrapper.getReadsetId() == null) + { + throw new PipelineJobException("Missing readset Id: " + wrapper.getDatasetId()); + } + + File toReplace = inputRDS.getSequenceOutputFile().getFile(); + if (!toReplace.exists()) + { + throw new PipelineJobException("Missing file: " + toReplace); + } + try + { + ctx.getLogger().info("Replacing existing prototype: " + toReplace.getPath()); + + if (ctx.getParams().optBoolean("keepOriginal", false)) + { + File backup = new File(toReplace.getPath() + ".orig"); + if (backup.exists()) + { + backup.delete(); + } + + FileUtils.moveFile(toReplace, backup); + } + + if (toReplace.exists()) + { + toReplace.delete(); + } + + FileUtils.moveFile(wrapper.getFile(), toReplace); + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + + return output; + } +} From ab124b734edc23c1322f75f06b537a30d59b0ae3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 13 Jun 2024 01:27:03 +0000 Subject: [PATCH 17/47] Bump braces from 3.0.2 to 3.0.3 in /jbrowse Bumps [braces](https://github.com/micromatch/braces) from 3.0.2 to 3.0.3. - [Changelog](https://github.com/micromatch/braces/blob/master/CHANGELOG.md) - [Commits](https://github.com/micromatch/braces/compare/3.0.2...3.0.3) --- updated-dependencies: - dependency-name: braces dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- jbrowse/package-lock.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/jbrowse/package-lock.json b/jbrowse/package-lock.json index dc3e491f6..1bfd1235d 100644 --- a/jbrowse/package-lock.json +++ b/jbrowse/package-lock.json @@ -6793,11 +6793,11 @@ } }, "node_modules/braces": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", - "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", "dependencies": { - "fill-range": "^7.0.1" + "fill-range": "^7.1.1" }, "engines": { "node": ">=8" @@ -9112,9 +9112,9 @@ "integrity": "sha512-P9bmyZ3h/PRG+Nzga+rbdI4OEpNDzAVyy74uVO9ATgzLK6VtAsYybF/+TOCvrc0MO793d6+42lLyZTw7/ArVzA==" }, "node_modules/fill-range": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", - "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==", + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", "dependencies": { "to-regex-range": "^5.0.1" }, From 4cc44ec18ffe03945a0d74c5b0c8381991fde667 Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 12 Jun 2024 21:37:03 -0700 Subject: [PATCH 18/47] Add step to replace the sample in BAM/VCFs based on the current DB info --- .../run/AbstractGatk4Wrapper.java | 3 + .../SequenceAnalysisModule.java | 2 + .../analysis/UpdateReadsetFilesHandler.java | 315 ++++++++++++++++++ .../run/alignment/BWAMemWrapper.java | 3 +- .../AddOrReplaceReadGroupsStep.java | 3 +- .../sequenceanalysis/util/SequenceUtil.java | 11 + 6 files changed, 335 insertions(+), 2 deletions(-) create mode 100644 SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/UpdateReadsetFilesHandler.java diff --git a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractGatk4Wrapper.java b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractGatk4Wrapper.java index 334fcf88b..91959921b 100644 --- a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractGatk4Wrapper.java +++ b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractGatk4Wrapper.java @@ -90,6 +90,7 @@ public List getBaseArgs(@Nullable String toolName) List args = new ArrayList<>(); args.add(SequencePipelineService.get().getJavaFilepath()); args.addAll(SequencePipelineService.get().getJavaOpts(_maxRamOverride)); + args.add("-DGATK_STACKTRACE_ON_USER_EXCEPTION=true"); args.add("-jar"); args.add(getJAR().getPath()); @@ -98,6 +99,8 @@ public List getBaseArgs(@Nullable String toolName) args.add(toolName); } + + return args; } diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java index 1830fbf8b..777ccc87a 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java @@ -59,6 +59,7 @@ import org.labkey.sequenceanalysis.analysis.RnaSeqcHandler; import org.labkey.sequenceanalysis.analysis.SbtGeneCountHandler; import 
org.labkey.sequenceanalysis.analysis.UnmappedSequenceBasedGenotypeHandler; +import org.labkey.sequenceanalysis.analysis.UpdateReadsetFilesHandler; import org.labkey.sequenceanalysis.button.AddSraRunButton; import org.labkey.sequenceanalysis.button.ArchiveReadsetsButton; import org.labkey.sequenceanalysis.button.ChangeReadsetStatusButton; @@ -338,6 +339,7 @@ public static void registerPipelineSteps() SequenceAnalysisService.get().registerFileHandler(new DeepVariantHandler()); SequenceAnalysisService.get().registerFileHandler(new GLNexusHandler()); SequenceAnalysisService.get().registerFileHandler(new ParagraphStep()); + SequenceAnalysisService.get().registerFileHandler(new UpdateReadsetFilesHandler()); SequenceAnalysisService.get().registerReadsetHandler(new MultiQCHandler()); SequenceAnalysisService.get().registerReadsetHandler(new RestoreSraDataHandler()); diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/UpdateReadsetFilesHandler.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/UpdateReadsetFilesHandler.java new file mode 100644 index 000000000..9fc12cc93 --- /dev/null +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/UpdateReadsetFilesHandler.java @@ -0,0 +1,315 @@ +package org.labkey.sequenceanalysis.analysis; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMFileWriter; +import htsjdk.samtools.SAMFileWriterFactory; +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.util.FileExtensions; +import htsjdk.variant.vcf.VCFFileReader; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFReader; +import org.apache.commons.io.FileUtils; +import org.apache.logging.log4j.Logger; +import org.json.JSONObject; +import org.labkey.api.module.ModuleLoader; +import org.labkey.api.pipeline.PipelineJob; +import org.labkey.api.pipeline.PipelineJobException; +import org.labkey.api.pipeline.RecordedAction; +import org.labkey.api.sequenceanalysis.SequenceAnalysisService; +import org.labkey.api.sequenceanalysis.SequenceOutputFile; +import org.labkey.api.sequenceanalysis.model.Readset; +import org.labkey.api.sequenceanalysis.pipeline.AbstractParameterizedOutputHandler; +import org.labkey.api.sequenceanalysis.pipeline.BcftoolsRunner; +import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport; +import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler; +import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; +import org.labkey.api.sequenceanalysis.run.PicardWrapper; +import org.labkey.api.util.FileType; +import org.labkey.api.writer.PrintWriters; +import org.labkey.sequenceanalysis.SequenceAnalysisModule; +import org.labkey.sequenceanalysis.util.SequenceUtil; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.file.StandardCopyOption; +import java.nio.file.StandardOpenOption; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class UpdateReadsetFilesHandler extends AbstractParameterizedOutputHandler +{ + public UpdateReadsetFilesHandler() + { + super(ModuleLoader.getInstance().getModule(SequenceAnalysisModule.class), "Update Sample/Header Information", "This will re-header any BAM or gVCF files using the sample name from the source readset. 
All inputs must be single-sample and have a readset attached to the record", null, List.of( + + )); + } + + @Override + public boolean doRunRemote() + { + return true; + } + + @Override + public boolean doRunLocal() + { + return false; + } + + @Override + public boolean canProcess(SequenceOutputFile f) + { + return f.getFile() != null && ( + SequenceUtil.FILETYPE.gvcf.getFileType().isType(f.getFile()) || + SequenceUtil.FILETYPE.bamOrCram.getFileType().isType(f.getFile()) + ); + } + + @Override + public boolean doSplitJobs() + { + return true; + } + + @Override + public SequenceOutputProcessor getProcessor() + { + return new Processor(); + } + + public static class Processor implements SequenceOutputProcessor + { + @Override + public void init(JobContext ctx, List inputFiles, List actions, List outputsToCreate) throws UnsupportedOperationException, PipelineJobException + { + if (inputFiles.size() > 1) + { + throw new PipelineJobException("This job expects a single input file!"); + } + + SequenceOutputFile so = inputFiles.get(0); + if (so.getReadset() == null) + { + throw new PipelineJobException("All inputs must have a readset, missing: " + so.getRowid()); + } + + Readset rs = SequenceAnalysisService.get().getReadset(so.getReadset(), ctx.getJob().getUser()); + String newRsName = SequenceUtil.getLegalReadGroupName(rs.getName()); + + if (SequenceUtil.FILETYPE.bamOrCram.getFileType().isType(so.getFile())) + { + getAndValidateHeaderForBam(so, newRsName); + } + else if (SequenceUtil.FILETYPE.gvcf.getFileType().isType(so.getFile()) | SequenceUtil.FILETYPE.vcf.getFileType().isType(so.getFile())) + { + getAndValidateHeaderForVcf(so, newRsName); + } + + ctx.getSequenceSupport().cacheObject("readsetId", newRsName); + } + + private SAMFileHeader getAndValidateHeaderForBam(SequenceOutputFile so, String newRsName) throws PipelineJobException + { + SamReaderFactory samReaderFactory = SamReaderFactory.makeDefault(); + try (SamReader reader = samReaderFactory.open(so.getFile())) + { + SAMFileHeader header = reader.getFileHeader().clone(); + int nSamples = reader.getFileHeader().getReadGroups().size(); + if (nSamples != 1) + { + throw new PipelineJobException("File has more than one read group, found: " + nSamples); + } + + List rgs = header.getReadGroups(); + String existingSample = rgs.get(0).getSample(); + if (existingSample.equals(newRsName)) + { + throw new PipelineJobException("Sample names match, aborting"); + } + + return header; + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + } + + private VCFHeader getAndValidateHeaderForVcf(SequenceOutputFile so, String newRsName) throws PipelineJobException + { + try (VCFReader reader = new VCFFileReader(so.getFile())) + { + VCFHeader header = reader.getHeader(); + int nSamples = header.getGenotypeSamples().size(); + if (nSamples != 1) + { + throw new PipelineJobException("File has more than one sample, found: " + nSamples); + } + + String existingSample = header.getGenotypeSamples().get(0); + if (existingSample.equals(newRsName)) + { + throw new PipelineJobException("Sample names match, aborting"); + } + + return header; + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + } + + @Override + public void processFilesOnWebserver(PipelineJob job, SequenceAnalysisJobSupport support, List inputFiles, JSONObject params, File outputDir, List actions, List outputsToCreate) throws UnsupportedOperationException, PipelineJobException + { + + } + + @Override + public void processFilesRemote(List inputFiles, JobContext ctx) 
throws UnsupportedOperationException, PipelineJobException + { + String newRsName = ctx.getSequenceSupport().getCachedObject("readsetId", String.class); + if (newRsName == null) + { + throw new PipelineJobException("Missing cached readsetId"); + } + + SequenceOutputFile so = inputFiles.get(0); + if (SequenceUtil.FILETYPE.bamOrCram.getFileType().isType(so.getFile())) + { + reheaderBamOrCram(so, ctx, newRsName); + } + else if (SequenceUtil.FILETYPE.gvcf.getFileType().isType(so.getFile()) | SequenceUtil.FILETYPE.vcf.getFileType().isType(so.getFile())) + { + reheaderVcf(so, ctx, newRsName); + } + } + + private void reheaderVcf(SequenceOutputFile so, JobContext ctx, String newRsName) throws PipelineJobException + { + VCFHeader header = getAndValidateHeaderForVcf(so, newRsName); + String existingSample = header.getGenotypeSamples().get(0); + + File sampleNamesFile = new File(ctx.getWorkingDirectory(), "sampleNames.txt"); + try (PrintWriter writer = PrintWriters.getPrintWriter(sampleNamesFile, StandardOpenOption.APPEND)) + { + writer.println(newRsName); + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + ctx.getFileManager().addIntermediateFile(sampleNamesFile); + + File outputVcf = new File(ctx.getWorkingDirectory(), so.getFile().getName()); + + BcftoolsRunner wrapper = new BcftoolsRunner(ctx.getLogger()); + wrapper.execute(Arrays.asList(BcftoolsRunner.getBcfToolsPath().getPath(), "reheader", "-s", sampleNamesFile.getPath(), "-o", outputVcf.getPath(), so.getFile().getPath())); + + try + { + File outputIdx = SequenceAnalysisService.get().ensureVcfIndex(outputVcf, ctx.getLogger(), false); + FileUtils.moveFile(outputVcf, so.getFile(), StandardCopyOption.REPLACE_EXISTING); + + FileType gz = new FileType(".gz"); + File inputIndex = gz.isType(so.getFile()) ? 
new File(so.getFile().getPath() + ".tbi") : new File(so.getFile().getPath() + FileExtensions.TRIBBLE_INDEX); + FileUtils.moveFile(outputIdx, inputIndex, StandardCopyOption.REPLACE_EXISTING); + + addTracker(so, existingSample, newRsName); + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + } + + private void addTracker(SequenceOutputFile so, String existingSample, String newRsName) throws IOException + { + File tracker = new File(so.getFile().getParentFile(), "reheaderHistory.txt"); + boolean preExisting = tracker.exists(); + try (PrintWriter writer = PrintWriters.getPrintWriter(tracker, StandardOpenOption.APPEND)) + { + if (!preExisting) + { + writer.println("OriginalSample\tNewSample"); + } + + writer.println(existingSample + "\t" + newRsName); + } + } + + private void reheaderBamOrCram(SequenceOutputFile so, JobContext ctx, String newRsName) throws PipelineJobException + { + try + { + SAMFileHeader header = getAndValidateHeaderForBam(so, newRsName); + + List rgs = header.getReadGroups(); + String existingSample = rgs.get(0).getSample(); + rgs.get(0).setSample(newRsName); + + File headerBam = new File(ctx.getWorkingDirectory(), "header.bam"); + try (SAMFileWriter writer = new SAMFileWriterFactory().makeBAMWriter(header, false, headerBam)) + { + + } + ctx.getFileManager().addIntermediateFile(headerBam); + ctx.getFileManager().addIntermediateFile(SequencePipelineService.get().getExpectedIndex(headerBam)); + + File output = new File(ctx.getWorkingDirectory(), so.getFile().getName()); + new ReplaceSamHeaderWrapper(ctx.getLogger()).execute(so.getFile(), output, headerBam); + if (!output.exists()) + { + throw new PipelineJobException("Missing file: " + output.getPath()); + } + + File outputIdx = SequencePipelineService.get().ensureBamIndex(output, ctx.getLogger(), false); + + FileUtils.moveFile(output, so.getFile(), StandardCopyOption.REPLACE_EXISTING); + FileUtils.moveFile(outputIdx, SequenceAnalysisService.get().getExpectedBamOrCramIndex(so.getFile()), StandardCopyOption.REPLACE_EXISTING); + + addTracker(so, existingSample, newRsName); + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + } + + private static class ReplaceSamHeaderWrapper extends PicardWrapper + { + public ReplaceSamHeaderWrapper(Logger log) + { + super(log); + } + + @Override + protected String getToolName() + { + return "ReplaceSamHeader"; + } + + public void execute(File input, File output, File headerBam) throws PipelineJobException + { + List params = new ArrayList<>(getBaseArgs()); + + params.add("--INPUT"); + params.add(input.getPath()); + + params.add("--OUTPUT"); + params.add(output.getPath()); + + params.add("--HEADER"); + params.add(headerBam.getPath()); + + execute(params); + } + } + } +} \ No newline at end of file diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/BWAMemWrapper.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/BWAMemWrapper.java index 05a702793..883b5b6fb 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/BWAMemWrapper.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/BWAMemWrapper.java @@ -17,6 +17,7 @@ import org.labkey.api.sequenceanalysis.pipeline.SamtoolsRunner; import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor; import org.labkey.api.util.FileUtil; +import org.labkey.sequenceanalysis.util.SequenceUtil; import java.io.File; import java.util.ArrayList; @@ -61,7 +62,7 @@ protected void doPerformAlignment(AlignmentOutputImpl output, File 
inputFastq1, rg.add("LB:" + rs.getReadsetId().toString()); rg.add("PL:" + (rs.getPlatform() == null ? "ILLUMINA" : rs.getPlatform())); rg.add("PU:" + (platformUnit == null ? rs.getReadsetId().toString() : platformUnit)); - rg.add("SM:" + rs.getName().replaceAll(" ", "_")); + rg.add("SM:" + SequenceUtil.getLegalReadGroupName(rs)); extraArgs.add("'" + StringUtils.join(rg, "\\t") + "'"); getWrapper().performMemAlignment(getPipelineCtx().getJob(), output, inputFastq1, inputFastq2, outputDirectory, referenceGenome, basename, extraArgs); diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/bampostprocessing/AddOrReplaceReadGroupsStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/bampostprocessing/AddOrReplaceReadGroupsStep.java index 776a42182..4b845742d 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/bampostprocessing/AddOrReplaceReadGroupsStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/bampostprocessing/AddOrReplaceReadGroupsStep.java @@ -11,6 +11,7 @@ import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; import org.labkey.api.sequenceanalysis.run.AbstractCommandPipelineStep; import org.labkey.sequenceanalysis.run.util.AddOrReplaceReadGroupsWrapper; +import org.labkey.sequenceanalysis.util.SequenceUtil; import java.io.File; @@ -48,7 +49,7 @@ public Output processBam(Readset rs, File inputBam, ReferenceGenome referenceGen File outputBam = new File(outputDirectory, FileUtil.getBaseName(inputBam) + ".readgroups.bam"); output.addIntermediateFile(outputBam); - output.setBAM(getWrapper().executeCommand(inputBam, outputBam, rs.getReadsetId().toString(), rs.getPlatform(), rs.getReadsetId().toString(), rs.getName().replaceAll(" ", "_"))); + output.setBAM(getWrapper().executeCommand(inputBam, outputBam, rs.getReadsetId().toString(), rs.getPlatform(), rs.getReadsetId().toString(), SequenceUtil.getLegalReadGroupName(rs))); return output; } diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/SequenceUtil.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/SequenceUtil.java index f270b7e8d..6808024d9 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/SequenceUtil.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/SequenceUtil.java @@ -31,6 +31,7 @@ import org.json.JSONObject; import org.labkey.api.pipeline.PipelineJobException; import org.labkey.api.sequenceanalysis.SequenceAnalysisService; +import org.labkey.api.sequenceanalysis.model.Readset; import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; import org.labkey.api.sequenceanalysis.run.CommandWrapper; @@ -650,4 +651,14 @@ else if (FILETYPE.cram.getFileType().isType(bamOrCram)) return null; } + + public static String getLegalReadGroupName(Readset rs) + { + return getLegalReadGroupName(rs.getName()); + } + + public static String getLegalReadGroupName(String rsName) + { + return rsName.replaceAll(" ", "_"); + } } \ No newline at end of file From cc6b74b108d133ffac47d5b1b4c04300117f5983 Mon Sep 17 00:00:00 2001 From: bbimber Date: Fri, 14 Jun 2024 09:38:13 -0700 Subject: [PATCH 19/47] Support parameter for collapsing Gamma-only cells to Gamma/Delta --- singlecell/resources/chunks/ClassifyTNKByExpression.R | 2 +- .../singlecell/pipeline/singlecell/ClassifyTNKByExpression.java | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/singlecell/resources/chunks/ClassifyTNKByExpression.R 
b/singlecell/resources/chunks/ClassifyTNKByExpression.R index 822ab94d8..2e3f7bb1a 100644 --- a/singlecell/resources/chunks/ClassifyTNKByExpression.R +++ b/singlecell/resources/chunks/ClassifyTNKByExpression.R @@ -14,7 +14,7 @@ for (datasetId in names(seuratObjects)) { seuratObj <- Rdiscvr::DownloadAndAppendTcrClonotypes(seuratObj) } - seuratObj <- Rdiscvr::ClassifyTNKByExpression(seuratObj) + seuratObj <- Rdiscvr::ClassifyTNKByExpression(seuratObj, collapseGOnlyToGD = collapseGOnlyToGD) saveData(seuratObj, datasetId) diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/ClassifyTNKByExpression.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/ClassifyTNKByExpression.java index e2813b4bb..502e2e409 100644 --- a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/ClassifyTNKByExpression.java +++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/ClassifyTNKByExpression.java @@ -2,6 +2,7 @@ import org.labkey.api.sequenceanalysis.pipeline.AbstractPipelineStepProvider; import org.labkey.api.sequenceanalysis.pipeline.PipelineContext; +import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor; import org.labkey.api.singlecell.pipeline.SingleCellStep; import java.util.Arrays; @@ -18,6 +19,7 @@ public static class Provider extends AbstractPipelineStepProvider Date: Sat, 15 Jun 2024 05:01:39 -0700 Subject: [PATCH 20/47] Update DimplotLogic to allow missing reductions --- singlecell/resources/chunks/DimPlots.R | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/singlecell/resources/chunks/DimPlots.R b/singlecell/resources/chunks/DimPlots.R index 17aa129b3..d0ec59d97 100644 --- a/singlecell/resources/chunks/DimPlots.R +++ b/singlecell/resources/chunks/DimPlots.R @@ -13,17 +13,20 @@ for (datasetId in names(seuratObjects)) { next } - P1 <- Seurat::DimPlot(seuratObj, group.by = field, reduction = 'tsne') - P2 <- Seurat::DimPlot(seuratObj, group.by = field, reduction = 'umap') + plotList <- list() + if ('tsne' %in% names(seuratObj@reductions)) { + plotList[['tsne']] <- Seurat::DimPlot(seuratObj, group.by = field, reduction = 'tsne') + } + + if ('umap' %in% names(seuratObj@reductions)) { + plotList[['umap']] <- Seurat::DimPlot(seuratObj, group.by = field, reduction = 'umap') + } if ('wnn.umap' %in% names(seuratObj@reductions)) { - P3 <- Seurat::DimPlot(seuratObj, group.by = field, reduction = 'wnn.umap') - P1 <- P1 | P2 | P3 - } else { - P1 <- P1 | P2 + plotList[['wnn.umap']] <- Seurat::DimPlot(seuratObj, group.by = field, reduction = 'wnn.umap') } - P1 <- P1 + patchwork::plot_annotation(title = field) + patchwork::plot_layout(guides = "collect") + P1 <- patchwork::wrap_plots(plotList) + patchwork::plot_layout(ncol = length(plotList)) + patchwork::plot_annotation(title = field) + patchwork::plot_layout(guides = "collect") print(P1) } From 6ae10e430636c14df4cf88132b71c3fb5998e67d Mon Sep 17 00:00:00 2001 From: bbimber Date: Sat, 15 Jun 2024 05:25:50 -0700 Subject: [PATCH 21/47] Backport Dimplot update --- singlecell/resources/chunks/DimPlots.R | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/singlecell/resources/chunks/DimPlots.R b/singlecell/resources/chunks/DimPlots.R index 17aa129b3..d0ec59d97 100644 --- a/singlecell/resources/chunks/DimPlots.R +++ b/singlecell/resources/chunks/DimPlots.R @@ -13,17 +13,20 @@ for (datasetId in names(seuratObjects)) { next } - P1 <- Seurat::DimPlot(seuratObj, group.by = field, reduction = 'tsne') - P2 <- Seurat::DimPlot(seuratObj, group.by = field, 
reduction = 'umap') + plotList <- list() + if ('tsne' %in% names(seuratObj@reductions)) { + plotList[['tsne']] <- Seurat::DimPlot(seuratObj, group.by = field, reduction = 'tsne') + } + + if ('umap' %in% names(seuratObj@reductions)) { + plotList[['umap']] <- Seurat::DimPlot(seuratObj, group.by = field, reduction = 'umap') + } if ('wnn.umap' %in% names(seuratObj@reductions)) { - P3 <- Seurat::DimPlot(seuratObj, group.by = field, reduction = 'wnn.umap') - P1 <- P1 | P2 | P3 - } else { - P1 <- P1 | P2 + plotList[['wnn.umap']] <- Seurat::DimPlot(seuratObj, group.by = field, reduction = 'wnn.umap') } - P1 <- P1 + patchwork::plot_annotation(title = field) + patchwork::plot_layout(guides = "collect") + P1 <- patchwork::wrap_plots(plotList) + patchwork::plot_layout(ncol = length(plotList)) + patchwork::plot_annotation(title = field) + patchwork::plot_layout(guides = "collect") print(P1) } From 0a4c838d140486a9180fa9f36721aa086f03f18a Mon Sep 17 00:00:00 2001 From: bbimber Date: Sun, 16 Jun 2024 18:44:49 -0700 Subject: [PATCH 22/47] Support additional liftover params --- .../web/SequenceAnalysis/window/LiftoverWindow.js | 8 ++++++++ .../sequenceanalysis/analysis/LiftoverHandler.java | 9 +++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js b/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js index fba9f476e..9a41148d5 100644 --- a/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js +++ b/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js @@ -157,6 +157,14 @@ Ext4.define('SequenceAnalysis.window.LiftoverWindow', { params.dropGenotypes = this.down('#dropGenotypes').getValue(); } + if (this.down('#useBcfTools').getValue()){ + params.useBcfTools = this.down('#useBcfTools').getValue(); + } + + if (this.down('#doNotRetainUnmapped').getValue()){ + params.doNotRetainUnmapped = this.down('#doNotRetainUnmapped').getValue(); + } + Ext4.Msg.wait('Saving...'); LABKEY.Ajax.request({ url: LABKEY.ActionURL.buildURL('sequenceanalysis', 'runSequenceHandler', this.containerPath), diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java index 600780cb7..fbe6d8e04 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java @@ -176,6 +176,7 @@ public void processFilesRemote(List inputFiles, JobContext c boolean dropGenotypes = params.optBoolean("dropGenotypes", false); boolean useBcfTools = params.optBoolean("useBcfTools", false); + boolean doNotRetainUnmapped = params.optBoolean("doNotRetainUnmapped", false); int chainFileId = params.getInt("chainFileId"); File chainFile = ctx.getSequenceSupport().getCachedData(chainFileId); @@ -214,7 +215,7 @@ else if (_vcfFileType.isType(f.getFile())) } File lifted = new File(outDir, baseName + ".lifted-" + targetGenomeId + ext); - File unmappedOutput = new File(outDir, baseName + ".unmapped-" + targetGenomeId + ext); + File unmappedOutput = doNotRetainUnmapped ? 
new File(outDir, baseName + ".unmapped-" + targetGenomeId + ext) : null; try { @@ -260,7 +261,11 @@ else if (_vcfFileType.isType(f.getFile())) ctx.addSequenceOutput(so1); } - if (!unmappedOutput.exists()) + if (unmappedOutput == null) + { + // skip + } + else if (!unmappedOutput.exists()) { job.getLogger().info("no unmapped intervals"); } From 99cabf0242b2ca4528ab89b2095f9af7d3eac149 Mon Sep 17 00:00:00 2001 From: bbimber Date: Sun, 16 Jun 2024 21:56:40 -0700 Subject: [PATCH 23/47] Support additional liftover params --- .../resources/web/SequenceAnalysis/window/LiftoverWindow.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js b/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js index 9a41148d5..15f6a5a6c 100644 --- a/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js +++ b/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js @@ -117,6 +117,11 @@ Ext4.define('SequenceAnalysis.window.LiftoverWindow', { itemId: 'useBcfTools', checked: false, fieldLabel: 'Use bcftools' + },{ + xtype: 'checkbox', + itemId: 'doNotRetainUnmapped', + checked: false, + fieldLabel: 'Do Not Retain Unmapped' }].concat(SequenceAnalysis.window.OutputHandlerWindow.getCfgForToolParameters(this.toolParameters)), buttons: [{ text: 'Submit', From 9f5f325c4425a3bd431dd3e8d4418b205ea570c6 Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 17 Jun 2024 16:53:47 -0700 Subject: [PATCH 24/47] Account for difference between hg19 and GRCh37 MT contigs --- SequenceAnalysis/pipeline_code/sequence_tools_install.sh | 8 ++++---- .../labkey/sequenceanalysis/analysis/LiftoverHandler.java | 2 +- .../labkey/sequenceanalysis/util/ChainFileValidator.java | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/SequenceAnalysis/pipeline_code/sequence_tools_install.sh b/SequenceAnalysis/pipeline_code/sequence_tools_install.sh index 558d131c7..9333860c7 100755 --- a/SequenceAnalysis/pipeline_code/sequence_tools_install.sh +++ b/SequenceAnalysis/pipeline_code/sequence_tools_install.sh @@ -486,10 +486,10 @@ then rm -Rf bcftools* rm -Rf $LKTOOLS_DIR/bcftools - wget $WGET_OPTS https://github.com/samtools/bcftools/releases/download/1.18/bcftools-1.18.tar.bz2 - tar xjvf bcftools-1.18.tar.bz2 - chmod 755 bcftools-1.18 - cd bcftools-1.18 + wget $WGET_OPTS https://github.com/samtools/bcftools/releases/download/1.20/bcftools-1.20.tar.bz2 + tar xjvf bcftools-1.20.tar.bz2 + chmod 755 bcftools-1.20 + cd bcftools-1.20 rm -f plugins/liftover.c wget $WGET_OPTS -P plugins https://raw.githubusercontent.com/freeseek/score/master/liftover.c diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java index fbe6d8e04..e986726b4 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java @@ -215,7 +215,7 @@ else if (_vcfFileType.isType(f.getFile())) } File lifted = new File(outDir, baseName + ".lifted-" + targetGenomeId + ext); - File unmappedOutput = doNotRetainUnmapped ? new File(outDir, baseName + ".unmapped-" + targetGenomeId + ext) : null; + File unmappedOutput = doNotRetainUnmapped ? 
null : new File(outDir, baseName + ".unmapped-" + targetGenomeId + ext); try { diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ChainFileValidator.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ChainFileValidator.java index 90978fcf0..9055c0cbe 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ChainFileValidator.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ChainFileValidator.java @@ -420,7 +420,8 @@ else if (refName.endsWith("_alt")) refName = StringUtils.join(Arrays.copyOfRange(tokens, 1, tokens.length), "_"); } - if (refName.equals("chrM")) + // NOTE: hg19 and GRCh37 have different MT contigs. chrM is the legacy version. + if (refName.equals("chrMT")) { return "MT"; } From c1c85c83ff709d408846dafc561a390ee7467414 Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 17 Jun 2024 17:58:16 -0700 Subject: [PATCH 25/47] Add chain_file default views --- .../sequenceanalysis/chain_files/.qview.xml | 16 ++++++++++++++++ .../chain_files/With Filepath.qview.xml | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/.qview.xml create mode 100644 SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/With Filepath.qview.xml diff --git a/SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/.qview.xml b/SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/.qview.xml new file mode 100644 index 000000000..6ad9c3493 --- /dev/null +++ b/SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/.qview.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/With Filepath.qview.xml b/SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/With Filepath.qview.xml new file mode 100644 index 000000000..d7d962cf2 --- /dev/null +++ b/SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/With Filepath.qview.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file From f99aba9e5ffbb2167fc2944720613fc78f7cacd7 Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 17 Jun 2024 20:59:21 -0700 Subject: [PATCH 26/47] Fix paraGRAPH typo --- .../labkey/sequenceanalysis/run/alignment/ParagraphStep.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index 76780d215..68cd280c3 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -15,7 +15,6 @@ import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper; import org.labkey.api.util.FileUtil; import org.labkey.sequenceanalysis.SequenceAnalysisModule; -import org.labkey.sequenceanalysis.run.variant.DepthOfCoverageHandler; import org.labkey.sequenceanalysis.util.SequenceUtil; import java.io.File; @@ -57,7 +56,7 @@ public boolean doRunLocal() @Override public SequenceOutputProcessor getProcessor() { - return new DepthOfCoverageHandler.Processor(); + return new Processor(); } public static class Processor implements SequenceOutputProcessor From 8bfdcebf016d960fcf9848714c3977f3807eeb5f Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 18 Jun 2024 06:19:20 -0700 Subject: [PATCH 27/47] Fix paraGRAPH / idxdepth args --- 
.../sequenceanalysis/run/alignment/ParagraphStep.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index 68cd280c3..b4d955df9 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -76,11 +76,12 @@ public void processFilesRemote(List inputFiles, JobContext c throw new PipelineJobException("Unable to find file: " + inputVCF.getPath()); } + Integer threads = SequencePipelineService.get().getMaxThreads(ctx.getLogger()); for (SequenceOutputFile so : inputFiles) { List depthArgs = new ArrayList<>(); depthArgs.add("idxdepth"); - depthArgs.add("-d"); + depthArgs.add("-b"); depthArgs.add(so.getFile().getPath()); File coverageFile = new File(ctx.getWorkingDirectory(), "coverage.txt"); @@ -90,6 +91,12 @@ public void processFilesRemote(List inputFiles, JobContext c depthArgs.add("-r"); depthArgs.add(ctx.getSequenceSupport().getCachedGenome(so.getLibrary_id()).getWorkingFastaFile().getPath()); + if (threads != null) + { + depthArgs.add("--threads"); + depthArgs.add(threads.toString()); + } + new SimpleScriptWrapper(ctx.getLogger()).execute(depthArgs); if (!coverageFile.exists()) @@ -137,7 +144,6 @@ else if (!svVcf.exists()) paragraphArgs.add("--scratch-dir"); paragraphArgs.add(SequencePipelineService.get().getJavaTempDir()); - Integer threads = SequencePipelineService.get().getMaxThreads(ctx.getLogger()); if (threads != null) { paragraphArgs.add("--threads"); From 53eadbc4679728f3aca4a4f092ccb5d7b2e4e44f Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 18 Jun 2024 14:33:23 -0700 Subject: [PATCH 28/47] Ensure bcftools liftover sorts VCF --- .../org/labkey/sequenceanalysis/analysis/LiftoverHandler.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java index e986726b4..dd63cc081 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java @@ -28,6 +28,7 @@ import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport; import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler; +import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep; import org.labkey.api.sequenceanalysis.run.SelectVariantsWrapper; import org.labkey.api.util.FileType; @@ -333,6 +334,8 @@ public void liftOverVcf(JobContext ctx, ReferenceGenome targetGenome, ReferenceG { LiftoverBcfToolsWrapper wrapper = new LiftoverBcfToolsWrapper(job.getLogger()); wrapper.doLiftover(currentVCF, chain, sourceGenome.getWorkingFastaFile(), targetGenome.getWorkingFastaFile(), unmappedOutput, output); + + SequencePipelineService.get().sortVcf(output, null, targetGenome.getSequenceDictionary(), ctx.getLogger()); } else { From a6a8d3dbe87bebae23a13ea47b464a1b176529b3 Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 18 Jun 2024 15:40:07 -0700 Subject: [PATCH 29/47] Resolve paragraph executables within PATH --- .../api/sequenceanalysis/run/AbstractCommandWrapper.java | 2 +- 
.../labkey/sequenceanalysis/run/alignment/ParagraphStep.java | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java index e18b9799e..2e3e84e2c 100644 --- a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java +++ b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java @@ -289,7 +289,7 @@ protected boolean isThrowNonZeroExits() return _throwNonZeroExits; } - protected static File resolveFileInPath(String exe, @Nullable String packageName, boolean throwIfNotFound) + public static File resolveFileInPath(String exe, @Nullable String packageName, boolean throwIfNotFound) { File fn; String path; diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index b4d955df9..8800bf6e2 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -12,6 +12,7 @@ import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler; import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor; +import org.labkey.api.sequenceanalysis.run.AbstractCommandWrapper; import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper; import org.labkey.api.util.FileUtil; import org.labkey.sequenceanalysis.SequenceAnalysisModule; @@ -109,7 +110,7 @@ public void processFilesRemote(List inputFiles, JobContext c // TNPRC-IB18 ../IB18.cram 29.77 150 List paragraphArgs = new ArrayList<>(); - paragraphArgs.add("multigrmpy.py"); + paragraphArgs.add(AbstractCommandWrapper.resolveFileInPath("multigrmpy.py", null, true).getPath()); paragraphArgs.add("--verbose"); File paragraphOut = new File(ctx.getWorkingDirectory(), FileUtil.getBaseName(so.getFile()) + ".paragraph.txt"); From 2fa209773d63d34a87df45f181e0e2e1ef92fd1a Mon Sep 17 00:00:00 2001 From: bbimber Date: Tue, 18 Jun 2024 17:28:53 -0700 Subject: [PATCH 30/47] Clean up paraGRAPH VCF code --- .../run/alignment/ParagraphStep.java | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index 8800bf6e2..06780cdc6 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -71,10 +71,20 @@ public void processFilesOnWebserver(PipelineJob job, SequenceAnalysisJobSupport @Override public void processFilesRemote(List inputFiles, JobContext ctx) throws UnsupportedOperationException, PipelineJobException { - File inputVCF = ctx.getSequenceSupport().getCachedData(ctx.getParams().getInt("svVCF")); - if (!inputVCF.exists()) + int svVcfId = ctx.getParams().optInt("svVCF", 0); + if (svVcfId == 0) { - throw new PipelineJobException("Unable to find file: " + inputVCF.getPath()); + throw new PipelineJobException("svVCF param was null"); + } + + File svVcf = ctx.getSequenceSupport().getCachedData(svVcfId); + if (svVcf == null) + { + throw new PipelineJobException("File not found for ID: " 
+ svVcfId); + } + else if (!svVcf.exists()) + { + throw new PipelineJobException("Missing file: " + svVcf.getPath()); } Integer threads = SequencePipelineService.get().getMaxThreads(ctx.getLogger()); @@ -117,22 +127,6 @@ public void processFilesRemote(List inputFiles, JobContext c paragraphArgs.add("-o"); paragraphArgs.add(paragraphOut.getPath()); - int svVcfId = ctx.getParams().optInt("svVCF"); - if (svVcfId == 0) - { - throw new PipelineJobException("Missing svVCF ID"); - } - - File svVcf = ctx.getSequenceSupport().getCachedData(svVcfId); - if (svVcf == null) - { - throw new PipelineJobException("File not found for ID: " + svVcfId); - } - else if (!svVcf.exists()) - { - throw new PipelineJobException("Missing file: " + svVcf.getPath()); - } - paragraphArgs.add("-i"); paragraphArgs.add(svVcf.getPath()); From 65bdfe54cfeb6a1d68760f71194524f0353c991b Mon Sep 17 00:00:00 2001 From: bbimber Date: Wed, 19 Jun 2024 05:21:08 -0700 Subject: [PATCH 31/47] Fix handling of idxdepth/paraGRAPH coverage --- .../run/alignment/ParagraphStep.java | 42 +++++++++++++++++-- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index 06780cdc6..cb92bd051 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -1,5 +1,9 @@ package org.labkey.sequenceanalysis.run.alignment; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import org.apache.commons.io.FileUtils; import org.json.JSONObject; import org.labkey.api.module.ModuleLoader; import org.labkey.api.pipeline.PipelineJob; @@ -15,11 +19,14 @@ import org.labkey.api.sequenceanalysis.run.AbstractCommandWrapper; import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper; import org.labkey.api.util.FileUtil; +import org.labkey.api.writer.PrintWriters; import org.labkey.sequenceanalysis.SequenceAnalysisModule; import org.labkey.sequenceanalysis.util.SequenceUtil; import java.io.File; import java.io.IOException; +import java.io.PrintWriter; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -95,9 +102,9 @@ else if (!svVcf.exists()) depthArgs.add("-b"); depthArgs.add(so.getFile().getPath()); - File coverageFile = new File(ctx.getWorkingDirectory(), "coverage.txt"); + File coverageJson = new File(ctx.getWorkingDirectory(), "coverage.json"); depthArgs.add("-o"); - depthArgs.add(coverageFile.getPath()); + depthArgs.add(coverageJson.getPath()); depthArgs.add("-r"); depthArgs.add(ctx.getSequenceSupport().getCachedGenome(so.getLibrary_id()).getWorkingFastaFile().getPath()); @@ -110,14 +117,41 @@ else if (!svVcf.exists()) new SimpleScriptWrapper(ctx.getLogger()).execute(depthArgs); - if (!coverageFile.exists()) + if (!coverageJson.exists()) { - throw new PipelineJobException("Missing file: " + coverageFile.getPath()); + throw new PipelineJobException("Missing file: " + coverageJson.getPath()); } + ctx.getFileManager().addIntermediateFile(coverageJson); // Should produce a simple text file: // id path depth read length // TNPRC-IB18 ../IB18.cram 29.77 150 + File coverageFile = new File(ctx.getWorkingDirectory(), "coverage.txt"); + try (PrintWriter writer = PrintWriters.getPrintWriter(coverageFile); SamReader reader = 
SamReaderFactory.makeDefault().open(so.getFile())) + { + SAMFileHeader header = reader.getFileHeader(); + if (header.getReadGroups().size() == 0) + { + throw new PipelineJobException("No read groups found in input BAM"); + } + else if (header.getReadGroups().size() > 1) + { + throw new PipelineJobException("More than one read group found in BAM"); + } + + String rgId = header.getReadGroups().get(0).getSample(); + + JSONObject json = new JSONObject(FileUtils.readFileToString(coverageFile, Charset.defaultCharset())); + writer.println("id\tpath\tdepth\tread length"); + double depth = json.getJSONObject("autosome").getDouble("depth"); + double readLength = json.getInt("read_length"); + writer.println(rgId + "\t" + so.getFile().getPath() + "\t" + depth + "\t" + readLength); + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + ctx.getFileManager().addIntermediateFile(coverageFile); List paragraphArgs = new ArrayList<>(); paragraphArgs.add(AbstractCommandWrapper.resolveFileInPath("multigrmpy.py", null, true).getPath()); From 15f6fc24570c1072ea76afb93d3fc69d3e15b8cd Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 20 Jun 2024 20:19:41 -0700 Subject: [PATCH 32/47] Bugfix for SampleRenameStep without intervals --- .../labkey/sequenceanalysis/run/variant/SampleRenameStep.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SampleRenameStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SampleRenameStep.java index 46ffae773..6af568325 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SampleRenameStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SampleRenameStep.java @@ -27,6 +27,7 @@ import java.io.File; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -110,7 +111,7 @@ else if (enforceChangeAll) List queryIntervals = intervals; if (queryIntervals == null || queryIntervals.isEmpty()) { - queryIntervals.add(null); + queryIntervals = Collections.singletonList(null); } int i = 0; From 82ecaa85446f302f34ede27e2a1c4e9dcf4bffb2 Mon Sep 17 00:00:00 2001 From: bbimber Date: Thu, 20 Jun 2024 20:22:28 -0700 Subject: [PATCH 33/47] Bugfix for ParagraphStep --- .../labkey/sequenceanalysis/run/alignment/ParagraphStep.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index cb92bd051..20819c231 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -141,7 +141,7 @@ else if (header.getReadGroups().size() > 1) String rgId = header.getReadGroups().get(0).getSample(); - JSONObject json = new JSONObject(FileUtils.readFileToString(coverageFile, Charset.defaultCharset())); + JSONObject json = new JSONObject(FileUtils.readFileToString(coverageJson, Charset.defaultCharset())); writer.println("id\tpath\tdepth\tread length"); double depth = json.getJSONObject("autosome").getDouble("depth"); double readLength = json.getInt("read_length"); From 53d362d9d0843f8d9fa9bf1a907757eea638ef00 Mon Sep 17 00:00:00 2001 From: Sebastian Benjamin Date: Thu, 20 Jun 2024 11:54:16 -0700 Subject: [PATCH 34/47] Change sorting field type parsing to Long 
instead of Int --- jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java b/jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java index 1b2780f01..a3b1a1a16 100644 --- a/jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java +++ b/jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java @@ -194,7 +194,7 @@ public JSONObject doSearch(User u, String searchString, final int pageSize, fina pointsConfigMap.put(field, doublePointsConfig); } case Integer -> { - numericQueryParserFields.put(field, SortField.Type.INT); + numericQueryParserFields.put(field, SortField.Type.LONG); pointsConfigMap.put(field, intPointsConfig); } } From 881d11dedec75736a58b2e3acd675942f9b3b46f Mon Sep 17 00:00:00 2001 From: bbimber Date: Sat, 22 Jun 2024 21:17:39 -0700 Subject: [PATCH 35/47] Switch ParagraphStep to use docker --- .../sequenceanalysis/run/DockerWrapper.java | 107 ++++++++++++++++++ .../run/alignment/ParagraphStep.java | 25 ++-- 2 files changed, 123 insertions(+), 9 deletions(-) create mode 100644 SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/DockerWrapper.java diff --git a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/DockerWrapper.java b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/DockerWrapper.java new file mode 100644 index 000000000..e37a98ff9 --- /dev/null +++ b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/DockerWrapper.java @@ -0,0 +1,107 @@ +package org.labkey.api.sequenceanalysis.run; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.logging.log4j.Logger; +import org.labkey.api.pipeline.PipelineJobException; +import org.labkey.api.sequenceanalysis.pipeline.PipelineOutputTracker; +import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; +import org.labkey.api.writer.PrintWriters; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.List; + +public class DockerWrapper extends AbstractCommandWrapper +{ + private final String _containerName; + private File _tmpDir = null; + + public DockerWrapper(String containerName, Logger log) + { + super(log); + _containerName = containerName; + } + + public void setTmpDir(File tmpDir) + { + _tmpDir = tmpDir; + } + + public void execute(List containerArgs, File workDir, PipelineOutputTracker tracker) throws PipelineJobException + { + File localBashScript = new File(workDir, "docker.sh"); + File dockerBashScript = new File(workDir, "dockerRun.sh"); + tracker.addIntermediateFile(localBashScript); + tracker.addIntermediateFile(dockerBashScript); + + setWorkingDir(workDir); + try (PrintWriter writer = PrintWriters.getPrintWriter(localBashScript); PrintWriter dockerWriter = PrintWriters.getPrintWriter(dockerBashScript)) + { + writer.println("#!/bin/bash"); + writer.println("set -x"); + writer.println("WD=`pwd`"); + writer.println("HOME=`echo ~/`"); + writer.println("DOCKER='" + SequencePipelineService.get().getDockerCommand() + "'"); + writer.println("sudo $DOCKER pull " + _containerName); + writer.println("sudo $DOCKER run --rm=true \\"); + writer.println("\t-v \"${WD}:/work\" \\"); + writer.println("\t-v \"${HOME}:/homeDir\" \\"); + if (_tmpDir != null) + { + writer.println("\t-v \"" + _tmpDir.getPath() + ":/tmp\" \\"); + } + writer.println("\t--entrypoint /bin/bash \\"); + writer.println("\t-w /work \\"); + Integer maxRam = 
SequencePipelineService.get().getMaxRam(); + if (maxRam != null) + { + writer.println("\t-e SEQUENCEANALYSIS_MAX_RAM=" + maxRam + " \\"); + writer.println("\t--memory='" + maxRam + "g' \\"); + } + writer.println("\t" + _containerName + " \\"); + writer.println("\t/work/" + dockerBashScript.getName()); + writer.println("EXIT_CODE=$?"); + writer.println("echo 'Docker run exit code: '$EXIT_CODE"); + writer.println("exit $EXIT_CODE"); + + dockerWriter.println("#!/bin/bash"); + dockerWriter.println("set -x"); + dockerWriter.println(StringUtils.join(containerArgs, " ")); + dockerWriter.println("EXIT_CODE=$?"); + dockerWriter.println("echo 'Exit code: '$?"); + dockerWriter.println("exit $EXIT_CODE"); + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + } + + public File ensureLocalCopy(File input, File workingDirectory, PipelineOutputTracker output) throws PipelineJobException + { + try + { + if (workingDirectory.equals(input.getParentFile())) + { + return input; + } + + File local = new File(workingDirectory, input.getName()); + if (!local.exists()) + { + getLogger().debug("Copying file locally: " + input.getPath()); + FileUtils.copyFile(input, local); + } + + output.addIntermediateFile(local); + + return local; + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + } +} diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index 20819c231..f84bc89d5 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -16,7 +16,7 @@ import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler; import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor; -import org.labkey.api.sequenceanalysis.run.AbstractCommandWrapper; +import org.labkey.api.sequenceanalysis.run.DockerWrapper; import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper; import org.labkey.api.util.FileUtil; import org.labkey.api.writer.PrintWriters; @@ -153,25 +153,32 @@ else if (header.getReadGroups().size() > 1) } ctx.getFileManager().addIntermediateFile(coverageFile); + DockerWrapper dockerWrapper = new DockerWrapper("ghcr.io/bimberlabinternal/paragraph:latest", ctx.getLogger()); List paragraphArgs = new ArrayList<>(); - paragraphArgs.add(AbstractCommandWrapper.resolveFileInPath("multigrmpy.py", null, true).getPath()); + paragraphArgs.add("/opt/paragraph/bin/multigrmpy.py"); + paragraphArgs.add("--verbose"); File paragraphOut = new File(ctx.getWorkingDirectory(), FileUtil.getBaseName(so.getFile()) + ".paragraph.txt"); paragraphArgs.add("-o"); - paragraphArgs.add(paragraphOut.getPath()); + paragraphArgs.add("/work/" + paragraphOut.getName()); paragraphArgs.add("-i"); - paragraphArgs.add(svVcf.getPath()); + dockerWrapper.ensureLocalCopy(svVcf, ctx.getWorkingDirectory(), ctx.getFileManager()); + paragraphArgs.add("/work/" + svVcf.getName()); paragraphArgs.add("-m"); - paragraphArgs.add(coverageFile.getPath()); + paragraphArgs.add("/work/" + coverageFile.getName()); paragraphArgs.add("-r"); - paragraphArgs.add(ctx.getSequenceSupport().getCachedGenome(so.getLibrary_id()).getWorkingFastaFile().getPath()); + File genomeFasta = ctx.getSequenceSupport().getCachedGenome(so.getLibrary_id()).getWorkingFastaFile(); + dockerWrapper.ensureLocalCopy(genomeFasta, 
ctx.getWorkingDirectory(), ctx.getFileManager()); + dockerWrapper.ensureLocalCopy(new File(genomeFasta.getPath() + ".fai"), ctx.getWorkingDirectory(), ctx.getFileManager()); + paragraphArgs.add("/work/" + genomeFasta.getName()); paragraphArgs.add("--scratch-dir"); - paragraphArgs.add(SequencePipelineService.get().getJavaTempDir()); + paragraphArgs.add("/tmp"); + dockerWrapper.setTmpDir(new File(SequencePipelineService.get().getJavaTempDir())); if (threads != null) { @@ -180,9 +187,9 @@ else if (header.getReadGroups().size() > 1) } paragraphArgs.add("--logfile"); - paragraphArgs.add(new File(ctx.getWorkingDirectory(), "paragraph.log").getPath()); + paragraphArgs.add(new File("/work/paragraph.log").getPath()); - new SimpleScriptWrapper(ctx.getLogger()).execute(paragraphArgs); + dockerWrapper.execute(paragraphArgs, ctx.getWorkingDirectory()); File genotypes = new File(ctx.getWorkingDirectory(), "genotypes.vcf.gz"); if (!genotypes.exists()) From ae1ec6d4c94f57f7dfc864a61a8309582c136f4d Mon Sep 17 00:00:00 2001 From: bbimber Date: Sun, 23 Jun 2024 05:17:30 -0700 Subject: [PATCH 36/47] Update ParagraphStep/docker execution --- .../org/labkey/api/sequenceanalysis/run/DockerWrapper.java | 5 ++++- .../labkey/sequenceanalysis/run/alignment/ParagraphStep.java | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/DockerWrapper.java b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/DockerWrapper.java index e37a98ff9..245274230 100644 --- a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/DockerWrapper.java +++ b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/DockerWrapper.java @@ -11,6 +11,7 @@ import java.io.File; import java.io.IOException; import java.io.PrintWriter; +import java.util.Arrays; import java.util.List; public class DockerWrapper extends AbstractCommandWrapper @@ -29,7 +30,7 @@ public void setTmpDir(File tmpDir) _tmpDir = tmpDir; } - public void execute(List containerArgs, File workDir, PipelineOutputTracker tracker) throws PipelineJobException + public void executeWithDocker(List containerArgs, File workDir, PipelineOutputTracker tracker) throws PipelineJobException { File localBashScript = new File(workDir, "docker.sh"); File dockerBashScript = new File(workDir, "dockerRun.sh"); @@ -77,6 +78,8 @@ public void execute(List containerArgs, File workDir, PipelineOutputTrac { throw new PipelineJobException(e); } + + execute(Arrays.asList("/bin/bash", localBashScript.getPath())); } public File ensureLocalCopy(File input, File workingDirectory, PipelineOutputTracker output) throws PipelineJobException diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index f84bc89d5..1d2db665b 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -189,7 +189,7 @@ else if (header.getReadGroups().size() > 1) paragraphArgs.add("--logfile"); paragraphArgs.add(new File("/work/paragraph.log").getPath()); - dockerWrapper.execute(paragraphArgs, ctx.getWorkingDirectory()); + dockerWrapper.executeWithDocker(paragraphArgs, ctx.getWorkingDirectory(), ctx.getFileManager()); File genotypes = new File(ctx.getWorkingDirectory(), "genotypes.vcf.gz"); if (!genotypes.exists()) From bf37e4842b6b8f75b1bae653eda14892e7f2ff0b Mon Sep 17 00:00:00 2001 From: bbimber 
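Taken together, the docker patches above change the calling convention: inputs are staged next to the generated scripts and referenced by their in-container /work paths. A rough usage sketch (log, workDir, tracker and inputVcf are hypothetical locals; the wrapper API and mount layout come from the diffs above):

    // Hypothetical caller: 'log', 'workDir', 'tracker' and 'inputVcf' are assumed
    // to exist in the surrounding pipeline step.
    DockerWrapper wrapper = new DockerWrapper("ghcr.io/bimberlabinternal/paragraph:latest", log);
    wrapper.setTmpDir(new File(SequencePipelineService.get().getJavaTempDir()));

    // Inputs must sit in the bind-mounted working directory, visible as /work in-container:
    File localVcf = wrapper.ensureLocalCopy(inputVcf, workDir, tracker);

    // Writes docker.sh (host side) and dockerRun.sh (container side), then runs them via /bin/bash:
    wrapper.executeWithDocker(List.of(
            "/opt/paragraph/bin/multigrmpy.py",
            "-i", "/work/" + localVcf.getName()), workDir, tracker);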
Date: Sun, 23 Jun 2024 07:10:26 -0700 Subject: [PATCH 37/47] Update ParagraphStep/docker to include VCF index --- .../org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java | 1 + 1 file changed, 1 insertion(+) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index 1d2db665b..4a3a9ff7b 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -165,6 +165,7 @@ else if (header.getReadGroups().size() > 1) paragraphArgs.add("-i"); dockerWrapper.ensureLocalCopy(svVcf, ctx.getWorkingDirectory(), ctx.getFileManager()); + dockerWrapper.ensureLocalCopy(new File(svVcf.getPath() + ".tbi"), ctx.getWorkingDirectory(), ctx.getFileManager()); paragraphArgs.add("/work/" + svVcf.getName()); paragraphArgs.add("-m"); From 11e89d358f62c0864f374b86cb42c8fe2622cb63 Mon Sep 17 00:00:00 2001 From: bbimber Date: Sun, 23 Jun 2024 08:26:56 -0700 Subject: [PATCH 38/47] Paragraph: log to stderr instead of separate file --- .../labkey/sequenceanalysis/run/alignment/ParagraphStep.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index 4a3a9ff7b..b7c86b789 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -187,9 +187,6 @@ else if (header.getReadGroups().size() > 1) paragraphArgs.add(threads.toString()); } - paragraphArgs.add("--logfile"); - paragraphArgs.add(new File("/work/paragraph.log").getPath()); - dockerWrapper.executeWithDocker(paragraphArgs, ctx.getWorkingDirectory(), ctx.getFileManager()); File genotypes = new File(ctx.getWorkingDirectory(), "genotypes.vcf.gz"); From 39623329c74205a08c26e91d01e8b21a392031f1 Mon Sep 17 00:00:00 2001 From: bbimber Date: Sun, 23 Jun 2024 09:10:45 -0700 Subject: [PATCH 39/47] Paragraph: drop verbose --- .../labkey/sequenceanalysis/run/alignment/ParagraphStep.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index b7c86b789..007368ce9 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -157,8 +157,6 @@ else if (header.getReadGroups().size() > 1) List paragraphArgs = new ArrayList<>(); paragraphArgs.add("/opt/paragraph/bin/multigrmpy.py"); - paragraphArgs.add("--verbose"); - File paragraphOut = new File(ctx.getWorkingDirectory(), FileUtil.getBaseName(so.getFile()) + ".paragraph.txt"); paragraphArgs.add("-o"); paragraphArgs.add("/work/" + paragraphOut.getName()); From 2df816e21e3a2508aa927c807a52d1099fa0c549 Mon Sep 17 00:00:00 2001 From: bbimber Date: Sun, 23 Jun 2024 09:35:46 -0700 Subject: [PATCH 40/47] Paragraph: simplify output --- .../run/alignment/ParagraphStep.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index 
007368ce9..1a126204b 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -127,6 +127,7 @@ else if (!svVcf.exists()) // id path depth read length // TNPRC-IB18 ../IB18.cram 29.77 150 File coverageFile = new File(ctx.getWorkingDirectory(), "coverage.txt"); + String rgId = null; try (PrintWriter writer = PrintWriters.getPrintWriter(coverageFile); SamReader reader = SamReaderFactory.makeDefault().open(so.getFile())) { SAMFileHeader header = reader.getFileHeader(); @@ -139,13 +140,13 @@ else if (header.getReadGroups().size() > 1) throw new PipelineJobException("More than one read group found in BAM"); } - String rgId = header.getReadGroups().get(0).getSample(); + rgId = header.getReadGroups().get(0).getSample(); JSONObject json = new JSONObject(FileUtils.readFileToString(coverageJson, Charset.defaultCharset())); writer.println("id\tpath\tdepth\tread length"); double depth = json.getJSONObject("autosome").getDouble("depth"); double readLength = json.getInt("read_length"); - writer.println(rgId + "\t" + so.getFile().getPath() + "\t" + depth + "\t" + readLength); + writer.println(rgId + "\t" + "/work/" + so.getFile().getName() + "\t" + depth + "\t" + readLength); } catch (IOException e) { @@ -157,9 +158,12 @@ else if (header.getReadGroups().size() > 1) List paragraphArgs = new ArrayList<>(); paragraphArgs.add("/opt/paragraph/bin/multigrmpy.py"); - File paragraphOut = new File(ctx.getWorkingDirectory(), FileUtil.getBaseName(so.getFile()) + ".paragraph.txt"); + dockerWrapper.ensureLocalCopy(so.getFile(), ctx.getWorkingDirectory(), ctx.getFileManager()); + dockerWrapper.ensureLocalCopy(SequenceAnalysisService.get().getExpectedBamOrCramIndex(so.getFile()), ctx.getWorkingDirectory(), ctx.getFileManager()); + + File paragraphOutDir = new File(ctx.getWorkingDirectory(), FileUtil.getBaseName(so.getFile())); paragraphArgs.add("-o"); - paragraphArgs.add("/work/" + paragraphOut.getName()); + paragraphArgs.add("/work/" + paragraphOutDir.getName()); paragraphArgs.add("-i"); dockerWrapper.ensureLocalCopy(svVcf, ctx.getWorkingDirectory(), ctx.getFileManager()); @@ -187,7 +191,7 @@ else if (header.getReadGroups().size() > 1) dockerWrapper.executeWithDocker(paragraphArgs, ctx.getWorkingDirectory(), ctx.getFileManager()); - File genotypes = new File(ctx.getWorkingDirectory(), "genotypes.vcf.gz"); + File genotypes = new File(paragraphOutDir, "genotypes.vcf.gz"); if (!genotypes.exists()) { throw new PipelineJobException("Missing file: " + genotypes.getPath()); From f3192f66df0fbe432ffdcc8341441576dc84a05a Mon Sep 17 00:00:00 2001 From: bbimber Date: Sun, 23 Jun 2024 19:35:35 -0700 Subject: [PATCH 41/47] Paragraph: perform more cleanup --- .../labkey/sequenceanalysis/run/alignment/ParagraphStep.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index 1a126204b..c1759fcd2 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -207,6 +207,11 @@ else if (header.getReadGroups().size() > 1) } ctx.getFileManager().addSequenceOutput(genotypes, "paraGRAPH Genotypes: " + so.getName(), "paraGRAPH Genoypes", so.getReadset(), null, so.getLibrary_id(), "Input VCF: " + svVcf.getName() + " (" + 
svVcfId + ")"); + + ctx.getFileManager().addIntermediateFile(new File(paragraphOutDir, "variants.json.gz")); + ctx.getFileManager().addIntermediateFile(new File(paragraphOutDir, "variants.vcf.gz")); + ctx.getFileManager().addIntermediateFile(new File(paragraphOutDir, "genotypes.json.gz")); + ctx.getFileManager().addIntermediateFile(new File(paragraphOutDir, "grmpy.log")); } } } From 6b4a6f3c6bba975a3daa641b804a58147844e473 Mon Sep 17 00:00:00 2001 From: bbimber Date: Fri, 5 Jul 2024 12:05:24 -0700 Subject: [PATCH 42/47] Support min cells to keep param for TcrFilter --- singlecell/resources/chunks/TcrFilter.R | 3 ++- .../labkey/singlecell/pipeline/singlecell/TcrFilter.java | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/singlecell/resources/chunks/TcrFilter.R b/singlecell/resources/chunks/TcrFilter.R index db54a0308..23ca33b88 100644 --- a/singlecell/resources/chunks/TcrFilter.R +++ b/singlecell/resources/chunks/TcrFilter.R @@ -26,7 +26,8 @@ for (datasetId in names(seuratObjects)) { return(length(intersect(values, cdr3ForLocus)) != 0) }) - if (sum(matchingCells) == 0) { + if (sum(matchingCells) <= thresholdToKeep) { + print(paste0('Too few cells, skipping: ', datasetId)) next } diff --git a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/TcrFilter.java b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/TcrFilter.java index f346ebf96..bb7f97289 100644 --- a/singlecell/src/org/labkey/singlecell/pipeline/singlecell/TcrFilter.java +++ b/singlecell/src/org/labkey/singlecell/pipeline/singlecell/TcrFilter.java @@ -21,12 +21,15 @@ public static class Provider extends AbstractPipelineStepProvider Date: Sat, 6 Jul 2024 07:26:58 -0700 Subject: [PATCH 43/47] Bugfix to TcrFilter thresholdToKeep param --- singlecell/resources/chunks/TcrFilter.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/singlecell/resources/chunks/TcrFilter.R b/singlecell/resources/chunks/TcrFilter.R index 23ca33b88..f16a1c03f 100644 --- a/singlecell/resources/chunks/TcrFilter.R +++ b/singlecell/resources/chunks/TcrFilter.R @@ -26,8 +26,8 @@ for (datasetId in names(seuratObjects)) { return(length(intersect(values, cdr3ForLocus)) != 0) }) - if (sum(matchingCells) <= thresholdToKeep) { - print(paste0('Too few cells, skipping: ', datasetId)) + if (sum(matchingCells) == 0) { + print(paste0('No matching cells, skipping: ', datasetId)) next } @@ -41,6 +41,8 @@ for (datasetId in names(seuratObjects)) { if (all(is.null(cellsToKeep))) { print('There were no matching cells') + } else if (length(cellsToKeep) <= thresholdToKeep) { + print(paste0('Too few cells, skipping: ', datasetId)) } else { print(paste0('Total passing cells: ', length(cellsToKeep))) seuratObj <- subset(seuratObj, cells = cellsToKeep) From ae89008c0e98a7c07bee5184d4f3625afd213788 Mon Sep 17 00:00:00 2001 From: bbimber Date: Sat, 6 Jul 2024 10:10:41 -0700 Subject: [PATCH 44/47] Add check to MergeSeurat for single-cell objects --- singlecell/resources/chunks/MergeSeurat.R | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/singlecell/resources/chunks/MergeSeurat.R b/singlecell/resources/chunks/MergeSeurat.R index 3e18dd65c..8b0d72d7c 100644 --- a/singlecell/resources/chunks/MergeSeurat.R +++ b/singlecell/resources/chunks/MergeSeurat.R @@ -15,6 +15,11 @@ mergeBatch <- function(dat) { } else { toMerge[[datasetId]] <- readSeuratRDS(dat[[datasetId]]) } + + if (ncol(toMerge[[datasetId]]) == 1) { + logger::log_info(paste0('Dataset has single cell, skipping: ', datasetId)) + toMerge[[datasetId]] <- NULL + } 
} if (!is.null(assaysToDrop)) { @@ -28,6 +33,10 @@ mergeBatch <- function(dat) { } } + if (length(toMerge) == 0) { + stop('There were no passing seurat objects!') + } + seuratObj <- CellMembrane::MergeSeuratObjs(toMerge, projectName = projectName, doGC = doDiet, errorOnBarcodeSuffix = errorOnBarcodeSuffix) return(seuratObj) } @@ -61,6 +70,7 @@ if (length(seuratObjects) == 1) { unlink(mergedObjectFiles[[1]]) } else { logger::log_info('performing final merge') + # TODO: check for single cell in object seuratObj <- readRDS(mergedObjectFiles[[1]]) unlink(mergedObjectFiles[[1]]) From 02cca7c37162810e916125525dc59893a5eafaea Mon Sep 17 00:00:00 2001 From: bbimber Date: Sat, 6 Jul 2024 11:36:03 -0700 Subject: [PATCH 45/47] Improve pipeline logging --- singlecell/resources/chunks/Functions.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/singlecell/resources/chunks/Functions.R b/singlecell/resources/chunks/Functions.R index fc32beb2a..622e88e48 100644 --- a/singlecell/resources/chunks/Functions.R +++ b/singlecell/resources/chunks/Functions.R @@ -70,7 +70,7 @@ file.create(trackerFile) print(paste0('Total lines in ', trackerFile, ' on job start:', length(readLines(trackerFile)))) saveData <- function(seuratObj, datasetId) { - message(paste0('Saving dataset: ', datasetId, ' with ', ncol(seuratObj), ' cells')) + logger::log_info(paste0('Saving dataset: ', datasetId, ' with ', ncol(seuratObj), ' cells')) print(paste0('Saving dataset: ', datasetId)) print(seuratObj) @@ -78,9 +78,8 @@ saveData <- function(seuratObj, datasetId) { datasetIdForFile <- makeLegalFileName(datasetId) fn <- paste0(outputPrefix, '.', datasetIdForFile, '.seurat.rds') - message(paste0('Filename: ', fn)) - message(paste0('Saving RDS file: ', fn, ' with ', ncol(seuratObj), ' cells')) + logger::log_info(paste0('Saving RDS file: ', fn, ' with ', ncol(seuratObj), ' cells')) barcodeFile <- paste0(outputPrefix, '.', datasetIdForFile, '.cellBarcodes.csv') metaFile <- paste0(outputPrefix, '.', datasetIdForFile, '.seurat.meta.txt') From da774b27d9d820441792b076aeeadbb90387b083 Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 8 Jul 2024 05:54:22 -0700 Subject: [PATCH 46/47] Make ParagraphStep split jobs --- .../sequenceanalysis/run/alignment/ParagraphStep.java | 6 ++++++ .../singlecell/analysis/AbstractSingleCellHandler.java | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java index c1759fcd2..07636dfef 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java @@ -43,6 +43,12 @@ public ParagraphStep() )); } + @Override + public boolean doSplitJobs() + { + return true; + } + @Override public boolean canProcess(SequenceOutputFile o) { diff --git a/singlecell/src/org/labkey/singlecell/analysis/AbstractSingleCellHandler.java b/singlecell/src/org/labkey/singlecell/analysis/AbstractSingleCellHandler.java index c8ab0caf6..359cc292b 100644 --- a/singlecell/src/org/labkey/singlecell/analysis/AbstractSingleCellHandler.java +++ b/singlecell/src/org/labkey/singlecell/analysis/AbstractSingleCellHandler.java @@ -1063,7 +1063,7 @@ else if ("NotUsed".equals(val)) pf.setMaximumFractionDigits(2); NumberFormat decimal = DecimalFormat.getNumberInstance(); - decimal.setGroupingUsed(false); + decimal.setGroupingUsed(true); 
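// Aside (illustrative sketch, not from the patch): enabling grouping makes
// NumberFormat insert locale-specific separators, so the counts reported below
// render as, e.g., "1,234,567" rather than "1234567":
//   NumberFormat nf = NumberFormat.getNumberInstance(Locale.US);
//   nf.setGroupingUsed(true);
//   String s = nf.format(1234567); // "1,234,567"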
descriptions.add("Total Cells: " + decimal.format(totalCells)); if (hashingUsed) From bb92e7cf8691e92bd2fbc2ba396a1aa3c7c4ae07 Mon Sep 17 00:00:00 2001 From: hextraza Date: Thu, 11 Jul 2024 07:38:42 -0700 Subject: [PATCH 47/47] Caching fix (#282) * Cache the JBrowse/Lucene indexSearcher --------- Co-authored-by: Sebastian Benjamin Co-authored-by: bbimber --- .../labkey/api/jbrowse/JBrowseService.java | 2 + .../labkey/jbrowse/JBrowseLuceneSearch.java | 465 +++++++++++++----- .../src/org/labkey/jbrowse/JBrowseModule.java | 3 + .../labkey/jbrowse/JBrowseServiceImpl.java | 6 + 4 files changed, 346 insertions(+), 130 deletions(-) diff --git a/jbrowse/api-src/org/labkey/api/jbrowse/JBrowseService.java b/jbrowse/api-src/org/labkey/api/jbrowse/JBrowseService.java index 513adb923..4e14293a1 100644 --- a/jbrowse/api-src/org/labkey/api/jbrowse/JBrowseService.java +++ b/jbrowse/api-src/org/labkey/api/jbrowse/JBrowseService.java @@ -56,4 +56,6 @@ public interface LuceneIndexDetector boolean isAvailable(Container c); } + + abstract public void clearLuceneCacheEntry(File luceneIndexDir); } diff --git a/jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java b/jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java index a3b1a1a16..f886d1c86 100644 --- a/jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java +++ b/jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java @@ -19,7 +19,6 @@ import org.apache.lucene.search.LRUQueryCache; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.QueryCache; import org.apache.lucene.search.QueryCachingPolicy; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; @@ -27,11 +26,14 @@ import org.apache.lucene.search.UsageTrackingQueryCachingPolicy; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; +import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import org.json.JSONArray; import org.json.JSONObject; import org.labkey.api.cache.Cache; +import org.labkey.api.cache.CacheLoader; import org.labkey.api.cache.CacheManager; +import org.labkey.api.cache.TrackingCache; import org.labkey.api.data.Container; import org.labkey.api.data.ContainerManager; import org.labkey.api.jbrowse.AbstractJBrowseFieldCustomizer; @@ -40,6 +42,8 @@ import org.labkey.api.module.ModuleLoader; import org.labkey.api.security.User; import org.labkey.api.settings.AppProps; +import org.labkey.api.util.Filter; +import org.labkey.api.util.ShutdownListener; import org.labkey.api.util.logging.LogHelper; import org.labkey.jbrowse.model.JBrowseSession; import org.labkey.jbrowse.model.JsonFile; @@ -56,6 +60,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.StringTokenizer; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -76,7 +81,7 @@ public class JBrowseLuceneSearch private static final int maxCachedQueries = 1000; private static final long maxRamBytesUsed = 250 * 1024 * 1024L; - private static final Cache _cache = CacheManager.getStringKeyCache(1000, CacheManager.UNLIMITED, "JBrowseLuceneSearchCache"); + private static final Cache _cache = new LuceneIndexCache(); private JBrowseLuceneSearch(final JBrowseSession session, final JsonFile jsonFile, User u) { @@ -97,18 +102,36 @@ public static JBrowseLuceneSearch create(String sessionId, String trackId, User return new JBrowseLuceneSearch(session, getTrack(session, trackId, u), u); } - private static synchronized 
QueryCache getCacheForSession(String trackObjectId) { - LRUQueryCache qc = _cache.get(trackObjectId); - if (qc == null) + private static synchronized CacheEntry getCacheEntryForSession(String trackObjectId, File indexPath) throws IOException { + CacheEntry cacheEntry = _cache.get(trackObjectId); + + // Open directory of lucene path, get a directory reader, and create the index search manager + if (cacheEntry == null) { - qc = new LRUQueryCache(maxCachedQueries, maxRamBytesUsed); - _cache.put(trackObjectId, qc); + try + { + Directory indexDirectory = FSDirectory.open(indexPath.toPath()); + LRUQueryCache queryCache = new LRUQueryCache(maxCachedQueries, maxRamBytesUsed); + IndexReader indexReader = DirectoryReader.open(indexDirectory); + IndexSearcher indexSearcher = new IndexSearcher(indexReader); + indexSearcher.setQueryCache(queryCache); + indexSearcher.setQueryCachingPolicy(new ForceMatchAllDocsCachingPolicy()); + cacheEntry = new CacheEntry(queryCache, indexSearcher, indexPath); + _cache.put(trackObjectId, cacheEntry); + } + catch (Exception e) + { + _log.error("Error creating jbrowse/lucene index reader for: " + trackObjectId, e); + + throw new IllegalStateException("Error creating search index reader for: " + trackObjectId); + } } - return qc; + return cacheEntry; } - private String templateReplace(final String searchString) { + private String templateReplace(final String searchString) + { String result = searchString; Pattern pattern = Pattern.compile("~(.*?)~"); Matcher matcher = pattern.matcher(searchString); @@ -128,25 +151,34 @@ private String templateReplace(final String searchString) { return result; } - private String tryUrlDecode(String input) { - try { + private String tryUrlDecode(String input) + { + try + { //special case for urls containing +; this isn't necessary for strings sent from the client-side, but URLs //sent via unit tests autodecode, and strings containing + rather than the URL-encoded symbol are unsafe //to pass through URLDecoded.decode - if (input.contains("+")) { + if (input.contains("+")) + { return input; } return URLDecoder.decode(input, StandardCharsets.UTF_8); - } catch (IllegalArgumentException e) { + } catch (IllegalArgumentException e) + { + _log.error("Unable to URL decode input string: " + input, e); + return input; } } - public String extractFieldName(String queryString) { + public String extractFieldName(String queryString) + { // Check if the query starts with any of the start patterns - for (String pattern : specialStartPatterns) { - if (queryString.startsWith(pattern)) { + for (String pattern : specialStartPatterns) + { + if (queryString.startsWith(pattern)) + { queryString = queryString.substring(pattern.length()).trim(); break; } @@ -163,152 +195,148 @@ public JSONObject doSearch(User u, String searchString, final int pageSize, fina File indexPath = _jsonFile.getExpectedLocationOfLuceneIndex(true); Map fields = JBrowseFieldUtils.getIndexedFields(_jsonFile, u, getContainer()); - // Open directory of lucene path, get a directory reader, and create the index search manager - try ( - Directory indexDirectory = FSDirectory.open(indexPath.toPath()); - IndexReader indexReader = DirectoryReader.open(indexDirectory); - Analyzer analyzer = new StandardAnalyzer(); - ) - { - IndexSearcher indexSearcher = new IndexSearcher(indexReader); - indexSearcher.setQueryCache(getCacheForSession(_jsonFile.getObjectId())); - indexSearcher.setQueryCachingPolicy(new ForceMatchAllDocsCachingPolicy()); + CacheEntry cacheEntry = 
getCacheEntryForSession(_jsonFile.getObjectId(), indexPath); + Analyzer analyzer = new StandardAnalyzer(); - List stringQueryParserFields = new ArrayList<>(); - Map numericQueryParserFields = new HashMap<>(); - PointsConfig intPointsConfig = new PointsConfig(new DecimalFormat(), Integer.class); - PointsConfig doublePointsConfig = new PointsConfig(new DecimalFormat(), Double.class); - Map pointsConfigMap = new HashMap<>(); + List stringQueryParserFields = new ArrayList<>(); + Map numericQueryParserFields = new HashMap<>(); + PointsConfig intPointsConfig = new PointsConfig(new DecimalFormat(), Integer.class); + PointsConfig doublePointsConfig = new PointsConfig(new DecimalFormat(), Double.class); + Map pointsConfigMap = new HashMap<>(); - // Iterate fields and split them into fields for the queryParser and the numericQueryParser - for (Map.Entry entry : fields.entrySet()) - { - String field = entry.getKey(); - JBrowseFieldDescriptor descriptor = entry.getValue(); + // Iterate fields and split them into fields for the queryParser and the numericQueryParser + for (Map.Entry entry : fields.entrySet()) + { + String field = entry.getKey(); + JBrowseFieldDescriptor descriptor = entry.getValue(); - switch(descriptor.getType()) - { - case Flag, String, Character -> stringQueryParserFields.add(field); - case Float -> { - numericQueryParserFields.put(field, SortField.Type.DOUBLE); - pointsConfigMap.put(field, doublePointsConfig); - } - case Integer -> { - numericQueryParserFields.put(field, SortField.Type.LONG); - pointsConfigMap.put(field, intPointsConfig); - } + switch(descriptor.getType()) + { + case Flag, String, Character -> stringQueryParserFields.add(field); + case Float -> { + numericQueryParserFields.put(field, SortField.Type.DOUBLE); + pointsConfigMap.put(field, doublePointsConfig); + } + case Integer -> { + numericQueryParserFields.put(field, SortField.Type.LONG); + pointsConfigMap.put(field, intPointsConfig); } } + } - // The numericQueryParser can perform range queries, but numeric fields can't be indexed alongside - // lexicographic fields, so they get split into a separate parser - MultiFieldQueryParser queryParser = new MultiFieldQueryParser(stringQueryParserFields.toArray(new String[0]), new StandardAnalyzer()); - queryParser.setAllowLeadingWildcard(true); + // The numericQueryParser can perform range queries, but numeric fields can't be indexed alongside + // lexicographic fields, so they get split into a separate parser + MultiFieldQueryParser queryParser = new MultiFieldQueryParser(stringQueryParserFields.toArray(new String[0]), new StandardAnalyzer()); + queryParser.setAllowLeadingWildcard(true); - StandardQueryParser numericQueryParser = new StandardQueryParser(); - numericQueryParser.setAnalyzer(analyzer); - numericQueryParser.setPointsConfigMap(pointsConfigMap); + StandardQueryParser numericQueryParser = new StandardQueryParser(); + numericQueryParser.setAnalyzer(analyzer); + numericQueryParser.setPointsConfigMap(pointsConfigMap); - BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder(); - if (searchString.equals(ALL_DOCS)) { - booleanQueryBuilder.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST); - } + if (searchString.equals(ALL_DOCS)) + { + booleanQueryBuilder.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST); + } - // Split input into tokens, 1 token per query separated by & - StringTokenizer tokenizer = new StringTokenizer(searchString, "&"); + // Split input into
tokens, 1 token per query separated by & + StringTokenizer tokenizer = new StringTokenizer(searchString, "&"); - while (tokenizer.hasMoreTokens() && !searchString.equals(ALL_DOCS)) - { - String queryString = tokenizer.nextToken(); - Query query = null; + while (tokenizer.hasMoreTokens() && !searchString.equals(ALL_DOCS)) + { + String queryString = tokenizer.nextToken(); + Query query = null; - String fieldName = extractFieldName(queryString); + String fieldName = extractFieldName(queryString); - if (VARIABLE_SAMPLES.equals(fieldName)) - { - queryString = templateReplace(queryString); - } + if (VARIABLE_SAMPLES.equals(fieldName)) + { + queryString = templateReplace(queryString); + } - if (stringQueryParserFields.contains(fieldName)) - { - query = queryParser.parse(queryString); - } - else if (numericQueryParserFields.containsKey(fieldName)) + if (stringQueryParserFields.contains(fieldName)) + { + query = queryParser.parse(queryString); + } + else if (numericQueryParserFields.containsKey(fieldName)) + { + try { - try - { - query = numericQueryParser.parse(queryString, ""); - } - catch (QueryNodeException e) - { - e.printStackTrace(); - } + query = numericQueryParser.parse(queryString, ""); } - else + catch (QueryNodeException e) { - throw new IllegalArgumentException("No such field(s), or malformed query: " + queryString + ", field: " + fieldName); + _log.error("Unable to parse query for field " + fieldName + ": " + queryString, e); + + throw new IllegalArgumentException("Unable to parse query: " + queryString + " for field: " + fieldName); } + } + else + { + _log.error("No such field(s), or malformed query: " + queryString + ", field: " + fieldName); - booleanQueryBuilder.add(query, BooleanClause.Occur.MUST); + throw new IllegalArgumentException("No such field(s), or malformed query: " + queryString + ", field: " + fieldName); } - BooleanQuery query = booleanQueryBuilder.build(); + booleanQueryBuilder.add(query, BooleanClause.Occur.MUST); + } - // By default, sort in INDEXORDER, which is by genomicPosition - Sort sort = Sort.INDEXORDER; + BooleanQuery query = booleanQueryBuilder.build(); - // If the sort field is not genomicPosition, use the provided sorting data - if (!sortField.equals(GENOMIC_POSITION)) { - SortField.Type fieldType; + // By default, sort in INDEXORDER, which is by genomicPosition + Sort sort = Sort.INDEXORDER; - if (stringQueryParserFields.contains(sortField)) { - fieldType = SortField.Type.STRING; - } else if (numericQueryParserFields.containsKey(sortField)) { - fieldType = numericQueryParserFields.get(sortField); - } else { - throw new IllegalArgumentException("Could not find type for sort field: " + sortField); - } + // If the sort field is not genomicPosition, use the provided sorting data + if (!sortField.equals(GENOMIC_POSITION)) { + SortField.Type fieldType; - sort = new Sort(new SortField(sortField + "_sort", fieldType, sortReverse)); + if (stringQueryParserFields.contains(sortField)) { + fieldType = SortField.Type.STRING; + } else if (numericQueryParserFields.containsKey(sortField)) { + fieldType = numericQueryParserFields.get(sortField); + } else { + throw new IllegalArgumentException("Could not find type for sort field: " + sortField); } - // Get chunks of size {pageSize}. Default to 1 chunk -- add to the offset to get more. - // We then iterate over the range of documents we want based on the offset. 
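// (Worked example of the paging arithmetic, assuming pageSize = 100 and offset = 2:
// the search call below fetches the top 100 * (2 + 1) = 300 sorted hits, and the
// result loop then returns hits 200 through 299 as the requested page.)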
This does grow in memory - // linearly with the number of documents, but my understanding is that these are just score,id pairs - // rather than full documents, so mem usage *should* still be pretty low. - // Perform the search with sorting - TopFieldDocs topDocs = indexSearcher.search(query, pageSize * (offset + 1), sort); + sort = new Sort(new SortField(sortField + "_sort", fieldType, sortReverse)); + } + + // Get chunks of size {pageSize}. Default to 1 chunk -- add to the offset to get more. + // We then iterate over the range of documents we want based on the offset. This does grow in memory + // linearly with the number of documents, but my understanding is that these are just score,id pairs + // rather than full documents, so mem usage *should* still be pretty low. + // Perform the search with sorting + TopFieldDocs topDocs = cacheEntry.indexSearcher.search(query, pageSize * (offset + 1), sort); - JSONObject results = new JSONObject(); + JSONObject results = new JSONObject(); - // Iterate over the doc list, (either to the total end or until the page ends) grab the requested docs, - // and add to returned results - List data = new ArrayList<>(); - for (int i = pageSize * offset; i < Math.min(pageSize * (offset + 1), topDocs.scoreDocs.length); i++) - { - JSONObject elem = new JSONObject(); - Document doc = indexSearcher.storedFields().document(topDocs.scoreDocs[i].doc); - - for (IndexableField field : doc.getFields()) { - String fieldName = field.name(); - String[] fieldValues = doc.getValues(fieldName); - if (fieldValues.length > 1) { - elem.put(fieldName, fieldValues); - } else { - elem.put(fieldName, fieldValues[0]); - } + // Iterate over the doc list, (either to the total end or until the page ends) grab the requested docs, + // and add to returned results + List data = new ArrayList<>(); + for (int i = pageSize * offset; i < Math.min(pageSize * (offset + 1), topDocs.scoreDocs.length); i++) + { + JSONObject elem = new JSONObject(); + Document doc = cacheEntry.indexSearcher.storedFields().document(topDocs.scoreDocs[i].doc); + + for (IndexableField field : doc.getFields()) { + String fieldName = field.name(); + String[] fieldValues = doc.getValues(fieldName); + if (fieldValues.length > 1) { + elem.put(fieldName, fieldValues); + } else { + elem.put(fieldName, fieldValues[0]); } - - data.add(elem); } - results.put("data", data); - results.put("totalHits", topDocs.totalHits.value); - - //TODO: we should probably stream this - return results; + data.add(elem); } + + results.put("data", data); + results.put("totalHits", topDocs.totalHits.value); + + //TODO: we should probably stream this + return results; } public static class DefaultJBrowseFieldCustomizer extends AbstractJBrowseFieldCustomizer @@ -382,6 +410,8 @@ public boolean shouldCache(Query query) throws IOException { return true; } } + } else if (query instanceof MatchAllDocsQuery) { + return true; } return defaultPolicy.shouldCache(query); @@ -393,12 +423,148 @@ public void onUse(Query query) { } } + public static class LuceneIndexCache implements Cache + { + private final Cache _cache; + + public LuceneIndexCache() + { + _cache = CacheManager.getStringKeyCache(1000, CacheManager.UNLIMITED, "JBrowseLuceneSearchCache"); + + } + + @Override + public void remove(@NotNull String key) + { + CacheEntry e = get(key); + _cache.remove(key); + + closeReader(e); + } + + @Override + public void clear() + { + for (String key : getKeys()) + { + closeReader(get(key)); + } + + _cache.clear(); + } + + @Override + public void close() + { + for 
(String key : getKeys()) + { + closeReader(get(key)); + } + + _cache.close(); + } + + private void closeReader(@Nullable CacheEntry entry) + { + if (entry == null) + { + return; + } + + try + { + entry.getIndexSearcher().getIndexReader().close(); + } + catch (IOException e) + { + _log.error("Error closing JBrowseLuceneSearch index reader", e); + } + } + + @Override + public void put(@NotNull String key, CacheEntry value) + { + _cache.put(key, value); + } + + @Override + public void put(@NotNull String key, CacheEntry value, long timeToLive) + { + _cache.put(key, value, timeToLive); + } + + @Override + public CacheEntry get(@NotNull String key) + { + return _cache.get(key); + } + + @Override + public CacheEntry get(@NotNull String key, @Nullable Object arg, CacheLoader loader) + { + return _cache.get(key, arg, loader); + } + + @Override + public int removeUsingFilter(Filter filter) + { + return _cache.removeUsingFilter(filter); + } + + @Override + public Set getKeys() + { + return _cache.getKeys(); + } + + @Override + public TrackingCache getTrackingCache() + { + return _cache.getTrackingCache(); + } + + @Override + public Cache createTemporaryCache() + { + return _cache.createTemporaryCache(); + } + } + + public static class CacheEntry + { + private final LRUQueryCache queryCache; + private final IndexSearcher indexSearcher; + private final File luceneIndexDir; + + public CacheEntry(LRUQueryCache queryCache, IndexSearcher indexSearcher, File luceneIndexDir) + { + this.queryCache = queryCache; + this.indexSearcher = indexSearcher; + this.luceneIndexDir = luceneIndexDir; + } + + public LRUQueryCache getQueryCache() + { + return queryCache; + } + + public IndexSearcher getIndexSearcher() + { + return indexSearcher; + } + + public File getLuceneIndexDir() + { + return luceneIndexDir; + } + } + public static JSONArray reportCacheInfo() { JSONArray cacheInfo = new JSONArray(); for (String sessionId : _cache.getKeys()) { - LRUQueryCache qc = _cache.get(sessionId); + LRUQueryCache qc = _cache.get(sessionId).getQueryCache(); JSONObject info = new JSONObject(); info.put("cacheSize", qc.getCacheSize()); info.put("cacheCount", qc.getCacheCount()); @@ -425,6 +591,23 @@ public void cacheDefaultQuery() } } + public static void emptyCache() + { + clearCache(null); + } + + public static void clearCacheForFile(@NotNull File luceneIndexDir) + { + for (String key : _cache.getKeys()) + { + CacheEntry entry = _cache.get(key); + if (entry != null && luceneIndexDir.equals(entry.getLuceneIndexDir())) + { + _cache.remove(key); + } + } + } + public static void clearCache(@Nullable String jbrowseTrackId) { if (jbrowseTrackId == null) @@ -436,4 +619,26 @@ public static void clearCache(@Nullable String jbrowseTrackId) _cache.remove(jbrowseTrackId); } } + + public static class ShutdownHandler implements ShutdownListener + { + @Override + public String getName() + { + return "JBrowse-Lucene Shutdown Listener"; + } + + @Override + public void shutdownPre() + { + + } + + @Override + public void shutdownStarted() + { + _log.info("Clearing all open JBrowse/Lucene cached readers"); + JBrowseLuceneSearch.emptyCache(); + } + } } diff --git a/jbrowse/src/org/labkey/jbrowse/JBrowseModule.java b/jbrowse/src/org/labkey/jbrowse/JBrowseModule.java index 7221529af..d59366531 100644 --- a/jbrowse/src/org/labkey/jbrowse/JBrowseModule.java +++ b/jbrowse/src/org/labkey/jbrowse/JBrowseModule.java @@ -30,6 +30,7 @@ import org.labkey.api.sequenceanalysis.SequenceAnalysisService; import 
org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; import org.labkey.api.settings.AdminConsole; +import org.labkey.api.util.ContextListener; import org.labkey.api.util.PageFlowUtil; import org.labkey.api.util.SystemMaintenance; import org.labkey.api.view.WebPartFactory; @@ -110,6 +111,8 @@ public void doStartupAfterSpringConfig(ModuleContext moduleContext) JBrowseService.get().registerFieldCustomizer(new JBrowseLuceneSearch.DefaultJBrowseFieldCustomizer()); JBrowseService.get().registerGroupsProvider(new JBrowseLuceneSearch.TestJBrowseGroupProvider()); + + ContextListener.addShutdownListener(new JBrowseLuceneSearch.ShutdownHandler()); } public static void registerPipelineSteps() diff --git a/jbrowse/src/org/labkey/jbrowse/JBrowseServiceImpl.java b/jbrowse/src/org/labkey/jbrowse/JBrowseServiceImpl.java index daeb91419..6ad23749a 100644 --- a/jbrowse/src/org/labkey/jbrowse/JBrowseServiceImpl.java +++ b/jbrowse/src/org/labkey/jbrowse/JBrowseServiceImpl.java @@ -319,6 +319,12 @@ public void cacheDefaultQuery(User u, String sessionId, String trackId) luceneSearch.cacheDefaultQuery(); } + @Override + public void clearLuceneCacheEntry(File luceneIndexDir) + { + JBrowseLuceneSearch.clearCacheForFile(luceneIndexDir); + } + public static final class DefaultLuceneIndexDetector implements LuceneIndexDetector { @Override