diff --git a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java
index e18b9799e..2e3e84e2c 100644
--- a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java
+++ b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractCommandWrapper.java
@@ -289,7 +289,7 @@ protected boolean isThrowNonZeroExits()
         return _throwNonZeroExits;
     }
 
-    protected static File resolveFileInPath(String exe, @Nullable String packageName, boolean throwIfNotFound)
+    public static File resolveFileInPath(String exe, @Nullable String packageName, boolean throwIfNotFound)
     {
         File fn;
         String path;
diff --git a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractGatk4Wrapper.java b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractGatk4Wrapper.java
index 334fcf88b..91959921b 100644
--- a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractGatk4Wrapper.java
+++ b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/AbstractGatk4Wrapper.java
@@ -90,6 +90,7 @@ public List<String> getBaseArgs(@Nullable String toolName)
         List<String> args = new ArrayList<>();
         args.add(SequencePipelineService.get().getJavaFilepath());
         args.addAll(SequencePipelineService.get().getJavaOpts(_maxRamOverride));
+        args.add("-DGATK_STACKTRACE_ON_USER_EXCEPTION=true");
         args.add("-jar");
         args.add(getJAR().getPath());
 
@@ -98,6 +99,8 @@
             args.add(toolName);
         }
+
+        return args;
     }
diff --git a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/DockerWrapper.java b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/DockerWrapper.java
new file mode 100644
index 000000000..245274230
--- /dev/null
+++ b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/run/DockerWrapper.java
@@ -0,0 +1,110 @@
+package org.labkey.api.sequenceanalysis.run;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.logging.log4j.Logger;
+import org.labkey.api.pipeline.PipelineJobException;
+import org.labkey.api.sequenceanalysis.pipeline.PipelineOutputTracker;
+import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
+import org.labkey.api.writer.PrintWriters;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Arrays;
+import java.util.List;
+
+public class DockerWrapper extends AbstractCommandWrapper
+{
+    private final String _containerName;
+    private File _tmpDir = null;
+
+    public DockerWrapper(String containerName, Logger log)
+    {
+        super(log);
+        _containerName = containerName;
+    }
+
+    public void setTmpDir(File tmpDir)
+    {
+        _tmpDir = tmpDir;
+    }
+
+    public void executeWithDocker(List<String> containerArgs, File workDir, PipelineOutputTracker tracker) throws PipelineJobException
+    {
+        File localBashScript = new File(workDir, "docker.sh");
+        File dockerBashScript = new File(workDir, "dockerRun.sh");
+        tracker.addIntermediateFile(localBashScript);
+        tracker.addIntermediateFile(dockerBashScript);
+
+        setWorkingDir(workDir);
+        try (PrintWriter writer = PrintWriters.getPrintWriter(localBashScript); PrintWriter dockerWriter = PrintWriters.getPrintWriter(dockerBashScript))
+        {
+            writer.println("#!/bin/bash");
+            writer.println("set -x");
+            writer.println("WD=`pwd`");
+            writer.println("HOME=`echo ~/`");
+            writer.println("DOCKER='" + SequencePipelineService.get().getDockerCommand() + "'");
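+            // The outer script pulls the image, mounts the job's working directory as /work (and the home directory as /homeDir), runs the inner script inside the container, and propagates the container's exit code.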
+            writer.println("sudo $DOCKER pull " + _containerName);
+            writer.println("sudo $DOCKER run --rm=true \\");
+            writer.println("\t-v \"${WD}:/work\" \\");
+            writer.println("\t-v \"${HOME}:/homeDir\" \\");
+            if (_tmpDir != null)
+            {
+                writer.println("\t-v \"" + _tmpDir.getPath() + ":/tmp\" \\");
+            }
+            writer.println("\t--entrypoint /bin/bash \\");
+            writer.println("\t-w /work \\");
+            Integer maxRam = SequencePipelineService.get().getMaxRam();
+            if (maxRam != null)
+            {
+                writer.println("\t-e SEQUENCEANALYSIS_MAX_RAM=" + maxRam + " \\");
+                writer.println("\t--memory='" + maxRam + "g' \\");
+            }
+            writer.println("\t" + _containerName + " \\");
+            writer.println("\t/work/" + dockerBashScript.getName());
+            writer.println("EXIT_CODE=$?");
+            writer.println("echo 'Docker run exit code: '$EXIT_CODE");
+            writer.println("exit $EXIT_CODE");
+
+            dockerWriter.println("#!/bin/bash");
+            dockerWriter.println("set -x");
+            dockerWriter.println(StringUtils.join(containerArgs, " "));
+            dockerWriter.println("EXIT_CODE=$?");
+            dockerWriter.println("echo 'Exit code: '$EXIT_CODE");
+            dockerWriter.println("exit $EXIT_CODE");
+        }
+        catch (IOException e)
+        {
+            throw new PipelineJobException(e);
+        }
+
+        execute(Arrays.asList("/bin/bash", localBashScript.getPath()));
+    }
+
+    public File ensureLocalCopy(File input, File workingDirectory, PipelineOutputTracker output) throws PipelineJobException
+    {
+        try
+        {
+            if (workingDirectory.equals(input.getParentFile()))
+            {
+                return input;
+            }
+
+            File local = new File(workingDirectory, input.getName());
+            if (!local.exists())
+            {
+                getLogger().debug("Copying file locally: " + input.getPath());
+                FileUtils.copyFile(input, local);
+            }
+
+            output.addIntermediateFile(local);
+
+            return local;
+        }
+        catch (IOException e)
+        {
+            throw new PipelineJobException(e);
+        }
+    }
+}
diff --git a/SequenceAnalysis/pipeline_code/sequence_tools_install.sh b/SequenceAnalysis/pipeline_code/sequence_tools_install.sh
index feef079c7..9333860c7 100755
--- a/SequenceAnalysis/pipeline_code/sequence_tools_install.sh
+++ b/SequenceAnalysis/pipeline_code/sequence_tools_install.sh
@@ -335,30 +335,6 @@ else
 fi
 
-#
-# BisSNP
-#
-echo ""
-echo ""
-echo "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
-echo "Install BisSNP"
-echo ""
-cd $LKSRC_DIR
-
-if [[ ! -e ${LKTOOLS_DIR}/BisSNP.jar || ! -z $FORCE_REINSTALL ]];
-then
-    echo "Cleaning up previous installs"
-    rm -Rf BisSNP*
-    rm -Rf $LKTOOLS_DIR/BisSNP.jar
-
-    wget $WGET_OPTS https://downloads.sourceforge.net/project/bissnp/BisSNP-0.82.2/BisSNP-0.82.2.jar
-
-    install ./BisSNP-0.82.2.jar $LKTOOLS_DIR/BisSNP.jar
-else
-    echo "Already installed"
-fi
-
-
 #
 #mosaik
 #
@@ -510,10 +486,10 @@ then
     rm -Rf bcftools*
     rm -Rf $LKTOOLS_DIR/bcftools
 
-    wget $WGET_OPTS https://github.com/samtools/bcftools/releases/download/1.18/bcftools-1.18.tar.bz2
-    tar xjvf bcftools-1.18.tar.bz2
-    chmod 755 bcftools-1.18
-    cd bcftools-1.18
+    wget $WGET_OPTS https://github.com/samtools/bcftools/releases/download/1.20/bcftools-1.20.tar.bz2
+    tar xjvf bcftools-1.20.tar.bz2
+    chmod 755 bcftools-1.20
+    cd bcftools-1.20
 
     rm -f plugins/liftover.c
     wget $WGET_OPTS -P plugins https://raw.githubusercontent.com/freeseek/score/master/liftover.c
diff --git a/SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/.qview.xml b/SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/.qview.xml
new file mode 100644
index 000000000..6ad9c3493
--- /dev/null
+++ b/SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/.qview.xml
@@ -0,0 +1,16 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/With Filepath.qview.xml b/SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/With Filepath.qview.xml
new file mode 100644
index 000000000..d7d962cf2
--- /dev/null
+++ b/SequenceAnalysis/resources/queries/sequenceanalysis/chain_files/With Filepath.qview.xml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js b/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js
index 290b58322..15f6a5a6c 100644
--- a/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js
+++ b/SequenceAnalysis/resources/web/SequenceAnalysis/window/LiftoverWindow.js
@@ -104,7 +104,7 @@ Ext4.define('SequenceAnalysis.window.LiftoverWindow', {
             maxValue: 1.0,
             value: 0.95,
             fieldLabel: 'Min Percent Match',
-            helpPopup: 'In order to lift to the target genome, the feature must have at least this percent match. Lower this value to be more permissive; however, this risks incorrect liftovers',
+            helpPopup: 'In order to lift to the target genome, the feature must have at least this percent match. Lower this value to be more permissive; however, this risks incorrect liftovers. This is ignored if using bcftools.',
             itemId: 'pctField'
         },{
             xtype: 'checkbox',
@@ -112,6 +112,16 @@
             checked: false,
             helpPopup: 'If checked, no genotypes will be written to the output file (applies to VCFs only). This can be useful (and necessary) when lifting VCFs with an extremely large number of samples.',
             fieldLabel: 'Drop Genotypes'
+        },{
+            xtype: 'checkbox',
+            itemId: 'useBcfTools',
+            checked: false,
+            fieldLabel: 'Use bcftools'
+        },{
+            xtype: 'checkbox',
+            itemId: 'doNotRetainUnmapped',
+            checked: false,
+            fieldLabel: 'Do Not Retain Unmapped'
         }].concat(SequenceAnalysis.window.OutputHandlerWindow.getCfgForToolParameters(this.toolParameters)),
         buttons: [{
             text: 'Submit',
@@ -152,6 +162,14 @@
             params.dropGenotypes = this.down('#dropGenotypes').getValue();
         }
 
+        if (this.down('#useBcfTools').getValue()){
+            params.useBcfTools = this.down('#useBcfTools').getValue();
+        }
+
+        if (this.down('#doNotRetainUnmapped').getValue()){
+            params.doNotRetainUnmapped = this.down('#doNotRetainUnmapped').getValue();
+        }
+
         Ext4.Msg.wait('Saving...');
         LABKEY.Ajax.request({
             url: LABKEY.ActionURL.buildURL('sequenceanalysis', 'runSequenceHandler', this.containerPath),
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java
index a713eb1c4..ac5215794 100644
--- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java
@@ -59,6 +59,7 @@ import org.labkey.sequenceanalysis.analysis.RnaSeqcHandler;
 import org.labkey.sequenceanalysis.analysis.SbtGeneCountHandler;
 import org.labkey.sequenceanalysis.analysis.UnmappedSequenceBasedGenotypeHandler;
+import org.labkey.sequenceanalysis.analysis.UpdateReadsetFilesHandler;
 import org.labkey.sequenceanalysis.button.AddSraRunButton;
 import org.labkey.sequenceanalysis.button.ArchiveReadsetsButton;
 import org.labkey.sequenceanalysis.button.ChangeReadsetStatusButton;
@@ -77,6 +78,7 @@ import org.labkey.sequenceanalysis.run.alignment.BowtieWrapper;
 import org.labkey.sequenceanalysis.run.alignment.GSnapWrapper;
 import org.labkey.sequenceanalysis.run.alignment.MosaikWrapper;
+import org.labkey.sequenceanalysis.run.alignment.ParagraphStep;
 import org.labkey.sequenceanalysis.run.alignment.Pbmm2Wrapper;
 import org.labkey.sequenceanalysis.run.alignment.StarWrapper;
 import org.labkey.sequenceanalysis.run.alignment.VulcanWrapper;
@@ -113,6 +115,7 @@ import org.labkey.sequenceanalysis.run.util.FastqcRunner;
 import org.labkey.sequenceanalysis.run.util.GenomicsDBAppendHandler;
 import org.labkey.sequenceanalysis.run.util.GenomicsDBImportHandler;
+import org.labkey.sequenceanalysis.run.util.SVAnnotateStep;
 import org.labkey.sequenceanalysis.run.variant.*;
 import org.labkey.sequenceanalysis.util.Barcoder;
 import org.labkey.sequenceanalysis.util.ChainFileValidator;
@@ -299,6 +302,7 @@ public static void registerPipelineSteps()
         SequencePipelineService.get().registerPipelineStep(new MendelianViolationReportStep.Provider());
         SequencePipelineService.get().registerPipelineStep(new SummarizeGenotypeQualityStep.Provider());
         SequencePipelineService.get().registerPipelineStep(new BcftoolsFillTagsStep.Provider());
+        SequencePipelineService.get().registerPipelineStep(new SVAnnotateStep.Provider());
 
         //handlers
         SequenceAnalysisService.get().registerFileHandler(new LiftoverHandler());
@@ -333,6 +337,8 @@ public static void registerPipelineSteps()
         SequenceAnalysisService.get().registerFileHandler(new PbsvJointCallingHandler());
         SequenceAnalysisService.get().registerFileHandler(new DeepVariantHandler());
         SequenceAnalysisService.get().registerFileHandler(new GLNexusHandler());
+        SequenceAnalysisService.get().registerFileHandler(new ParagraphStep());
+        SequenceAnalysisService.get().registerFileHandler(new UpdateReadsetFilesHandler());
 
         SequenceAnalysisService.get().registerReadsetHandler(new MultiQCHandler());
         SequenceAnalysisService.get().registerReadsetHandler(new RestoreSraDataHandler());
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java
index f7af4b587..dd63cc081 100644
--- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/LiftoverHandler.java
@@ -28,6 +28,8 @@ import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
 import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport;
 import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
+import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
+import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep;
 import org.labkey.api.sequenceanalysis.run.SelectVariantsWrapper;
 import org.labkey.api.util.FileType;
 import org.labkey.api.util.FileUtil;
@@ -35,6 +37,7 @@ import org.labkey.api.writer.PrintWriters;
 import org.labkey.sequenceanalysis.SequenceAnalysisModule;
 import org.labkey.sequenceanalysis.pipeline.ProcessVariantsHandler;
+import org.labkey.sequenceanalysis.run.util.LiftoverBcfToolsWrapper;
 import org.labkey.sequenceanalysis.run.util.LiftoverVcfWrapper;
 import org.labkey.sequenceanalysis.util.SequenceUtil;
 
@@ -49,7 +52,7 @@
 /**
  * Created by bimber on 8/26/2014.
  */
-public class LiftoverHandler implements SequenceOutputHandler<SequenceOutputHandler.SequenceOutputProcessor>
+public class LiftoverHandler implements SequenceOutputHandler<SequenceOutputHandler.SequenceOutputProcessor>, VariantProcessingStep.SupportsScatterGather
 {
     private final FileType _bedFileType = new FileType(".bed", false);
     //private FileType _gffFileType = new FileType("gff", false);
@@ -60,6 +63,12 @@ public LiftoverHandler()
     {
     }
 
+    @Override
+    public boolean doSortAfterMerge()
+    {
+        return true;
+    }
+
     @Override
     public String getName()
     {
@@ -167,8 +176,10 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
         JSONObject params = ctx.getParams();
 
         boolean dropGenotypes = params.optBoolean("dropGenotypes", false);
+        boolean useBcfTools = params.optBoolean("useBcfTools", false);
+        boolean doNotRetainUnmapped = params.optBoolean("doNotRetainUnmapped", false);
 
-        Integer chainFileId = params.getInt("chainFileId");
+        int chainFileId = params.getInt("chainFileId");
         File chainFile = ctx.getSequenceSupport().getCachedData(chainFileId);
         int targetGenomeId = params.getInt("targetGenomeId");
@@ -205,7 +216,7 @@ else if (_vcfFileType.isType(f.getFile()))
         }
 
         File lifted = new File(outDir, baseName + ".lifted-" + targetGenomeId + ext);
-        File unmappedOutput = new File(outDir, baseName + ".unmapped-" + targetGenomeId + ext);
+        File unmappedOutput = doNotRetainUnmapped ? null : new File(outDir, baseName + ".unmapped-" + targetGenomeId + ext);
 
         try
         {
@@ -217,7 +228,7 @@ else if (_vcfFileType.isType(f.getFile()))
             {
                 ReferenceGenome targetGenome = ctx.getSequenceSupport().getCachedGenome(targetGenomeId);
                 ReferenceGenome sourceGenome = ctx.getSequenceSupport().getCachedGenome(f.getLibrary_id());
-                liftOverVcf(ctx, targetGenome, sourceGenome, chainFile, f.getFile(), lifted, unmappedOutput, job, pct, dropGenotypes);
+                liftOverVcf(ctx, targetGenome, sourceGenome, chainFile, f.getFile(), lifted, unmappedOutput, job, pct, dropGenotypes, useBcfTools);
             }
         }
         catch (Exception e)
@@ -251,7 +262,11 @@ else if (_vcfFileType.isType(f.getFile()))
             ctx.addSequenceOutput(so1);
         }
 
-        if (!unmappedOutput.exists())
+        if (unmappedOutput == null)
+        {
+            // skip
+        }
+        else if (!unmappedOutput.exists())
         {
             job.getLogger().info("no unmapped intervals");
         }
@@ -293,7 +308,7 @@ else if (!SequenceUtil.hasLineCount(unmappedOutput))
         }
     }
 
-    public void liftOverVcf(JobContext ctx, ReferenceGenome targetGenome, ReferenceGenome sourceGenome, File chain, File input, File output, @Nullable File unmappedOutput, PipelineJob job, double pct, boolean dropGenotypes) throws IOException, PipelineJobException
+    public void liftOverVcf(JobContext ctx, ReferenceGenome targetGenome, ReferenceGenome sourceGenome, File chain, File input, File output, @Nullable File unmappedOutput, PipelineJob job, double pct, boolean dropGenotypes, boolean useBcfTools) throws IOException, PipelineJobException
     {
         File currentVCF = input;
         if (dropGenotypes)
@@ -315,8 +330,18 @@ public void liftOverVcf(JobContext ctx, ReferenceGenome targetGenome, ReferenceG
             ctx.getFileManager().addIntermediateFile(new File(outputFile.getPath() + ".tbi"));
         }
 
-        LiftoverVcfWrapper wrapper = new LiftoverVcfWrapper(job.getLogger());
-        wrapper.doLiftover(currentVCF, chain, targetGenome.getWorkingFastaFile(), unmappedOutput, output, pct);
+        if (useBcfTools)
+        {
+            LiftoverBcfToolsWrapper wrapper = new LiftoverBcfToolsWrapper(job.getLogger());
+            wrapper.doLiftover(currentVCF, chain, sourceGenome.getWorkingFastaFile(), targetGenome.getWorkingFastaFile(), unmappedOutput, output);
+
+            SequencePipelineService.get().sortVcf(output, null, targetGenome.getSequenceDictionary(), ctx.getLogger());
+        }
+        else
+        {
+            LiftoverVcfWrapper wrapper = new LiftoverVcfWrapper(job.getLogger());
+            wrapper.doLiftover(currentVCF, chain, targetGenome.getWorkingFastaFile(), unmappedOutput, output, pct);
+        }
 
         Long mapped = null;
         if (output.exists())
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/UpdateReadsetFilesHandler.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/UpdateReadsetFilesHandler.java
new file mode 100644
index 000000000..9fc12cc93
--- /dev/null
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/UpdateReadsetFilesHandler.java
@@ -0,0 +1,315 @@
+package org.labkey.sequenceanalysis.analysis;
+
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMFileWriter;
+import htsjdk.samtools.SAMFileWriterFactory;
+import htsjdk.samtools.SAMReadGroupRecord;
+import htsjdk.samtools.SamReader;
+import htsjdk.samtools.SamReaderFactory;
+import htsjdk.samtools.util.FileExtensions;
+import htsjdk.variant.vcf.VCFFileReader;
+import htsjdk.variant.vcf.VCFHeader;
+import htsjdk.variant.vcf.VCFReader;
+import org.apache.commons.io.FileUtils;
+import org.apache.logging.log4j.Logger;
+import org.json.JSONObject;
+import org.labkey.api.module.ModuleLoader;
+import org.labkey.api.pipeline.PipelineJob;
+import org.labkey.api.pipeline.PipelineJobException;
+import org.labkey.api.pipeline.RecordedAction;
+import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
+import org.labkey.api.sequenceanalysis.SequenceOutputFile;
+import org.labkey.api.sequenceanalysis.model.Readset;
+import org.labkey.api.sequenceanalysis.pipeline.AbstractParameterizedOutputHandler;
+import org.labkey.api.sequenceanalysis.pipeline.BcftoolsRunner;
+import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport;
+import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
+import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
+import org.labkey.api.sequenceanalysis.run.PicardWrapper;
+import org.labkey.api.util.FileType;
+import org.labkey.api.writer.PrintWriters;
+import org.labkey.sequenceanalysis.SequenceAnalysisModule;
+import org.labkey.sequenceanalysis.util.SequenceUtil;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.file.StandardCopyOption;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class UpdateReadsetFilesHandler extends AbstractParameterizedOutputHandler<SequenceOutputHandler.SequenceOutputProcessor>
+{
+    public UpdateReadsetFilesHandler()
+    {
+        super(ModuleLoader.getInstance().getModule(SequenceAnalysisModule.class), "Update Sample/Header Information", "This will re-header any BAM or gVCF files using the sample name from the source readset. All inputs must be single-sample and have a readset attached to the record", null, List.of(
+
+        ));
+    }
+
+    @Override
+    public boolean doRunRemote()
+    {
+        return true;
+    }
+
+    @Override
+    public boolean doRunLocal()
+    {
+        return false;
+    }
+
+    @Override
+    public boolean canProcess(SequenceOutputFile f)
+    {
+        return f.getFile() != null && (
+                SequenceUtil.FILETYPE.gvcf.getFileType().isType(f.getFile()) ||
+                SequenceUtil.FILETYPE.bamOrCram.getFileType().isType(f.getFile())
+        );
+    }
+
+    @Override
+    public boolean doSplitJobs()
+    {
+        return true;
+    }
+
+    @Override
+    public SequenceOutputProcessor getProcessor()
+    {
+        return new Processor();
+    }
+
+    public static class Processor implements SequenceOutputProcessor
+    {
+        @Override
+        public void init(JobContext ctx, List<SequenceOutputFile> inputFiles, List<RecordedAction> actions, List<SequenceOutputFile> outputsToCreate) throws UnsupportedOperationException, PipelineJobException
+        {
+            if (inputFiles.size() > 1)
+            {
+                throw new PipelineJobException("This job expects a single input file!");
+            }
+
+            SequenceOutputFile so = inputFiles.get(0);
+            if (so.getReadset() == null)
+            {
+                throw new PipelineJobException("All inputs must have a readset, missing: " + so.getRowid());
+            }
+
+            Readset rs = SequenceAnalysisService.get().getReadset(so.getReadset(), ctx.getJob().getUser());
+            String newRsName = SequenceUtil.getLegalReadGroupName(rs.getName());
+
+            if (SequenceUtil.FILETYPE.bamOrCram.getFileType().isType(so.getFile()))
+            {
+                getAndValidateHeaderForBam(so, newRsName);
+            }
+            else if (SequenceUtil.FILETYPE.gvcf.getFileType().isType(so.getFile()) || SequenceUtil.FILETYPE.vcf.getFileType().isType(so.getFile()))
+            {
+                getAndValidateHeaderForVcf(so, newRsName);
+            }
+
+            ctx.getSequenceSupport().cacheObject("readsetId", newRsName);
+        }
+
+        private SAMFileHeader getAndValidateHeaderForBam(SequenceOutputFile so, String newRsName) throws PipelineJobException
+        {
+            SamReaderFactory samReaderFactory = SamReaderFactory.makeDefault();
+            try (SamReader reader = samReaderFactory.open(so.getFile()))
+            {
+                SAMFileHeader header = reader.getFileHeader().clone();
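+                // Re-headering is only supported for single-sample files; a file whose read group sample already matches the readset name is rejected up-front.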
+                int nSamples = reader.getFileHeader().getReadGroups().size();
+                if (nSamples != 1)
+                {
+                    throw new PipelineJobException("File has more than one read group, found: " + nSamples);
+                }
+
+                List<SAMReadGroupRecord> rgs = header.getReadGroups();
+                String existingSample = rgs.get(0).getSample();
+                if (existingSample.equals(newRsName))
+                {
+                    throw new PipelineJobException("Sample names match, aborting");
+                }
+
+                return header;
+            }
+            catch (IOException e)
+            {
+                throw new PipelineJobException(e);
+            }
+        }
+
+        private VCFHeader getAndValidateHeaderForVcf(SequenceOutputFile so, String newRsName) throws PipelineJobException
+        {
+            try (VCFReader reader = new VCFFileReader(so.getFile()))
+            {
+                VCFHeader header = reader.getHeader();
+                int nSamples = header.getGenotypeSamples().size();
+                if (nSamples != 1)
+                {
+                    throw new PipelineJobException("File has more than one sample, found: " + nSamples);
+                }
+
+                String existingSample = header.getGenotypeSamples().get(0);
+                if (existingSample.equals(newRsName))
+                {
+                    throw new PipelineJobException("Sample names match, aborting");
+                }
+
+                return header;
+            }
+            catch (IOException e)
+            {
+                throw new PipelineJobException(e);
+            }
+        }
+
+        @Override
+        public void processFilesOnWebserver(PipelineJob job, SequenceAnalysisJobSupport support, List<SequenceOutputFile> inputFiles, JSONObject params, File outputDir, List<RecordedAction> actions, List<SequenceOutputFile> outputsToCreate) throws UnsupportedOperationException, PipelineJobException
+        {
+
+        }
+
+        @Override
+        public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext ctx) throws UnsupportedOperationException, PipelineJobException
+        {
+            String newRsName = ctx.getSequenceSupport().getCachedObject("readsetId", String.class);
+            if (newRsName == null)
+            {
+                throw new PipelineJobException("Missing cached readsetId");
+            }
+
+            SequenceOutputFile so = inputFiles.get(0);
+            if (SequenceUtil.FILETYPE.bamOrCram.getFileType().isType(so.getFile()))
+            {
+                reheaderBamOrCram(so, ctx, newRsName);
+            }
+            else if (SequenceUtil.FILETYPE.gvcf.getFileType().isType(so.getFile()) || SequenceUtil.FILETYPE.vcf.getFileType().isType(so.getFile()))
+            {
+                reheaderVcf(so, ctx, newRsName);
+            }
+        }
+
+        private void reheaderVcf(SequenceOutputFile so, JobContext ctx, String newRsName) throws PipelineJobException
+        {
+            VCFHeader header = getAndValidateHeaderForVcf(so, newRsName);
+            String existingSample = header.getGenotypeSamples().get(0);
+
+            File sampleNamesFile = new File(ctx.getWorkingDirectory(), "sampleNames.txt");
+            try (PrintWriter writer = PrintWriters.getPrintWriter(sampleNamesFile, StandardOpenOption.APPEND))
+            {
+                writer.println(newRsName);
+            }
+            catch (IOException e)
+            {
+                throw new PipelineJobException(e);
+            }
+            ctx.getFileManager().addIntermediateFile(sampleNamesFile);
+
+            File outputVcf = new File(ctx.getWorkingDirectory(), so.getFile().getName());
+
+            BcftoolsRunner wrapper = new BcftoolsRunner(ctx.getLogger());
+            wrapper.execute(Arrays.asList(BcftoolsRunner.getBcfToolsPath().getPath(), "reheader", "-s", sampleNamesFile.getPath(), "-o", outputVcf.getPath(), so.getFile().getPath()));
+
+            try
+            {
+                File outputIdx = SequenceAnalysisService.get().ensureVcfIndex(outputVcf, ctx.getLogger(), false);
+                FileUtils.moveFile(outputVcf, so.getFile(), StandardCopyOption.REPLACE_EXISTING);
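+                // Swap the freshly built index in next to the original file: bgzipped VCFs expect a .tbi, plain VCFs a Tribble .idx.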
+                FileType gz = new FileType(".gz");
+                File inputIndex = gz.isType(so.getFile()) ? new File(so.getFile().getPath() + ".tbi") : new File(so.getFile().getPath() + FileExtensions.TRIBBLE_INDEX);
+                FileUtils.moveFile(outputIdx, inputIndex, StandardCopyOption.REPLACE_EXISTING);
+
+                addTracker(so, existingSample, newRsName);
+            }
+            catch (IOException e)
+            {
+                throw new PipelineJobException(e);
+            }
+        }
+
+        private void addTracker(SequenceOutputFile so, String existingSample, String newRsName) throws IOException
+        {
+            File tracker = new File(so.getFile().getParentFile(), "reheaderHistory.txt");
+            boolean preExisting = tracker.exists();
+            try (PrintWriter writer = PrintWriters.getPrintWriter(tracker, StandardOpenOption.APPEND))
+            {
+                if (!preExisting)
+                {
+                    writer.println("OriginalSample\tNewSample");
+                }
+
+                writer.println(existingSample + "\t" + newRsName);
+            }
+        }
+
+        private void reheaderBamOrCram(SequenceOutputFile so, JobContext ctx, String newRsName) throws PipelineJobException
+        {
+            try
+            {
+                SAMFileHeader header = getAndValidateHeaderForBam(so, newRsName);
+
+                List<SAMReadGroupRecord> rgs = header.getReadGroups();
+                String existingSample = rgs.get(0).getSample();
+                rgs.get(0).setSample(newRsName);
+
+                File headerBam = new File(ctx.getWorkingDirectory(), "header.bam");
+                try (SAMFileWriter writer = new SAMFileWriterFactory().makeBAMWriter(header, false, headerBam))
+                {
+
+                }
+                ctx.getFileManager().addIntermediateFile(headerBam);
+                ctx.getFileManager().addIntermediateFile(SequencePipelineService.get().getExpectedIndex(headerBam));
+
+                File output = new File(ctx.getWorkingDirectory(), so.getFile().getName());
+                new ReplaceSamHeaderWrapper(ctx.getLogger()).execute(so.getFile(), output, headerBam);
+                if (!output.exists())
+                {
+                    throw new PipelineJobException("Missing file: " + output.getPath());
+                }
+
+                File outputIdx = SequencePipelineService.get().ensureBamIndex(output, ctx.getLogger(), false);
+
+                FileUtils.moveFile(output, so.getFile(), StandardCopyOption.REPLACE_EXISTING);
+                FileUtils.moveFile(outputIdx, SequenceAnalysisService.get().getExpectedBamOrCramIndex(so.getFile()), StandardCopyOption.REPLACE_EXISTING);
+
+                addTracker(so, existingSample, newRsName);
+            }
+            catch (IOException e)
+            {
+                throw new PipelineJobException(e);
+            }
+        }
+
+        private static class ReplaceSamHeaderWrapper extends PicardWrapper
+        {
+            public ReplaceSamHeaderWrapper(Logger log)
+            {
+                super(log);
+            }
+
+            @Override
+            protected String getToolName()
+            {
+                return "ReplaceSamHeader";
+            }
+
+            public void execute(File input, File output, File headerBam) throws PipelineJobException
+            {
+                List<String> params = new ArrayList<>(getBaseArgs());
+
+                params.add("--INPUT");
+                params.add(input.getPath());
+
+                params.add("--OUTPUT");
+                params.add(output.getPath());
+
+                params.add("--HEADER");
+                params.add(headerBam.getPath());
+
+                execute(params);
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/BWAMemWrapper.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/BWAMemWrapper.java
index 05a702793..883b5b6fb 100644
--- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/BWAMemWrapper.java
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/BWAMemWrapper.java
@@ -17,6 +17,7 @@ import org.labkey.api.sequenceanalysis.pipeline.SamtoolsRunner;
 import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
 import org.labkey.api.util.FileUtil;
+import org.labkey.sequenceanalysis.util.SequenceUtil;
 
 import java.io.File;
 import java.util.ArrayList;
@@ -61,7 +62,7 @@ protected void doPerformAlignment(AlignmentOutputImpl output, File inputFastq1,
             rg.add("LB:" + rs.getReadsetId().toString());
             rg.add("PL:" + (rs.getPlatform() == null ? "ILLUMINA" : rs.getPlatform()));
             rg.add("PU:" + (platformUnit == null ? rs.getReadsetId().toString() : platformUnit));
-            rg.add("SM:" + rs.getName().replaceAll(" ", "_"));
+            rg.add("SM:" + SequenceUtil.getLegalReadGroupName(rs));
 
             extraArgs.add("'" + StringUtils.join(rg, "\\t") + "'");
 
             getWrapper().performMemAlignment(getPipelineCtx().getJob(), output, inputFastq1, inputFastq2, outputDirectory, referenceGenome, basename, extraArgs);
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java
new file mode 100644
index 000000000..07636dfef
--- /dev/null
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/ParagraphStep.java
@@ -0,0 +1,224 @@
+package org.labkey.sequenceanalysis.run.alignment;
+
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SamReader;
+import htsjdk.samtools.SamReaderFactory;
+import org.apache.commons.io.FileUtils;
+import org.json.JSONObject;
+import org.labkey.api.module.ModuleLoader;
+import org.labkey.api.pipeline.PipelineJob;
+import org.labkey.api.pipeline.PipelineJobException;
+import org.labkey.api.pipeline.RecordedAction;
+import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
+import org.labkey.api.sequenceanalysis.SequenceOutputFile;
+import org.labkey.api.sequenceanalysis.pipeline.AbstractParameterizedOutputHandler;
+import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport;
+import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
+import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
+import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
+import org.labkey.api.sequenceanalysis.run.DockerWrapper;
+import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper;
+import org.labkey.api.util.FileUtil;
+import org.labkey.api.writer.PrintWriters;
+import org.labkey.sequenceanalysis.SequenceAnalysisModule;
+import org.labkey.sequenceanalysis.util.SequenceUtil;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class ParagraphStep extends AbstractParameterizedOutputHandler<SequenceOutputHandler.SequenceOutputProcessor>
+{
+    public ParagraphStep()
+    {
+        super(ModuleLoader.getInstance().getModule(SequenceAnalysisModule.class), "Paragraph SV Genotyping", "This will run paraGRAPH on one or more BAM files to genotype SVs", null, Arrays.asList(
+                ToolParameterDescriptor.createExpDataParam("svVCF", "Input VCF", "This is the DataId of the VCF containing the SVs to genotype", "ldk-expdatafield", new JSONObject()
+                {{
+                    put("allowBlank", false);
+                }}, null)
+        ));
+    }
+
+    @Override
+    public boolean doSplitJobs()
+    {
+        return true;
+    }
+
+    @Override
+    public boolean canProcess(SequenceOutputFile o)
+    {
+        return o.getFile() != null && o.getFile().exists() && SequenceUtil.FILETYPE.bamOrCram.getFileType().isType(o.getFile());
+    }
+
+    @Override
+    public boolean doRunRemote()
+    {
+        return true;
+    }
+
+    @Override
+    public boolean doRunLocal()
+    {
+        return false;
+    }
+
+    @Override
+    public SequenceOutputProcessor getProcessor()
+    {
+        return new Processor();
+    }
+
+    public static class Processor implements SequenceOutputProcessor
+    {
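+        // Per input: idxdepth estimates depth and read length, those values are written to a small coverage manifest, and multigrmpy.py then genotypes the SV VCF against the staged BAM/CRAM inside the container.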
+        @Override
+        public void processFilesOnWebserver(PipelineJob job, SequenceAnalysisJobSupport support, List<SequenceOutputFile> inputFiles, JSONObject params, File outputDir, List<RecordedAction> actions, List<SequenceOutputFile> outputsToCreate) throws UnsupportedOperationException, PipelineJobException
+        {
+
+        }
+
+        @Override
+        public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext ctx) throws UnsupportedOperationException, PipelineJobException
+        {
+            int svVcfId = ctx.getParams().optInt("svVCF", 0);
+            if (svVcfId == 0)
+            {
+                throw new PipelineJobException("svVCF param was null");
+            }
+
+            File svVcf = ctx.getSequenceSupport().getCachedData(svVcfId);
+            if (svVcf == null)
+            {
+                throw new PipelineJobException("File not found for ID: " + svVcfId);
+            }
+            else if (!svVcf.exists())
+            {
+                throw new PipelineJobException("Missing file: " + svVcf.getPath());
+            }
+
+            Integer threads = SequencePipelineService.get().getMaxThreads(ctx.getLogger());
+            for (SequenceOutputFile so : inputFiles)
+            {
+                List<String> depthArgs = new ArrayList<>();
+                depthArgs.add("idxdepth");
+                depthArgs.add("-b");
+                depthArgs.add(so.getFile().getPath());
+
+                File coverageJson = new File(ctx.getWorkingDirectory(), "coverage.json");
+                depthArgs.add("-o");
+                depthArgs.add(coverageJson.getPath());
+
+                depthArgs.add("-r");
+                depthArgs.add(ctx.getSequenceSupport().getCachedGenome(so.getLibrary_id()).getWorkingFastaFile().getPath());
+
+                if (threads != null)
+                {
+                    depthArgs.add("--threads");
+                    depthArgs.add(threads.toString());
+                }
+
+                new SimpleScriptWrapper(ctx.getLogger()).execute(depthArgs);
+
+                if (!coverageJson.exists())
+                {
+                    throw new PipelineJobException("Missing file: " + coverageJson.getPath());
+                }
+                ctx.getFileManager().addIntermediateFile(coverageJson);
+
+                // Should produce a simple text file:
+                // id    path    depth   read length
+                // TNPRC-IB18    ../IB18.cram    29.77   150
+                File coverageFile = new File(ctx.getWorkingDirectory(), "coverage.txt");
+                String rgId = null;
+                try (PrintWriter writer = PrintWriters.getPrintWriter(coverageFile); SamReader reader = SamReaderFactory.makeDefault().open(so.getFile()))
+                {
+                    SAMFileHeader header = reader.getFileHeader();
+                    if (header.getReadGroups().size() == 0)
+                    {
+                        throw new PipelineJobException("No read groups found in input BAM");
+                    }
+                    else if (header.getReadGroups().size() > 1)
+                    {
+                        throw new PipelineJobException("More than one read group found in BAM");
+                    }
+
+                    rgId = header.getReadGroups().get(0).getSample();
+
+                    JSONObject json = new JSONObject(FileUtils.readFileToString(coverageJson, Charset.defaultCharset()));
+                    writer.println("id\tpath\tdepth\tread length");
+                    double depth = json.getJSONObject("autosome").getDouble("depth");
+                    double readLength = json.getInt("read_length");
+                    writer.println(rgId + "\t" + "/work/" + so.getFile().getName() + "\t" + depth + "\t" + readLength);
+                }
+                catch (IOException e)
+                {
+                    throw new PipelineJobException(e);
+                }
+                ctx.getFileManager().addIntermediateFile(coverageFile);
+
+                DockerWrapper dockerWrapper = new DockerWrapper("ghcr.io/bimberlabinternal/paragraph:latest", ctx.getLogger());
+                List<String> paragraphArgs = new ArrayList<>();
+                paragraphArgs.add("/opt/paragraph/bin/multigrmpy.py");
+
+                dockerWrapper.ensureLocalCopy(so.getFile(), ctx.getWorkingDirectory(), ctx.getFileManager());
+                dockerWrapper.ensureLocalCopy(SequenceAnalysisService.get().getExpectedBamOrCramIndex(so.getFile()), ctx.getWorkingDirectory(), ctx.getFileManager());
+
+                File paragraphOutDir = new File(ctx.getWorkingDirectory(), FileUtil.getBaseName(so.getFile()));
+                paragraphArgs.add("-o");
+                paragraphArgs.add("/work/" + paragraphOutDir.getName());
+
+                paragraphArgs.add("-i");
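+                // Anything handed to the container must live under the mounted working directory, so the VCF and its index are copied alongside the BAM and referenced via /work/ paths.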
+                dockerWrapper.ensureLocalCopy(svVcf, ctx.getWorkingDirectory(), ctx.getFileManager());
+                dockerWrapper.ensureLocalCopy(new File(svVcf.getPath() + ".tbi"), ctx.getWorkingDirectory(), ctx.getFileManager());
+                paragraphArgs.add("/work/" + svVcf.getName());
+
+                paragraphArgs.add("-m");
+                paragraphArgs.add("/work/" + coverageFile.getName());
+
+                paragraphArgs.add("-r");
+                File genomeFasta = ctx.getSequenceSupport().getCachedGenome(so.getLibrary_id()).getWorkingFastaFile();
+                dockerWrapper.ensureLocalCopy(genomeFasta, ctx.getWorkingDirectory(), ctx.getFileManager());
+                dockerWrapper.ensureLocalCopy(new File(genomeFasta.getPath() + ".fai"), ctx.getWorkingDirectory(), ctx.getFileManager());
+                paragraphArgs.add("/work/" + genomeFasta.getName());
+
+                paragraphArgs.add("--scratch-dir");
+                paragraphArgs.add("/tmp");
+                dockerWrapper.setTmpDir(new File(SequencePipelineService.get().getJavaTempDir()));
+
+                if (threads != null)
+                {
+                    paragraphArgs.add("--threads");
+                    paragraphArgs.add(threads.toString());
+                }
+
+                dockerWrapper.executeWithDocker(paragraphArgs, ctx.getWorkingDirectory(), ctx.getFileManager());
+
+                File genotypes = new File(paragraphOutDir, "genotypes.vcf.gz");
+                if (!genotypes.exists())
+                {
+                    throw new PipelineJobException("Missing file: " + genotypes.getPath());
+                }
+
+                try
+                {
+                    SequenceAnalysisService.get().ensureVcfIndex(genotypes, ctx.getLogger());
+                }
+                catch (IOException e)
+                {
+                    throw new PipelineJobException(e);
+                }
+
+                ctx.getFileManager().addSequenceOutput(genotypes, "paraGRAPH Genotypes: " + so.getName(), "paraGRAPH Genotypes", so.getReadset(), null, so.getLibrary_id(), "Input VCF: " + svVcf.getName() + " (" + svVcfId + ")");
+
+                ctx.getFileManager().addIntermediateFile(new File(paragraphOutDir, "variants.json.gz"));
+                ctx.getFileManager().addIntermediateFile(new File(paragraphOutDir, "variants.vcf.gz"));
+                ctx.getFileManager().addIntermediateFile(new File(paragraphOutDir, "genotypes.json.gz"));
+                ctx.getFileManager().addIntermediateFile(new File(paragraphOutDir, "grmpy.log"));
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/StarWrapper.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/StarWrapper.java
index d5a0cd1f2..38c635908 100644
--- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/StarWrapper.java
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/alignment/StarWrapper.java
@@ -53,7 +53,7 @@ public StarWrapper(@Nullable Logger logger)
 
     public static class StarAlignmentStep extends AbstractAlignmentPipelineStep<StarWrapper> implements AlignmentStep
     {
-        public StarAlignmentStep(AlignmentStepProvider provider, PipelineContext ctx)
+        public StarAlignmentStep(AlignmentStepProvider<?> provider, PipelineContext ctx)
         {
             super(provider, ctx, new StarWrapper(ctx.getLogger()));
         }
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/bampostprocessing/AddOrReplaceReadGroupsStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/bampostprocessing/AddOrReplaceReadGroupsStep.java
index 776a42182..4b845742d 100644
--- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/bampostprocessing/AddOrReplaceReadGroupsStep.java
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/bampostprocessing/AddOrReplaceReadGroupsStep.java
@@ -11,6 +11,7 @@ import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
 import org.labkey.api.sequenceanalysis.run.AbstractCommandPipelineStep;
 import org.labkey.sequenceanalysis.run.util.AddOrReplaceReadGroupsWrapper;
+import org.labkey.sequenceanalysis.util.SequenceUtil;
 
 import java.io.File;
 
@@ -48,7 +49,7 @@ public Output processBam(Readset rs, File inputBam, ReferenceGenome referenceGen
         File outputBam = new File(outputDirectory, FileUtil.getBaseName(inputBam) + ".readgroups.bam");
         output.addIntermediateFile(outputBam);
 
-        output.setBAM(getWrapper().executeCommand(inputBam, outputBam, rs.getReadsetId().toString(), rs.getPlatform(), rs.getReadsetId().toString(), rs.getName().replaceAll(" ", "_")));
+        output.setBAM(getWrapper().executeCommand(inputBam, outputBam, rs.getReadsetId().toString(), rs.getPlatform(), rs.getReadsetId().toString(), SequenceUtil.getLegalReadGroupName(rs)));
 
         return output;
     }
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/LiftoverBcfToolsWrapper.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/LiftoverBcfToolsWrapper.java
new file mode 100644
index 000000000..1a7c575ca
--- /dev/null
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/LiftoverBcfToolsWrapper.java
@@ -0,0 +1,95 @@
+package org.labkey.sequenceanalysis.run.util;
+
+import org.apache.logging.log4j.Logger;
+import org.jetbrains.annotations.Nullable;
+import org.labkey.api.pipeline.PipelineJobException;
+import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
+import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
+import org.labkey.api.sequenceanalysis.run.PicardWrapper;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Created by bimber on 3/24/2016.
+ */
+public class LiftoverBcfToolsWrapper extends PicardWrapper
+{
+    public LiftoverBcfToolsWrapper(@Nullable Logger logger)
+    {
+        super(logger);
+    }
+
+    public void doLiftover(File inputVcf, File chainFile, File sourceGenomeFasta, File targetGenomeFasta, @Nullable File rejectVcf, File outputVcf) throws PipelineJobException
+    {
+        getLogger().info("Liftover VCF (bcftools): " + inputVcf.getPath());
+
+        List<String> params = new ArrayList<>();
+        params.add(SequencePipelineService.get().getExeForPackage("BCFTOOLS", "bcftools").getPath());
+        params.add("+liftover");
+
+        params.add("--no-version");
+        params.add("-Oz");
+
+        Integer threads = SequencePipelineService.get().getMaxThreads(getLogger());
+        if (threads != null)
+        {
+            params.add("--threads");
+            params.add(threads.toString());
+        }
+
+        params.add("-o");
+        params.add(outputVcf.getPath());
+
+        params.add(inputVcf.getPath());
+        params.add("--");
+
+        params.add("-s");
+        params.add(sourceGenomeFasta.getPath());
+
+        params.add("-f");
+        params.add(targetGenomeFasta.getPath());
+
+        params.add("-c");
+        params.add(chainFile.getPath());
+
+        params.add("--write-src");
+        params.add("--fix-tags");
+
+        if (rejectVcf != null)
+        {
+            params.add("--reject");
+            params.add(rejectVcf.getPath());
+
+            params.add("--reject-type");
+            params.add("z");
+        }
+
+        execute(params);
+
+        if (!outputVcf.exists())
+        {
+            throw new PipelineJobException("Output file could not be found: " + outputVcf.getPath());
+        }
+
+        if (rejectVcf != null && rejectVcf.exists())
+        {
+            try
+            {
+                SequenceAnalysisService.get().ensureVcfIndex(rejectVcf, getLogger());
+            }
+            catch (IOException e)
+            {
+                throw new PipelineJobException(e);
+            }
+        }
+    }
+
+    @Override
+    protected String getToolName()
+    {
+        return "LiftoverVcf";
+    }
+}
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/SVAnnotateStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/SVAnnotateStep.java
new file mode 100644
index 000000000..cac32155a
--- /dev/null
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/util/SVAnnotateStep.java
@@ -0,0 +1,104 @@
+package org.labkey.sequenceanalysis.run.util;
+
+import htsjdk.samtools.util.Interval;
+import org.apache.logging.log4j.Logger;
+import org.json.JSONObject;
+import org.labkey.api.pipeline.PipelineJobException;
+import org.labkey.api.sequenceanalysis.pipeline.AbstractVariantProcessingStepProvider;
+import org.labkey.api.sequenceanalysis.pipeline.PipelineContext;
+import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider;
+import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
+import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
+import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep;
+import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStepOutputImpl;
+import org.labkey.api.sequenceanalysis.run.AbstractCommandPipelineStep;
+import org.labkey.api.sequenceanalysis.run.AbstractGatk4Wrapper;
+import org.labkey.api.util.PageFlowUtil;
+import org.labkey.sequenceanalysis.pipeline.SequenceTaskHelper;
+
+import javax.annotation.Nullable;
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class SVAnnotateStep extends AbstractCommandPipelineStep<SVAnnotateStep.SNAnnotateWrapper> implements VariantProcessingStep
+{
+    public static final String GENE_PARAM = "gene_file";
+
+    public SVAnnotateStep(PipelineStepProvider<?> provider, PipelineContext ctx)
+    {
+        super(provider, ctx, new SNAnnotateWrapper(ctx.getLogger()));
+    }
+
+    public static class Provider extends AbstractVariantProcessingStepProvider<SVAnnotateStep>
+    {
+        public Provider()
+        {
+            super("SVAnnotateStep", "GATK SVAnnotate", "GATK", "This will run GATK's SVAnnotate to classify SVs by impact", Arrays.asList(
+                    ToolParameterDescriptor.createExpDataParam(GENE_PARAM, "Gene File", "This is the ID of a GTF or GFF3 file containing genes from this genome.", "sequenceanalysis-genomefileselectorfield", new JSONObject()
+                    {{
+                        put("extensions", Arrays.asList("gtf"));
+                        put("width", 400);
+                        put("allowBlank", false);
+                    }}, null)
+            ), PageFlowUtil.set("sequenceanalysis/field/GenomeFileSelectorField.js"), "");
+        }
+
+        @Override
+        public SVAnnotateStep create(PipelineContext ctx)
+        {
+            return new SVAnnotateStep(this, ctx);
+        }
+    }
+
+    @Override
+    public Output processVariants(File inputVCF, File outputDirectory, ReferenceGenome genome, @Nullable List<Interval> intervals) throws PipelineJobException
+    {
+        VariantProcessingStepOutputImpl output = new VariantProcessingStepOutputImpl();
+
+        output.addInput(inputVCF, "Input VCF");
+        output.addInput(genome.getWorkingFastaFile(), "Reference Genome");
+
+        List<String> args = new ArrayList<>(getWrapper().getBaseArgs("SVAnnotate"));
+        args.add("-V");
+        args.add(inputVCF.getPath());
+
+        if (intervals != null)
+        {
+            intervals.forEach(interval -> {
+                args.add("-L");
+                args.add(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
+            });
+        }
+
+        Integer geneFileId = getProvider().getParameterByName(GENE_PARAM).extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Integer.class);
+        File geneFile = getPipelineCtx().getSequenceSupport().getCachedData(geneFileId);
+        if (!geneFile.exists())
+        {
+            throw new PipelineJobException("Unable to find file: " + geneFile.getPath());
+        }
+        args.add("--protein-coding-gtf");
+        args.add(geneFile.getPath());
+
+        File outputVcf = new File(outputDirectory, SequenceTaskHelper.getUnzippedBaseName(inputVCF) + ".svannotate.vcf.gz");
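+        // Presumably the output must be routed to the expected path via -O, GATK's standard output argument; the existence check below fails without it.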
+        args.add("-O");
+        args.add(outputVcf.getPath());
+
+        getWrapper().execute(args);
+        if (!outputVcf.exists())
+        {
+            throw new PipelineJobException("output not found: " + outputVcf);
+        }
+
+        output.setVcf(outputVcf);
+
+        return output;
+    }
+
+    public static class SNAnnotateWrapper extends AbstractGatk4Wrapper
+    {
+        public SNAnnotateWrapper(@Nullable Logger logger)
+        {
+            super(logger);
+        }
+    }
+}
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SampleRenameStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SampleRenameStep.java
index 46ffae773..6af568325 100644
--- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SampleRenameStep.java
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SampleRenameStep.java
@@ -27,6 +27,7 @@ import java.io.File;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -110,7 +111,7 @@ else if (enforceChangeAll)
         List<Interval> queryIntervals = intervals;
         if (queryIntervals == null || queryIntervals.isEmpty())
         {
-            queryIntervals.add(null);
+            queryIntervals = Collections.singletonList(null);
         }
 
         int i = 0;
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SelectSNVsStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SelectSNVsStep.java
index ea057d26d..a94a218b7 100644
--- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SelectSNVsStep.java
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/SelectSNVsStep.java
@@ -33,7 +33,7 @@ public class SelectSNVsStep extends AbstractCommandPipelineStep<SelectVariantsWrapper> implements VariantProcessingStep
 {
-    public SelectSNVsStep(PipelineStepProvider provider, PipelineContext ctx)
+    public SelectSNVsStep(PipelineStepProvider<?> provider, PipelineContext ctx)
     {
         super(provider, ctx, new SelectVariantsWrapper(ctx.getLogger()));
     }
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ChainFileValidator.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ChainFileValidator.java
index 90978fcf0..9055c0cbe 100644
--- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ChainFileValidator.java
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ChainFileValidator.java
@@ -420,7 +420,8 @@ else if (refName.endsWith("_alt"))
             refName = StringUtils.join(Arrays.copyOfRange(tokens, 1, tokens.length), "_");
         }
 
-        if (refName.equals("chrM"))
+        // NOTE: hg19 and GRCh37 have different MT contigs. chrM is the legacy version.
+        if (refName.equals("chrMT"))
         {
             return "MT";
         }
diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/SequenceUtil.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/SequenceUtil.java
index f270b7e8d..6808024d9 100644
--- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/SequenceUtil.java
+++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/SequenceUtil.java
@@ -31,6 +31,7 @@ import org.json.JSONObject;
 import org.labkey.api.pipeline.PipelineJobException;
 import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
+import org.labkey.api.sequenceanalysis.model.Readset;
 import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
 import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
 import org.labkey.api.sequenceanalysis.run.CommandWrapper;
@@ -650,4 +651,14 @@ else if (FILETYPE.cram.getFileType().isType(bamOrCram))
 
         return null;
     }
+
+    public static String getLegalReadGroupName(Readset rs)
+    {
+        return getLegalReadGroupName(rs.getName());
+    }
+
+    public static String getLegalReadGroupName(String rsName)
+    {
+        return rsName.replaceAll(" ", "_");
+    }
 }
\ No newline at end of file
diff --git a/cluster/resources/web/cluster/htcondor/pipelineConfig.xml b/cluster/resources/web/cluster/htcondor/pipelineConfig.xml
index 36f8e6c7a..e32a28080 100644
--- a/cluster/resources/web/cluster/htcondor/pipelineConfig.xml
+++ b/cluster/resources/web/cluster/htcondor/pipelineConfig.xml
@@ -89,27 +89,6 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/cluster/src/org/labkey/cluster/pipeline/AbstractClusterEngineConfig.java b/cluster/src/org/labkey/cluster/pipeline/AbstractClusterEngineConfig.java
index 2ca6bd501..7d6b38fd0 100644
--- a/cluster/src/org/labkey/cluster/pipeline/AbstractClusterEngineConfig.java
+++ b/cluster/src/org/labkey/cluster/pipeline/AbstractClusterEngineConfig.java
@@ -109,7 +109,7 @@ protected List<String> getFinalJavaOpts(PipelineJob job, RemoteExecutionEngine e
         return javaOpts;
     }
 
-    public List<String> getJobArgs(File localPipelineDir, File localSerializedJobXmlFile, PipelineJob job, RemoteExecutionEngine engine)
+    public List<String> getJobArgs(File localPipelineDir, File localSerializedJobXmlFile, PipelineJob job, RemoteExecutionEngine<?> engine)
     {
         List<String> ret = new ArrayList<>();
         ret.addAll(getFinalJavaOpts(job, engine));
diff --git a/cluster/src/org/labkey/cluster/pipeline/DockerHTCondorExecutionEngineConfig.java b/cluster/src/org/labkey/cluster/pipeline/DockerHTCondorExecutionEngineConfig.java
deleted file mode 100644
index 6293818ac..000000000
--- a/cluster/src/org/labkey/cluster/pipeline/DockerHTCondorExecutionEngineConfig.java
+++ /dev/null
@@ -1,100 +0,0 @@
-package org.labkey.cluster.pipeline;
-
-import org.labkey.api.pipeline.PipelineJob;
-import org.labkey.api.pipeline.RemoteExecutionEngine;
-
-import java.io.File;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Created by bimber on 10/31/2015.
- */
-public class DockerHTCondorExecutionEngineConfig extends HTCondorExecutionEngineConfig
-{
-    private String _dockerImageName = "bbimber/discvr-seq";
-    private String _configDir = "";
-    protected String _activeMqHost = "";
-
-    public DockerHTCondorExecutionEngineConfig()
-    {
-        _remoteExecutable = "docker";
-        _labKeyDir = "/labkey";
-    }
-
-    @Override
-    public List<String> getExtraSubmitLines()
-    {
-        List<String> ret = super.getExtraSubmitLines();
-        ret.add("requirements = (TARGET.IsDockerComputeNode =?= True)");
-
-        return ret;
-    }
-
-    @Override
-    public List<String> getJobArgs(File localPipelineDir, File localSerializedJobFile, PipelineJob job, RemoteExecutionEngine engine)
-    {
-        List<String> ret = new ArrayList<>();
-        ret.add("run");
-        ret.add("--rm=true");
-        //TODO: add flag to force rebuild of image
-        //ret.add("-e");
-        //ret.add("ACTIVEMQ_HOST=X");
-        //TODO: mount whole file root
-        ret.add("-v");
-        ret.add(getClusterPath(localPipelineDir) + ":/data");
-        ret.add("-v");
-        ret.add("/mnt/scratch:/work");
-        ret.add("-v");
-        ret.add(_configDir + ":/labkey/config");
-
-        ret.add("--add-host=activeMqServer:" + _activeMqHost);
-        //TODO: add -env for CPUs, memory?
-        ret.add(_dockerImageName);
-        ret.add("java");
-        ret.addAll(getFinalJavaOpts(job, engine));
-
-        //TODO: support as config param
-        //ret.add("-Xdebug");
-        //ret.add("-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5005");
-        ret.add("-cp");
-        ret.add("/labkey/labkeyBootstrap.jar");
-        ret.add("org.labkey.bootstrap.ClusterBootstrap");
-        ret.add("-modulesdir=" + "/labkey/modules");
-        ret.add("-webappdir=" + "/labkey/labkeywebapp");
-        ret.add("-configdir=" + "/labkey/config");
-        ret.add(getClusterPath(localSerializedJobFile, true));
-
-        return ret;
-    }
-
-    public String getDockerImageName()
-    {
-        return _dockerImageName;
-    }
-
-    public void setDockerImageName(String dockerImageName)
-    {
-        _dockerImageName = dockerImageName;
-    }
-
-    public String getConfigDir()
-    {
-        return _configDir;
-    }
-
-    public void setConfigDir(String configDir)
-    {
-        _configDir = configDir;
-    }
-
-    public String getActiveMqHost()
-    {
-        return _activeMqHost;
-    }
-
-    public void setActiveMqHost(String activeMqHost)
-    {
-        _activeMqHost = activeMqHost;
-    }
-}
diff --git a/jbrowse/api-src/org/labkey/api/jbrowse/JBrowseService.java b/jbrowse/api-src/org/labkey/api/jbrowse/JBrowseService.java
index 513adb923..4e14293a1 100644
--- a/jbrowse/api-src/org/labkey/api/jbrowse/JBrowseService.java
+++ b/jbrowse/api-src/org/labkey/api/jbrowse/JBrowseService.java
@@ -56,4 +56,6 @@ public interface LuceneIndexDetector
         boolean isAvailable(Container c);
     }
+
+    abstract public void clearLuceneCacheEntry(File luceneIndexDir);
 }
diff --git a/jbrowse/package-lock.json b/jbrowse/package-lock.json
index dc3e491f6..1bfd1235d 100644
--- a/jbrowse/package-lock.json
+++ b/jbrowse/package-lock.json
@@ -6793,11 +6793,11 @@
     },
     "node_modules/braces": {
-      "version": "3.0.2",
-      "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz",
-      "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==",
+      "version": "3.0.3",
+      "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz",
+      "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==",
       "dependencies": {
-        "fill-range": "^7.0.1"
+        "fill-range": "^7.1.1"
       },
       "engines": {
         "node": ">=8"
@@ -9112,9 +9112,9 @@
       "integrity": "sha512-P9bmyZ3h/PRG+Nzga+rbdI4OEpNDzAVyy74uVO9ATgzLK6VtAsYybF/+TOCvrc0MO793d6+42lLyZTw7/ArVzA=="
     },
     "node_modules/fill-range": {
-      "version": "7.0.1",
-      "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz",
-      "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==",
+      "version": "7.1.1",
+      "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz",
+      "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==",
       "dependencies": {
        "to-regex-range": "^5.0.1"
      },
diff --git a/jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java b/jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java
index 1b2780f01..f886d1c86 100644
--- a/jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java
+++ b/jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java
@@ -19,7 +19,6 @@ import org.apache.lucene.search.LRUQueryCache;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.QueryCache;
 import org.apache.lucene.search.QueryCachingPolicy;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
@@ -27,11 +26,14 @@ import org.apache.lucene.search.UsageTrackingQueryCachingPolicy;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
+import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;
 import org.json.JSONArray;
 import org.json.JSONObject;
 import org.labkey.api.cache.Cache;
+import org.labkey.api.cache.CacheLoader;
 import org.labkey.api.cache.CacheManager;
+import org.labkey.api.cache.TrackingCache;
 import org.labkey.api.data.Container;
 import org.labkey.api.data.ContainerManager;
 import org.labkey.api.jbrowse.AbstractJBrowseFieldCustomizer;
@@ -40,6 +42,8 @@ import org.labkey.api.module.ModuleLoader;
 import org.labkey.api.security.User;
 import org.labkey.api.settings.AppProps;
+import org.labkey.api.util.Filter;
+import org.labkey.api.util.ShutdownListener;
 import org.labkey.api.util.logging.LogHelper;
 import org.labkey.jbrowse.model.JBrowseSession;
 import org.labkey.jbrowse.model.JsonFile;
@@ -56,6 +60,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.StringTokenizer;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -76,7 +81,7 @@ public class JBrowseLuceneSearch
     private static final int maxCachedQueries = 1000;
     private static final long maxRamBytesUsed = 250 * 1024 * 1024L;
 
-    private static final Cache<String, LRUQueryCache> _cache = CacheManager.getStringKeyCache(1000, CacheManager.UNLIMITED, "JBrowseLuceneSearchCache");
+    private static final Cache<String, CacheEntry> _cache = new LuceneIndexCache();
 
     private JBrowseLuceneSearch(final JBrowseSession session, final JsonFile jsonFile, User u)
     {
@@ -97,18 +102,36 @@ public static JBrowseLuceneSearch create(String sessionId, String trackId, User
         return new JBrowseLuceneSearch(session, getTrack(session, trackId, u), u);
     }
 
-    private static synchronized QueryCache getCacheForSession(String trackObjectId) {
-        LRUQueryCache qc = _cache.get(trackObjectId);
-        if (qc == null)
+    private static synchronized CacheEntry getCacheEntryForSession(String trackObjectId, File indexPath) throws IOException {
+        CacheEntry cacheEntry = _cache.get(trackObjectId);
+
+        // Open directory of lucene path, get a directory reader, and create the index search manager
+        if (cacheEntry == null)
         {
-            qc = new LRUQueryCache(maxCachedQueries, maxRamBytesUsed);
-            _cache.put(trackObjectId, qc);
+            try
+            {
+                Directory indexDirectory = FSDirectory.open(indexPath.toPath());
queryCache = new LRUQueryCache(maxCachedQueries, maxRamBytesUsed); + IndexReader indexReader = DirectoryReader.open(indexDirectory); + IndexSearcher indexSearcher = new IndexSearcher(indexReader); + indexSearcher.setQueryCache(queryCache); + indexSearcher.setQueryCachingPolicy(new ForceMatchAllDocsCachingPolicy()); + cacheEntry = new CacheEntry(queryCache, indexSearcher, indexPath); + _cache.put(trackObjectId, cacheEntry); + } + catch (Exception e) + { + _log.error("Error creating jbrowse/lucene index reader for: " + trackObjectId, e); + + throw new IllegalStateException("Error creating search index reader for: " + trackObjectId); + } } - return qc; + return cacheEntry; } - private String templateReplace(final String searchString) { + private String templateReplace(final String searchString) + { String result = searchString; Pattern pattern = Pattern.compile("~(.*?)~"); Matcher matcher = pattern.matcher(searchString); @@ -128,25 +151,34 @@ private String templateReplace(final String searchString) { return result; } - private String tryUrlDecode(String input) { - try { + private String tryUrlDecode(String input) + { + try + { //special case for urls containing +; this isn't necessary for strings sent from the client-side, but URLs //sent via unit tests autodecode, and strings containing + rather than the URL-encoded symbol are unsafe //to pass through URLDecoded.decode - if (input.contains("+")) { + if (input.contains("+")) + { return input; } return URLDecoder.decode(input, StandardCharsets.UTF_8); - } catch (IllegalArgumentException e) { + } catch (IllegalArgumentException e) + { + _log.error("Unable to URL decode input string: " + input, e); + return input; } } - public String extractFieldName(String queryString) { + public String extractFieldName(String queryString) + { // Check if the query starts with any of the start patterns - for (String pattern : specialStartPatterns) { - if (queryString.startsWith(pattern)) { + for (String pattern : specialStartPatterns) + { + if (queryString.startsWith(pattern)) + { queryString = queryString.substring(pattern.length()).trim(); break; } @@ -163,152 +195,148 @@ public JSONObject doSearch(User u, String searchString, final int pageSize, fina File indexPath = _jsonFile.getExpectedLocationOfLuceneIndex(true); Map fields = JBrowseFieldUtils.getIndexedFields(_jsonFile, u, getContainer()); - // Open directory of lucene path, get a directory reader, and create the index search manager - try ( - Directory indexDirectory = FSDirectory.open(indexPath.toPath()); - IndexReader indexReader = DirectoryReader.open(indexDirectory); - Analyzer analyzer = new StandardAnalyzer(); - ) - { - IndexSearcher indexSearcher = new IndexSearcher(indexReader); - indexSearcher.setQueryCache(getCacheForSession(_jsonFile.getObjectId())); - indexSearcher.setQueryCachingPolicy(new ForceMatchAllDocsCachingPolicy()); + CacheEntry cacheEntry = getCacheEntryForSession(_jsonFile.getObjectId(), indexPath); + Analyzer analyzer = new StandardAnalyzer(); - List stringQueryParserFields = new ArrayList<>(); - Map numericQueryParserFields = new HashMap<>(); - PointsConfig intPointsConfig = new PointsConfig(new DecimalFormat(), Integer.class); - PointsConfig doublePointsConfig = new PointsConfig(new DecimalFormat(), Double.class); - Map pointsConfigMap = new HashMap<>(); + List stringQueryParserFields = new ArrayList<>(); + Map numericQueryParserFields = new HashMap<>(); + PointsConfig intPointsConfig = new PointsConfig(new DecimalFormat(), Integer.class); + PointsConfig doublePointsConfig = 
new PointsConfig(new DecimalFormat(), Double.class); + Map pointsConfigMap = new HashMap<>(); - // Iterate fields and split them into fields for the queryParser and the numericQueryParser - for (Map.Entry entry : fields.entrySet()) - { - String field = entry.getKey(); - JBrowseFieldDescriptor descriptor = entry.getValue(); + // Iterate fields and split them into fields for the queryParser and the numericQueryParser + for (Map.Entry entry : fields.entrySet()) + { + String field = entry.getKey(); + JBrowseFieldDescriptor descriptor = entry.getValue(); - switch(descriptor.getType()) - { - case Flag, String, Character -> stringQueryParserFields.add(field); - case Float -> { - numericQueryParserFields.put(field, SortField.Type.DOUBLE); - pointsConfigMap.put(field, doublePointsConfig); - } - case Integer -> { - numericQueryParserFields.put(field, SortField.Type.INT); - pointsConfigMap.put(field, intPointsConfig); - } + switch(descriptor.getType()) + { + case Flag, String, Character -> stringQueryParserFields.add(field); + case Float -> { + numericQueryParserFields.put(field, SortField.Type.DOUBLE); + pointsConfigMap.put(field, doublePointsConfig); + } + case Integer -> { + numericQueryParserFields.put(field, SortField.Type.LONG); + pointsConfigMap.put(field, intPointsConfig); } } + } - // The numericQueryParser can perform range queries, but numeric fields they can't be indexed alongside - // lexicographic fields, so they get split into a separate parser - MultiFieldQueryParser queryParser = new MultiFieldQueryParser(stringQueryParserFields.toArray(new String[0]), new StandardAnalyzer()); - queryParser.setAllowLeadingWildcard(true); + // The numericQueryParser can perform range queries, but numeric fields they can't be indexed alongside + // lexicographic fields, so they get split into a separate parser + MultiFieldQueryParser queryParser = new MultiFieldQueryParser(stringQueryParserFields.toArray(new String[0]), new StandardAnalyzer()); + queryParser.setAllowLeadingWildcard(true); - StandardQueryParser numericQueryParser = new StandardQueryParser(); - numericQueryParser.setAnalyzer(analyzer); - numericQueryParser.setPointsConfigMap(pointsConfigMap); + StandardQueryParser numericQueryParser = new StandardQueryParser(); + numericQueryParser.setAnalyzer(analyzer); + numericQueryParser.setPointsConfigMap(pointsConfigMap); - BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder(); - if (searchString.equals(ALL_DOCS)) { - booleanQueryBuilder.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST); - } + if (searchString.equals(ALL_DOCS)) + { + booleanQueryBuilder.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST); + } - // Split input into tokens, 1 token per query separated by & - StringTokenizer tokenizer = new StringTokenizer(searchString, "&"); + // Split input into tokens, 1 token per query separated by & + StringTokenizer tokenizer = new StringTokenizer(searchString, "&"); - while (tokenizer.hasMoreTokens() && !searchString.equals(ALL_DOCS)) - { - String queryString = tokenizer.nextToken(); - Query query = null; + while (tokenizer.hasMoreTokens() && !searchString.equals(ALL_DOCS)) + { + String queryString = tokenizer.nextToken(); + Query query = null; - String fieldName = extractFieldName(queryString); + String fieldName = extractFieldName(queryString); - if (VARIABLE_SAMPLES.equals(fieldName)) - { - queryString = templateReplace(queryString); - } + if (VARIABLE_SAMPLES.equals(fieldName)) + { + queryString 
= templateReplace(queryString); + } - if (stringQueryParserFields.contains(fieldName)) - { - query = queryParser.parse(queryString); - } - else if (numericQueryParserFields.containsKey(fieldName)) + if (stringQueryParserFields.contains(fieldName)) + { + query = queryParser.parse(queryString); + } + else if (numericQueryParserFields.containsKey(fieldName)) + { + try { - try - { - query = numericQueryParser.parse(queryString, ""); - } - catch (QueryNodeException e) - { - e.printStackTrace(); - } + query = numericQueryParser.parse(queryString, ""); } - else + catch (QueryNodeException e) { - throw new IllegalArgumentException("No such field(s), or malformed query: " + queryString + ", field: " + fieldName); + _log.error("Unable to parse query for field " + fieldName + ": " + queryString, e); + + throw new IllegalArgumentException("Unable to parse query: " + queryString + " for field: " + fieldName); } + } + else + { + _log.error("No such field(s), or malformed query: " + queryString + ", field: " + fieldName); - booleanQueryBuilder.add(query, BooleanClause.Occur.MUST); + throw new IllegalArgumentException("No such field(s), or malformed query: " + queryString + ", field: " + fieldName); } - BooleanQuery query = booleanQueryBuilder.build(); + booleanQueryBuilder.add(query, BooleanClause.Occur.MUST); + } - // By default, sort in INDEXORDER, which is by genomicPosition - Sort sort = Sort.INDEXORDER; + BooleanQuery query = booleanQueryBuilder.build(); - // If the sort field is not genomicPosition, use the provided sorting data - if (!sortField.equals(GENOMIC_POSITION)) { - SortField.Type fieldType; + // By default, sort in INDEXORDER, which is by genomicPosition + Sort sort = Sort.INDEXORDER; - if (stringQueryParserFields.contains(sortField)) { - fieldType = SortField.Type.STRING; - } else if (numericQueryParserFields.containsKey(sortField)) { - fieldType = numericQueryParserFields.get(sortField); - } else { - throw new IllegalArgumentException("Could not find type for sort field: " + sortField); - } + // If the sort field is not genomicPosition, use the provided sorting data + if (!sortField.equals(GENOMIC_POSITION)) { + SortField.Type fieldType; - sort = new Sort(new SortField(sortField + "_sort", fieldType, sortReverse)); + if (stringQueryParserFields.contains(sortField)) { + fieldType = SortField.Type.STRING; + } else if (numericQueryParserFields.containsKey(sortField)) { + fieldType = numericQueryParserFields.get(sortField); + } else { + throw new IllegalArgumentException("Could not find type for sort field: " + sortField); } - // Get chunks of size {pageSize}. Default to 1 chunk -- add to the offset to get more. - // We then iterate over the range of documents we want based on the offset. This does grow in memory - // linearly with the number of documents, but my understanding is that these are just score,id pairs - // rather than full documents, so mem usage *should* still be pretty low. - // Perform the search with sorting - TopFieldDocs topDocs = indexSearcher.search(query, pageSize * (offset + 1), sort); + sort = new Sort(new SortField(sortField + "_sort", fieldType, sortReverse)); + } + + // Get chunks of size {pageSize}. Default to 1 chunk -- add to the offset to get more. + // We then iterate over the range of documents we want based on the offset. This does grow in memory + // linearly with the number of documents, but my understanding is that these are just score,id pairs + // rather than full documents, so mem usage *should* still be pretty low. 
+ // Perform the search with sorting + TopFieldDocs topDocs = cacheEntry.indexSearcher.search(query, pageSize * (offset + 1), sort); - JSONObject results = new JSONObject(); + JSONObject results = new JSONObject(); - // Iterate over the doc list, (either to the total end or until the page ends) grab the requested docs, - // and add to returned results - List data = new ArrayList<>(); - for (int i = pageSize * offset; i < Math.min(pageSize * (offset + 1), topDocs.scoreDocs.length); i++) - { - JSONObject elem = new JSONObject(); - Document doc = indexSearcher.storedFields().document(topDocs.scoreDocs[i].doc); - - for (IndexableField field : doc.getFields()) { - String fieldName = field.name(); - String[] fieldValues = doc.getValues(fieldName); - if (fieldValues.length > 1) { - elem.put(fieldName, fieldValues); - } else { - elem.put(fieldName, fieldValues[0]); - } + // Iterate over the doc list, (either to the total end or until the page ends) grab the requested docs, + // and add to returned results + List data = new ArrayList<>(); + for (int i = pageSize * offset; i < Math.min(pageSize * (offset + 1), topDocs.scoreDocs.length); i++) + { + JSONObject elem = new JSONObject(); + Document doc = cacheEntry.indexSearcher.storedFields().document(topDocs.scoreDocs[i].doc); + + for (IndexableField field : doc.getFields()) { + String fieldName = field.name(); + String[] fieldValues = doc.getValues(fieldName); + if (fieldValues.length > 1) { + elem.put(fieldName, fieldValues); + } else { + elem.put(fieldName, fieldValues[0]); } - - data.add(elem); } - results.put("data", data); - results.put("totalHits", topDocs.totalHits.value); - - //TODO: we should probably stream this - return results; + data.add(elem); } + + results.put("data", data); + results.put("totalHits", topDocs.totalHits.value); + + //TODO: we should probably stream this + return results; } public static class DefaultJBrowseFieldCustomizer extends AbstractJBrowseFieldCustomizer @@ -382,6 +410,8 @@ public boolean shouldCache(Query query) throws IOException { return true; } } + } else if (query instanceof MatchAllDocsQuery) { + return true; } return defaultPolicy.shouldCache(query); @@ -393,12 +423,148 @@ public void onUse(Query query) { } } + public static class LuceneIndexCache implements Cache + { + private final Cache _cache; + + public LuceneIndexCache() + { + _cache = CacheManager.getStringKeyCache(1000, CacheManager.UNLIMITED, "JBrowseLuceneSearchCache"); + + } + + @Override + public void remove(@NotNull String key) + { + CacheEntry e = get(key); + _cache.remove(key); + + closeReader(e); + } + + @Override + public void clear() + { + for (String key : getKeys()) + { + closeReader(get(key)); + } + + _cache.clear(); + } + + @Override + public void close() + { + for (String key : getKeys()) + { + closeReader(get(key)); + } + + _cache.close(); + } + + private void closeReader(@Nullable CacheEntry entry) + { + if (entry == null) + { + return; + } + + try + { + entry.getIndexSearcher().getIndexReader().close(); + } + catch (IOException e) + { + _log.error("Error closing JBrowseLuceneSearch index reader", e); + } + } + + @Override + public void put(@NotNull String key, CacheEntry value) + { + _cache.put(key, value); + } + + @Override + public void put(@NotNull String key, CacheEntry value, long timeToLive) + { + _cache.put(key, value, timeToLive); + } + + @Override + public CacheEntry get(@NotNull String key) + { + return _cache.get(key); + } + + @Override + public CacheEntry get(@NotNull String key, @Nullable Object arg, CacheLoader loader) 
+ { + return _cache.get(key, arg, loader); + } + + @Override + public int removeUsingFilter(Filter filter) + { + return _cache.removeUsingFilter(filter); + } + + @Override + public Set getKeys() + { + return _cache.getKeys(); + } + + @Override + public TrackingCache getTrackingCache() + { + return _cache.getTrackingCache(); + } + + @Override + public Cache createTemporaryCache() + { + return _cache.createTemporaryCache(); + } + } + + public static class CacheEntry + { + private final LRUQueryCache queryCache; + private final IndexSearcher indexSearcher; + private final File luceneIndexDir; + + public CacheEntry(LRUQueryCache queryCache, IndexSearcher indexSearcher, File luceneIndexDir) + { + this.queryCache = queryCache; + this.indexSearcher = indexSearcher; + this.luceneIndexDir = luceneIndexDir; + } + + public LRUQueryCache getQueryCache() + { + return queryCache; + } + + public IndexSearcher getIndexSearcher() + { + return indexSearcher; + } + + public File getLuceneIndexDir() + { + return luceneIndexDir; + } + } + public static JSONArray reportCacheInfo() { JSONArray cacheInfo = new JSONArray(); for (String sessionId : _cache.getKeys()) { - LRUQueryCache qc = _cache.get(sessionId); + LRUQueryCache qc = _cache.get(sessionId).getQueryCache(); JSONObject info = new JSONObject(); info.put("cacheSize", qc.getCacheSize()); info.put("cacheCount", qc.getCacheCount()); @@ -425,6 +591,23 @@ public void cacheDefaultQuery() } } + public static void emptyCache() + { + clearCache(null); + } + + public static void clearCacheForFile(@NotNull File luceneIndexDir) + { + for (String key : _cache.getKeys()) + { + CacheEntry entry = _cache.get(key); + if (entry != null && luceneIndexDir.equals(entry.getLuceneIndexDir())) + { + _cache.remove(key); + } + } + } + public static void clearCache(@Nullable String jbrowseTrackId) { if (jbrowseTrackId == null) @@ -436,4 +619,26 @@ public static void clearCache(@Nullable String jbrowseTrackId) _cache.remove(jbrowseTrackId); } } + + public static class ShutdownHandler implements ShutdownListener + { + @Override + public String getName() + { + return "JBrowse-Lucene Shutdown Listener"; + } + + @Override + public void shutdownPre() + { + + } + + @Override + public void shutdownStarted() + { + _log.info("Clearing all open JBrowse/Lucene cached readers"); + JBrowseLuceneSearch.emptyCache(); + } + } } diff --git a/jbrowse/src/org/labkey/jbrowse/JBrowseModule.java b/jbrowse/src/org/labkey/jbrowse/JBrowseModule.java index 318b1fb42..47c11c414 100644 --- a/jbrowse/src/org/labkey/jbrowse/JBrowseModule.java +++ b/jbrowse/src/org/labkey/jbrowse/JBrowseModule.java @@ -30,6 +30,7 @@ import org.labkey.api.sequenceanalysis.SequenceAnalysisService; import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; import org.labkey.api.settings.AdminConsole; +import org.labkey.api.util.ContextListener; import org.labkey.api.util.PageFlowUtil; import org.labkey.api.util.SystemMaintenance; import org.labkey.api.view.WebPartFactory; @@ -114,6 +115,8 @@ public void doStartupAfterSpringConfig(ModuleContext moduleContext) // These are all part of the JBrowse demo data: ContentSecurityPolicyFilter.registerAllowedConnectionSource(this.getClass().getName(), "https://jbrowse.org", "https://s3.amazonaws.com", "https://ftp.ncbi.nlm.nih.gov"); + + ContextListener.addShutdownListener(new JBrowseLuceneSearch.ShutdownHandler()); } public static void registerPipelineSteps() diff --git a/jbrowse/src/org/labkey/jbrowse/JBrowseServiceImpl.java 
b/jbrowse/src/org/labkey/jbrowse/JBrowseServiceImpl.java index daeb91419..6ad23749a 100644 --- a/jbrowse/src/org/labkey/jbrowse/JBrowseServiceImpl.java +++ b/jbrowse/src/org/labkey/jbrowse/JBrowseServiceImpl.java @@ -319,6 +319,12 @@ public void cacheDefaultQuery(User u, String sessionId, String trackId) luceneSearch.cacheDefaultQuery(); } + @Override + public void clearLuceneCacheEntry(File luceneIndexDir) + { + JBrowseLuceneSearch.clearCacheForFile(luceneIndexDir); + } + public static final class DefaultLuceneIndexDetector implements LuceneIndexDetector { @Override diff --git a/singlecell/api-src/org/labkey/api/singlecell/pipeline/AbstractSingleCellPipelineStep.java b/singlecell/api-src/org/labkey/api/singlecell/pipeline/AbstractSingleCellPipelineStep.java index 8e101a5c6..d97f01de3 100644 --- a/singlecell/api-src/org/labkey/api/singlecell/pipeline/AbstractSingleCellPipelineStep.java +++ b/singlecell/api-src/org/labkey/api/singlecell/pipeline/AbstractSingleCellPipelineStep.java @@ -43,12 +43,17 @@ public AbstractSingleCellPipelineStep(PipelineStepProvider provider, Pipeline super(provider, ctx); } + private String getSavedSeuratFileName(String outputPrefix) + { + return outputPrefix + ".savedSeuratObjects.txt"; + } + @Override public Output execute(SequenceOutputHandler.JobContext ctx, List inputObjects, String outputPrefix) throws PipelineJobException { SingleCellOutput output = new SingleCellOutput(); - File tracker = new File(ctx.getOutputDir(), "savedSeuratObjects.txt"); + File tracker = new File(ctx.getOutputDir(), getSavedSeuratFileName(outputPrefix)); if (tracker.exists()) { ctx.getLogger().debug("deleting pre-existing file: " + tracker.getName()); @@ -481,6 +486,7 @@ protected Chunk createParamChunk(SequenceOutputHandler.JobContext ctx, List + { + public Provider() + { + super("UpdateSeuratPrototype", "Update Seurat Prototype", "CellMembrane/Rdiscvr", "This will re-process an existing seurat prototype object and overwrite the original", Arrays.asList( + SeuratToolParameter.create("reapplyMetadata", "Reapply Metadata", "If checked, metadata will be re-applied", "checkbox", null, true), + SeuratToolParameter.create("runRira", "Run RIRA", "If checked, RIRA classification will be re-run", "checkbox", null, true), + SeuratToolParameter.create("runTNKClassification", "Run T/NK Classification", "If checked, T/NK expression-based classification will be re-run", "checkbox", null, true), + SeuratToolParameter.create("applyTCR", "Append TCR Data", "If checked, TCR data will be applied. This will fail if", "checkbox", null, true), + SeuratToolParameter.create("allowMissingTcr", "Allow Missing TCR Data", "Unless checked, an error will be thrown if any sample lacks TCR data", "checkbox", new JSONObject() + {{ + put("checked", false); + }}, false), + SeuratToolParameter.create("keepOriginal", "Keep Copy of Original File", "If checked, the original file will be copied with the file extension '.bk'", "checkbox", new JSONObject() + {{ + put("checked", false); + }}, false) + ), null, null); + } + + @Override + public UpdateSeuratPrototype create(PipelineContext ctx) + { + return new UpdateSeuratPrototype(ctx, this); + } + } + + @Override + public void init(SequenceOutputHandler.JobContext ctx, List inputFiles) throws PipelineJobException + { + if (inputFiles.size() > 1) + { + throw new PipelineJobException("Seurat prototype step expects this job to have a single input. 
Consider selecting the option to run jobs individually instead of merged"); + } + + if (inputFiles.get(0).getReadset() == null) + { + throw new PipelineJobException("Seurat prototype step expects all inputs to have a readset ID."); + } + + if (!SEURAT_PROTOTYPE.equals(inputFiles.get(0).getCategory())) + { + throw new PipelineJobException("Expected the input to be a seurat prototype, found: " + inputFiles.get(0).getCategory()); + } + + if (ctx.getSequenceSupport().getCachedGenomes().size() > 1) + { + throw new PipelineJobException("Expected seurat prototype step to use a single genome"); + } + + Readset rs = ctx.getSequenceSupport().getCachedReadset(inputFiles.get(0).getReadset()); + if (!ctx.getJob().getContainer().getId().equalsIgnoreCase(rs.getContainer())) + { + throw new PipelineJobException("Seurat prototype jobs must be submitted to the same folder as the source readset"); + } + } + + @Override + public Output execute(SequenceOutputHandler.JobContext ctx, List inputObjects, String outputPrefix) throws PipelineJobException + { + Output output = super.execute(ctx, inputObjects, outputPrefix); + + if (ctx.getSequenceSupport().getCachedGenomes().size() > 1) + { + throw new PipelineJobException("Expected seurat prototype step to use a single genome"); + } + + if (output.getSeuratObjects().size() != 1) + { + throw new PipelineJobException("Expected a single output object, found: " + output.getSeuratObjects().size()); + } + + SeuratObjectWrapper inputRDS = inputObjects.get(0); + SeuratObjectWrapper wrapper = output.getSeuratObjects().get(0); + if (wrapper.getReadsetId() == null) + { + throw new PipelineJobException("Missing readset Id: " + wrapper.getDatasetId()); + } + + File toReplace = inputRDS.getSequenceOutputFile().getFile(); + if (!toReplace.exists()) + { + throw new PipelineJobException("Missing file: " + toReplace); + } + try + { + ctx.getLogger().info("Replacing existing prototype: " + toReplace.getPath()); + + if (ctx.getParams().optBoolean("keepOriginal", false)) + { + File backup = new File(toReplace.getPath() + ".orig"); + if (backup.exists()) + { + backup.delete(); + } + + FileUtils.moveFile(toReplace, backup); + } + + if (toReplace.exists()) + { + toReplace.delete(); + } + + FileUtils.moveFile(wrapper.getFile(), toReplace); + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + + return output; + } +} diff --git a/singlecell/src/org/labkey/singlecell/run/NimbleAlignmentStep.java b/singlecell/src/org/labkey/singlecell/run/NimbleAlignmentStep.java index 6f12b12bb..ba5044721 100644 --- a/singlecell/src/org/labkey/singlecell/run/NimbleAlignmentStep.java +++ b/singlecell/src/org/labkey/singlecell/run/NimbleAlignmentStep.java @@ -57,9 +57,6 @@ public static List getToolParameters() put("initialValues", "unstranded"); put("delimiter", ";"); }}, null), - ToolParameterDescriptor.create(ALIGN_OUTPUT, "Create Alignment/Debug Output", "If checked, an alignment-level summary TSV will be created", "checkbox", new JSONObject(){{ - put("checked", true); - }}, true), ToolParameterDescriptor.create(MAX_HITS_TO_REPORT, "Max Hits To Report", "If a given hit has more than this number of references, it is discarded", "ldk-integerfield", new JSONObject(){{ put("minValue", 0); }}, 4) diff --git a/singlecell/src/org/labkey/singlecell/run/NimbleHelper.java b/singlecell/src/org/labkey/singlecell/run/NimbleHelper.java index f7bbfe34a..9850e99ed 100644 --- a/singlecell/src/org/labkey/singlecell/run/NimbleHelper.java +++ b/singlecell/src/org/labkey/singlecell/run/NimbleHelper.java @@ 
-43,7 +43,6 @@ import java.util.Map; import java.util.stream.Collectors; -import static org.labkey.singlecell.run.NimbleAlignmentStep.ALIGN_OUTPUT; import static org.labkey.singlecell.run.NimbleAlignmentStep.MAX_HITS_TO_REPORT; import static org.labkey.singlecell.run.NimbleAlignmentStep.REF_GENOMES; import static org.labkey.singlecell.run.NimbleAlignmentStep.STRANDEDNESS; @@ -425,14 +424,6 @@ private Map doAlignment(List genomes, List doAlignment(List genomes, List