LabKey · bbimber · Apr 6, 2023 · Mar 10, 2023 · Mar 16, 2023 · Mar 17, 2023
diff --git a/mGAP/resources/schemas/mgap.xml b/mGAP/resources/schemas/mgap.xml
@@ -689,7 +689,7 @@
     <table tableName="releaseTrackSubsets" tableDbType="TABLE" useColumnOrder="true">
         <javaCustomizer class="org.labkey.ldk.query.DefaultTableCustomizer" />
         <pkColumnName>rowid</pkColumnName>
-        <tableTitle>mGAP Release Track Sample Sets</tableTitle>
+        <tableTitle>mGAP Samples To Include Per Track</tableTitle>
         <auditLogging>DETAILED</auditLogging>
         <columns>
             <column columnName="rowid">

diff --git a/mGAP/src/org/labkey/mgap/pipeline/AnnotationStep.java b/mGAP/src/org/labkey/mgap/pipeline/AnnotationStep.java
@@ -51,9 +51,11 @@ public class AnnotationStep extends AbstractCommandPipelineStep<CassandraRunner>
 {
     public static final String GRCH37 = "genome37";
     private static final String CLINVAR_VCF = "clinvar37";
+    private static final String DBNSFP_FILE = "dbnsfpFile";
+
     public static final String CHAIN_FILE = "CHAIN_FILE";
 
-    public AnnotationStep(PipelineStepProvider provider, PipelineContext ctx)
+    public AnnotationStep(PipelineStepProvider<?> provider, PipelineContext ctx)
     {
         super(provider, ctx, new CassandraRunner(ctx.getLogger()));
     }
@@ -67,6 +69,10 @@ public Provider()
                     {{
                         put("allowBlank", false);
                     }}, null),
+                    ToolParameterDescriptor.createExpDataParam(DBNSFP_FILE, "dbNSFP Database (GRCh37)", "This is the DataId of the dbNSFP database (txt.gz file) using the GRCh37 genome.", "ldk-expdatafield", new JSONObject()
+                    {{
+                        put("allowBlank", false);
+                    }}, null),
                     ToolParameterDescriptor.create(GRCH37, "GRCh37 Genome", "The genome that matches human GRCh37.", "ldk-simplelabkeycombo", new JSONObject()
                     {{
                         put("width", 400);
@@ -126,10 +132,21 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
         VariantProcessingStepOutputImpl output = new VariantProcessingStepOutputImpl();
 
         File clinvarVCF = getPipelineCtx().getSequenceSupport().getCachedData(getProvider().getParameterByName(CLINVAR_VCF).extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Integer.class));
+        if (!clinvarVCF.exists())
+        {
+            throw new PipelineJobException("Unable to find file: " + clinvarVCF.getPath());
+        }
+
         ReferenceGenome grch37Genome = getPipelineCtx().getSequenceSupport().getCachedGenome(getProvider().getParameterByName(GRCH37).extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Integer.class));
         Integer chainFileId = getPipelineCtx().getSequenceSupport().getCachedObject(CHAIN_FILE, Integer.class);
         File chainFile = getPipelineCtx().getSequenceSupport().getCachedData(chainFileId);
 
+        File dbnsfpFile = getPipelineCtx().getSequenceSupport().getCachedData(getProvider().getParameterByName(DBNSFP_FILE).extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Integer.class));
+        if (!dbnsfpFile.exists())
+        {
+            throw new PipelineJobException("Unable to find file: " + dbnsfpFile.getPath());
+        }
+
         getPipelineCtx().getLogger().info("processing file: " + inputVCF.getName());
 
         ReferenceGenome originalGenome = getPipelineCtx().getSequenceSupport().getCachedGenome(genome.getGenomeId());
@@ -293,6 +310,22 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
         output.addIntermediateFile(clinvarAnnotated);
         output.addIntermediateFile(new File(clinvarAnnotated.getPath() + ".tbi"));
 
+        //annotate with SnpSift
+        getPipelineCtx().getLogger().info("annotating with SnpSift");
+        File snpSiftAnnotated = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(liftedToGRCh37.getName()) + ".snpSift.vcf.gz");
+        if (forceRecreate || !indexExists(snpSiftAnnotated))
+        {
+            SnpSiftWrapper ssRunner = new SnpSiftWrapper(getPipelineCtx().getLogger());
+            ssRunner.runSnpSift(dbnsfpFile, clinvarAnnotated, snpSiftAnnotated);
+        }
+        else
+        {
+            getPipelineCtx().getLogger().info("resuming with existing file: " + snpSiftAnnotated.getPath());
+        }
+        output.addOutput(snpSiftAnnotated, "VCF Annotated With SnpSift");
+        output.addIntermediateFile(snpSiftAnnotated);
+        output.addIntermediateFile(new File(snpSiftAnnotated.getPath() + ".tbi"));
+
         //annotate with cassandra
         getPipelineCtx().getLogger().info("annotating with Cassandra");
         String basename = SequenceAnalysisService.get().getUnzippedBaseName(liftedToGRCh37.getName()) + ".cassandra";