diff --git a/pom.xml b/pom.xml index 9eb85d1..a841182 100644 --- a/pom.xml +++ b/pom.xml @@ -207,6 +207,11 @@ + + com.google.cloud.genomics + gatk-tools-java + 1.0 + diff --git a/src/main/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverter.java b/src/main/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverter.java index a1ec992..1d8bf49 100644 --- a/src/main/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverter.java +++ b/src/main/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverter.java @@ -15,6 +15,7 @@ */ package com.google.cloud.genomics.dataflow.readers.bam; +import com.google.api.client.util.Maps; import com.google.api.services.genomics.model.CigarUnit; import com.google.api.services.genomics.model.LinearAlignment; import com.google.api.services.genomics.model.Position; @@ -23,15 +24,12 @@ import com.google.common.collect.BiMap; import com.google.common.collect.HashBiMap; import com.google.common.collect.Lists; - -import htsjdk.samtools.CigarElement; -import htsjdk.samtools.CigarOperator; -import htsjdk.samtools.SAMException; -import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.*; import htsjdk.samtools.util.SequenceUtil; import java.util.ArrayList; import java.util.List; +import java.util.Map; /** * Converts SAMRecords to Reads. @@ -62,7 +60,7 @@ public static final Read makeRead(final SAMRecord record) { read.setId(record.getReadName()); // TODO: make more unique read.setFragmentName(record.getReadName()); read.setReadGroupId(getAttr(record, "RG")); - read.setNumberReads(record.getReadPairedFlag() ? 1 : 2); + read.setNumberReads(record.getReadPairedFlag() ? 
2 : 1); read.setProperPlacement(record.getReadPairedFlag() && record.getProperPairFlag()); if (!record.getReadUnmappedFlag() && record.getAlignmentStart() > 0) { LinearAlignment alignment = new LinearAlignment(); @@ -97,7 +95,7 @@ public CigarUnit apply(CigarElement c) { read.setAlignment(alignment); } read.setDuplicateFragment(record.getDuplicateReadFlag()); - read.setFragmentLength(record.getReadLength()); + read.setFragmentLength(record.getInferredInsertSize()); if (record.getReadPairedFlag()) { if (record.getFirstOfPairFlag()) { read.setReadNumber(0); @@ -126,6 +124,12 @@ public CigarUnit apply(CigarElement c) { read.setAlignedQuality(readBaseQualities); } + Map> attributes = Maps.newHashMap(); + for( SAMRecord.SAMTagAndValue tagAndValue: record.getAttributes()) { + attributes.put(tagAndValue.tag, Lists.newArrayList(tagAndValue.value.toString())); + } + read.setInfo(attributes); + return read; } diff --git a/src/test/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverterTest.java b/src/test/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverterTest.java index 7235055..e189610 100644 --- a/src/test/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverterTest.java +++ b/src/test/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverterTest.java @@ -1,22 +1,19 @@ package com.google.cloud.genomics.dataflow.readers.bam; import com.google.api.services.genomics.model.Read; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertThat; -import static org.junit.Assert.assertTrue; - -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; - -import org.junit.Before; +import com.google.cloud.genomics.gatk.common.GenomicsConverter; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; import org.junit.Test; import org.junit.runner.RunWith; 
import org.junit.runners.JUnit4;
-import htsjdk.samtools.SAMRecord;
+import java.io.File;
+import java.io.IOException;
+
+import static org.junit.Assert.assertEquals;
 @RunWith(JUnit4.class)
 public class ReadConverterTest {
@@ -41,5 +38,31 @@ public void testConversion() {
     assertEquals("chr20", read.getNextMatePosition().getReferenceName());
     assertEquals((Boolean)true, read.getNextMatePosition().getReverseStrand());
   }
-
+
+  /**
+   * Round-trips every record of the test SAM file through
+   * {@link ReadConverter#makeRead} and {@code GenomicsConverter.makeSAMRecord}
+   * and checks that the regenerated SAM text equals the original record's text.
+   */
+  @Test
+  public void SamToReadToSamTest() throws IOException {
+    String filePath = "src/test/resources/com/google/cloud/genomics/dataflow/readers/bam/conversion_test.sam";
+    File samInput = new File(filePath);
+
+    int numReads = 0;
+    // SamReader is Closeable; close it even when an assertion fails mid-iteration.
+    try (SamReader reads = SamReaderFactory.makeDefault().open(samInput)) {
+      SAMFileHeader header = reads.getFileHeader();
+      for (SAMRecord sam : reads) {
+        Read read = ReadConverter.makeRead(sam);
+        SAMRecord newSam = GenomicsConverter.makeSAMRecord(read, header);
+        // JUnit convention: expected value first, actual value second.
+        assertEquals(sam.getSAMString(), newSam.getSAMString());
+        numReads++;
+      }
+    }
+    // Sanity check to make sure we actually read the file.
+    assertEquals(19, numReads);
+  }
+
 }
diff --git a/src/test/resources/com/google/cloud/genomics/dataflow/readers/bam/conversion_test.sam b/src/test/resources/com/google/cloud/genomics/dataflow/readers/bam/conversion_test.sam
new file mode 100644
index 0000000..aacbba1
--- /dev/null
+++ b/src/test/resources/com/google/cloud/genomics/dataflow/readers/bam/conversion_test.sam
@@ -0,0 +1,29 @@
+@HD VN:1.0 SO:coordinate
+@SQ SN:chr1 LN:101
+@SQ SN:chr2 LN:101
+@SQ SN:chr3 LN:101
+@SQ SN:chr4 LN:101
+@SQ SN:chr5 LN:101
+@SQ SN:chr6 LN:101
+@SQ SN:chr7 LN:202
+@SQ SN:chr8 LN:202
+@RG ID:0 SM:Hi,Momma! 
LB:whatever PU:me PL:ILLUMINA +SL-XAV:1:1:0:764#0/1 89 chr1 1 255 101M * 0 0 TTCATGCTGANGCNCTCTTACGATCGTACAGATGCAAATATTAACANNCNTTNAAGNNCANNNNNNNNNCAATACAATANTAGAGTACGTNAACACTCCAN &/,&-.1/6/&&)&).)/,&0768)&/.,/874,&.4137572)&/&&,&1-&.0/&&*,&&&&&&&&&&18775799,&16:8775-56256/69::;0& RG:Z:0 NN:Z:Hello +SL-XAV:1:1:0:1668#0/2 153 chr2 1 255 101M * 0 0 CATCTCTACANGCGCGTCCTACCAGACGCGCTTCCGATCTGAGAGCATACTTTTCATTGGATTCCAGCACAACTCCATTTTTGATCCACTNGACACCTTTN (/,'-/'0////(1'&&1&&&&'2''-6/,/3-33653.6:1'.86/-++32.-4864653/5/583/346423203+28888644446688456/4880& RG:Z:0 NN:Z:Goodbye +SL-XAV:1:1:0:1914#0/2 153 chr3 1 255 101M * 0 0 CGTATGCGCTNTTTATGTCGCCCACAGTGCCTAGTATAGCCCCTGCTAATAAAAAGAGATGAATACGTTTACTTAAAAAACTGAAACTAGNAATGTGCAAN (0,7&&*/*0*,)10/).-*&.&*/6669.&-337599;3,&,6/.,5::999987893+387020775777547999::668997448:::9;999::0& RG:Z:0 +SL-XAV:1:1:0:1639#0/2 153 chr4 1 255 101M * 0 0 CGTGATACCANCTCATGTTCACAGCCAAAGCCTGAAGCTGTCTATTATATTTCTCAACCATAAACTTTTGCCTCAGGCATCCGCAGAATGNTTTGCAGCCN '.&.&&'.0+01'2(1'(''-)','+0041/.+032;:867115/5267-.0/)-5.&-26200224,,0+0/0275/5605688::646875568882*& RG:Z:0 +SL-XAV:1:1:0:68#0/2 137 chr5 1 255 101M * 0 0 NTCTCATTTANAAATGGTTATAAAAACATTTATGCTGAAAAGGTGAAGTTCATTAATGAACAGGCTGACTGTCTCACTATCGCGTTCGCANGACGTTATCT &1<<999;;;;<<<87579:556972789977444.'.023.&,7621/54.49.)/53055-22--''+(.'-))6-168/(3&&0(<).))*&&&&&'0 RG:Z:0 +SL-XAV:1:1:0:700#0/2 137 chr6 1 255 101M * 0 0 NAATTGTTCTNAGTTTCTCGGTTTATGTGCTCTTCCAGGTGGGTAACACAATAATGGCCTTCCAGATCGTAAGAGCGACGTGTGTTGCACNAGTGTCGATC &0::887::::6/646::838388811/679:87640&./2+/-4/28:3,536/4''&&.78/(/554/./02*)*',-(57()&.6(6:(0601'/(,* RG:Z:0 +SL-XAV:1:1:0:1721#0/1 83 chr7 1 255 101M = 102 40 CAACAGAAGGNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCGAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0 +SL-XAV:1:1:0:105#0/2 403 chr7 1 255 101M = 102 79 
CACATCGTGANTCTTACAATCTGCGGTTTCAGATGTGGAGCGATGTGTGAGAGATTGAGCAACTGATCTGAAAAGCAGACACAGCTATTCNTAAGATGACN /))3--/&*()&)&&+'++.'-&,(.))'4,)&'&&,')8,&&*'.&*0'225/&)3-8//)*,5-*).7851453583.3568526:863688:::85.& RG:Z:0 +SL-XAV:1:1:0:1721#0/2 163 chr7 102 255 101M = 1 -40 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0 +SL-XAV:1:1:0:105#0/2 147 chr8 1 255 101M = 102 79 CACATCGTGANTCTTACAATCTGCGGTTTCAGATGTGGAGCGATGTGTGAGAGATTGAGCAACTGATCTGAAAAGCAGACACAGCTATTCNTAAGATGACN /))3--/&*()&)&&+'++.'-&,(.))'4,)&'&&,')8,&&*'.&*0'225/&)3-8//)*,5-*).7851453583.3568526:863688:::85.& RG:Z:0 +SL-XAV:1:1:0:105#0/1 99 chr8 102 255 101M = 1 -79 NCAGGTTCAANTGTGCAGCCCNTTTTGAGAGATNNNNNNNNTGNNCTGNAANANNGACACAGCTATTCCTAAGATGACAAGATCAGANAANAAGTCAAGCA &06665578::41.*/7577/&/77403-324.&&&&&&&&/.&&..&&.0&&&&',:9:/-/(55002020+3'12+2/&.2-&//&),&*&&&&&&&51 RG:Z:0 +SL-XAV:1:1:0:1300#0/1 77 * 0 0 * * 0 0 NAAACACAAGNNANAGTCTTANCNGCTATTCCNNNNNNNNNCTNNNCTNAGNANNACATACAACAGTATCCACACAAGTGTACTCGTNCANACATGTGAAC &*5535)*-,,&.&.*-1)*,&'&)&1&&.,)&&&&&&&&&)0&&&0'&&&&.&&*2'/4''0/**&)&,'-&*,&,&&&.0.&)&&&**&,.&&&')&&) RG:Z:0 +SL-XAV:1:1:0:1300#0/2 141 * 0 0 * * 0 0 NGATCATGGANGACTCTCCCCATCCCCCGCTCCAGCGCTCAGTTATATGCCTAGCCTCGGACACGTCACCAACATCTCACGCACTCTGCANAGTCTCTCAC &&'+''3*&-/)/1'26/*-2-/542-*&-&/'/*/&-'&)-')&.'-/&&2+122*'&+,(/-&)((,/-,,.'2(2'+)/&/&-66-&&/16&)&*&'3 RG:Z:0 +SL-XAV:1:1:0:1639#0/1 101 * 0 0 * chr1 1 0 NCCCTCTCAGNNTNTCTGCCANANCCTTAAGCNNNNNNNNNTANNNCTNAANCNNAAACTTTTGCCTCAGGCATCCGCAGAATGTTTNTCNGCCTATATCG &1::::::64/&/&0:3.280&/&087881,/&&&&&&&&&..&&&..&,,&-&&,265341-)/5680&-.5552-25/322/42/&)&&).421&-&-/ RG:Z:0 +SL-XAV:1:1:0:1668#0/1 101 * 0 0 * chr2 1 0 NATAGCATACNNTNCATTGGANTNCAGCACAANNNNNNNNNTGNNNCANTNNANNCCTTTGAGATCGGAAGAGCGGTTCAGCAGGAANNCNCAGACCGATC 
&1988998890&0&.8863//&.&.0-2875.&&&&&&&&&.)&&&..&.&&.&&.5782-2+262)&-0-0510*.332-2.-,0*&&*&'.&-2-)0., RG:Z:0 +SL-XAV:1:1:0:1914#0/1 101 * 0 0 * chr3 1 0 NTTTTTCTCCNNCNGTGCCTANTNTAGCCCCTNNNNNNNNNAANNNATNANNANNTTTACTTAAAAAACTGAAACTAGTAATGTGCANNANATCGNAAGAG &0::::<<;90&/&.244760&,&.414798/&&&&&&&&&00&&&0.&/&&-&&.4475687363504.&.557/.*)65.&/*./&&.&.+*)&..).& RG:Z:0 +SL-XAV:1:1:0:68#0/1 581 * 0 0 * chr4 1 0 NAATATTCATNNGNTCAGCCTNTNCATTAATTNNNNNNNNNTTNNNATNATNANNTTTTTTATAACCATTTATAAATGAGAGAGATCNTANCACAATATCA &0<<:::::