diff --git a/pom.xml b/pom.xml
index 9eb85d1..a841182 100644
--- a/pom.xml
+++ b/pom.xml
@@ -207,6 +207,11 @@
+    <dependency>
+      <groupId>com.google.cloud.genomics</groupId>
+      <artifactId>gatk-tools-java</artifactId>
+      <version>1.0</version>
+    </dependency>
diff --git a/src/main/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverter.java b/src/main/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverter.java
index a1ec992..1d8bf49 100644
--- a/src/main/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverter.java
+++ b/src/main/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverter.java
@@ -15,6 +15,7 @@
*/
package com.google.cloud.genomics.dataflow.readers.bam;
+import com.google.api.client.util.Maps;
import com.google.api.services.genomics.model.CigarUnit;
import com.google.api.services.genomics.model.LinearAlignment;
import com.google.api.services.genomics.model.Position;
@@ -23,15 +24,12 @@
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.Lists;
-
-import htsjdk.samtools.CigarElement;
-import htsjdk.samtools.CigarOperator;
-import htsjdk.samtools.SAMException;
-import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.*;
import htsjdk.samtools.util.SequenceUtil;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map;
/**
* Converts SAMRecords to Reads.
@@ -62,7 +60,7 @@ public static final Read makeRead(final SAMRecord record) {
read.setId(record.getReadName()); // TODO: make more unique
read.setFragmentName(record.getReadName());
read.setReadGroupId(getAttr(record, "RG"));
- read.setNumberReads(record.getReadPairedFlag() ? 1 : 2);
+ read.setNumberReads(record.getReadPairedFlag() ? 2 : 1);
read.setProperPlacement(record.getReadPairedFlag() && record.getProperPairFlag());
if (!record.getReadUnmappedFlag() && record.getAlignmentStart() > 0) {
LinearAlignment alignment = new LinearAlignment();
@@ -97,7 +95,7 @@ public CigarUnit apply(CigarElement c) {
read.setAlignment(alignment);
}
read.setDuplicateFragment(record.getDuplicateReadFlag());
- read.setFragmentLength(record.getReadLength());
+ read.setFragmentLength(record.getInferredInsertSize());
if (record.getReadPairedFlag()) {
if (record.getFirstOfPairFlag()) {
read.setReadNumber(0);
@@ -126,6 +124,12 @@ public CigarUnit apply(CigarElement c) {
read.setAlignedQuality(readBaseQualities);
}
+ Map> attributes = Maps.newHashMap();
+ for( SAMRecord.SAMTagAndValue tagAndValue: record.getAttributes()) {
+ attributes.put(tagAndValue.tag, Lists.newArrayList(tagAndValue.value.toString()));
+ }
+ read.setInfo(attributes);
+
return read;
}
diff --git a/src/test/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverterTest.java b/src/test/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverterTest.java
index 7235055..e189610 100644
--- a/src/test/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverterTest.java
+++ b/src/test/java/com/google/cloud/genomics/dataflow/readers/bam/ReadConverterTest.java
@@ -1,22 +1,19 @@
package com.google.cloud.genomics.dataflow.readers.bam;
import com.google.api.services.genomics.model.Read;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertThat;
-import static org.junit.Assert.assertTrue;
-
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.List;
-
-import org.junit.Before;
+import com.google.cloud.genomics.gatk.common.GenomicsConverter;
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SamReader;
+import htsjdk.samtools.SamReaderFactory;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
-import htsjdk.samtools.SAMRecord;
+import java.io.File;
+import java.io.IOException;
+
+import static org.junit.Assert.assertEquals;
@RunWith(JUnit4.class)
public class ReadConverterTest {
@@ -41,5 +38,22 @@ public void testConversion() {
assertEquals("chr20", read.getNextMatePosition().getReferenceName());
assertEquals((Boolean)true, read.getNextMatePosition().getReverseStrand());
}
-
+
+  /** Round-trips each record of conversion_test.sam (SAMRecord to Read to SAMRecord) and expects identical SAM text. */
+  @Test
+  public void SamToReadToSamTest() throws IOException {
+    File samInput = new File("src/test/resources/com/google/cloud/genomics/dataflow/readers/bam/conversion_test.sam");
+    int numReads = 0;
+    try (SamReader reads = SamReaderFactory.makeDefault().open(samInput)) { // reader is closed even if an assertion fails
+      SAMFileHeader header = reads.getFileHeader();
+      for (SAMRecord sam : reads) {
+        Read read = ReadConverter.makeRead(sam);
+        SAMRecord newSam = GenomicsConverter.makeSAMRecord(read, header);
+        assertEquals(sam.getSAMString(), newSam.getSAMString()); // expected (the original record) goes first
+        numReads++;
+      }
+    }
+    assertEquals(19, numReads); // sanity check to make sure we actually read the file
+  }
+
}
diff --git a/src/test/resources/com/google/cloud/genomics/dataflow/readers/bam/conversion_test.sam b/src/test/resources/com/google/cloud/genomics/dataflow/readers/bam/conversion_test.sam
new file mode 100644
index 0000000..aacbba1
--- /dev/null
+++ b/src/test/resources/com/google/cloud/genomics/dataflow/readers/bam/conversion_test.sam
@@ -0,0 +1,29 @@
+@HD VN:1.0 SO:coordinate
+@SQ SN:chr1 LN:101
+@SQ SN:chr2 LN:101
+@SQ SN:chr3 LN:101
+@SQ SN:chr4 LN:101
+@SQ SN:chr5 LN:101
+@SQ SN:chr6 LN:101
+@SQ SN:chr7 LN:202
+@SQ SN:chr8 LN:202
+@RG ID:0 SM:Hi,Momma! LB:whatever PU:me PL:ILLUMINA
+SL-XAV:1:1:0:764#0/1 89 chr1 1 255 101M * 0 0 TTCATGCTGANGCNCTCTTACGATCGTACAGATGCAAATATTAACANNCNTTNAAGNNCANNNNNNNNNCAATACAATANTAGAGTACGTNAACACTCCAN &/,&-.1/6/&&)&).)/,&0768)&/.,/874,&.4137572)&/&&,&1-&.0/&&*,&&&&&&&&&&18775799,&16:8775-56256/69::;0& RG:Z:0 NN:Z:Hello
+SL-XAV:1:1:0:1668#0/2 153 chr2 1 255 101M * 0 0 CATCTCTACANGCGCGTCCTACCAGACGCGCTTCCGATCTGAGAGCATACTTTTCATTGGATTCCAGCACAACTCCATTTTTGATCCACTNGACACCTTTN (/,'-/'0////(1'&&1&&&&'2''-6/,/3-33653.6:1'.86/-++32.-4864653/5/583/346423203+28888644446688456/4880& RG:Z:0 NN:Z:Goodbye
+SL-XAV:1:1:0:1914#0/2 153 chr3 1 255 101M * 0 0 CGTATGCGCTNTTTATGTCGCCCACAGTGCCTAGTATAGCCCCTGCTAATAAAAAGAGATGAATACGTTTACTTAAAAAACTGAAACTAGNAATGTGCAAN (0,7&&*/*0*,)10/).-*&.&*/6669.&-337599;3,&,6/.,5::999987893+387020775777547999::668997448:::9;999::0& RG:Z:0
+SL-XAV:1:1:0:1639#0/2 153 chr4 1 255 101M * 0 0 CGTGATACCANCTCATGTTCACAGCCAAAGCCTGAAGCTGTCTATTATATTTCTCAACCATAAACTTTTGCCTCAGGCATCCGCAGAATGNTTTGCAGCCN '.&.&&'.0+01'2(1'(''-)','+0041/.+032;:867115/5267-.0/)-5.&-26200224,,0+0/0275/5605688::646875568882*& RG:Z:0
+SL-XAV:1:1:0:68#0/2 137 chr5 1 255 101M * 0 0 NTCTCATTTANAAATGGTTATAAAAACATTTATGCTGAAAAGGTGAAGTTCATTAATGAACAGGCTGACTGTCTCACTATCGCGTTCGCANGACGTTATCT &1<<999;;;;<<<87579:556972789977444.'.023.&,7621/54.49.)/53055-22--''+(.'-))6-168/(3&&0(<).))*&&&&&'0 RG:Z:0
+SL-XAV:1:1:0:700#0/2 137 chr6 1 255 101M * 0 0 NAATTGTTCTNAGTTTCTCGGTTTATGTGCTCTTCCAGGTGGGTAACACAATAATGGCCTTCCAGATCGTAAGAGCGACGTGTGTTGCACNAGTGTCGATC &0::887::::6/646::838388811/679:87640&./2+/-4/28:3,536/4''&&.78/(/554/./02*)*',-(57()&.6(6:(0601'/(,* RG:Z:0
+SL-XAV:1:1:0:1721#0/1 83 chr7 1 255 101M = 102 40 CAACAGAAGGNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCGAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0
+SL-XAV:1:1:0:105#0/2 403 chr7 1 255 101M = 102 79 CACATCGTGANTCTTACAATCTGCGGTTTCAGATGTGGAGCGATGTGTGAGAGATTGAGCAACTGATCTGAAAAGCAGACACAGCTATTCNTAAGATGACN /))3--/&*()&)&&+'++.'-&,(.))'4,)&'&&,')8,&&*'.&*0'225/&)3-8//)*,5-*).7851453583.3568526:863688:::85.& RG:Z:0
+SL-XAV:1:1:0:1721#0/2 163 chr7 102 255 101M = 1 -40 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0
+SL-XAV:1:1:0:105#0/2 147 chr8 1 255 101M = 102 79 CACATCGTGANTCTTACAATCTGCGGTTTCAGATGTGGAGCGATGTGTGAGAGATTGAGCAACTGATCTGAAAAGCAGACACAGCTATTCNTAAGATGACN /))3--/&*()&)&&+'++.'-&,(.))'4,)&'&&,')8,&&*'.&*0'225/&)3-8//)*,5-*).7851453583.3568526:863688:::85.& RG:Z:0
+SL-XAV:1:1:0:105#0/1 99 chr8 102 255 101M = 1 -79 NCAGGTTCAANTGTGCAGCCCNTTTTGAGAGATNNNNNNNNTGNNCTGNAANANNGACACAGCTATTCCTAAGATGACAAGATCAGANAANAAGTCAAGCA &06665578::41.*/7577/&/77403-324.&&&&&&&&/.&&..&&.0&&&&',:9:/-/(55002020+3'12+2/&.2-&//&),&*&&&&&&&51 RG:Z:0
+SL-XAV:1:1:0:1300#0/1 77 * 0 0 * * 0 0 NAAACACAAGNNANAGTCTTANCNGCTATTCCNNNNNNNNNCTNNNCTNAGNANNACATACAACAGTATCCACACAAGTGTACTCGTNCANACATGTGAAC &*5535)*-,,&.&.*-1)*,&'&)&1&&.,)&&&&&&&&&)0&&&0'&&&&.&&*2'/4''0/**&)&,'-&*,&,&&&.0.&)&&&**&,.&&&')&&) RG:Z:0
+SL-XAV:1:1:0:1300#0/2 141 * 0 0 * * 0 0 NGATCATGGANGACTCTCCCCATCCCCCGCTCCAGCGCTCAGTTATATGCCTAGCCTCGGACACGTCACCAACATCTCACGCACTCTGCANAGTCTCTCAC &&'+''3*&-/)/1'26/*-2-/542-*&-&/'/*/&-'&)-')&.'-/&&2+122*'&+,(/-&)((,/-,,.'2(2'+)/&/&-66-&&/16&)&*&'3 RG:Z:0
+SL-XAV:1:1:0:1639#0/1 101 * 0 0 * chr1 1 0 NCCCTCTCAGNNTNTCTGCCANANCCTTAAGCNNNNNNNNNTANNNCTNAANCNNAAACTTTTGCCTCAGGCATCCGCAGAATGTTTNTCNGCCTATATCG &1::::::64/&/&0:3.280&/&087881,/&&&&&&&&&..&&&..&,,&-&&,265341-)/5680&-.5552-25/322/42/&)&&).421&-&-/ RG:Z:0
+SL-XAV:1:1:0:1668#0/1 101 * 0 0 * chr2 1 0 NATAGCATACNNTNCATTGGANTNCAGCACAANNNNNNNNNTGNNNCANTNNANNCCTTTGAGATCGGAAGAGCGGTTCAGCAGGAANNCNCAGACCGATC &1988998890&0&.8863//&.&.0-2875.&&&&&&&&&.)&&&..&.&&.&&.5782-2+262)&-0-0510*.332-2.-,0*&&*&'.&-2-)0., RG:Z:0
+SL-XAV:1:1:0:1914#0/1 101 * 0 0 * chr3 1 0 NTTTTTCTCCNNCNGTGCCTANTNTAGCCCCTNNNNNNNNNAANNNATNANNANNTTTACTTAAAAAACTGAAACTAGTAATGTGCANNANATCGNAAGAG &0::::<<;90&/&.244760&,&.414798/&&&&&&&&&00&&&0.&/&&-&&.4475687363504.&.557/.*)65.&/*./&&.&.+*)&..).& RG:Z:0
+SL-XAV:1:1:0:68#0/1 581 * 0 0 * chr4 1 0 NAATATTCATNNGNTCAGCCTNTNCATTAATTNNNNNNNNNTTNNNATNATNANNTTTTTTATAACCATTTATAAATGAGAGAGATCNTANCACAATATCA &0<<:::::&&&.73'290&.&0;:::90&&&&&&&&&&..&&&0)&0-&0&&&.743799995253348597921.,.'050.*&.0&)*)&&&&*). RG:Z:0
+SL-XAV:1:1:0:700#0/1 581 * 0 0 * chr5 1 0 NGAAGCCCATNNTNGTGTTACNCNCCTGGAAGNNNNNNNNNACNNNGANACNTNNAACAATTCAGATCGGAAGAGCGGTTCAGCAGANNTNCCGAGACCGA &.88888:88/&0&,03189.&/&.8/))12/&&&&&&&&&./&&&&.&1.&)&&/35962/6432-3&),0&/2+0,),61&-6,&&&'&/,.0&...)0 RG:Z:0
+SL-XAV:1:1:0:764#0/2 165 * 0 0 * chr6 1 0 NACAGATGCANATATTAACAGGCTTTAAAGGACAGATGGACTGCAATACAATAATAGAGTACGTCAACACTCCACAGATCGCTAGAGCATNACATCGGTGT &/:5358::9999::99998255::7275,,/5567-'+387537857:54-4.51'31059547320;73/720+22.4(6.;((.;(;8()(''&&2&& RG:Z:0