diff --git a/pom.xml b/pom.xml index 19849cf..0691573 100644 --- a/pom.xml +++ b/pom.xml @@ -127,7 +127,7 @@ com.google.cloud.genomics google-genomics-utils - v1beta2-0.37 + v1beta2-0.38 diff --git a/src/main/java/com/google/cloud/genomics/dataflow/functions/AlleleSimilarityCalculator.java b/src/main/java/com/google/cloud/genomics/dataflow/functions/AlleleSimilarityCalculator.java index 79edaf6..1283c5f 100644 --- a/src/main/java/com/google/cloud/genomics/dataflow/functions/AlleleSimilarityCalculator.java +++ b/src/main/java/com/google/cloud/genomics/dataflow/functions/AlleleSimilarityCalculator.java @@ -16,15 +16,15 @@ import java.util.HashMap; import java.util.Map; -import com.google.api.services.genomics.model.Call; -import com.google.api.services.genomics.model.Variant; import com.google.cloud.dataflow.sdk.transforms.DoFn; import com.google.cloud.dataflow.sdk.values.KV; import com.google.cloud.genomics.dataflow.utils.CallFilters; import com.google.cloud.genomics.dataflow.utils.PairGenerator; -import com.google.cloud.genomics.utils.VariantUtils; +import com.google.cloud.genomics.utils.grpc.VariantUtils; import com.google.common.collect.ImmutableList; import com.google.common.collect.Maps; +import com.google.genomics.v1.Variant; +import com.google.genomics.v1.VariantCall; /** * For each pair of calls on any of the given variants, computes a score, a number between 0 and 1 @@ -55,14 +55,14 @@ public void processElement(ProcessContext context) { Variant variant = context.element(); CallSimilarityCalculator callSimilarityCalculator = callSimilarityCalculatorFactory.get(isReferenceMajor(variant)); - for (KV pair : PairGenerator.WITHOUT_REPLACEMENT.allPairs( + for (KV pair : PairGenerator.WITHOUT_REPLACEMENT.allPairs( getSamplesWithVariant(variant), VariantUtils.CALL_COMPARATOR)) { accumulateCallSimilarity(callSimilarityCalculator, pair.getKey(), pair.getValue()); } } private void accumulateCallSimilarity(CallSimilarityCalculator callSimilarityCalculator, - Call call1, Call call2) { + VariantCall call1, VariantCall call2) { KV callPair = KV.of(call1.getCallSetName(), call2.getCallSetName()); KV callPairAccumulation = accumulator.get(callPair); if (callPairAccumulation == null) { @@ -79,15 +79,15 @@ public void finishBundle(Context context) { output(context, accumulator); } - static ImmutableList getSamplesWithVariant(Variant variant) { + static ImmutableList getSamplesWithVariant(Variant variant) { return CallFilters.getSamplesWithVariantOfMinGenotype(variant, 0); } static boolean isReferenceMajor(Variant variant) { int referenceAlleles = 0; int alternateAlleles = 0; - for (Call call : variant.getCalls()) { - for (Integer i : call.getGenotype()) { + for (VariantCall call : variant.getCallsList()) { + for (Integer i : call.getGenotypeList()) { if (i == 0) { ++referenceAlleles; } else if (i > 0) { diff --git a/src/main/java/com/google/cloud/genomics/dataflow/functions/CallSimilarityCalculator.java b/src/main/java/com/google/cloud/genomics/dataflow/functions/CallSimilarityCalculator.java index b95e721..15d87fe 100644 --- a/src/main/java/com/google/cloud/genomics/dataflow/functions/CallSimilarityCalculator.java +++ b/src/main/java/com/google/cloud/genomics/dataflow/functions/CallSimilarityCalculator.java @@ -15,10 +15,10 @@ */ package com.google.cloud.genomics.dataflow.functions; -import com.google.api.services.genomics.model.Call; +import com.google.genomics.v1.VariantCall; public interface CallSimilarityCalculator { - double similarity(Call call1, Call call2); + double similarity(VariantCall call1, VariantCall call2); } diff --git a/src/main/java/com/google/cloud/genomics/dataflow/functions/SharedAllelesRatioCalculator.java b/src/main/java/com/google/cloud/genomics/dataflow/functions/SharedAllelesRatioCalculator.java index 136bf47..f5d7a1a 100644 --- a/src/main/java/com/google/cloud/genomics/dataflow/functions/SharedAllelesRatioCalculator.java +++ b/src/main/java/com/google/cloud/genomics/dataflow/functions/SharedAllelesRatioCalculator.java @@ -15,7 +15,7 @@ */ package com.google.cloud.genomics.dataflow.functions; -import com.google.api.services.genomics.model.Call; +import com.google.genomics.v1.VariantCall; /** * See this @@ -27,15 +27,15 @@ public class SharedAllelesRatioCalculator implements CallSimilarityCalculator { // TODO: Double check that the following is the right way of computing the IBS // scores when the number of alleles is different than 2 and when the genotypes are unphased. @Override - public double similarity(Call call1, Call call2) { - int minNumberOfGenotypes = Math.min(call1.getGenotype().size(), call2.getGenotype().size()); + public double similarity(VariantCall call1, VariantCall call2) { + int minNumberOfGenotypes = Math.min(call1.getGenotypeCount(), call2.getGenotypeCount()); int numberOfSharedAlleles = 0; for (int i = 0; i < minNumberOfGenotypes; ++i) { - if (call1.getGenotype().get(i) == call2.getGenotype().get(i)) { + if (call1.getGenotype(i) == call2.getGenotype(i)) { ++numberOfSharedAlleles; } } - int maxNumberOfGenotypes = Math.max(call1.getGenotype().size(), call2.getGenotype().size()); + int maxNumberOfGenotypes = Math.max(call1.getGenotypeCount(), call2.getGenotypeCount()); return (double) numberOfSharedAlleles / maxNumberOfGenotypes; } diff --git a/src/main/java/com/google/cloud/genomics/dataflow/functions/SharedMinorAllelesCalculator.java b/src/main/java/com/google/cloud/genomics/dataflow/functions/SharedMinorAllelesCalculator.java index 6b385c9..c6e86cc 100644 --- a/src/main/java/com/google/cloud/genomics/dataflow/functions/SharedMinorAllelesCalculator.java +++ b/src/main/java/com/google/cloud/genomics/dataflow/functions/SharedMinorAllelesCalculator.java @@ -15,9 +15,9 @@ */ package com.google.cloud.genomics.dataflow.functions; -import com.google.api.services.genomics.model.Call; import com.google.common.base.Predicate; import com.google.common.collect.Iterables; +import com.google.genomics.v1.VariantCall; /** * Computes the similarity of two calls based on whether they share a minor allele or not. @@ -30,8 +30,8 @@ public SharedMinorAllelesCalculator(boolean isReferenceMajor) { this.isReferenceMajor = isReferenceMajor; } - private boolean hasMinorAllele(Call call) { - return Iterables.any(call.getGenotype(), new Predicate() { + private boolean hasMinorAllele(VariantCall call) { + return Iterables.any(call.getGenotypeList(), new Predicate() { @Override public boolean apply(Integer genotype) { @@ -46,7 +46,7 @@ public boolean apply(Integer genotype) { } @Override - public double similarity(Call call1, Call call2) { + public double similarity(VariantCall call1, VariantCall call2) { if (call1.getCallSetName().equals(call2.getCallSetName())) { return 1.0; } diff --git a/src/main/java/com/google/cloud/genomics/dataflow/pipelines/IdentityByState.java b/src/main/java/com/google/cloud/genomics/dataflow/pipelines/IdentityByState.java index e22a0be..3ebd933 100644 --- a/src/main/java/com/google/cloud/genomics/dataflow/pipelines/IdentityByState.java +++ b/src/main/java/com/google/cloud/genomics/dataflow/pipelines/IdentityByState.java @@ -17,8 +17,6 @@ import java.security.GeneralSecurityException; import java.util.List; -import com.google.api.services.genomics.model.SearchVariantsRequest; -import com.google.api.services.genomics.model.Variant; import com.google.cloud.dataflow.sdk.Pipeline; import com.google.cloud.dataflow.sdk.io.TextIO; import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory; @@ -32,14 +30,17 @@ import com.google.cloud.genomics.dataflow.functions.CallSimilarityCalculatorFactory; import com.google.cloud.genomics.dataflow.functions.FormatIBSData; import com.google.cloud.genomics.dataflow.functions.IBSCalculator; -import com.google.cloud.genomics.dataflow.functions.JoinNonVariantSegmentsWithVariants; +import com.google.cloud.genomics.dataflow.functions.grpc.JoinNonVariantSegmentsWithVariants; import com.google.cloud.genomics.dataflow.readers.VariantReader; +import com.google.cloud.genomics.dataflow.readers.VariantStreamer; import com.google.cloud.genomics.dataflow.utils.GenomicsDatasetOptions; import com.google.cloud.genomics.dataflow.utils.GenomicsOptions; import com.google.cloud.genomics.dataflow.utils.IdentityByStateOptions; import com.google.cloud.genomics.utils.GenomicsFactory; import com.google.cloud.genomics.utils.ShardBoundary; import com.google.cloud.genomics.utils.ShardUtils; +import com.google.genomics.v1.StreamVariantsRequest; +import com.google.genomics.v1.Variant; /** * A pipeline that computes Identity by State (IBS) for each pair of individuals in a dataset. @@ -49,8 +50,8 @@ */ public class IdentityByState { - private static final String VARIANT_FIELDS = - "nextPageToken,variants(start,calls(genotype,callSetName))"; + // TODO: https://github.com/googlegenomics/utils-java/issues/48 + private static final String VARIANT_FIELDS = "variants(start,calls(genotype,callSetName))"; public static void main(String[] args) throws IOException, GeneralSecurityException, InstantiationException, IllegalAccessException { @@ -62,26 +63,28 @@ public static void main(String[] args) throws IOException, GeneralSecurityExcept GenomicsDatasetOptions.Methods.validateOptions(options); GenomicsFactory.OfflineAuth auth = GenomicsOptions.Methods.getGenomicsAuth(options); - List requests = options.isAllReferences() ? - ShardUtils.getPaginatedVariantRequests(options.getDatasetId(), ShardUtils.SexChromosomeFilter.EXCLUDE_XY, + List requests = options.isAllReferences() ? + ShardUtils.getVariantRequests(options.getDatasetId(), ShardUtils.SexChromosomeFilter.EXCLUDE_XY, options.getBasesPerShard(), auth) : - ShardUtils.getPaginatedVariantRequests(options.getDatasetId(), options.getReferences(), options.getBasesPerShard()); + ShardUtils.getVariantRequests(options.getDatasetId(), options.getReferences(), options.getBasesPerShard()); Pipeline p = Pipeline.create(options); p.getCoderRegistry().setFallbackCoderProvider(GenericJsonCoder.PROVIDER); - PCollection input = p.begin().apply(Create.of(requests)); + PCollection variants = p.begin() + .apply(Create.of(requests)) + .apply(new VariantStreamer(auth, ShardBoundary.Requirement.STRICT, VARIANT_FIELDS)); - PCollection variants = - options.getHasNonVariantSegments() + PCollection processedVariants; + if(options.getHasNonVariantSegments()) { // Special handling is needed for data with non-variant segment records since IBS must // take into account reference-matches in addition to the variants (unlike // other analyses such as PCA). - ? JoinNonVariantSegmentsWithVariants.joinVariantsTransform(input, auth, - JoinNonVariantSegmentsWithVariants.VARIANT_JOIN_FIELDS) : input.apply(ParDo.named( - VariantReader.class.getSimpleName()).of( - new VariantReader(auth, ShardBoundary.Requirement.STRICT, VARIANT_FIELDS))); + processedVariants = JoinNonVariantSegmentsWithVariants.joinVariantsTransform(variants); + } else { + processedVariants = variants; + } - variants + processedVariants .apply( ParDo.named(AlleleSimilarityCalculator.class.getSimpleName()).of( new AlleleSimilarityCalculator(getCallSimilarityCalculatorFactory(options)))) diff --git a/src/test/java/com/google/cloud/genomics/dataflow/functions/AlleleSimilarityCalculatorTest.java b/src/test/java/com/google/cloud/genomics/dataflow/functions/AlleleSimilarityCalculatorTest.java index e6f9fe9..0da2973 100644 --- a/src/test/java/com/google/cloud/genomics/dataflow/functions/AlleleSimilarityCalculatorTest.java +++ b/src/test/java/com/google/cloud/genomics/dataflow/functions/AlleleSimilarityCalculatorTest.java @@ -20,27 +20,41 @@ import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; -import java.util.Collections; import java.util.List; import org.hamcrest.CoreMatchers; import org.junit.Test; -import com.google.api.services.genomics.model.Variant; import com.google.cloud.dataflow.sdk.transforms.DoFnTester; import com.google.cloud.dataflow.sdk.values.KV; import com.google.cloud.genomics.dataflow.utils.DataUtils; +import com.google.genomics.v1.Variant; public class AlleleSimilarityCalculatorTest { - static final Variant snp1 = DataUtils.makeVariant("chr7", 200019, 200020, "T", Collections.singletonList("G"), - DataUtils.makeCall("het-alt sample", 1, 0), DataUtils.makeCall("hom-alt sample", 1, 1), - DataUtils.makeCall("hom-ref sample", 0, 0), DataUtils.makeCall("hom-nocall sample", -1, -1), - DataUtils.makeCall("ref-nocall sample", -1, 0)); + static final Variant snp1 = Variant.newBuilder() + .setReferenceName("chr7") + .setStart(200019) + .setEnd(200020) + .setReferenceBases("T") + .addAlternateBases("G") + .addCalls(DataUtils.makeVariantCall("het-alt sample", 1, 0)) + .addCalls(DataUtils.makeVariantCall("hom-alt sample", 1, 1)) + .addCalls(DataUtils.makeVariantCall("hom-ref sample", 0, 0)) + .addCalls(DataUtils.makeVariantCall("hom-nocall sample", -1, -1)) + .addCalls(DataUtils.makeVariantCall("ref-nocall sample", -1, 0)) + .build(); - static final Variant snp2 = DataUtils.makeVariant("chr7", 200020, 200021, "C", Collections.singletonList("A"), - DataUtils.makeCall("hom-alt sample", 1, 1), DataUtils.makeCall("het-alt sample", 0, 1), - DataUtils.makeCall("ref-nocall sample", 0, -1)); + static final Variant snp2 = Variant.newBuilder() + .setReferenceName("chr7") + .setStart(200020) + .setEnd(200021) + .setReferenceBases("C") + .addAlternateBases("A") + .addCalls(DataUtils.makeVariantCall("hom-alt sample", 1, 1)) + .addCalls(DataUtils.makeVariantCall("het-alt sample", 0, 1)) + .addCalls(DataUtils.makeVariantCall("ref-nocall sample", 0, -1)) + .build(); @Test public void testIsReferenceMajor() { diff --git a/src/test/java/com/google/cloud/genomics/dataflow/functions/CallSimilarityCalculatorTest.java b/src/test/java/com/google/cloud/genomics/dataflow/functions/CallSimilarityCalculatorTest.java index 3032e95..df54992 100644 --- a/src/test/java/com/google/cloud/genomics/dataflow/functions/CallSimilarityCalculatorTest.java +++ b/src/test/java/com/google/cloud/genomics/dataflow/functions/CallSimilarityCalculatorTest.java @@ -29,18 +29,18 @@ import org.junit.runner.RunWith; import org.junit.runners.JUnit4; -import com.google.api.services.genomics.model.Call; -import com.google.api.services.genomics.model.Variant; import com.google.cloud.dataflow.sdk.transforms.DoFnTester; import com.google.cloud.dataflow.sdk.values.KV; import com.google.cloud.genomics.dataflow.utils.DataUtils; +import com.google.genomics.v1.Variant; +import com.google.genomics.v1.VariantCall; @RunWith(JUnit4.class) public class CallSimilarityCalculatorTest { private static final double DELTA = 1e-6; - private List calls = newArrayList(); + private List calls = newArrayList(); private List variants = newArrayList(); @@ -52,20 +52,20 @@ public class CallSimilarityCalculatorTest { @Before public void setUp() { - calls.add(DataUtils.makeCall(H1, 0, 0)); - calls.add(DataUtils.makeCall(H2, 1, 0)); - calls.add(DataUtils.makeCall(H3, 0, 1)); - calls.add(DataUtils.makeCall(H2, 1, 1)); - calls.add(DataUtils.makeCall(H3, 1, 1)); - calls.add(DataUtils.makeCall(H2, 1)); - calls.add(DataUtils.makeCall(H3, 0)); - calls.add(DataUtils.makeCall(H2, 1, 0, 1)); - calls.add(DataUtils.makeCall(H3, 1, 0, 0)); - - variants.add(DataUtils.makeSimpleVariant(calls.get(0), calls.get(1), calls.get(2))); - variants.add(DataUtils.makeSimpleVariant(calls.get(0), calls.get(3), calls.get(4))); - variants.add(DataUtils.makeSimpleVariant(calls.get(0), calls.get(5), calls.get(6))); - variants.add(DataUtils.makeSimpleVariant(calls.get(0), calls.get(7), calls.get(8))); + calls.add(DataUtils.makeVariantCall(H1, 0, 0)); + calls.add(DataUtils.makeVariantCall(H2, 1, 0)); + calls.add(DataUtils.makeVariantCall(H3, 0, 1)); + calls.add(DataUtils.makeVariantCall(H2, 1, 1)); + calls.add(DataUtils.makeVariantCall(H3, 1, 1)); + calls.add(DataUtils.makeVariantCall(H2, 1)); + calls.add(DataUtils.makeVariantCall(H3, 0)); + calls.add(DataUtils.makeVariantCall(H2, 1, 0, 1)); + calls.add(DataUtils.makeVariantCall(H3, 1, 0, 0)); + + variants.add(Variant.newBuilder().addCalls(calls.get(0)).addCalls(calls.get(1)).addCalls(calls.get(2)).build()); + variants.add(Variant.newBuilder().addCalls(calls.get(0)).addCalls(calls.get(3)).addCalls(calls.get(4)).build()); + variants.add(Variant.newBuilder().addCalls(calls.get(0)).addCalls(calls.get(5)).addCalls(calls.get(6)).build()); + variants.add(Variant.newBuilder().addCalls(calls.get(0)).addCalls(calls.get(7)).addCalls(calls.get(8)).build()); } @Test