-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Single dimension hash-based partitioning #2570
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,11 +22,13 @@ | |
|
|
||
| import com.fasterxml.jackson.annotation.JsonCreator; | ||
| import com.fasterxml.jackson.annotation.JsonProperty; | ||
| import com.google.common.collect.ImmutableList; | ||
| import io.druid.indexer.DeterminePartitionsJob; | ||
| import io.druid.indexer.HadoopDruidIndexerConfig; | ||
| import io.druid.indexer.Jobby; | ||
|
|
||
| import javax.annotation.Nullable; | ||
| import java.util.List; | ||
|
|
||
| public class SingleDimensionPartitionsSpec extends AbstractPartitionsSpec | ||
| { | ||
|
|
@@ -57,4 +59,11 @@ public Jobby getPartitionJob(HadoopDruidIndexerConfig config) | |
| { | ||
| return new DeterminePartitionsJob(config); | ||
| } | ||
|
|
||
| @Override | ||
| @JsonProperty | ||
| public List<String> getPartitionDimensions() | ||
| { | ||
| return ImmutableList.of(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why doesn't this return "partitionDimension"? |
||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,31 +21,48 @@ | |
|
|
||
| import com.fasterxml.jackson.annotation.JacksonInject; | ||
| import com.fasterxml.jackson.annotation.JsonCreator; | ||
| import com.fasterxml.jackson.annotation.JsonIgnore; | ||
| import com.fasterxml.jackson.annotation.JsonProperty; | ||
| import com.fasterxml.jackson.core.JsonProcessingException; | ||
| import com.fasterxml.jackson.databind.ObjectMapper; | ||
| import com.google.common.base.Function; | ||
| import com.google.common.base.Throwables; | ||
| import com.google.common.collect.ImmutableList; | ||
| import com.google.common.collect.Lists; | ||
| import com.google.common.hash.HashFunction; | ||
| import com.google.common.hash.Hashing; | ||
| import io.druid.data.input.InputRow; | ||
| import io.druid.data.input.Rows; | ||
|
|
||
| import javax.annotation.Nullable; | ||
| import java.util.List; | ||
|
|
||
| public class HashBasedNumberedShardSpec extends NumberedShardSpec | ||
| { | ||
| private static final HashFunction hashFunction = Hashing.murmur3_32(); | ||
| private static final List<String> DEFAULT_PARTITION_DIMENSIONS = ImmutableList.of(); | ||
|
|
||
| private final ObjectMapper jsonMapper; | ||
| @JsonIgnore | ||
| private final List<String> partitionDimensions; | ||
|
|
||
| @JsonCreator | ||
| public HashBasedNumberedShardSpec( | ||
| @JsonProperty("partitionNum") int partitionNum, | ||
| @JsonProperty("partitions") int partitions, | ||
| @JsonProperty("partitionDimensions") @Nullable List<String> partitionDimensions, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't see this in the docs, or am I missing something?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The HashBasedNumberedShardSpec is used internally, so there are no docs for it.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @binlijin I don't understand. Why should people not use this? Why not document it?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @fjy For batch ingestion, people use PartitionsSpec, which uses HashBasedNumberedShardSpec and SingleDimensionShardSpec internally for its work.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @fjy For realtime ingestion, people only use the none/linear/numbered shardSpec, and do not use HashBasedNumberedShardSpec and SingleDimensionShardSpec directly. |
||
| @JacksonInject ObjectMapper jsonMapper | ||
| ) | ||
| { | ||
| super(partitionNum, partitions); | ||
| this.jsonMapper = jsonMapper; | ||
| this.partitionDimensions = partitionDimensions == null ? DEFAULT_PARTITION_DIMENSIONS : partitionDimensions; | ||
| } | ||
|
|
||
| @JsonProperty("partitionDimensions") | ||
| public List<String> getPartitionDimensions() | ||
| { | ||
| return partitionDimensions; | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -56,7 +73,7 @@ public boolean isInChunk(long timestamp, InputRow inputRow) | |
|
|
||
| protected int hash(long timestamp, InputRow inputRow) | ||
| { | ||
| final List<Object> groupKey = Rows.toGroupKey(timestamp, inputRow); | ||
| final List<Object> groupKey = getGroupKey(timestamp, inputRow); | ||
| try { | ||
| return hashFunction.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asInt(); | ||
| } | ||
|
|
@@ -65,12 +82,29 @@ protected int hash(long timestamp, InputRow inputRow) | |
| } | ||
| } | ||
|
|
||
| List<Object> getGroupKey(final long timestamp, final InputRow inputRow) | ||
| { | ||
| if (partitionDimensions.isEmpty()) { | ||
| return Rows.toGroupKey(timestamp, inputRow); | ||
| } else { | ||
| return Lists.transform(partitionDimensions, new Function<String, Object>() | ||
| { | ||
| @Override | ||
| public Object apply(final String dim) | ||
| { | ||
| return inputRow.getDimension(dim); | ||
| } | ||
| }); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public String toString() | ||
| { | ||
| return "HashBasedNumberedShardSpec{" + | ||
| "partitionNum=" + getPartitionNum() + | ||
| ", partitions=" + getPartitions() + | ||
| ", partitionDimensions=" + getPartitionDimensions() + | ||
| '}'; | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It appears that for the "hashed" partition spec, partitionDimensions is completely ignored if "targetPartitionSize" is set. From the documentation, it appears they should be considered.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add doc for it.