-
Notifications
You must be signed in to change notification settings - Fork 850
Parallel chunk fetching from DynamoDB #603
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -107,6 +107,7 @@ type DynamoDBConfig struct { | |
| DynamoDB util.URLValue | ||
| APILimit float64 | ||
| ApplicationAutoScaling util.URLValue | ||
| DynamoDBChunkGangSize int | ||
| } | ||
|
|
||
| // RegisterFlags adds the flags required to config this to the given FlagSet | ||
|
|
@@ -115,6 +116,7 @@ func (cfg *DynamoDBConfig) RegisterFlags(f *flag.FlagSet) { | |
| "If only region is specified as a host, proper endpoint will be deduced. Use inmemory:///<table-name> to use a mock in-memory implementation.") | ||
| f.Float64Var(&cfg.APILimit, "dynamodb.api-limit", 2.0, "DynamoDB table management requests per second limit.") | ||
| f.Var(&cfg.ApplicationAutoScaling, "applicationautoscaling.url", "ApplicationAutoscaling endpoint URL with escaped Key and Secret encoded.") | ||
| f.IntVar(&cfg.DynamoDBChunkGangSize, "dynamodb.chunk.gang.size", 10, "Number of chunks to group together to parallelise fetches (zero to disable)") | ||
| } | ||
|
|
||
| // AWSStorageConfig specifies config for storing data on AWS. | ||
|
|
@@ -415,6 +417,11 @@ func (a dynamoDBRequestAdapter) Retryable() bool { | |
| return *a.request.Retryable | ||
| } | ||
|
|
||
| type chunksPlusError struct { | ||
|
||
| chunks []Chunk | ||
| err error | ||
| } | ||
|
|
||
| func (a awsStorageClient) GetChunks(ctx context.Context, chunks []Chunk) ([]Chunk, error) { | ||
| sp, ctx := ot.StartSpanFromContext(ctx, "GetChunks") | ||
| defer sp.Finish() | ||
|
|
@@ -443,10 +450,37 @@ func (a awsStorageClient) GetChunks(ctx context.Context, chunks []Chunk) ([]Chun | |
| return s3Chunks, err | ||
| } | ||
|
|
||
| dynamoDBChunks, err = a.getDynamoDBChunks(ctx, dynamoDBChunks) | ||
| gangSize := a.cfg.DynamoDBChunkGangSize * dynamoDBMaxReadBatchSize | ||
| if gangSize == 0 { // zero means turn feature off | ||
| gangSize = len(dynamoDBChunks) | ||
| } | ||
|
|
||
| results := make(chan chunksPlusError) | ||
|
||
| for i := 0; i < len(dynamoDBChunks); i += gangSize { | ||
| go func(start int) { | ||
| end := start + gangSize | ||
| if end > len(dynamoDBChunks) { | ||
| end = len(dynamoDBChunks) | ||
| } | ||
| outChunks, err := a.getDynamoDBChunks(ctx, dynamoDBChunks[start:end]) | ||
| results <- chunksPlusError{outChunks, err} | ||
| }(i) | ||
| } | ||
| finalChunks := s3Chunks | ||
| for i := 0; i < len(dynamoDBChunks); i += gangSize { | ||
| in := <-results | ||
| if in.err != nil { | ||
| err = in.err // TODO: cancel other sub-queries at this point | ||
| } else { | ||
| finalChunks = append(finalChunks, in.chunks...) | ||
| } | ||
| } | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| // Return any chunks we did receive: a partial result may be useful | ||
| return append(dynamoDBChunks, s3Chunks...), err | ||
| return finalChunks, err | ||
| } | ||
|
|
||
| func (a awsStorageClient) getS3Chunks(ctx context.Context, chunks []Chunk) ([]Chunk, error) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm more familiar with choosing the level of parallelism (e.g. how many concurrent goroutines) than choosing the size of each concurrent job, as you're doing here. I don't have opinions on which is better. Why did you decide to do it this way?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Choosing the level of parallelism requires a global queue of work across all queries. I'd like to do that, but don't feel I can complete it in the current sprint.
Computing the "gang size" to target a certain parallelism per query is harder to tune (since we want to keep the batches sent to DynamoDB fairly large), and the end result, given many queries running in parallel, will still have highly variable overall parallelism.