From f7b251b05f9962c2a5cb75304e3cf6e875d813a2 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Fri, 20 Oct 2017 16:07:58 -0400 Subject: [PATCH 1/2] only load a dictionary once --- js/src/reader/arrow.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/js/src/reader/arrow.ts b/js/src/reader/arrow.ts index 9716c7fb601..b50838dc8d0 100644 --- a/js/src/reader/arrow.ts +++ b/js/src/reader/arrow.ts @@ -52,9 +52,12 @@ export function* readBuffers(...bytes: Array) { let index = -1, fieldsLength = schema.fieldsLength(); if (batch.id) { while (++index < fieldsLength) { + let found: boolean = false; for (let [id, vector] of readDictionaries(schema.fields(index), batch, state, dictionaries)) { dictionaries[id] = dictionaries[id] && dictionaries[id].concat(vector) || vector; + found = true; } + if (found) break; } } else { while (++index < fieldsLength) { From aec2f6dbf11dbdf513c2caaca44907d72b71e5d7 Mon Sep 17 00:00:00 2001 From: Brian Hulette Date: Fri, 20 Oct 2017 17:41:25 -0400 Subject: [PATCH 2/2] readDictionary now returns a single vector or null --- js/src/reader/arrow.ts | 13 +++++++------ js/src/reader/dictionary.ts | 18 +++++++----------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/js/src/reader/arrow.ts b/js/src/reader/arrow.ts index b50838dc8d0..dbb6acd0e79 100644 --- a/js/src/reader/arrow.ts +++ b/js/src/reader/arrow.ts @@ -23,7 +23,7 @@ import { readFile } from './file'; import { readStream } from './stream'; import { readVector } from './vector'; import { Vector } from '../vector/vector'; -import { readDictionaries } from './dictionary'; +import { readDictionary } from './dictionary'; import ByteBuffer = flatbuffers.ByteBuffer; export import Schema = Schema_.org.apache.arrow.flatbuf.Schema; @@ -51,13 +51,14 @@ export function* readBuffers(...bytes: Array) { let state = { nodeIndex: 0, bufferIndex: 0 }; let index = -1, fieldsLength = schema.fieldsLength(); if (batch.id) { + // A dictionary batch can only contain a single vector. 
Traverse each + // field and its children until we find one that uses this dictionary while (++index < fieldsLength) { - let found: boolean = false; - for (let [id, vector] of readDictionaries(schema.fields(index), batch, state, dictionaries)) { - dictionaries[id] = dictionaries[id] && dictionaries[id].concat(vector) || vector; - found = true; + let vector = readDictionary(schema.fields(index), batch, state, dictionaries); + if (vector) { + dictionaries[batch.id] = dictionaries[batch.id] && dictionaries[batch.id].concat(vector) || vector; + break; } - if (found) break; } } else { while (++index < fieldsLength) { diff --git a/js/src/reader/dictionary.ts b/js/src/reader/dictionary.ts index abf7ac3dfb9..61698e80c00 100644 --- a/js/src/reader/dictionary.ts +++ b/js/src/reader/dictionary.ts @@ -21,20 +21,16 @@ import * as Schema_ from '../format/Schema_generated'; import { IteratorState, Dictionaries } from './arrow'; import Field = Schema_.org.apache.arrow.flatbuf.Field; -export function* readDictionaries(field: Field | null, - batch: MessageBatch, - iterator: IteratorState, - dictionaries: Dictionaries) { +export function readDictionary(field: Field | null, + batch: MessageBatch, + iterator: IteratorState, + dictionaries: Dictionaries) { let id: string, encoding = field && field.dictionary(); if (encoding && batch.id === (id = encoding.id().toFloat64().toString())) { - yield [id, readVector(field, batch, iterator, null)]; - return; + return readVector(field, batch, iterator, null); } for (let i = -1, n = field && field.childrenLength() || 0; ++i < n;) { - // Since a dictionary batch can only contain a single vector, return early after we find it - for (let result of readDictionaries(field.children(i), batch, iterator, dictionaries)) { - yield result; - return; - } + let vector = readDictionary(field.children(i), batch, iterator, dictionaries); + if (vector) return vector; } }