From bc6927a21f0ec41c0fa49b7b1b752c61d2f4cae3 Mon Sep 17 00:00:00 2001 From: Shuhao Wu Date: Tue, 4 Apr 2023 12:23:00 -0400 Subject: [PATCH] Added Proxy for Table, RecordBatch, and Vector Certain codebases that previously used a row-oriented way to access data may wish to migrate to Arrow to save serialization and deserialization costs, and to gain access to fast column-oriented operations. As it stands, Arrow is sort of a drop-in replacement for row-oriented data such as a JavaScript Array of objects. This is great for incrementally migrating legacy codebases to Arrow, as it is frequently infeasible to rewrite the application to use the column-oriented data access patterns. For most data, JavaScript-object-compatible and row-oriented access is already provided via the `StructRowProxy`. However, if the structs themselves include a `Vector`, existing code will break as it assumes the `Vector` object to behave like a JavaScript array, which it does not due to the lack of index access. An example of such a data structure is as follows: ``` [ {x: 1, y: [1, 2]}, {x: 2, y: [2, 3]}, ] ``` In this case, with the Arrow JS library as it is, the API consumer is unable to get an individual element of the `y` array via `table[i].y[j]`. Instead, the API consumer must use the API `table.get(i).y.get(j)`. In the situation where we are migrating a legacy codebase to Arrow, this requires a large refactor of the entire codebase, which is infeasible in a short time. This negates the advantage of using Arrow as a drop-in replacement and prevents incremental migration of code to Arrow. To address this problem, this patch adds a Proxy at the root of the prototype chain for the `Table`, `RecordBatch`, and `Vector` objects and allows index access for these objects for backward compatibility purposes. Basically, objects like `Vector` now support `vector[i]` in addition to `vector.get(i)`.
However, code should not be using `vector[i]` as it is ~1.5 orders of magnitude slower than `vector.get(i)` as ES6 Proxy objects are quite slow. This should only be used to provide compatibility for legacy codebases. For code that desires high performance, `.get` remains a much better solution. This is also why the Proxy object is added to the root of the prototype chain, as opposed to the usual pattern where a Proxy object is returned from a constructor. Documentation has been added to compare the performance of the various access. --- js/perf/index.ts | 15 ++++++++ js/src/recordbatch.ts | 16 +++++++++ js/src/table.ts | 15 ++++++++ js/src/util/proxyhandler.ts | 24 +++++++++++++ js/src/vector.ts | 35 +++++++++++++++++++ .../unit/recordbatch/record-batch-tests.ts | 13 +++++++ js/test/unit/table-tests.ts | 4 +++ js/test/unit/vector/vector-tests.ts | 7 +++- 8 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 js/src/util/proxyhandler.ts diff --git a/js/perf/index.ts b/js/perf/index.ts index 40225a6d8ae..6c54502ce50 100644 --- a/js/perf/index.ts +++ b/js/perf/index.ts @@ -133,6 +133,21 @@ b.suite( b.cycle(cycle) ); +b.suite( + `[index] Vector`, + + ...Object.entries(vectors).map(([name, vector]) => + b.add(`from: ${name}`, () => { + for (let i = -1, n = vector.length; ++i < n;) { + vector[i]; + } + })), + + b.cycle(cycle) +); + + + for (const { name, ipc, table } of config) { b.suite( `Parse`, diff --git a/js/src/recordbatch.ts b/js/src/recordbatch.ts index eed9de66e8e..0f4683e3291 100644 --- a/js/src/recordbatch.ts +++ b/js/src/recordbatch.ts @@ -21,6 +21,8 @@ import { Vector } from './vector.js'; import { Schema, Field } from './schema.js'; import { DataType, Struct, Null, TypeMap } from './type.js'; +import { IndexAccessProxyHandler } from './util/proxyhandler.js' + import { instance as getVisitor } from './visitor/get.js'; import { instance as setVisitor } from './visitor/set.js'; import { instance as indexOfVisitor } from './visitor/indexof.js'; @@ 
-95,6 +97,13 @@ export class RecordBatch { public readonly schema: Schema; public readonly data: Data>; + /** + * Index access of the record batch elements. While equivalent to + * {@link RecordBatch.get}, it is 1-2 orders of magnitude slower than + * {@link RecordBatch.get}. + */ + [index: number]: T['TValue'] | null; + public get dictionaries() { return this._dictionaries || (this._dictionaries = collectDictionaries(this.schema.fields, this.data.children)); } @@ -280,6 +289,13 @@ export class RecordBatch { protected static [Symbol.toStringTag] = ((proto: RecordBatch) => { (proto as any)._nullCount = -1; (proto as any)[Symbol.isConcatSpreadable] = true; + + // The Proxy object will slow down all method access if it is returned + // from the constructor. By putting it at the root of the prototype + // chain, we do not affect the speed of normal access. That said, index + // access will be much slower than `.get()`. + Object.setPrototypeOf(proto, new Proxy({}, new IndexAccessProxyHandler())) + return 'RecordBatch'; })(RecordBatch.prototype); } diff --git a/js/src/table.ts b/js/src/table.ts index 26f77d74f51..f216b1fdcab 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -33,6 +33,7 @@ import { wrapChunkedIndexOf, sliceChunks, } from './util/chunk.js'; +import { IndexAccessProxyHandler } from './util/proxyhandler.js' import { instance as getVisitor } from './visitor/get.js'; import { instance as setVisitor } from './visitor/set.js'; @@ -152,6 +153,13 @@ export class Table { */ declare public readonly batches: RecordBatch[]; + /** + * Index access of the table elements. While equivalent to + * {@link Table.get}, it is 1-2 orders of magnitude slower than + * {@link Table.get}. + */ + [index: number]: T['TValue'] | null; + /** * The contiguous {@link RecordBatch `RecordBatch`} chunks of the Table rows. 
*/ @@ -389,6 +397,13 @@ export class Table { (proto as any)['set'] = wrapChunkedCall2(setVisitor.getVisitFn(Type.Struct)); (proto as any)['indexOf'] = wrapChunkedIndexOf(indexOfVisitor.getVisitFn(Type.Struct)); (proto as any)['getByteLength'] = wrapChunkedCall1(byteLengthVisitor.getVisitFn(Type.Struct)); + + // The Proxy object will slow down all method access if it is returned + // from the constructor. By putting it at the root of the prototype + // chain, we do not affect the speed of normal access. That said, index + // access will be much slower than `.get()`. + Object.setPrototypeOf(proto, new Proxy({}, new IndexAccessProxyHandler())) + return 'Table'; })(Table.prototype); } diff --git a/js/src/util/proxyhandler.ts b/js/src/util/proxyhandler.ts new file mode 100644 index 00000000000..c5c1f591478 --- /dev/null +++ b/js/src/util/proxyhandler.ts @@ -0,0 +1,24 @@ +export class IndexAccessProxyHandler implements ProxyHandler { // Proxy traps that forward canonical array-index keys ("0", "1", ...) to the receiver's get()/set(); every other key falls through to Reflect. + get(target: any, key: string | symbol, receiver: any) { + if (typeof key === "string") { // Need to check because key can be a symbol, such as [Symbol.iterator], which is never an index. + const idx = +key; // Convert the key to a number + if (idx >= 0 && Number.isInteger(idx) && String(idx) === key) { // Canonical index only: unary plus alone would also accept "", " 1", "1.5", "1e3", "0x10", "-1" and "Infinity". + return (receiver || target).get(idx); + } + } + + return Reflect.get(target, key, receiver); + } + + set(target: any, key: string | symbol, value: any, receiver: any) { + if (typeof key === "string") { // Need to check because key can be a symbol, such as [Symbol.iterator], which is never an index. + const idx = +key; // Convert the key to a number + if (idx >= 0 && Number.isInteger(idx) && String(idx) === key) { // Same canonical-index test as in get(), so reads and writes agree on what counts as an index. + (receiver || target).set(idx, value); + return true; // A set trap must return true to report success; returning false makes a strict-mode assignment throw a TypeError.
+ } + } + + return Reflect.set(target, key, value, receiver); + } +} diff --git a/js/src/vector.ts b/js/src/vector.ts index a2baf83c95d..b94db7f16dd 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -31,6 +31,7 @@ import { wrapChunkedIndexOf, } from './util/chunk.js'; import { BigInt64Array, BigUint64Array } from './util/compat.js'; +import { IndexAccessProxyHandler } from './util/proxyhandler.js' import { instance as getVisitor } from './visitor/get.js'; import { instance as setVisitor } from './visitor/set.js'; @@ -103,6 +104,12 @@ export class Vector { declare protected _nullCount: number; declare protected _byteLength: number; + /** + * Index access of the vector elements. While equivalent to {@link Vector.get}, + * it is 1-2 orders of magnitude slower than {@link Vector.get}. + */ + [index: number]: T['TValue'] | null; + /** * The {@link DataType `DataType`} of this Vector. */ @@ -358,6 +365,30 @@ export class Vector { (proto as any)._offsets = new Uint32Array([0]); (proto as any)[Symbol.isConcatSpreadable] = true; + // The prototype chain of the Vector object is complex to get the best + // possible performance: + // + // - The Proxy object is quite slow, so we put it at the bottom of the + // prototype chain. This means that known access such as functions + // like `vector.get` will be immediately resolved and is fast. Unknown + // access such as index notation (`vector[0]`) will bubble up to the + // proxy object and be resolved. Experimentally, this is about 1-2 + // orders of magnitude slower than using `vector.get(index)`. + // - When the Vector object has multiple chunks in it, we need to find + // the appropriate chunk to iterate through when using methods like + // `.get()`. To do this, in the Vector constructor, it sets the + // prototype of `this` to `vectorPrototypesByTypeId[typeId]`, which + // defines the appropriate methods to find the appropriate chunk. 
+ // The prototypes provided by `vectorPrototypesByTypeId` is also + // chained from the Proxy object, which means Vector objects with + // multiple chunks also retain the index access API. + // - As a note, using `vector.get(i)` is slow as it needs to perform a + // binary search while looking for the right chunk. So operations + // that loop through the array with an index (i.e. `(for i=0; i Type[T] as any) .filter((T: any) => typeof T === 'number' && T !== Type.NONE); @@ -370,6 +401,10 @@ export class Vector { visitorsByTypeId[typeId] = { get, set, indexOf, byteLength }; vectorPrototypesByTypeId[typeId] = Object.create(proto, { + // These object keys are equivalent to string keys. However, by + // putting them in [] (which makes them computed property + // names), the Closure compiler we use to compile this library + // will not optimize out the function names. ['isValid']: { value: wrapChunkedCall1(isChunkedValid) }, ['get']: { value: wrapChunkedCall1(getVisitor.getVisitFnByTypeId(typeId)) }, ['set']: { value: wrapChunkedCall2(setVisitor.getVisitFnByTypeId(typeId)) }, diff --git a/js/test/unit/recordbatch/record-batch-tests.ts b/js/test/unit/recordbatch/record-batch-tests.ts index fe537c86e80..826cf63b282 100644 --- a/js/test/unit/recordbatch/record-batch-tests.ts +++ b/js/test/unit/recordbatch/record-batch-tests.ts @@ -129,4 +129,17 @@ describe(`RecordBatch`, () => { expect(f32sBatch.numRows).toBe(45); }); }); + + describe(`get()`, () => { + test(`can get row with get and []`, () => { + const batch = numsRecordBatch(32, 45); + const row = batch.get(2) + expect(row).not.toBeNull(); + expect(row!.f32).toEqual(2); + expect(row!.i32).toEqual(2); + + const row2 = batch[2]; + expect(row2).toEqual(row); + }); + }); }); diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts index 50c8565f0f1..3d9d43ee879 100644 --- a/js/test/unit/table-tests.ts +++ b/js/test/unit/table-tests.ts @@ -291,6 +291,10 @@ describe(`Table`, () => { 
expect(row.f32).toEqual(expected[F32]); expect(row.i32).toEqual(expected[I32]); expect(row.dictionary).toEqual(expected[DICT]); + + // Test index access as well + const row2 = table[i]; + expect(row2).toEqual(row) } }); test(`iterates expected values`, () => { diff --git a/js/test/unit/vector/vector-tests.ts b/js/test/unit/vector/vector-tests.ts index a259cbef877..4e7a0e764f3 100644 --- a/js/test/unit/vector/vector-tests.ts +++ b/js/test/unit/vector/vector-tests.ts @@ -73,7 +73,12 @@ describe(`StructVector`, () => { test(`get value`, () => { for (const [i, value] of values.entries()) { - expect(vector.get(i)!.toJSON()).toEqual(value); + const row = vector.get(i); + expect(row).not.toBeNull(); + expect(row!.toJSON()).toEqual(value); + + const row2 = vector[i]; + expect(row2).toEqual(row); } }); });