From eb8ff11a90cbae33f02bf1594bde2e8777a144ee Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 4 Sep 2019 00:06:17 -0700 Subject: [PATCH 1/3] ARROW-6318: [Integration] Run tests against pregenerated files --- ci/travis_script_integration.sh | 3 +- integration/integration_test.py | 170 ++++++++++++++++++++------------ testing | 2 +- 3 files changed, 109 insertions(+), 66 deletions(-) diff --git a/ci/travis_script_integration.sh b/ci/travis_script_integration.sh index c043189d243..8356c348adc 100755 --- a/ci/travis_script_integration.sh +++ b/ci/travis_script_integration.sh @@ -63,9 +63,10 @@ conda install -y -q python=3.6 six numpy # ARROW-4008: Create a directory to write temporary files since /tmp can be # unstable in Travis CI INTEGRATION_TEMPDIR=$TRAVIS_BUILD_DIR/integration_temp +GOLD_14_1=$ARROW_TEST_DATA/arrow-ipc/integration/0.14.1 mkdir -p $INTEGRATION_TEMPDIR -python integration_test.py --debug --tempdir=$INTEGRATION_TEMPDIR --run_flight +python integration_test.py --debug --tempdir=$INTEGRATION_TEMPDIR --gold_dirs=$GOLD_14_1 --run_flight popd diff --git a/integration/integration_test.py b/integration/integration_test.py index b791af5f726..d9df57a327b 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -20,6 +20,7 @@ import binascii import contextlib import glob +import gzip import itertools import json import os @@ -1018,7 +1019,9 @@ def generate_decimal_case(): possible_batch_sizes = 7, 10 batch_sizes = [possible_batch_sizes[i % 2] for i in range(len(fields))] - return _generate_file('decimal', fields, batch_sizes) + skip = set() + skip.add('Go') # TODO(ARROW-3676) + return _generate_file('decimal', fields, batch_sizes, skip=skip) def generate_datetime_case(): @@ -1055,6 +1058,8 @@ def generate_interval_case(): ] batch_sizes = [7, 10] + skip = set() + skip.add('JS') # TODO(ARROW-5239) return _generate_file("interval", fields, batch_sizes) @@ -1067,7 +1072,10 @@ def generate_map_case(): ] batch_sizes = [7, 10] - return _generate_file("map", fields, batch_sizes) + skip = set() + skip.add('JS') # TODO(ARROW-1279) + skip.add('Go') # TODO(ARROW-3679) + return _generate_file("map", fields, batch_sizes, skip=skip) def generate_nested_case(): @@ -1103,9 +1111,12 @@ def generate_dictionary_case(): DictionaryType('dict1', get_field('', 'int32'), dict1), DictionaryType('dict2', get_field('', 'int16'), dict2) ] + skip = set() + skip.add('Go') # TODO(ARROW-3039) batch_sizes = [7, 10] return _generate_file("dictionary", fields, batch_sizes, - dictionaries=[dict0, dict1, dict2]) + dictionaries=[dict0, dict1, dict2], + skip=skip) def generate_nested_dictionary_case(): @@ -1137,7 +1148,7 @@ def generate_nested_dictionary_case(): dictionaries=[dict0, dict1, dict2]) -def get_generated_json_files(tempdir=None, flight=False): +def get_generated_json_files(tempdir=None, flight=False, testing_dir=None): tempdir = tempdir or tempfile.mkdtemp() def _temp_path(): @@ -1183,14 +1194,32 @@ def __init__(self, json_files, testers, args): self.temp_dir = args.tempdir or tempfile.mkdtemp() self.debug = args.debug self.stop_on_error = args.stop_on_error + self.gold_dirs = args.gold_dirs def run(self): failures = [] + for producer, consumer in itertools.product( filter(lambda t: t.PRODUCER, self.testers), filter(lambda t: t.CONSUMER, self.testers)): - for failure in self._compare_implementations(producer, consumer): + for failure in self._compare_implementations( + producer, consumer, self._produce_consume, self.json_files): failures.append(failure) + if self.gold_dirs: + for gold_dir, consumer in itertools.product( + self.gold_dirs, + filter(lambda t: t.CONSUMER, self.testers)): + print('\n\n\n\n') + print('******************************************************') + print('Tests against golden files in {}'.format(gold_dir)) + print('******************************************************') + + def run_gold(producer, consumer, test_case): + self._run_gold(gold_dir, producer, consumer, test_case) + for failure in self._compare_implementations( + consumer, consumer, run_gold, self._gold_tests(gold_dir)): + failures.append(failure) + return failures def run_flight(self): @@ -1204,63 +1233,39 @@ def run_flight(self): failures.append(failure) return failures - def _compare_implementations(self, producer, consumer): + def _gold_tests(self, gold_dir): + prefix = os.path.basename(os.path.normpath(gold_dir)) + SUFFIX = ".json.gz" + golds = [jf for jf in os.listdir(gold_dir) if jf.endswith(SUFFIX)] + for json_path in golds: + name = json_path[json_path.index('_')+1: -len(SUFFIX)] + base_name = prefix + "_" + name + ".gold.json" + out_path = os.path.join(self.temp_dir, base_name) + with gzip.open(os.path.join(gold_dir, json_path)) as i: + with open(out_path, "wb") as out: + out.write(i.read()) + + try: + skip = next(f for f in self.json_files + if f.name == name).skip + except StopIteration: + skip = set() + yield JsonFile(name, None, None, skip=skip, path=out_path) + + def _compare_implementations( + self, producer, consumer, run_binaries, test_cases): print('##########################################################') print( '{0} producing, {1} consuming'.format(producer.name, consumer.name) ) print('##########################################################') - for test_case in self.json_files: + for test_case in test_cases: json_path = test_case.path print('==========================================================') print('Testing file {0}'.format(json_path)) print('==========================================================') - name = os.path.splitext(os.path.basename(json_path))[0] - - file_id = guid()[:8] - - if ('JS' in (producer.name, consumer.name) and - "map" in test_case.name): - print('TODO(ARROW-1279): Enable map tests ' + - ' for JS once they are unbroken') - continue - - if ('JS' in (producer.name, consumer.name) and - "interval" in test_case.name): - print('TODO(ARROW-5239): Enable interval tests ' + - ' for JS once JS supports them') - continue - - if ('Go' in (producer.name, consumer.name) and - "decimal" in test_case.name): - print('TODO(ARROW-3676): Enable decimal tests ' + - ' for Go') - continue - - if ('Go' in (producer.name, consumer.name) and - "map" in test_case.name): - print('TODO(ARROW-3679): Enable map tests ' + - ' for Go') - continue - - if ('Go' in (producer.name, consumer.name) and - "dictionary" in test_case.name): - print('TODO(ARROW-3039): Enable dictionary tests ' + - ' for Go') - continue - - # Make the random access file - producer_file_path = os.path.join(self.temp_dir, file_id + '_' + - name + '.json_as_file') - producer_stream_path = os.path.join(self.temp_dir, file_id + '_' + - name + - '.producer_file_as_stream') - consumer_file_path = os.path.join(self.temp_dir, file_id + '_' + - name + - '.consumer_stream_as_file') - if producer.name in test_case.skip: print('-- Skipping test because producer {0} does ' 'not support'.format(producer.name)) @@ -1276,19 +1281,7 @@ def _compare_implementations(self, producer, consumer): continue try: - print('-- Creating binary inputs') - producer.json_to_file(json_path, producer_file_path) - - # Validate the file - print('-- Validating file') - consumer.validate(json_path, producer_file_path) - - print('-- Validating stream') - producer.file_to_stream(producer_file_path, - producer_stream_path) - consumer.stream_to_file(producer_stream_path, - consumer_file_path) - consumer.validate(json_path, consumer_file_path) + run_binaries(producer, consumer, test_case) except Exception: traceback.print_exc() yield (test_case, producer, consumer, sys.exc_info()) @@ -1297,6 +1290,52 @@ def _compare_implementations(self, producer, consumer): else: continue + def _produce_consume(self, producer, consumer, test_case): + # Make the random access file + json_path = test_case.path + file_id = guid()[:8] + name = os.path.splitext(os.path.basename(json_path))[0] + + producer_file_path = os.path.join(self.temp_dir, file_id + '_' + + name + '.json_as_file') + producer_stream_path = os.path.join(self.temp_dir, file_id + '_' + + name + '.producer_file_as_stream') + consumer_file_path = os.path.join(self.temp_dir, file_id + '_' + + name + '.consumer_stream_as_file') + + print('-- Creating binary inputs') + producer.json_to_file(json_path, producer_file_path) + + # Validate the file + print('-- Validating file') + consumer.validate(json_path, producer_file_path) + + print('-- Validating stream') + producer.file_to_stream(producer_file_path, producer_stream_path) + consumer.stream_to_file(producer_stream_path, consumer_file_path) + consumer.validate(json_path, consumer_file_path) + + def _run_gold(self, gold_dir, producer, consumer, test_case): + json_path = test_case.path + + # Validate the file + print('-- Validating file') + producer_file_path = os.path.join( + gold_dir, "generated_" + test_case.name + ".arrow_file") + consumer.validate(json_path, producer_file_path) + + print('-- Validating stream') + consumer_stream_path = os.path.join( + gold_dir, "generated_" + test_case.name + ".stream") + file_id = guid()[:8] + name = os.path.splitext(os.path.basename(json_path))[0] + + consumer_file_path = os.path.join(self.temp_dir, file_id + '_' + + name + '.consumer_stream_as_file') + + consumer.stream_to_file(consumer_stream_path, consumer_file_path) + consumer.validate(json_path, consumer_file_path) + def _compare_flight_implementations(self, producer, consumer): print('##########################################################') print( @@ -1741,6 +1780,9 @@ def write_js_test_json(directory): action='store_true', default=False, help='Stop on first error') + parser.add_argument('--gold_dirs', action='append', + help="gold integration test file paths") + args = parser.parse_args() if args.generated_json_path: try: diff --git a/testing b/testing index f1e3be91276..93f33ae9222 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit f1e3be91276df2cade959e54ec3b7710853c6e9e +Subproject commit 93f33ae922226686e623d6b1d0307f48030a8a67 From 7a5a5d563f1043365e0e42de6d2efb3ce101ad92 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 4 Sep 2019 00:09:57 -0700 Subject: [PATCH 2/3] remove unused parameter --- integration/integration_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/integration_test.py b/integration/integration_test.py index d9df57a327b..7771758656f 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -1148,7 +1148,7 @@ def generate_nested_dictionary_case(): dictionaries=[dict0, dict1, dict2]) -def get_generated_json_files(tempdir=None, flight=False, testing_dir=None): +def get_generated_json_files(tempdir=None, flight=False): tempdir = tempdir or tempfile.mkdtemp() def _temp_path(): From 3c7d6f79449c9bcc931863c4487143f5061871e4 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 4 Sep 2019 07:38:38 -0700 Subject: [PATCH 3/3] add skip param for interval --- integration/integration_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/integration_test.py b/integration/integration_test.py index 7771758656f..be20b4ee233 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -1060,7 +1060,7 @@ def generate_interval_case(): batch_sizes = [7, 10] skip = set() skip.add('JS') # TODO(ARROW-5239) - return _generate_file("interval", fields, batch_sizes) + return _generate_file("interval", fields, batch_sizes, skip=skip) def generate_map_case():