From 3572281c487d07b65810e3abb42175d4492c3cf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krist=C3=B3f=20Umann?= Date: Wed, 10 Jan 2024 13:33:54 +0100 Subject: [PATCH 1/4] [store] Unique reports _before_ storing We used to zip every result file in the result directory and send it to the server. This is tremendously wasteful, as many report files contained the very same reports (for instance, if the report originated from a header file). In this patch, I unique all reports, dump them in a tmpfile, and store that to the server. For a clang-tidy --enable-all analysis on xerces, this reduced the size of the zipfile drastically: BEFORE: Compressing report zip file done (1.6GiB / 62.2MiB). AFTER: Compressing report zip file done (372.6MiB / 7.9MiB). While this doesn't speed up CodeChecker parse or CodeChecker diff, it still speeds up the server quite a bit, both for storage and query times. --- .../report/hash.py | 2 +- .../report/parser/plist.py | 5 ++ web/client/codechecker_client/cmd/store.py | 63 +++++++++++++------ .../functional/component/test_component.py | 25 +++----- .../dynamic_results/test_dynamic_results.py | 2 +- 5 files changed, 60 insertions(+), 37 deletions(-) diff --git a/tools/report-converter/codechecker_report_converter/report/hash.py b/tools/report-converter/codechecker_report_converter/report/hash.py index 8c8eab52c2..e69ff13e8b 100644 --- a/tools/report-converter/codechecker_report_converter/report/hash.py +++ b/tools/report-converter/codechecker_report_converter/report/hash.py @@ -205,5 +205,5 @@ def get_report_path_hash(report: Report) -> str: if not report_path_hash: LOG.error('Failed to generate report path hash: %s', report) - LOG.debug(report_path_hash) + # LOG.debug(report_path_hash) return __str_to_hash(report_path_hash) diff --git a/tools/report-converter/codechecker_report_converter/report/parser/plist.py b/tools/report-converter/codechecker_report_converter/report/parser/plist.py index 405981feeb..2a02f88eb5 100644 --- 
a/tools/report-converter/codechecker_report_converter/report/parser/plist.py +++ b/tools/report-converter/codechecker_report_converter/report/parser/plist.py @@ -519,6 +519,11 @@ def convert( self._create_macro_expansion( macro_expansion, file_index_map)) + if report.annotations: + diagnostic["report-annotation"] = dict() + for key, value in report.annotations.items(): + diagnostic["report-annotation"][key] = value + data['diagnostics'].append(diagnostic) return data diff --git a/web/client/codechecker_client/cmd/store.py b/web/client/codechecker_client/cmd/store.py index 0a45194687..ecc4d91399 100644 --- a/web/client/codechecker_client/cmd/store.py +++ b/web/client/codechecker_client/cmd/store.py @@ -37,6 +37,7 @@ reports as reports_helper, statistics as report_statistics from codechecker_report_converter.report.hash import HashType, \ get_report_path_hash +from codechecker_report_converter.report.parser.base import AnalyzerInfo from codechecker_client import client as libclient from codechecker_client import product @@ -50,6 +51,7 @@ from codechecker_web.shared import webserver_context, host_check from codechecker_web.shared.env import get_default_workspace +# from codechecker_server import MetadataInfoParser try: from codechecker_client.blame_info import assemble_blame_info except ImportError: @@ -404,10 +406,12 @@ def parse_analyzer_result_files( """ Get reports from the given analyzer result files. 
""" analyzer_result_file_reports: AnalyzerResultFileReports = defaultdict(list) - for file_path, reports in zip( + for idx, (file_path, reports) in enumerate(zip( analyzer_result_files, zip_iter( functools.partial(get_reports, checker_labels=checker_labels), - analyzer_result_files)): + analyzer_result_files))): + if idx % 10 == 0: + LOG.debug(f"Parsed {idx}/{len(analyzer_result_files)} files...") analyzer_result_file_reports[file_path] = reports return analyzer_result_file_reports @@ -422,7 +426,7 @@ def assemble_zip(inputs, contanining analysis related information into a zip file which will be sent to the server. """ - files_to_compress = set() + files_to_compress: Dict[str, set] = defaultdict(set) analyzer_result_file_paths = [] stats = StorageZipStatistics() @@ -431,7 +435,8 @@ def assemble_zip(inputs, metadata_file_path = os.path.join(dir_path, 'metadata.json') if os.path.exists(metadata_file_path): - files_to_compress.add(metadata_file_path) + files_to_compress[os.path.dirname(metadata_file_path)] \ + .add(metadata_file_path) skip_file_path = os.path.join(dir_path, 'skip_file') if os.path.exists(skip_file_path): @@ -439,13 +444,15 @@ def assemble_zip(inputs, LOG.info("Found skip file %s with the following content:\n%s", skip_file_path, f.read()) - files_to_compress.add(skip_file_path) + files_to_compress[os.path.dirname(skip_file_path)] \ + .add(skip_file_path) review_status_file_path = os.path.join(dir_path, 'review_status.yaml') if os.path.exists(review_status_file_path): - files_to_compress.add(review_status_file_path) + files_to_compress[os.path.dirname(review_status_file_path)]\ + .add(review_status_file_path) - LOG.debug("Processing report files ...") + LOG.debug(f"Processing {len(analyzer_result_file_paths)} report files ...") with MultiProcessPool() as executor: analyzer_result_file_reports = parse_analyzer_result_files( @@ -456,9 +463,10 @@ def assemble_zip(inputs, changed_files = set() file_paths = set() file_report_positions: FileReportPositions = 
defaultdict(set) - unique_reports = set() + unique_reports: Dict[str, Dict[str, List[Report]]] = defaultdict(dict) + + unique_report_hashes = set() for file_path, reports in analyzer_result_file_reports.items(): - files_to_compress.add(file_path) stats.num_of_analyzer_result_files += 1 for report in reports: @@ -467,13 +475,30 @@ def assemble_zip(inputs, continue # Need to calculate unique reoirt count to determine report limit report_path_hash = get_report_path_hash(report) - if report_path_hash not in unique_reports: - unique_reports.add(report_path_hash) + if report_path_hash not in unique_report_hashes: + unique_report_hashes.add(report_path_hash) + unique_reports[os.path.dirname(file_path)]\ + .setdefault(report.analyzer_name, []) \ + .append(report) stats.add_report(report) file_paths.update(report.original_files) file_report_positions[report.file.original_path].add(report.line) + # TODO: Doesn't support storing multiple report dirs. + if unique_reports: + for dirname, analyzer_reports in unique_reports.items(): + for analyzer_name, reports in analyzer_reports.items(): + if not analyzer_name: + analyzer_name = 'unknown' + _, tmpfile = tempfile.mkstemp(f'-{analyzer_name}.plist') + + report_file.create(tmpfile, reports, checker_labels, + AnalyzerInfo(analyzer_name)) + LOG.debug(f"Stored '{analyzer_name}' unique reports " + f"in {tmpfile}.") + files_to_compress[dirname].add(tmpfile) + if changed_files: reports_helper.dump_changed_files(changed_files) sys.exit(1) @@ -529,15 +554,17 @@ def assemble_zip(inputs, LOG.info("Building report zip file...") with zipfile.ZipFile(zip_file, 'a', allowZip64=True) as zipf: # Add the files to the zip which will be sent to the server. - for file_path in files_to_compress: - _, file_name = os.path.split(file_path) - # Create a unique report directory name. 
- report_dir_name = hashlib.md5(os.path.dirname( - file_path).encode('utf-8')).hexdigest() + for dirname, files in files_to_compress.items(): + for file_path in files: + _, file_name = os.path.split(file_path) - zip_target = os.path.join('reports', report_dir_name, file_name) - zipf.write(file_path, zip_target) + # Create a unique report directory name. + report_dir_name = \ + hashlib.md5(dirname.encode('utf-8')).hexdigest() + zip_target = \ + os.path.join('reports', report_dir_name, file_name) + zipf.write(file_path, zip_target) collected_file_paths = set() for f, h in file_to_hash.items(): diff --git a/web/tests/functional/component/test_component.py b/web/tests/functional/component/test_component.py index e14821d20a..b44d981b80 100644 --- a/web/tests/functional/component/test_component.py +++ b/web/tests/functional/component/test_component.py @@ -199,6 +199,9 @@ def setup_method(self, method): } ] + def teardown_method(self, method): + self.__remove_all_source_components() + def __add_new_component(self, component): """ Creates a new source component. 
@@ -226,6 +229,11 @@ def __get_user_defined_source_components(self): return [c for c in components if GEN_OTHER_COMPONENT_NAME not in c.name] + def __remove_all_source_components(self): + print(self.__get_user_defined_source_components()) + for component in self.__get_user_defined_source_components(): + self.__remove_source_component(component.name) + def __test_other_component(self, components, excluded_from_other, included_in_other=None): """ @@ -325,8 +333,6 @@ def test_filter_report_by_component(self): r.checkedFile.endswith('null_dereference.cpp')] self.assertEqual(len(divide_zero_reports), 0) - self.__remove_source_component(test_component['name']) - def test_filter_report_by_complex_component(self): """ Test report filter by complex component which includes and excludes @@ -375,8 +381,6 @@ def test_filter_report_by_complex_component(self): r.checkedFile.endswith('path_begin.cpp')] self.assertEqual(len(divide_zero_reports), 0) - self.__remove_source_component(test_component['name']) - def test_filter_report_by_multiple_components(self): """ Test report filter by multiple components. @@ -425,9 +429,6 @@ def test_filter_report_by_multiple_components(self): r.checkedFile.endswith('call_and_message.cpp')] self.assertEqual(len(call_and_msg_reports), 0) - self.__remove_source_component(test_component1['name']) - self.__remove_source_component(test_component2['name']) - def test_filter_report_by_excluding_all_results_component(self): """ Test report filter by component which excludes all reports. @@ -452,8 +453,6 @@ def test_filter_report_by_excluding_all_results_component(self): # No reports for this component. 
self.assertEqual(len(run_results), 0) - self.__remove_source_component(test_component['name']) - def test_component_name_with_whitespaces(self): """ Creates a new component which contains white spaces and removes it at @@ -462,7 +461,6 @@ def test_component_name_with_whitespaces(self): test_component = self.components[1] self.__add_new_component(test_component) - self.__remove_source_component(test_component['name']) def test_no_user_defined_component(self): """ @@ -496,7 +494,6 @@ def test_other_with_single_user_defined_component(self): excluded_from_other = ['divide_zero.cpp', 'new_delete.cpp'] self.__test_other_component([component], excluded_from_other) - self.__remove_source_component(component['name']) def test_other_with_multiple_user_defined_component(self): """ @@ -540,9 +537,6 @@ def test_other_with_multiple_user_defined_component(self): self.__test_other_component(components, excluded_from_other, included_in_other) - for c in components: - self.__remove_source_component(c['name']) - def test_component_anywhere_on_path(self): """ Check "anywhere on report path" feature. 
With this flag one can query @@ -589,6 +583,3 @@ def test_component_anywhere_on_path(self): self.assertEqual(len(component_results), 1) self.assertTrue( component_results[0].checkedFile.endswith('path_end.h')) - - for c in components: - self.__remove_source_component(c['name']) diff --git a/web/tests/functional/dynamic_results/test_dynamic_results.py b/web/tests/functional/dynamic_results/test_dynamic_results.py index 4fa4c7a2b6..e88540f737 100644 --- a/web/tests/functional/dynamic_results/test_dynamic_results.py +++ b/web/tests/functional/dynamic_results/test_dynamic_results.py @@ -27,7 +27,7 @@ from libtest import env -class DiffRemote(unittest.TestCase): +class DynamicResults(unittest.TestCase): def setup_class(self): """Setup the environment for testing dynamic_results.""" From 3ba213010ffbf597ef02d7d116a2c979d459ce09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krist=C3=B3f=20Umann?= Date: Mon, 11 Mar 2024 11:04:26 +0100 Subject: [PATCH 2/4] Tidy up the patch --- .../codechecker_report_converter/report/hash.py | 1 + web/client/codechecker_client/cmd/store.py | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/report-converter/codechecker_report_converter/report/hash.py b/tools/report-converter/codechecker_report_converter/report/hash.py index e69ff13e8b..cf91f1dc4c 100644 --- a/tools/report-converter/codechecker_report_converter/report/hash.py +++ b/tools/report-converter/codechecker_report_converter/report/hash.py @@ -205,5 +205,6 @@ def get_report_path_hash(report: Report) -> str: if not report_path_hash: LOG.error('Failed to generate report path hash: %s', report) + # This might be a little too verbose even for the verbose output. 
# LOG.debug(report_path_hash) return __str_to_hash(report_path_hash) diff --git a/web/client/codechecker_client/cmd/store.py b/web/client/codechecker_client/cmd/store.py index ecc4d91399..d4502df093 100644 --- a/web/client/codechecker_client/cmd/store.py +++ b/web/client/codechecker_client/cmd/store.py @@ -51,7 +51,6 @@ from codechecker_web.shared import webserver_context, host_check from codechecker_web.shared.env import get_default_workspace -# from codechecker_server import MetadataInfoParser try: from codechecker_client.blame_info import assemble_blame_info except ImportError: @@ -425,6 +424,10 @@ def assemble_zip(inputs, """Collect and compress report and source files, together with files contanining analysis related information into a zip file which will be sent to the server. + + For each report directory, we create a uniqued zipped directory. Each + report directory to store could have been made with different + configurations, so we can't merge them all into a single zip. """ files_to_compress: Dict[str, set] = defaultdict(set) analyzer_result_file_paths = [] @@ -473,7 +476,9 @@ def assemble_zip(inputs, if report.changed_files: changed_files.update(report.changed_files) continue - # Need to calculate unique reoirt count to determine report limit + # Unique all bug reports per report directory; also, count how many + # reports we want to store at once to check for the report store + # limit. report_path_hash = get_report_path_hash(report) if report_path_hash not in unique_report_hashes: unique_report_hashes.add(report_path_hash) @@ -485,7 +490,6 @@ def assemble_zip(inputs, file_paths.update(report.original_files) file_report_positions[report.file.original_path].add(report.line) - # TODO: Doesn't support storing multiple report dirs. 
if unique_reports: for dirname, analyzer_reports in unique_reports.items(): for analyzer_name, reports in analyzer_reports.items(): From 40b1c6d754492efb6eafc69b8033bea9a0613252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krist=C3=B3f=20Umann?= Date: Thu, 21 Mar 2024 10:09:30 +0100 Subject: [PATCH 3/4] Partial fixes according to bruntib's comments --- .../report/hash.py | 2 -- web/client/codechecker_client/cmd/store.py | 26 +++++++++---------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/tools/report-converter/codechecker_report_converter/report/hash.py b/tools/report-converter/codechecker_report_converter/report/hash.py index cf91f1dc4c..c7682b66cd 100644 --- a/tools/report-converter/codechecker_report_converter/report/hash.py +++ b/tools/report-converter/codechecker_report_converter/report/hash.py @@ -205,6 +205,4 @@ def get_report_path_hash(report: Report) -> str: if not report_path_hash: LOG.error('Failed to generate report path hash: %s', report) - # This might be a little too verbose even for the verbose output. 
- # LOG.debug(report_path_hash) return __str_to_hash(report_path_hash) diff --git a/web/client/codechecker_client/cmd/store.py b/web/client/codechecker_client/cmd/store.py index d4502df093..6906bb81fa 100644 --- a/web/client/codechecker_client/cmd/store.py +++ b/web/client/codechecker_client/cmd/store.py @@ -409,8 +409,8 @@ def parse_analyzer_result_files( analyzer_result_files, zip_iter( functools.partial(get_reports, checker_labels=checker_labels), analyzer_result_files))): - if idx % 10 == 0: - LOG.debug(f"Parsed {idx}/{len(analyzer_result_files)} files...") + LOG.debug(f"[{idx}/{len(analyzer_result_files)}] " + f"Parsed '{file_path}' ...") analyzer_result_file_reports[file_path] = reports return analyzer_result_file_reports @@ -490,18 +490,16 @@ def assemble_zip(inputs, file_paths.update(report.original_files) file_report_positions[report.file.original_path].add(report.line) - if unique_reports: - for dirname, analyzer_reports in unique_reports.items(): - for analyzer_name, reports in analyzer_reports.items(): - if not analyzer_name: - analyzer_name = 'unknown' - _, tmpfile = tempfile.mkstemp(f'-{analyzer_name}.plist') - - report_file.create(tmpfile, reports, checker_labels, - AnalyzerInfo(analyzer_name)) - LOG.debug(f"Stored '{analyzer_name}' unique reports " - f"in {tmpfile}.") - files_to_compress[dirname].add(tmpfile) + for dirname, analyzer_reports in unique_reports.items(): + for analyzer_name, reports in analyzer_reports.items(): + if not analyzer_name: + analyzer_name = 'unknown' + _, tmpfile = tempfile.mkstemp(f'-{analyzer_name}.plist') + + report_file.create(tmpfile, reports, checker_labels, + AnalyzerInfo(analyzer_name)) + LOG.debug(f"Stored '{analyzer_name}' unique reports in {tmpfile}.") + files_to_compress[dirname].add(tmpfile) if changed_files: reports_helper.dump_changed_files(changed_files) From 5328dedd4c10459c3e40f2abe6913dbcb685a798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krist=C3=B3f=20Umann?= Date: Thu, 21 Mar 2024 12:58:38 +0100 Subject: 
[PATCH 4/4] Remove the temporary files --- web/client/codechecker_client/cmd/store.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/web/client/codechecker_client/cmd/store.py b/web/client/codechecker_client/cmd/store.py index 6906bb81fa..3b2f639d6c 100644 --- a/web/client/codechecker_client/cmd/store.py +++ b/web/client/codechecker_client/cmd/store.py @@ -490,6 +490,7 @@ def assemble_zip(inputs, file_paths.update(report.original_files) file_report_positions[report.file.original_path].add(report.line) + files_to_delete = [] for dirname, analyzer_reports in unique_reports.items(): for analyzer_name, reports in analyzer_reports.items(): if not analyzer_name: @@ -500,6 +501,7 @@ def assemble_zip(inputs, AnalyzerInfo(analyzer_name)) LOG.debug(f"Stored '{analyzer_name}' unique reports in {tmpfile}.") files_to_compress[dirname].add(tmpfile) + files_to_delete.append(tmpfile) if changed_files: reports_helper.dump_changed_files(changed_files) @@ -640,6 +642,10 @@ def assemble_zip(inputs, LOG.info("Compressing report zip file done (%s / %s).", sizeof_fmt(zip_size), sizeof_fmt(compressed_zip_size)) + # We are responsible for deleting these. + for file in files_to_delete: + os.remove(file) + def should_be_zipped(input_file: str, input_files: Iterable[str]) -> bool: """