From b2e39ac21bdf57a5e12d01d94b2251893fb28411 Mon Sep 17 00:00:00 2001 From: Luni-4 Date: Tue, 22 Dec 2020 15:40:19 +0100 Subject: [PATCH 1/3] Split script to update a submodule in two scripts Since a CI system uses the commit present in a repository that already contains the differences, it is not necessary to update the tree-sitter-language submodule --- update-language-bindings.sh | 10 ++++++++++ update-submodule.sh | 11 +++++++++++ update-sumbodules.sh | 17 ----------------- 3 files changed, 21 insertions(+), 17 deletions(-) create mode 100755 update-language-bindings.sh create mode 100755 update-submodule.sh delete mode 100755 update-sumbodules.sh diff --git a/update-language-bindings.sh b/update-language-bindings.sh new file mode 100755 index 000000000..3a6a00657 --- /dev/null +++ b/update-language-bindings.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# Recreate the language +pushd enums +cargo clean +cargo run -- -lrust -o ../src/languages +popd + +# Format the code +cargo fmt diff --git a/update-submodule.sh b/update-submodule.sh new file mode 100755 index 000000000..047d972cf --- /dev/null +++ b/update-submodule.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Update tree-sitter submodule +# +# Usage: ./update-submodule.sh $tree-sitter-language + +# Update submodule +git submodule update --remote $1 + +# Generate the updated grammar for the submodule +./update-language-bindings.sh diff --git a/update-sumbodules.sh b/update-sumbodules.sh deleted file mode 100755 index 0577a4485..000000000 --- a/update-sumbodules.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -# Update tree-sitter submodules -# -# Usage: ./update-sumbodules.sh $tree-sitter-language - -# Update submodule -git submodule update --remote $1 - -# Recreate the language -pushd enums -cargo clean -cargo run -- -lrust -o ../src/languages -popd - -# Format the code -cargo fmt From a6c76d710737a365763a44c869a8fb7b056617df Mon Sep 17 00:00:00 2001 From: Luni-4 Date: Wed, 23 Dec 2020 16:59:30 +0100 Subject: [PATCH 2/3] Add a new 
command to compute metrics on ci systems --- check-submodule.py | 90 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 86 insertions(+), 4 deletions(-) diff --git a/check-submodule.py b/check-submodule.py index 44781468b..74ff41193 100755 --- a/check-submodule.py +++ b/check-submodule.py @@ -13,6 +13,10 @@ NOTE: The compute-metrics subcommand MUST be run on a clean master branch! +To compute metrics on a continuous integration system: + +./check-submodule.py compute-ci-metrics -p LOCAL_DIR -l TREE_SITTER_LANGUAGE + To compare metrics and retrieve minimal tests: @@ -108,11 +112,16 @@ def run_subprocess(cmd: str, *args: T.Union[str, pathlib.Path]) -> None: # Run rust-code-analysis on the chosen repository to compute metrics. def run_rca( - repo_dir: pathlib.Path, output_dir: pathlib.Path, include_languages: T.List[str] + repo_dir: pathlib.Path, + output_dir: pathlib.Path, + manifest_path: T.Optional[pathlib.Path], + include_languages: T.List[str], ) -> None: run_subprocess( "cargo", "run", + "--manifest-path", + manifest_path / "Cargo.toml" if manifest_path else "Cargo.toml", "--release", "--package", "rust-code-analysis-cli", @@ -229,6 +238,54 @@ def save_diff_files( asyncio.run(json_diff.diff()) +# Compute continuous integration metrics before and after a +# tree-sitter-language update. 
+def compute_ci_metrics(args: argparse.Namespace) -> None: + + if args.language not in EXTENSIONS.keys(): + print(args.language, "is not a valid tree-sitter-language") + sys.exit(1) + + # Repository passed as input + repo_dir = pathlib.Path(args.path) + + # Create rust-code-analysis repository path + rca_path = WORKDIR / "rust-code-analysis" + + # Old metrics directory + old_dir = WORKDIR / (args.language + OLD_SUFFIX) + # New metrics directory + new_dir = WORKDIR / (args.language + NEW_SUFFIX) + + # Create output directories + old_dir.mkdir(parents=True, exist_ok=True) + new_dir.mkdir(parents=True, exist_ok=True) + + # Git clone rust-code-analysis master branch repository + print(f"Cloning rust-code-analysis master branch into /tmp") + run_subprocess( + "git", + "clone", + "--depth=1", + "--recurse-submodules", + "-j8", + "https://github.com/mozilla/rust-code-analysis", + rca_path, + ) + + # Compute old metrics + print("\nComputing metrics before the update and saving them in", old_dir) + run_rca(repo_dir, old_dir, rca_path, EXTENSIONS[args.language]) + + # Update tree-sitter-language submodule + print("\nUpdate", args.language) + run_subprocess("./update-language-bindings.sh") + + # Compute new metrics + print("\nComputing metrics after the update and saving them in", new_dir) + run_rca(repo_dir, new_dir, None, EXTENSIONS[args.language]) + + # Compute metrics before and after a tree-sitter-language update. 
def compute_metrics(args: argparse.Namespace) -> None: @@ -257,7 +314,7 @@ def compute_metrics(args: argparse.Namespace) -> None: # Compute old metrics print("\nComputing metrics before the update and saving them in", old_dir) - run_rca(repo_dir, old_dir, EXTENSIONS[args.language]) + run_rca(repo_dir, old_dir, None, EXTENSIONS[args.language]) # Create a new branch print("\nCreate a new branch called", args.language) @@ -265,11 +322,11 @@ def compute_metrics(args: argparse.Namespace) -> None: # Update tree-sitter-language submodule print("\nUpdate", args.language) - run_subprocess("./update-sumbodules.sh", args.language) + run_subprocess("./update-submodule.sh", args.language) # Compute new metrics print("\nComputing metrics after the update and saving them in", new_dir) - run_rca(repo_dir, new_dir, EXTENSIONS[args.language]) + run_rca(repo_dir, new_dir, None, EXTENSIONS[args.language]) # Compare metrics and dump the differences whether there are some. @@ -342,6 +399,31 @@ def main() -> None: ) compute_metrics_cmd.set_defaults(func=compute_metrics) + # Compute continuous integration metrics command + compute_ci_metrics_cmd = commands.add_parser( + "compute-ci-metrics", + help="Computes the metrics of a chosen repository before and after " + "a tree-sitter-language update on a continuous integration system.", + ) + + compute_ci_metrics_cmd.add_argument( + "-p", + "--path", + type=str, + required=True, + help="Path where the rust-code-analysis repository is saved on the " + "continuous integration system", + ) + compute_ci_metrics_cmd.add_argument( + "-l", + "--language", + type=str, + required=True, + help="tree-sitter-language to be updated", + ) + + compute_ci_metrics_cmd.set_defaults(func=compute_ci_metrics) + # Compare metrics command compare_metrics_cmd = commands.add_parser( "compare-metrics", From cf3a2e795b0d87321e91fab251c17e4b3b7468d7 Mon Sep 17 00:00:00 2001 From: Luni-4 Date: Wed, 23 Dec 2020 17:05:22 +0100 Subject: [PATCH 3/3] Use external software to 
compare metrics The two programs now used to compare metrics are written in Rust and provide a considerable speedup on large repositories. --- check-submodule.py | 160 ++++----------------------------------------- 1 file changed, 13 insertions(+), 147 deletions(-) diff --git a/check-submodule.py b/check-submodule.py index 74ff41193..ae16b997d 100755 --- a/check-submodule.py +++ b/check-submodule.py @@ -17,26 +17,23 @@ ./check-submodule.py compute-ci-metrics -p LOCAL_DIR -l TREE_SITTER_LANGUAGE +To compare metrics and retrieve the structural JSON of differences +in addition to the files containing the minimal tests: -To compare metrics and retrieve minimal tests: - -1. Install deepdiff: pip install deepdiff +1. Install json-diff from here: https://github.com/Luni-4/json-diff/releases +2. Install json-minimal-tests from here: https://github.com/Luni-4/json-minimal-tests/releases ./check-submodule.py compare-metrics -l TREE_SITTER_LANGUAGE + +NOTE: Add the paths of the software above to the PATH environment variable! """ import argparse -import asyncio -import json -import math import pathlib -import re import subprocess import sys import typing as T -import deepdiff - # The /tmp directory will be used as workdir WORKDIR = pathlib.Path("/tmp") # Suffix for the directory containing the old metrics @@ -69,42 +66,6 @@ "tree-sitter-python": ["*.py"], } - -class JsonDiff: - def __init__( - self, - old_metrics: T.List[pathlib.Path], - new_metrics: T.List[pathlib.Path], - compare_dir: pathlib.Path, - max_workers: int, - ): - self.compare_dir = compare_dir - self.max_workers = max_workers - - # Max number of file paths in a sublist - n = math.ceil(len(old_metrics) / max_workers) - - # Assign a certain number of filepaths to each worker - self.workers_filepaths = [ - zip(old_metrics[i * n : (i + 1) * n], new_metrics[i * n : (i + 1) * n]) - for i in range((len(old_metrics) + n - 1) // n) - ] - - # Run asynchronous comparisons between json files. 
- async def diff(self): - # Save minimal tests in the chosen directory. - def _worker(worker_list: T.List[pathlib.Path]): - for old_filename, new_filename in worker_list: - - # Compute minimal tests - compute_minimal_tests(old_filename, new_filename, self.compare_dir) - - # Define the max number of coroutines used to compare json files - await asyncio.gather( - *(_worker(worker_filepaths) for worker_filepaths in self.workers_filepaths) - ) - - # Run a subprocess. def run_subprocess(cmd: str, *args: T.Union[str, pathlib.Path]) -> None: subprocess.run([cmd, *args]) @@ -138,106 +99,6 @@ def run_rca( ) -# Find the difference between the two json metric files. -def get_json_diff( - first_file: pathlib.Path, second_file: pathlib.Path -) -> T.Tuple[T.Dict[str, T.Any], T.Dict[str, T.Any]]: - with open(first_file, "r") as input_file: - t1 = json.load(input_file) - - with open(second_file, "r") as input_file: - t2 = json.load(input_file) - - diff = deepdiff.DeepDiff(t1, t2, ignore_order=True) - - return (t1, diff) - - -# Save the filename and the list of code spans associated to the differences -# in a dictionary. -def get_metrics_diff_span( - first_json: T.Dict[str, T.Any], diff: T.Dict[str, T.Any] -) -> T.Dict[str, T.List[T.Tuple[int, int]]]: - # Search for this pattern in the differences object - prog = re.compile(r"\['spaces'\]\[\d+\]") - - output = {"name": first_json["name"], "spaces_spans": []} - - for value in diff["values_changed"]: - val = "".join(prog.findall(value)) - # Subtracting one because files starts from 0 - start_line = eval(f'first_json{val}["start_line"]') - 1 - end_line = eval(f'first_json{val}["end_line"]') - output["spaces_spans"].append((start_line, end_line)) - - # Print the path of the repository file containing the differences - print(first_json["name"]) - - return output - - -# Dump minimal tests code in an output file. 
-def dump_minimal_tests( - code_spans_object: T.Dict[str, T.List[T.Tuple[int, int]]], - new_filename: pathlib.Path, - compare_dir: pathlib.Path, -) -> None: - # Remove duplicates from the list of spans - spans_list = dict.fromkeys(code_spans_object["spaces_spans"]) - - # Get filename - filename = code_spans_object["name"] - - # Read code spans from the input source code - with open(filename, "r", encoding="utf-8", errors="ignore") as input_file: - # Decode only utf-8 source code files - lines = input_file.readlines() - - # Write spans to output file - output_path = compare_dir / new_filename.stem - with open(output_path, "w") as output_file: - for span in spans_list: - output_file.write("Minimal test:\n") - output_file.write("".join(lines[span[0] : span[1]]) + "\n") - - -# Compute minimal tests. -def compute_minimal_tests( - old_filename: pathlib.Path, new_filename: pathlib.Path, compare_dir: pathlib.Path -) -> None: - # Find the difference between the two json files with the aim of - # getting some minimal tests - first_json, diff = get_json_diff(old_filename, new_filename) - - # If two json files are identical, return - if not diff: - return - - # Retrieve the code spans associated to the differences - code_spans_object = get_metrics_diff_span(first_json, diff) - - # Dump the minimal tests retrived from code spans on a file with the - # same extension of the analyzed source code - dump_minimal_tests(code_spans_object, new_filename, compare_dir) - - -# Save json files of differences and minimal tests in the chosen directory -# concurrently. 
-def save_diff_files( - old_dir: pathlib.Path, new_dir: pathlib.Path, compare_dir: pathlib.Path -) -> None: - # Get all metric files in old and new directories - old_paths = sorted(pathlib.Path(old_dir).glob("*.json")) - new_paths = sorted(pathlib.Path(new_dir).glob("*.json")) - - # Create a new coroutines handler - json_diff = JsonDiff(old_paths, new_paths, compare_dir, 4) - - # Find the differences between json files and save the results in a - # chosen directory asynchronously - asyncio.run(json_diff.diff()) - - # Compute continuous integration metrics before and after a # tree-sitter-language update. def compute_ci_metrics(args: argparse.Namespace) -> None: @@ -342,8 +203,13 @@ def compare_metrics(args: argparse.Namespace) -> None: # Create compare directory compare_dir.mkdir(parents=True, exist_ok=True) - # Save files of differences and minimal tests in the chosen directory - save_diff_files(old_dir, new_dir, compare_dir) + # Get JSON of differences + print("\nSave JSON of differences in", compare_dir) + run_subprocess("json-diff-cli", "--raw-json", "-o", compare_dir, old_dir, new_dir) + + # Get minimal tests + print("\nSave minimal tests in", compare_dir) + run_subprocess("json-minimal-tests", "-o", compare_dir, old_dir, new_dir) def main() -> None: