From b2e39ac21bdf57a5e12d01d94b2251893fb28411 Mon Sep 17 00:00:00 2001
From: Luni-4 <luni-4@hotmail.it>
Date: Tue, 22 Dec 2020 15:40:19 +0100
Subject: [PATCH 1/3] Split script to update a submodule in two scripts

Since a CI system uses the commit present in a repository that already
contains the differences, it is not necessary to update the
tree-sitter-language submodule on CI.
---
 update-language-bindings.sh | 10 ++++++++++
 update-submodule.sh         | 11 +++++++++++
 update-sumbodules.sh        | 17 -----------------
 3 files changed, 21 insertions(+), 17 deletions(-)
 create mode 100755 update-language-bindings.sh
 create mode 100755 update-submodule.sh
 delete mode 100755 update-sumbodules.sh

diff --git a/update-language-bindings.sh b/update-language-bindings.sh
new file mode 100755
index 000000000..3a6a00657
--- /dev/null
+++ b/update-language-bindings.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Regenerate the language bindings; abort if the enums crate is missing
+pushd enums || exit 1
+cargo clean
+cargo run -- -lrust -o ../src/languages
+popd || exit 1
+
+# Format the generated code
+cargo fmt
diff --git a/update-submodule.sh b/update-submodule.sh
new file mode 100755
index 000000000..047d972cf
--- /dev/null
+++ b/update-submodule.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Update a tree-sitter submodule and regenerate its bindings
+#
+# Usage: ./update-submodule.sh $tree-sitter-language
+
+# Update submodule; fail fast when the argument is missing or git fails
+git submodule update --remote "${1:?usage: $0 <tree-sitter-language>}" || exit 1
+
+# Generate the updated grammar for the submodule
+./update-language-bindings.sh
diff --git a/update-sumbodules.sh b/update-sumbodules.sh
deleted file mode 100755
index 0577a4485..000000000
--- a/update-sumbodules.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-# Update tree-sitter submodules
-#
-# Usage: ./update-sumbodules.sh $tree-sitter-language
-
-# Update submodule
-git submodule update --remote $1
-
-# Recreate the language
-pushd enums
-cargo clean
-cargo run -- -lrust -o ../src/languages
-popd
-
-# Format the code
-cargo fmt

From a6c76d710737a365763a44c869a8fb7b056617df Mon Sep 17 00:00:00 2001
From: Luni-4 <luni-4@hotmail.it>
Date: Wed, 23 Dec 2020 16:59:30 +0100
Subject: [PATCH 2/3] Add a new command to compute metrics on ci systems

---
 check-submodule.py | 90 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 86 insertions(+), 4 deletions(-)

diff --git a/check-submodule.py b/check-submodule.py
index 44781468b..74ff41193 100755
--- a/check-submodule.py
+++ b/check-submodule.py
@@ -13,6 +13,10 @@
 
 NOTE: The compute-metrics subcommand MUST be run on a clean master branch!
 
+To compute metrics on a continuous integration system:
+
+./check-submodule.py compute-ci-metrics -p LOCAL_DIR -l TREE_SITTER_LANGUAGE
+
 
 To compare metrics and retrieve minimal tests:
 
@@ -108,11 +112,16 @@ def run_subprocess(cmd: str, *args: T.Union[str, pathlib.Path]) -> None:
 
 # Run rust-code-analysis on the chosen repository to compute metrics.
 def run_rca(
-    repo_dir: pathlib.Path, output_dir: pathlib.Path, include_languages: T.List[str]
+    repo_dir: pathlib.Path,
+    output_dir: pathlib.Path,
+    manifest_path: T.Optional[pathlib.Path],
+    include_languages: T.List[str],
 ) -> None:
     run_subprocess(
         "cargo",
         "run",
+        "--manifest-path",
+        manifest_path / "Cargo.toml" if manifest_path else "Cargo.toml",
         "--release",
         "--package",
         "rust-code-analysis-cli",
@@ -229,6 +238,54 @@ def save_diff_files(
     asyncio.run(json_diff.diff())
 
 
+# Compute continuous integration metrics before and after a
+# tree-sitter-language update.
+def compute_ci_metrics(args: argparse.Namespace) -> None:
+
+    if args.language not in EXTENSIONS:
+        print(args.language, "is not a valid tree-sitter-language")
+        sys.exit(1)
+
+    # Repository whose source files are analyzed (the -p/--path argument)
+    repo_dir = pathlib.Path(args.path)
+
+    # Location of the reference rust-code-analysis clone (master branch)
+    rca_path = WORKDIR / "rust-code-analysis"
+
+    # Old metrics directory
+    old_dir = WORKDIR / (args.language + OLD_SUFFIX)
+    # New metrics directory
+    new_dir = WORKDIR / (args.language + NEW_SUFFIX)
+
+    # Create output directories
+    old_dir.mkdir(parents=True, exist_ok=True)
+    new_dir.mkdir(parents=True, exist_ok=True)
+
+    # Git clone rust-code-analysis master branch repository
+    print(f"Cloning rust-code-analysis master branch into {WORKDIR}")
+    run_subprocess(
+        "git",
+        "clone",
+        "--depth=1",
+        "--recurse-submodules",
+        "-j8",
+        "https://github.com/mozilla/rust-code-analysis",
+        rca_path,
+    )
+
+    # Compute old metrics using the Cargo.toml of the cloned master checkout
+    print("\nComputing metrics before the update and saving them in", old_dir)
+    run_rca(repo_dir, old_dir, rca_path, EXTENSIONS[args.language])
+
+    # Regenerate the language bindings in the current checkout
+    print("\nUpdate", args.language)
+    run_subprocess("./update-language-bindings.sh")
+
+    # Compute new metrics using the Cargo.toml of the current checkout
+    print("\nComputing metrics after the update and saving them in", new_dir)
+    run_rca(repo_dir, new_dir, None, EXTENSIONS[args.language])
+
+
 # Compute metrics before and after a tree-sitter-language update.
 def compute_metrics(args: argparse.Namespace) -> None:
 
@@ -257,7 +314,7 @@ def compute_metrics(args: argparse.Namespace) -> None:
 
         # Compute old metrics
         print("\nComputing metrics before the update and saving them in", old_dir)
-        run_rca(repo_dir, old_dir, EXTENSIONS[args.language])
+        run_rca(repo_dir, old_dir, None, EXTENSIONS[args.language])
 
         # Create a new branch
         print("\nCreate a new branch called", args.language)
@@ -265,11 +322,11 @@ def compute_metrics(args: argparse.Namespace) -> None:
 
         # Update tree-sitter-language submodule
         print("\nUpdate", args.language)
-        run_subprocess("./update-sumbodules.sh", args.language)
+        run_subprocess("./update-submodule.sh", args.language)
 
     # Compute new metrics
     print("\nComputing metrics after the update and saving them in", new_dir)
-    run_rca(repo_dir, new_dir, EXTENSIONS[args.language])
+    run_rca(repo_dir, new_dir, None, EXTENSIONS[args.language])
 
 
 # Compare metrics and dump the differences whether there are some.
@@ -342,6 +399,31 @@ def main() -> None:
     )
     compute_metrics_cmd.set_defaults(func=compute_metrics)
 
+    # Compute continuous integration metrics command
+    compute_ci_metrics_cmd = commands.add_parser(
+        "compute-ci-metrics",
+        help="Computes the metrics of a chosen repository before and after "
+        "a tree-sitter-language update on a continuous integration system.",
+    )
+
+    compute_ci_metrics_cmd.add_argument(
+        "-p",
+        "--path",
+        type=str,
+        required=True,
+        help="Path where the rust-code-analysis repository is saved on the "
+        "continuous integration system",
+    )
+    compute_ci_metrics_cmd.add_argument(
+        "-l",
+        "--language",
+        type=str,
+        required=True,
+        help="tree-sitter-language to be updated",
+    )
+
+    compute_ci_metrics_cmd.set_defaults(func=compute_ci_metrics)
+
     # Compare metrics command
     compare_metrics_cmd = commands.add_parser(
         "compare-metrics",

From cf3a2e795b0d87321e91fab251c17e4b3b7468d7 Mon Sep 17 00:00:00 2001
From: Luni-4 <luni-4@hotmail.it>
Date: Wed, 23 Dec 2020 17:05:22 +0100
Subject: [PATCH 3/3] Use external software to compare metrics

The two programs now used to compare metrics are written in Rust and
provide a considerable speedup on large repositories.
---
 check-submodule.py | 160 ++++-----------------------------------------
 1 file changed, 13 insertions(+), 147 deletions(-)

diff --git a/check-submodule.py b/check-submodule.py
index 74ff41193..ae16b997d 100755
--- a/check-submodule.py
+++ b/check-submodule.py
@@ -17,26 +17,23 @@
 
 ./check-submodule.py compute-ci-metrics -p LOCAL_DIR -l TREE_SITTER_LANGUAGE
 
+To compare metrics and retrieve the structural JSON of differences
+in addition to the files containing the minimal tests:
 
-To compare metrics and retrieve minimal tests:
-
-1. Install deepdiff: pip install deepdiff
+1. Install json-diff from here: https://github.com/Luni-4/json-diff/releases
+2. Install json-minimal-tests from here: https://github.com/Luni-4/json-minimal-tests/releases
 
 ./check-submodule.py compare-metrics -l TREE_SITTER_LANGUAGE
+
+NOTE: Add the paths of the software above to the PATH environment variable!
 """
 
 import argparse
-import asyncio
-import json
-import math
 import pathlib
-import re
 import subprocess
 import sys
 import typing as T
 
-import deepdiff
-
 # The /tmp directory will be used as workdir
 WORKDIR = pathlib.Path("/tmp")
 # Suffix for the directory containing the old metrics
@@ -69,42 +66,6 @@
     "tree-sitter-python": ["*.py"],
 }
 
-
-class JsonDiff:
-    def __init__(
-        self,
-        old_metrics: T.List[pathlib.Path],
-        new_metrics: T.List[pathlib.Path],
-        compare_dir: pathlib.Path,
-        max_workers: int,
-    ):
-        self.compare_dir = compare_dir
-        self.max_workers = max_workers
-
-        # Max number of file paths in a sublist
-        n = math.ceil(len(old_metrics) / max_workers)
-
-        # Assign a certain number of filepaths to each worker
-        self.workers_filepaths = [
-            zip(old_metrics[i * n : (i + 1) * n], new_metrics[i * n : (i + 1) * n])
-            for i in range((len(old_metrics) + n - 1) // n)
-        ]
-
-    # Run asynchronous comparisons between json files.
-    async def diff(self):
-        # Save minimal tests in the chosen directory.
-        def _worker(worker_list: T.List[pathlib.Path]):
-            for old_filename, new_filename in worker_list:
-
-                # Compute minimal tests
-                compute_minimal_tests(old_filename, new_filename, self.compare_dir)
-
-        # Define the max number of coroutines used to compare json files
-        await asyncio.gather(
-            *(_worker(worker_filepaths) for worker_filepaths in self.workers_filepaths)
-        )
-
-
 # Run a subprocess.
 def run_subprocess(cmd: str, *args: T.Union[str, pathlib.Path]) -> None:
     subprocess.run([cmd, *args])
@@ -138,106 +99,6 @@ def run_rca(
     )
 
 
-# Find the difference between the two json metric files.
-def get_json_diff(
-    first_file: pathlib.Path, second_file: pathlib.Path
-) -> T.Tuple[T.Dict[str, T.Any], T.Dict[str, T.Any]]:
-    with open(first_file, "r") as input_file:
-        t1 = json.load(input_file)
-
-    with open(second_file, "r") as input_file:
-        t2 = json.load(input_file)
-
-    diff = deepdiff.DeepDiff(t1, t2, ignore_order=True)
-
-    return (t1, diff)
-
-
-# Save the filename and the list of code spans associated to the differences
-# in a dictionary.
-def get_metrics_diff_span(
-    first_json: T.Dict[str, T.Any], diff: T.Dict[str, T.Any]
-) -> T.Dict[str, T.List[T.Tuple[int, int]]]:
-    # Search for this pattern in the differences object
-    prog = re.compile(r"\['spaces'\]\[\d+\]")
-
-    output = {"name": first_json["name"], "spaces_spans": []}
-
-    for value in diff["values_changed"]:
-        val = "".join(prog.findall(value))
-        # Subtracting one because files starts from 0
-        start_line = eval(f'first_json{val}["start_line"]') - 1
-        end_line = eval(f'first_json{val}["end_line"]')
-        output["spaces_spans"].append((start_line, end_line))
-
-    # Print the path of the repository file containing the differences
-    print(first_json["name"])
-
-    return output
-
-
-# Dump minimal tests code in an output file.
-def dump_minimal_tests(
-    code_spans_object: T.Dict[str, T.List[T.Tuple[int, int]]],
-    new_filename: pathlib.Path,
-    compare_dir: pathlib.Path,
-) -> None:
-    # Remove duplicates from the list of spans
-    spans_list = dict.fromkeys(code_spans_object["spaces_spans"])
-
-    # Get filename
-    filename = code_spans_object["name"]
-
-    # Read code spans from the input source code
-    with open(filename, "r", encoding="utf-8", errors="ignore") as input_file:
-        # Decode only utf-8 source code files
-        lines = input_file.readlines()
-
-    # Write spans to output file
-    output_path = compare_dir / new_filename.stem
-    with open(output_path, "w") as output_file:
-        for span in spans_list:
-            output_file.write("Minimal test:\n")
-            output_file.write("".join(lines[span[0] : span[1]]) + "\n")
-
-
-# Compute minimal tests.
-def compute_minimal_tests(
-    old_filename: pathlib.Path, new_filename: pathlib.Path, compare_dir: pathlib.Path
-) -> None:
-    # Find the difference between the two json files with the aim of
-    # getting some minimal tests
-    first_json, diff = get_json_diff(old_filename, new_filename)
-
-    # If two json files are identical, return
-    if not diff:
-        return
-
-    # Retrieve the code spans associated to the differences
-    code_spans_object = get_metrics_diff_span(first_json, diff)
-
-    # Dump the minimal tests retrived from code spans on a file with the
-    # same extension of the analyzed source code
-    dump_minimal_tests(code_spans_object, new_filename, compare_dir)
-
-
-# Save json files of differences and minimal tests in the chosen directory
-# concurrently.
-def save_diff_files(
-    old_dir: pathlib.Path, new_dir: pathlib.Path, compare_dir: pathlib.Path
-) -> None:
-    # Get all metric files in old and new directories
-    old_paths = sorted(pathlib.Path(old_dir).glob("*.json"))
-    new_paths = sorted(pathlib.Path(new_dir).glob("*.json"))
-
-    # Create a new coroutines handler
-    json_diff = JsonDiff(old_paths, new_paths, compare_dir, 4)
-
-    # Find the differences between json files and save the results in a
-    # chosen directory asynchronously
-    asyncio.run(json_diff.diff())
-
-
 # Compute continuous integration metrics before and after a
 # tree-sitter-language update.
 def compute_ci_metrics(args: argparse.Namespace) -> None:
@@ -342,8 +203,13 @@ def compare_metrics(args: argparse.Namespace) -> None:
     # Create compare directory
     compare_dir.mkdir(parents=True, exist_ok=True)
 
-    # Save files of differences and minimal tests in the chosen directory
-    save_diff_files(old_dir, new_dir, compare_dir)
+    # Get JSON of differences
+    print("\nSave JSON of differences in", compare_dir)
+    run_subprocess("json-diff-cli", "--raw-json", "-o", compare_dir, old_dir, new_dir)
+
+    # Get minimal tests
+    print("\nSave minimal tests in", compare_dir)
+    run_subprocess("json-minimal-tests", "-o", compare_dir, old_dir, new_dir)
 
 
 def main() -> None: