Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 87 additions & 139 deletions check-submodule.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,27 @@

NOTE: The compute-metrics subcommand MUST be run on a clean master branch!

To compute metrics on a continuous integration system:

To compare metrics and retrieve minimal tests:
./check-submodule.py compute-ci-metrics -p LOCAL_DIR -l TREE_SITTER_LANGUAGE

1. Install deepdiff: pip install deepdiff
To compare metrics and retrieve the structural JSON of differences
in addition to the files containing the minimal tests:

1. Install json-diff from here: https://github.com/Luni-4/json-diff/releases
2. Install json-minimal-tests from here: https://github.com/Luni-4/json-minimal-tests/releases

./check-submodule.py compare-metrics -l TREE_SITTER_LANGUAGE

NOTE: Add the paths of the software above to the PATH environment variable!
"""

import argparse
import asyncio
import json
import math
import pathlib
import re
import subprocess
import sys
import typing as T

import deepdiff

# The /tmp directory will be used as workdir
WORKDIR = pathlib.Path("/tmp")
# Suffix for the directory containing the old metrics
Expand Down Expand Up @@ -65,54 +66,23 @@
"tree-sitter-python": ["*.py"],
}


class JsonDiff:
    """Split pairs of old/new JSON metric files into per-worker batches.

    Each batch is later handed to a worker that computes the minimal tests
    for every (old, new) pair it contains and saves them in ``compare_dir``.

    Attributes:
        compare_dir: Directory where the minimal tests are saved.
        max_workers: Requested number of workers.
        workers_filepaths: One list of (old_path, new_path) tuples per worker.
    """

    def __init__(
        self,
        old_metrics: T.List[pathlib.Path],
        new_metrics: T.List[pathlib.Path],
        compare_dir: pathlib.Path,
        max_workers: int,
    ):
        self.compare_dir = compare_dir
        self.max_workers = max_workers

        total = len(old_metrics)

        # Max number of file paths in a sublist. Clamped to at least 1 so
        # that neither an empty input nor a non-positive max_workers can
        # cause a division by zero.
        chunk_size = max(1, math.ceil(total / max(1, max_workers)))

        # Assign a certain number of filepaths to each worker. The pairs are
        # materialised as lists because a bare zip object can only be
        # consumed once.
        self.workers_filepaths = [
            list(
                zip(
                    old_metrics[start : start + chunk_size],
                    new_metrics[start : start + chunk_size],
                )
            )
            for start in range(0, total, chunk_size)
        ]

    # Run asynchronous comparisons between json files.
    async def diff(self):
        """Compute the minimal tests for every batch of file pairs.

        The worker must be a coroutine function: asyncio.gather only accepts
        awaitables, and a plain function would hand it its None results.
        The work inside each worker is blocking, so the batches are
        processed one after another in the order they were created.
        """

        # Save minimal tests in the chosen directory.
        async def _worker(
            worker_list: T.Iterable[T.Tuple[pathlib.Path, pathlib.Path]]
        ) -> None:
            for old_filename, new_filename in worker_list:
                # Compute minimal tests
                compute_minimal_tests(old_filename, new_filename, self.compare_dir)

        # One coroutine per batch of file pairs
        await asyncio.gather(
            *(_worker(worker_filepaths) for worker_filepaths in self.workers_filepaths)
        )


# Launch an external command and wait for it to finish.
def run_subprocess(cmd: str, *args: T.Union[str, pathlib.Path]) -> None:
    """Run ``cmd`` with the given arguments as a child process.

    The command is passed as an argument list, so no shell is involved.
    The exit status of the child process is not checked.
    """
    command_line = [cmd]
    command_line.extend(args)
    subprocess.run(command_line)


# Run rust-code-analysis on the chosen repository to compute metrics.
def run_rca(
repo_dir: pathlib.Path, output_dir: pathlib.Path, include_languages: T.List[str]
repo_dir: pathlib.Path,
output_dir: pathlib.Path,
manifest_path: T.Optional[pathlib.Path],
include_languages: T.List[str],
) -> None:
run_subprocess(
"cargo",
"run",
"--manifest-path",
manifest_path / "Cargo.toml" if manifest_path else "Cargo.toml",
"--release",
"--package",
"rust-code-analysis-cli",
Expand All @@ -129,104 +99,52 @@ def run_rca(
)


# Load two json metric files and compute the difference between them.
def get_json_diff(
    first_file: pathlib.Path, second_file: pathlib.Path
) -> T.Tuple[T.Dict[str, T.Any], T.Dict[str, T.Any]]:
    """Return the content of ``first_file`` and its diff against ``second_file``.

    Both files are parsed as JSON and compared with deepdiff, ignoring the
    order of the items. The first element of the returned tuple is the
    parsed content of ``first_file``, the second is the DeepDiff result.
    """
    documents = []
    for metrics_path in (first_file, second_file):
        with open(metrics_path, "r") as input_file:
            documents.append(json.load(input_file))

    old_document, new_document = documents

    return (
        old_document,
        deepdiff.DeepDiff(old_document, new_document, ignore_order=True),
    )


# Save the filename and the list of code spans associated to the differences
# in a dictionary.
def get_metrics_diff_span(
    first_json: T.Dict[str, T.Any], diff: T.Dict[str, T.Any]
) -> T.Dict[str, T.List[T.Tuple[int, int]]]:
    """Collect the code spans of the spaces whose metric values changed.

    Args:
        first_json: Parsed metrics of a source file. The function reads its
            "name" key and, for each space reached through nested "spaces"
            lists, the "start_line" and "end_line" keys.
        diff: Differences object. Only the keys of its "values_changed"
            entry are read; each key is a path such as
            ``root['spaces'][0]['spaces'][1]['metrics']...``.

    Returns:
        A dictionary with the file name under "name" and the list of
        (start_line - 1, end_line) tuples under "spaces_spans". The list is
        empty when the differences contain no "values_changed" entry.
    """
    # Capture the index of every ['spaces'][N] step found in a changed path
    space_step = re.compile(r"\['spaces'\]\[(\d+)\]")

    output = {"name": first_json["name"], "spaces_spans": []}

    # A differences object made only of added/removed items has no
    # "values_changed" entry, so fall back to an empty sequence
    for changed_path in diff.get("values_changed", ()):
        # Walk the nested spaces by index instead of evaluating a string
        # built from the path. A path without any ['spaces'] step refers to
        # the top-level space.
        space = first_json
        for index in space_step.findall(changed_path):
            space = space["spaces"][int(index)]

        # Subtract one because line indexes in a file start from 0
        output["spaces_spans"].append((space["start_line"] - 1, space["end_line"]))

    # Print the path of the repository file containing the differences
    print(first_json["name"])

    return output


# Dump minimal tests code in an output file.
def dump_minimal_tests(
    code_spans_object: T.Dict[str, T.List[T.Tuple[int, int]]],
    new_filename: pathlib.Path,
    compare_dir: pathlib.Path,
) -> None:
    """Write the source lines of each code span to a file in ``compare_dir``.

    Args:
        code_spans_object: Dictionary holding the path of the source file
            under "name" and its (start, end) line spans under
            "spaces_spans".
        new_filename: Path of a metrics file; its stem is used as the name
            of the output file.
        compare_dir: Directory where the output file is written.
    """
    # Remove duplicates from the list of spans, keeping their first
    # occurrence order
    unique_spans = dict.fromkeys(code_spans_object["spaces_spans"])

    # Get filename
    source_path = code_spans_object["name"]

    # Read the source code as utf-8, dropping the bytes that cannot be
    # decoded
    with open(source_path, "r", encoding="utf-8", errors="ignore") as input_file:
        lines = input_file.readlines()

    # Write spans to output file. The encoding is explicit so that the
    # non-ASCII characters read above can always be written back, whatever
    # the default encoding of the locale is.
    output_path = compare_dir / new_filename.stem
    with open(output_path, "w", encoding="utf-8") as output_file:
        for start, end in unique_spans:
            output_file.write("Minimal test:\n")
            output_file.write("".join(lines[start:end]) + "\n")
# Compute continuous integration metrics before and after a
# tree-sitter-language update.
def compute_ci_metrics(args: argparse.Namespace) -> None:

if args.language not in EXTENSIONS.keys():
print(args.language, "is not a valid tree-sitter-language")
sys.exit(1)

# Compute minimal tests.
def compute_minimal_tests(
old_filename: pathlib.Path, new_filename: pathlib.Path, compare_dir: pathlib.Path
) -> None:
# Find the difference between the two json files with the aim of
# getting some minimal tests
first_json, diff = get_json_diff(old_filename, new_filename)
# Repository passed as input
repo_dir = pathlib.Path(args.path)

# If two json files are identical, return
if not diff:
return
# Create rust-code-analysis repository path
rca_path = WORKDIR / "rust-code-analysis"

# Retrieve the code spans associated to the differences
code_spans_object = get_metrics_diff_span(first_json, diff)
# Old metrics directory
old_dir = WORKDIR / (args.language + OLD_SUFFIX)
# New metrics directory
new_dir = WORKDIR / (args.language + NEW_SUFFIX)

# Dump the minimal tests retrieved from the code spans to a file with the
# same extension as the analyzed source code
dump_minimal_tests(code_spans_object, new_filename, compare_dir)
# Create output directories
old_dir.mkdir(parents=True, exist_ok=True)
new_dir.mkdir(parents=True, exist_ok=True)

# Git clone rust-code-analysis master branch repository
print(f"Cloning rust-code-analysis master branch into /tmp")
run_subprocess(
"git",
"clone",
"--depth=1",
"--recurse-submodules",
"-j8",
"https://github.com/mozilla/rust-code-analysis",
rca_path,
)
Comment on lines +126 to +135
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already have a local clone; we can just reuse it by checking out master instead of recloning.

Copy link
Collaborator Author

@Luni-4 Luni-4 Dec 23, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some CI systems fetch a single branch, so you would need to reset commits and do complicated things to retrieve the master branch's commits. This solution could potentially be used on any CI system. It takes less than a second, so we could keep it, IMHO.


# Save json files of differences and minimal tests in the chosen directory
# concurrently.
def save_diff_files(
old_dir: pathlib.Path, new_dir: pathlib.Path, compare_dir: pathlib.Path
) -> None:
# Get all metric files in old and new directories
old_paths = sorted(pathlib.Path(old_dir).glob("*.json"))
new_paths = sorted(pathlib.Path(new_dir).glob("*.json"))
# Compute old metrics
print("\nComputing metrics before the update and saving them in", old_dir)
run_rca(repo_dir, old_dir, rca_path, EXTENSIONS[args.language])

# Create a new coroutines handler
json_diff = JsonDiff(old_paths, new_paths, compare_dir, 4)
# Update tree-sitter-language submodule
print("\nUpdate", args.language)
run_subprocess("./update-language-bindings.sh")

# Find the differences between json files and save the results in a
# chosen directory asynchronously
asyncio.run(json_diff.diff())
# Compute new metrics
print("\nComputing metrics after the update and saving them in", new_dir)
run_rca(repo_dir, new_dir, None, EXTENSIONS[args.language])


# Compute metrics before and after a tree-sitter-language update.
Expand Down Expand Up @@ -257,19 +175,19 @@ def compute_metrics(args: argparse.Namespace) -> None:

# Compute old metrics
print("\nComputing metrics before the update and saving them in", old_dir)
run_rca(repo_dir, old_dir, EXTENSIONS[args.language])
run_rca(repo_dir, old_dir, None, EXTENSIONS[args.language])

# Create a new branch
print("\nCreate a new branch called", args.language)
run_subprocess("git", "checkout", "-B", args.language)

# Update tree-sitter-language submodule
print("\nUpdate", args.language)
run_subprocess("./update-sumbodules.sh", args.language)
run_subprocess("./update-submodule.sh", args.language)

# Compute new metrics
print("\nComputing metrics after the update and saving them in", new_dir)
run_rca(repo_dir, new_dir, EXTENSIONS[args.language])
run_rca(repo_dir, new_dir, None, EXTENSIONS[args.language])


# Compare metrics and dump the differences, if there are any.
Expand All @@ -285,8 +203,13 @@ def compare_metrics(args: argparse.Namespace) -> None:
# Create compare directory
compare_dir.mkdir(parents=True, exist_ok=True)

# Save files of differences and minimal tests in the chosen directory
save_diff_files(old_dir, new_dir, compare_dir)
# Get JSON of differences
print("\nSave JSON of differences in", compare_dir)
run_subprocess("json-diff-cli", "--raw-json", "-o", compare_dir, old_dir, new_dir)

# Get minimal tests
print("\nSave minimal tests in", compare_dir)
run_subprocess("json-minimal-tests", "-o", compare_dir, old_dir, new_dir)


def main() -> None:
Expand Down Expand Up @@ -342,6 +265,31 @@ def main() -> None:
)
compute_metrics_cmd.set_defaults(func=compute_metrics)

# Compute continuous integration metrics command
compute_ci_metrics_cmd = commands.add_parser(
"compute-ci-metrics",
help="Computes the metrics of a chosen repository before and after "
"a tree-sitter-language update on a continuous integration system.",
)

compute_ci_metrics_cmd.add_argument(
"-p",
"--path",
type=str,
required=True,
help="Path where the rust-code-analysis repository is saved on the "
"continuous integration system",
)
compute_ci_metrics_cmd.add_argument(
"-l",
"--language",
type=str,
required=True,
help="tree-sitter-language to be updated",
)

compute_ci_metrics_cmd.set_defaults(func=compute_ci_metrics)

# Compare metrics command
compare_metrics_cmd = commands.add_parser(
"compare-metrics",
Expand Down
10 changes: 10 additions & 0 deletions update-language-bindings.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash

# Regenerate the language files by running the cargo project in the `enums`
# directory, then format the code.
#
# The `enums` path is relative, so the script is resolved against the
# directory it is launched from.

# Recreate the language
# Stop if the directory cannot be entered: otherwise `cargo clean` and
# `cargo run` would be executed in the current directory instead.
pushd enums || exit 1
cargo clean
cargo run -- -lrust -o ../src/languages
popd || exit 1

# Format the code
cargo fmt
11 changes: 11 additions & 0 deletions update-submodule.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash

# Update tree-sitter submodule
#
# Usage: ./update-submodule.sh $tree-sitter-language

# Require the submodule to update: without a path, `git submodule update`
# would act on every submodule of the repository.
if [ "$#" -ne 1 ] || [ -z "$1" ]; then
    echo "Usage: $0 <tree-sitter-language>" >&2
    exit 1
fi

# Update submodule
# The argument is quoted to prevent word splitting and globbing, and `--`
# keeps it from being parsed as an option. Do not regenerate the bindings
# when the update fails.
git submodule update --remote -- "$1" || exit 1

# Generate the updated grammar for the submodule
./update-language-bindings.sh
17 changes: 0 additions & 17 deletions update-sumbodules.sh

This file was deleted.