From 395cc6f2fd1606a6a0d42b670744c8febbd7ab9e Mon Sep 17 00:00:00 2001 From: "Anastasiia.Birillo" Date: Fri, 21 May 2021 16:20:17 +0300 Subject: [PATCH 1/4] Add a script for filtering inspections --- src/python/evaluation/common/util.py | 6 ++ .../evaluation/inspectors/filter_issues.py | 8 +-- src/python/evaluation/qodana/README.md | 40 ++++++++++++- .../evaluation/qodana/dataset_marking.py | 12 +--- .../evaluation/qodana/filter_inspections.py | 60 +++++++++++++++++++ src/python/evaluation/qodana/util/util.py | 11 ++++ src/python/review/common/file_system.py | 9 ++- whitelist.txt | 1 + 8 files changed, 130 insertions(+), 17 deletions(-) create mode 100644 src/python/evaluation/qodana/filter_inspections.py create mode 100644 src/python/evaluation/qodana/util/util.py diff --git a/src/python/evaluation/common/util.py b/src/python/evaluation/common/util.py index 271956f1..585e4ddd 100644 --- a/src/python/evaluation/common/util.py +++ b/src/python/evaluation/common/util.py @@ -1,4 +1,5 @@ from enum import Enum, unique +from typing import Set from src.python.review.application_config import LanguageVersion from src.python.review.common.file_system import Extension @@ -39,3 +40,8 @@ class EvaluationArgument(Enum): f'Acceptable language-names are: {LanguageVersion.PYTHON_3.value}, ' f'{LanguageVersion.JAVA_8.value} ,' f'{LanguageVersion.JAVA_11.value} and {LanguageVersion.KOTLIN.value}.') + + +# Split string by separator +def parse_set_arg(str_arg: str, separator: str = ',') -> Set[str]: + return set(str_arg.split(separator)) diff --git a/src/python/evaluation/inspectors/filter_issues.py b/src/python/evaluation/inspectors/filter_issues.py index ca4b38b6..6a4b115d 100644 --- a/src/python/evaluation/inspectors/filter_issues.py +++ b/src/python/evaluation/inspectors/filter_issues.py @@ -5,7 +5,7 @@ import pandas as pd from src.python.common.tool_arguments import RunToolArgument from src.python.evaluation.common.pandas_util import get_issues_from_json, 
get_solutions_df_by_file_path -from src.python.evaluation.common.util import ColumnName, EvaluationArgument +from src.python.evaluation.common.util import ColumnName, EvaluationArgument, parse_set_arg from src.python.review.common.file_system import Extension, get_parent_folder, serialize_data_and_write_to_file from src.python.review.inspectors.issue import BaseIssue @@ -26,10 +26,6 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: default='') -def __parse_issues_arg(str_issues: str) -> Set[str]: - return set(str_issues.split(',')) - - def __get_new_issues(traceback: str, new_issues_classes: Set[str]) -> List[BaseIssue]: all_issues = get_issues_from_json(traceback) return list(filter(lambda i: i.origin_class in new_issues_classes, all_issues)) @@ -59,7 +55,7 @@ def main() -> None: solutions_file_path = args.solutions_file_path solutions_df = get_solutions_df_by_file_path(solutions_file_path) - issues = __parse_issues_arg(args.issues) + issues = parse_set_arg(args.issues) diffs = get_statistics_dict(solutions_df, issues) output_path = get_parent_folder(Path(solutions_file_path)) / f'diffs{Extension.PICKLE.value}' diff --git a/src/python/evaluation/qodana/README.md b/src/python/evaluation/qodana/README.md index 35ccb782..a18748a4 100644 --- a/src/python/evaluation/qodana/README.md +++ b/src/python/evaluation/qodana/README.md @@ -1,5 +1,6 @@ # Dataset label -This script allows you to mark up a dataset using the found [Qodana](https://github.com/JetBrains/Qodana) inspections. + +[This](dataset_marking.py) script allows you to mark up a dataset using the found [Qodana](https://github.com/JetBrains/Qodana) inspections. The dataset must contain at least three columns: `id`, `code` and `lang`, where `id` is a unique solution number, `lang` is the language in which the code is written in the `code` column. The `lang` must belong to one of the following values: `java7`, `java8`, `java9`, `java11`, `python3`, `kotlin`. 
If `lang` is not equal to any of the values, the row will be skipped. @@ -21,3 +22,40 @@ Run the [dataset_marking.py](dataset_marking.py) with the arguments from command | **‑l**, **‑‑limit** | Allows you to read only the specified number of first rows from the dataset. If no limit is specified, the whole dataset will be processed. | | **‑s**, **‑‑chunk‑size** | The number of files that Qodana will process at a time. Default is `5000`. | | **‑o**, **‑‑dataset‑output‑path** | The path where the marked dataset will be saved. If not specified, the original dataset will be overwritten. | + +--- + +# Postprocessing + +The model that imitates Qodana analysis gets input from a dataset in a special format. +This module allows preparing datasets that were graded by [dataset_marking.py](dataset_marking.py) script. + +Data processing consists of several stages: +- union several `csv` files that were graded by [dataset_marking.py](dataset_marking.py) script + and filter inspections list if it is necessary; +- convert `csv` file into a special format. + +## Filter inspections + +This stage allow you to union several `csv` files that were graded by [dataset_marking.py](dataset_marking.py) script + and filter inspections list if it is necessary. + +Please, note that your all input files must be graded by [dataset_marking.py](dataset_marking.py) script +and have `inspections` column. + +Output file is a new `csv` file with the all columns from the input files. + +#### Usage + +Run the [filter_inspections.py](filter_inspections.py) with the arguments from command line. + +Required arguments: + +`dataset_folder` — path to a folder with csv files graded by Qodana. Each file must have `inspections` column. + +Optional arguments: +Argument | Description +--- | --- +|**‑i**, **‑‑inspections**| Set of inspections ids to exclude from the dataset separated by comma. By default all inspections remain. | + +The resulting file will be stored in the `dataset_folder`. 
diff --git a/src/python/evaluation/qodana/dataset_marking.py b/src/python/evaluation/qodana/dataset_marking.py index 00f17667..64716d27 100644 --- a/src/python/evaluation/qodana/dataset_marking.py +++ b/src/python/evaluation/qodana/dataset_marking.py @@ -19,7 +19,8 @@ from pandas import DataFrame from src.python.evaluation.common.csv_util import write_dataframe_to_csv from src.python.evaluation.common.util import ColumnName -from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue +from src.python.evaluation.qodana.util.util import to_json from src.python.review.application_config import LanguageVersion from src.python.review.common.file_system import ( create_directory, get_content_from_file, get_name_from_path, get_parent_folder, remove_directory, remove_slash, @@ -179,13 +180,6 @@ def _parse_inspections_files(cls, inspections_files: Set[Path]) -> Dict[int, Lis id_to_issues[fragment_id].append(qodana_issue) return id_to_issues - @classmethod - def _to_json(cls, issues: List[QodanaIssue]) -> str: - issues_json = { - QodanaJsonField.ISSUES.value: list(map(lambda i: i.to_json(), issues)), - } - return json.dumps(issues_json) - def _mark_chunk(self, chunk: DataFrame, language: LanguageVersion, chunk_id: int) -> pd.DataFrame: tmp_file_path = self.dataset_path.parent.absolute() / f'qodana_project_{chunk_id}' create_directory(tmp_file_path) @@ -212,7 +206,7 @@ def _mark_chunk(self, chunk: DataFrame, language: LanguageVersion, chunk_id: int logger.info("Write inspections") chunk[QodanaColumnName.INSPECTIONS.value] = chunk.apply( - lambda row: self._to_json(inspections.get(row[ColumnName.ID.value], [])), axis=1) + lambda row: to_json(inspections.get(row[ColumnName.ID.value], [])), axis=1) remove_directory(tmp_file_path) return chunk diff --git a/src/python/evaluation/qodana/filter_inspections.py b/src/python/evaluation/qodana/filter_inspections.py 
new file mode 100644 index 00000000..6f758965 --- /dev/null +++ b/src/python/evaluation/qodana/filter_inspections.py @@ -0,0 +1,60 @@ +import argparse +import json +from pathlib import Path +from typing import List + +import pandas as pd +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path +from src.python.evaluation.common.util import parse_set_arg +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField +from src.python.evaluation.qodana.util.util import to_json +from src.python.review.common.file_system import Extension, extension_file_condition, get_all_file_system_items + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument('dataset_folder', + type=lambda value: Path(value).absolute(), + help='Path to a folder with csv files graded by Qodana. ' + 'Each file must have "inspections" column.') + + parser.add_argument('-i', '--inspections', + help='Set of inspections ids to exclude from the dataset', + type=str, + default='') + + +def __get_qodana_dataset(root: Path) -> pd.DataFrame: + if not root.is_dir(): + raise ValueError(f'The {root} is not a directory') + dataset_files = get_all_file_system_items(root, extension_file_condition(Extension.CSV)) + datasets = [] + for file_path in dataset_files: + datasets.append(get_solutions_df_by_file_path(file_path)) + return pd.concat(datasets) + + +def __filter_inspections(json_issues: str, inspections_to_keep: List[str]) -> str: + issues_list = json.loads(json_issues)[QodanaJsonField.ISSUES.value] + filtered_issues = list(filter(lambda i: i.problem_id not in inspections_to_keep, + map(lambda i: QodanaIssue.from_json(i), issues_list))) + return to_json(filtered_issues) + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + dataset_folder = args.dataset_folder + 
full_dataset = __get_qodana_dataset(dataset_folder) + inspections_to_keep = parse_set_arg(args.inspections) + + full_dataset[QodanaColumnName.INSPECTIONS.value] = full_dataset.apply( + lambda row: __filter_inspections(row[QodanaColumnName.INSPECTIONS.value], inspections_to_keep), axis=1) + + write_dataframe_to_csv(dataset_folder / f'filtered_issues{Extension.CSV.value}', full_dataset) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/qodana/util/util.py b/src/python/evaluation/qodana/util/util.py new file mode 100644 index 00000000..0c4b8712 --- /dev/null +++ b/src/python/evaluation/qodana/util/util.py @@ -0,0 +1,11 @@ +import json +from typing import List + +from src.python.evaluation.qodana.util.models import QodanaIssue, QodanaJsonField + + +def to_json(issues: List[QodanaIssue]) -> str: + issues_json = { + QodanaJsonField.ISSUES.value: list(map(lambda i: i.to_json(), issues)), + } + return json.dumps(issues_json) diff --git a/src/python/review/common/file_system.py b/src/python/review/common/file_system.py index 3e2e8bce..b2ff6dcb 100644 --- a/src/python/review/common/file_system.py +++ b/src/python/review/common/file_system.py @@ -50,6 +50,13 @@ def all_items_condition(name: str) -> bool: return True +def extension_file_condition(extension: Extension) -> ItemCondition: + def has_this_extension(name: str) -> bool: + return get_extension_from_file(name) == extension + + return has_this_extension + + # To get all files or subdirs (depends on the last parameter) from root that match item_condition # Note that all subdirs or files already contain the full path for them def get_all_file_system_items(root: Path, item_condition: ItemCondition = all_items_condition, @@ -149,7 +156,7 @@ def get_content_from_file(file_path: Path, encoding: str = Encoding.ISO_ENCODING # Not empty extensions are returned with a dot, for example, '.txt' # If file has no extensions, an empty one ('') is returned -def get_extension_from_file(file: Path) -> Extension: 
+def get_extension_from_file(file: Union[Path, str]) -> Extension: return Extension(os.path.splitext(file)[1]) diff --git a/whitelist.txt b/whitelist.txt index 3e331750..e7c8e657 100644 --- a/whitelist.txt +++ b/whitelist.txt @@ -114,3 +114,4 @@ nrows groupby getuid Popen +datasets From 082ca6d31aa67b19c3b0fb7cb6a3d78a341c7535 Mon Sep 17 00:00:00 2001 From: Nastya Birillo Date: Mon, 24 May 2021 20:01:09 +0300 Subject: [PATCH 2/4] Qoadana handlers/get unique inspections (#35) Add handlers for getting unique inspections Add a script to convert data for a model Add a script for preprocessing data for the second qodana model (inspections line by line) --- src/python/common/tool_arguments.py | 7 + src/python/evaluation/README.md | 6 +- src/python/evaluation/inspectors/README.md | 4 +- .../inspectors/print_inspectors_statistics.py | 4 +- src/python/evaluation/qodana/README.md | 175 ++++++++++++++++++ .../evaluation/qodana/filter_inspections.py | 8 +- .../qodana/fragment_to_inspections_list.py | 33 ++++ ...agment_to_inspections_list_line_by_line.py | 62 +++++++ .../qodana/get_unique_inspectors.py | 94 ++++++++++ src/python/evaluation/qodana/util/models.py | 9 + src/python/evaluation/qodana/util/util.py | 44 ++++- whitelist.txt | 2 + 12 files changed, 434 insertions(+), 14 deletions(-) create mode 100644 src/python/evaluation/qodana/fragment_to_inspections_list.py create mode 100644 src/python/evaluation/qodana/fragment_to_inspections_list_line_by_line.py create mode 100644 src/python/evaluation/qodana/get_unique_inspectors.py diff --git a/src/python/common/tool_arguments.py b/src/python/common/tool_arguments.py index d3048051..65af5a53 100644 --- a/src/python/common/tool_arguments.py +++ b/src/python/common/tool_arguments.py @@ -89,3 +89,10 @@ class RunToolArgument(Enum): DIFFS_FILE_PATH = ArgumentsInfo(None, 'diffs_file_path', 'Path to a file with serialized diffs that were founded by diffs_between_df.py') + + QODANA_SOLUTIONS_FILE_PATH = ArgumentsInfo(None, 
'solutions_file_path', + 'Csv file with solutions. This file must be graded by Qodana.') + + QODANA_INSPECTIONS_PATH = ArgumentsInfo(None, 'inspections_path', 'Path to a CSV file with inspections list.') + + QODANA_DUPLICATES = ArgumentsInfo(None, '--remove-duplicates', 'Remove duplicates around inspections') diff --git a/src/python/evaluation/README.md b/src/python/evaluation/README.md index 5aa4bdf7..af2dbbd8 100644 --- a/src/python/evaluation/README.md +++ b/src/python/evaluation/README.md @@ -29,7 +29,7 @@ Optional arguments: Argument | Description --- | --- |**‑f**, **‑‑format**| The output format. Available values: `json`, `text`. The default value is `json` . Use this argument when `traceback` is enabled, otherwise it will not be used.| -|**‑tp**, **‑‑tool_path**| Path to run-tool. Default is `src/python/review/run_tool.py` .| +|**‑tp**, **‑‑tool‑path**| Path to run-tool. Default is `src/python/review/run_tool.py` .| |**‑‑traceback**| To include a column with errors traceback into an output file. Default is `False`.| -|**‑ofp**, **‑‑output_folder_path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. | -|**‑ofn**, **‑‑output_file_name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.| +|**‑ofp**, **‑‑output‑folder‑path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. | +|**‑ofn**, **‑‑output‑file‑name**| A name of an output file where evaluation results will be stored. 
Default is `results.xlsx` or `results.csv`.| diff --git a/src/python/evaluation/inspectors/README.md b/src/python/evaluation/inspectors/README.md index a0de1314..5c54fe93 100644 --- a/src/python/evaluation/inspectors/README.md +++ b/src/python/evaluation/inspectors/README.md @@ -161,8 +161,8 @@ Optional arguments: Argument | Description --- | --- |**‑‑categorize**| If True, statistics will be categorized by several categories. By default is disabled.| -|**‑n**, **‑‑top_n**| The top N items will be printed. Default value is 10.| -|**‑‑full_stat**| If True, full statistics (with all issues) will be printed. By default is disabled.| +|**‑n**, **‑‑top‑n**| The top N items will be printed. Default value is 10.| +|**‑‑full‑stat**| If True, full statistics (with all issues) will be printed. By default is disabled.| The statistics will be printed into console. diff --git a/src/python/evaluation/inspectors/print_inspectors_statistics.py b/src/python/evaluation/inspectors/print_inspectors_statistics.py index 8b132a31..0a5605dd 100644 --- a/src/python/evaluation/inspectors/print_inspectors_statistics.py +++ b/src/python/evaluation/inspectors/print_inspectors_statistics.py @@ -19,12 +19,12 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: help='If True, statistics will be categorized by several categories.', action='store_true') - parser.add_argument('-n', '--top_n', + parser.add_argument('-n', '--top-n', help='The top N items will be printed', type=int, default=10) - parser.add_argument('--full_stat', + parser.add_argument('--full-stat', help='If True, full statistics will be printed.', action='store_true') diff --git a/src/python/evaluation/qodana/README.md b/src/python/evaluation/qodana/README.md index a18748a4..4e78972c 100644 --- a/src/python/evaluation/qodana/README.md +++ b/src/python/evaluation/qodana/README.md @@ -33,6 +33,7 @@ This module allows preparing datasets that were graded by [dataset_marking.py](d Data processing consists of several 
stages: - union several `csv` files that were graded by [dataset_marking.py](dataset_marking.py) script and filter inspections list if it is necessary; +- get all unique inspections from the dataset; - convert `csv` file into a special format. ## Filter inspections @@ -59,3 +60,177 @@ Argument | Description |**‑i**, **‑‑inspections**| Set of inspections ids to exclude from the dataset separated by comma. By default all inspections remain. | The resulting file will be stored in the `dataset_folder`. + +___ + +## Get all unique inspections + +This stage allow you to get all unique inspections from a `csv` file graded by Qodana. +Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script +and has `inspections` column. + +Output file is a new `csv` file with four columns: `id`, `inspection_id`, `count_all`, `count_uniq`. +`id` is unique number for each inspection, minimal value is 1. +`inspection_id` is unique Qoadana id for each inspection. +`count_all` count all fragments where was this inspection (with duplicates). +`count_uniq` count all fragments where was this inspection (without duplicates). + +#### Usage + +Run the [get_unique_inspectors.py](get_unique_inspectors.py) with the arguments from command line. + +Required arguments: + +`solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script. + +Optional arguments: +Argument | Description +--- | --- +|**‑‑uniq**| To count all fragments for each inspection where was this inspection (without duplicates). By default it disabled. | + +The resulting file will be stored in the same folder as the input file. 
+ +An example of the output file: + +```json +id | inspection_id | count_all | count_unique +-----|---------------------|--------------|-------------- +1 | SystemOutErr | 5 | 2 +2 | ConstantExpression | 1 | 1 +``` + +___ + +#### Convert `csv` file into a special format + +This block describes what format can be converted csv-file with code samples +graded by [dataset_marking.py](dataset_marking.py) script. + +We have two different formats: +- fragment to inspections list; +- fragment to inspections list with positions. + + +#### Fragment to inspections list + +This data representation match code fragments to a list with ids of inspections. + +Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script +and has `inspections` column. + +Output file is a new `csv` file with a new `inspections` column with list with ids of inspections. +If the list of inspections for the fragment is empty, then write 0. + +#### Usage + +Run the [fragment_to_inspections_list.py](fragment_to_inspections_list.py) with the arguments from command line. + +Required arguments: + +- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script, +- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script. + +Optional arguments: +Argument | Description +--- | --- +|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. | + +The resulting file will be stored in the same folder as the input file. + +An example of the input file: + +```json +id | code | lang | inspections +-----|-------------------|---------------|----------------- +2 | "// some code" | java11 | "{""issues"": []}" +3 | "// some code" | java11 | "{""issues"": [""{\"... 
\""problem_id\"": \""SystemOutErr\""}""]}" +0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}" +1 | "// some code" | java11 | "{""issues"": []}" +``` + +with the inspections file: + +```json +id | inspection_id +-----|------------------- +1 | SystemOutErr +2 | ConstantExpression +``` + +An example of the output file: + +```json +id | code | lang | inspections +-----|-------------------|---------------|----------------- +2 | "// some code" | java11 | 0 +3 | "// some code" | java11 | 1 +0 | "// some code" | java11 | 2,2 +1 | "// some code" | java11 | 0 + +``` + +--- + +#### Fragment to inspections list with positions + +This data representation match each line in code fragments to a list with ids of inspections in this line. + +Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script +and has `inspections` column. + +Output file is a new `csv` file with a new `inspections` column with list with ids of inspections. +If the list of inspections for the fragment is empty, then write 0. +Note, that each line in code fragments in the new file is stored in a separate row. +All indents as well as blank lines are keeped. + +#### Usage + +Run the [fragment_to_inspections_list_line_by_line.py](fragment_to_inspections_list_line_by_line.py) with the arguments from command line. + +Required arguments: + +- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script, +- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script. + +Optional arguments: +Argument | Description +--- | --- +|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. | + +The resulting file will be stored in the same folder as the input file. 
+ +An example of the input file: + +```json +id | code | lang | inspections +-----|-------------------|---------------|----------------- +2 | "// some code" | java11 | "{""issues"": []}" +3 | "// some code" | java11 | "{""issues"": [""{\"... \""problem_id\"": \""SystemOutErr\""}""]}" +0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}" +1 | "// some code" | java11 | "{""issues"": []}" +``` + +with the inspections file: + +```json +id | inspection_id +-----|------------------- +1 | SystemOutErr +2 | ConstantExpression +``` + +An example of the output file: + +```json +id | code | lang | inspections +-----|----------------------------------------|---------------|----------------- +2 | "// first line from code with id 2" | java11 | 0 +2 | "// second line from code with id 2" | java11 | 0 +3 | "// first line from code with id 3" | java11 | 1 +3 | "// second line from code with id 3" | java11 | 0 +0 | "// first line from code with id 0" | java11 | 0 +0 | "// second line from code with id 0" | java11 | 2,2 +1 | "// first line from code with id 1" | java11 | 0 +1 | "// second line from code with id 1" | java11 | 0 + +``` diff --git a/src/python/evaluation/qodana/filter_inspections.py b/src/python/evaluation/qodana/filter_inspections.py index 6f758965..9321a7eb 100644 --- a/src/python/evaluation/qodana/filter_inspections.py +++ b/src/python/evaluation/qodana/filter_inspections.py @@ -1,5 +1,4 @@ import argparse -import json from pathlib import Path from typing import List @@ -7,7 +6,7 @@ from src.python.evaluation.common.csv_util import write_dataframe_to_csv from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path from src.python.evaluation.common.util import parse_set_arg -from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField +from src.python.evaluation.qodana.util.models import QodanaColumnName, 
QodanaIssue from src.python.evaluation.qodana.util.util import to_json from src.python.review.common.file_system import Extension, extension_file_condition, get_all_file_system_items @@ -35,9 +34,8 @@ def __get_qodana_dataset(root: Path) -> pd.DataFrame: def __filter_inspections(json_issues: str, inspections_to_keep: List[str]) -> str: - issues_list = json.loads(json_issues)[QodanaJsonField.ISSUES.value] - filtered_issues = list(filter(lambda i: i.problem_id not in inspections_to_keep, - map(lambda i: QodanaIssue.from_json(i), issues_list))) + issues_list = QodanaIssue.parse_list_issues_from_json(json_issues) + filtered_issues = list(filter(lambda i: i.problem_id not in inspections_to_keep, issues_list)) return to_json(filtered_issues) diff --git a/src/python/evaluation/qodana/fragment_to_inspections_list.py b/src/python/evaluation/qodana/fragment_to_inspections_list.py new file mode 100644 index 00000000..42fe3ec6 --- /dev/null +++ b/src/python/evaluation/qodana/fragment_to_inspections_list.py @@ -0,0 +1,33 @@ +import argparse +from pathlib import Path + +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue +from src.python.evaluation.qodana.util.util import ( + configure_model_converter_arguments, get_inspections_dict, replace_inspections_on_its_ids, +) +from src.python.review.common.file_system import Extension, get_parent_folder + +INSPECTIONS = QodanaColumnName.INSPECTIONS.value + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_model_converter_arguments(parser) + args = parser.parse_args() + + solutions_file_path = args.solutions_file_path + solutions_df = get_solutions_df_by_file_path(solutions_file_path) + inspections_dict = get_inspections_dict(args.inspections_path) + + solutions_df[INSPECTIONS] = solutions_df.apply( + lambda row: 
replace_inspections_on_its_ids(QodanaIssue.parse_list_issues_from_json(row[INSPECTIONS]), + inspections_dict, args.remove_duplicates), axis=1) + + output_path = get_parent_folder(Path(solutions_file_path)) + write_dataframe_to_csv(output_path / f'numbered_ids{Extension.CSV.value}', solutions_df) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/qodana/fragment_to_inspections_list_line_by_line.py b/src/python/evaluation/qodana/fragment_to_inspections_list_line_by_line.py new file mode 100644 index 00000000..c70d9ba1 --- /dev/null +++ b/src/python/evaluation/qodana/fragment_to_inspections_list_line_by_line.py @@ -0,0 +1,62 @@ +import argparse +import os +from itertools import groupby +from pathlib import Path +from typing import Dict, List + +import pandas as pd +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path +from src.python.evaluation.common.util import ColumnName +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue +from src.python.evaluation.qodana.util.util import ( + configure_model_converter_arguments, get_inspections_dict, replace_inspections_on_its_ids, +) +from src.python.review.common.file_system import Extension, get_parent_folder + + +INSPECTIONS = QodanaColumnName.INSPECTIONS.value +CODE = ColumnName.CODE.value + + +# Make a new dataframe where code fragment is separated line by line and inspections are grouped line by line +def __replace_inspections_to_its_ids_in_row(row: pd.Series, inspections_dict: Dict[str, int], + to_remove_duplicates: bool) -> pd.DataFrame: + row_df = pd.DataFrame(row).transpose() + fragment_lines = row_df.iloc[0][CODE].split(os.linesep) + fragment_df = row_df.loc[row_df.index.repeat(len(fragment_lines))].reset_index(drop=True) + + issues_list = QodanaIssue.parse_list_issues_from_json(row_df.iloc[0][INSPECTIONS]) + line_number_to_issues = {k: list(v) for k, v in 
groupby(issues_list, key=lambda i: i.line)} + for index, fragment_line in enumerate(fragment_lines): + issues = line_number_to_issues.get(index + 1, []) + fragment_df.iloc[index][CODE] = fragment_line + fragment_df.iloc[index][INSPECTIONS] = replace_inspections_on_its_ids(issues, inspections_dict, + to_remove_duplicates) + return fragment_df + + +def __append_df(df: pd.DataFrame, df_list: List[pd.DataFrame]) -> None: + df_list.append(df) + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_model_converter_arguments(parser) + args = parser.parse_args() + + solutions_file_path = args.solutions_file_path + solutions_df = get_solutions_df_by_file_path(solutions_file_path) + inspections_dict = get_inspections_dict(args.inspections_path) + + fragment_df_list = [] + solutions_df.apply( + lambda row: __append_df(__replace_inspections_to_its_ids_in_row(row, inspections_dict, args.remove_duplicates), + fragment_df_list), axis=1) + + output_path = get_parent_folder(Path(solutions_file_path)) + write_dataframe_to_csv(output_path / f'numbered_ids_line_by_line{Extension.CSV.value}', pd.concat(fragment_df_list)) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/qodana/get_unique_inspectors.py b/src/python/evaluation/qodana/get_unique_inspectors.py new file mode 100644 index 00000000..35c32bdb --- /dev/null +++ b/src/python/evaluation/qodana/get_unique_inspectors.py @@ -0,0 +1,94 @@ +import argparse +import itertools +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Optional + +import pandas as pd +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue +from src.python.review.common.file_system import Extension, get_parent_folder + + 
+INSPECTION_ID = QodanaColumnName.INSPECTION_ID.value +INSPECTIONS = QodanaColumnName.INSPECTIONS.value +COUNT_ALL = QodanaColumnName.COUNT_ALL.value +COUNT_UNIQUE = QodanaColumnName.COUNT_UNIQUE.value +ID = QodanaColumnName.ID.value + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.description) + + parser.add_argument('--uniq', + help='If True, count fragments for eash inspection in which this inspection was.', + action='store_true') + + +def __get_inspections_ids(json_issues: str) -> List[str]: + issues_list = QodanaIssue.parse_list_issues_from_json(json_issues) + return list(map(lambda i: i.problem_id, issues_list)) + + +def __get_inspections_from_df(solutions_df: pd.DataFrame) -> List[str]: + inspections = solutions_df.apply(lambda row: __get_inspections_ids(row[INSPECTIONS]), axis=1) + return list(itertools.chain.from_iterable(inspections.values)) + + +def __count_uniq_inspections_in_fragment(json_issues: str, inspection_id_to_fragments: Dict[str, int]) -> None: + issues_list = set(__get_inspections_ids(json_issues)) + for issue in issues_list: + inspection_id_to_fragments[issue] += 1 + + +def __get_uniq_inspections_in_all_fragments(solutions_df: pd.DataFrame) -> Dict[str, int]: + inspection_id_to_fragments: Dict[str, int] = defaultdict(int) + solutions_df.apply(lambda row: __count_uniq_inspections_in_fragment(row[INSPECTIONS], inspection_id_to_fragments), + axis=1) + + return inspection_id_to_fragments + + +def __get_all_inspections_by_inspection_id(inspection_id: str, all_inspections: List[str]) -> List[str]: + return list(filter(lambda i: i == inspection_id, all_inspections)) + + +def __create_unique_inspections_df(inspections: List[str], + inspection_id_to_fragments: Optional[Dict[str, int]]) -> pd.DataFrame: + id_to_inspection = {} + for index, inspection 
in enumerate(set(inspections)): + id_to_inspection[index + 1] = inspection + inspections_df = pd.DataFrame(id_to_inspection.items(), columns=[ID, INSPECTION_ID]) + inspections_df[COUNT_ALL] = inspections_df.apply(lambda row: len(__get_all_inspections_by_inspection_id( + row[INSPECTION_ID], inspections)), axis=1) + if inspection_id_to_fragments is None: + inspections_df[COUNT_UNIQUE] = 0 + else: + inspections_df[COUNT_UNIQUE] = inspections_df.apply(lambda row: inspection_id_to_fragments.get( + row[INSPECTION_ID], 0), axis=1) + return inspections_df + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + solutions_file_path = args.solutions_file_path + solutions_df = get_solutions_df_by_file_path(solutions_file_path) + if args.uniq: + inspection_id_to_fragments = __get_uniq_inspections_in_all_fragments(solutions_df) + else: + inspection_id_to_fragments = None + inspections_df = __create_unique_inspections_df(__get_inspections_from_df(solutions_df), inspection_id_to_fragments) + + output_path = get_parent_folder(Path(solutions_file_path)) + write_dataframe_to_csv(output_path / f'inspections{Extension.CSV.value}', inspections_df) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/qodana/util/models.py b/src/python/evaluation/qodana/util/models.py index f5b3a589..08ce4c9f 100644 --- a/src/python/evaluation/qodana/util/models.py +++ b/src/python/evaluation/qodana/util/models.py @@ -1,6 +1,7 @@ import json from dataclasses import dataclass from enum import Enum, unique +from typing import List @dataclass(frozen=True) @@ -38,10 +39,18 @@ def from_json(cls, str_json: str) -> 'QodanaIssue': problem_id=issue[QodanaJsonField.PROBLEM_ID.value], ) + @classmethod + def parse_list_issues_from_json(cls, str_json: str) -> List['QodanaIssue']: + return list(map(lambda i: QodanaIssue.from_json(i), json.loads(str_json)[QodanaJsonField.ISSUES.value])) + @unique class QodanaColumnName(Enum): 
     INSPECTIONS = 'inspections'
+    ID = 'id'
+    INSPECTION_ID = 'inspection_id'
+    COUNT_ALL = 'count_all'
+    COUNT_UNIQUE = 'count_unique'
 
 
 @unique
diff --git a/src/python/evaluation/qodana/util/util.py b/src/python/evaluation/qodana/util/util.py
index 0c4b8712..3766b09d 100644
--- a/src/python/evaluation/qodana/util/util.py
+++ b/src/python/evaluation/qodana/util/util.py
@@ -1,7 +1,11 @@
+import argparse
 import json
-from typing import List
+from pathlib import Path
+from typing import Dict, List
 
-from src.python.evaluation.qodana.util.models import QodanaIssue, QodanaJsonField
+import pandas as pd
+from src.python.common.tool_arguments import RunToolArgument
+from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField
 
 
 def to_json(issues: List[QodanaIssue]) -> str:
@@ -9,3 +13,39 @@ def to_json(issues: List[QodanaIssue]) -> str:
         QodanaJsonField.ISSUES.value: list(map(lambda i: i.to_json(), issues)),
     }
     return json.dumps(issues_json)
+
+
+# Get a dictionary: Qodana inspection_id -> numeric id, from a csv file with two columns: id, inspection_id
+def get_inspections_dict(inspections_path: str) -> Dict[str, int]:
+    inspections_df = pd.read_csv(inspections_path)
+    inspections_dict = inspections_df.set_index(QodanaColumnName.INSPECTION_ID.value).T.to_dict('list')
+    for qodana_id, id_list in inspections_dict.items():
+        inspections_dict[qodana_id] = id_list[0]
+    return inspections_dict
+
+
+def replace_inspections_on_its_ids(issues_list: List[QodanaIssue], inspections_dict: Dict[str, int],
+                                   to_remove_duplicates: bool) -> str:
+    if len(issues_list) == 0:
+        inspections = '0'
+    else:
+        problem_id_list = list(map(lambda i: inspections_dict[i.problem_id], issues_list))
+        if to_remove_duplicates:
+            problem_id_list = list(set(problem_id_list))
+        problem_id_list.sort()
+        inspections = ','.join(str(p) for p in problem_id_list)
+    return inspections
+
+
+def configure_model_converter_arguments(parser: argparse.ArgumentParser) -> None:
+    
parser.add_argument(RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.description) + + parser.add_argument(RunToolArgument.QODANA_INSPECTIONS_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.QODANA_INSPECTIONS_PATH.value.description) + + parser.add_argument(RunToolArgument.QODANA_DUPLICATES.value.long_name, + help=RunToolArgument.QODANA_DUPLICATES.value.description, + action='store_true') diff --git a/whitelist.txt b/whitelist.txt index e7c8e657..6269ca26 100644 --- a/whitelist.txt +++ b/whitelist.txt @@ -115,3 +115,5 @@ groupby getuid Popen datasets +usecols +linesep From e41367803e4c7f7a59b85a5202f1f4a952df7f13 Mon Sep 17 00:00:00 2001 From: Daria Diatlova Date: Mon, 24 May 2021 21:24:45 +0300 Subject: [PATCH 3/4] Update README.md --- src/python/evaluation/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/evaluation/README.md b/src/python/evaluation/README.md index af2dbbd8..f8fafbe0 100644 --- a/src/python/evaluation/README.md +++ b/src/python/evaluation/README.md @@ -29,7 +29,7 @@ Optional arguments: Argument | Description --- | --- |**‑f**, **‑‑format**| The output format. Available values: `json`, `text`. The default value is `json` . Use this argument when `traceback` is enabled, otherwise it will not be used.| -|**‑tp**, **‑‑tool‑path**| Path to run-tool. Default is `src/python/review/run_tool.py` .| +|**‑tp**, **‑‑tool‑path**| Path to run-tool. Default is `src/python/review/run_tool.py` .| |**‑‑traceback**| To include a column with errors traceback into an output file. Default is `False`.| -|**‑ofp**, **‑‑output‑folder‑path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. 
| -|**‑ofn**, **‑‑output‑file‑name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.| +|**‑ofp**, **‑‑output‑folder‑path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. | +|**‑ofn**, **‑‑output‑file‑name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.| From f1524f80a6f57aeedc841fc83086fb25b4fdab66 Mon Sep 17 00:00:00 2001 From: "Anastasiia.Birillo" Date: Mon, 31 May 2021 17:41:13 +0300 Subject: [PATCH 4/4] Resolve conflicts --- src/python/evaluation/inspectors/filter_issues.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/python/evaluation/inspectors/filter_issues.py b/src/python/evaluation/inspectors/filter_issues.py index f5e4537d..e0d7d86b 100644 --- a/src/python/evaluation/inspectors/filter_issues.py +++ b/src/python/evaluation/inspectors/filter_issues.py @@ -5,9 +5,8 @@ import pandas as pd from src.python.common.tool_arguments import RunToolArgument from src.python.evaluation.common.pandas_util import get_issues_from_json, get_solutions_df_by_file_path -from src.python.evaluation.common.util import ColumnName, EvaluationArgument -from src.python.evaluation.inspectors.common.statistics import PenaltyIssue from src.python.evaluation.common.util import ColumnName, EvaluationArgument, parse_set_arg +from src.python.evaluation.inspectors.common.statistics import PenaltyIssue from src.python.review.common.file_system import Extension, get_parent_folder, serialize_data_and_write_to_file from src.python.review.inspectors.issue import BaseIssue @@ -28,10 +27,6 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: default='') -def __parse_issues_arg(str_issues: str) -> Set[str]: - return set(str_issues.split(',')) - - def __get_new_issues(traceback: str, new_issues_classes: Set[str]) -> 
List[PenaltyIssue]: all_issues = get_issues_from_json(traceback) return list(filter(lambda i: i.origin_class in new_issues_classes, all_issues)) @@ -61,7 +56,7 @@ def main() -> None: solutions_file_path = args.solutions_file_path solutions_df = get_solutions_df_by_file_path(solutions_file_path) - issues = __parse_issues_arg(args.issues) + issues = parse_set_arg(args.issues) diffs = get_statistics_dict(solutions_df, issues) output_path = get_parent_folder(Path(solutions_file_path)) / f'diffs{Extension.PICKLE.value}'