From cfc79c4e690d7c542c4d33a672f935ffe2417ff4 Mon Sep 17 00:00:00 2001 From: "Anastasiia.Birillo" Date: Fri, 21 May 2021 16:50:33 +0300 Subject: [PATCH 1/6] Get unique inspections --- .../qodana/get_unique_inspectors.py | 60 +++++++++++++++++++ src/python/evaluation/qodana/util/models.py | 2 + 2 files changed, 62 insertions(+) create mode 100644 src/python/evaluation/qodana/get_unique_inspectors.py diff --git a/src/python/evaluation/qodana/get_unique_inspectors.py b/src/python/evaluation/qodana/get_unique_inspectors.py new file mode 100644 index 00000000..fb2f5906 --- /dev/null +++ b/src/python/evaluation/qodana/get_unique_inspectors.py @@ -0,0 +1,60 @@ +import argparse +import json +from pathlib import Path +from typing import Set + +import pandas as pd +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path +from src.python.evaluation.qodana.util.models import QodanaJsonField, QodanaColumnName +from src.python.review.common.file_system import get_parent_folder, Extension + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=f'Csv file with solutions. 
This file must be graded by Qodana.') + + +def __get_inspections_ids(json_issues: str) -> Set[str]: + issues_list = json.loads(json_issues)[QodanaJsonField.ISSUES.value] + return set(map(lambda i: i.problem_id, issues_list)) + + +def __push_inspections_ids(unique_inspections: Set[str], new_inspections: Set[str]) -> None: + unique_inspections.union(new_inspections) + + +def __get_unique_inspections(solutions_df: pd.DataFrame) -> Set[str]: + unique_inspections: Set[str] = set() + solutions_df.apply(lambda row: __push_inspections_ids(unique_inspections, + __get_inspections_ids( + row[QodanaColumnName.INSPECTIONS.value] + )), axis=1) + return unique_inspections + + +def __create_unique_inspections_df(unique_inspections: Set[str]) -> pd.DataFrame: + id_to_inspection = {} + for index, inspection in enumerate(unique_inspections): + id_to_inspection[index + 1] = inspection + return pd.DataFrame(id_to_inspection.items(), + columns=[QodanaColumnName.ID.value, QodanaColumnName.INSPECTION_ID.value]) + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + solutions_file_path = args.solutions_file_path + solutions_df = get_solutions_df_by_file_path(solutions_file_path) + + inspections_df = __create_unique_inspections_df(__get_unique_inspections(solutions_df)) + output_path = get_parent_folder(Path(solutions_file_path)) + write_dataframe_to_csv(output_path / f'inspections{Extension.CSV.value}', inspections_df) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/qodana/util/models.py b/src/python/evaluation/qodana/util/models.py index f5b3a589..de86da7c 100644 --- a/src/python/evaluation/qodana/util/models.py +++ b/src/python/evaluation/qodana/util/models.py @@ -42,6 +42,8 @@ def from_json(cls, str_json: str) -> 'QodanaIssue': @unique class QodanaColumnName(Enum): INSPECTIONS = 'inspections' + ID = 'id' + INSPECTION_ID = 'inspection_id' @unique From a89df95f41f3f8583c03971f98bd38eae70af92c 
Mon Sep 17 00:00:00 2001 From: "Anastasiia.Birillo" Date: Fri, 21 May 2021 17:10:08 +0300 Subject: [PATCH 2/6] Fix small bug and add readme --- src/python/evaluation/qodana/README.md | 23 +++++++++++++++++++ .../qodana/get_unique_inspectors.py | 23 +++++++------------ 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/src/python/evaluation/qodana/README.md b/src/python/evaluation/qodana/README.md index a18748a4..217b0fdd 100644 --- a/src/python/evaluation/qodana/README.md +++ b/src/python/evaluation/qodana/README.md @@ -33,6 +33,7 @@ This module allows preparing datasets that were graded by [dataset_marking.py](d Data processing consists of several stages: - union several `csv` files that were graded by [dataset_marking.py](dataset_marking.py) script and filter inspections list if it is necessary; +- get all unique inspections from the dataset; - convert `csv` file into a special format. ## Filter inspections @@ -59,3 +60,25 @@ Argument | Description |**‑i**, **‑‑inspections**| Set of inspections ids to exclude from the dataset separated by comma. By default all inspections remain. | The resulting file will be stored in the `dataset_folder`. + +___ + +## Get all unique inspections + +This stage allow you to get all unique inspections from a `csv` file graded by Qodana. +Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script +and has `inspections` column. + +Output file is a new `csv` file with two columns: `id` and `inspection_id`. +`id` is unique number for each inspection, minimal value is 1. +`inspection_id` is unique Qoadana id for each inspection. + +#### Usage + +Run the [get_unique_inspectors.py](get_unique_inspectors.py) with the arguments from command line. + +Required arguments: + +`solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script. + +The resulting file will be stored in the same folder as the input file. 
diff --git a/src/python/evaluation/qodana/get_unique_inspectors.py b/src/python/evaluation/qodana/get_unique_inspectors.py index fb2f5906..887df3c5 100644 --- a/src/python/evaluation/qodana/get_unique_inspectors.py +++ b/src/python/evaluation/qodana/get_unique_inspectors.py @@ -7,32 +7,25 @@ from src.python.common.tool_arguments import RunToolArgument from src.python.evaluation.common.csv_util import write_dataframe_to_csv from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path -from src.python.evaluation.qodana.util.models import QodanaJsonField, QodanaColumnName -from src.python.review.common.file_system import get_parent_folder, Extension +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField +from src.python.review.common.file_system import Extension, get_parent_folder def configure_arguments(parser: argparse.ArgumentParser) -> None: parser.add_argument(RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name, type=lambda value: Path(value).absolute(), - help=f'Csv file with solutions. This file must be graded by Qodana.') + help='Csv file with solutions. 
This file must be graded by Qodana.') def __get_inspections_ids(json_issues: str) -> Set[str]: - issues_list = json.loads(json_issues)[QodanaJsonField.ISSUES.value] + issues_list = list(map(lambda i: QodanaIssue.from_json(i), json.loads(json_issues)[QodanaJsonField.ISSUES.value])) return set(map(lambda i: i.problem_id, issues_list)) -def __push_inspections_ids(unique_inspections: Set[str], new_inspections: Set[str]) -> None: - unique_inspections.union(new_inspections) - - def __get_unique_inspections(solutions_df: pd.DataFrame) -> Set[str]: - unique_inspections: Set[str] = set() - solutions_df.apply(lambda row: __push_inspections_ids(unique_inspections, - __get_inspections_ids( - row[QodanaColumnName.INSPECTIONS.value] - )), axis=1) - return unique_inspections + inspections = solutions_df.apply(lambda row: __get_inspections_ids(row[QodanaColumnName.INSPECTIONS.value]), + axis=1) + return set.union(*inspections.values) def __create_unique_inspections_df(unique_inspections: Set[str]) -> pd.DataFrame: @@ -53,7 +46,7 @@ def main() -> None: inspections_df = __create_unique_inspections_df(__get_unique_inspections(solutions_df)) output_path = get_parent_folder(Path(solutions_file_path)) - write_dataframe_to_csv(output_path / f'inspections{Extension.CSV.value}', inspections_df) + write_dataframe_to_csv(output_path / f'inspections{Extension.CSV.value}', inspections_df) if __name__ == '__main__': From 63cae5aa0c1c159c177a090bcd7496f99e0a8f2d Mon Sep 17 00:00:00 2001 From: "Anastasiia.Birillo" Date: Fri, 21 May 2021 18:32:30 +0300 Subject: [PATCH 3/6] Add a script for preprocessing data for a qodana model --- src/python/common/tool_arguments.py | 3 + src/python/evaluation/qodana/README.md | 75 +++++++++++++++++++ .../evaluation/qodana/filter_inspections.py | 8 +- .../qodana/fragment_to_inspections_list.py | 60 +++++++++++++++ .../qodana/get_unique_inspectors.py | 9 +-- src/python/evaluation/qodana/util/models.py | 5 ++ 6 files changed, 150 insertions(+), 10 deletions(-) 
create mode 100644 src/python/evaluation/qodana/fragment_to_inspections_list.py diff --git a/src/python/common/tool_arguments.py b/src/python/common/tool_arguments.py index d3048051..a85c0e0a 100644 --- a/src/python/common/tool_arguments.py +++ b/src/python/common/tool_arguments.py @@ -89,3 +89,6 @@ class RunToolArgument(Enum): DIFFS_FILE_PATH = ArgumentsInfo(None, 'diffs_file_path', 'Path to a file with serialized diffs that were founded by diffs_between_df.py') + + QODANA_SOLUTIONS_FILE_PATH = ArgumentsInfo(None, 'solutions_file_path', + 'Csv file with solutions. This file must be graded by Qodana.') diff --git a/src/python/evaluation/qodana/README.md b/src/python/evaluation/qodana/README.md index 217b0fdd..7c3713ce 100644 --- a/src/python/evaluation/qodana/README.md +++ b/src/python/evaluation/qodana/README.md @@ -82,3 +82,78 @@ Required arguments: `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script. The resulting file will be stored in the same folder as the input file. + +An example of the output file: + +```json +id | inspection_id +-----|------------------- +1 | SystemOutErr +2 | ConstantExpression +``` + +___ + +#### Convert `csv` file into a special format + +This block describes what format can be converted csv-file with code samples +graded by [dataset_marking.py](dataset_marking.py) script. + +We have two different formats: +- fragment to inspections list; +- fragment to inspections list with positions. + + +#### Fragment to inspections list + +This data representation match code fragments to a list with ids of inspections. + +Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script +and has `inspections` column. + +Output file is a new `csv` file with a new `inspections` column with list with ids of inspections. +If the list of inspections for the fragment is empty, then write 0. 
+ +#### Usage + +Run the [fragment_to_inspections_list.py](fragment_to_inspections_list.py) with the arguments from command line. + +Required arguments: + +- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script, +- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script. + +The resulting file will be stored in the same folder as the input file. + +An example of the input file: + +```json +id | code | lang | inspections +-----|-------------------|---------------|----------------- +2 | "// some code" | java11 | "{""issues"": []}" +3 | "// some code" | java11 | "{""issues"": [""{\"... \""problem_id\"": \""SystemOutErr\""}""]}" +0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}" +1 | "// some code" | java11 | "{""issues"": []}" + +``` + +with the inspections file: + +```json +id | inspection_id +-----|------------------- +1 | SystemOutErr +2 | ConstantExpression +``` + +An example of the output file: + +```json +id | code | lang | inspections +-----|-------------------|---------------|----------------- +2 | "// some code" | java11 | 0 +3 | "// some code" | java11 | 1 +0 | "// some code" | java11 | 2,2 +1 | "// some code" | java11 | 0 + +``` \ No newline at end of file diff --git a/src/python/evaluation/qodana/filter_inspections.py b/src/python/evaluation/qodana/filter_inspections.py index 6f758965..9321a7eb 100644 --- a/src/python/evaluation/qodana/filter_inspections.py +++ b/src/python/evaluation/qodana/filter_inspections.py @@ -1,5 +1,4 @@ import argparse -import json from pathlib import Path from typing import List @@ -7,7 +6,7 @@ from src.python.evaluation.common.csv_util import write_dataframe_to_csv from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path from 
src.python.evaluation.common.util import parse_set_arg -from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue from src.python.evaluation.qodana.util.util import to_json from src.python.review.common.file_system import Extension, extension_file_condition, get_all_file_system_items @@ -35,9 +34,8 @@ def __get_qodana_dataset(root: Path) -> pd.DataFrame: def __filter_inspections(json_issues: str, inspections_to_keep: List[str]) -> str: - issues_list = json.loads(json_issues)[QodanaJsonField.ISSUES.value] - filtered_issues = list(filter(lambda i: i.problem_id not in inspections_to_keep, - map(lambda i: QodanaIssue.from_json(i), issues_list))) + issues_list = QodanaIssue.parse_list_issues_from_json(json_issues) + filtered_issues = list(filter(lambda i: i.problem_id not in inspections_to_keep, issues_list)) return to_json(filtered_issues) diff --git a/src/python/evaluation/qodana/fragment_to_inspections_list.py b/src/python/evaluation/qodana/fragment_to_inspections_list.py new file mode 100644 index 00000000..26513522 --- /dev/null +++ b/src/python/evaluation/qodana/fragment_to_inspections_list.py @@ -0,0 +1,60 @@ +import argparse +from pathlib import Path +from typing import Dict + +import pandas as pd +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue +from src.python.review.common.file_system import Extension, get_parent_folder + +INSPECTIONS = QodanaColumnName.INSPECTIONS.value + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + 
help=RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.description) + + parser.add_argument('inspections_path', + type=lambda value: Path(value).absolute(), + help='Path to a CSV file with inspections list') + + +def __get_inspections_dict(inspections_path: str) -> Dict[str, int]: + inspections_df = pd.read_csv(inspections_path) + inspections_dict = inspections_df.set_index(QodanaColumnName.INSPECTION_ID.value).T.to_dict('list') + for qodana_id, id_list in inspections_dict.items(): + inspections_dict[qodana_id] = id_list[0] + return inspections_dict + + +def __replace_inspections_on_its_ids(json_issues: str, inspections_dict: Dict[str, int]) -> str: + issues_list = QodanaIssue.parse_list_issues_from_json(json_issues) + if len(issues_list) == 0: + inspections = '0' + else: + issues_list.sort(key=lambda x: x.problem_id) + inspections = ','.join(str(inspections_dict[i.problem_id]) for i in issues_list) + return inspections + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + solutions_file_path = args.solutions_file_path + solutions_df = get_solutions_df_by_file_path(solutions_file_path) + inspections_dict = __get_inspections_dict(args.inspections_path) + + solutions_df[INSPECTIONS] = solutions_df.apply( + lambda row: __replace_inspections_on_its_ids(row[INSPECTIONS], inspections_dict), axis=1) + + output_path = get_parent_folder(Path(solutions_file_path)) + write_dataframe_to_csv(output_path / f'numbered_ids{Extension.CSV.value}', solutions_df) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/qodana/get_unique_inspectors.py b/src/python/evaluation/qodana/get_unique_inspectors.py index 887df3c5..1be04110 100644 --- a/src/python/evaluation/qodana/get_unique_inspectors.py +++ b/src/python/evaluation/qodana/get_unique_inspectors.py @@ -1,5 +1,4 @@ import argparse -import json from pathlib import Path from typing import Set @@ -7,18 +6,18 @@ from 
src.python.common.tool_arguments import RunToolArgument from src.python.evaluation.common.csv_util import write_dataframe_to_csv from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path -from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue from src.python.review.common.file_system import Extension, get_parent_folder def configure_arguments(parser: argparse.ArgumentParser) -> None: - parser.add_argument(RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name, + parser.add_argument(RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.long_name, type=lambda value: Path(value).absolute(), - help='Csv file with solutions. This file must be graded by Qodana.') + help=RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.description) def __get_inspections_ids(json_issues: str) -> Set[str]: - issues_list = list(map(lambda i: QodanaIssue.from_json(i), json.loads(json_issues)[QodanaJsonField.ISSUES.value])) + issues_list = QodanaIssue.parse_list_issues_from_json(json_issues) return set(map(lambda i: i.problem_id, issues_list)) diff --git a/src/python/evaluation/qodana/util/models.py b/src/python/evaluation/qodana/util/models.py index de86da7c..769b216f 100644 --- a/src/python/evaluation/qodana/util/models.py +++ b/src/python/evaluation/qodana/util/models.py @@ -1,6 +1,7 @@ import json from dataclasses import dataclass from enum import Enum, unique +from typing import List @dataclass(frozen=True) @@ -38,6 +39,10 @@ def from_json(cls, str_json: str) -> 'QodanaIssue': problem_id=issue[QodanaJsonField.PROBLEM_ID.value], ) + @classmethod + def parse_list_issues_from_json(cls, str_json: str) -> List['QodanaIssue']: + return list(map(lambda i: QodanaIssue.from_json(i), json.loads(str_json)[QodanaJsonField.ISSUES.value])) + @unique class QodanaColumnName(Enum): From 0b384b369c91952e78fdafb839c0fc9dbd5a80b0 Mon Sep 17 00:00:00 2001 From: 
"Anastasiia.Birillo" Date: Mon, 24 May 2021 17:46:49 +0300 Subject: [PATCH 4/6] Count inspections --- src/python/evaluation/qodana/README.md | 9 ++- .../qodana/fragment_to_inspections_list.py | 3 +- .../qodana/get_unique_inspectors.py | 66 +++++++++++++++---- src/python/evaluation/qodana/util/models.py | 2 + whitelist.txt | 1 + 5 files changed, 67 insertions(+), 14 deletions(-) diff --git a/src/python/evaluation/qodana/README.md b/src/python/evaluation/qodana/README.md index 7c3713ce..09d58af0 100644 --- a/src/python/evaluation/qodana/README.md +++ b/src/python/evaluation/qodana/README.md @@ -69,9 +69,11 @@ This stage allow you to get all unique inspections from a `csv` file graded by Q Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script and has `inspections` column. -Output file is a new `csv` file with two columns: `id` and `inspection_id`. +Output file is a new `csv` file with four columns: `id`, `inspection_id`, `count_all`, `count_uniq`. `id` is unique number for each inspection, minimal value is 1. `inspection_id` is unique Qoadana id for each inspection. +`count_all` count all fragments where was this inspection (with duplicates). +`count_uniq` count all fragments where was this inspection (without duplicates). #### Usage @@ -81,6 +83,11 @@ Required arguments: `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script. +Optional arguments: +Argument | Description +--- | --- +|**‑‑uniq**| To count all fragments for each inspection where was this inspection (without duplicates). By default it disabled. | + The resulting file will be stored in the same folder as the input file. 
An example of the output file: diff --git a/src/python/evaluation/qodana/fragment_to_inspections_list.py b/src/python/evaluation/qodana/fragment_to_inspections_list.py index 26513522..2a5218f7 100644 --- a/src/python/evaluation/qodana/fragment_to_inspections_list.py +++ b/src/python/evaluation/qodana/fragment_to_inspections_list.py @@ -23,7 +23,8 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: def __get_inspections_dict(inspections_path: str) -> Dict[str, int]: - inspections_df = pd.read_csv(inspections_path) + inspections_df = pd.read_csv(inspections_path, + usecols=[QodanaColumnName.ID.value, QodanaColumnName.INSPECTION_ID.value]) inspections_dict = inspections_df.set_index(QodanaColumnName.INSPECTION_ID.value).T.to_dict('list') for qodana_id, id_list in inspections_dict.items(): inspections_dict[qodana_id] = id_list[0] diff --git a/src/python/evaluation/qodana/get_unique_inspectors.py b/src/python/evaluation/qodana/get_unique_inspectors.py index 1be04110..35c32bdb 100644 --- a/src/python/evaluation/qodana/get_unique_inspectors.py +++ b/src/python/evaluation/qodana/get_unique_inspectors.py @@ -1,6 +1,8 @@ import argparse +import itertools +from collections import defaultdict from pathlib import Path -from typing import Set +from typing import Dict, List, Optional import pandas as pd from src.python.common.tool_arguments import RunToolArgument @@ -10,29 +12,65 @@ from src.python.review.common.file_system import Extension, get_parent_folder +INSPECTION_ID = QodanaColumnName.INSPECTION_ID.value +INSPECTIONS = QodanaColumnName.INSPECTIONS.value +COUNT_ALL = QodanaColumnName.COUNT_ALL.value +COUNT_UNIQUE = QodanaColumnName.COUNT_UNIQUE.value +ID = QodanaColumnName.ID.value + + def configure_arguments(parser: argparse.ArgumentParser) -> None: parser.add_argument(RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.long_name, type=lambda value: Path(value).absolute(), help=RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.description) + 
parser.add_argument('--uniq', + help='If True, count fragments for eash inspection in which this inspection was.', + action='store_true') + -def __get_inspections_ids(json_issues: str) -> Set[str]: +def __get_inspections_ids(json_issues: str) -> List[str]: issues_list = QodanaIssue.parse_list_issues_from_json(json_issues) - return set(map(lambda i: i.problem_id, issues_list)) + return list(map(lambda i: i.problem_id, issues_list)) + + +def __get_inspections_from_df(solutions_df: pd.DataFrame) -> List[str]: + inspections = solutions_df.apply(lambda row: __get_inspections_ids(row[INSPECTIONS]), axis=1) + return list(itertools.chain.from_iterable(inspections.values)) + + +def __count_uniq_inspections_in_fragment(json_issues: str, inspection_id_to_fragments: Dict[str, int]) -> None: + issues_list = set(__get_inspections_ids(json_issues)) + for issue in issues_list: + inspection_id_to_fragments[issue] += 1 + + +def __get_uniq_inspections_in_all_fragments(solutions_df: pd.DataFrame) -> Dict[str, int]: + inspection_id_to_fragments: Dict[str, int] = defaultdict(int) + solutions_df.apply(lambda row: __count_uniq_inspections_in_fragment(row[INSPECTIONS], inspection_id_to_fragments), + axis=1) + + return inspection_id_to_fragments -def __get_unique_inspections(solutions_df: pd.DataFrame) -> Set[str]: - inspections = solutions_df.apply(lambda row: __get_inspections_ids(row[QodanaColumnName.INSPECTIONS.value]), - axis=1) - return set.union(*inspections.values) +def __get_all_inspections_by_inspection_id(inspection_id: str, all_inspections: List[str]) -> List[str]: + return list(filter(lambda i: i == inspection_id, all_inspections)) -def __create_unique_inspections_df(unique_inspections: Set[str]) -> pd.DataFrame: +def __create_unique_inspections_df(inspections: List[str], + inspection_id_to_fragments: Optional[Dict[str, int]]) -> pd.DataFrame: id_to_inspection = {} - for index, inspection in enumerate(unique_inspections): + for index, inspection in enumerate(set(inspections)): 
id_to_inspection[index + 1] = inspection - return pd.DataFrame(id_to_inspection.items(), - columns=[QodanaColumnName.ID.value, QodanaColumnName.INSPECTION_ID.value]) + inspections_df = pd.DataFrame(id_to_inspection.items(), columns=[ID, INSPECTION_ID]) + inspections_df[COUNT_ALL] = inspections_df.apply(lambda row: len(__get_all_inspections_by_inspection_id( + row[INSPECTION_ID], inspections)), axis=1) + if inspection_id_to_fragments is None: + inspections_df[COUNT_UNIQUE] = 0 + else: + inspections_df[COUNT_UNIQUE] = inspections_df.apply(lambda row: inspection_id_to_fragments.get( + row[INSPECTION_ID], 0), axis=1) + return inspections_df def main() -> None: @@ -42,8 +80,12 @@ def main() -> None: solutions_file_path = args.solutions_file_path solutions_df = get_solutions_df_by_file_path(solutions_file_path) + if args.uniq: + inspection_id_to_fragments = __get_uniq_inspections_in_all_fragments(solutions_df) + else: + inspection_id_to_fragments = None + inspections_df = __create_unique_inspections_df(__get_inspections_from_df(solutions_df), inspection_id_to_fragments) - inspections_df = __create_unique_inspections_df(__get_unique_inspections(solutions_df)) output_path = get_parent_folder(Path(solutions_file_path)) write_dataframe_to_csv(output_path / f'inspections{Extension.CSV.value}', inspections_df) diff --git a/src/python/evaluation/qodana/util/models.py b/src/python/evaluation/qodana/util/models.py index 769b216f..08ce4c9f 100644 --- a/src/python/evaluation/qodana/util/models.py +++ b/src/python/evaluation/qodana/util/models.py @@ -49,6 +49,8 @@ class QodanaColumnName(Enum): INSPECTIONS = 'inspections' ID = 'id' INSPECTION_ID = 'inspection_id' + COUNT_ALL = 'count_all' + COUNT_UNIQUE = 'count_unique' @unique diff --git a/whitelist.txt b/whitelist.txt index e7c8e657..e4dad09b 100644 --- a/whitelist.txt +++ b/whitelist.txt @@ -115,3 +115,4 @@ groupby getuid Popen datasets +usecols From 93f2a827917e0fc1c8936ebfa8554704942dfd2f Mon Sep 17 00:00:00 2001 From: 
"Anastasiia.Birillo" Date: Mon, 24 May 2021 17:50:36 +0300 Subject: [PATCH 5/6] Update readme --- src/python/evaluation/qodana/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/python/evaluation/qodana/README.md b/src/python/evaluation/qodana/README.md index 09d58af0..8d1db192 100644 --- a/src/python/evaluation/qodana/README.md +++ b/src/python/evaluation/qodana/README.md @@ -93,10 +93,10 @@ The resulting file will be stored in the same folder as the input file. An example of the output file: ```json -id | inspection_id ------|------------------- -1 | SystemOutErr -2 | ConstantExpression +id | inspection_id | count_all | count_unique +-----|---------------------|--------------|-------------- +1 | SystemOutErr | 5 | 2 +2 | ConstantExpression | 1 | 1 ``` ___ From 03bba29f9074a9829a040d935274b35ccda81c41 Mon Sep 17 00:00:00 2001 From: Nastya Birillo Date: Mon, 24 May 2021 19:59:52 +0300 Subject: [PATCH 6/6] Add a script for preprocessing data for the second qodana model (#36) Add a script for preprocessing data for the second qodana model (inspections line by line) --- src/python/common/tool_arguments.py | 4 + src/python/evaluation/README.md | 6 +- src/python/evaluation/inspectors/README.md | 4 +- .../inspectors/print_inspectors_statistics.py | 4 +- src/python/evaluation/qodana/README.md | 74 ++++++++++++++++++- .../qodana/fragment_to_inspections_list.py | 42 ++--------- ...agment_to_inspections_list_line_by_line.py | 62 ++++++++++++++++ src/python/evaluation/qodana/util/util.py | 44 ++++++++++- whitelist.txt | 1 + 9 files changed, 195 insertions(+), 46 deletions(-) create mode 100644 src/python/evaluation/qodana/fragment_to_inspections_list_line_by_line.py diff --git a/src/python/common/tool_arguments.py b/src/python/common/tool_arguments.py index a85c0e0a..65af5a53 100644 --- a/src/python/common/tool_arguments.py +++ b/src/python/common/tool_arguments.py @@ -92,3 +92,7 @@ class RunToolArgument(Enum): QODANA_SOLUTIONS_FILE_PATH = 
ArgumentsInfo(None, 'solutions_file_path', 'Csv file with solutions. This file must be graded by Qodana.') + + QODANA_INSPECTIONS_PATH = ArgumentsInfo(None, 'inspections_path', 'Path to a CSV file with inspections list.') + + QODANA_DUPLICATES = ArgumentsInfo(None, '--remove-duplicates', 'Remove duplicates around inspections') diff --git a/src/python/evaluation/README.md b/src/python/evaluation/README.md index 5aa4bdf7..af2dbbd8 100644 --- a/src/python/evaluation/README.md +++ b/src/python/evaluation/README.md @@ -29,7 +29,7 @@ Optional arguments: Argument | Description --- | --- |**‑f**, **‑‑format**| The output format. Available values: `json`, `text`. The default value is `json` . Use this argument when `traceback` is enabled, otherwise it will not be used.| -|**‑tp**, **‑‑tool_path**| Path to run-tool. Default is `src/python/review/run_tool.py` .| +|**‑tp**, **‑‑tool‑path**| Path to run-tool. Default is `src/python/review/run_tool.py` .| |**‑‑traceback**| To include a column with errors traceback into an output file. Default is `False`.| -|**‑ofp**, **‑‑output_folder_path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. | -|**‑ofn**, **‑‑output_file_name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.| +|**‑ofp**, **‑‑output‑folder‑path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. | +|**‑ofn**, **‑‑output‑file‑name**| A name of an output file where evaluation results will be stored. 
Default is `results.xlsx` or `results.csv`.| diff --git a/src/python/evaluation/inspectors/README.md b/src/python/evaluation/inspectors/README.md index a0de1314..5c54fe93 100644 --- a/src/python/evaluation/inspectors/README.md +++ b/src/python/evaluation/inspectors/README.md @@ -161,8 +161,8 @@ Optional arguments: Argument | Description --- | --- |**‑‑categorize**| If True, statistics will be categorized by several categories. By default is disabled.| -|**‑n**, **‑‑top_n**| The top N items will be printed. Default value is 10.| -|**‑‑full_stat**| If True, full statistics (with all issues) will be printed. By default is disabled.| +|**‑n**, **‑‑top‑n**| The top N items will be printed. Default value is 10.| +|**‑‑full‑stat**| If True, full statistics (with all issues) will be printed. By default is disabled.| The statistics will be printed into console. diff --git a/src/python/evaluation/inspectors/print_inspectors_statistics.py b/src/python/evaluation/inspectors/print_inspectors_statistics.py index 8b132a31..0a5605dd 100644 --- a/src/python/evaluation/inspectors/print_inspectors_statistics.py +++ b/src/python/evaluation/inspectors/print_inspectors_statistics.py @@ -19,12 +19,12 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: help='If True, statistics will be categorized by several categories.', action='store_true') - parser.add_argument('-n', '--top_n', + parser.add_argument('-n', '--top-n', help='The top N items will be printed', type=int, default=10) - parser.add_argument('--full_stat', + parser.add_argument('--full-stat', help='If True, full statistics will be printed.', action='store_true') diff --git a/src/python/evaluation/qodana/README.md b/src/python/evaluation/qodana/README.md index 8d1db192..4e78972c 100644 --- a/src/python/evaluation/qodana/README.md +++ b/src/python/evaluation/qodana/README.md @@ -130,6 +130,11 @@ Required arguments: - `solutions_file_path` — path to csv-file with code samples graded by 
[dataset_marking.py](dataset_marking.py) script, - `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script. +Optional arguments: +Argument | Description +--- | --- +|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. | + The resulting file will be stored in the same folder as the input file. An example of the input file: @@ -141,7 +146,6 @@ id | code | lang | inspections 3 | "// some code" | java11 | "{""issues"": [""{\"... \""problem_id\"": \""SystemOutErr\""}""]}" 0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}" 1 | "// some code" | java11 | "{""issues"": []}" - ``` with the inspections file: @@ -163,4 +167,70 @@ id | code | lang | inspections 0 | "// some code" | java11 | 2,2 1 | "// some code" | java11 | 0 -``` \ No newline at end of file +``` + +--- + +#### Fragment to inspections list with positions + +This data representation matches each line in code fragments to a list with ids of inspections in this line. + +Please, note that your input file must be graded by [dataset_marking.py](dataset_marking.py) script +and has `inspections` column. + +Output file is a new `csv` file with a new `inspections` column with list with ids of inspections. +If the list of inspections for the fragment is empty, then write 0. +Note that each line in code fragments in the new file is stored in a separate row. +All indents as well as blank lines are kept. + +#### Usage + +Run the [fragment_to_inspections_list_line_by_line.py](fragment_to_inspections_list_line_by_line.py) with the arguments from command line. 
+ +Required arguments: + +- `solutions_file_path` — path to csv-file with code samples graded by [dataset_marking.py](dataset_marking.py) script, +- `inspections_path` — path to csv-file with inspections list from the input file. You can get this file by [get_unique_inspectors.py](get_unique_inspectors.py) script. + +Optional arguments: +Argument | Description +--- | --- +|**‑‑remove‑duplicates**| Remove duplicates around inspections in each row. Default value is `False`. | + +The resulting file will be stored in the same folder as the input file. + +An example of the input file: + +```json +id | code | lang | inspections +-----|-------------------|---------------|----------------- +2 | "// some code" | java11 | "{""issues"": []}" +3 | "// some code" | java11 | "{""issues"": [""{\"... \""problem_id\"": \""SystemOutErr\""}""]}" +0 | "// some code" | java11 | "{""issues"": [""{\"...\""problem_id\"": \""ConstantExpression\""}"",""{\"...\""problem_id\"": \""ConstantExpression\""}""]}" +1 | "// some code" | java11 | "{""issues"": []}" +``` + +with the inspections file: + +```json +id | inspection_id +-----|------------------- +1 | SystemOutErr +2 | ConstantExpression +``` + +An example of the output file: + +```json +id | code | lang | inspections +-----|----------------------------------------|---------------|----------------- +2 | "// first line from code with id 2" | java11 | 0 +2 | "// second line from code with id 2" | java11 | 0 +3 | "// first line from code with id 3" | java11 | 1 +3 | "// second line from code with id 3" | java11 | 0 +0 | "// first line from code with id 0" | java11 | 0 +0 | "// second line from code with id 0" | java11 | 2,2 +1 | "// first line from code with id 1" | java11 | 0 +1 | "// second line from code with id 1" | java11 | 0 + +``` diff --git a/src/python/evaluation/qodana/fragment_to_inspections_list.py b/src/python/evaluation/qodana/fragment_to_inspections_list.py index 2a5218f7..42fe3ec6 100644 --- 
a/src/python/evaluation/qodana/fragment_to_inspections_list.py +++ b/src/python/evaluation/qodana/fragment_to_inspections_list.py @@ -1,57 +1,29 @@ import argparse from pathlib import Path -from typing import Dict -import pandas as pd -from src.python.common.tool_arguments import RunToolArgument from src.python.evaluation.common.csv_util import write_dataframe_to_csv from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue +from src.python.evaluation.qodana.util.util import ( + configure_model_converter_arguments, get_inspections_dict, replace_inspections_on_its_ids, +) from src.python.review.common.file_system import Extension, get_parent_folder INSPECTIONS = QodanaColumnName.INSPECTIONS.value -def configure_arguments(parser: argparse.ArgumentParser) -> None: - parser.add_argument(RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.long_name, - type=lambda value: Path(value).absolute(), - help=RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.description) - - parser.add_argument('inspections_path', - type=lambda value: Path(value).absolute(), - help='Path to a CSV file with inspections list') - - -def __get_inspections_dict(inspections_path: str) -> Dict[str, int]: - inspections_df = pd.read_csv(inspections_path, - usecols=[QodanaColumnName.ID.value, QodanaColumnName.INSPECTION_ID.value]) - inspections_dict = inspections_df.set_index(QodanaColumnName.INSPECTION_ID.value).T.to_dict('list') - for qodana_id, id_list in inspections_dict.items(): - inspections_dict[qodana_id] = id_list[0] - return inspections_dict - - -def __replace_inspections_on_its_ids(json_issues: str, inspections_dict: Dict[str, int]) -> str: - issues_list = QodanaIssue.parse_list_issues_from_json(json_issues) - if len(issues_list) == 0: - inspections = '0' - else: - issues_list.sort(key=lambda x: x.problem_id) - inspections = ','.join(str(inspections_dict[i.problem_id]) for i in issues_list) - 
return inspections - - def main() -> None: parser = argparse.ArgumentParser() - configure_arguments(parser) + configure_model_converter_arguments(parser) args = parser.parse_args() solutions_file_path = args.solutions_file_path solutions_df = get_solutions_df_by_file_path(solutions_file_path) - inspections_dict = __get_inspections_dict(args.inspections_path) + inspections_dict = get_inspections_dict(args.inspections_path) solutions_df[INSPECTIONS] = solutions_df.apply( - lambda row: __replace_inspections_on_its_ids(row[INSPECTIONS], inspections_dict), axis=1) + lambda row: replace_inspections_on_its_ids(QodanaIssue.parse_list_issues_from_json(row[INSPECTIONS]), + inspections_dict, args.remove_duplicates), axis=1) output_path = get_parent_folder(Path(solutions_file_path)) write_dataframe_to_csv(output_path / f'numbered_ids{Extension.CSV.value}', solutions_df) diff --git a/src/python/evaluation/qodana/fragment_to_inspections_list_line_by_line.py b/src/python/evaluation/qodana/fragment_to_inspections_list_line_by_line.py new file mode 100644 index 00000000..c70d9ba1 --- /dev/null +++ b/src/python/evaluation/qodana/fragment_to_inspections_list_line_by_line.py @@ -0,0 +1,62 @@ +import argparse +import os +from itertools import groupby +from pathlib import Path +from typing import Dict, List + +import pandas as pd +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path +from src.python.evaluation.common.util import ColumnName +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue +from src.python.evaluation.qodana.util.util import ( + configure_model_converter_arguments, get_inspections_dict, replace_inspections_on_its_ids, +) +from src.python.review.common.file_system import Extension, get_parent_folder + + +INSPECTIONS = QodanaColumnName.INSPECTIONS.value +CODE = ColumnName.CODE.value + + +# Make a new dataframe where code fragment is 
separated line by line and inspections are grouped line by line +def __replace_inspections_to_its_ids_in_row(row: pd.Series, inspections_dict: Dict[str, int], + to_remove_duplicates: bool) -> pd.DataFrame: + row_df = pd.DataFrame(row).transpose() + fragment_lines = row_df.iloc[0][CODE].split(os.linesep) + fragment_df = row_df.loc[row_df.index.repeat(len(fragment_lines))].reset_index(drop=True) + + issues_list = QodanaIssue.parse_list_issues_from_json(row_df.iloc[0][INSPECTIONS]) + line_number_to_issues = {k: list(v) for k, v in groupby(issues_list, key=lambda i: i.line)} + for index, fragment_line in enumerate(fragment_lines): + issues = line_number_to_issues.get(index + 1, []) + fragment_df.iloc[index][CODE] = fragment_line + fragment_df.iloc[index][INSPECTIONS] = replace_inspections_on_its_ids(issues, inspections_dict, + to_remove_duplicates) + return fragment_df + + +def __append_df(df: pd.DataFrame, df_list: List[pd.DataFrame]) -> None: + df_list.append(df) + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_model_converter_arguments(parser) + args = parser.parse_args() + + solutions_file_path = args.solutions_file_path + solutions_df = get_solutions_df_by_file_path(solutions_file_path) + inspections_dict = get_inspections_dict(args.inspections_path) + + fragment_df_list = [] + solutions_df.apply( + lambda row: __append_df(__replace_inspections_to_its_ids_in_row(row, inspections_dict, args.remove_duplicates), + fragment_df_list), axis=1) + + output_path = get_parent_folder(Path(solutions_file_path)) + write_dataframe_to_csv(output_path / f'numbered_ids_line_by_line{Extension.CSV.value}', pd.concat(fragment_df_list)) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/qodana/util/util.py b/src/python/evaluation/qodana/util/util.py index 0c4b8712..3766b09d 100644 --- a/src/python/evaluation/qodana/util/util.py +++ b/src/python/evaluation/qodana/util/util.py @@ -1,7 +1,11 @@ +import argparse import json -from typing 
import List +from pathlib import Path +from typing import Dict, List -from src.python.evaluation.qodana.util.models import QodanaIssue, QodanaJsonField +import pandas as pd +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.qodana.util.models import QodanaColumnName, QodanaIssue, QodanaJsonField def to_json(issues: List[QodanaIssue]) -> str: @@ -9,3 +13,39 @@ def to_json(issues: List[QodanaIssue]) -> str: QodanaJsonField.ISSUES.value: list(map(lambda i: i.to_json(), issues)), } return json.dumps(issues_json) + + +# Get a dictionary: Qodana inspection_id -> inspection_id from csv file with two columns: id, inspection_id +def get_inspections_dict(inspections_path: str) -> Dict[str, int]: + inspections_df = pd.read_csv(inspections_path) + inspections_dict = inspections_df.set_index(QodanaColumnName.INSPECTION_ID.value).T.to_dict('list') + for qodana_id, id_list in inspections_dict.items(): + inspections_dict[qodana_id] = id_list[0] + return inspections_dict + + +def replace_inspections_on_its_ids(issues_list: List[QodanaIssue], inspections_dict: Dict[str, int], + to_remove_duplicates: bool) -> str: + if len(issues_list) == 0: + inspections = '0' + else: + problem_id_list = list(map(lambda i: inspections_dict[i.problem_id], issues_list)) + if to_remove_duplicates: + problem_id_list = list(set(problem_id_list)) + problem_id_list.sort() + inspections = ','.join(str(p) for p in problem_id_list) + return inspections + + +def configure_model_converter_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.QODANA_SOLUTIONS_FILE_PATH.value.description) + + parser.add_argument(RunToolArgument.QODANA_INSPECTIONS_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.QODANA_INSPECTIONS_PATH.value.description) + + 
parser.add_argument(RunToolArgument.QODANA_DUPLICATES.value.long_name, + help=RunToolArgument.QODANA_DUPLICATES.value.description, + action='store_true') diff --git a/whitelist.txt b/whitelist.txt index e4dad09b..6269ca26 100644 --- a/whitelist.txt +++ b/whitelist.txt @@ -116,3 +116,4 @@ getuid Popen datasets usecols +linesep