From 224a337cf6d836a07a72e093e74c438c66d32bf9 Mon Sep 17 00:00:00 2001 From: "Anastasiia.Birillo" Date: Wed, 19 May 2021 23:32:45 +0300 Subject: [PATCH 1/2] Add getter for worse public fragments --- src/python/common/tool_arguments.py | 3 + src/python/evaluation/common/pandas_util.py | 6 +- src/python/evaluation/common/util.py | 1 + .../get_incorrect_public_examples.py | 68 +++++++++++++++++++ .../inspectors/print_inspectors_statistics.py | 5 +- 5 files changed, 80 insertions(+), 3 deletions(-) create mode 100644 src/python/evaluation/inspectors/get_incorrect_public_examples.py diff --git a/src/python/common/tool_arguments.py b/src/python/common/tool_arguments.py index b285ac23..d3048051 100644 --- a/src/python/common/tool_arguments.py +++ b/src/python/common/tool_arguments.py @@ -86,3 +86,6 @@ class RunToolArgument(Enum): f'"{ColumnName.LANG.value}" column are: ' f'{LanguageVersion.PYTHON_3.value}, {LanguageVersion.JAVA_8.value}, ' f'{LanguageVersion.JAVA_11.value}, {LanguageVersion.KOTLIN.value}.') + + DIFFS_FILE_PATH = ArgumentsInfo(None, 'diffs_file_path', + 'Path to a file with serialized diffs that were founded by diffs_between_df.py') diff --git a/src/python/evaluation/common/pandas_util.py b/src/python/evaluation/common/pandas_util.py index d87cae23..987ef030 100644 --- a/src/python/evaluation/common/pandas_util.py +++ b/src/python/evaluation/common/pandas_util.py @@ -1,7 +1,7 @@ import json import logging from pathlib import Path -from typing import List, Set, Union +from typing import Any, List, Set, Union import numpy as np import pandas as pd @@ -21,6 +21,10 @@ def filter_df_by_language(df: pd.DataFrame, languages: Set[LanguageVersion], return df.loc[df[column].isin(set(map(lambda l: l.value, languages)))] +def filter_df_by_condition(df: pd.DataFrame, column: str, value: Any) -> pd.DataFrame: + return df.loc[df[column] == value] + + def drop_duplicates(df: pd.DataFrame, column: str = ColumnName.CODE.value) -> pd.DataFrame: return 
df.drop_duplicates(column, keep='last') diff --git a/src/python/evaluation/common/util.py b/src/python/evaluation/common/util.py index b1c501b8..271956f1 100644 --- a/src/python/evaluation/common/util.py +++ b/src/python/evaluation/common/util.py @@ -15,6 +15,7 @@ class ColumnName(Enum): ROW = 'row' OLD = 'old' NEW = 'new' + IS_PUBLIC = 'is_public' @unique diff --git a/src/python/evaluation/inspectors/get_incorrect_public_examples.py b/src/python/evaluation/inspectors/get_incorrect_public_examples.py new file mode 100644 index 00000000..1bb036c5 --- /dev/null +++ b/src/python/evaluation/inspectors/get_incorrect_public_examples.py @@ -0,0 +1,68 @@ +import argparse +from pathlib import Path +from typing import Dict, List + +import pandas as pd +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.pandas_util import filter_df_by_condition, get_solutions_df_by_file_path +from src.python.evaluation.common.util import ColumnName, EvaluationArgument +from src.python.review.common.file_system import deserialize_data_from_file, Extension, get_parent_folder +from src.python.review.inspectors.issue import BaseIssue + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.SOLUTIONS_FILE_PATH.value.description) + + parser.add_argument(RunToolArgument.DIFFS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.DIFFS_FILE_PATH.value.description) + + parser.add_argument('-n', '--n', + help='The N worse fragments will be saved', + type=int, + default=10) + + +def __get_new_inspections(fragment_id_to_issues: Dict[int, List[BaseIssue]], fragment_id: int) -> str: + return ','.join(set(map(lambda i: i.origin_class, fragment_id_to_issues.get(fragment_id, [])))) + + +def 
__get_public_fragments(solutions_df: pd.DataFrame, diffs_dict: dict) -> pd.DataFrame: + # Keep only public solutions + public_fragments = filter_df_by_condition(solutions_df, ColumnName.IS_PUBLIC.value, 'YES') + count_inspections_column = 'count_inspections' + new_inspections_column = 'new_inspections' + + # Get only new inspections and count them + fragment_id_to_issues = diffs_dict[EvaluationArgument.TRACEBACK.value] + public_fragments[new_inspections_column] = public_fragments.apply( + lambda row: __get_new_inspections(fragment_id_to_issues, row[ColumnName.ID.value]), axis=1) + public_fragments[count_inspections_column] = public_fragments.apply( + lambda row: len(row[new_inspections_column].split(',')), axis=1) + + public_fragments = public_fragments.sort_values(count_inspections_column, ascending=False) + # Keep only public columns + return public_fragments[[ColumnName.CODE.value, EvaluationArgument.TRACEBACK.value, new_inspections_column]] + + +# TODO: add readme +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + solutions_file_path = args.solutions_file_path + solutions_df = get_solutions_df_by_file_path(solutions_file_path) + diffs = deserialize_data_from_file(args.diffs_file_path) + + public_fragments = __get_public_fragments(solutions_df, diffs) + + output_path = get_parent_folder(Path(solutions_file_path)) / f'worse_fragments{Extension.CSV.value}' + write_dataframe_to_csv(output_path, public_fragments.head(args.n)) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/inspectors/print_inspectors_statistics.py b/src/python/evaluation/inspectors/print_inspectors_statistics.py index b2846894..8b132a31 100644 --- a/src/python/evaluation/inspectors/print_inspectors_statistics.py +++ b/src/python/evaluation/inspectors/print_inspectors_statistics.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import Dict +from src.python.common.tool_arguments import 
RunToolArgument from src.python.evaluation.common.util import ColumnName, EvaluationArgument from src.python.evaluation.inspectors.common.statistics import IssuesStatistics from src.python.review.common.file_system import deserialize_data_from_file @@ -10,9 +11,9 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: - parser.add_argument('diffs_file_path', + parser.add_argument(RunToolArgument.DIFFS_FILE_PATH.value.long_name, type=lambda value: Path(value).absolute(), - help='Path to a file with serialized diffs that were founded by diffs_between_df.py') + help=RunToolArgument.DIFFS_FILE_PATH.value.description) parser.add_argument('--categorize', help='If True, statistics will be categorized by several categories.', From 6e0a3c4e0b8cde3e2d0bc5f6bcab8afedfe4856b Mon Sep 17 00:00:00 2001 From: "Anastasiia.Birillo" Date: Thu, 20 May 2021 10:01:46 +0300 Subject: [PATCH 2/2] Update readme --- src/python/evaluation/inspectors/README.md | 36 +++++++++++++++++-- ...amples.py => get_worse_public_examples.py} | 0 2 files changed, 34 insertions(+), 2 deletions(-) rename src/python/evaluation/inspectors/{get_incorrect_public_examples.py => get_worse_public_examples.py} (100%) diff --git a/src/python/evaluation/inspectors/README.md b/src/python/evaluation/inspectors/README.md index 938b9727..a0de1314 100644 --- a/src/python/evaluation/inspectors/README.md +++ b/src/python/evaluation/inspectors/README.md @@ -11,8 +11,10 @@ This module contains _preprocessing_ stage and _analysing_ stage. 
`Analysing` stage includes: - [diffs_between_df.py](diffs_between_df.py) allows finding a difference between old and new grades and collect issues that were found in new data -- [print_inspectors_statistics.py](print_inspectors_statistics.py) allows print statistics +- [print_inspectors_statistics.py](print_inspectors_statistics.py) allows printing statistics that were found by [diffs_between_df.py](diffs_between_df.py) +- [get_worse_public_examples.py](get_worse_public_examples.py) allows getting + top N worst public examples from a dataset. The measure is the number of unique new inspections. ___ @@ -200,4 +202,34 @@ ERROR_PRONE: 17 issues, 2363 fragments COMPLEXITY: 17 issues, 13928 fragments COHESION: 1 issues, 3826 fragments ______ -``` \ No newline at end of file +``` + +--- + +### Get worse public examples + +[get_worse_public_examples.py](get_worse_public_examples.py) allows getting + top N worst public examples from a dataset. The measure is the number of unique new inspections. + +#### Usage + +Run the [get_worse_public_examples.py](get_worse_public_examples.py) with the arguments from the command line. + +Required arguments: + +- `solutions_file_path` — path to xlsx-file or csv-file with graded code samples; +- `diffs_file_path` — path to a `pickle` file that was calculated by [diffs_between_df.py](diffs_between_df.py). + +Please note that your `solutions_file_path` file with code fragments should consist of at least 4 obligatory columns: + +- `code`, +- `traceback`, +- `is_public`, +- `id`. + +Optional arguments: +Argument | Description +--- | --- +|**‑n**, **‑‑n**| The N worst fragments will be saved.| + +The resulting file will be stored in the same folder as the `solutions_file_path` input file. 
diff --git a/src/python/evaluation/inspectors/get_incorrect_public_examples.py b/src/python/evaluation/inspectors/get_worse_public_examples.py similarity index 100% rename from src/python/evaluation/inspectors/get_incorrect_public_examples.py rename to src/python/evaluation/inspectors/get_worse_public_examples.py