3 changes: 3 additions & 0 deletions src/python/common/tool_arguments.py
@@ -86,3 +86,6 @@ class RunToolArgument(Enum):
f'"{ColumnName.LANG.value}" column are: '
f'{LanguageVersion.PYTHON_3.value}, {LanguageVersion.JAVA_8.value}, '
f'{LanguageVersion.JAVA_11.value}, {LanguageVersion.KOTLIN.value}.')

DIFFS_FILE_PATH = ArgumentsInfo(None, 'diffs_file_path',
                                'Path to a file with serialized diffs that were found by diffs_between_df.py')
6 changes: 5 additions & 1 deletion src/python/evaluation/common/pandas_util.py
@@ -1,7 +1,7 @@
import json
import logging
from pathlib import Path
from typing import List, Set, Union
from typing import Any, List, Set, Union

import numpy as np
import pandas as pd
@@ -21,6 +21,10 @@ def filter_df_by_language(df: pd.DataFrame, languages: Set[LanguageVersion],
return df.loc[df[column].isin(set(map(lambda l: l.value, languages)))]


def filter_df_by_condition(df: pd.DataFrame, column: str, value: Any) -> pd.DataFrame:
return df.loc[df[column] == value]


def drop_duplicates(df: pd.DataFrame, column: str = ColumnName.CODE.value) -> pd.DataFrame:
return df.drop_duplicates(column, keep='last')

1 change: 1 addition & 0 deletions src/python/evaluation/common/util.py
@@ -15,6 +15,7 @@ class ColumnName(Enum):
ROW = 'row'
OLD = 'old'
NEW = 'new'
IS_PUBLIC = 'is_public'


@unique
36 changes: 34 additions & 2 deletions src/python/evaluation/inspectors/README.md
@@ -11,8 +11,10 @@ This module contains _preprocessing_ stage and _analysing_ stage.
`Analysing` stage includes:
- [diffs_between_df.py](diffs_between_df.py) allows finding a difference between
old and new grades and collecting issues that were found in the new data
- [print_inspectors_statistics.py](print_inspectors_statistics.py) allows print statistics
- [print_inspectors_statistics.py](print_inspectors_statistics.py) allows printing statistics
that were found by [diffs_between_df.py](diffs_between_df.py)
- [get_worse_public_examples.py](get_worse_public_examples.py) allows getting
the top N worst public examples from a dataset. The measure is the number of unique new inspections.

___

@@ -200,4 +202,34 @@ ERROR_PRONE: 17 issues, 2363 fragments
COMPLEXITY: 17 issues, 13928 fragments
COHESION: 1 issues, 3826 fragments
______
```
```

---

### Get worse public examples

[get_worse_public_examples.py](get_worse_public_examples.py) allows getting
the top N worst public examples from a dataset. The measure is the number of unique new inspections.

#### Usage

Run [get_worse_public_examples.py](get_worse_public_examples.py) with the arguments from the command line.

Required arguments:

- `solutions_file_path` — path to xlsx-file or csv-file with graded code samples;
- `diffs_file_path` — path to a `pickle` file that was produced by [diffs_between_df.py](diffs_between_df.py).

Please note that your `solutions_file_path` file with code fragments should contain at least the following obligatory columns:

- `code`,
- `traceback`,
- `is_public`,
- `id`.

Optional arguments:
Argument | Description
--- | ---
**‑n**, **‑‑n** | The N worst fragments will be saved. The default value is `10`.

The resulting file will be stored in the same folder as the `solutions_file_path` input file.
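
A minimal invocation sketch (assuming the script is run as a module from the project root; `solutions.csv` and `diffs.pickle` are placeholder paths, not files from this repository):

```
python -m src.python.evaluation.inspectors.get_worse_public_examples solutions.csv diffs.pickle -n 5
```

The command above would save the 5 worst public fragments to a `worse_fragments` CSV file next to `solutions.csv`.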
68 changes: 68 additions & 0 deletions src/python/evaluation/inspectors/get_worse_public_examples.py
@@ -0,0 +1,68 @@
import argparse
from pathlib import Path
from typing import Dict, List

import pandas as pd
from src.python.common.tool_arguments import RunToolArgument
from src.python.evaluation.common.csv_util import write_dataframe_to_csv
from src.python.evaluation.common.pandas_util import filter_df_by_condition, get_solutions_df_by_file_path
from src.python.evaluation.common.util import ColumnName, EvaluationArgument
from src.python.review.common.file_system import deserialize_data_from_file, Extension, get_parent_folder
from src.python.review.inspectors.issue import BaseIssue


def configure_arguments(parser: argparse.ArgumentParser) -> None:
parser.add_argument(RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name,
type=lambda value: Path(value).absolute(),
help=RunToolArgument.SOLUTIONS_FILE_PATH.value.description)

parser.add_argument(RunToolArgument.DIFFS_FILE_PATH.value.long_name,
type=lambda value: Path(value).absolute(),
help=RunToolArgument.DIFFS_FILE_PATH.value.description)

parser.add_argument('-n', '--n',
                        help='The N worst fragments will be saved',
type=int,
default=10)


def __get_new_inspections(fragment_id_to_issues: Dict[int, List[BaseIssue]], fragment_id: int) -> str:
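    # Join the unique classes of the new issues found in this fragment into a comma-separated string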
return ','.join(set(map(lambda i: i.origin_class, fragment_id_to_issues.get(fragment_id, []))))


def __get_public_fragments(solutions_df: pd.DataFrame, diffs_dict: dict) -> pd.DataFrame:
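    # Build a table of public fragments sorted by the number of unique new inspections found in them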
    # Keep only public solutions; copy so that adding columns below does not mutate a slice of the original dataframe
    public_fragments = filter_df_by_condition(solutions_df, ColumnName.IS_PUBLIC.value, 'YES').copy()
count_inspections_column = 'count_inspections'
new_inspections_column = 'new_inspections'

# Get only new inspections and count them
fragment_id_to_issues = diffs_dict[EvaluationArgument.TRACEBACK.value]
public_fragments[new_inspections_column] = public_fragments.apply(
lambda row: __get_new_inspections(fragment_id_to_issues, row[ColumnName.ID.value]), axis=1)
    public_fragments[count_inspections_column] = public_fragments.apply(
        lambda row: len(row[new_inspections_column].split(',')) if row[new_inspections_column] else 0, axis=1)

public_fragments = public_fragments.sort_values(count_inspections_column, ascending=False)
# Keep only public columns
return public_fragments[[ColumnName.CODE.value, EvaluationArgument.TRACEBACK.value, new_inspections_column]]


# TODO: add readme
def main() -> None:
parser = argparse.ArgumentParser()
configure_arguments(parser)
args = parser.parse_args()

solutions_file_path = args.solutions_file_path
solutions_df = get_solutions_df_by_file_path(solutions_file_path)
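    # The diffs file stores, under the traceback key, a mapping from fragment id to the list of its new issues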
diffs = deserialize_data_from_file(args.diffs_file_path)

public_fragments = __get_public_fragments(solutions_df, diffs)

output_path = get_parent_folder(Path(solutions_file_path)) / f'worse_fragments{Extension.CSV.value}'
write_dataframe_to_csv(output_path, public_fragments.head(args.n))


if __name__ == '__main__':
main()
src/python/evaluation/inspectors/print_inspectors_statistics.py
@@ -3,16 +3,17 @@
from pathlib import Path
from typing import Dict

from src.python.common.tool_arguments import RunToolArgument
from src.python.evaluation.common.util import ColumnName, EvaluationArgument
from src.python.evaluation.inspectors.common.statistics import IssuesStatistics
from src.python.review.common.file_system import deserialize_data_from_file
from src.python.review.inspectors.issue import ShortIssue


def configure_arguments(parser: argparse.ArgumentParser) -> None:
parser.add_argument('diffs_file_path',
parser.add_argument(RunToolArgument.DIFFS_FILE_PATH.value.long_name,
type=lambda value: Path(value).absolute(),
help='Path to a file with serialized diffs that were founded by diffs_between_df.py')
help=RunToolArgument.DIFFS_FILE_PATH.value.description)

parser.add_argument('--categorize',
help='If True, statistics will be categorized by several categories.',