diff --git a/src/python/evaluation/common/util.py b/src/python/evaluation/common/util.py index 0e1a96eb..cc7cb309 100644 --- a/src/python/evaluation/common/util.py +++ b/src/python/evaluation/common/util.py @@ -21,6 +21,7 @@ class ColumnName(Enum): PENALTY = 'penalty' USER = 'user' HISTORY = 'history' + TIME = 'time' TRACEBACK = 'traceback' diff --git a/src/python/evaluation/evaluation_config.py b/src/python/evaluation/evaluation_config.py index 52cf0f61..b894d529 100644 --- a/src/python/evaluation/evaluation_config.py +++ b/src/python/evaluation/evaluation_config.py @@ -22,6 +22,7 @@ def __init__(self, args: Namespace): self.format: str = args.format self.solutions_file_path: Union[str, Path] = args.solutions_file_path self.traceback: bool = args.traceback + self.with_history: bool = args.with_history self.output_folder_path: Union[str, Path] = args.output_folder_path self.extension: Extension = get_restricted_extension(self.solutions_file_path, [Extension.XLSX, Extension.CSV]) self.__init_output_file_name(args.output_file_name) @@ -32,12 +33,15 @@ def __init_output_file_name(self, output_file_name: Optional[str]): else: self.output_file_name = output_file_name - def build_command(self, inspected_file_path: Union[str, Path], lang: str) -> List[str]: + def build_command(self, inspected_file_path: Union[str, Path], lang: str, history: Optional[str]) -> List[str]: command = [LanguageVersion.PYTHON_3.value, self.tool_path, inspected_file_path, RunToolArgument.FORMAT.value.short_name, self.format] + if self.with_history and history is not None: + command.extend([RunToolArgument.HISTORY.value.long_name, history]) + if lang == LanguageVersion.JAVA_8.value or lang == LanguageVersion.JAVA_11.value: command.extend([RunToolArgument.LANG_VERSION.value.long_name, lang]) return command diff --git a/src/python/evaluation/evaluation_run_tool.py b/src/python/evaluation/evaluation_run_tool.py index 78d59ce6..98c4a9fe 100644 --- a/src/python/evaluation/evaluation_run_tool.py +++ 
b/src/python/evaluation/evaluation_run_tool.py @@ -6,6 +6,7 @@ import time import traceback from pathlib import Path +from typing import Optional sys.path.append('') sys.path.append('../../..') @@ -63,6 +64,12 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: f'Use this argument when {EvaluationArgument.TRACEBACK.value} argument' 'is enabled argument will not be used otherwise.') + parser.add_argument('--with-history', + help=f'If True, then history will be taken into account when calculating the grade. ' + f'In that case, for each fragment, the "{ColumnName.HISTORY.value}" column ' + 'must contain the history of previous errors.', + action='store_true') + def get_language(lang_key: str) -> LanguageVersion: try: @@ -73,14 +80,14 @@ def get_language(lang_key: str) -> LanguageVersion: raise KeyError(e) -def __inspect_row(lang: str, code: str, fragment_id: int, config: EvaluationConfig) -> str: +def __inspect_row(lang: str, code: str, fragment_id: int, history: Optional[str], config: EvaluationConfig) -> str: print(f'current id: {fragment_id}') # Tool does not work correctly with tmp files from module on macOS # thus we create a real file in the file system extension = get_language(lang).extension_by_language().value tmp_file_path = config.solutions_file_path.parent.absolute() / f'inspected_code_{fragment_id}{extension}' temp_file = next(create_file(tmp_file_path, code)) - command = config.build_command(temp_file, lang) + command = config.build_command(temp_file, lang, history) results = run_in_subprocess(command) os.remove(temp_file) return results @@ -103,7 +110,9 @@ def inspect_solutions_df(config: EvaluationConfig, lang_code_dataframe: pd.DataF lang_code_dataframe[ColumnName.TRACEBACK.value] = lang_code_dataframe.parallel_apply( lambda row: __inspect_row(row[ColumnName.LANG.value], row[ColumnName.CODE.value], - row[ColumnName.ID.value], config), axis=1) + row[ColumnName.ID.value], + row.get(ColumnName.HISTORY.value), + config), axis=1) 
lang_code_dataframe[ColumnName.GRADE.value] = lang_code_dataframe.parallel_apply( lambda row: __get_grade_from_traceback(row[ColumnName.TRACEBACK.value]), axis=1) diff --git a/src/python/evaluation/inspectors/README.md b/src/python/evaluation/inspectors/README.md index e32e390f..762602e9 100644 --- a/src/python/evaluation/inspectors/README.md +++ b/src/python/evaluation/inspectors/README.md @@ -7,6 +7,7 @@ This module contains _preprocessing_ stage and _analysing_ stage. the `csv` or `xslx` file with student solutions and drop duplicates of code fragments (optional); - [distribute_grades.py](distribute_grades.py) allows distributing calculated grades and traceback for unique solutions into all solutions. +- [generate_history.py](generate_history.py) allows you to generate history based on issues from previous solutions. `Analysing` stage includes: - [diffs_between_df.py](diffs_between_df.py) allows finding a difference between @@ -80,6 +81,39 @@ Required arguments: The resulting file will be stored in the same folder as the input file with all samples. +---- + +### Generate history + +[generate_history.py](generate_history.py) allows you to generate history based on issues from previous solutions. + +Please note that your solutions file must contain at least 4 mandatory columns: + +- `user`, +- `lang`, +- `time`, +- `traceback`. + +You can get such a file with [evaluation_run_tool.py](../evaluation_run_tool.py). + +The output file is a new `xlsx` or `csv` file (the same format as the input file) with all columns from the input +and the generated `history` column. The `traceback` and `grade` columns are kept by default and can be dropped with the optional arguments below. + +#### Usage + +Run [generate_history.py](generate_history.py) with the arguments from the command line.
+ +Required argument: + +- `solutions_file_path` — path to xlsx-file or csv-file with the necessary columns. + +Optional arguments: +Argument | Description +--- | --- +|**‑o**, **‑‑output‑path**| The path where the dataset with history will be saved. If not specified, the dataset will be saved next to the original one. | +|**‑‑to‑drop‑traceback**| The `traceback` column will be removed from the final dataset. Default is false. | +|**‑‑to‑drop‑grade**| The `grade` column will be removed from the final dataset. Default is false.| + ___ ## Analysing diff --git a/src/python/evaluation/inspectors/generate_history.py b/src/python/evaluation/inspectors/generate_history.py new file mode 100644 index 00000000..d330dfe2 --- /dev/null +++ b/src/python/evaluation/inspectors/generate_history.py @@ -0,0 +1,131 @@ +import argparse +import json +from collections import Counter +from pathlib import Path + +import pandas as pd +from pandarallel import pandarallel +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.common.pandas_util import ( + get_issues_from_json, + get_solutions_df_by_file_path, + write_df_to_file, +) +from src.python.evaluation.common.util import ColumnName, EvaluationArgument +from src.python.evaluation.evaluation_run_tool import get_language +from src.python.review.common.file_system import ( + Extension, + get_name_from_path, + get_parent_folder, + get_restricted_extension, +) +from src.python.review.common.language import Language + +TRACEBACK = EvaluationArgument.TRACEBACK.value +GRADE = ColumnName.GRADE.value +HISTORY = ColumnName.HISTORY.value +USER = ColumnName.USER.value +LANG = ColumnName.LANG.value +TIME = ColumnName.TIME.value +EXTRACTED_ISSUES = 'extracted_issues' + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=f'Path to csv or xlsx file.
Your dataset must include column-names: ' + f'"{USER}", "{LANG}", "{TIME}", "{TRACEBACK}".', + ) + + parser.add_argument( + '-o', '--output-path', + type=lambda value: Path(value).absolute(), + help='The path where the dataset with history will be saved. ' + 'If not specified, the dataset will be saved next to the original one.', + ) + + parser.add_argument( + '--to-drop-traceback', + help=f'The "{TRACEBACK}" column will be removed from the final dataset.', + action='store_true', + ) + + parser.add_argument( + '--to-drop-grade', + help=f'The "{GRADE}" column will be removed from the final dataset.', + action='store_true', + ) + + +def _update_counter(extracted_issues: str, counter: Counter) -> None: + issue_classes = [] + if extracted_issues: + issue_classes = extracted_issues.split(',') + + counter.update(issue_classes) + + +def _add_history(row, solutions_df: pd.DataFrame) -> str: + counter = Counter() + + filtered_df = solutions_df[ + (solutions_df[USER] == row[USER]) & (solutions_df[LANG] == row[LANG]) & (solutions_df[TIME] < row[TIME]) + ] + filtered_df.apply(lambda row: _update_counter(row[EXTRACTED_ISSUES], counter), axis=1) + + history = {} + + # If we were unable to identify the language version, we return an empty history + try: + lang_version = get_language(row[LANG]) + except KeyError: + return json.dumps(history) + + lang = Language.from_language_version(lang_version) + if len(counter) != 0: + history = {lang.value.lower(): [{'origin_class': key, 'number': value} for key, value in counter.items()]} + + return json.dumps(history) + + +def _extract_issues(traceback: str) -> str: + issues = get_issues_from_json(traceback) + issue_classes = [issue.origin_class for issue in issues] + return ','.join(issue_classes) + + +def main(): + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + pandarallel.initialize() + + solutions_file_path = args.solutions_file_path + solutions_df =
get_solutions_df_by_file_path(solutions_file_path) + solutions_df[EXTRACTED_ISSUES] = solutions_df.parallel_apply(lambda row: _extract_issues(row[TRACEBACK]), axis=1) + solutions_df[HISTORY] = solutions_df.parallel_apply(_add_history, axis=1, args=(solutions_df,)) + + columns_to_drop = [EXTRACTED_ISSUES] + + if args.to_drop_grade: + columns_to_drop.append(GRADE) + + if args.to_drop_traceback: + columns_to_drop.append(TRACEBACK) + + solutions_df.drop(columns=columns_to_drop, inplace=True, errors='ignore') + + output_ext = get_restricted_extension(solutions_file_path, [Extension.XLSX, Extension.CSV]) + output_path = args.output_path + if output_path is None: + output_dir = get_parent_folder(solutions_file_path) + dataset_name = get_name_from_path(solutions_file_path, with_extension=False) + output_path = output_dir / f'{dataset_name}_with_history{output_ext.value}' + + write_df_to_file(solutions_df, output_path, output_ext) + + +if __name__ == '__main__': + main() diff --git a/src/python/review/common/language.py b/src/python/review/common/language.py index 6dc5728d..130d2581 100644 --- a/src/python/review/common/language.py +++ b/src/python/review/common/language.py @@ -2,6 +2,7 @@ from pathlib import Path from typing import List +from src.python.review.application_config import LanguageVersion from src.python.review.common.file_system import Extension, get_extension_from_file @@ -13,6 +14,19 @@ class Language(Enum): JS = 'JAVASCRIPT' UNKNOWN = 'UNKNOWN' + @staticmethod + def from_language_version(language_version: LanguageVersion) -> 'Language': + version_to_lang = { + LanguageVersion.PYTHON_3: Language.PYTHON, + LanguageVersion.JAVA_7: Language.JAVA, + LanguageVersion.JAVA_8: Language.JAVA, + LanguageVersion.JAVA_9: Language.JAVA, + LanguageVersion.JAVA_11: Language.JAVA, + LanguageVersion.KOTLIN: Language.KOTLIN, + } + + return version_to_lang.get(language_version, Language.UNKNOWN) + @classmethod def values(cls) -> List[str]: return [member.value for member
in Language] diff --git a/src/python/review/quality/penalty.py b/src/python/review/quality/penalty.py index 229cdd73..b1334682 100644 --- a/src/python/review/quality/penalty.py +++ b/src/python/review/quality/penalty.py @@ -58,7 +58,7 @@ def get_previous_issues_by_language(lang_to_history: Optional[str], language: La return [] language_to_history = json.loads(lang_to_history) - history = language_to_history[language.value.lower()] + history = language_to_history.get(language.value.lower(), []) previous_issues = [] for issue_data in history: diff --git a/test/python/common/file_system/test_subprocess.py b/test/python/common/file_system/test_subprocess.py index 1f2104dd..c3a60ffd 100644 --- a/test/python/common/file_system/test_subprocess.py +++ b/test/python/common/file_system/test_subprocess.py @@ -2,6 +2,7 @@ from pathlib import Path from test.python.common import FILE_SYSTEM_DATA_FOLDER from test.python.evaluation.testing_config import get_testing_arguments +from typing import Optional import pytest from src.python.evaluation.evaluation_config import EvaluationConfig @@ -15,8 +16,8 @@ ] -def inspect_code(config: EvaluationConfig, file: str, language: LanguageVersion) -> str: - command = config.build_command(file, language.value) +def inspect_code(config: EvaluationConfig, file: str, language: LanguageVersion, history: Optional[str] = None) -> str: + command = config.build_command(file, language.value, history) return run_in_subprocess(command) diff --git a/test/python/evaluation/testing_config.py b/test/python/evaluation/testing_config.py index 8d144534..f4bb9203 100644 --- a/test/python/evaluation/testing_config.py +++ b/test/python/evaluation/testing_config.py @@ -5,16 +5,20 @@ from src.python.review.reviewers.perform_review import OutputFormat -def get_testing_arguments(to_add_traceback=None, to_add_tool_path=None) -> Namespace: +def get_testing_arguments(to_add_traceback=None, to_add_tool_path=None, to_add_history=None) -> Namespace: testing_arguments = 
Namespace(format=OutputFormat.JSON.value, output_file_name=EvaluationArgument.RESULT_FILE_NAME_XLSX.value, - output_folder_path=None) + output_folder_path=None, + with_history=False) if to_add_traceback: testing_arguments.traceback = True if to_add_tool_path: testing_arguments.tool_path = MAIN_FOLDER.parent / 'review/run_tool.py' + if to_add_history: + testing_arguments.with_history = True + testing_arguments.solutions_file_path = None return testing_arguments diff --git a/whitelist.txt b/whitelist.txt index c2d37a8c..464ba1cd 100644 --- a/whitelist.txt +++ b/whitelist.txt @@ -159,4 +159,4 @@ Measurer ndarray Runtime matcher -pathlib \ No newline at end of file +pathlib