diff --git a/VERSION.md b/VERSION.md index 3eefcb9d..26aaba0e 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1 +1 @@ -1.0.0 +1.2.0 diff --git a/requirements-evaluation.txt b/requirements-evaluation.txt index 11910373..8df291f4 100644 --- a/requirements-evaluation.txt +++ b/requirements-evaluation.txt @@ -1,2 +1,3 @@ openpyxl==3.0.7 -pandas==1.2.3 \ No newline at end of file +pandas==1.2.3 +pandarallel \ No newline at end of file diff --git a/src/python/common/tool_arguments.py b/src/python/common/tool_arguments.py index 5038653b..d3048051 100644 --- a/src/python/common/tool_arguments.py +++ b/src/python/common/tool_arguments.py @@ -2,6 +2,7 @@ from enum import Enum, unique from typing import List, Optional +from src.python.evaluation.common.util import ColumnName from src.python.review.application_config import LanguageVersion from src.python.review.inspectors.inspector_type import InspectorType @@ -76,3 +77,15 @@ class RunToolArgument(Enum): HISTORY = ArgumentsInfo(None, '--history', 'Json string, which contains lists of issues in the previous submissions ' 'for other tasks for one user.') + + SOLUTIONS_FILE_PATH = ArgumentsInfo(None, 'solutions_file_path', + 'Local XLSX-file or CSV-file path. ' + 'Your file must include column-names: ' + f'"{ColumnName.CODE.value}" and ' + f'"{ColumnName.LANG.value}". Acceptable values for ' + f'"{ColumnName.LANG.value}" column are: ' + f'{LanguageVersion.PYTHON_3.value}, {LanguageVersion.JAVA_8.value}, ' + f'{LanguageVersion.JAVA_11.value}, {LanguageVersion.KOTLIN.value}.') + + DIFFS_FILE_PATH = ArgumentsInfo(None, 'diffs_file_path', + 'Path to a file with serialized diffs that were founded by diffs_between_df.py') diff --git a/src/python/evaluation/README.md b/src/python/evaluation/README.md index 67e1d45e..5aa4bdf7 100644 --- a/src/python/evaluation/README.md +++ b/src/python/evaluation/README.md @@ -1,31 +1,35 @@ # Hyperstyle evaluation -This tool allows running the `Hyperstyle` tool on an xlsx table to get code quality for all code fragments. Please, note that your input file should consist of at least 2 obligatory columns to run xlsx-tool on its code fragments: +This tool allows running the `Hyperstyle` tool on a `xlsx` or `csv` table to get code quality for all code fragments. +Please, note that your input file should consist of at least 2 obligatory columns to run the tool on its code fragments: - `code` - `lang` Possible values for column `lang` are: `python3`, `kotlin`, `java8`, `java11`. -Output file is a new `xlsx` file with 3 columns: -- `code` -- `lang` +Output file is a new `xlsx` or `csv` file with the all columns from the input file and two additional ones: - `grade` -Grade assessment is conducted by [`run_tool.py`](https://github.com/hyperskill/hyperstyle/blob/main/README.md) with default arguments. Avaliable values for column `grade` are: BAD, MODERATE, GOOD, EXCELLENT. It is also possible add fourth column: `traceback` to get full inspectors feedback on each code fragment. More details on enabling traceback column in **Optional Arguments** table. +- `traceback` (optional) + +Grade assessment is conducted by [`run_tool.py`](https://github.com/hyperskill/hyperstyle/blob/main/README.md) with default arguments. + Avaliable values for column `grade` are: BAD, MODERATE, GOOD, EXCELLENT. + `traceback` column stores full inspectors feedback on each code fragment. + More details on enabling traceback column in **Optional Arguments** table. ## Usage -Run the [xlsx_run_tool.py](xlsx_run_tool.py) with the arguments from command line. 
+Run the [evaluation_run_tool.py](evaluation_run_tool.py) with the arguments from command line. Required arguments: -`xlsx_file_path` — path to xlsx-file with code samples to inspect. +`solutions_file_path` — path to xlsx-file or csv-file with code samples to inspect. Optional arguments: Argument | Description --- | --- |**‑f**, **‑‑format**| The output format. Available values: `json`, `text`. The default value is `json` . Use this argument when `traceback` is enabled, otherwise it will not be used.| |**‑tp**, **‑‑tool_path**| Path to run-tool. Default is `src/python/review/run_tool.py` .| -|**‑tr**, **‑‑traceback**| To include a column with errors traceback into an output file. Default is `False`.| -|**‑ofp**, **‑‑output_folder_path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file sent for inspection. | -|**‑ofn**, **‑‑output_file_name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx`.| +|**‑‑traceback**| To include a column with errors traceback into an output file. Default is `False`.| +|**‑ofp**, **‑‑output_folder_path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. | +|**‑ofn**, **‑‑output_file_name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.| diff --git a/src/python/evaluation/common/csv_util.py b/src/python/evaluation/common/csv_util.py new file mode 100644 index 00000000..c2956e57 --- /dev/null +++ b/src/python/evaluation/common/csv_util.py @@ -0,0 +1,13 @@ +from pathlib import Path +from typing import Union + +import pandas as pd +from src.python.review.common.file_system import Encoding + + +def write_dataframe_to_csv(csv_file_path: Union[str, Path], df: pd.DataFrame) -> None: + # Get error with this encoding=ENCODING on several fragments. 
So change it then to 'utf8' + try: + df.to_csv(csv_file_path, encoding=Encoding.ISO_ENCODING.value, index=False) + except UnicodeEncodeError: + df.to_csv(csv_file_path, encoding=Encoding.UTF_ENCODING.value, index=False) diff --git a/src/python/evaluation/common/pandas_util.py b/src/python/evaluation/common/pandas_util.py new file mode 100644 index 00000000..987ef030 --- /dev/null +++ b/src/python/evaluation/common/pandas_util.py @@ -0,0 +1,103 @@ +import json +import logging +from pathlib import Path +from typing import Any, List, Set, Union + +import numpy as np +import pandas as pd +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.util import ColumnName, EvaluationArgument +from src.python.evaluation.common.xlsx_util import create_workbook, remove_sheet, write_dataframe_to_xlsx_sheet +from src.python.review.application_config import LanguageVersion +from src.python.review.common.file_system import Extension, get_restricted_extension +from src.python.review.inspectors.issue import BaseIssue +from src.python.review.reviewers.utils.print_review import convert_json_to_issues + +logger = logging.getLogger(__name__) + + +def filter_df_by_language(df: pd.DataFrame, languages: Set[LanguageVersion], + column: str = ColumnName.LANG.value) -> pd.DataFrame: + return df.loc[df[column].isin(set(map(lambda l: l.value, languages)))] + + +def filter_df_by_condition(df: pd.DataFrame, column: str, value: Any) -> pd.DataFrame: + return df.loc[df[column] == value] + + +def drop_duplicates(df: pd.DataFrame, column: str = ColumnName.CODE.value) -> pd.DataFrame: + return df.drop_duplicates(column, keep='last') + + +# Find all rows and columns where two dataframes are inconsistent. +# For example: +# row | column | +# ------------------------- +# 3 | column_1 | True +# | column_2 | True +# ------------------------- +# 4 | column_1 | True +# | column_2 | True +# means first and second dataframes have different values +# in column_1 and in column_2 in 3-th and 4-th rows +def get_inconsistent_positions(first: pd.DataFrame, second: pd.DataFrame) -> pd.DataFrame: + ne_stacked = (first != second).stack() + inconsistent_positions = ne_stacked[ne_stacked] + inconsistent_positions.index.names = [ColumnName.ROW.value, ColumnName.COLUMN.value] + return inconsistent_positions + + +# Create a new dataframe with all items that are different. 
+# For example: +# | old | new +# --------------------------------- +# row column | | +# 3 grade | EXCELLENT | MODERATE +# 4 grade | EXCELLENT | BAD +def get_diffs(first: pd.DataFrame, second: pd.DataFrame) -> pd.DataFrame: + changed = get_inconsistent_positions(first, second) + + difference_locations = np.where(first != second) + changed_from = first.values[difference_locations] + changed_to = second.values[difference_locations] + return pd.DataFrame({ + ColumnName.OLD.value: changed_from, + ColumnName.NEW.value: changed_to}, + index=changed.index) + + +def get_solutions_df(ext: Extension, file_path: Union[str, Path]) -> pd.DataFrame: + try: + if ext == Extension.XLSX: + lang_code_dataframe = pd.read_excel(file_path) + else: + lang_code_dataframe = pd.read_csv(file_path) + except FileNotFoundError as e: + logger.error('XLSX-file or CSV-file with the specified name does not exists.') + raise e + + return lang_code_dataframe + + +def get_solutions_df_by_file_path(path: Path) -> pd.DataFrame: + ext = get_restricted_extension(path, [Extension.XLSX, Extension.CSV]) + return get_solutions_df(ext, path) + + +def write_df_to_file(df: pd.DataFrame, output_file_path: Path, extension: Extension) -> None: + if extension == Extension.CSV: + write_dataframe_to_csv(output_file_path, df) + elif extension == Extension.XLSX: + create_workbook(output_file_path) + write_dataframe_to_xlsx_sheet(output_file_path, df, 'inspection_results') + # remove empty sheet that was initially created with the workbook + remove_sheet(output_file_path, 'Sheet') + + +def get_issues_from_json(str_json: str) -> List[BaseIssue]: + parsed_json = json.loads(str_json)['issues'] + return convert_json_to_issues(parsed_json) + + +def get_issues_by_row(df: pd.DataFrame, row: int) -> List[BaseIssue]: + return get_issues_from_json(df.iloc[row][EvaluationArgument.TRACEBACK.value]) diff --git a/src/python/evaluation/common/util.py b/src/python/evaluation/common/util.py index c306d3b7..271956f1 100644 --- a/src/python/evaluation/common/util.py +++ b/src/python/evaluation/common/util.py @@ -6,29 +6,36 @@ @unique class ColumnName(Enum): - CODE = "code" - LANG = "lang" - LANGUAGE = "language" - GRADE = "grade" + CODE = 'code' + LANG = 'lang' + LANGUAGE = 'language' + GRADE = 'grade' + ID = 'id' + COLUMN = 'column' + ROW = 'row' + OLD = 'old' + NEW = 'new' + IS_PUBLIC = 'is_public' @unique class EvaluationArgument(Enum): - TRACEBACK = "traceback" - RESULT_FILE_NAME = "results" - RESULT_FILE_NAME_EXT = f"{RESULT_FILE_NAME}{Extension.XLSX.value}" + TRACEBACK = 'traceback' + RESULT_FILE_NAME = 'evaluation_results' + RESULT_FILE_NAME_XLSX = f'{RESULT_FILE_NAME}{Extension.XLSX.value}' + RESULT_FILE_NAME_CSV = f'{RESULT_FILE_NAME}{Extension.CSV.value}' -script_structure_rule = ("Please, make sure your XLSX-file matches following script standards: \n" - "1. Your XLSX-file should have 2 obligatory columns named:" - f"'{ColumnName.CODE.value}' & '{ColumnName.LANG.value}'. \n" - f"'{ColumnName.CODE.value}' column -- relates to the code-sample. \n" - f"'{ColumnName.LANG.value}' column -- relates to the language of a " - "particular code-sample. \n" - "2. Your code samples should belong to the one of the supported languages. \n" - "Supported languages are: Java, Kotlin, Python. \n" - f"3. 
Check that '{ColumnName.LANG.value}' column cells are filled with " - "acceptable language-names: \n" - f"Acceptable language-names are: {LanguageVersion.PYTHON_3.value}, " - f"{LanguageVersion.JAVA_8.value} ," - f"{LanguageVersion.JAVA_11.value} and {LanguageVersion.KOTLIN.value}.") +script_structure_rule = ('Please, make sure your XLSX-file matches following script standards: \n' + '1. Your XLSX-file or CSV-file should have 2 obligatory columns named:' + f'"{ColumnName.CODE.value}" & "{ColumnName.LANG.value}". \n' + f'"{ColumnName.CODE.value}" column -- relates to the code-sample. \n' + f'"{ColumnName.LANG.value}" column -- relates to the language of a ' + 'particular code-sample. \n' + '2. Your code samples should belong to the one of the supported languages. \n' + 'Supported languages are: Java, Kotlin, Python. \n' + f'3. Check that "{ColumnName.LANG.value}" column cells are filled with ' + 'acceptable language-names: \n' + f'Acceptable language-names are: {LanguageVersion.PYTHON_3.value}, ' + f'{LanguageVersion.JAVA_8.value} ,' + f'{LanguageVersion.JAVA_11.value} and {LanguageVersion.KOTLIN.value}.') diff --git a/src/python/evaluation/common/xlsx_util.py b/src/python/evaluation/common/xlsx_util.py index 032a5ce6..e4a3dcf4 100644 --- a/src/python/evaluation/common/xlsx_util.py +++ b/src/python/evaluation/common/xlsx_util.py @@ -4,7 +4,6 @@ import pandas as pd from openpyxl import load_workbook, Workbook -from src.python.evaluation.evaluation_config import EvaluationConfig logger = logging.getLogger(__name__) @@ -24,11 +23,10 @@ def remove_sheet(workbook_path: Union[str, Path], sheet_name: str, to_raise_erro logger.info(message) -def create_and_get_workbook_path(config: EvaluationConfig) -> Path: +def create_workbook(output_file_path: Path) -> Workbook: workbook = Workbook() - workbook_path = config.get_output_file_path() - workbook.save(workbook_path) - return workbook_path + workbook.save(output_file_path) + return workbook def write_dataframe_to_xlsx_sheet(xlsx_file_path: Union[str, Path], df: pd.DataFrame, sheet_name: str, diff --git a/src/python/evaluation/evaluation_config.py b/src/python/evaluation/evaluation_config.py index 5cee71dc..a987fa8b 100644 --- a/src/python/evaluation/evaluation_config.py +++ b/src/python/evaluation/evaluation_config.py @@ -1,12 +1,17 @@ import logging.config from argparse import Namespace from pathlib import Path -from typing import List, Union +from typing import List, Optional, Union from src.python.common.tool_arguments import RunToolArgument from src.python.evaluation.common.util import EvaluationArgument from src.python.review.application_config import LanguageVersion -from src.python.review.common.file_system import create_directory +from src.python.review.common.file_system import ( + create_directory, + Extension, + get_parent_folder, + get_restricted_extension, +) logger = logging.getLogger(__name__) @@ -15,10 +20,17 @@ class EvaluationConfig: def __init__(self, args: Namespace): self.tool_path: Union[str, Path] = args.tool_path self.output_format: str = args.format - self.xlsx_file_path: Union[str, Path] = args.xlsx_file_path + self.solutions_file_path: Union[str, Path] = args.solutions_file_path self.traceback: bool = args.traceback self.output_folder_path: Union[str, Path] = args.output_folder_path - self.output_file_name: str = args.output_file_name + self.extension: Extension = get_restricted_extension(self.solutions_file_path, [Extension.XLSX, Extension.CSV]) + self.__init_output_file_name(args.output_file_name) + + def 
__init_output_file_name(self, output_file_name: Optional[str]): + if output_file_name is None: + self.output_file_name = f'{EvaluationArgument.RESULT_FILE_NAME.value}{self.extension.value}' + else: + self.output_file_name = output_file_name def build_command(self, inspected_file_path: Union[str, Path], lang: str) -> List[str]: command = [LanguageVersion.PYTHON_3.value, @@ -33,11 +45,9 @@ def build_command(self, inspected_file_path: Union[str, Path], lang: str) -> Lis def get_output_file_path(self) -> Path: if self.output_folder_path is None: try: - self.output_folder_path = ( - Path(self.xlsx_file_path).parent.parent / EvaluationArgument.RESULT_FILE_NAME.value - ) + self.output_folder_path = get_parent_folder(Path(self.solutions_file_path)) create_directory(self.output_folder_path) except FileNotFoundError as e: - logger.error('XLSX-file with the specified name does not exists.') + logger.error('XLSX-file or CSV-file with the specified name does not exists.') raise e return Path(self.output_folder_path) / self.output_file_name diff --git a/src/python/evaluation/evaluation_run_tool.py b/src/python/evaluation/evaluation_run_tool.py new file mode 100644 index 00000000..4a8d5029 --- /dev/null +++ b/src/python/evaluation/evaluation_run_tool.py @@ -0,0 +1,156 @@ +import argparse +import logging.config +import os +import re +import sys +import time +import traceback +from pathlib import Path + +sys.path.append('') +sys.path.append('../../..') + +import pandas as pd +from pandarallel import pandarallel +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.common.pandas_util import get_solutions_df, write_df_to_file +from src.python.evaluation.common.util import ColumnName, EvaluationArgument, script_structure_rule +from src.python.evaluation.evaluation_config import EvaluationConfig +from src.python.review.application_config import LanguageVersion +from src.python.review.common.file_system import create_file +from src.python.review.common.subprocess_runner import run_in_subprocess +from src.python.review.reviewers.perform_review import OutputFormat + +logger = logging.getLogger(__name__) + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.SOLUTIONS_FILE_PATH.value.description) + + parser.add_argument('-tp', '--tool-path', + default=Path(f'{os.path.dirname(os.path.abspath(__file__))}/../review/run_tool.py'), + type=lambda value: Path(value).absolute(), + help='Path to script to run on files.') + + parser.add_argument('--traceback', + help='If True, column with the full inspector feedback will be added ' + 'to the output file with results.', + action='store_true') + + parser.add_argument('-ofp', '--output-folder-path', + help='An absolute path to the folder where file with evaluation results' + 'will be stored.' + 'Default is the path to a directory, where is the folder with xlsx or csv file.', + # if None default path will be specified based on solutions_file_path. + default=None, + type=str) + + parser.add_argument('-ofn', '--output-file-name', + help='Filename for that will be created to store inspection results.' 
+ f'Default is "{EvaluationArgument.RESULT_FILE_NAME.value}" ' + f'with the same extension as the input file has', + default=None, + type=str) + + parser.add_argument(RunToolArgument.FORMAT.value.short_name, + RunToolArgument.FORMAT.value.long_name, + default=OutputFormat.JSON.value, + choices=OutputFormat.values(), + type=str, + help=f'{RunToolArgument.FORMAT.value.description}' + f'Use this argument when {EvaluationArgument.TRACEBACK.value} argument' + 'is enabled argument will not be used otherwise.') + + +def get_language(lang_key: str) -> LanguageVersion: + try: + return LanguageVersion(lang_key) + except ValueError as e: + logger.error(script_structure_rule) + # We should raise KeyError since it is incorrect value for key in a column + raise KeyError(e) + + +def __inspect_row(lang: str, code: str, fragment_id: int, config: EvaluationConfig) -> str: + print(f'current id: {fragment_id}') + # Tool does not work correctly with tmp files from module on macOS + # thus we create a real file in the file system + extension = get_language(lang).extension_by_language().value + tmp_file_path = config.solutions_file_path.parent.absolute() / f'inspected_code_{fragment_id}{extension}' + temp_file = next(create_file(tmp_file_path, code)) + command = config.build_command(temp_file, lang) + results = run_in_subprocess(command) + os.remove(temp_file) + return results + + +def __get_grade_from_traceback(traceback: str) -> str: + # this regular expression matches final tool grade: EXCELLENT, GOOD, MODERATE or BAD + return re.match(r'^.*{"code":\s"([A-Z]+)"', traceback).group(1) + + +# TODO: calculate grade after it +def inspect_solutions_df(config: EvaluationConfig, lang_code_dataframe: pd.DataFrame) -> pd.DataFrame: + report = pd.DataFrame(columns=lang_code_dataframe.columns) + report[EvaluationArgument.TRACEBACK.value] = [] + + pandarallel.initialize() + if config.traceback: + report[EvaluationArgument.TRACEBACK.value] = [] + try: + lang_code_dataframe[EvaluationArgument.TRACEBACK.value] = lang_code_dataframe.parallel_apply( + lambda row: __inspect_row(row[ColumnName.LANG.value], + row[ColumnName.CODE.value], + row[ColumnName.ID.value], config), axis=1) + + lang_code_dataframe[ColumnName.GRADE.value] = lang_code_dataframe.parallel_apply( + lambda row: __get_grade_from_traceback(row[EvaluationArgument.TRACEBACK.value]), axis=1) + + if not config.traceback: + del lang_code_dataframe[EvaluationArgument.TRACEBACK.value] + return lang_code_dataframe + + except ValueError as e: + logger.error(script_structure_rule) + # parallel_apply can raise ValueError but it connected to KeyError: not all columns exist in df + raise KeyError(e) + + except Exception as e: + traceback.print_exc() + logger.exception('An unexpected error.') + raise e + + +def main() -> int: + parser = argparse.ArgumentParser() + configure_arguments(parser) + + try: + start = time.time() + args = parser.parse_args() + config = EvaluationConfig(args) + lang_code_dataframe = get_solutions_df(config.extension, config.solutions_file_path) + results = inspect_solutions_df(config, lang_code_dataframe) + write_df_to_file(results, config.get_output_file_path(), config.extension) + end = time.time() + print(f'All time: {end - start}') + return 0 + + except FileNotFoundError: + logger.error('XLSX-file or CSV-file with the specified name does not exists.') + return 2 + + except KeyError: + logger.error(script_structure_rule) + return 2 + + except Exception: + traceback.print_exc() + logger.exception('An unexpected error.') + return 2 + + +if __name__ == 
'__main__': + sys.exit(main()) diff --git a/src/python/evaluation/inspectors/README.md b/src/python/evaluation/inspectors/README.md new file mode 100644 index 00000000..a0de1314 --- /dev/null +++ b/src/python/evaluation/inspectors/README.md @@ -0,0 +1,235 @@ +# Hyperstyle evaluation: inspectors + +This module allows comparing two different versions of `Hyperstyle` tool. +This module contains _preprocessing_ stage and _analysing_ stage. +`Preprocessing` stage includes: +- [filter_solutions.py](filter_solutions.py) script, that allows keeping only necessary languages in + the `csv` or `xslx` file with student solutions and drop duplicates of code fragments (optional); +- [distribute_grades.py](distribute_grades.py) allows distributing calculated grades and traceback + for unique solutions into all solutions. + +`Analysing` stage includes: +- [diffs_between_df.py](diffs_between_df.py) allows finding a difference between + old and new grades and collect issues that were found in new data +- [print_inspectors_statistics.py](print_inspectors_statistics.py) allows printing statistics + that were found by [diffs_between_df.py](diffs_between_df.py) +- [get_worse_public_examples.py](get_worse_public_examples.py) allows getting + top N worse public examples from a dataset. The measure is to count unique new inspections. + +___ + +## Preprocessing + +### Filter solutions + +[filter_solutions.py](filter_solutions.py) script allows keeping only necessary languages in + the `csv` or `xslx` file with student solutions and drop duplicates of code fragments (optional). + +Please, note that your input file must meet the requirements to [evaluation](./../evaluation_run_tool.py) tool. +You can find all requirements in the evaluation [README](./../README.md) file. + +Output file is a new `xlsx` or `csv` (the same format with the input file) file with the all columns +from the input file. + +#### Usage + +Run the [filter_solutions.py](filter_solutions.py) with the arguments from command line. + +Required arguments: + +`solutions_file_path` — path to xlsx-file or csv-file with code samples. + +Optional arguments: +Argument | Description +--- | --- +|**‑l**, **‑‑languages**| Set of languages to keep in the dataset. Available values: `java7`, `java8`, `java9` `java11`, `python3`, `kotlin`. The default value is set of all languages.| +|**‑‑duplicates**| If True, drop duplicates in the "code" column. By default is disabled.| + +The resulting file will be stored in the same folder as the input file. + +___ + +### Distribute grades + +[distribute_grades.py](distribute_grades.py) allows distributing calculated grades and traceback + for unique solutions into all solutions. + +Please, note that your input file with all code fragments should consist of at least 1 obligatory columns: + +- `code`. + +Please, note that your input file with unique code fragments should consist of at least 2 obligatory columns: + +- `code`, +- `grade`, +- `traceback` (optional), + +and must have all fragments from the input file with all code fragments. + +Output file is a new `xlsx` or `csv` (the same format with the input files) file with the all columns +from the input file with unique solutions. + +#### Usage + +Run the [distribute_grades.py](distribute_grades.py) with the arguments from command line. 
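+
+For example, a minimal invocation sketch could look like the following (the file names are hypothetical; the required positional arguments are described below):
+
+```bash
+# hypothetical input files; adjust the paths to your dataset
+python src/python/evaluation/inspectors/distribute_grades.py solutions_all.csv solutions_uniq.csv
+```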
+ +Required arguments: + +- `solutions_file_path_all` — path to xlsx-file or csv-file with all code samples, +- `solutions_file_path_uniq` — path to xlsx-file or csv-file with unique code samples, + +The resulting file will be stored in the same folder as the input file with all samples. + +___ + +## Analysing + +### Find diffs + +[diffs_between_df.py](diffs_between_df.py) allows finding a difference between + old and new grades and collect issues that were found in new data. + +Please, note that your input files should consist of at least 3 obligatory columns: + +- `id`, +- `grade`, +- `traceback`. + +Output file is a `pickle` file with serialized dictionary with the result. + + +#### Usage + +Run the [diffs_between_df.py](diffs_between_df.py) with the arguments from command line. + +Required arguments: + +- `solutions_file_path_old` — path to xlsx-file or csv-file with code samples that was graded by the old version of the tool, +- `solutions_file_path_new` — path to xlsx-file or csv-file with code samples that was graded by the new version of the tool. + +The resulting file will be stored in the same folder as the `solutions_file_path_old` input file. + +An example of the pickle` file is: + +```json +{ + grade: [2, 3], + traceback: { + 1: { + BaseIssue( + origin_class='C0305', + description='Trailing newlines', + line_no=15, + column_no=1, + type=IssueType('CODE_STYLE'), + + file_path=Path(), + inspector_type=InspectorType.UNDEFINED, + ), BaseIssue( + origin_class='E211', + description='whitespace before \'(\'', + line_no=1, + column_no=6, + type=IssueType('CODE_STYLE'), + + file_path=Path(), + inspector_type=InspectorType.UNDEFINED, + ), + } + }, +} +``` +In the `grade` field are stored fragments ids for which grade was increased in the new data. +In the `traceback` field for fragments ids are stored set of issues. These issues were found in the new data and were not found in the old data. + +___ + +### Print statistics + +[print_inspectors_statistics.py](print_inspectors_statistics.py) allows print statistics + that were calculated by [diffs_between_df.py](diffs_between_df.py) + +#### Usage + +Run the [print_inspectors_statistics.py](print_inspectors_statistics.py) with the arguments from command line. + +Required arguments: + +- `diffs_file_path` — path to a `pickle` file, that was calculated by [diffs_between_df.py](diffs_between_df.py). + +Optional arguments: +Argument | Description +--- | --- +|**‑‑categorize**| If True, statistics will be categorized by several categories. By default is disabled.| +|**‑n**, **‑‑top_n**| The top N items will be printed. Default value is 10.| +|**‑‑full_stat**| If True, full statistics (with all issues) will be printed. By default is disabled.| + +The statistics will be printed into console. + +The output contains: +- was found incorrect grades or not; +- how many fragments has additional issues; +- how many unique issues was found; +- top N issues in the format: (issue_key, frequency); +- short categorized statistics: for each category how many issues were found and how many + fragments have these issues; +- \[Optional\] full categorized statistics: for each category for each issue how many + fragments have this issue + +An example of the printed statistics (without full categorized statistics): + +```json +SUCCESS! Was not found incorrect grades. 
+______ +39830 fragments has additional issues +139 unique issues was found +______ +Top 10 issues: +SC200: 64435 times +WPS432: 17477 times +WPS221: 10618 times +WPS336: 4965 times +H601: 3826 times +SC100: 2719 times +WPS319: 2655 times +WPS317: 2575 times +WPS515: 1783 times +WPS503: 1611 times +______ +CODE_STYLE: 28 issues, 26171 fragments +BEST_PRACTICES: 76 issues, 88040 fragments +ERROR_PRONE: 17 issues, 2363 fragments +COMPLEXITY: 17 issues, 13928 fragments +COHESION: 1 issues, 3826 fragments +______ +``` + +--- + +### Get worse public examples + +[get_worse_public_examples.py](get_worse_public_examples.py) allows getting + top N worse public examples from a dataset. The measure is to count unique new inspections. + +#### Usage + +Run the [get_worse_public_examples.py](get_worse_public_examples.py) with the arguments from command line. + +Required arguments: + +- `solutions_file_path` — path to xlsx-file or csv-file with graded code samples; +- `diffs_file_path` — path to a `pickle` file, that was calculated by [diffs_between_df.py](diffs_between_df.py). + +Please, note that your `solutions_file_path` file with code fragments should consist of at least 2 obligatory columns: + +- `code`, +- `traceback`, +- `is_public`, +- `id`. + +Optional arguments: +Argument | Description +--- | --- +|**‑n**, **‑‑n**| The N worse fragments will be saved.| + +The resulting file will be stored in the same folder as the `solutions_file_path` input file. diff --git a/test/resources/evaluation/xlsx_files/__init__.py b/src/python/evaluation/inspectors/__init__.py similarity index 100% rename from test/resources/evaluation/xlsx_files/__init__.py rename to src/python/evaluation/inspectors/__init__.py diff --git a/test/resources/evaluation/xlsx_target_files/__init__.py b/src/python/evaluation/inspectors/common/__init__.py similarity index 100% rename from test/resources/evaluation/xlsx_target_files/__init__.py rename to src/python/evaluation/inspectors/common/__init__.py diff --git a/src/python/evaluation/inspectors/common/statistics.py b/src/python/evaluation/inspectors/common/statistics.py new file mode 100644 index 00000000..a36cefb7 --- /dev/null +++ b/src/python/evaluation/inspectors/common/statistics.py @@ -0,0 +1,56 @@ +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, List, Tuple + +from src.python.review.inspectors.issue import IssueType, ShortIssue + + +@dataclass(frozen=True) +class IssuesStatistics: + stat: Dict[ShortIssue, int] + changed_grades_count: int + + def print_full_statistics(self, to_categorize: bool = True): + if to_categorize: + categorized_statistics: Dict[IssueType, Dict[ShortIssue, int]] = self.get_categorized_statistics() + for category, issues in categorized_statistics.items(): + print(f'{category.value} issues:') + self.__print_stat(issues) + else: + self.__print_stat(self.stat) + + @classmethod + def __print_stat(cls, stat: Dict[ShortIssue, int]): + for issue, freq in stat.items(): + cls.print_issue_with_freq(issue, freq, prefix='- ') + + @classmethod + def print_issue_with_freq(cls, issue: ShortIssue, freq: int, prefix: str = '', suffix: str = '') -> None: + print(f'{prefix}{issue.origin_class}: {freq} times{suffix}') + + def get_categorized_statistics(self) -> Dict[IssueType, Dict[ShortIssue, int]]: + categorized_stat: Dict[IssueType, Dict[ShortIssue, int]] = defaultdict(dict) + for issue, freq in self.stat.items(): + categorized_stat[issue.type][issue] = freq + return categorized_stat + + # Get statistics for each IssueType: 
count unique issues, count fragments with these issues + def get_short_categorized_statistics(self) -> Dict[IssueType, Tuple[int, int]]: + categorized_statistics: Dict[IssueType, Dict[ShortIssue, int]] = self.get_categorized_statistics() + short_categorized_statistics = defaultdict(tuple) + for issue_type, stat in categorized_statistics.items(): + unique_issues = len(stat) + fragments = sum(stat.values()) + short_categorized_statistics[issue_type] = (unique_issues, fragments) + return short_categorized_statistics + + def print_short_categorized_statistics(self) -> None: + short_categorized_statistics = self.get_short_categorized_statistics() + for category, stat in short_categorized_statistics.items(): + print(f'{category.value}: {stat[0]} issues, {stat[1]} fragments') + + def get_top_n_issues(self, n: int) -> List[ShortIssue]: + return sorted(self.stat.items(), key=lambda t: t[1], reverse=True)[:n] + + def count_unique_issues(self) -> int: + return len(self.stat) diff --git a/src/python/evaluation/inspectors/diffs_between_df.py b/src/python/evaluation/inspectors/diffs_between_df.py new file mode 100644 index 00000000..c747175f --- /dev/null +++ b/src/python/evaluation/inspectors/diffs_between_df.py @@ -0,0 +1,84 @@ +import argparse +from pathlib import Path + +import pandas as pd +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.common.pandas_util import ( + get_inconsistent_positions, get_issues_by_row, get_solutions_df, get_solutions_df_by_file_path, +) +from src.python.evaluation.common.util import ColumnName, EvaluationArgument +from src.python.review.common.file_system import ( + Extension, get_parent_folder, get_restricted_extension, serialize_data_and_write_to_file, +) +from src.python.review.quality.model import QualityType + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(f'{RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name}_old', + type=lambda value: Path(value).absolute(), + help=f'{RunToolArgument.SOLUTIONS_FILE_PATH.value.description}' + f'\nAll code fragments from this file must be graded ' + f'(file contains grade and traceback (optional) columns)') + + parser.add_argument(f'{RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name}_new', + type=lambda value: Path(value).absolute(), + help=f'{RunToolArgument.SOLUTIONS_FILE_PATH.value.description}' + f'\nAll code fragments from this file must be graded ' + f'(file contains grade and traceback (optional) columns)') + + +# Find difference between two dataframes. 
Return dict: +# { +# grade: [list_of_fragment_ids], +# traceback: { +# fragment_id: [list of issues] +# }, +# } +# The key contains only fragments that increase quality in new df +# The key contains list of new issues for each fragment +def find_diffs(old_df: pd.DataFrame, new_df: pd.DataFrame) -> dict: + inconsistent_positions = get_inconsistent_positions(old_df, new_df) + diffs = { + ColumnName.GRADE.value: [], + EvaluationArgument.TRACEBACK.value: {}, + } + # Keep only diffs in the TRACEBACK column + for row, _ in filter(lambda t: t[1] == EvaluationArgument.TRACEBACK.value, inconsistent_positions.index): + old_value = old_df.iloc[row][ColumnName.GRADE.value] + new_value = new_df.iloc[row][ColumnName.GRADE.value] + old_quality = QualityType(old_value).to_number() + new_quality = QualityType(new_value).to_number() + fragment_id = old_df.iloc[row][ColumnName.ID.value] + if new_quality > old_quality: + # It is an unexpected keys, we should check the algorithm + diffs[ColumnName.GRADE.value].append(fragment_id) + else: + # Find difference between issues + old_issues = get_issues_by_row(old_df, row) + new_issues = get_issues_by_row(new_df, row) + if len(old_issues) > len(new_issues): + raise ValueError(f'New dataframe contains less issues than old for fragment {id}') + difference = set(set(new_issues) - set(old_issues)) + diffs[EvaluationArgument.TRACEBACK.value][fragment_id] = difference + return diffs + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + old_solutions_file_path = args.solutions_file_path_old + output_ext = get_restricted_extension(old_solutions_file_path, [Extension.XLSX, Extension.CSV]) + old_solutions_df = get_solutions_df(output_ext, old_solutions_file_path) + + new_solutions_file_path = args.solutions_file_path_new + new_solutions_df = get_solutions_df_by_file_path(new_solutions_file_path) + + diffs = find_diffs(old_solutions_df, new_solutions_df) + output_path = get_parent_folder(Path(old_solutions_file_path)) / f'diffs{Extension.PICKLE.value}' + serialize_data_and_write_to_file(output_path, diffs) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/inspectors/distribute_grades.py b/src/python/evaluation/inspectors/distribute_grades.py new file mode 100644 index 00000000..e9d3e3ad --- /dev/null +++ b/src/python/evaluation/inspectors/distribute_grades.py @@ -0,0 +1,66 @@ +import argparse +from pathlib import Path +from typing import Dict, Optional, Tuple + +import pandas as pd +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.common.pandas_util import get_solutions_df, get_solutions_df_by_file_path, write_df_to_file +from src.python.evaluation.common.util import ColumnName, EvaluationArgument +from src.python.review.common.file_system import Extension, get_parent_folder, get_restricted_extension + +CodeToGradesDict = Dict[str, Tuple[str, Optional[str]]] + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(f'{RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name}_all', + type=lambda value: Path(value).absolute(), + help=f'{RunToolArgument.SOLUTIONS_FILE_PATH.value.description}' + f'\nAll code fragments from this file must be in the uniq file') + + parser.add_argument(f'{RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name}_uniq', + type=lambda value: Path(value).absolute(), + help=f'{RunToolArgument.SOLUTIONS_FILE_PATH.value.description}' + f'\nAll code fragments from this file must be graded ' + 
f'(file contains grade and traceback (optional) columns)') + + +def __add_grade(code_to_grades_dict: CodeToGradesDict, code: str, grade: str, traceback: Optional[str]) -> None: + code_to_grades_dict[code] = (grade, traceback) + + +# Return a dictionary that contains code fragments +# with their grades and traceback (optional, can be None) +def get_code_to_grades_dict(df: pd.DataFrame) -> CodeToGradesDict: + code_to_grades_dict: CodeToGradesDict = {} + df.apply(lambda row: __add_grade(code_to_grades_dict, + row[ColumnName.CODE.value], + row[ColumnName.GRADE.value], + row[EvaluationArgument.TRACEBACK.value]), axis=1) + return code_to_grades_dict + + +def fill_all_solutions_df(all_solutions_df: pd.DataFrame, code_to_grades_dict: CodeToGradesDict) -> pd.DataFrame: + all_solutions_df[ColumnName.GRADE.value], all_solutions_df[EvaluationArgument.TRACEBACK.value] = zip( + *all_solutions_df[ColumnName.CODE.value].map(lambda code: code_to_grades_dict[code])) + return all_solutions_df + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + all_solutions_file_path = args.solutions_file_path_all + output_ext = get_restricted_extension(all_solutions_file_path, [Extension.XLSX, Extension.CSV]) + all_solutions_df = get_solutions_df(output_ext, all_solutions_file_path) + uniq_solutions_df = get_solutions_df_by_file_path(args.solutions_file_path_uniq) + + code_to_grades_dict = get_code_to_grades_dict(uniq_solutions_df) + all_solutions_df = fill_all_solutions_df(all_solutions_df, code_to_grades_dict) + + output_path = get_parent_folder(Path(all_solutions_file_path)) + write_df_to_file(all_solutions_df, output_path / f'evaluation_result_all{output_ext.value}', output_ext) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/inspectors/filter_issues.py b/src/python/evaluation/inspectors/filter_issues.py new file mode 100644 index 00000000..ca4b38b6 --- /dev/null +++ b/src/python/evaluation/inspectors/filter_issues.py @@ -0,0 +1,70 @@ +import argparse +from pathlib import Path +from typing import List, Set + +import pandas as pd +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.common.pandas_util import get_issues_from_json, get_solutions_df_by_file_path +from src.python.evaluation.common.util import ColumnName, EvaluationArgument +from src.python.review.common.file_system import Extension, get_parent_folder, serialize_data_and_write_to_file +from src.python.review.inspectors.issue import BaseIssue + + +TRACEBACK = EvaluationArgument.TRACEBACK.value +ID = ColumnName.ID.value +GRADE = ColumnName.GRADE.value + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=f'{RunToolArgument.SOLUTIONS_FILE_PATH.value.description}' + f'\nAll code fragments from this file must be graded ') + + parser.add_argument('-i', '--issues', + help='Set of issues', + default='') + + +def __parse_issues_arg(str_issues: str) -> Set[str]: + return set(str_issues.split(',')) + + +def __get_new_issues(traceback: str, new_issues_classes: Set[str]) -> List[BaseIssue]: + all_issues = get_issues_from_json(traceback) + return list(filter(lambda i: i.origin_class in new_issues_classes, all_issues)) + + +def __add_issues_for_fragment(fragment_id: int, new_issues: List[BaseIssue], diffs: dict) -> None: + if len(new_issues) > 0: + diffs[TRACEBACK][fragment_id] = new_issues + + +# 
Make a dict with the same structure as in the find_diffs function from diffs_between_df.py +def get_statistics_dict(solutions_df: pd.DataFrame, new_issues_classes: Set[str]) -> dict: + diffs = { + GRADE: [], + TRACEBACK: {}, + } + solutions_df.apply(lambda row: __add_issues_for_fragment(row[ID], + __get_new_issues(row[TRACEBACK], new_issues_classes), + diffs), axis=1) + return diffs + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + solutions_file_path = args.solutions_file_path + solutions_df = get_solutions_df_by_file_path(solutions_file_path) + issues = __parse_issues_arg(args.issues) + + diffs = get_statistics_dict(solutions_df, issues) + output_path = get_parent_folder(Path(solutions_file_path)) / f'diffs{Extension.PICKLE.value}' + serialize_data_and_write_to_file(output_path, diffs) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/inspectors/filter_solutions.py b/src/python/evaluation/inspectors/filter_solutions.py new file mode 100644 index 00000000..99d3ac89 --- /dev/null +++ b/src/python/evaluation/inspectors/filter_solutions.py @@ -0,0 +1,60 @@ +import argparse +import logging +from pathlib import Path +from typing import Set + +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.common.pandas_util import ( + drop_duplicates, + filter_df_by_language, + get_solutions_df, + write_df_to_file, +) +from src.python.review.application_config import LanguageVersion +from src.python.review.common.file_system import Extension, get_parent_folder, get_restricted_extension + +logger = logging.getLogger(__name__) + + +def parse_languages(value: str) -> Set[LanguageVersion]: + passed_names = value.lower().split(',') + allowed_names = {lang.value for lang in LanguageVersion} + if not all(name in allowed_names for name in passed_names): + raise argparse.ArgumentError('--languages', 'Incorrect --languages\' names') + + return {LanguageVersion(name) for name in passed_names} + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.SOLUTIONS_FILE_PATH.value.description) + + parser.add_argument('-l', '--languages', + help='Set of languages to keep in the dataset', + type=parse_languages, + default=set(LanguageVersion)) + + parser.add_argument('--duplicates', + help='If True, drop duplicates in the "code" column.', + action='store_true') + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + solutions_file_path = args.solutions_file_path + ext = get_restricted_extension(solutions_file_path, [Extension.XLSX, Extension.CSV]) + solutions_df = get_solutions_df(ext, solutions_file_path) + + filtered_df = filter_df_by_language(solutions_df, args.languages) + if args.duplicates: + filtered_df = drop_duplicates(filtered_df) + output_path = get_parent_folder(Path(solutions_file_path)) + write_df_to_file(filtered_df, output_path / f'filtered_solutions{ext.value}', ext) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/inspectors/get_worse_public_examples.py b/src/python/evaluation/inspectors/get_worse_public_examples.py new file mode 100644 index 00000000..1bb036c5 --- /dev/null +++ b/src/python/evaluation/inspectors/get_worse_public_examples.py @@ -0,0 +1,68 @@ +import argparse +from pathlib import Path +from typing import 
Dict, List + +import pandas as pd +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.common.csv_util import write_dataframe_to_csv +from src.python.evaluation.common.pandas_util import filter_df_by_condition, get_solutions_df_by_file_path +from src.python.evaluation.common.util import ColumnName, EvaluationArgument +from src.python.review.common.file_system import deserialize_data_from_file, Extension, get_parent_folder +from src.python.review.inspectors.issue import BaseIssue + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.SOLUTIONS_FILE_PATH.value.description) + + parser.add_argument(RunToolArgument.DIFFS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.DIFFS_FILE_PATH.value.description) + + parser.add_argument('-n', '--n', + help='The N worse fragments will be saved', + type=int, + default=10) + + +def __get_new_inspections(fragment_id_to_issues: Dict[int, List[BaseIssue]], fragment_id: int) -> str: + return ','.join(set(map(lambda i: i.origin_class, fragment_id_to_issues.get(fragment_id, [])))) + + +def __get_public_fragments(solutions_df: pd.DataFrame, diffs_dict: dict) -> pd.DataFrame: + # Keep only public solutions + public_fragments = filter_df_by_condition(solutions_df, ColumnName.IS_PUBLIC.value, 'YES') + count_inspections_column = 'count_inspections' + new_inspections_column = 'new_inspections' + + # Get only new inspections and count them + fragment_id_to_issues = diffs_dict[EvaluationArgument.TRACEBACK.value] + public_fragments[new_inspections_column] = public_fragments.apply( + lambda row: __get_new_inspections(fragment_id_to_issues, row[ColumnName.ID.value]), axis=1) + public_fragments[count_inspections_column] = public_fragments.apply( + lambda row: len(row[new_inspections_column].split(',')), axis=1) + + public_fragments = public_fragments.sort_values(count_inspections_column, ascending=False) + # Keep only public columns + return public_fragments[[ColumnName.CODE.value, EvaluationArgument.TRACEBACK.value, new_inspections_column]] + + +# TODO: add readme +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + solutions_file_path = args.solutions_file_path + solutions_df = get_solutions_df_by_file_path(solutions_file_path) + diffs = deserialize_data_from_file(args.diffs_file_path) + + public_fragments = __get_public_fragments(solutions_df, diffs) + + output_path = get_parent_folder(Path(solutions_file_path)) / f'worse_fragments{Extension.CSV.value}' + write_dataframe_to_csv(output_path, public_fragments.head(args.n)) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/inspectors/print_inspectors_statistics.py b/src/python/evaluation/inspectors/print_inspectors_statistics.py new file mode 100644 index 00000000..8b132a31 --- /dev/null +++ b/src/python/evaluation/inspectors/print_inspectors_statistics.py @@ -0,0 +1,84 @@ +import argparse +from collections import defaultdict +from pathlib import Path +from typing import Dict + +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.common.util import ColumnName, EvaluationArgument +from src.python.evaluation.inspectors.common.statistics import IssuesStatistics +from src.python.review.common.file_system import deserialize_data_from_file +from 
src.python.review.inspectors.issue import ShortIssue + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(RunToolArgument.DIFFS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.DIFFS_FILE_PATH.value.description) + + parser.add_argument('--categorize', + help='If True, statistics will be categorized by several categories.', + action='store_true') + + parser.add_argument('-n', '--top_n', + help='The top N items will be printed', + type=int, + default=10) + + parser.add_argument('--full_stat', + help='If True, full statistics will be printed.', + action='store_true') + + +def has_incorrect_grades(diffs_dict: dict) -> bool: + return len(diffs_dict[ColumnName.GRADE.value]) > 0 + + +def gather_statistics(diffs_dict: dict) -> IssuesStatistics: + changed_grades_count = len(diffs_dict[EvaluationArgument.TRACEBACK.value]) + issues_dict: Dict[ShortIssue, int] = defaultdict(int) + for _, issues in diffs_dict[EvaluationArgument.TRACEBACK.value].items(): + for issue in issues: + short_issue = ShortIssue(origin_class=issue.origin_class, type=issue.type) + issues_dict[short_issue] += 1 + return IssuesStatistics(issues_dict, changed_grades_count) + + +def __print_top_n(statistics: IssuesStatistics, n: int, separator: str) -> None: + top_n = statistics.get_top_n_issues(n) + print(separator) + print(f'Top {n} issues:') + for issue, freq in top_n: + IssuesStatistics.print_issue_with_freq(issue, freq) + print(separator) + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + separator = '______' + + diffs = deserialize_data_from_file(args.diffs_file_path) + if has_incorrect_grades(diffs): + print(f'WARNING! Was found incorrect grades in the following fragments: {diffs[ColumnName.GRADE.value]}.') + else: + print('SUCCESS! 
Was not found incorrect grades.') + print(separator) + + statistics = gather_statistics(diffs) + print(f'{statistics.changed_grades_count} fragments has additional issues') + print(f'{statistics.count_unique_issues()} unique issues was found') + + n = args.top_n + __print_top_n(statistics, n, separator) + + statistics.print_short_categorized_statistics() + print(separator) + + if args.full_stat: + statistics.print_full_statistics() + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/xlsx_run_tool.py b/src/python/evaluation/xlsx_run_tool.py deleted file mode 100644 index b2200f19..00000000 --- a/src/python/evaluation/xlsx_run_tool.py +++ /dev/null @@ -1,172 +0,0 @@ -import argparse -import logging.config -import os -import re -import sys -import traceback -from pathlib import Path -from typing import Type - -sys.path.append('') -sys.path.append('../../..') - -import pandas as pd -from src.python.common.tool_arguments import RunToolArgument -from src.python.evaluation.common.util import ColumnName, EvaluationArgument, script_structure_rule -from src.python.evaluation.common.xlsx_util import ( - create_and_get_workbook_path, - remove_sheet, - write_dataframe_to_xlsx_sheet, -) -from src.python.evaluation.evaluation_config import EvaluationConfig -from src.python.review.application_config import LanguageVersion -from src.python.review.common.file_system import create_file -from src.python.review.common.subprocess_runner import run_in_subprocess -from src.python.review.reviewers.perform_review import OutputFormat - -logger = logging.getLogger(__name__) - - -def configure_arguments(parser: argparse.ArgumentParser, run_tool_arguments: Type[RunToolArgument]) -> None: - parser.add_argument('xlsx_file_path', - type=lambda value: Path(value).absolute(), - help='Local XLSX-file path. ' - 'Your XLSX-file must include column-names: ' - f'"{ColumnName.CODE.value}" and ' - f'"{ColumnName.LANG.value}". Acceptable values for ' - f'"{ColumnName.LANG.value}" column are: ' - f'{LanguageVersion.PYTHON_3.value}, {LanguageVersion.JAVA_8.value}, ' - f'{LanguageVersion.JAVA_11.value}, {LanguageVersion.KOTLIN.value}.') - - parser.add_argument('-tp', '--tool-path', - default=Path('src/python/review/run_tool.py').absolute(), - type=lambda value: Path(value).absolute(), - help='Path to script to run on files.') - - parser.add_argument('-tr', '--traceback', - help='If True, column with the full inspector feedback will be added ' - 'to the output file with results.', - action='store_true') - - parser.add_argument('-ofp', '--output-folder-path', - help='An absolute path to the folder where file with evaluation results' - 'will be stored.' - 'Default is the path to a directory, where is the folder with xlsx_file.', - # if None default path will be specified based on xlsx_file_path. - default=None, - type=str) - - parser.add_argument('-ofn', '--output-file-name', - help='Filename for that will be created to store inspection results.' 
- f'Default is "{EvaluationArgument.RESULT_FILE_NAME_EXT.value}"', - default=f'{EvaluationArgument.RESULT_FILE_NAME_EXT.value}', - type=str) - - parser.add_argument(run_tool_arguments.FORMAT.value.short_name, - run_tool_arguments.FORMAT.value.long_name, - default=OutputFormat.JSON.value, - choices=OutputFormat.values(), - type=str, - help=f'{run_tool_arguments.FORMAT.value.description}' - f'Use this argument when {EvaluationArgument.TRACEBACK.value} argument' - 'is enabled argument will not be used otherwise.') - - -def get_language(lang_key: str) -> LanguageVersion: - try: - return LanguageVersion(lang_key) - except ValueError as e: - logger.error(script_structure_rule) - # We should raise KeyError since it is incorrect value for key in a column - raise KeyError(e) - - -def create_dataframe(config: EvaluationConfig) -> pd.DataFrame: - report = pd.DataFrame( - { - ColumnName.LANGUAGE.value: [], - ColumnName.CODE.value: [], - ColumnName.GRADE.value: [], - }, - ) - - if config.traceback: - report[EvaluationArgument.TRACEBACK.value] = [] - - try: - lang_code_dataframe = pd.read_excel(config.xlsx_file_path) - - except FileNotFoundError as e: - logger.error('XLSX-file with the specified name does not exists.') - raise e - - try: - for lang, code in zip(lang_code_dataframe[ColumnName.LANG.value], - lang_code_dataframe[ColumnName.CODE.value]): - - # Tool does not work correctly with tmp files from module on macOS - # thus we create a real file in the file system - extension = get_language(lang).extension_by_language().value - tmp_file_path = config.xlsx_file_path.parent.absolute() / f'inspected_code{extension}' - temp_file = next(create_file(tmp_file_path, code)) - - command = config.build_command(temp_file, lang) - results = run_in_subprocess(command) - os.remove(temp_file) - - # this regular expression matches final tool grade: EXCELLENT, GOOD, MODERATE or BAD - grades = re.match(r'^.*{"code":\s"([A-Z]+)"', results).group(1) - output_row_values = [lang, code, grades] - column_indices = [ColumnName.LANGUAGE.value, - ColumnName.CODE.value, - ColumnName.GRADE.value] - - if config.traceback: - output_row_values.append(results) - column_indices.append(EvaluationArgument.TRACEBACK.value) - - new_file_report_row = pd.Series(data=output_row_values, index=column_indices) - report = report.append(new_file_report_row, ignore_index=True) - - return report - - except KeyError as e: - logger.error(script_structure_rule) - raise e - - except Exception as e: - traceback.print_exc() - logger.exception('An unexpected error.') - raise e - - -def main() -> int: - parser = argparse.ArgumentParser() - configure_arguments(parser, RunToolArgument) - - try: - args = parser.parse_args() - config = EvaluationConfig(args) - workbook_path = create_and_get_workbook_path(config) - results = create_dataframe(config) - write_dataframe_to_xlsx_sheet(workbook_path, results, 'inspection_results') - # remove empty sheet that was initially created with the workbook - remove_sheet(workbook_path, 'Sheet') - return 0 - - except FileNotFoundError: - logger.error('XLSX-file with the specified name does not exists.') - return 2 - - except KeyError: - logger.error(script_structure_rule) - return 2 - - except Exception: - traceback.print_exc() - logger.exception('An unexpected error.') - return 2 - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/python/review/common/file_system.py b/src/python/review/common/file_system.py index d0b3dca0..eb5bc768 100644 --- a/src/python/review/common/file_system.py +++ 
b/src/python/review/common/file_system.py @@ -1,10 +1,12 @@ import linecache import os +import pickle +import re import tempfile from contextlib import contextmanager from enum import Enum, unique from pathlib import Path -from typing import Callable, List, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple, Union @unique @@ -30,6 +32,14 @@ class Extension(Enum): JS = '.js' KTS = '.kts' XLSX = '.xlsx' + CSV = '.csv' + PICKLE = '.pickle' + + # Not empty extensions are returned with a dot, for example, '.txt' + # If file has no extensions, an empty one ('') is returned + @classmethod + def get_extension_from_file(cls, file: str) -> 'Extension': + return Extension(os.path.splitext(file)[1]) ItemCondition = Callable[[str], bool] @@ -54,6 +64,48 @@ def get_all_file_system_items(root: Path, item_condition: ItemCondition = all_it return items +def match_condition(regex: str) -> ItemCondition: + def does_name_match(name: str) -> bool: + return re.fullmatch(regex, name) is not None + return does_name_match + + +def serialize_data_and_write_to_file(path: Path, data: Any) -> None: + create_directory(get_parent_folder(path)) + with open(path, 'wb') as f: + p = pickle.Pickler(f) + p.dump(data) + + +def deserialize_data_from_file(path: Path) -> Any: + with open(path, 'rb') as f: + u = pickle.Unpickler(f) + return u.load() + + +# For getting name of the last folder or file +# For example, returns 'folder' for both 'path/data/folder' and 'path/data/folder/' +def get_name_from_path(path: str, with_extension: bool = True) -> str: + head, tail = os.path.split(path) + # Tail can be empty if '/' is at the end of the path + file_name = tail or os.path.basename(head) + if not with_extension: + file_name = os.path.splitext(file_name)[0] + elif get_extension_from_file(Path(file_name)) == Extension.EMPTY: + raise ValueError('Cannot get file name with extension, because the passed path does not contain it') + return file_name + + +def pair_in_and_out_files(in_files: List[Path], out_files: List[Path]) -> List[Tuple[Path, Path]]: + pairs = [] + for in_file in in_files: + out_file = Path(re.sub(r'in(?=[^in]*$)', 'out', str(in_file))) + if out_file not in out_files: + raise ValueError(f'List of out files does not contain a file for {in_file}') + pairs.append((in_file, out_file)) + return pairs + + # TODO: Need testing @contextmanager def new_temp_dir() -> Path: @@ -76,7 +128,7 @@ def create_file(file_path: Union[str, Path], content: str): yield Path(file_path) -def create_directory(directory: str) -> None: +def create_directory(directory: Union[str, Path]) -> None: os.makedirs(directory, exist_ok=True) @@ -98,3 +150,32 @@ def get_content_from_file(file_path: Path, encoding: str = Encoding.ISO_ENCODING # If file has no extensions, an empty one ('') is returned def get_extension_from_file(file: Path) -> Extension: return Extension(os.path.splitext(file)[1]) + + +def get_restricted_extension(file_path: Optional[Union[str, Path]] = None, + available_values: List[Extension] = None) -> Extension: + if file_path is None: + return Extension.EMPTY + ext = Extension.get_extension_from_file(file_path) + if available_values is not None and ext not in available_values: + raise ValueError(f'Invalid extension. 
' + f'Available values are: {list(map(lambda e: e.value, available_values))}.') + return ext + + +def remove_slash(path: str) -> str: + return path.rstrip('/') + + +def add_slash(path: str) -> str: + if not path.endswith('/'): + path += '/' + return path + + +def get_parent_folder(path: Path, to_add_slash: bool = False) -> Path: + path = remove_slash(str(path)) + parent_folder = '/'.join(path.split('/')[:-1]) + if to_add_slash: + parent_folder = add_slash(parent_folder) + return Path(parent_folder) diff --git a/src/python/review/inspectors/inspector_type.py b/src/python/review/inspectors/inspector_type.py index 15482a8b..2d00c0d5 100644 --- a/src/python/review/inspectors/inspector_type.py +++ b/src/python/review/inspectors/inspector_type.py @@ -23,6 +23,8 @@ class InspectorType(Enum): # JavaScript language ESLINT = 'ESLINT' + UNDEFINED = 'UNDEFINED' + @classmethod def available_values(cls) -> List[str]: return [ diff --git a/src/python/review/inspectors/issue.py b/src/python/review/inspectors/issue.py index c910bf80..965f2262 100644 --- a/src/python/review/inspectors/issue.py +++ b/src/python/review/inspectors/issue.py @@ -66,16 +66,21 @@ def get_base_issue_data_dict(cls, @dataclass(frozen=True, eq=True) -class BaseIssue: +class ShortIssue: + origin_class: str + + type: IssueType + + +@dataclass(frozen=True, eq=True) +class BaseIssue(ShortIssue): + description: str + file_path: Path line_no: int column_no: int - description: str - origin_class: str - inspector_type: InspectorType - type: IssueType class Measurable(abc.ABC): diff --git a/src/python/review/reviewers/utils/print_review.py b/src/python/review/reviewers/utils/print_review.py index a5a2f59b..f67db761 100644 --- a/src/python/review/reviewers/utils/print_review.py +++ b/src/python/review/reviewers/utils/print_review.py @@ -1,10 +1,12 @@ import json import linecache +from enum import Enum, unique from pathlib import Path -from typing import Any, Dict +from typing import Any, Dict, List from src.python.review.common.file_system import get_file_line -from src.python.review.inspectors.issue import BaseIssue +from src.python.review.inspectors.inspector_type import InspectorType +from src.python.review.inspectors.issue import BaseIssue, IssueType from src.python.review.reviewers.review_result import ReviewResult @@ -107,15 +109,45 @@ def print_review_result_as_multi_file_json(review_result: ReviewResult) -> None: print(json.dumps(output_json)) +@unique +class IssueJsonFields(Enum): + CODE = 'code' + TEXT = 'text' + LINE = 'line' + LINE_NUMBER = 'line_number' + COLUMN_NUMBER = 'column_number' + CATEGORY = 'category' + INFLUENCE_ON_PENALTY = 'influence_on_penalty' + + def convert_issue_to_json(issue: BaseIssue, influence_on_penalty: int) -> Dict[str, Any]: line_text = get_file_line(issue.file_path, issue.line_no) return { - 'code': issue.origin_class, - 'text': issue.description, - 'line': line_text, - 'line_number': issue.line_no, - 'column_number': issue.column_no, - 'category': issue.type.value, - 'influence_on_penalty': influence_on_penalty, + IssueJsonFields.CODE.value: issue.origin_class, + IssueJsonFields.TEXT.value: issue.description, + IssueJsonFields.LINE.value: line_text, + IssueJsonFields.LINE_NUMBER.value: issue.line_no, + IssueJsonFields.COLUMN_NUMBER.value: issue.column_no, + IssueJsonFields.CATEGORY.value: issue.type.value, + IssueJsonFields.INFLUENCE_ON_PENALTY.value: influence_on_penalty, } + + +# It works only for old json format +def convert_json_to_issues(issues_json: List[dict]) -> List[BaseIssue]: + issues = [] + 
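# Each element of issues_json is expected to follow the old single-file JSON format
# produced by convert_issue_to_json above, for example (illustrative values only):
#   {"code": "E211", "text": "whitespace before '('", "line": "print (...)",
#    "line_number": 1, "column_number": 6, "category": "CODE_STYLE", "influence_on_penalty": 0}
# That format stores neither the file path nor the inspector type, so the placeholders
# Path() and InspectorType.UNDEFINED are substituted when the issues are rebuilt below.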
for issue in issues_json: + issues.append( + BaseIssue( + origin_class=issue[IssueJsonFields.CODE.value], + description=issue[IssueJsonFields.TEXT.value], + line_no=int(issue[IssueJsonFields.LINE_NUMBER.value]), + column_no=int(issue[IssueJsonFields.COLUMN_NUMBER.value]), + type=IssueType(issue[IssueJsonFields.CATEGORY.value]), + + file_path=Path(), + inspector_type=InspectorType.UNDEFINED, + ), + ) + return issues diff --git a/src/python/review/run_tool.py b/src/python/review/run_tool.py index fc74774d..9731e4f0 100644 --- a/src/python/review/run_tool.py +++ b/src/python/review/run_tool.py @@ -1,5 +1,4 @@ import argparse -import enum import logging.config import os import sys @@ -8,7 +7,6 @@ from pathlib import Path from typing import Set - sys.path.append('') sys.path.append('../../..') @@ -43,69 +41,69 @@ def positive_int(value: str) -> int: return value_int -def configure_arguments(parser: argparse.ArgumentParser, tool_arguments: enum.EnumMeta) -> None: - parser.add_argument(tool_arguments.VERBOSITY.value.short_name, - tool_arguments.VERBOSITY.value.long_name, - help=tool_arguments.VERBOSITY.value.description, +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument(RunToolArgument.VERBOSITY.value.short_name, + RunToolArgument.VERBOSITY.value.long_name, + help=RunToolArgument.VERBOSITY.value.description, default=VerbosityLevel.DISABLE.value, choices=VerbosityLevel.values(), type=str) # Usage example: -d Flake8,Intelli - parser.add_argument(tool_arguments.DISABLE.value.short_name, - tool_arguments.DISABLE.value.long_name, - help=tool_arguments.DISABLE.value.description, + parser.add_argument(RunToolArgument.DISABLE.value.short_name, + RunToolArgument.DISABLE.value.long_name, + help=RunToolArgument.DISABLE.value.description, type=parse_disabled_inspectors, default=set()) - parser.add_argument(tool_arguments.DUPLICATES.value.long_name, + parser.add_argument(RunToolArgument.DUPLICATES.value.long_name, action='store_true', - help=tool_arguments.DUPLICATES.value.description) + help=RunToolArgument.DUPLICATES.value.description) # TODO: deprecated argument: language_version. Delete after several releases. parser.add_argument('--language_version', - tool_arguments.LANG_VERSION.value.long_name, - help=tool_arguments.LANG_VERSION.value.description, + RunToolArgument.LANG_VERSION.value.long_name, + help=RunToolArgument.LANG_VERSION.value.description, default=None, choices=LanguageVersion.values(), type=str) # TODO: deprecated argument: --n_cpu. Delete after several releases. 
parser.add_argument('--n_cpu', - tool_arguments.CPU.value.long_name, - help=tool_arguments.CPU.value.description, + RunToolArgument.CPU.value.long_name, + help=RunToolArgument.CPU.value.description, default=1, type=positive_int) - parser.add_argument(tool_arguments.PATH.value.long_name, + parser.add_argument(RunToolArgument.PATH.value.long_name, type=lambda value: Path(value).absolute(), - help=tool_arguments.PATH.value.description) + help=RunToolArgument.PATH.value.description) - parser.add_argument(tool_arguments.FORMAT.value.short_name, - tool_arguments.FORMAT.value.long_name, + parser.add_argument(RunToolArgument.FORMAT.value.short_name, + RunToolArgument.FORMAT.value.long_name, default=OutputFormat.JSON.value, choices=OutputFormat.values(), type=str, - help=tool_arguments.FORMAT.value.description) + help=RunToolArgument.FORMAT.value.description) - parser.add_argument(tool_arguments.START_LINE.value.short_name, - tool_arguments.START_LINE.value.long_name, + parser.add_argument(RunToolArgument.START_LINE.value.short_name, + RunToolArgument.START_LINE.value.long_name, default=1, type=positive_int, - help=tool_arguments.START_LINE.value.description) + help=RunToolArgument.START_LINE.value.description) - parser.add_argument(tool_arguments.END_LINE.value.short_name, - tool_arguments.END_LINE.value.long_name, + parser.add_argument(RunToolArgument.END_LINE.value.short_name, + RunToolArgument.END_LINE.value.long_name, default=None, type=positive_int, - help=tool_arguments.END_LINE.value.description) + help=RunToolArgument.END_LINE.value.description) - parser.add_argument(tool_arguments.NEW_FORMAT.value.long_name, + parser.add_argument(RunToolArgument.NEW_FORMAT.value.long_name, action='store_true', - help=tool_arguments.NEW_FORMAT.value.description) + help=RunToolArgument.NEW_FORMAT.value.description) - parser.add_argument(tool_arguments.HISTORY.value.long_name, - help=tool_arguments.HISTORY.value.description, + parser.add_argument(RunToolArgument.HISTORY.value.long_name, + help=RunToolArgument.HISTORY.value.description, type=str) @@ -124,7 +122,7 @@ def configure_logging(verbosity: VerbosityLevel) -> None: def main() -> int: parser = argparse.ArgumentParser() - configure_arguments(parser, RunToolArgument) + configure_arguments(parser) try: args = parser.parse_args() diff --git a/test/python/common_util.py b/test/python/common_util.py new file mode 100644 index 00000000..4823bec8 --- /dev/null +++ b/test/python/common_util.py @@ -0,0 +1,20 @@ +from pathlib import Path +from typing import List, Tuple + +import pandas as pd +from src.python.review.common.file_system import ( + Extension, get_all_file_system_items, match_condition, pair_in_and_out_files, +) + + +def get_in_and_out_list(root: Path, + in_ext: Extension = Extension.CSV, + out_ext: Extension = Extension.CSV) -> List[Tuple[Path, Path]]: + in_files = get_all_file_system_items(root, match_condition(rf'in_\d+{in_ext.value}')) + out_files = get_all_file_system_items(root, match_condition(rf'out_\d+{out_ext.value}')) + return pair_in_and_out_files(in_files, out_files) + + +def equal_df(expected_df: pd.DataFrame, actual_df: pd.DataFrame) -> bool: + return expected_df.reset_index(drop=True).equals( + actual_df.reset_index(drop=True)) or (expected_df.empty and actual_df.empty) diff --git a/test/python/evaluation/__init__.py b/test/python/evaluation/__init__.py index 31b1b86f..293fdcae 100644 --- a/test/python/evaluation/__init__.py +++ b/test/python/evaluation/__init__.py @@ -9,3 +9,9 @@ TARGET_XLSX_DATA_FOLDER = CURRENT_TEST_DATA_FOLDER / 
'xlsx_target_files' RESULTS_DIR_PATH = MAIN_FOLDER.parent / 'evaluation/results' + +EVALUATION_COMMON_DIR_PATH = CURRENT_TEST_DATA_FOLDER / 'common' + +PANDAS_UTIL_DIR_PATH = EVALUATION_COMMON_DIR_PATH / 'pandas_util' + +INSPECTORS_DIR_PATH = EVALUATION_COMMON_DIR_PATH / 'inspectors' diff --git a/test/python/evaluation/common/__init__.py b/test/python/evaluation/common/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/python/evaluation/common/pandas_util/__init__.py b/test/python/evaluation/common/pandas_util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/python/evaluation/common/pandas_util/test_drop_duplicates.py b/test/python/evaluation/common/pandas_util/test_drop_duplicates.py new file mode 100644 index 00000000..acd47445 --- /dev/null +++ b/test/python/evaluation/common/pandas_util/test_drop_duplicates.py @@ -0,0 +1,18 @@ +from pathlib import Path +from test.python.common_util import equal_df, get_in_and_out_list +from test.python.evaluation import PANDAS_UTIL_DIR_PATH + +import pytest +from src.python.evaluation.common.pandas_util import drop_duplicates, get_solutions_df_by_file_path + +RESOURCES_PATH = PANDAS_UTIL_DIR_PATH / 'drop_duplicates' + +IN_AND_OUT_FILES = get_in_and_out_list(RESOURCES_PATH) + + +@pytest.mark.parametrize(('in_file', 'out_file'), IN_AND_OUT_FILES) +def test(in_file: Path, out_file: Path): + in_df = get_solutions_df_by_file_path(in_file) + out_df = get_solutions_df_by_file_path(out_file) + filtered_df = drop_duplicates(in_df) + assert equal_df(out_df, filtered_df) diff --git a/test/python/evaluation/common/pandas_util/test_filter_by_language.py b/test/python/evaluation/common/pandas_util/test_filter_by_language.py new file mode 100644 index 00000000..25af150d --- /dev/null +++ b/test/python/evaluation/common/pandas_util/test_filter_by_language.py @@ -0,0 +1,29 @@ +from pathlib import Path +from test.python.common_util import equal_df, get_in_and_out_list +from test.python.evaluation import PANDAS_UTIL_DIR_PATH + +import pytest +from src.python.evaluation.common.pandas_util import filter_df_by_language, get_solutions_df_by_file_path +from src.python.review.application_config import LanguageVersion +from src.python.review.common.file_system import get_name_from_path + +RESOURCES_PATH = PANDAS_UTIL_DIR_PATH / 'filter_by_language' + + +IN_FILE_TO_LANGUAGES = { + 'in_1.csv': set(LanguageVersion), + 'in_2.csv': set(), + 'in_3.csv': [LanguageVersion.PYTHON_3], + 'in_4.csv': [LanguageVersion.PYTHON_3, LanguageVersion.PYTHON_3], + 'in_5.csv': [LanguageVersion.PYTHON_3, LanguageVersion.JAVA_11], +} + +IN_AND_OUT_FILES = get_in_and_out_list(RESOURCES_PATH) + + +@pytest.mark.parametrize(('in_file', 'out_file'), IN_AND_OUT_FILES) +def test(in_file: Path, out_file: Path): + in_df = get_solutions_df_by_file_path(in_file) + out_df = get_solutions_df_by_file_path(out_file) + filtered_df = filter_df_by_language(in_df, IN_FILE_TO_LANGUAGES[get_name_from_path(str(in_file))]) + assert equal_df(out_df, filtered_df) diff --git a/test/python/evaluation/inspectors/__init__.py b/test/python/evaluation/inspectors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/python/evaluation/inspectors/diffs_between_df/__init__.py b/test/python/evaluation/inspectors/diffs_between_df/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/python/evaluation/inspectors/diffs_between_df/test_diifs_between_df.py b/test/python/evaluation/inspectors/diffs_between_df/test_diifs_between_df.py new file mode 
100644 index 00000000..86af9105 --- /dev/null +++ b/test/python/evaluation/inspectors/diffs_between_df/test_diifs_between_df.py @@ -0,0 +1,72 @@ +from pathlib import Path +from test.python.evaluation import INSPECTORS_DIR_PATH + +import pytest +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path +from src.python.evaluation.common.util import ColumnName, EvaluationArgument +from src.python.evaluation.inspectors.diffs_between_df import find_diffs +from src.python.review.inspectors.inspector_type import InspectorType +from src.python.review.inspectors.issue import BaseIssue, IssueType + +RESOURCES_PATH = INSPECTORS_DIR_PATH / 'diffs_between_df' + +EMPTY_DIFFS = { + ColumnName.GRADE.value: [], + EvaluationArgument.TRACEBACK.value: {}, +} + +INCORRECT_GRADE_DIFFS = { + ColumnName.GRADE.value: [1, 2], + EvaluationArgument.TRACEBACK.value: {}, +} + +ISSUES = { + BaseIssue( + origin_class='C0305', + description='Trailing newlines', + line_no=15, + column_no=1, + type=IssueType('CODE_STYLE'), + + file_path=Path(), + inspector_type=InspectorType.UNDEFINED, + ), BaseIssue( + origin_class='E211', + description='whitespace before \'(\'', + line_no=1, + column_no=6, + type=IssueType('CODE_STYLE'), + + file_path=Path(), + inspector_type=InspectorType.UNDEFINED, + ), +} + +ISSUES_DIFFS = { + ColumnName.GRADE.value: [], + EvaluationArgument.TRACEBACK.value: { + 1: ISSUES, + }, +} + +MIXED_DIFFS = { + ColumnName.GRADE.value: [2, 3], + EvaluationArgument.TRACEBACK.value: { + 1: ISSUES, + }, +} + +TEST_DATA = [ + ('old_1.csv', 'new_1.csv', EMPTY_DIFFS), + ('old_2.csv', 'new_2.csv', INCORRECT_GRADE_DIFFS), + ('old_3.csv', 'new_3.csv', ISSUES_DIFFS), + ('old_4.csv', 'new_4.csv', MIXED_DIFFS), +] + + +@pytest.mark.parametrize(('old_file', 'new_file', 'diffs'), TEST_DATA) +def test(old_file: Path, new_file: Path, diffs: dict): + old_df = get_solutions_df_by_file_path(RESOURCES_PATH / old_file) + new_df = get_solutions_df_by_file_path(RESOURCES_PATH / new_file) + actual_diffs = find_diffs(old_df, new_df) + assert actual_diffs == diffs diff --git a/test/python/evaluation/test_data_path.py b/test/python/evaluation/test_data_path.py index 0d8e3502..bae12c11 100644 --- a/test/python/evaluation/test_data_path.py +++ b/test/python/evaluation/test_data_path.py @@ -3,12 +3,13 @@ import pytest from src.python.evaluation.evaluation_config import EvaluationConfig -from src.python.evaluation.xlsx_run_tool import create_dataframe +from src.python.evaluation.evaluation_run_tool import get_solutions_df, inspect_solutions_df def test_incorrect_data_path(): with pytest.raises(FileNotFoundError): testing_arguments_dict = get_testing_arguments(to_add_traceback=True, to_add_tool_path=True) - testing_arguments_dict.xlsx_file_path = XLSX_DATA_FOLDER / 'do_not_exist.xlsx' + testing_arguments_dict.solutions_file_path = XLSX_DATA_FOLDER / 'do_not_exist.xlsx' config = EvaluationConfig(testing_arguments_dict) - assert create_dataframe(config) + lang_code_dataframe = get_solutions_df(config.extension, config.solutions_file_path) + assert inspect_solutions_df(config, lang_code_dataframe) diff --git a/test/python/evaluation/test_output_results.py b/test/python/evaluation/test_output_results.py index 519652e5..44508688 100644 --- a/test/python/evaluation/test_output_results.py +++ b/test/python/evaluation/test_output_results.py @@ -1,10 +1,11 @@ +from test.python.common_util import equal_df from test.python.evaluation import TARGET_XLSX_DATA_FOLDER, XLSX_DATA_FOLDER from test.python.evaluation.testing_config 
import get_testing_arguments import pandas as pd import pytest from src.python.evaluation.evaluation_config import EvaluationConfig -from src.python.evaluation.xlsx_run_tool import create_dataframe +from src.python.evaluation.evaluation_run_tool import get_solutions_df, inspect_solutions_df FILE_NAMES = [ ('test_sorted_order.xlsx', 'target_sorted_order.xlsx', False), @@ -18,15 +19,16 @@ def test_correct_output(test_file: str, target_file: str, output_type: bool): testing_arguments_dict = get_testing_arguments(to_add_tool_path=True) - testing_arguments_dict.xlsx_file_path = XLSX_DATA_FOLDER / test_file + testing_arguments_dict.solutions_file_path = XLSX_DATA_FOLDER / test_file testing_arguments_dict.traceback = output_type config = EvaluationConfig(testing_arguments_dict) - test_dataframe = create_dataframe(config) + lang_code_dataframe = get_solutions_df(config.extension, config.solutions_file_path) + test_dataframe = inspect_solutions_df(config, lang_code_dataframe) sheet_name = 'grades' if output_type: sheet_name = 'traceback' target_dataframe = pd.read_excel(TARGET_XLSX_DATA_FOLDER / target_file, sheet_name=sheet_name) - assert test_dataframe.reset_index(drop=True).equals(target_dataframe.reset_index(drop=True)) + assert equal_df(target_dataframe, test_dataframe) diff --git a/test/python/evaluation/test_tool_path.py b/test/python/evaluation/test_tool_path.py index 0581caad..8ee4cd07 100644 --- a/test/python/evaluation/test_tool_path.py +++ b/test/python/evaluation/test_tool_path.py @@ -4,15 +4,16 @@ import pytest from src.python import MAIN_FOLDER from src.python.evaluation.evaluation_config import EvaluationConfig -from src.python.evaluation.xlsx_run_tool import create_dataframe +from src.python.evaluation.evaluation_run_tool import get_solutions_df, inspect_solutions_df def test_correct_tool_path(): try: testing_arguments_dict = get_testing_arguments(to_add_traceback=True, to_add_tool_path=True) - testing_arguments_dict.xlsx_file_path = XLSX_DATA_FOLDER / 'test_unsorted_order.xlsx' + testing_arguments_dict.solutions_file_path = XLSX_DATA_FOLDER / 'test_unsorted_order.xlsx' config = EvaluationConfig(testing_arguments_dict) - create_dataframe(config) + lang_code_dataframe = get_solutions_df(config.extension, config.solutions_file_path) + inspect_solutions_df(config, lang_code_dataframe) except Exception: pytest.fail("Unexpected error") @@ -20,7 +21,8 @@ def test_correct_tool_path(): def test_incorrect_tool_path(): with pytest.raises(Exception): testing_arguments_dict = get_testing_arguments(to_add_traceback=True) - testing_arguments_dict.xlsx_file_path = XLSX_DATA_FOLDER / 'test_unsorted_order.xlsx' + testing_arguments_dict.solutions_file_path = XLSX_DATA_FOLDER / 'test_unsorted_order.xlsx' testing_arguments_dict.tool_path = MAIN_FOLDER.parent / 'review/incorrect_path.py' config = EvaluationConfig(testing_arguments_dict) - assert create_dataframe(config) + lang_code_dataframe = get_solutions_df(config.extension, config.solutions_file_path) + assert inspect_solutions_df(config, lang_code_dataframe) diff --git a/test/python/evaluation/test_xlsx_file_structure.py b/test/python/evaluation/test_xlsx_file_structure.py index 9965992e..f043772d 100644 --- a/test/python/evaluation/test_xlsx_file_structure.py +++ b/test/python/evaluation/test_xlsx_file_structure.py @@ -3,8 +3,7 @@ import pytest from src.python.evaluation.evaluation_config import EvaluationConfig -from src.python.evaluation.xlsx_run_tool import create_dataframe - +from src.python.evaluation.evaluation_run_tool import 
get_solutions_df, inspect_solutions_df FILE_NAMES = [ 'test_wrong_column_name.xlsx', @@ -18,6 +17,7 @@ def test_wrong_column(file_name: str): with pytest.raises(KeyError): testing_arguments_dict = get_testing_arguments(to_add_traceback=True, to_add_tool_path=True) - testing_arguments_dict.xlsx_file_path = XLSX_DATA_FOLDER / file_name + testing_arguments_dict.solutions_file_path = XLSX_DATA_FOLDER / file_name config = EvaluationConfig(testing_arguments_dict) - assert create_dataframe(config) + lang_code_dataframe = get_solutions_df(config.extension, config.solutions_file_path) + assert inspect_solutions_df(config, lang_code_dataframe) diff --git a/test/python/evaluation/testing_config.py b/test/python/evaluation/testing_config.py index 1e8fc5a9..8d144534 100644 --- a/test/python/evaluation/testing_config.py +++ b/test/python/evaluation/testing_config.py @@ -7,7 +7,7 @@ def get_testing_arguments(to_add_traceback=None, to_add_tool_path=None) -> Namespace: testing_arguments = Namespace(format=OutputFormat.JSON.value, - output_file_name=EvaluationArgument.RESULT_FILE_NAME_EXT.value, + output_file_name=EvaluationArgument.RESULT_FILE_NAME_XLSX.value, output_folder_path=None) if to_add_traceback: testing_arguments.traceback = True @@ -15,6 +15,6 @@ def get_testing_arguments(to_add_traceback=None, to_add_tool_path=None) -> Names if to_add_tool_path: testing_arguments.tool_path = MAIN_FOLDER.parent / 'review/run_tool.py' - testing_arguments.xlsx_file_path = None + testing_arguments.solutions_file_path = None return testing_arguments diff --git a/test/resources/evaluation/common/inspectors/diffs_between_df/new_1.csv b/test/resources/evaluation/common/inspectors/diffs_between_df/new_1.csv new file mode 100644 index 00000000..c415b91a --- /dev/null +++ b/test/resources/evaluation/common/inspectors/diffs_between_df/new_1.csv @@ -0,0 +1,10 @@ +id,time,code,lang,grade,traceback +1,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +2,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +3,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" \ No newline at end of file diff --git a/test/resources/evaluation/common/inspectors/diffs_between_df/new_2.csv b/test/resources/evaluation/common/inspectors/diffs_between_df/new_2.csv new file mode 100644 index 00000000..2ccfe8aa --- /dev/null +++ b/test/resources/evaluation/common/inspectors/diffs_between_df/new_2.csv @@ -0,0 +1,10 @@ +id,time,code,lang,grade,traceback +1,1617455906,"print (""Learn Python to be great!"") +",python3,GOOD,"{""quality"": {""code"": ""GOOD"", 
""text"": ""Code quality (beta): GOOD""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +2,1617455906,"print (""Learn Python to be great!"") +",python3,GOOD,"{""quality"": {""code"": ""GOOD"", ""text"": ""Code quality (beta): GOOD""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +3,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" \ No newline at end of file diff --git a/test/resources/evaluation/common/inspectors/diffs_between_df/new_3.csv b/test/resources/evaluation/common/inspectors/diffs_between_df/new_3.csv new file mode 100644 index 00000000..d8b2addc --- /dev/null +++ b/test/resources/evaluation/common/inspectors/diffs_between_df/new_3.csv @@ -0,0 +1,10 @@ +id,time,code,lang,grade,traceback +1,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0},{""code"": ""C0305"", ""text"": ""Trailing newlines"", ""line"": """", ""line_number"": 15, ""column_number"": 1, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0},{""code"": ""E211"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +2,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +3,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" \ No newline at end of file diff --git a/test/resources/evaluation/common/inspectors/diffs_between_df/new_4.csv b/test/resources/evaluation/common/inspectors/diffs_between_df/new_4.csv new file mode 100644 index 00000000..b77ae579 --- /dev/null +++ b/test/resources/evaluation/common/inspectors/diffs_between_df/new_4.csv @@ -0,0 +1,10 @@ +id,time,code,lang,grade,traceback +1,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": 
""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0},{""code"": ""C0305"", ""text"": ""Trailing newlines"", ""line"": """", ""line_number"": 15, ""column_number"": 1, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0},{""code"": ""E211"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +2,1617455906,"print (""Learn Python to be great!"") +",python3,GOOD,"{""quality"": {""code"": ""GOOD"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +3,1617455906,"print (""Learn Python to be great!"") +",python3,GOOD,"{""quality"": {""code"": ""GOOD"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" \ No newline at end of file diff --git a/test/resources/evaluation/common/inspectors/diffs_between_df/old_1.csv b/test/resources/evaluation/common/inspectors/diffs_between_df/old_1.csv new file mode 100644 index 00000000..c415b91a --- /dev/null +++ b/test/resources/evaluation/common/inspectors/diffs_between_df/old_1.csv @@ -0,0 +1,10 @@ +id,time,code,lang,grade,traceback +1,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +2,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +3,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" \ No newline at end of file diff --git a/test/resources/evaluation/common/inspectors/diffs_between_df/old_2.csv b/test/resources/evaluation/common/inspectors/diffs_between_df/old_2.csv new file mode 100644 index 00000000..c415b91a --- /dev/null +++ b/test/resources/evaluation/common/inspectors/diffs_between_df/old_2.csv @@ -0,0 +1,10 @@ +id,time,code,lang,grade,traceback +1,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", 
""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +2,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +3,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" \ No newline at end of file diff --git a/test/resources/evaluation/common/inspectors/diffs_between_df/old_3.csv b/test/resources/evaluation/common/inspectors/diffs_between_df/old_3.csv new file mode 100644 index 00000000..c415b91a --- /dev/null +++ b/test/resources/evaluation/common/inspectors/diffs_between_df/old_3.csv @@ -0,0 +1,10 @@ +id,time,code,lang,grade,traceback +1,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +2,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +3,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" \ No newline at end of file diff --git a/test/resources/evaluation/common/inspectors/diffs_between_df/old_4.csv b/test/resources/evaluation/common/inspectors/diffs_between_df/old_4.csv new file mode 100644 index 00000000..c415b91a --- /dev/null +++ b/test/resources/evaluation/common/inspectors/diffs_between_df/old_4.csv @@ -0,0 +1,10 @@ +id,time,code,lang,grade,traceback +1,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +2,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": 
""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" +3,1617455906,"print (""Learn Python to be great!"") +",python3,MODERATE,"{""quality"": {""code"": ""MODERATE"", ""text"": ""Code quality (beta): MODERATE""}, ""issues"": [{""code"": ""E215"", ""text"": ""whitespace before '('"", ""line"": ""print (\""Learn Python to be great!\"")"", ""line_number"": 1, ""column_number"": 6, ""category"": ""CODE_STYLE"", ""influence_on_penalty"": 0}]} +" \ No newline at end of file diff --git a/test/resources/evaluation/common/pandas_util/drop_duplicates/in_1.csv b/test/resources/evaluation/common/pandas_util/drop_duplicates/in_1.csv new file mode 100644 index 00000000..813f1a3c --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/drop_duplicates/in_1.csv @@ -0,0 +1,10 @@ +id,time,code,lang +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 +46846118,1617443943,"print("Hello")",python3 +46846118,1617443943,"print("Hello")",python3 +46846118,1617443943,"print("Hi")",python3 \ No newline at end of file diff --git a/test/resources/evaluation/common/pandas_util/drop_duplicates/in_2.csv b/test/resources/evaluation/common/pandas_util/drop_duplicates/in_2.csv new file mode 100644 index 00000000..ddc699ea --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/drop_duplicates/in_2.csv @@ -0,0 +1 @@ +id,time,code,lang diff --git a/test/resources/evaluation/common/pandas_util/drop_duplicates/in_3.csv b/test/resources/evaluation/common/pandas_util/drop_duplicates/in_3.csv new file mode 100644 index 00000000..01a46348 --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/drop_duplicates/in_3.csv @@ -0,0 +1,3 @@ +id,time,code,lang +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 diff --git a/test/resources/evaluation/common/pandas_util/drop_duplicates/out_1.csv b/test/resources/evaluation/common/pandas_util/drop_duplicates/out_1.csv new file mode 100644 index 00000000..d35cff98 --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/drop_duplicates/out_1.csv @@ -0,0 +1,4 @@ +id,time,code,lang +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 +46846118,1617443943,"print("Hi")",python3 \ No newline at end of file diff --git a/test/resources/evaluation/common/pandas_util/drop_duplicates/out_2.csv b/test/resources/evaluation/common/pandas_util/drop_duplicates/out_2.csv new file mode 100644 index 00000000..ddc699ea --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/drop_duplicates/out_2.csv @@ -0,0 +1 @@ +id,time,code,lang diff --git a/test/resources/evaluation/common/pandas_util/drop_duplicates/out_3.csv b/test/resources/evaluation/common/pandas_util/drop_duplicates/out_3.csv new file mode 100644 index 00000000..01a46348 --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/drop_duplicates/out_3.csv @@ -0,0 +1,3 @@ +id,time,code,lang +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 diff --git a/test/resources/evaluation/common/pandas_util/filter_by_language/in_1.csv b/test/resources/evaluation/common/pandas_util/filter_by_language/in_1.csv new file mode 100644 index 00000000..a1ca4864 --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/filter_by_language/in_1.csv @@ -0,0 +1,13 @@ +id,time,code,lang 
+46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 +46846118,1617443943,"println("")",kotlin +46846118,1617443943,"println("Hello")",kotlin +46846118,1617443943,"System.out.println("");",java7 +46846118,1617443943,"System.out.println("Hello");",java7 +46846118,1617443943,"System.out.println("");",java8 +46846118,1617443943,"System.out.println("Hello");",java8 +46846118,1617443943,"System.out.println("");",java9 +46846118,1617443943,"System.out.println("Hello");",java9 +46846118,1617443943,"System.out.println("");",java11 +46846118,1617443943,"System.out.println("Hello");",java11 diff --git a/test/resources/evaluation/common/pandas_util/filter_by_language/in_2.csv b/test/resources/evaluation/common/pandas_util/filter_by_language/in_2.csv new file mode 100644 index 00000000..a1ca4864 --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/filter_by_language/in_2.csv @@ -0,0 +1,13 @@ +id,time,code,lang +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 +46846118,1617443943,"println("")",kotlin +46846118,1617443943,"println("Hello")",kotlin +46846118,1617443943,"System.out.println("");",java7 +46846118,1617443943,"System.out.println("Hello");",java7 +46846118,1617443943,"System.out.println("");",java8 +46846118,1617443943,"System.out.println("Hello");",java8 +46846118,1617443943,"System.out.println("");",java9 +46846118,1617443943,"System.out.println("Hello");",java9 +46846118,1617443943,"System.out.println("");",java11 +46846118,1617443943,"System.out.println("Hello");",java11 diff --git a/test/resources/evaluation/common/pandas_util/filter_by_language/in_3.csv b/test/resources/evaluation/common/pandas_util/filter_by_language/in_3.csv new file mode 100644 index 00000000..a1ca4864 --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/filter_by_language/in_3.csv @@ -0,0 +1,13 @@ +id,time,code,lang +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 +46846118,1617443943,"println("")",kotlin +46846118,1617443943,"println("Hello")",kotlin +46846118,1617443943,"System.out.println("");",java7 +46846118,1617443943,"System.out.println("Hello");",java7 +46846118,1617443943,"System.out.println("");",java8 +46846118,1617443943,"System.out.println("Hello");",java8 +46846118,1617443943,"System.out.println("");",java9 +46846118,1617443943,"System.out.println("Hello");",java9 +46846118,1617443943,"System.out.println("");",java11 +46846118,1617443943,"System.out.println("Hello");",java11 diff --git a/test/resources/evaluation/common/pandas_util/filter_by_language/in_4.csv b/test/resources/evaluation/common/pandas_util/filter_by_language/in_4.csv new file mode 100644 index 00000000..a1ca4864 --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/filter_by_language/in_4.csv @@ -0,0 +1,13 @@ +id,time,code,lang +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 +46846118,1617443943,"println("")",kotlin +46846118,1617443943,"println("Hello")",kotlin +46846118,1617443943,"System.out.println("");",java7 +46846118,1617443943,"System.out.println("Hello");",java7 +46846118,1617443943,"System.out.println("");",java8 +46846118,1617443943,"System.out.println("Hello");",java8 +46846118,1617443943,"System.out.println("");",java9 +46846118,1617443943,"System.out.println("Hello");",java9 +46846118,1617443943,"System.out.println("");",java11 +46846118,1617443943,"System.out.println("Hello");",java11 diff --git 
a/test/resources/evaluation/common/pandas_util/filter_by_language/in_5.csv b/test/resources/evaluation/common/pandas_util/filter_by_language/in_5.csv new file mode 100644 index 00000000..a1ca4864 --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/filter_by_language/in_5.csv @@ -0,0 +1,13 @@ +id,time,code,lang +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 +46846118,1617443943,"println("")",kotlin +46846118,1617443943,"println("Hello")",kotlin +46846118,1617443943,"System.out.println("");",java7 +46846118,1617443943,"System.out.println("Hello");",java7 +46846118,1617443943,"System.out.println("");",java8 +46846118,1617443943,"System.out.println("Hello");",java8 +46846118,1617443943,"System.out.println("");",java9 +46846118,1617443943,"System.out.println("Hello");",java9 +46846118,1617443943,"System.out.println("");",java11 +46846118,1617443943,"System.out.println("Hello");",java11 diff --git a/test/resources/evaluation/common/pandas_util/filter_by_language/out_1.csv b/test/resources/evaluation/common/pandas_util/filter_by_language/out_1.csv new file mode 100644 index 00000000..a1ca4864 --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/filter_by_language/out_1.csv @@ -0,0 +1,13 @@ +id,time,code,lang +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 +46846118,1617443943,"println("")",kotlin +46846118,1617443943,"println("Hello")",kotlin +46846118,1617443943,"System.out.println("");",java7 +46846118,1617443943,"System.out.println("Hello");",java7 +46846118,1617443943,"System.out.println("");",java8 +46846118,1617443943,"System.out.println("Hello");",java8 +46846118,1617443943,"System.out.println("");",java9 +46846118,1617443943,"System.out.println("Hello");",java9 +46846118,1617443943,"System.out.println("");",java11 +46846118,1617443943,"System.out.println("Hello");",java11 diff --git a/test/resources/evaluation/common/pandas_util/filter_by_language/out_2.csv b/test/resources/evaluation/common/pandas_util/filter_by_language/out_2.csv new file mode 100644 index 00000000..ddc699ea --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/filter_by_language/out_2.csv @@ -0,0 +1 @@ +id,time,code,lang diff --git a/test/resources/evaluation/common/pandas_util/filter_by_language/out_3.csv b/test/resources/evaluation/common/pandas_util/filter_by_language/out_3.csv new file mode 100644 index 00000000..01a46348 --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/filter_by_language/out_3.csv @@ -0,0 +1,3 @@ +id,time,code,lang +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 diff --git a/test/resources/evaluation/common/pandas_util/filter_by_language/out_4.csv b/test/resources/evaluation/common/pandas_util/filter_by_language/out_4.csv new file mode 100644 index 00000000..01a46348 --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/filter_by_language/out_4.csv @@ -0,0 +1,3 @@ +id,time,code,lang +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 diff --git a/test/resources/evaluation/common/pandas_util/filter_by_language/out_5.csv b/test/resources/evaluation/common/pandas_util/filter_by_language/out_5.csv new file mode 100644 index 00000000..d54853f4 --- /dev/null +++ b/test/resources/evaluation/common/pandas_util/filter_by_language/out_5.csv @@ -0,0 +1,5 @@ +id,time,code,lang +46846118,1617443943,"print("")",python3 +46846118,1617443943,"print("Hello")",python3 
+46846118,1617443943,"System.out.println("");",java11 +46846118,1617443943,"System.out.println("Hello");",java11 diff --git a/test/resources/evaluation/xlsx_target_files/target_sorted_order.xlsx b/test/resources/evaluation/xlsx_target_files/target_sorted_order.xlsx index 8c24f18b..a6ad5df7 100644 Binary files a/test/resources/evaluation/xlsx_target_files/target_sorted_order.xlsx and b/test/resources/evaluation/xlsx_target_files/target_sorted_order.xlsx differ diff --git a/test/resources/evaluation/xlsx_target_files/target_unsorted_order.xlsx b/test/resources/evaluation/xlsx_target_files/target_unsorted_order.xlsx index a091643a..1e96d109 100644 Binary files a/test/resources/evaluation/xlsx_target_files/target_unsorted_order.xlsx and b/test/resources/evaluation/xlsx_target_files/target_unsorted_order.xlsx differ diff --git a/whitelist.txt b/whitelist.txt index 7095e567..0573d872 100644 --- a/whitelist.txt +++ b/whitelist.txt @@ -93,4 +93,14 @@ nom wmc util tmp -Namespace \ No newline at end of file +Namespace +pandarallel +isin +loc +uniq +fullmatch +iloc +dataframes +numpy +Pickler +Unpickler \ No newline at end of file
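# A short, illustrative sketch of the new file_system helpers added in this patch; the
# paths used here are made up for the example and are not taken from the repository.
from pathlib import Path

from src.python.review.common.file_system import (
    Extension,
    get_name_from_path,
    get_parent_folder,
    get_restricted_extension,
    pair_in_and_out_files,
)

assert get_name_from_path('path/data/solutions.csv') == 'solutions.csv'
assert get_name_from_path('path/data/solutions.csv', with_extension=False) == 'solutions'
assert get_parent_folder(Path('path/data/solutions.csv')) == Path('path/data')
assert get_restricted_extension('path/data/solutions.csv', [Extension.XLSX, Extension.CSV]) == Extension.CSV

# in_/out_ test resources are paired by replacing the last 'in' in the path with 'out'
assert pair_in_and_out_files([Path('res/in_1.csv')], [Path('res/out_1.csv')]) == [
    (Path('res/in_1.csv'), Path('res/out_1.csv')),
]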
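# A minimal, hypothetical sketch of how the diff-related pieces above can be combined:
# compare two result tables with find_diffs and persist the result with the new pickle
# helpers. The file names are placeholders, not paths from the repository.
from pathlib import Path

from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path
from src.python.evaluation.inspectors.diffs_between_df import find_diffs
from src.python.review.common.file_system import (
    deserialize_data_from_file,
    serialize_data_and_write_to_file,
)

old_df = get_solutions_df_by_file_path(Path('results/old.csv'))
new_df = get_solutions_df_by_file_path(Path('results/new.csv'))

# Shape used in the tests above: {ColumnName.GRADE.value: [ids...],
# EvaluationArgument.TRACEBACK.value: {id: set of BaseIssue}}
diffs = find_diffs(old_df, new_df)

diffs_path = Path('results/diffs.pickle')
serialize_data_and_write_to_file(diffs_path, diffs)  # creates the parent folder if needed
assert deserialize_data_from_file(diffs_path) == diffs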