Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION.md
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.0.0
1.2.0
3 changes: 2 additions & 1 deletion requirements-evaluation.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
openpyxl==3.0.7
pandas==1.2.3
pandas==1.2.3
pandarallel
13 changes: 13 additions & 0 deletions src/python/common/tool_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from enum import Enum, unique
from typing import List, Optional

from src.python.evaluation.common.util import ColumnName
from src.python.review.application_config import LanguageVersion
from src.python.review.inspectors.inspector_type import InspectorType

Expand Down Expand Up @@ -76,3 +77,15 @@ class RunToolArgument(Enum):
HISTORY = ArgumentsInfo(None, '--history',
'Json string, which contains lists of issues in the previous submissions '
'for other tasks for one user.')

SOLUTIONS_FILE_PATH = ArgumentsInfo(None, 'solutions_file_path',
'Local XLSX-file or CSV-file path. '
'Your file must include column-names: '
f'"{ColumnName.CODE.value}" and '
f'"{ColumnName.LANG.value}". Acceptable values for '
f'"{ColumnName.LANG.value}" column are: '
f'{LanguageVersion.PYTHON_3.value}, {LanguageVersion.JAVA_8.value}, '
f'{LanguageVersion.JAVA_11.value}, {LanguageVersion.KOTLIN.value}.')

DIFFS_FILE_PATH = ArgumentsInfo(None, 'diffs_file_path',
'Path to a file with serialized diffs that were founded by diffs_between_df.py')
24 changes: 14 additions & 10 deletions src/python/evaluation/README.md
Original file line number Diff line number Diff line change
@@ -1,31 +1,35 @@
# Hyperstyle evaluation

This tool allows running the `Hyperstyle` tool on an xlsx table to get code quality for all code fragments. Please, note that your input file should consist of at least 2 obligatory columns to run xlsx-tool on its code fragments:
This tool allows running the `Hyperstyle` tool on a `xlsx` or `csv` table to get code quality for all code fragments.
Please, note that your input file should consist of at least 2 obligatory columns to run the tool on its code fragments:

- `code`
- `lang`

Possible values for column `lang` are: `python3`, `kotlin`, `java8`, `java11`.

Output file is a new `xlsx` file with 3 columns:
- `code`
- `lang`
Output file is a new `xlsx` or `csv` file with the all columns from the input file and two additional ones:
- `grade`
Grade assessment is conducted by [`run_tool.py`](https://github.com/hyperskill/hyperstyle/blob/main/README.md) with default arguments. Available values for column `grade` are: BAD, MODERATE, GOOD, EXCELLENT. It is also possible to add a fourth column, `traceback`, to get full inspectors feedback on each code fragment. More details on enabling the traceback column are in the **Optional Arguments** table.
- `traceback` (optional)

Grade assessment is conducted by [`run_tool.py`](https://github.com/hyperskill/hyperstyle/blob/main/README.md) with default arguments.
Available values for column `grade` are: BAD, MODERATE, GOOD, EXCELLENT.
`traceback` column stores full inspectors feedback on each code fragment.
More details on enabling the `traceback` column can be found in the **Optional Arguments** table.

## Usage

Run the [xlsx_run_tool.py](xlsx_run_tool.py) with the arguments from command line.
Run the [evaluation_run_tool.py](evaluation_run_tool.py) with the arguments from command line.

Required arguments:

`xlsx_file_path` — path to xlsx-file with code samples to inspect.
`solutions_file_path` — path to xlsx-file or csv-file with code samples to inspect.

Optional arguments:
Argument | Description
--- | ---
|**‑f**, **‑‑format**| The output format. Available values: `json`, `text`. The default value is `json` . Use this argument when `traceback` is enabled, otherwise it will not be used.|
|**‑tp**, **‑‑tool_path**| Path to run-tool. Default is `src/python/review/run_tool.py` .|
|**‑tr**, **‑‑traceback**| To include a column with errors traceback into an output file. Default is `False`.|
|**‑ofp**, **‑‑output_folder_path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file sent for inspection. |
|**‑ofn**, **‑‑output_file_name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx`.|
|**‑‑traceback**| To include a column with errors traceback into an output file. Default is `False`.|
|**‑ofp**, **‑‑output_folder_path**| An explicit folder path to store file with results. Default is a parent directory of a folder with xlsx-file or csv-file sent for inspection. |
|**‑ofn**, **‑‑output_file_name**| A name of an output file where evaluation results will be stored. Default is `results.xlsx` or `results.csv`.|
13 changes: 13 additions & 0 deletions src/python/evaluation/common/csv_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from pathlib import Path
from typing import Union

import pandas as pd
from src.python.review.common.file_system import Encoding


def write_dataframe_to_csv(csv_file_path: Union[str, Path], df: pd.DataFrame) -> None:
    """Persist ``df`` as a CSV file at ``csv_file_path`` without the index column.

    ISO encoding is tried first; several code fragments cannot be represented
    in it, so UTF-8 is used as a fallback.
    """
    try:
        df.to_csv(csv_file_path, index=False, encoding=Encoding.ISO_ENCODING.value)
    except UnicodeEncodeError:
        df.to_csv(csv_file_path, index=False, encoding=Encoding.UTF_ENCODING.value)
103 changes: 103 additions & 0 deletions src/python/evaluation/common/pandas_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import json
import logging
from pathlib import Path
from typing import Any, List, Set, Union

import numpy as np
import pandas as pd
from src.python.evaluation.common.csv_util import write_dataframe_to_csv
from src.python.evaluation.common.util import ColumnName, EvaluationArgument
from src.python.evaluation.common.xlsx_util import create_workbook, remove_sheet, write_dataframe_to_xlsx_sheet
from src.python.review.application_config import LanguageVersion
from src.python.review.common.file_system import Extension, get_restricted_extension
from src.python.review.inspectors.issue import BaseIssue
from src.python.review.reviewers.utils.print_review import convert_json_to_issues

logger = logging.getLogger(__name__)


def filter_df_by_language(df: pd.DataFrame, languages: Set[LanguageVersion],
                          column: str = ColumnName.LANG.value) -> pd.DataFrame:
    """Keep only the rows whose ``column`` value belongs to ``languages``."""
    allowed_values = {language.value for language in languages}
    return df.loc[df[column].isin(allowed_values)]


def filter_df_by_condition(df: pd.DataFrame, column: str, value: Any) -> pd.DataFrame:
    """Return the rows of ``df`` in which ``column`` equals ``value``."""
    equals_mask = df[column] == value
    return df.loc[equals_mask]


def drop_duplicates(df: pd.DataFrame, column: str = ColumnName.CODE.value) -> pd.DataFrame:
    """Drop rows duplicated in ``column``, keeping only the last occurrence."""
    return df.drop_duplicates(subset=column, keep='last')


# Locate every (row, column) position at which two dataframes disagree.
# The result is a stacked boolean series indexed by (row, column), e.g.:
# row | column   |
# -------------------------
# 3   | column_1 | True
#     | column_2 | True
# -------------------------
# 4   | column_1 | True
#     | column_2 | True
# meaning the two frames differ in column_1 and column_2 of rows 3 and 4.
def get_inconsistent_positions(first: pd.DataFrame, second: pd.DataFrame) -> pd.DataFrame:
    mismatch_mask = (first != second).stack()
    positions = mismatch_mask[mismatch_mask]
    positions.index.names = [ColumnName.ROW.value, ColumnName.COLUMN.value]
    return positions


# Build a dataframe listing every differing cell together with its
# old and new value. Example:
#             | old       | new
# ---------------------------------
# row column  |           |
# 3   grade   | EXCELLENT | MODERATE
# 4   grade   | EXCELLENT | BAD
def get_diffs(first: pd.DataFrame, second: pd.DataFrame) -> pd.DataFrame:
    inconsistent = get_inconsistent_positions(first, second)

    differing_cells = np.where(first != second)
    old_values = first.values[differing_cells]
    new_values = second.values[differing_cells]
    return pd.DataFrame(
        {
            ColumnName.OLD.value: old_values,
            ColumnName.NEW.value: new_values,
        },
        index=inconsistent.index,
    )


def get_solutions_df(ext: Extension, file_path: Union[str, Path]) -> pd.DataFrame:
    """Read the solutions table from an XLSX or CSV file into a dataframe.

    :param ext: restricted extension of ``file_path`` (XLSX or CSV).
    :param file_path: path to the table with code solutions.
    :raises FileNotFoundError: if no file exists at ``file_path``.
    """
    try:
        if ext == Extension.XLSX:
            lang_code_dataframe = pd.read_excel(file_path)
        else:
            # Any non-XLSX restricted extension is treated as CSV.
            lang_code_dataframe = pd.read_csv(file_path)
    except FileNotFoundError as e:
        # Fixed grammar of the error message ("does not exists" -> "does not exist").
        logger.error('XLSX-file or CSV-file with the specified name does not exist.')
        raise e

    return lang_code_dataframe


def get_solutions_df_by_file_path(path: Path) -> pd.DataFrame:
    """Load the solutions dataframe, validating that ``path`` is XLSX or CSV."""
    extension = get_restricted_extension(path, [Extension.XLSX, Extension.CSV])
    return get_solutions_df(extension, path)


def write_df_to_file(df: pd.DataFrame, output_file_path: Path, extension: Extension) -> None:
    """Write ``df`` to ``output_file_path`` in the format given by ``extension``.

    :raises ValueError: if ``extension`` is neither CSV nor XLSX.
    """
    if extension == Extension.CSV:
        write_dataframe_to_csv(output_file_path, df)
    elif extension == Extension.XLSX:
        create_workbook(output_file_path)
        write_dataframe_to_xlsx_sheet(output_file_path, df, 'inspection_results')
        # remove empty sheet that was initially created with the workbook
        remove_sheet(output_file_path, 'Sheet')
    else:
        # Previously an unsupported extension fell through silently and no file
        # was written; fail loudly instead so the data is never lost unnoticed.
        raise ValueError(f'Unsupported extension for the output file: {extension}')


def get_issues_from_json(str_json: str) -> List[BaseIssue]:
    """Deserialize the ``issues`` list from a JSON traceback string."""
    issues_json = json.loads(str_json)['issues']
    return convert_json_to_issues(issues_json)


def get_issues_by_row(df: pd.DataFrame, row: int) -> List[BaseIssue]:
    """Parse the issues stored in the traceback column of the ``row``-th row."""
    traceback_json = df.iloc[row][EvaluationArgument.TRACEBACK.value]
    return get_issues_from_json(traceback_json)
47 changes: 27 additions & 20 deletions src/python/evaluation/common/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,36 @@

@unique
class ColumnName(Enum):
CODE = "code"
LANG = "lang"
LANGUAGE = "language"
GRADE = "grade"
CODE = 'code'
LANG = 'lang'
LANGUAGE = 'language'
GRADE = 'grade'
ID = 'id'
COLUMN = 'column'
ROW = 'row'
OLD = 'old'
NEW = 'new'
IS_PUBLIC = 'is_public'


@unique
class EvaluationArgument(Enum):
TRACEBACK = "traceback"
RESULT_FILE_NAME = "results"
RESULT_FILE_NAME_EXT = f"{RESULT_FILE_NAME}{Extension.XLSX.value}"
TRACEBACK = 'traceback'
RESULT_FILE_NAME = 'evaluation_results'
RESULT_FILE_NAME_XLSX = f'{RESULT_FILE_NAME}{Extension.XLSX.value}'
RESULT_FILE_NAME_CSV = f'{RESULT_FILE_NAME}{Extension.CSV.value}'


script_structure_rule = ("Please, make sure your XLSX-file matches following script standards: \n"
"1. Your XLSX-file should have 2 obligatory columns named:"
f"'{ColumnName.CODE.value}' & '{ColumnName.LANG.value}'. \n"
f"'{ColumnName.CODE.value}' column -- relates to the code-sample. \n"
f"'{ColumnName.LANG.value}' column -- relates to the language of a "
"particular code-sample. \n"
"2. Your code samples should belong to the one of the supported languages. \n"
"Supported languages are: Java, Kotlin, Python. \n"
f"3. Check that '{ColumnName.LANG.value}' column cells are filled with "
"acceptable language-names: \n"
f"Acceptable language-names are: {LanguageVersion.PYTHON_3.value}, "
f"{LanguageVersion.JAVA_8.value} ,"
f"{LanguageVersion.JAVA_11.value} and {LanguageVersion.KOTLIN.value}.")
script_structure_rule = ('Please, make sure your XLSX-file matches following script standards: \n'
'1. Your XLSX-file or CSV-file should have 2 obligatory columns named:'
f'"{ColumnName.CODE.value}" & "{ColumnName.LANG.value}". \n'
f'"{ColumnName.CODE.value}" column -- relates to the code-sample. \n'
f'"{ColumnName.LANG.value}" column -- relates to the language of a '
'particular code-sample. \n'
'2. Your code samples should belong to the one of the supported languages. \n'
'Supported languages are: Java, Kotlin, Python. \n'
f'3. Check that "{ColumnName.LANG.value}" column cells are filled with '
'acceptable language-names: \n'
f'Acceptable language-names are: {LanguageVersion.PYTHON_3.value}, '
f'{LanguageVersion.JAVA_8.value} ,'
f'{LanguageVersion.JAVA_11.value} and {LanguageVersion.KOTLIN.value}.')
8 changes: 3 additions & 5 deletions src/python/evaluation/common/xlsx_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import pandas as pd
from openpyxl import load_workbook, Workbook
from src.python.evaluation.evaluation_config import EvaluationConfig

logger = logging.getLogger(__name__)

Expand All @@ -24,11 +23,10 @@ def remove_sheet(workbook_path: Union[str, Path], sheet_name: str, to_raise_erro
logger.info(message)


def create_and_get_workbook_path(config: EvaluationConfig) -> Path:
def create_workbook(output_file_path: Path) -> Workbook:
workbook = Workbook()
workbook_path = config.get_output_file_path()
workbook.save(workbook_path)
return workbook_path
workbook.save(output_file_path)
return workbook


def write_dataframe_to_xlsx_sheet(xlsx_file_path: Union[str, Path], df: pd.DataFrame, sheet_name: str,
Expand Down
26 changes: 18 additions & 8 deletions src/python/evaluation/evaluation_config.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
import logging.config
from argparse import Namespace
from pathlib import Path
from typing import List, Union
from typing import List, Optional, Union

from src.python.common.tool_arguments import RunToolArgument
from src.python.evaluation.common.util import EvaluationArgument
from src.python.review.application_config import LanguageVersion
from src.python.review.common.file_system import create_directory
from src.python.review.common.file_system import (
create_directory,
Extension,
get_parent_folder,
get_restricted_extension,
)

logger = logging.getLogger(__name__)

Expand All @@ -15,10 +20,17 @@ class EvaluationConfig:
def __init__(self, args: Namespace):
self.tool_path: Union[str, Path] = args.tool_path
self.output_format: str = args.format
self.xlsx_file_path: Union[str, Path] = args.xlsx_file_path
self.solutions_file_path: Union[str, Path] = args.solutions_file_path
self.traceback: bool = args.traceback
self.output_folder_path: Union[str, Path] = args.output_folder_path
self.output_file_name: str = args.output_file_name
self.extension: Extension = get_restricted_extension(self.solutions_file_path, [Extension.XLSX, Extension.CSV])
self.__init_output_file_name(args.output_file_name)

def __init_output_file_name(self, output_file_name: Optional[str]):
if output_file_name is None:
self.output_file_name = f'{EvaluationArgument.RESULT_FILE_NAME.value}{self.extension.value}'
else:
self.output_file_name = output_file_name

def build_command(self, inspected_file_path: Union[str, Path], lang: str) -> List[str]:
command = [LanguageVersion.PYTHON_3.value,
Expand All @@ -33,11 +45,9 @@ def build_command(self, inspected_file_path: Union[str, Path], lang: str) -> Lis
def get_output_file_path(self) -> Path:
if self.output_folder_path is None:
try:
self.output_folder_path = (
Path(self.xlsx_file_path).parent.parent / EvaluationArgument.RESULT_FILE_NAME.value
)
self.output_folder_path = get_parent_folder(Path(self.solutions_file_path))
create_directory(self.output_folder_path)
except FileNotFoundError as e:
logger.error('XLSX-file with the specified name does not exists.')
logger.error('XLSX-file or CSV-file with the specified name does not exists.')
raise e
return Path(self.output_folder_path) / self.output_file_name
Loading