hyperskill · nbirillo · Jun 21, 2021 · May 17, 2021 · May 18, 2021 · May 18, 2021
diff --git a/src/python/evaluation/common/util.py b/src/python/evaluation/common/util.py
@@ -21,6 +21,7 @@ class ColumnName(Enum):
     PENALTY = 'penalty'
     USER = 'user'
     HISTORY = 'history'
+    TIME = 'time'
     TRACEBACK = 'traceback'
 
 

diff --git a/src/python/evaluation/evaluation_config.py b/src/python/evaluation/evaluation_config.py
@@ -22,6 +22,7 @@ def __init__(self, args: Namespace):
         self.format: str = args.format
         self.solutions_file_path: Union[str, Path] = args.solutions_file_path
         self.traceback: bool = args.traceback
+        self.with_history: bool = args.with_history
         self.output_folder_path: Union[str, Path] = args.output_folder_path
         self.extension: Extension = get_restricted_extension(self.solutions_file_path, [Extension.XLSX, Extension.CSV])
         self.__init_output_file_name(args.output_file_name)
@@ -32,12 +33,15 @@ def __init_output_file_name(self, output_file_name: Optional[str]):
         else:
             self.output_file_name = output_file_name
 
-    def build_command(self, inspected_file_path: Union[str, Path], lang: str) -> List[str]:
+    def build_command(self, inspected_file_path: Union[str, Path], lang: str, history: Optional[str]) -> List[str]:
         command = [LanguageVersion.PYTHON_3.value,
                    self.tool_path,
                    inspected_file_path,
                    RunToolArgument.FORMAT.value.short_name, self.format]
 
+        if self.with_history and history is not None:
+            command.extend([RunToolArgument.HISTORY.value.long_name, history])
+
         if lang == LanguageVersion.JAVA_8.value or lang == LanguageVersion.JAVA_11.value:
             command.extend([RunToolArgument.LANG_VERSION.value.long_name, lang])
         return command

diff --git a/src/python/evaluation/evaluation_run_tool.py b/src/python/evaluation/evaluation_run_tool.py
@@ -6,6 +6,7 @@
 import time
 import traceback
 from pathlib import Path
+from typing import Optional
 
 sys.path.append('')
 sys.path.append('../../..')
@@ -63,6 +64,12 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None:
                              f'Use this argument when {EvaluationArgument.TRACEBACK.value} argument'
                              'is enabled argument will not be used otherwise.')
 
+    parser.add_argument('--with-history',
+                        help=f'If True, then history will be taken into account when calculating the grade. '
+                             f'In that case, for each fragment, the "{ColumnName.HISTORY.value}" column '
+                             'must contain the history of previous errors.',
+                        action='store_true')
+
 
 def get_language(lang_key: str) -> LanguageVersion:
     try:
@@ -73,14 +80,14 @@ def get_language(lang_key: str) -> LanguageVersion:
         raise KeyError(e)
 
 
-def __inspect_row(lang: str, code: str, fragment_id: int, config: EvaluationConfig) -> str:
+def __inspect_row(lang: str, code: str, fragment_id: int, history: Optional[str], config: EvaluationConfig) -> str:
     print(f'current id: {fragment_id}')
     # Tool does not work correctly with tmp files from <tempfile> module on macOS
     # thus we create a real file in the file system
     extension = get_language(lang).extension_by_language().value
     tmp_file_path = config.solutions_file_path.parent.absolute() / f'inspected_code_{fragment_id}{extension}'
     temp_file = next(create_file(tmp_file_path, code))
-    command = config.build_command(temp_file, lang)
+    command = config.build_command(temp_file, lang, history)
     results = run_in_subprocess(command)
     os.remove(temp_file)
     return results
@@ -103,7 +110,9 @@ def inspect_solutions_df(config: EvaluationConfig, lang_code_dataframe: pd.DataF
         lang_code_dataframe[ColumnName.TRACEBACK.value] = lang_code_dataframe.parallel_apply(
             lambda row: __inspect_row(row[ColumnName.LANG.value],
                                       row[ColumnName.CODE.value],
-                                      row[ColumnName.ID.value], config), axis=1)
+                                      row[ColumnName.ID.value],
+                                      row.get(ColumnName.HISTORY.value),
+                                      config), axis=1)
 
         lang_code_dataframe[ColumnName.GRADE.value] = lang_code_dataframe.parallel_apply(
             lambda row: __get_grade_from_traceback(row[ColumnName.TRACEBACK.value]), axis=1)

diff --git a/src/python/evaluation/inspectors/README.md b/src/python/evaluation/inspectors/README.md
@@ -7,6 +7,7 @@ This module contains _preprocessing_ stage and _analysing_ stage.
   the `csv` or `xslx` file with student solutions and drop duplicates of code fragments (optional);
 - [distribute_grades.py](distribute_grades.py) allows distributing calculated grades and traceback 
   for unique solutions into all solutions.
+- [generate_history.py](generate_history.py) allows you to generate history based on issues from previous solutions.
 
 `Analysing` stage includes:
 - [diffs_between_df.py](diffs_between_df.py) allows finding a difference between 
@@ -80,6 +81,39 @@ Required arguments:
 
 The resulting file will be stored in the same folder as the input file with all samples.
 
+----
+
+### Generate history
+
+[generate_history.py](generate_history.py) allows you to generate history based on issues from previous solutions.
+
+Please note that your solutions file must contain at least the following 4 required columns:
+
+- `user`,
+- `lang`,
+- `time`,
+- `traceback`.
+
+You can get such a file with [evaluation_run_tool.py](../evaluation_run_tool.py).
+
+The output file is a new `xlsx` or `csv` file (the same format as the input file) with all columns from the input 
+plus the `history` column. The `traceback` and `grade` columns can be dropped with the optional flags described below.
+
+#### Usage
+
+Run [generate_history.py](generate_history.py) from the command line with the arguments described below.
+
+Required argument:
+
+- `solutions_file_path` — path to the xlsx or csv file with the necessary columns.
+
+Optional arguments:
+
+Argument | Description
+--- | ---
+|**&#8209;o**, **&#8209;&#8209;output&#8209;path**| The path where the dataset with history will be saved. If not specified, the dataset will be saved next to the original one. |
+|**&#8209;&#8209;to&#8209;drop&#8209;traceback**| The `traceback` column will be removed from the final dataset. Default is false. |
+|**&#8209;&#8209;to&#8209;drop&#8209;grade**| The `grade` column will be removed from the final dataset. Default is false. |
+
 ___
 
 ## Analysing

diff --git a/src/python/evaluation/inspectors/generate_history.py b/src/python/evaluation/inspectors/generate_history.py
@@ -0,0 +1,131 @@
+import argparse
+import json
+from collections import Counter
+from pathlib import Path
+
+import pandas as pd
+from pandarallel import pandarallel
+from src.python.common.tool_arguments import RunToolArgument
+from src.python.evaluation.common.pandas_util import (
+    get_issues_from_json,
+    get_solutions_df_by_file_path,
+    write_df_to_file,
+)
+from src.python.evaluation.common.util import ColumnName, EvaluationArgument
+from src.python.evaluation.evaluation_run_tool import get_language
+from src.python.review.common.file_system import (
+    Extension,
+    get_name_from_path,
+    get_parent_folder,
+    get_restricted_extension,
+)
+from src.python.review.common.language import Language
+
+TRACEBACK = EvaluationArgument.TRACEBACK.value
+GRADE = ColumnName.GRADE.value
+HISTORY = ColumnName.HISTORY.value
+USER = ColumnName.USER.value
+LANG = ColumnName.LANG.value
+TIME = ColumnName.TIME.value
+EXTRACTED_ISSUES = 'extracted_issues'
+
+
+def configure_arguments(parser: argparse.ArgumentParser) -> None:
+    """
+    Register the command line arguments of the history generation script.
+
+    The parser is modified in place: the solutions file path, an optional output path
+    and two flags that drop the traceback and grade columns are added.
+
+    :param parser: the parser that receives the arguments.
+    """
+    parser.add_argument(
+        RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name,
+        type=lambda value: Path(value).absolute(),
+        # Every column name is wrapped in its own pair of double quotes.
+        help=f'Path to csv or xlsx file. Your dataset must include column-names: '
+             f'"{USER}", "{LANG}", "{TIME}", "{TRACEBACK}".',
+    )
+
+    parser.add_argument(
+        '-o', '--output-path',
+        type=lambda value: Path(value).absolute(),
+        help='The path where the dataset with history will be saved. '
+             'If not specified, the dataset will be saved next to the original one.',
+    )
+
+    parser.add_argument(
+        '--to-drop-traceback',
+        help=f'The "{TRACEBACK}" column will be removed from the final dataset.',
+        action='store_true',
+    )
+
+    parser.add_argument(
+        '--to-drop-grade',
+        help=f'The "{GRADE}" column will be removed from the final dataset.',
+        action='store_true',
+    )
+
+
+def _update_counter(extracted_issues: str, counter: Counter) -> None:
+    """
+    Add the issue classes listed in a comma-separated string to the counter.
+
+    :param extracted_issues: comma-separated issue classes; a falsy value adds nothing.
+    :param counter: the counter that is updated in place.
+    """
+    if not extracted_issues:
+        return
+
+    counter.update(extracted_issues.split(','))
+
+
+def _add_history(row, solutions_df: pd.DataFrame) -> str:
+    """
+    Build the JSON-encoded history of issues for one solution.
+
+    The history counts the issue classes of all solutions that have the same user and
+    language as the given row and a strictly smaller time value.
+
+    :param row: the solution row; its user, language and time values are read.
+    :param solutions_df: all solutions, including the column with the extracted issues.
+    :return: '{}' if the language version could not be identified or no previous issues
+        were found, otherwise a JSON object that maps the lower-cased language name to a list
+        of {'origin_class': ..., 'number': ...} entries.
+    """
+    # If we were unable to identify the language version, we return an empty history.
+    # This is checked first so that the dataset is not filtered for nothing.
+    try:
+        lang_version = get_language(row[LANG])
+    except KeyError:
+        return json.dumps({})
+
+    previous_solutions = solutions_df[
+        (solutions_df[USER] == row[USER]) & (solutions_df[LANG] == row[LANG]) & (solutions_df[TIME] < row[TIME])
+    ]
+
+    # The column is iterated directly instead of using DataFrame.apply for its side effect:
+    # pandas may invoke an applied function more often than once per row (e.g. to probe the
+    # result type), which is unsafe for a function that mutates a shared counter.
+    counter = Counter()
+    for extracted_issues in previous_solutions[EXTRACTED_ISSUES]:
+        _update_counter(extracted_issues, counter)
+
+    if not counter:
+        return json.dumps({})
+
+    lang = Language.from_language_version(lang_version)
+    history = {lang.value.lower(): [{'origin_class': key, 'number': value} for key, value in counter.items()]}
+
+    return json.dumps(history)
+
+
+def _extract_issues(traceback: str) -> str:
+    """
+    Join the origin classes of the issues parsed from the traceback into one string.
+
+    :param traceback: the value that is passed to get_issues_from_json.
+    :return: the origin classes separated by commas, in the order the issues were returned.
+    """
+    return ','.join(issue.origin_class for issue in get_issues_from_json(traceback))
+
+
+def main():
+    """
+    Read the solutions file, add the history column and write the resulting dataset.
+
+    The helper column with the extracted issues is always dropped; the grade and traceback
+    columns are dropped only if the corresponding flags were passed. If no output path was
+    passed, the dataset is saved next to the input file as '<input name>_with_history' with
+    the extension of the input file.
+    """
+    parser = argparse.ArgumentParser()
+    configure_arguments(parser)
+    args = parser.parse_args()
+
+    pandarallel.initialize()
+
+    solutions_file_path = args.solutions_file_path
+    solutions_df = get_solutions_df_by_file_path(solutions_file_path)
+    solutions_df[EXTRACTED_ISSUES] = solutions_df.parallel_apply(lambda row: _extract_issues(row[TRACEBACK]), axis=1)
+    solutions_df[HISTORY] = solutions_df.parallel_apply(_add_history, axis=1, args=(solutions_df,))
+
+    columns_to_drop = [EXTRACTED_ISSUES]
+
+    if args.to_drop_grade:
+        columns_to_drop.append(GRADE)
+
+    if args.to_drop_traceback:
+        columns_to_drop.append(TRACEBACK)
+
+    # errors='ignore': the grade column may be absent from the input dataset.
+    solutions_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')
+
+    # The output is written in the format of the input file, so the default file name
+    # must carry the same extension instead of a hard-coded csv one.
+    output_ext = get_restricted_extension(solutions_file_path, [Extension.XLSX, Extension.CSV])
+
+    output_path = args.output_path
+    if output_path is None:
+        output_dir = get_parent_folder(solutions_file_path)
+        dataset_name = get_name_from_path(solutions_file_path, with_extension=False)
+        output_path = output_dir / f'{dataset_name}_with_history{output_ext.value}'
+
+    write_df_to_file(solutions_df, output_path, output_ext)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/python/review/common/language.py b/src/python/review/common/language.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from typing import List
 
+from src.python.review.application_config import LanguageVersion
 from src.python.review.common.file_system import Extension, get_extension_from_file
 
 
@@ -13,6 +14,19 @@ class Language(Enum):
     JS = 'JAVASCRIPT'
     UNKNOWN = 'UNKNOWN'
 
+    @staticmethod
+    def from_language_version(language_version: LanguageVersion) -> 'Language':
+        """
+        Map a language version to the language it belongs to.
+
+        :param language_version: the version to convert.
+        :return: the matching language, or Language.UNKNOWN if the version has no mapping.
+        """
+        java_versions = (
+            LanguageVersion.JAVA_7,
+            LanguageVersion.JAVA_8,
+            LanguageVersion.JAVA_9,
+            LanguageVersion.JAVA_11,
+        )
+
+        lang_by_version = dict.fromkeys(java_versions, Language.JAVA)
+        lang_by_version[LanguageVersion.PYTHON_3] = Language.PYTHON
+        lang_by_version[LanguageVersion.KOTLIN] = Language.KOTLIN
+
+        try:
+            return lang_by_version[language_version]
+        except KeyError:
+            return Language.UNKNOWN
+
     @classmethod
     def values(cls) -> List[str]:
         return [member.value for member in Language]

diff --git a/src/python/review/quality/penalty.py b/src/python/review/quality/penalty.py
@@ -58,7 +58,7 @@ def get_previous_issues_by_language(lang_to_history: Optional[str], language: La
         return []
 
     language_to_history = json.loads(lang_to_history)
-    history = language_to_history[language.value.lower()]
+    history = language_to_history.get(language.value.lower(), [])
 
     previous_issues = []
     for issue_data in history:

diff --git a/test/python/common/file_system/test_subprocess.py b/test/python/common/file_system/test_subprocess.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from test.python.common import FILE_SYSTEM_DATA_FOLDER
 from test.python.evaluation.testing_config import get_testing_arguments
+from typing import Optional
 
 import pytest
 from src.python.evaluation.evaluation_config import EvaluationConfig
@@ -15,8 +16,8 @@
 ]
 
 
-def inspect_code(config: EvaluationConfig, file: str, language: LanguageVersion) -> str:
-    command = config.build_command(file, language.value)
+def inspect_code(config: EvaluationConfig, file: str, language: LanguageVersion, history: Optional[str] = None) -> str:
+    command = config.build_command(file, language.value, history)
     return run_in_subprocess(command)
 
 

diff --git a/test/python/evaluation/testing_config.py b/test/python/evaluation/testing_config.py
@@ -5,16 +5,20 @@
 from src.python.review.reviewers.perform_review import OutputFormat
 
 
-def get_testing_arguments(to_add_traceback=None, to_add_tool_path=None) -> Namespace:
+def get_testing_arguments(to_add_traceback=None, to_add_tool_path=None, to_add_history=None) -> Namespace:
     testing_arguments = Namespace(format=OutputFormat.JSON.value,
                                   output_file_name=EvaluationArgument.RESULT_FILE_NAME_XLSX.value,
-                                  output_folder_path=None)
+                                  output_folder_path=None,
+                                  with_history=False)
     if to_add_traceback:
         testing_arguments.traceback = True
 
     if to_add_tool_path:
         testing_arguments.tool_path = MAIN_FOLDER.parent / 'review/run_tool.py'
 
+    if to_add_history:
+        testing_arguments.with_history = True
+
     testing_arguments.solutions_file_path = None
 
     return testing_arguments
diff --git a/whitelist.txt b/whitelist.txt
@@ -159,4 +159,4 @@ Measurer
 ndarray
 Runtime
 matcher
-pathlib
+pathlib
-Original file line number
+Diff line change
@@ Expand Up / @@ -159,4 +159,4 @@ Measurer @@
     ndarray
     Runtime
     matcher
-    pathlib
+    pathlib