@@ -15,7 +15,7 @@
from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path, write_df_to_file
from src.python.evaluation.common.util import ColumnName
from src.python.evaluation.evaluation_run_tool import get_language_version
-from src.python.evaluation.statistics.common.raw_issue_encoder_decoder import RawIssueEncoder
+from src.python.evaluation.issues_statistics.common.raw_issue_encoder_decoder import RawIssueEncoder
from src.python.review.common.file_system import (
    create_file,
    Extension,
7 changes: 7 additions & 0 deletions src/python/evaluation/paper_evaluation/README.md
@@ -0,0 +1,7 @@
# Paper evaluation

This module contains scripts for the SIGCSE-2022 paper evaluation:

- [Comparison with other tools](./comparison_with_other_tools/README.md)
- Formatting issues importance
- [Dynamics of student usage](./user_dynamics/README.md)
@@ -0,0 +1,58 @@
# Comparison with other tools evaluation

This module allows gathering statistics about the use of several code quality tools.
In our work, we compare the Hyperstyle tool with the [Tutor](https://www.hkeuning.nl/rpt/) tool.
The other tools (FrenchPress, WebTA, and AutoStyle) are not open source.

To gather the statistics we use student solutions to six programming tasks,
but the main script can gather these statistics for any tasks.

The tasks from our dataset:
- **countEven**. The `countEven` method returns the number of even integers in the values-array.
- **sumValues**. The `sumValues` method adds up all numbers from the values-array,
or only the positive numbers if the `positivesOnly` boolean parameter is set
to `true`.
- **oddSum**. The method `oddSum` returns the sum of all numbers at an odd index
in the array parameter, until the number -1 is seen at an odd index (see the sketch after this list).
- **calculateScore**. The `calculateScore` method calculates the score for a train trip.
The highest score is 10. The score is based on the number of changes and the day of
the week (Monday is 1, Sunday is 7).
- **hasDoubled**. Write a program that calculates in how many years your savings
will have doubled with the given interest.
- **haveThree**. Given an array of ints, return true if the value 3 appears in the
array exactly 3 times, and no 3's are next to each other.
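
To make the task semantics concrete, here is a minimal Python sketch of the `oddSum` behavior described above (the dataset solutions themselves are written in Java, so this is only an illustration):

```python
def odd_sum(values):
    """Sum the numbers at odd indices until -1 is seen at an odd index."""
    total = 0
    for i in range(1, len(values), 2):  # odd indices: 1, 3, 5, ...
        if values[i] == -1:
            break
        total += values[i]
    return total


print(odd_sum([1, 2, 3, 4, 5, -1]))  # 2 + 4 = 6; stops at -1 (index 5)
```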

The dataset has several columns:
- Student id (`student_id`);
- Task key (`task_key`);
- Code fragment (`solution`);
- Tutor error, if it exists (`tutor_error`);
- Tutor issues keys (`tutor_issues`);
- Hyperstyle issues keys (`hyperstyle_issues`);
- Hyperstyle INFO issues keys (`hyperstyle_info_issues`);
- Code style issues count (`code_style_issues_count`).

The dataset is stored in the `csv` format.
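
For reference, the expected header of such a `csv` file, assembled from the columns listed above:

```text
student_id,task_key,solution,tutor_error,tutor_issues,hyperstyle_issues,hyperstyle_info_issues,code_style_issues_count
```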

## Usage

Run [statistics_gathering.py](statistics_gathering.py) with the arguments from the command line.

Required arguments:

`solutions_file_path` — path to the csv-file with code samples.

The statistics will be printed to the terminal. They include:
- Unique users count;
- Code snippets count;
- Task statistics: for each task, the number of code snippets and the number of snippets with Tutor errors;
- The number of code fragments with Tutor errors;
- The number of unique errors found by Tutor;
- Error statistics: for each error, its text and frequency;
- Issues statistics:
    - The total number of unique issues;
    - Common issues statistics: the frequency of each issue found by both Hyperstyle and Tutor;
    - Tutor unique issues statistics: the frequency of each issue found by Tutor but not by Hyperstyle;
    - Hyperstyle unique issues statistics: the frequency of each issue found by Hyperstyle but not by Tutor;
- The number of code style issues and the number of fragments with these issues.
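
The common issues between the two tools are counted as a multiset intersection: a repeated issue is matched occurrence by occurrence. A minimal Python sketch of this idea (the issue keys here are hypothetical):

```python
from collections import Counter

tutor_issues = ['Indentation', 'Indentation', 'MagicNumber']
hyperstyle_issues = ['Indentation', 'MagicNumber', 'LineLength']

# The multiset intersection keeps the minimum count of each issue.
common = list((Counter(tutor_issues) & Counter(hyperstyle_issues)).elements())
print(common)  # ['Indentation', 'MagicNumber']
```

Issues left over after removing the common ones are counted as unique to Tutor or to Hyperstyle, respectively.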

@@ -0,0 +1,58 @@
import argparse
import logging
import sys
from pathlib import Path

from src.python.common.tool_arguments import RunToolArgument
from src.python.evaluation.common.pandas_util import get_solutions_df
from src.python.evaluation.paper_evaluation.comparison_with_other_tools.tutor_statistics import (
    IssuesStatistics, TutorStatistics,
)
from src.python.evaluation.paper_evaluation.comparison_with_other_tools.util import ComparisonColumnName
from src.python.review.common.file_system import Extension, get_restricted_extension

sys.path.append('')
sys.path.append('../../..')

logger = logging.getLogger(__name__)


def configure_arguments(parser: argparse.ArgumentParser) -> None:
    parser.add_argument(RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name,
                        type=lambda value: Path(value).absolute(),
                        help='Local CSV-file path with feedback from different tools. '
                             'Your file must include the columns: '
                             f'"{ComparisonColumnName.STUDENT_ID.value}", '
                             f'"{ComparisonColumnName.TASK_KEY.value}", '
                             f'"{ComparisonColumnName.SOLUTION.value}", and '
                             f'"{ComparisonColumnName.TUTOR_ERROR.value}".')


def main() -> int:
    parser = argparse.ArgumentParser()
    configure_arguments(parser)

    try:
        args = parser.parse_args()
        solutions_file_path = args.solutions_file_path
        extension = get_restricted_extension(solutions_file_path, [Extension.CSV])
        solutions_df = get_solutions_df(extension, solutions_file_path)
        # Gather and print the Tutor task and error statistics.
        tutor_stat = TutorStatistics(solutions_df, to_drop_duplicates=True)
        tutor_stat.print_tasks_stat()
        tutor_stat.print_error_stat()
        print('ISSUES STAT:')
        # Gather and print the common and tool-unique issue statistics.
        issue_stat = IssuesStatistics(solutions_df)
        issue_stat.print_issues_stat()
        return 0

    except FileNotFoundError:
        logger.error('The CSV-file with the specified name does not exist.')
        return 2

    except Exception:
        logger.exception('An unexpected error occurred.')
        return 2


if __name__ == '__main__':
    sys.exit(main())
@@ -0,0 +1,127 @@
from collections import Counter, defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List

import pandas as pd
from src.python.evaluation.common.pandas_util import filter_df_by_single_value
from src.python.evaluation.paper_evaluation.comparison_with_other_tools.util import (
    ComparisonColumnName, ERROR_CONST, TutorTask,
)


def sort_freq_dict(freq_dict: Dict[Any, int]) -> Dict[Any, int]:
    """Sort a frequency dictionary by frequency, in descending order."""
    return dict(sorted(freq_dict.items(), key=lambda item: item[1], reverse=True))


@dataclass
class TutorStatistics:
    unique_users: int
    task_to_freq: Dict[TutorTask, int]
    task_to_error_freq: Dict[TutorTask, int]
    error_to_freq: Dict[str, int]
    fragments_with_error: int = 0

    __separator: str = '----------'

    def __init__(self, solutions_df: pd.DataFrame, to_drop_duplicates: bool = False):
        if to_drop_duplicates:
            solutions_df = solutions_df.drop_duplicates(ComparisonColumnName.SOLUTION.value)
        self.unique_users = len(solutions_df[ComparisonColumnName.STUDENT_ID.value].unique())
        self.task_to_freq = defaultdict(int)
        self.task_to_error_freq = defaultdict(int)
        self.error_to_freq = defaultdict(int)
        for task in TutorTask:
            task_df = filter_df_by_single_value(solutions_df, ComparisonColumnName.TASK_KEY.value, task.value)
            self.task_to_freq[task] = task_df.shape[0]
            # Each non-empty cell stores a semicolon-separated list of Tutor errors.
            errors_list = list(map(lambda e_l: e_l.split(';'),
                                   task_df[ComparisonColumnName.TUTOR_ERROR.value].dropna().values))
            for cell_errors in errors_list:
                for error in cell_errors:
                    self.error_to_freq[error] += 1
                self.task_to_error_freq[task] += 1
                self.fragments_with_error += 1
        self.task_to_freq = sort_freq_dict(self.task_to_freq)
        self.error_to_freq = sort_freq_dict(self.error_to_freq)

    def print_tasks_stat(self) -> None:
        print(f'Unique users count: {self.unique_users}')
        print(f'Code snippets count: {sum(self.task_to_freq.values())}')
        print('Tasks statistics:')
        for task, freq in self.task_to_freq.items():
            print(f'Task {task.value}: {freq} items; {self.task_to_error_freq[task]} with tutor errors')
        print(self.__separator)

    def print_error_stat(self) -> None:
        print(f'{self.fragments_with_error} code fragments have errors when run by Tutor')
        print(f'{len(self.error_to_freq.keys())} unique errors were found by Tutor')
        print('Error statistics:')
        for error, freq in self.error_to_freq.items():
            print(f'{error}: {freq} items')
        print(self.__separator)


@dataclass
class IssuesStatistics:
    common_issue_to_freq: Dict[str, int]
    tutor_uniq_issue_to_freq: Dict[str, int]
    hyperstyle_uniq_issue_to_freq: Dict[str, int]

    code_style_issues_count: int
    fragments_count_with_code_style_issues: int

    __separator: str = '----------'

    # TODO: info and code style issues
    def __init__(self, solutions_df: pd.DataFrame, to_drop_duplicates: bool = False):
        if to_drop_duplicates:
            solutions_df = solutions_df.drop_duplicates(ComparisonColumnName.SOLUTION.value)
        self.common_issue_to_freq = defaultdict(int)
        self.tutor_uniq_issue_to_freq = defaultdict(int)
        self.hyperstyle_uniq_issue_to_freq = defaultdict(int)
        # Fill the three frequency dictionaries row by row.
        solutions_df.apply(lambda row: self.__init_solution_df_row(row), axis=1)
        self.common_issue_to_freq = sort_freq_dict(self.common_issue_to_freq)
        self.tutor_uniq_issue_to_freq = sort_freq_dict(self.tutor_uniq_issue_to_freq)
        self.hyperstyle_uniq_issue_to_freq = sort_freq_dict(self.hyperstyle_uniq_issue_to_freq)
        self.code_style_issues_count = sum(solutions_df[ComparisonColumnName.CODE_STYLE_ISSUES_COUNT.value])
        self.fragments_count_with_code_style_issues = len(list(
            filter(lambda x: x != 0, solutions_df[ComparisonColumnName.CODE_STYLE_ISSUES_COUNT.value])))

    @staticmethod
    def __parse_issues(issues_str: str) -> List[str]:
        if pd.isna(issues_str) or issues_str == ERROR_CONST:
            return []
        return issues_str.split(';')

    @staticmethod
    def __add_issues(issues_dict: Dict[str, int], issues: List[str]) -> None:
        for issue in issues:
            issues_dict[issue] += 1

    def __init_solution_df_row(self, row: pd.Series) -> None:
        tutor_issues = self.__parse_issues(row[ComparisonColumnName.TUTOR_ISSUES.value])
        hyperstyle_issues = self.__parse_issues(row[ComparisonColumnName.HYPERSTYLE_ISSUES.value])
        # The multiset intersection matches repeated issues occurrence by occurrence.
        common_issues = list((Counter(tutor_issues) & Counter(hyperstyle_issues)).elements())
        self.__add_issues(self.common_issue_to_freq, common_issues)
        self.__add_issues(self.tutor_uniq_issue_to_freq, list(set(tutor_issues) - set(common_issues)))
        self.__add_issues(self.hyperstyle_uniq_issue_to_freq, list(set(hyperstyle_issues) - set(common_issues)))

    def __print_freq_issues_stat(self, freq_stat: Dict[str, int], prefix: str) -> None:
        print(f'{prefix} issues statistics:')
        for issue, freq in freq_stat.items():
            print(f'{issue} was found {freq} times')
        print(self.__separator)

    def print_issues_stat(self) -> None:
        uniq_issues = (len(self.common_issue_to_freq)
                       + len(self.tutor_uniq_issue_to_freq)
                       + len(self.hyperstyle_uniq_issue_to_freq)
                       )
        print(f'{uniq_issues} unique issues were found in total')
        print(self.__separator)
        self.__print_freq_issues_stat(self.common_issue_to_freq, 'Common')
        self.__print_freq_issues_stat(self.tutor_uniq_issue_to_freq, 'Tutor unique')
        self.__print_freq_issues_stat(self.hyperstyle_uniq_issue_to_freq, 'Hyperstyle unique')
        print(f'{self.code_style_issues_count} code style issues (spaces, different brackets, indentations)'
              f' were found in total by Hyperstyle in {self.fragments_count_with_code_style_issues} fragments')
        print(self.__separator)
@@ -0,0 +1,27 @@
from enum import Enum, unique


@unique
class ComparisonColumnName(Enum):
    STUDENT_ID = 'student_id'
    TASK_KEY = 'task_key'
    SOLUTION = 'solution'
    TUTOR_ERROR = 'tutor_error'

    TUTOR_ISSUES = 'tutor_issues'
    HYPERSTYLE_ISSUES = 'hyperstyle_issues'
    HYPERSTYLE_INFO_ISSUES = 'hyperstyle_info_issues'
    CODE_STYLE_ISSUES_COUNT = 'code_style_issues_count'


ERROR_CONST = 'ERROR'


@unique
class TutorTask(Enum):
    EVEN = 'countEven'
    SUM_VALUES = 'sumValues'
    ODD_SUM = 'oddSum'
    SCORE = 'calculateScore'
    HAS_DOUBLED = 'hasDoubled'
    HAVE_THREE = 'haveThree'
29 changes: 29 additions & 0 deletions src/python/evaluation/paper_evaluation/user_dynamics/README.md
@@ -0,0 +1,29 @@
# Dynamics of student usage

This module allows gathering statistics about the dynamics of students' improvement in code quality issues.

## Usage

Run [dynamics_gathering.py](dynamics_gathering.py) with the arguments from the command line.

Required arguments:

`solutions_file_path` — path to the csv-file with code samples.

Optional arguments:

Argument | Description
--- | ---
**‑fb**, **‑‑freq-boundary** | The boundary of the solutions count for one student to analyze. The default value is 100.
**‑n**, **‑‑n** | The top n popular issues in solutions. The default value is 100.

As a result, a file with the students' issues dynamics will be created.
Also, the top issues for all students will be printed to the terminal. These statistics contain the key of each issue and its frequency over all students.

An example of issues dynamics:
```text
user,traceback
0,"0,0,0,0,0,0,0,0,0,0,0,1,0,0,3,0,0,0,0,2,0,4,0,6,3,0,3,0,0,0,1,1,0,0,0,1,0,0,0,2,0,0,0,0,0,0,4,0,0,0,1,6,0,1,0,1,3,0,0,1,1,0,0,0,0,0,3,6,1,0,0,0,0,0,0,0,4,1,0,0,1,0,8,0,2,8,0,0,0,0,1,1,1,1,3,7,23,0,9"
1,"0,0,0,3,0,0,2,1,0,0,0,0,4,1,0,0,1,1,0,0,0,0,0,6,0,1,1,0,8,1,2,1,1,0,0,1,0,4,10,1,1,1,3,0,1,0,0,0,1,0,0,0,0,0,0,2,0,3,0,0,2,2,3,2,0,0,0,1,0,1,1,0,0,1,0,4,6,2,0,0,1,0,0,0,0,2,0,0,0,2,1,2,1,0,1,7,1,0,1,1,0,1,0"
```
Each number in the traceback column is the count of issues in one solution.
The counts are sorted by solution timestamp.
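
A minimal sketch of reading this file back into per-solution issue counts (assuming pandas is available; the file name `dynamics.csv` is hypothetical):

```python
import pandas as pd

dynamics_df = pd.read_csv('dynamics.csv')

# Turn each quoted traceback string into a list of integer issue counts.
dynamics_df['traceback'] = dynamics_df['traceback'].apply(
    lambda traceback: [int(count) for count in traceback.split(',')],
)

print(dynamics_df.loc[0, 'traceback'][:5])  # first five solutions of user 0
```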