3 changes: 3 additions & 0 deletions src/python/common/tool_arguments.py
@@ -86,3 +86,6 @@ class RunToolArgument(Enum):
f'"{ColumnName.LANG.value}" column are: '
f'{LanguageVersion.PYTHON_3.value}, {LanguageVersion.JAVA_8.value}, '
f'{LanguageVersion.JAVA_11.value}, {LanguageVersion.KOTLIN.value}.')

DIFFS_FILE_PATH = ArgumentsInfo(None, 'diffs_file_path',
'Path to a file with serialized diffs that were found by diffs_between_df.py')
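
For illustration, a minimal sketch of how the new argument can be registered with `argparse`, mirroring the `long_name`/`description` fields of `ArgumentsInfo` used by the other `RunToolArgument` entries (the parser wiring below is illustrative, not part of this change):

```python
import argparse
from pathlib import Path

from src.python.common.tool_arguments import RunToolArgument

parser = argparse.ArgumentParser()
# Illustrative wiring only: mirrors how other RunToolArgument entries are consumed.
parser.add_argument(RunToolArgument.DIFFS_FILE_PATH.value.long_name,
                    type=lambda value: Path(value).absolute(),
                    help=RunToolArgument.DIFFS_FILE_PATH.value.description)
```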
20 changes: 18 additions & 2 deletions src/python/evaluation/common/pandas_util.py
@@ -1,14 +1,17 @@
import json
import logging
from pathlib import Path
from typing import Set, Union
from typing import Any, List, Set, Union

import numpy as np
import pandas as pd
from src.python.evaluation.common.csv_util import write_dataframe_to_csv
from src.python.evaluation.common.util import ColumnName
from src.python.evaluation.common.util import ColumnName, EvaluationArgument
from src.python.evaluation.common.xlsx_util import create_workbook, remove_sheet, write_dataframe_to_xlsx_sheet
from src.python.review.application_config import LanguageVersion
from src.python.review.common.file_system import Extension, get_restricted_extension
from src.python.review.inspectors.issue import BaseIssue
from src.python.review.reviewers.utils.print_review import convert_json_to_issues

logger = logging.getLogger(__name__)

@@ -18,6 +21,10 @@ def filter_df_by_language(df: pd.DataFrame, languages: Set[LanguageVersion],
return df.loc[df[column].isin(set(map(lambda l: l.value, languages)))]


def filter_df_by_condition(df: pd.DataFrame, column: str, value: Any) -> pd.DataFrame:
return df.loc[df[column] == value]


def drop_duplicates(df: pd.DataFrame, column: str = ColumnName.CODE.value) -> pd.DataFrame:
return df.drop_duplicates(column, keep='last')

@@ -85,3 +92,12 @@ def write_df_to_file(df: pd.DataFrame, output_file_path: Path, extension: Extens
write_dataframe_to_xlsx_sheet(output_file_path, df, 'inspection_results')
# remove empty sheet that was initially created with the workbook
remove_sheet(output_file_path, 'Sheet')


def get_issues_from_json(str_json: str) -> List[BaseIssue]:
parsed_json = json.loads(str_json)['issues']
return convert_json_to_issues(parsed_json)


def get_issues_by_row(df: pd.DataFrame, row: int) -> List[BaseIssue]:
return get_issues_from_json(df.iloc[row][EvaluationArgument.TRACEBACK.value])
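
A short usage sketch of the new helpers; the input file name is illustrative, and any dataframe with `is_public` and `traceback` columns works:

```python
from pathlib import Path

from src.python.evaluation.common.pandas_util import (
    filter_df_by_condition, get_issues_by_row, get_solutions_df_by_file_path,
)

# 'solutions.csv' is an assumed example file, not one shipped with the repo.
df = get_solutions_df_by_file_path(Path('solutions.csv'))
public_df = filter_df_by_condition(df, 'is_public', True)
issues = get_issues_by_row(public_df, 0)  # issues parsed from the first row's traceback JSON
```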
1 change: 1 addition & 0 deletions src/python/evaluation/common/util.py
@@ -15,6 +15,7 @@ class ColumnName(Enum):
ROW = 'row'
OLD = 'old'
NEW = 'new'
IS_PUBLIC = 'is_public'


@unique
98 changes: 97 additions & 1 deletion src/python/evaluation/inspectors/README.md
@@ -11,6 +11,10 @@ This module contains _preprocessing_ stage and _analysing_ stage.
`Analysing` stage includes:
- [diffs_between_df.py](diffs_between_df.py) allows finding the difference between
old and new grades and collecting issues that were found in the new data
- [print_inspectors_statistics.py](print_inspectors_statistics.py) allows printing statistics
that were calculated by [diffs_between_df.py](diffs_between_df.py)
- [get_worse_public_examples.py](get_worse_public_examples.py) allows getting the
top N worst public examples from a dataset. The measure is the number of unique new inspections.

___

@@ -136,4 +140,96 @@ An example of the `pickle` file is:
}
```
The `grade` field stores the ids of fragments whose grade increased in the new data.
The `traceback` field stores, for each fragment id, the set of issues that were found in the new data and were not found in the old data.
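
The serialized diffs can be inspected directly; a minimal sketch, assuming the standard `pickle` module is what `serialize_data_and_write_to_file` uses for the `.pickle` extension:

```python
import pickle

with open('diffs.pickle', 'rb') as diffs_file:
    diffs = pickle.load(diffs_file)

increased_grades = diffs['grade']      # ids of fragments whose grade increased
new_issues_by_id = diffs['traceback']  # fragment id -> issues found only in the new data
```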

___

### Print statistics

[print_inspectors_statistics.py](print_inspectors_statistics.py) allows printing statistics
that were calculated by [diffs_between_df.py](diffs_between_df.py)

#### Usage

Run [print_inspectors_statistics.py](print_inspectors_statistics.py) with the arguments from the command line.

Required arguments:

- `diffs_file_path` — path to a `pickle` file that was produced by [diffs_between_df.py](diffs_between_df.py).

Optional arguments:

Argument | Description
--- | ---
**‑‑categorize** | If True, the statistics will be grouped into categories. Disabled by default.
**‑n**, **‑‑top_n** | The top N issues will be printed. The default value is 10.
**‑‑full_stat** | If True, the full statistics (with all issues) will be printed. Disabled by default.

The statistics will be printed to the console.

The output contains:
- whether any incorrect grades were found;
- how many fragments have additional issues;
- how many unique issues were found;
- the top N issues in the format: (issue_key, frequency);
- short categorized statistics: for each category, how many issues were found and how many
fragments have these issues;
- \[Optional\] full categorized statistics: for each category and each issue, how many
fragments have this issue.

An example of the printed statistics (without full categorized statistics):

```text
SUCCESS! Was not found incorrect grades.
______
39830 fragments has additional issues
139 unique issues was found
______
Top 10 issues:
SC200: 64435 times
WPS432: 17477 times
WPS221: 10618 times
WPS336: 4965 times
H601: 3826 times
SC100: 2719 times
WPS319: 2655 times
WPS317: 2575 times
WPS515: 1783 times
WPS503: 1611 times
______
CODE_STYLE: 28 issues, 26171 fragments
BEST_PRACTICES: 76 issues, 88040 fragments
ERROR_PRONE: 17 issues, 2363 fragments
COMPLEXITY: 17 issues, 13928 fragments
COHESION: 1 issues, 3826 fragments
______
```

___

### Get worse public examples

[get_worse_public_examples.py](get_worse_public_examples.py) allows getting the
top N worst public examples from a dataset. The measure is the number of unique new inspections.

#### Usage

Run [get_worse_public_examples.py](get_worse_public_examples.py) with the arguments from the command line.

Required arguments:

- `solutions_file_path` — path to an xlsx or csv file with graded code samples;
- `diffs_file_path` — path to a `pickle` file that was produced by [diffs_between_df.py](diffs_between_df.py).

Please note that your `solutions_file_path` file with code fragments should contain at least the following obligatory columns:

- `code`,
- `traceback`,
- `is_public`,
- `id`.

Optional arguments:

Argument | Description
--- | ---
**‑n**, **‑‑n** | The N worst fragments will be saved.

The resulting file will be stored in the same folder as the `solutions_file_path` input file.
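
The measure can be reproduced from the diffs file directly; a minimal sketch, assuming the `pickle` structure described above (the helper function and fragment id below are hypothetical):

```python
import pickle

from src.python.evaluation.common.util import EvaluationArgument


def count_unique_new_inspections(diffs: dict, fragment_id: int) -> int:
    # Hypothetical helper: counts unique new issue classes for one fragment.
    new_issues = diffs[EvaluationArgument.TRACEBACK.value].get(fragment_id, [])
    return len({issue.origin_class for issue in new_issues})


with open('diffs.pickle', 'rb') as diffs_file:
    diffs = pickle.load(diffs_file)
print(count_unique_new_inspections(diffs, fragment_id=42))
```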
56 changes: 56 additions & 0 deletions src/python/evaluation/inspectors/common/statistics.py
@@ -0,0 +1,56 @@
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Tuple

from src.python.review.inspectors.issue import IssueType, ShortIssue


@dataclass(frozen=True)
class IssuesStatistics:
stat: Dict[ShortIssue, int]
changed_grades_count: int

def print_full_statistics(self, to_categorize: bool = True) -> None:
if to_categorize:
categorized_statistics: Dict[IssueType, Dict[ShortIssue, int]] = self.get_categorized_statistics()
for category, issues in categorized_statistics.items():
print(f'{category.value} issues:')
self.__print_stat(issues)
else:
self.__print_stat(self.stat)

@classmethod
def __print_stat(cls, stat: Dict[ShortIssue, int]) -> None:
for issue, freq in stat.items():
cls.print_issue_with_freq(issue, freq, prefix='- ')

@classmethod
def print_issue_with_freq(cls, issue: ShortIssue, freq: int, prefix: str = '', suffix: str = '') -> None:
print(f'{prefix}{issue.origin_class}: {freq} times{suffix}')

def get_categorized_statistics(self) -> Dict[IssueType, Dict[ShortIssue, int]]:
categorized_stat: Dict[IssueType, Dict[ShortIssue, int]] = defaultdict(dict)
for issue, freq in self.stat.items():
categorized_stat[issue.type][issue] = freq
return categorized_stat

# Get statistics for each IssueType: count unique issues, count fragments with these issues
def get_short_categorized_statistics(self) -> Dict[IssueType, Tuple[int, int]]:
categorized_statistics: Dict[IssueType, Dict[ShortIssue, int]] = self.get_categorized_statistics()
short_categorized_statistics: Dict[IssueType, Tuple[int, int]] = {}
for issue_type, stat in categorized_statistics.items():
unique_issues = len(stat)
fragments = sum(stat.values())
short_categorized_statistics[issue_type] = (unique_issues, fragments)
return short_categorized_statistics

def print_short_categorized_statistics(self) -> None:
short_categorized_statistics = self.get_short_categorized_statistics()
for category, stat in short_categorized_statistics.items():
print(f'{category.value}: {stat[0]} issues, {stat[1]} fragments')

def get_top_n_issues(self, n: int) -> List[Tuple[ShortIssue, int]]:
return sorted(self.stat.items(), key=lambda t: t[1], reverse=True)[:n]

def count_unique_issues(self) -> int:
return len(self.stat)
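
A short usage sketch of `IssuesStatistics`; the `ShortIssue` keyword arguments and the `IssueType` members are assumptions inferred from the attribute accesses above and the category names in the sample output:

```python
from src.python.evaluation.inspectors.common.statistics import IssuesStatistics
from src.python.review.inspectors.issue import IssueType, ShortIssue

# Assumed constructor arguments for ShortIssue, based on issue.origin_class
# and issue.type being accessed above.
stat = {
    ShortIssue(origin_class='SC200', type=IssueType.BEST_PRACTICES): 3,
    ShortIssue(origin_class='WPS432', type=IssueType.CODE_STYLE): 1,
}
statistics = IssuesStatistics(stat, changed_grades_count=0)

statistics.print_short_categorized_statistics()   # e.g. BEST_PRACTICES: 1 issues, 3 fragments
for issue, freq in statistics.get_top_n_issues(n=2):
    IssuesStatistics.print_issue_with_freq(issue, freq)
```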
15 changes: 3 additions & 12 deletions src/python/evaluation/inspectors/diffs_between_df.py
@@ -1,20 +1,16 @@
import argparse
import json
from pathlib import Path
from typing import List

import pandas as pd
from src.python.common.tool_arguments import RunToolArgument
from src.python.evaluation.common.pandas_util import (
get_inconsistent_positions, get_solutions_df, get_solutions_df_by_file_path,
get_inconsistent_positions, get_issues_by_row, get_solutions_df, get_solutions_df_by_file_path,
)
from src.python.evaluation.common.util import ColumnName, EvaluationArgument
from src.python.review.common.file_system import (
Extension, get_parent_folder, get_restricted_extension, serialize_data_and_write_to_file,
)
from src.python.review.inspectors.issue import BaseIssue
from src.python.review.quality.model import QualityType
from src.python.review.reviewers.utils.print_review import convert_json_to_issues


def configure_arguments(parser: argparse.ArgumentParser) -> None:
@@ -31,11 +27,6 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None:
f'(file contains grade and traceback (optional) columns)')


def __get_issues(df: pd.DataFrame, row: int) -> List[BaseIssue]:
parsed_json = json.loads(df.iloc[row][EvaluationArgument.TRACEBACK.value])['issues']
return convert_json_to_issues(parsed_json)


# Find difference between two dataframes. Return dict:
# {
# grade: [list_of_fragment_ids],
@@ -63,8 +54,8 @@ def find_diffs(old_df: pd.DataFrame, new_df: pd.DataFrame) -> dict:
diffs[ColumnName.GRADE.value].append(fragment_id)
else:
# Find difference between issues
old_issues = __get_issues(old_df, row)
new_issues = __get_issues(new_df, row)
old_issues = get_issues_by_row(old_df, row)
new_issues = get_issues_by_row(new_df, row)
if len(old_issues) > len(new_issues):
raise ValueError(f'New dataframe contains fewer issues than the old one for fragment {fragment_id}')
difference = set(new_issues) - set(old_issues)
70 changes: 70 additions & 0 deletions src/python/evaluation/inspectors/filter_issues.py
@@ -0,0 +1,70 @@
import argparse
from pathlib import Path
from typing import List, Set

import pandas as pd
from src.python.common.tool_arguments import RunToolArgument
from src.python.evaluation.common.pandas_util import get_issues_from_json, get_solutions_df_by_file_path
from src.python.evaluation.common.util import ColumnName, EvaluationArgument
from src.python.review.common.file_system import Extension, get_parent_folder, serialize_data_and_write_to_file
from src.python.review.inspectors.issue import BaseIssue


TRACEBACK = EvaluationArgument.TRACEBACK.value
ID = ColumnName.ID.value
GRADE = ColumnName.GRADE.value


def configure_arguments(parser: argparse.ArgumentParser) -> None:
parser.add_argument(RunToolArgument.SOLUTIONS_FILE_PATH.value.long_name,
type=lambda value: Path(value).absolute(),
help=f'{RunToolArgument.SOLUTIONS_FILE_PATH.value.description}'
f'\nAll code fragments from this file must be graded ')

parser.add_argument('-i', '--issues',
help='Comma-separated set of issue classes to filter by',
default='')


def __parse_issues_arg(str_issues: str) -> Set[str]:
    # Drop empty entries so the default '' yields an empty set rather than {''}.
    return set(filter(None, str_issues.split(',')))


def __get_new_issues(traceback: str, new_issues_classes: Set[str]) -> List[BaseIssue]:
all_issues = get_issues_from_json(traceback)
return list(filter(lambda i: i.origin_class in new_issues_classes, all_issues))


def __add_issues_for_fragment(fragment_id: int, new_issues: List[BaseIssue], diffs: dict) -> None:
if len(new_issues) > 0:
diffs[TRACEBACK][fragment_id] = new_issues


# Make a dict with the same structure as in the find_diffs function from diffs_between_df.py
def get_statistics_dict(solutions_df: pd.DataFrame, new_issues_classes: Set[str]) -> dict:
diffs = {
GRADE: [],
TRACEBACK: {},
}
solutions_df.apply(lambda row: __add_issues_for_fragment(row[ID],
__get_new_issues(row[TRACEBACK], new_issues_classes),
diffs), axis=1)
return diffs


def main() -> None:
parser = argparse.ArgumentParser()
configure_arguments(parser)
args = parser.parse_args()

solutions_file_path = args.solutions_file_path
solutions_df = get_solutions_df_by_file_path(solutions_file_path)
issues = __parse_issues_arg(args.issues)

diffs = get_statistics_dict(solutions_df, issues)
output_path = get_parent_folder(Path(solutions_file_path)) / f'diffs{Extension.PICKLE.value}'
serialize_data_and_write_to_file(output_path, diffs)


if __name__ == '__main__':
main()
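
A minimal usage sketch of `get_statistics_dict`; the file name and issue classes are illustrative, and any graded file with `id` and `traceback` columns works:

```python
from pathlib import Path

from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path
from src.python.evaluation.inspectors.filter_issues import get_statistics_dict

# 'solutions.csv' is an assumed example file, not one shipped with the repo.
solutions_df = get_solutions_df_by_file_path(Path('solutions.csv'))
diffs = get_statistics_dict(solutions_df, {'SC200', 'WPS432'})
# diffs has the same shape as the find_diffs output:
# {'grade': [], 'traceback': {fragment_id: [BaseIssue, ...], ...}}
```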