hyperskill · nbirillo · Aug 2, 2021 · Jul 20, 2021 · Jul 26, 2021 · Jul 26, 2021
diff --git a/src/python/evaluation/issues_statistics/README.md b/src/python/evaluation/issues_statistics/README.md
@@ -27,3 +27,33 @@ Run the [get_raw_issues.py](get_raw_issues.py) with the arguments from command l
 | **&#8209;&#8209;to&#8209;save&#8209;path** | Allows to save the path to the file where the issue was found. By default, the path is not saved. |
 | **&#8209;o**, **&#8209;&#8209;output** | Path where the dataset with raw issues will be saved. If not specified, the dataset will be saved next to the original one. |
 | **&#8209;l**, **&#8209;&#8209;log-output** | Path where logs will be stored. If not specified, then logs will be output to stderr. |
+
+## Get raw issues statistics
+The script takes the dataframe obtained after executing [get_raw_issues.py](get_raw_issues.py) and outputs dataframes with statistics grouped by language.
+
+The input dataset must have 4 obligatory columns:
+- `id`
+- `code`
+- `lang`
+- `raw_issues`
+
+Possible values for column `lang` are: `python3`, `kotlin`, `javascript`, `java7`, `java8`, `java9`, `java11`, `java15`.
+
+The output files are new `xlsx` or `csv` files, each of which contains the `value` column and the columns holding the statistics of each category.
+
+The `value` column shows the metric value (for measurable issue categories), quantity (for quantitative issue categories) or `ratio * 100` (for `CODE_STYLE` and `LINE_LEN`), where `ratio` is calculated as in the corresponding rules (`CodeStyleRule` and `LineLengthRule`). 
+
+The table cells indicate how often each value occurs in one fragment (for quantitative categories) or in all fragments (for measurable categories).
+
+All output datasets are arranged in folders according to language.
+
+### Usage
+Run the [get_raw_issues_statistics.py](get_raw_issues_statistics.py) with the arguments from command line.
+
+**Required arguments:**
+- `solutions_with_raw_issues` — path to an xlsx- or csv-file with code samples and raw issues, which were received with [get_raw_issues.py](get_raw_issues.py).
+
+**Optional arguments:**
+| Argument | Description |
+|----------|-------------|
+| **&#8209;o**, **&#8209;&#8209;output** | Path to the folder where datasets with statistics will be saved. If not specified, the datasets will be saved in the folder next to the original dataset. |
diff --git a/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py b/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py
@@ -0,0 +1,227 @@
+import argparse
+import json
+import logging
+import sys
+from collections import Counter
+from json import JSONDecodeError
+from pathlib import Path
+from typing import Dict, List, Optional
+
+sys.path.append('')
+sys.path.append('../../..')
+
+import pandas as pd
+from pandarallel import pandarallel
+from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path, write_df_to_file
+from src.python.evaluation.common.util import ColumnName
+from src.python.evaluation.issues_statistics.common.raw_issue_encoder_decoder import RawIssueDecoder
+from src.python.evaluation.issues_statistics.get_raw_issues import RAW_ISSUES
+from src.python.review.application_config import LanguageVersion
+from src.python.review.common.file_system import Extension, get_parent_folder, get_total_code_lines_from_code
+from src.python.review.common.language import Language
+from src.python.review.inspectors.issue import BaseIssue, ISSUE_TYPE_TO_CLASS, IssueType, Measurable
+from src.python.review.quality.rules.code_style_scoring import CodeStyleRule
+from src.python.review.quality.rules.line_len_scoring import LineLengthRule
+from src.python.review.reviewers.utils.code_statistics import get_code_style_lines
+
+ID = ColumnName.ID.value
+LANG = ColumnName.LANG.value
+CODE = ColumnName.CODE.value
+
+CODE_STYLE_LINES = f'{IssueType.CODE_STYLE.value}_lines'
+CODE_STYLE_RATIO = f'{IssueType.CODE_STYLE.value}_ratio'
+LINE_LEN_NUMBER = f'{IssueType.LINE_LEN.value}_number'
+LINE_LEN_RATIO = f'{IssueType.LINE_LEN.value}_ratio'
+TOTAL_LINES = 'total_lines'
+VALUE = 'value'
+
+OUTPUT_DF_NAME = 'stats'
+DEFAULT_OUTPUT_FOLDER_NAME = 'raw_issues_statistics'
+
+logger = logging.getLogger(__name__)
+
+
+def configure_arguments(parser: argparse.ArgumentParser) -> None:
+    parser.add_argument(
+        'solutions_with_raw_issues',
+        type=lambda value: Path(value).absolute(),
+        help=f'Local XLSX-file or CSV-file path. Your file must include column-names: '
+             f'"{ID}", "{CODE}", "{LANG}", and "{RAW_ISSUES}".',
+    )
+
+    parser.add_argument(
+        '-o', '--output',
+        type=lambda value: Path(value).absolute(),
+        help='Path to the folder where datasets with statistics will be saved. '
+             'If not specified, the datasets will be saved in the folder next to the original one.',
+    )
+
+    parser.add_argument(
+        '-l', '--log-output',
+        type=lambda value: Path(value).absolute(),
+        help='Path where logs will be stored. If not specified, then logs will be output to stderr.',
+    )
+
+
+def _convert_language_code_to_language(fragment_id: str, language_code: str) -> str:
+    language_version = LanguageVersion.from_value(language_code)
+
+    if language_version is None:
+        logger.warning(f'{fragment_id}: it was not possible to determine the language version from "{language_code}".')
+        return language_code
+
+    language = Language.from_language_version(language_version)
+
+    if language == Language.UNKNOWN:
+        logger.warning(f'{fragment_id}: it was not possible to determine the language from "{language_version}".')
+        return language_code
+
+    return language.value
+
+
+def _extract_stats_from_issues(row: pd.Series) -> pd.Series:
+    print(f'{row[ID]}: extracting stats.')
+
+    if pd.isnull(row[CODE]):
+        logger.warning(f'{row[ID]}: no code.')
+        row[CODE] = ""
+
+    if pd.isnull(row[LANG]):
+        logger.warning(f'{row[ID]}: no lang.')
+        row[LANG] = ""
+
+    try:
+        issues: List[BaseIssue] = json.loads(row[RAW_ISSUES], cls=RawIssueDecoder)
+    except (JSONDecodeError, TypeError):
+        logger.warning(f'{row[ID]}: failed to decode issues.')
+        issues: List[BaseIssue] = []
+
+    counter = Counter([issue.type for issue in issues])
+
+    for issue_type, issue_class in ISSUE_TYPE_TO_CLASS.items():
+        if issubclass(issue_class, Measurable):
+            row[issue_type.value] = [issue.measure() for issue in issues if isinstance(issue, issue_class)]
+        else:
+            row[issue_type.value] = counter[issue_type]
+
+    row[CODE_STYLE_LINES] = get_code_style_lines(issues)
+    row[LINE_LEN_NUMBER] = counter[IssueType.LINE_LEN]
+    row[TOTAL_LINES] = get_total_code_lines_from_code(row[CODE])
+
+    row[LANG] = _convert_language_code_to_language(row[ID], row[LANG])
+
+    print(f'{row[ID]}: extraction of statistics is complete.')
+
+    return row
+
+
+def _convert_ratio_to_int(ratio: float) -> int:
+    """Round the ratio to 2 decimal places and return it as an integer percentage (0.29 -> 29)."""
+    # The scaled value is rounded, not truncated, because of binary float error:
+    # 0.29 * 100 == 28.999999999999996, which int() alone would turn into 28.
+    return int(round(round(ratio, 2) * 100))
+
+
+def _group_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, pd.DataFrame]:
+    logger.info('The grouping of statistics by language has started.')
+
+    result = {}
+
+    df_grouped_by_lang = df_with_stats.groupby(LANG)
+    for lang in df_grouped_by_lang.groups:
+        logger.info(f'"{lang}" statistics grouping started.')
+
+        lang_group = df_grouped_by_lang.get_group(lang)
+
+        columns_with_stats = []
+
+        for issue_type, issue_class in ISSUE_TYPE_TO_CLASS.items():
+            column = lang_group[issue_type.value]
+            if issubclass(issue_class, Measurable):
+                column = column.explode()
+            columns_with_stats.append(column.value_counts())
+
+        columns_with_stats.append(lang_group[TOTAL_LINES].value_counts())
+
+        line_len_ratio_column = lang_group.apply(
+            lambda row: LineLengthRule.get_ratio(row[LINE_LEN_NUMBER], row[TOTAL_LINES]),
+            axis=1,
+        )
+        line_len_ratio_column = line_len_ratio_column.apply(_convert_ratio_to_int)
+        line_len_ratio_column.name = LINE_LEN_RATIO
+        columns_with_stats.append(line_len_ratio_column.value_counts())
+
+        code_style_ratio_column = lang_group.apply(
+            lambda row: CodeStyleRule.get_ratio(
+                row[CODE_STYLE_LINES], row[TOTAL_LINES], Language.from_value(str(lang), default=Language.UNKNOWN),
+            ),
+            axis=1,
+        )
+        code_style_ratio_column = code_style_ratio_column.apply(_convert_ratio_to_int)
+        code_style_ratio_column.name = CODE_STYLE_RATIO
+        columns_with_stats.append(code_style_ratio_column.value_counts())
+
+        stats = pd.concat(columns_with_stats, axis=1).fillna(0).astype(int)
+
+        # Put values in a separate column
+        stats.index.name = VALUE
+        stats.reset_index(inplace=True)
+
+        result[str(lang)] = stats
+        logger.info(f'"{lang}" statistics grouping finished.')
+
+    logger.info('The grouping of statistics by language has finished.')
+
+    return result
+
+
+def inspect_raw_issues(solutions_with_raw_issues: pd.DataFrame) -> Dict[str, pd.DataFrame]:
+    pandarallel.initialize()
+
+    solutions_with_raw_issues = solutions_with_raw_issues.parallel_apply(_extract_stats_from_issues, axis=1)
+
+    return _group_stats_by_lang(solutions_with_raw_issues)
+
+
+def _get_output_folder(solutions_file_path: Path, output_folder: Optional[Path]) -> Path:
+    if output_folder is not None:
+        return output_folder  # an explicitly passed folder is used as is
+    # Fallback: DEFAULT_OUTPUT_FOLDER_NAME under whatever get_parent_folder returns for the input file
+    return get_parent_folder(solutions_file_path) / DEFAULT_OUTPUT_FOLDER_NAME
+
+
+def _save_stats(stats_by_lang: Dict[str, pd.DataFrame], solutions_file_path: Path, output_path: Optional[Path]) -> None:
+    output_folder = _get_output_folder(solutions_file_path, output_path)
+    output_extension = Extension.get_extension_from_file(str(solutions_file_path))
+
+    logger.info(f'Saving statistics to a folder: {output_folder}.')
+
+    for lang, stats in stats_by_lang.items():
+        lang_folder = output_folder / lang
+        lang_folder.mkdir(parents=True, exist_ok=True)
+        write_df_to_file(stats, lang_folder / f'{OUTPUT_DF_NAME}{output_extension.value}', output_extension)
+
+    logger.info('Saving statistics is complete.')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    configure_arguments(parser)
+    args = parser.parse_args()
+
+    if args.log_output is not None:
+        args.log_output.parent.mkdir(parents=True, exist_ok=True)
+
+    logging.basicConfig(
+        filename=args.log_output, filemode="w", level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s',
+    )
+
+    solutions_with_raw_issues = get_solutions_df_by_file_path(args.solutions_with_raw_issues)
+
+    logger.info("Dataset inspection started.")
+
+    stats_by_lang = inspect_raw_issues(solutions_with_raw_issues)
+
+    logger.info("Dataset inspection finished.")
+
+    _save_stats(stats_by_lang, args.solutions_with_raw_issues, args.output)
diff --git a/src/python/review/application_config.py b/src/python/review/application_config.py
@@ -58,3 +58,10 @@ def is_java(self) -> bool:
             or self == LanguageVersion.JAVA_11
             or self == LanguageVersion.JAVA_15
         )
+
+    @classmethod
+    def from_value(cls, value: str, default=None):
+        try:
+            return LanguageVersion(value)
+        except ValueError:
+            return default
diff --git a/src/python/review/common/file_system.py b/src/python/review/common/file_system.py
@@ -233,3 +233,22 @@ def copy_directory(source: Union[str, Path], destination: Union[str, Path], dirs
 
 def copy_file(source: Union[str, Path], destination: Union[str, Path]):
     shutil.copy(source, destination)
+
+
+# Before using it, check that there are no line breaks in the string
+def __is_line_empty(line: str) -> bool:
+    return len(line.strip()) == 0
+
+
+def __is_comment(line: str) -> bool:
+    return line.strip().startswith(('#', '//'))
+
+
+def get_total_code_lines_from_file(path: Path) -> int:
+    code = get_content_from_file(path, to_strip_nl=False)
+    return get_total_code_lines_from_code(code)
+
+
+def get_total_code_lines_from_code(code: str) -> int:
+    # Count the lines that are neither blank nor start with a '#' or '//' comment marker
+    return sum(not (__is_line_empty(line) or __is_comment(line)) for line in code.splitlines())
diff --git a/src/python/review/common/language.py b/src/python/review/common/language.py
@@ -33,6 +33,13 @@ def from_language_version(language_version: LanguageVersion) -> 'Language':
     def values(cls) -> List[str]:
         return [member.value for member in Language]
 
+    @classmethod
+    def from_value(cls, value: str, default=None):
+        try:
+            return Language(value)
+        except ValueError:
+            return default
+
 
 EXTENSION_TO_LANGUAGE = {
     Extension.JAVA: Language.JAVA,

diff --git a/src/python/review/quality/rules/code_style_scoring.py b/src/python/review/quality/rules/code_style_scoring.py
@@ -70,7 +70,8 @@ def apply(self, n_code_style_lines, n_code_style, total_lines):
         self.n_code_style_lines = n_code_style_lines
         self.n_code_style = n_code_style
 
-        self.get_ratio(n_code_style_lines, n_code_style, total_lines)
+        self.update_quality(n_code_style_lines, n_code_style)
+        self.ratio = self.get_ratio(n_code_style_lines, total_lines, self.config.language)
 
         if self.ratio > self.config.n_code_style_bad:
             self.save_quality(QualityType.BAD)
@@ -84,17 +85,22 @@ def apply(self, n_code_style_lines, n_code_style, total_lines):
         if n_code_style_lines > self.config.n_code_style_lines_bad:
             self.quality_type = QualityType.BAD
 
-    def get_ratio(self, n_code_style_lines, n_code_style, total_lines):
+    @staticmethod
+    def get_ratio(n_code_style_lines: int, total_lines: int, language: Language) -> float:
+        if language == Language.PYTHON:
+            return n_code_style_lines / max(1, total_lines)
+        else:
+            return n_code_style_lines / max(1, total_lines - 4)
+
+    def update_quality(self, n_code_style_lines: int, n_code_style: int):
         if self.config.language == Language.PYTHON:
             if n_code_style == 1:
                 self.save_quality(QualityType.MODERATE)
-            self.ratio = n_code_style_lines / max(1, total_lines)
         else:
             if n_code_style_lines == 1:
                 self.save_quality(QualityType.GOOD)
             elif n_code_style_lines == 2:
                 self.save_quality(QualityType.MODERATE)
-            self.ratio = n_code_style_lines / max(1, total_lines - 4)
 
     def __get_next_quality_type(self) -> QualityType:
         if self.quality_type == QualityType.BAD:

diff --git a/src/python/review/quality/rules/line_len_scoring.py b/src/python/review/quality/rules/line_len_scoring.py
@@ -31,7 +31,7 @@ def __init__(self, config: LineLengthRuleConfig):
 
     # TODO: refactor
     def apply(self, n_line_len, n_lines):
-        self.ratio = n_line_len / max(n_lines, 1)
+        self.ratio = self.get_ratio(n_line_len, n_lines)
         self.n_line_len = n_line_len
         self.n_lines = n_lines
 
@@ -60,3 +60,7 @@ def merge(self, other: 'LineLengthRule') -> 'LineLengthRule':
         result_rule.apply(self.n_line_len + other.n_line_len, self.n_lines + other.n_lines)
 
         return result_rule
+
+    @staticmethod
+    def get_ratio(n_line_len: int, n_lines: int) -> float:
+        return n_line_len / max(n_lines, 1)
diff --git a/src/python/review/reviewers/utils/code_statistics.py b/src/python/review/reviewers/utils/code_statistics.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 from typing import Dict, List
 
-from src.python.review.common.file_system import get_content_from_file
+from src.python.review.common.file_system import get_total_code_lines_from_file
 from src.python.review.inspectors.issue import BaseIssue, IssueType
 
 
@@ -53,19 +53,6 @@ def issue_type_to_statistics_dict(self) -> Dict[IssueType, int]:
         }
 
 
-def __get_total_lines(path: Path) -> int:
-    lines = get_content_from_file(path, to_strip_nl=False).splitlines()
-    return len(list(filter(lambda line: not __is_empty(line) and not __is_comment(line), lines)))
-
-
-def __is_empty(line: str) -> bool:
-    return len(line.strip()) == 0
-
-
-def __is_comment(line: str) -> bool:
-    return line.strip().startswith(('#', '//'))
-
-
 def get_code_style_lines(issues: List[BaseIssue]) -> int:
     code_style_issues = filter(lambda issue: issue.type == IssueType.CODE_STYLE, issues)
     line_counter = Counter([issue.line_no for issue in code_style_issues])
@@ -111,6 +98,6 @@ def gather_code_statistics(issues: List[BaseIssue], path: Path) -> CodeStatistic
         coupling=couplings,
         weighted_method_complexities=weighted_method_complexities,
         method_number=method_numbers,
-        total_lines=__get_total_lines(path),
+        total_lines=get_total_code_lines_from_file(path),
         code_style_lines=get_code_style_lines(issues),
     )