From 2ef00d9a9f389caaced474b38b1b8d0ef86c02f6 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Tue, 20 Jul 2021 11:42:36 +0300 Subject: [PATCH 01/35] Fixed get_output_path test data --- test/python/evaluation/statistics/test_get_raw_issues.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/python/evaluation/statistics/test_get_raw_issues.py b/test/python/evaluation/statistics/test_get_raw_issues.py index c11882c2..67126322 100644 --- a/test/python/evaluation/statistics/test_get_raw_issues.py +++ b/test/python/evaluation/statistics/test_get_raw_issues.py @@ -19,6 +19,15 @@ NEW_DF_NAME = 'new_df' +ORIGINAL_DF_NAME = 'original_df' +ORIGINAL_DF_CSV = f'{ORIGINAL_DF_NAME}.csv' +ORIGINAL_DF_XLSX = f'{ORIGINAL_DF_NAME}.xlsx' + +ORIGINAL_DF_WITH_RAW_ISSUES_CSV = f'{ORIGINAL_DF_NAME}_with_raw_issues.csv' +ORIGINAL_DF_WITH_RAW_ISSUES_XLSX = f'{ORIGINAL_DF_NAME}_with_raw_issues.xlsx' + +NEW_DF_NAME = 'new_df' + GET_OUTPUT_PATH_TEST_DATA = [ (Path(ORIGINAL_DF_CSV), None, Path(ORIGINAL_DF_WITH_RAW_ISSUES_CSV)), (Path(ORIGINAL_DF_XLSX), None, Path(ORIGINAL_DF_WITH_RAW_ISSUES_XLSX)), From 461b511ec8fb8ab2d77bc6a2a3ba1ab8ac31db72 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 09:29:39 +0300 Subject: [PATCH 02/35] Moved __get_total_lines to file_system.py --- .../review/reviewers/utils/code_statistics.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/python/review/reviewers/utils/code_statistics.py b/src/python/review/reviewers/utils/code_statistics.py index 19218a7b..2d9381d9 100644 --- a/src/python/review/reviewers/utils/code_statistics.py +++ b/src/python/review/reviewers/utils/code_statistics.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Dict, List -from src.python.review.common.file_system import get_content_from_file +from src.python.review.common.file_system import get_total_code_lines_from_file from src.python.review.inspectors.issue import BaseIssue, IssueType @@ -53,19 +53,6 @@ 
def issue_type_to_statistics_dict(self) -> Dict[IssueType, int]: } -def __get_total_lines(path: Path) -> int: - lines = get_content_from_file(path, to_strip_nl=False).splitlines() - return len(list(filter(lambda line: not __is_empty(line) and not __is_comment(line), lines))) - - -def __is_empty(line: str) -> bool: - return len(line.strip()) == 0 - - -def __is_comment(line: str) -> bool: - return line.strip().startswith(('#', '//')) - - def get_code_style_lines(issues: List[BaseIssue]) -> int: code_style_issues = filter(lambda issue: issue.type == IssueType.CODE_STYLE, issues) line_counter = Counter([issue.line_no for issue in code_style_issues]) @@ -111,6 +98,6 @@ def gather_code_statistics(issues: List[BaseIssue], path: Path) -> CodeStatistic coupling=couplings, weighted_method_complexities=weighted_method_complexities, method_number=method_numbers, - total_lines=__get_total_lines(path), + total_lines=get_total_code_lines_from_file(path), code_style_lines=get_code_style_lines(issues), ) From bec0701a817f3a9ae25fe49a3ececff66008884e Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 09:29:59 +0300 Subject: [PATCH 03/35] Added get_total_code_lines_from_file and get_total_code_lines_from_code --- src/python/review/common/file_system.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/python/review/common/file_system.py b/src/python/review/common/file_system.py index bbdeeda6..9bbf5123 100644 --- a/src/python/review/common/file_system.py +++ b/src/python/review/common/file_system.py @@ -233,3 +233,21 @@ def copy_directory(source: Union[str, Path], destination: Union[str, Path], dirs def copy_file(source: Union[str, Path], destination: Union[str, Path]): shutil.copy(source, destination) + + +def __is_line_empty(line: str) -> bool: + return len(line.strip()) == 0 + + +def __is_comment(line: str) -> bool: + return line.strip().startswith(('#', '//')) + + +def get_total_code_lines_from_file(path: Path) -> int: + code = 
get_content_from_file(path, to_strip_nl=False) + return get_total_code_lines_from_code(code) + + +def get_total_code_lines_from_code(code: str) -> int: + lines = code.splitlines() + return len(list(filter(lambda line: not __is_line_empty(line) and not __is_comment(line), lines))) From e5085cbccd19f9f42757b777440a8a1735b9ae59 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 09:34:36 +0300 Subject: [PATCH 04/35] Added get_raw_issues_statistics --- .../statistics/get_raw_issues_statistics.py | 186 ++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 src/python/evaluation/statistics/get_raw_issues_statistics.py diff --git a/src/python/evaluation/statistics/get_raw_issues_statistics.py b/src/python/evaluation/statistics/get_raw_issues_statistics.py new file mode 100644 index 00000000..4bb1ddf7 --- /dev/null +++ b/src/python/evaluation/statistics/get_raw_issues_statistics.py @@ -0,0 +1,186 @@ +import argparse +import json +import sys +from collections import Counter +from json import JSONDecodeError +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +sys.path.append('') +sys.path.append('../../..') + +import pandas as pd +from pandarallel import pandarallel +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path, write_df_to_file +from src.python.evaluation.common.util import ColumnName +from src.python.evaluation.evaluation_run_tool import get_language_version +from src.python.evaluation.statistics.common.raw_issue_encoder_decoder import RawIssueDecoder +from src.python.evaluation.statistics.get_raw_issues import RAW_ISSUES +from src.python.review.common.file_system import Extension, get_parent_folder, get_total_code_lines_from_code +from src.python.review.common.language import Language +from src.python.review.inspectors.issue import BaseIssue, ISSUE_TYPE_TO_CLASS, IssueType, Measurable +from src.python.review.reviewers.utils.code_statistics import get_code_style_lines + +ID = 
ColumnName.ID.value +LANG = ColumnName.LANG.value +CODE = ColumnName.CODE.value + +CODE_STYLE_LINES = f'{IssueType.CODE_STYLE.value}_lines' +LINE_LEN_NUMBER = f'{IssueType.LINE_LEN.value}_number' +TOTAL_LINES = 'total_lines' + +DEFAULT_OUTPUT_FOLDER_NAME = 'raw_issues_statistics' +MAIN_STATS_DF_NAME = 'main_stats' +OTHER_STATS_DF_NAME = 'other_stats' + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + 'solutions_with_raw_issues', + type=lambda value: Path(value).absolute(), + help=f'Local XLSX-file or CSV-file path. Your file must include column-names: ' + f'"{CODE}", "{LANG}", and "{RAW_ISSUES}"', + ) + + parser.add_argument( + '-o', + '--output', + type=lambda value: Path(value).absolute(), + help='Path where datasets with statistics will be saved. ' + 'If not specified, datasets will be saved next to the original one.', + ) + + +def _convert_language_code_to_language(language_code: str) -> str: + try: + language_version = get_language_version(language_code) + except KeyError: + return language_code + + language = Language.from_language_version(language_version) + + if language == Language.UNKNOWN: + return language_code + + return language.value + + +def _extract_stats_from_issues(row: pd.Series) -> pd.Series: + try: + issues: List[BaseIssue] = json.loads(row[RAW_ISSUES], cls=RawIssueDecoder) + except JSONDecodeError: + issues: List[BaseIssue] = [] + + counter = Counter([issue.type for issue in issues]) + + for issue_type, issue_class in ISSUE_TYPE_TO_CLASS.items(): + if issubclass(issue_class, Measurable): + row[issue_type.value] = [issue.measure() for issue in issues if isinstance(issue, issue_class)] + else: + row[issue_type.value] = counter[issue_type] + + row[CODE_STYLE_LINES] = get_code_style_lines(issues) + row[LINE_LEN_NUMBER] = counter[IssueType.LINE_LEN] + row[TOTAL_LINES] = get_total_code_lines_from_code(row[CODE]) + + row[LANG] = _convert_language_code_to_language(row[LANG]) + + return row + + +def 
_is_python(language_code: str) -> bool: + try: + return Language(language_code) == Language.PYTHON + except ValueError: + return False + + +def _get_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, Tuple[pd.DataFrame, pd.DataFrame]]: + result = {} + + df_grouped_by_lang = df_with_stats.groupby(LANG) + for lang in df_grouped_by_lang.groups: + lang_group = df_grouped_by_lang.get_group(lang) + + columns_with_stats = [] + + # ---- main stats ---- + for issue_type, issue_class in ISSUE_TYPE_TO_CLASS.items(): + column = lang_group[issue_type.value] + if issubclass(issue_class, Measurable): + column = column.explode() + columns_with_stats.append(column.value_counts()) + + columns_with_stats.append(lang_group[TOTAL_LINES].value_counts()) + + main_stats = pd.concat(columns_with_stats, axis=1).fillna(0) + + min_value, max_value = main_stats.index.min(), main_stats.index.max() + main_stats = main_stats.reindex(range(min_value, max_value + 1), fill_value=0).astype(int) + + columns_with_stats.clear() + + # ---- other stats ---- + line_len_stats_column = lang_group[LINE_LEN_NUMBER] / lang_group[TOTAL_LINES].apply(lambda elem: max(1, elem)) + line_len_stats_column.name = IssueType.LINE_LEN.value + columns_with_stats.append(line_len_stats_column) + + if _is_python(str(lang)): + code_style_stats_column = lang_group[CODE_STYLE_LINES] / lang_group[TOTAL_LINES].apply( + lambda total_lines: max(1, total_lines), + ) + else: + code_style_stats_column = lang_group[CODE_STYLE_LINES] / lang_group[TOTAL_LINES].apply( + lambda total_lines: max(1, total_lines - 4), + ) + + code_style_stats_column.name = IssueType.CODE_STYLE.value + columns_with_stats.append(code_style_stats_column) + + other_stats = pd.concat(columns_with_stats, axis=1) + + result[str(lang)] = (main_stats, other_stats) + + return result + + +def inspect_solutions(solutions_with_raw_issues: pd.DataFrame) -> Dict[str, Tuple[pd.DataFrame, pd.DataFrame]]: + pandarallel.initialize() + + solutions_with_raw_issues = 
solutions_with_raw_issues.parallel_apply(_extract_stats_from_issues, axis=1) + + return _get_stats_by_lang(solutions_with_raw_issues) + + +def _get_output_folder(solutions_file_path: Path, output_folder: Optional[Path]): + if output_folder is not None: + return output_folder + + return get_parent_folder(solutions_file_path) / DEFAULT_OUTPUT_FOLDER_NAME + + +def _save_stats( + stats_by_lang: Dict[str, Tuple[pd.DataFrame, pd.DataFrame]], + solutions_file_path: Path, + output_path: Optional[Path], +) -> None: + output_folder = _get_output_folder(solutions_file_path, output_path) + output_extension = Extension.get_extension_from_file(str(solutions_file_path)) + + for lang, (main_stats, other_stats) in stats_by_lang.items(): + lang_folder = output_folder / lang + lang_folder.mkdir(parents=True, exist_ok=True) + write_df_to_file(main_stats, lang_folder / f'{MAIN_STATS_DF_NAME}{output_extension.value}', output_extension) + write_df_to_file(other_stats, lang_folder / f'{OTHER_STATS_DF_NAME}{output_extension.value}', output_extension) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + solutions_with_raw_issues = get_solutions_df_by_file_path(args.solutions_with_raw_issues) + + stats_by_lang = inspect_solutions(solutions_with_raw_issues) + + _save_stats(stats_by_lang, args.solutions_with_raw_issues, args.output) From 71caba1929c2eced38e59e91f29d827ddc6c4ff6 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 12:35:51 +0300 Subject: [PATCH 05/35] Renamed main_stats -> freq_stats and other_stats -> ratio_stats --- .../statistics/get_raw_issues_statistics.py | 44 +++++++++++-------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/python/evaluation/statistics/get_raw_issues_statistics.py b/src/python/evaluation/statistics/get_raw_issues_statistics.py index 4bb1ddf7..fbf689e4 100644 --- a/src/python/evaluation/statistics/get_raw_issues_statistics.py +++ 
b/src/python/evaluation/statistics/get_raw_issues_statistics.py @@ -28,6 +28,7 @@ CODE_STYLE_LINES = f'{IssueType.CODE_STYLE.value}_lines' LINE_LEN_NUMBER = f'{IssueType.LINE_LEN.value}_number' TOTAL_LINES = 'total_lines' +VALUE = 'value' DEFAULT_OUTPUT_FOLDER_NAME = 'raw_issues_statistics' MAIN_STATS_DF_NAME = 'main_stats' @@ -39,12 +40,11 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: 'solutions_with_raw_issues', type=lambda value: Path(value).absolute(), help=f'Local XLSX-file or CSV-file path. Your file must include column-names: ' - f'"{CODE}", "{LANG}", and "{RAW_ISSUES}"', + f'"{CODE}", "{LANG}", and "{RAW_ISSUES}"', ) parser.add_argument( - '-o', - '--output', + '-o', '--output', type=lambda value: Path(value).absolute(), help='Path where datasets with statistics will be saved. ' 'If not specified, datasets will be saved next to the original one.', @@ -68,7 +68,7 @@ def _convert_language_code_to_language(language_code: str) -> str: def _extract_stats_from_issues(row: pd.Series) -> pd.Series: try: issues: List[BaseIssue] = json.loads(row[RAW_ISSUES], cls=RawIssueDecoder) - except JSONDecodeError: + except (JSONDecodeError, TypeError): issues: List[BaseIssue] = [] counter = Counter([issue.type for issue in issues]) @@ -104,7 +104,7 @@ def _get_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, Tuple[pd.DataFr columns_with_stats = [] - # ---- main stats ---- + # ---- Frequency statistics ---- for issue_type, issue_class in ISSUE_TYPE_TO_CLASS.items(): column = lang_group[issue_type.value] if issubclass(issue_class, Measurable): @@ -113,33 +113,41 @@ def _get_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, Tuple[pd.DataFr columns_with_stats.append(lang_group[TOTAL_LINES].value_counts()) - main_stats = pd.concat(columns_with_stats, axis=1).fillna(0) + freq_stats = pd.concat(columns_with_stats, axis=1).fillna(0) - min_value, max_value = main_stats.index.min(), main_stats.index.max() - main_stats = 
main_stats.reindex(range(min_value, max_value + 1), fill_value=0).astype(int) + # Fill in the intermediate values that are not occurred with zeros + min_value, max_value = freq_stats.index.min(), freq_stats.index.max() + freq_stats = freq_stats.reindex(range(min_value, max_value + 1), fill_value=0).astype(int) + + # Put the values in a separate column + freq_stats.index.name = VALUE + freq_stats.reset_index(inplace=True) columns_with_stats.clear() - # ---- other stats ---- - line_len_stats_column = lang_group[LINE_LEN_NUMBER] / lang_group[TOTAL_LINES].apply(lambda elem: max(1, elem)) - line_len_stats_column.name = IssueType.LINE_LEN.value - columns_with_stats.append(line_len_stats_column) + # ---- Ratio statistics ---- + + # Calculate line len ratio according to LineLengthRule + line_len_ratio_column = lang_group[LINE_LEN_NUMBER] / lang_group[TOTAL_LINES].apply(lambda elem: max(1, elem)) + line_len_ratio_column.name = IssueType.LINE_LEN.value + columns_with_stats.append(line_len_ratio_column) + # Calculate code style ratio according to CodeStyleRule if _is_python(str(lang)): - code_style_stats_column = lang_group[CODE_STYLE_LINES] / lang_group[TOTAL_LINES].apply( + code_style_ratio_column = lang_group[CODE_STYLE_LINES] / lang_group[TOTAL_LINES].apply( lambda total_lines: max(1, total_lines), ) else: - code_style_stats_column = lang_group[CODE_STYLE_LINES] / lang_group[TOTAL_LINES].apply( + code_style_ratio_column = lang_group[CODE_STYLE_LINES] / lang_group[TOTAL_LINES].apply( lambda total_lines: max(1, total_lines - 4), ) - code_style_stats_column.name = IssueType.CODE_STYLE.value - columns_with_stats.append(code_style_stats_column) + code_style_ratio_column.name = IssueType.CODE_STYLE.value + columns_with_stats.append(code_style_ratio_column) - other_stats = pd.concat(columns_with_stats, axis=1) + ratio_stats = pd.concat(columns_with_stats, axis=1) - result[str(lang)] = (main_stats, other_stats) + result[str(lang)] = (freq_stats, ratio_stats) return result From 
223c1d12e6c25e3d25fd6def3eabd138cc454bb2 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 13:48:57 +0300 Subject: [PATCH 06/35] Small fix --- src/python/evaluation/statistics/get_raw_issues_statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/evaluation/statistics/get_raw_issues_statistics.py b/src/python/evaluation/statistics/get_raw_issues_statistics.py index fbf689e4..615eb5dd 100644 --- a/src/python/evaluation/statistics/get_raw_issues_statistics.py +++ b/src/python/evaluation/statistics/get_raw_issues_statistics.py @@ -130,7 +130,7 @@ def _get_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, Tuple[pd.DataFr # Calculate line len ratio according to LineLengthRule line_len_ratio_column = lang_group[LINE_LEN_NUMBER] / lang_group[TOTAL_LINES].apply(lambda elem: max(1, elem)) line_len_ratio_column.name = IssueType.LINE_LEN.value - columns_with_stats.append(line_len_ratio_column) + columns_with_stats.append(round(line_len_ratio_column, 2)) # Calculate code style ratio according to CodeStyleRule if _is_python(str(lang)): @@ -143,7 +143,7 @@ def _get_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, Tuple[pd.DataFr ) code_style_ratio_column.name = IssueType.CODE_STYLE.value - columns_with_stats.append(code_style_ratio_column) + columns_with_stats.append(round(code_style_ratio_column, 2)) ratio_stats = pd.concat(columns_with_stats, axis=1) From 207e49d282524bd325a06f7ce309d386ee799338 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 13:49:08 +0300 Subject: [PATCH 07/35] Added new data folders --- test/python/evaluation/statistics/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/python/evaluation/statistics/__init__.py b/test/python/evaluation/statistics/__init__.py index 08bac33a..d0a4efd1 100644 --- a/test/python/evaluation/statistics/__init__.py +++ b/test/python/evaluation/statistics/__init__.py @@ -7,3 +7,9 @@ GET_RAW_ISSUES_TEST_FILES_FOLDER = 
GET_RAW_ISSUES_DATA_FOLDER / 'test_files' GET_RAW_ISSUES_TARGET_FILES_FOLDER = GET_RAW_ISSUES_DATA_FOLDER / 'target_files' + +GET_RAW_ISSUES_STATISTICS_DATA_FOLDER = STATISTICS_TEST_DATA_FOLDER / 'get_raw_issues_statistics' + +GET_RAW_ISSUES_STATISTICS_TEST_FILES_FOLDER = GET_RAW_ISSUES_STATISTICS_DATA_FOLDER / 'test_files' + +GET_RAW_ISSUES_STATISTICS_TARGET_FILES_FOLDER = GET_RAW_ISSUES_STATISTICS_DATA_FOLDER / 'target_files' From df4bef25f81d064297458eecc5fedcbed8c6942d Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 13:50:25 +0300 Subject: [PATCH 08/35] Added tests --- .../test_get_raw_issues_statistics.py | 103 ++++++++++++++++++ .../target_df_single_lang_freq.csv | 36 ++++++ .../target_df_single_lang_ratio.csv | 4 + .../target_df_with_empty_raw_issues_freq.csv | 5 + .../target_df_with_empty_raw_issues_ratio.csv | 2 + ...target_df_with_incorrect_language_freq.csv | 21 ++++ ...arget_df_with_incorrect_language_ratio.csv | 2 + .../target_df_with_null_raw_issues_freq.csv | 3 + .../target_df_with_null_raw_issues_ratio.csv | 2 + .../test_df_with_empty_raw_issues.csv | 4 + .../test_df_with_incorrect_language.csv | 23 ++++ .../test_df_with_null_raw_issues.csv | 2 + .../test_files/test_single_lang_df.csv | 68 ++++++++++++ 13 files changed, 275 insertions(+) create mode 100644 test/python/evaluation/statistics/test_get_raw_issues_statistics.py create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_freq.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_ratio.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_freq.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_ratio.csv create mode 100644 
test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_freq.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_ratio.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_freq.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_ratio.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_empty_raw_issues.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_incorrect_language.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_null_raw_issues.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_single_lang_df.csv diff --git a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py new file mode 100644 index 00000000..cbb6854e --- /dev/null +++ b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py @@ -0,0 +1,103 @@ +from pathlib import Path +from test.python.common_util import equal_df +from test.python.evaluation.statistics import ( + GET_RAW_ISSUES_STATISTICS_TARGET_FILES_FOLDER, + GET_RAW_ISSUES_STATISTICS_TEST_FILES_FOLDER, +) +from typing import Optional + +import pandas as pd +import pytest +from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path +from src.python.evaluation.statistics.get_raw_issues_statistics import ( + _convert_language_code_to_language, + _get_output_folder, + DEFAULT_OUTPUT_FOLDER_NAME, + inspect_solutions, +) +from src.python.review.common.language import Language + +DF_PARENT_FOLDER_NAME = 'parent_folder' +DF_NAME = 
'input_df' +DF_PATH = Path(DF_PARENT_FOLDER_NAME) / DF_NAME +DEFAULT_OUTPUT_PATH = Path(DF_PARENT_FOLDER_NAME) / DEFAULT_OUTPUT_FOLDER_NAME + +NEW_FOLDER = 'new_folder' + +GET_OUTPUT_FOLDER_PATH_TEST_DATA = [ + (DF_PATH, None, DEFAULT_OUTPUT_PATH), + (DF_PATH, Path(NEW_FOLDER), Path(NEW_FOLDER)), +] + + +@pytest.mark.parametrize( + ('solutions_file_path', 'output_folder', 'expected_output_folder'), + GET_OUTPUT_FOLDER_PATH_TEST_DATA, +) +def test_get_output_folder(solutions_file_path: Path, output_folder: Optional[Path], expected_output_folder: Path): + actual_output_folder = _get_output_folder(solutions_file_path, output_folder) + assert actual_output_folder == expected_output_folder + + +CONVERT_LANGUAGE_CODE_TO_LANGUAGE_TEST_DATA = [ + ('java7', 'JAVA'), + ('java8', 'JAVA'), + ('java9', 'JAVA'), + ('java11', 'JAVA'), + ('java15', 'JAVA'), + ('python3', 'PYTHON'), + ('kotlin', 'KOTLIN'), + ('javascript', 'JAVASCRIPT'), + ('some_weird_lang', 'some_weird_lang'), +] + + +@pytest.mark.parametrize(('language_code', 'expected_language'), CONVERT_LANGUAGE_CODE_TO_LANGUAGE_TEST_DATA) +def test_convert_language_code_to_language(language_code: str, expected_language: str): + actual_language = _convert_language_code_to_language(language_code) + assert actual_language == expected_language + + +INSPECT_SOLUTIONS_TEST_DATA = [ + ( + 'test_df_with_null_raw_issues.csv', + 'target_df_with_null_raw_issues_freq.csv', + 'target_df_with_null_raw_issues_ratio.csv', + Language.PYTHON.value, + ), + ( + 'test_df_with_empty_raw_issues.csv', + 'target_df_with_empty_raw_issues_freq.csv', + 'target_df_with_empty_raw_issues_ratio.csv', + Language.KOTLIN.value, + ), + ( + 'test_df_with_incorrect_language.csv', + 'target_df_with_incorrect_language_freq.csv', + 'target_df_with_incorrect_language_ratio.csv', + 'some_weird_lang', + ), + ( + 'test_single_lang_df.csv', + 'target_df_single_lang_freq.csv', + 'target_df_single_lang_ratio.csv', + Language.JAVA.value, + ), +] + + 
+@pytest.mark.parametrize( + ('test_file', 'target_freq_file', 'target_ratio_file', 'lang'), + INSPECT_SOLUTIONS_TEST_DATA, +) +def test_inspect_solutions(test_file: str, target_freq_file: str, target_ratio_file: str, lang: str): + test_df = get_solutions_df_by_file_path(GET_RAW_ISSUES_STATISTICS_TEST_FILES_FOLDER / test_file) + stats_by_lang = inspect_solutions(test_df) + + freq_stats = pd.read_csv(GET_RAW_ISSUES_STATISTICS_TARGET_FILES_FOLDER / target_freq_file) + ratio_df = pd.read_csv(GET_RAW_ISSUES_STATISTICS_TARGET_FILES_FOLDER / target_ratio_file) + + ratio_df.to_csv("/home/ilya/Downloads/ratio_df.csv") + + assert equal_df(stats_by_lang[lang][0], freq_stats) + assert equal_df(stats_by_lang[lang][1], ratio_df) diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_freq.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_freq.csv new file mode 100644 index 00000000..d1ea83f9 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_freq.csv @@ -0,0 +1,36 @@ +value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines +0,2,2,3,3,3,0,0,0,0,0,0,0 +1,0,1,0,0,0,0,0,0,1,0,0,0 +2,0,0,0,0,0,0,0,0,0,0,0,0 +3,0,0,0,0,0,0,1,1,0,0,0,0 +4,0,0,0,0,0,0,0,0,1,0,0,0 +5,0,0,0,0,0,0,0,0,1,0,0,0 +6,0,0,0,0,0,0,0,0,0,0,0,1 +7,0,0,0,0,0,0,1,0,1,0,0,0 +8,0,0,0,0,0,0,0,0,0,0,0,0 +9,0,0,0,0,0,0,0,0,0,0,0,0 +10,0,0,0,0,0,0,0,0,0,0,0,0 +11,0,0,0,0,0,0,1,0,0,0,0,0 +12,0,0,0,0,0,0,0,0,0,0,0,0 +13,0,0,0,0,0,0,0,0,0,0,0,0 +14,0,0,0,0,0,0,1,0,0,0,0,0 +15,0,0,0,0,0,0,0,0,0,0,0,0 +16,0,0,0,0,0,0,0,0,0,0,0,0 +17,0,0,0,0,0,0,0,0,0,0,0,0 +18,0,0,0,0,0,0,0,0,0,0,0,0 +19,0,0,0,0,0,0,0,0,0,0,0,1 +20,0,0,0,0,0,0,0,0,0,0,0,0 +21,0,0,0,0,0,0,0,0,0,0,0,0 +22,0,0,0,0,0,0,0,0,0,0,0,0 +23,0,0,0,0,0,0,0,0,0,0,0,0 +24,0,0,0,0,0,0,0,0,0,0,0,0 
+25,0,0,0,0,0,0,0,0,0,0,0,0 +26,0,0,0,0,0,0,0,0,0,0,0,0 +27,0,0,0,0,0,0,0,0,0,0,0,0 +28,0,0,0,0,0,0,0,0,0,0,0,0 +29,0,0,0,0,0,0,0,0,0,0,0,0 +30,0,0,0,0,0,0,0,0,0,0,0,0 +31,0,0,0,0,0,0,0,0,0,0,0,0 +32,0,0,0,0,0,0,0,0,0,0,0,0 +33,0,0,0,0,0,0,0,0,0,0,0,1 +34,1,0,0,0,0,0,0,0,0,0,0,0 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_ratio.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_ratio.csv new file mode 100644 index 00000000..99b308b6 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_ratio.csv @@ -0,0 +1,4 @@ +LINE_LEN,CODE_STYLE +0.00,0.00 +0.00,1.03 +0.00,0.00 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_freq.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_freq.csv new file mode 100644 index 00000000..5f564e68 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_freq.csv @@ -0,0 +1,5 @@ +value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines +0,1,1,1,1,1,0,0,0,0,0,0,0 +1,0,0,0,0,0,0,0,0,0,0,0,0 +2,0,0,0,0,0,0,0,0,0,0,0,0 +3,0,0,0,0,0,0,0,0,0,0,0,1 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_ratio.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_ratio.csv new file mode 100644 index 00000000..555bb958 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_ratio.csv @@ -0,0 +1,2 @@ 
+LINE_LEN,CODE_STYLE +0.00000,0.00000 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_freq.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_freq.csv new file mode 100644 index 00000000..9c1c1018 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_freq.csv @@ -0,0 +1,21 @@ +value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines +0,1,0,1,1,1,0,0,0,0,0,0,0 +1,0,1,0,0,0,0,0,0,0,0,0,0 +2,0,0,0,0,0,0,0,0,0,0,0,0 +3,0,0,0,0,0,0,0,1,0,0,0,0 +4,0,0,0,0,0,0,0,0,0,0,0,0 +5,0,0,0,0,0,0,0,0,0,0,0,0 +6,0,0,0,0,0,0,0,0,0,0,0,0 +7,0,0,0,0,0,0,0,0,1,0,0,0 +8,0,0,0,0,0,0,0,0,0,0,0,0 +9,0,0,0,0,0,0,0,0,0,0,0,0 +10,0,0,0,0,0,0,0,0,0,0,0,0 +11,0,0,0,0,0,0,1,0,0,0,0,0 +12,0,0,0,0,0,0,0,0,0,0,0,0 +13,0,0,0,0,0,0,0,0,0,0,0,0 +14,0,0,0,0,0,0,0,0,0,0,0,0 +15,0,0,0,0,0,0,0,0,0,0,0,0 +16,0,0,0,0,0,0,0,0,0,0,0,0 +17,0,0,0,0,0,0,0,0,0,0,0,0 +18,0,0,0,0,0,0,0,0,0,0,0,0 +19,0,0,0,0,0,0,0,0,0,0,0,1 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_ratio.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_ratio.csv new file mode 100644 index 00000000..555bb958 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_ratio.csv @@ -0,0 +1,2 @@ +LINE_LEN,CODE_STYLE +0.00000,0.00000 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_freq.csv 
b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_freq.csv new file mode 100644 index 00000000..20466ef9 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_freq.csv @@ -0,0 +1,3 @@ +value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines +0,1,1,1,1,1,0,0,0,0,0,0,0 +1,0,0,0,0,0,0,0,0,0,0,0,1 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_ratio.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_ratio.csv new file mode 100644 index 00000000..555bb958 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_ratio.csv @@ -0,0 +1,2 @@ +LINE_LEN,CODE_STYLE +0.00000,0.00000 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_empty_raw_issues.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_empty_raw_issues.csv new file mode 100644 index 00000000..37e9eca2 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_empty_raw_issues.csv @@ -0,0 +1,4 @@ +id,lang,code,raw_issues +1,kotlin,"fun main() { + println(""Hello, World!"") +}",[] \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_incorrect_language.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_incorrect_language.csv new file mode 100644 index 00000000..e98834c0 --- /dev/null +++ 
b/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_incorrect_language.csv @@ -0,0 +1,23 @@ +id,lang,code,raw_issues +1,some_weird_lang,"import java.util.Scanner; +import java.util.Arrays; + +class Main { + public static void main(String[] args) { + Scanner scanner = new Scanner(System.in); + + int n = scanner.nextInt(); + String[][] star = new String[n][n]; + + for (int i = 0; i < star.length; i++) { + for (int j = 0; j < star[i].length; j++) { + star[i][j] = "". ""; + if (i == (n / 2) || j == (n / 2) || i == j || j == n - i - 1) { + star[i][j] = ""* ""; + } + System.out.print(star[i][j]); + } + System.out.println(""""); + } + } +}","[{""origin_class"": ""UnusedImportsCheck"", ""type"": ""BEST_PRACTICES"", ""description"": ""Unused import - java.util.Arrays."", ""file_path"": """", ""line_no"": 2, ""column_no"": 8, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""CyclomaticComplexityCheck"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 5, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 7}, {""origin_class"": ""JavaNCSSCheck"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 5, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 11}, {""origin_class"": ""BooleanExpressionComplexityCheck"", ""type"": ""BOOL_EXPR_LEN"", ""description"": ""Too long boolean expression. 
Try to split it into smaller expressions."", ""file_path"": """", ""line_no"": 14, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 3}]" \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_null_raw_issues.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_null_raw_issues.csv new file mode 100644 index 00000000..1dd62893 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_null_raw_issues.csv @@ -0,0 +1,2 @@ +id,lang,code,raw_issues +1,python3,"println(""Hello, World!"")", \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_single_lang_df.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_single_lang_df.csv new file mode 100644 index 00000000..b1662148 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_single_lang_df.csv @@ -0,0 +1,68 @@ +id,lang,code,raw_issues +1,java11,"import java.util.Scanner; +import java.util.Arrays; + +class Main { + public static void main(String[] args) { + Scanner scanner = new Scanner(System.in); + + int n = scanner.nextInt(); + String[][] star = new String[n][n]; + + for (int i = 0; i < star.length; i++) { + for (int j = 0; j < star[i].length; j++) { + star[i][j] = "". ""; + if (i == (n / 2) || j == (n / 2) || i == j || j == n - i - 1) { + star[i][j] = ""* ""; + } + System.out.print(star[i][j]); + } + System.out.println(""""); + } + } +}","[{""origin_class"": ""UnusedImportsCheck"", ""type"": ""BEST_PRACTICES"", ""description"": ""Unused import - java.util.Arrays."", ""file_path"": """", ""line_no"": 2, ""column_no"": 8, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""CyclomaticComplexityCheck"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. 
You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 5, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 7}, {""origin_class"": ""JavaNCSSCheck"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 5, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 11}, {""origin_class"": ""BooleanExpressionComplexityCheck"", ""type"": ""BOOL_EXPR_LEN"", ""description"": ""Too long boolean expression. Try to split it into smaller expressions."", ""file_path"": """", ""line_no"": 14, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 3}]" +2,java11,"import java.util.Scanner; +public class Main { + + public static void main(String[] args) { + Scanner scanner = new Scanner(System.in); + + int n = scanner.nextInt(); + int mas[][] = new int[n][n]; + int i; + int j; + for(i = 0; i < n; i++) { + mas[i][0] = i; + } + for (j = 0; j < n; j++) { + mas[0][j] = j; + } + for (i = 1; i < n; i++) { + for (j = 1; j < n; j++) { + mas[i][j] = mas[i][j -1] - 1; + } + } + printArray(mas, n); + } + + public static void printArray(int[][] mas, int n) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + if (mas[i][j] < 0) { + mas[i][j] *= -1; + } + System.out.print(mas[i][j] + "" ""); + } + System.out.println(); + } + } +}","[{""origin_class"": ""EmptyLineSeparatorCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'CLASS_DEF' should be separated from previous line."", ""file_path"": """", ""line_no"": 2, ""column_no"": 1, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""CyclomaticComplexityCheck"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. 
You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 4, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 5}, {""origin_class"": ""JavaNCSSCheck"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 4, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 14}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def' child has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 5, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def' child has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 7, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def' child has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 8, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""ArrayTypeStyleCheck"", ""type"": ""CODE_STYLE"", ""description"": ""Array brackets at illegal position."", ""file_path"": """", ""line_no"": 8, ""column_no"": 24, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""ArrayTypeStyleCheck"", ""type"": ""CODE_STYLE"", ""description"": ""Array brackets at illegal position."", ""file_path"": """", ""line_no"": 8, ""column_no"": 26, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def' child has incorrect indentation level 16, expected level should 
be 8."", ""file_path"": """", ""line_no"": 9, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def' child has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 10, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 11, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""WhitespaceAfterCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' is not followed by whitespace."", ""file_path"": """", ""line_no"": 11, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' child has incorrect indentation level 24, expected level should be 12."", ""file_path"": """", ""line_no"": 12, ""column_no"": 25, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for rcurly' has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 13, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 14, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' child has incorrect indentation level 24, expected level should be 12."", ""file_path"": """", ""line_no"": 15, ""column_no"": 25, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for rcurly' has incorrect 
indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 16, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 17, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' has incorrect indentation level 24, expected level should be 12."", ""file_path"": """", ""line_no"": 18, ""column_no"": 25, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' child has incorrect indentation level 32, expected level should be 16."", ""file_path"": """", ""line_no"": 19, ""column_no"": 33, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""WhitespaceAroundCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'-' is not followed by whitespace."", ""file_path"": """", ""line_no"": 19, ""column_no"": 54, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for rcurly' has incorrect indentation level 24, expected level should be 12."", ""file_path"": """", ""line_no"": 20, ""column_no"": 25, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for rcurly' has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 21, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def' child has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 22, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", 
""description"": ""'method def rcurly' has incorrect indentation level 8, expected level should be 4."", ""file_path"": """", ""line_no"": 23, ""column_no"": 9, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def modifier' has incorrect indentation level 8, expected level should be 4."", ""file_path"": """", ""line_no"": 25, ""column_no"": 9, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""CyclomaticComplexityCheck"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 25, ""column_no"": 9, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 4}, {""origin_class"": ""JavaNCSSCheck"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. 
It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 25, ""column_no"": 9, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 7}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' has incorrect indentation level 12, expected level should be 8."", ""file_path"": """", ""line_no"": 26, ""column_no"": 13, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' has incorrect indentation level 20, expected level should be 12."", ""file_path"": """", ""line_no"": 27, ""column_no"": 21, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'if' has incorrect indentation level 28, expected level should be 16."", ""file_path"": """", ""line_no"": 28, ""column_no"": 29, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'if' child has incorrect indentation level 36, expected level should be 20."", ""file_path"": """", ""line_no"": 29, ""column_no"": 37, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'if rcurly' has incorrect indentation level 32, expected level should be 16."", ""file_path"": """", ""line_no"": 30, ""column_no"": 33, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' child has incorrect indentation level 32, expected level should be 16."", ""file_path"": """", ""line_no"": 31, ""column_no"": 33, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for rcurly' has incorrect indentation level 24, expected level should be 12."", ""file_path"": """", ""line_no"": 32, ""column_no"": 25, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": 
""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' child has incorrect indentation level 20, expected level should be 12."", ""file_path"": """", ""line_no"": 33, ""column_no"": 21, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for rcurly' has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 34, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def rcurly' has incorrect indentation level 8, expected level should be 4."", ""file_path"": """", ""line_no"": 35, ""column_no"": 9, ""inspector_type"": ""CHECKSTYLE""}]" +3,java11,"public class Main { + public static void main(String[] args) { + + int variable = 123456; // Change this line + + System.out.println(variable); + } +} +","[{""origin_class"": ""CyclomaticComplexityCheck"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 2, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 1}, {""origin_class"": ""JavaNCSSCheck"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. 
It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 2, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 3}]" \ No newline at end of file From 63104d86b0b637964ce20cacebc31c08fa906e6f Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 14:09:39 +0300 Subject: [PATCH 09/35] Removed duplicates --- test/python/evaluation/statistics/test_get_raw_issues.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/test/python/evaluation/statistics/test_get_raw_issues.py b/test/python/evaluation/statistics/test_get_raw_issues.py index 67126322..c11882c2 100644 --- a/test/python/evaluation/statistics/test_get_raw_issues.py +++ b/test/python/evaluation/statistics/test_get_raw_issues.py @@ -19,15 +19,6 @@ NEW_DF_NAME = 'new_df' -ORIGINAL_DF_NAME = 'original_df' -ORIGINAL_DF_CSV = f'{ORIGINAL_DF_NAME}.csv' -ORIGINAL_DF_XLSX = f'{ORIGINAL_DF_NAME}.xlsx' - -ORIGINAL_DF_WITH_RAW_ISSUES_CSV = f'{ORIGINAL_DF_NAME}_with_raw_issues.csv' -ORIGINAL_DF_WITH_RAW_ISSUES_XLSX = f'{ORIGINAL_DF_NAME}_with_raw_issues.xlsx' - -NEW_DF_NAME = 'new_df' - GET_OUTPUT_PATH_TEST_DATA = [ (Path(ORIGINAL_DF_CSV), None, Path(ORIGINAL_DF_WITH_RAW_ISSUES_CSV)), (Path(ORIGINAL_DF_XLSX), None, Path(ORIGINAL_DF_WITH_RAW_ISSUES_XLSX)), From 3fb8b5e8a55b6c95c10a0b02243780467b4ae80e Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 14:20:25 +0300 Subject: [PATCH 10/35] Removed unnecessary line --- .../evaluation/statistics/test_get_raw_issues_statistics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py index cbb6854e..715b8f2a 100644 --- a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py +++ b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py @@ -97,7 +97,5 @@ def test_inspect_solutions(test_file: str, target_freq_file: str, target_ratio_f 
freq_stats = pd.read_csv(GET_RAW_ISSUES_STATISTICS_TARGET_FILES_FOLDER / target_freq_file) ratio_df = pd.read_csv(GET_RAW_ISSUES_STATISTICS_TARGET_FILES_FOLDER / target_ratio_file) - ratio_df.to_csv("/home/ilya/Downloads/ratio_df.csv") - assert equal_df(stats_by_lang[lang][0], freq_stats) assert equal_df(stats_by_lang[lang][1], ratio_df) From 2fcfedebec6da0256f4d36d2521475e4e3e15544 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 15:56:19 +0300 Subject: [PATCH 11/35] Now the script returns only one dataframe --- .../statistics/get_raw_issues_statistics.py | 55 +++++++------------ 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/src/python/evaluation/statistics/get_raw_issues_statistics.py b/src/python/evaluation/statistics/get_raw_issues_statistics.py index 615eb5dd..c6574a1a 100644 --- a/src/python/evaluation/statistics/get_raw_issues_statistics.py +++ b/src/python/evaluation/statistics/get_raw_issues_statistics.py @@ -4,7 +4,7 @@ from collections import Counter from json import JSONDecodeError from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional sys.path.append('') sys.path.append('../../..') @@ -26,13 +26,14 @@ CODE = ColumnName.CODE.value CODE_STYLE_LINES = f'{IssueType.CODE_STYLE.value}_lines' +CODE_STYLE_RATIO = f'{IssueType.CODE_STYLE.value}_ratio' LINE_LEN_NUMBER = f'{IssueType.LINE_LEN.value}_number' +LINE_LEN_RATIO = f'{IssueType.LINE_LEN.value}_ratio' TOTAL_LINES = 'total_lines' VALUE = 'value' +STATS_DF_NAME = 'stats' DEFAULT_OUTPUT_FOLDER_NAME = 'raw_issues_statistics' -MAIN_STATS_DF_NAME = 'main_stats' -OTHER_STATS_DF_NAME = 'other_stats' def configure_arguments(parser: argparse.ArgumentParser) -> None: @@ -95,7 +96,7 @@ def _is_python(language_code: str) -> bool: return False -def _get_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, Tuple[pd.DataFrame, pd.DataFrame]]: +def _get_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, pd.DataFrame]: 
result = {} df_grouped_by_lang = df_with_stats.groupby(LANG) @@ -104,7 +105,6 @@ def _get_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, Tuple[pd.DataFr columns_with_stats = [] - # ---- Frequency statistics ---- for issue_type, issue_class in ISSUE_TYPE_TO_CLASS.items(): column = lang_group[issue_type.value] if issubclass(issue_class, Measurable): @@ -113,24 +113,11 @@ def _get_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, Tuple[pd.DataFr columns_with_stats.append(lang_group[TOTAL_LINES].value_counts()) - freq_stats = pd.concat(columns_with_stats, axis=1).fillna(0) - - # Fill in the intermediate values that are not occurred with zeros - min_value, max_value = freq_stats.index.min(), freq_stats.index.max() - freq_stats = freq_stats.reindex(range(min_value, max_value + 1), fill_value=0).astype(int) - - # Put the values in a separate column - freq_stats.index.name = VALUE - freq_stats.reset_index(inplace=True) - - columns_with_stats.clear() - - # ---- Ratio statistics ---- - # Calculate line len ratio according to LineLengthRule line_len_ratio_column = lang_group[LINE_LEN_NUMBER] / lang_group[TOTAL_LINES].apply(lambda elem: max(1, elem)) - line_len_ratio_column.name = IssueType.LINE_LEN.value - columns_with_stats.append(round(line_len_ratio_column, 2)) + line_len_ratio_column = (round(line_len_ratio_column, 2) * 100).apply(int) + line_len_ratio_column.name = LINE_LEN_RATIO + columns_with_stats.append(line_len_ratio_column.value_counts()) # Calculate code style ratio according to CodeStyleRule if _is_python(str(lang)): @@ -142,17 +129,22 @@ def _get_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, Tuple[pd.DataFr lambda total_lines: max(1, total_lines - 4), ) - code_style_ratio_column.name = IssueType.CODE_STYLE.value - columns_with_stats.append(round(code_style_ratio_column, 2)) + code_style_ratio_column = (round(code_style_ratio_column, 2) * 100).apply(int) + code_style_ratio_column.name = CODE_STYLE_RATIO + 
columns_with_stats.append(code_style_ratio_column.value_counts()) + + stats = pd.concat(columns_with_stats, axis=1).fillna(0).astype(int) - ratio_stats = pd.concat(columns_with_stats, axis=1) + # Put values in a separate column + stats.index.name = VALUE + stats.reset_index(inplace=True) - result[str(lang)] = (freq_stats, ratio_stats) + result[str(lang)] = stats return result -def inspect_solutions(solutions_with_raw_issues: pd.DataFrame) -> Dict[str, Tuple[pd.DataFrame, pd.DataFrame]]: +def inspect_solutions(solutions_with_raw_issues: pd.DataFrame) -> Dict[str, pd.DataFrame]: pandarallel.initialize() solutions_with_raw_issues = solutions_with_raw_issues.parallel_apply(_extract_stats_from_issues, axis=1) @@ -167,19 +159,14 @@ def _get_output_folder(solutions_file_path: Path, output_folder: Optional[Path]) return get_parent_folder(solutions_file_path) / DEFAULT_OUTPUT_FOLDER_NAME -def _save_stats( - stats_by_lang: Dict[str, Tuple[pd.DataFrame, pd.DataFrame]], - solutions_file_path: Path, - output_path: Optional[Path], -) -> None: +def _save_stats(stats_by_lang: Dict[str, pd.DataFrame], solutions_file_path: Path, output_path: Optional[Path]) -> None: output_folder = _get_output_folder(solutions_file_path, output_path) output_extension = Extension.get_extension_from_file(str(solutions_file_path)) - for lang, (main_stats, other_stats) in stats_by_lang.items(): + for lang, stats in stats_by_lang.items(): lang_folder = output_folder / lang lang_folder.mkdir(parents=True, exist_ok=True) - write_df_to_file(main_stats, lang_folder / f'{MAIN_STATS_DF_NAME}{output_extension.value}', output_extension) - write_df_to_file(other_stats, lang_folder / f'{OTHER_STATS_DF_NAME}{output_extension.value}', output_extension) + write_df_to_file(stats, lang_folder / f'{STATS_DF_NAME}{output_extension.value}', output_extension) if __name__ == "__main__": From 715211259b3aef27026d3a9f29ce288da839daa2 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 15:56:32 +0300 Subject: 
[PATCH 12/35] Fixed tests --- .../test_get_raw_issues_statistics.py | 29 ++++++--------- .../target_files/target_df_single_lang.csv | 14 ++++++++ .../target_df_single_lang_freq.csv | 36 ------------------- .../target_df_single_lang_ratio.csv | 4 --- ...sv => target_df_with_empty_raw_issues.csv} | 6 ++-- .../target_df_with_empty_raw_issues_ratio.csv | 2 -- .../target_df_with_incorrect_language.csv | 7 ++++ ...target_df_with_incorrect_language_freq.csv | 21 ----------- ...arget_df_with_incorrect_language_ratio.csv | 2 -- ...csv => target_df_with_null_raw_issues.csv} | 8 ++--- .../target_df_with_null_raw_issues_ratio.csv | 2 -- ...le_lang_df.csv => test_df_single_lang.csv} | 0 12 files changed, 37 insertions(+), 94 deletions(-) create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang.csv delete mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_freq.csv delete mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_ratio.csv rename test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/{target_df_with_null_raw_issues_freq.csv => target_df_with_empty_raw_issues.csv} (52%) delete mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_ratio.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language.csv delete mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_freq.csv delete mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_ratio.csv rename test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/{target_df_with_empty_raw_issues_freq.csv => target_df_with_null_raw_issues.csv} (50%) 
delete mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_ratio.csv rename test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/{test_single_lang_df.csv => test_df_single_lang.csv} (100%) diff --git a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py index 715b8f2a..395f848c 100644 --- a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py +++ b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py @@ -61,41 +61,32 @@ def test_convert_language_code_to_language(language_code: str, expected_language INSPECT_SOLUTIONS_TEST_DATA = [ ( 'test_df_with_null_raw_issues.csv', - 'target_df_with_null_raw_issues_freq.csv', - 'target_df_with_null_raw_issues_ratio.csv', + 'target_df_with_null_raw_issues.csv', Language.PYTHON.value, ), ( 'test_df_with_empty_raw_issues.csv', - 'target_df_with_empty_raw_issues_freq.csv', - 'target_df_with_empty_raw_issues_ratio.csv', + 'target_df_with_empty_raw_issues.csv', Language.KOTLIN.value, ), ( 'test_df_with_incorrect_language.csv', - 'target_df_with_incorrect_language_freq.csv', - 'target_df_with_incorrect_language_ratio.csv', + 'target_df_with_incorrect_language.csv', 'some_weird_lang', ), ( - 'test_single_lang_df.csv', - 'target_df_single_lang_freq.csv', - 'target_df_single_lang_ratio.csv', + 'test_df_single_lang.csv', + 'target_df_single_lang.csv', Language.JAVA.value, ), ] -@pytest.mark.parametrize( - ('test_file', 'target_freq_file', 'target_ratio_file', 'lang'), - INSPECT_SOLUTIONS_TEST_DATA, -) -def test_inspect_solutions(test_file: str, target_freq_file: str, target_ratio_file: str, lang: str): +@pytest.mark.parametrize(('test_file', 'target_file', 'lang'), INSPECT_SOLUTIONS_TEST_DATA) +def test_inspect_solutions(test_file: str, target_file: str, lang: str): test_df = 
get_solutions_df_by_file_path(GET_RAW_ISSUES_STATISTICS_TEST_FILES_FOLDER / test_file) - stats_by_lang = inspect_solutions(test_df) + stats = inspect_solutions(test_df) - freq_stats = pd.read_csv(GET_RAW_ISSUES_STATISTICS_TARGET_FILES_FOLDER / target_freq_file) - ratio_df = pd.read_csv(GET_RAW_ISSUES_STATISTICS_TARGET_FILES_FOLDER / target_ratio_file) + freq_stats = pd.read_csv(GET_RAW_ISSUES_STATISTICS_TARGET_FILES_FOLDER / target_file) - assert equal_df(stats_by_lang[lang][0], freq_stats) - assert equal_df(stats_by_lang[lang][1], ratio_df) + assert equal_df(stats[lang], freq_stats) diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang.csv new file mode 100644 index 00000000..0797881b --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang.csv @@ -0,0 +1,14 @@ +value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines,LINE_LEN_ratio,CODE_STYLE_ratio +0,2,2,3,3,3,0,0,0,0,0,0,0,3,2 +1,0,1,0,0,0,0,0,0,1,0,0,0,0,0 +3,0,0,0,0,0,0,1,1,0,0,0,0,0,0 +4,0,0,0,0,0,0,0,0,1,0,0,0,0,0 +5,0,0,0,0,0,0,0,0,1,0,0,0,0,0 +6,0,0,0,0,0,0,0,0,0,0,0,1,0,0 +7,0,0,0,0,0,0,1,0,1,0,0,0,0,0 +11,0,0,0,0,0,0,1,0,0,0,0,0,0,0 +14,0,0,0,0,0,0,1,0,0,0,0,0,0,0 +19,0,0,0,0,0,0,0,0,0,0,0,1,0,0 +33,0,0,0,0,0,0,0,0,0,0,0,1,0,0 +34,1,0,0,0,0,0,0,0,0,0,0,0,0,0 +103,0,0,0,0,0,0,0,0,0,0,0,0,0,1 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_freq.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_freq.csv deleted file mode 100644 index d1ea83f9..00000000 --- 
a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_freq.csv +++ /dev/null @@ -1,36 +0,0 @@ -value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines -0,2,2,3,3,3,0,0,0,0,0,0,0 -1,0,1,0,0,0,0,0,0,1,0,0,0 -2,0,0,0,0,0,0,0,0,0,0,0,0 -3,0,0,0,0,0,0,1,1,0,0,0,0 -4,0,0,0,0,0,0,0,0,1,0,0,0 -5,0,0,0,0,0,0,0,0,1,0,0,0 -6,0,0,0,0,0,0,0,0,0,0,0,1 -7,0,0,0,0,0,0,1,0,1,0,0,0 -8,0,0,0,0,0,0,0,0,0,0,0,0 -9,0,0,0,0,0,0,0,0,0,0,0,0 -10,0,0,0,0,0,0,0,0,0,0,0,0 -11,0,0,0,0,0,0,1,0,0,0,0,0 -12,0,0,0,0,0,0,0,0,0,0,0,0 -13,0,0,0,0,0,0,0,0,0,0,0,0 -14,0,0,0,0,0,0,1,0,0,0,0,0 -15,0,0,0,0,0,0,0,0,0,0,0,0 -16,0,0,0,0,0,0,0,0,0,0,0,0 -17,0,0,0,0,0,0,0,0,0,0,0,0 -18,0,0,0,0,0,0,0,0,0,0,0,0 -19,0,0,0,0,0,0,0,0,0,0,0,1 -20,0,0,0,0,0,0,0,0,0,0,0,0 -21,0,0,0,0,0,0,0,0,0,0,0,0 -22,0,0,0,0,0,0,0,0,0,0,0,0 -23,0,0,0,0,0,0,0,0,0,0,0,0 -24,0,0,0,0,0,0,0,0,0,0,0,0 -25,0,0,0,0,0,0,0,0,0,0,0,0 -26,0,0,0,0,0,0,0,0,0,0,0,0 -27,0,0,0,0,0,0,0,0,0,0,0,0 -28,0,0,0,0,0,0,0,0,0,0,0,0 -29,0,0,0,0,0,0,0,0,0,0,0,0 -30,0,0,0,0,0,0,0,0,0,0,0,0 -31,0,0,0,0,0,0,0,0,0,0,0,0 -32,0,0,0,0,0,0,0,0,0,0,0,0 -33,0,0,0,0,0,0,0,0,0,0,0,1 -34,1,0,0,0,0,0,0,0,0,0,0,0 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_ratio.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_ratio.csv deleted file mode 100644 index 99b308b6..00000000 --- a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang_ratio.csv +++ /dev/null @@ -1,4 +0,0 @@ -LINE_LEN,CODE_STYLE -0.00,0.00 -0.00,1.03 -0.00,0.00 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_freq.csv 
b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues.csv similarity index 52% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_freq.csv rename to test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues.csv index 20466ef9..55c2e3d8 100644 --- a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_freq.csv +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues.csv @@ -1,3 +1,3 @@ -value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines -0,1,1,1,1,1,0,0,0,0,0,0,0 -1,0,0,0,0,0,0,0,0,0,0,0,1 \ No newline at end of file +value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines,LINE_LEN_ratio,CODE_STYLE_ratio +0,1,1,1,1,1,0,0,0,0,0,0,0,1,1 +3,0,0,0,0,0,0,0,0,0,0,0,1,0,0 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_ratio.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_ratio.csv deleted file mode 100644 index 555bb958..00000000 --- a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_ratio.csv +++ /dev/null @@ -1,2 +0,0 @@ -LINE_LEN,CODE_STYLE -0.00000,0.00000 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language.csv new file mode 100644 index 00000000..ebbb9c4a --- 
/dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language.csv @@ -0,0 +1,7 @@ +value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines,LINE_LEN_ratio,CODE_STYLE_ratio +0,1,0,1,1,1,0,0,0,0,0,0,0,1,1 +1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 +3,0,0,0,0,0,0,0,1,0,0,0,0,0,0 +7,0,0,0,0,0,0,0,0,1,0,0,0,0,0 +11,0,0,0,0,0,0,1,0,0,0,0,0,0,0 +19,0,0,0,0,0,0,0,0,0,0,0,1,0,0 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_freq.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_freq.csv deleted file mode 100644 index 9c1c1018..00000000 --- a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_freq.csv +++ /dev/null @@ -1,21 +0,0 @@ -value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines -0,1,0,1,1,1,0,0,0,0,0,0,0 -1,0,1,0,0,0,0,0,0,0,0,0,0 -2,0,0,0,0,0,0,0,0,0,0,0,0 -3,0,0,0,0,0,0,0,1,0,0,0,0 -4,0,0,0,0,0,0,0,0,0,0,0,0 -5,0,0,0,0,0,0,0,0,0,0,0,0 -6,0,0,0,0,0,0,0,0,0,0,0,0 -7,0,0,0,0,0,0,0,0,1,0,0,0 -8,0,0,0,0,0,0,0,0,0,0,0,0 -9,0,0,0,0,0,0,0,0,0,0,0,0 -10,0,0,0,0,0,0,0,0,0,0,0,0 -11,0,0,0,0,0,0,1,0,0,0,0,0 -12,0,0,0,0,0,0,0,0,0,0,0,0 -13,0,0,0,0,0,0,0,0,0,0,0,0 -14,0,0,0,0,0,0,0,0,0,0,0,0 -15,0,0,0,0,0,0,0,0,0,0,0,0 -16,0,0,0,0,0,0,0,0,0,0,0,0 -17,0,0,0,0,0,0,0,0,0,0,0,0 -18,0,0,0,0,0,0,0,0,0,0,0,0 -19,0,0,0,0,0,0,0,0,0,0,0,1 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_ratio.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_ratio.csv deleted 
file mode 100644 index 555bb958..00000000 --- a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language_ratio.csv +++ /dev/null @@ -1,2 +0,0 @@ -LINE_LEN,CODE_STYLE -0.00000,0.00000 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_freq.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues.csv similarity index 50% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_freq.csv rename to test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues.csv index 5f564e68..980efd7f 100644 --- a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues_freq.csv +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues.csv @@ -1,5 +1,3 @@ -value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines -0,1,1,1,1,1,0,0,0,0,0,0,0 -1,0,0,0,0,0,0,0,0,0,0,0,0 -2,0,0,0,0,0,0,0,0,0,0,0,0 -3,0,0,0,0,0,0,0,0,0,0,0,1 \ No newline at end of file +value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines,LINE_LEN_ratio,CODE_STYLE_ratio +0,1,1,1,1,1,0,0,0,0,0,0,0,1,1 +1,0,0,0,0,0,0,0,0,0,0,0,1,0,0 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_ratio.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_ratio.csv deleted file mode 100644 index 555bb958..00000000 --- 
a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues_ratio.csv +++ /dev/null @@ -1,2 +0,0 @@ -LINE_LEN,CODE_STYLE -0.00000,0.00000 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_single_lang_df.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_single_lang.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_single_lang_df.csv rename to test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_single_lang.csv From 49fec5ec97a7d612de05fea7dfb1eb89551ba56a Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 16:56:02 +0300 Subject: [PATCH 13/35] Added new tests --- .../test_get_raw_issues_statistics.py | 15 +++ .../target_df_multi_lang_java.csv | 14 ++ .../target_files/target_df_multi_lang_js.csv | 6 + .../target_df_multi_lang_python.csv | 12 ++ .../test_files/test_df_multi_lang.csv | 127 ++++++++++++++++++ 5 files changed, 174 insertions(+) create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_java.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_js.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_python.csv create mode 100644 test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_multi_lang.csv diff --git a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py index 395f848c..b81dd707 100644 --- a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py +++ b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py @@ -79,6 +79,21 @@ def 
test_convert_language_code_to_language(language_code: str, expected_language 'target_df_single_lang.csv', Language.JAVA.value, ), + ( + 'test_df_multi_lang.csv', + 'target_df_multi_lang_java.csv', + Language.JAVA.value, + ), + ( + 'test_df_multi_lang.csv', + 'target_df_multi_lang_js.csv', + Language.JS.value, + ), + ( + 'test_df_multi_lang.csv', + 'target_df_multi_lang_python.csv', + Language.PYTHON.value, + ), ] diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_java.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_java.csv new file mode 100644 index 00000000..0797881b --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_java.csv @@ -0,0 +1,14 @@ +value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines,LINE_LEN_ratio,CODE_STYLE_ratio +0,2,2,3,3,3,0,0,0,0,0,0,0,3,2 +1,0,1,0,0,0,0,0,0,1,0,0,0,0,0 +3,0,0,0,0,0,0,1,1,0,0,0,0,0,0 +4,0,0,0,0,0,0,0,0,1,0,0,0,0,0 +5,0,0,0,0,0,0,0,0,1,0,0,0,0,0 +6,0,0,0,0,0,0,0,0,0,0,0,1,0,0 +7,0,0,0,0,0,0,1,0,1,0,0,0,0,0 +11,0,0,0,0,0,0,1,0,0,0,0,0,0,0 +14,0,0,0,0,0,0,1,0,0,0,0,0,0,0 +19,0,0,0,0,0,0,0,0,0,0,0,1,0,0 +33,0,0,0,0,0,0,0,0,0,0,0,1,0,0 +34,1,0,0,0,0,0,0,0,0,0,0,0,0,0 +103,0,0,0,0,0,0,0,0,0,0,0,0,0,1 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_js.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_js.csv new file mode 100644 index 00000000..27c13bb9 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_js.csv @@ -0,0 +1,6 @@ 
+value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines,LINE_LEN_ratio,CODE_STYLE_ratio +0,0,0,1,1,1,0,0,0,0,0,0,0,1,0 +1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 +2,1,0,0,0,0,0,0,0,0,0,0,0,0,0 +9,0,0,0,0,0,0,0,0,0,0,0,1,0,0 +40,0,0,0,0,0,0,0,0,0,0,0,0,0,1 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_python.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_python.csv new file mode 100644 index 00000000..d5749fb4 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_python.csv @@ -0,0 +1,12 @@ +value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines,LINE_LEN_ratio,CODE_STYLE_ratio +0,1,1,2,2,2,0,0,0,0,0,0,0,2,1 +1,1,0,0,0,0,0,1,1,6,0,0,0,0,0 +2,0,1,0,0,0,0,1,0,3,0,0,0,0,0 +3,0,0,0,0,0,0,3,0,0,0,0,0,0,1 +4,0,0,0,0,0,0,1,0,0,0,0,0,0,0 +5,0,0,0,0,0,0,2,0,0,0,0,0,0,0 +9,0,0,0,0,0,0,0,0,0,0,0,1,0,0 +12,0,0,0,0,0,0,0,0,0,1,0,0,0,0 +14,0,0,0,0,0,0,0,0,0,0,1,0,0,0 +30,0,0,0,0,0,0,0,0,0,0,0,1,0,0 +50,0,0,0,0,0,0,0,0,0,1,0,0,0,0 \ No newline at end of file diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_multi_lang.csv b/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_multi_lang.csv new file mode 100644 index 00000000..129a8b51 --- /dev/null +++ b/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_multi_lang.csv @@ -0,0 +1,127 @@ +id,lang,code,raw_issues +1,java8,"import java.util.Scanner; +import java.util.Arrays; + +class Main { + public static void main(String[] args) { + Scanner scanner = new Scanner(System.in); + + int n = scanner.nextInt(); + String[][] star = new String[n][n]; + + 
for (int i = 0; i < star.length; i++) { + for (int j = 0; j < star[i].length; j++) { + star[i][j] = "". ""; + if (i == (n / 2) || j == (n / 2) || i == j || j == n - i - 1) { + star[i][j] = ""* ""; + } + System.out.print(star[i][j]); + } + System.out.println(""""); + } + } +}","[{""origin_class"": ""UnusedImportsCheck"", ""type"": ""BEST_PRACTICES"", ""description"": ""Unused import - java.util.Arrays."", ""file_path"": """", ""line_no"": 2, ""column_no"": 8, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""CyclomaticComplexityCheck"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 5, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 7}, {""origin_class"": ""JavaNCSSCheck"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 5, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 11}, {""origin_class"": ""BooleanExpressionComplexityCheck"", ""type"": ""BOOL_EXPR_LEN"", ""description"": ""Too long boolean expression. 
Try to split it into smaller expressions."", ""file_path"": """", ""line_no"": 14, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 3}]" +2,java9,"import java.util.Scanner; +public class Main { + + public static void main(String[] args) { + Scanner scanner = new Scanner(System.in); + + int n = scanner.nextInt(); + int mas[][] = new int[n][n]; + int i; + int j; + for(i = 0; i < n; i++) { + mas[i][0] = i; + } + for (j = 0; j < n; j++) { + mas[0][j] = j; + } + for (i = 1; i < n; i++) { + for (j = 1; j < n; j++) { + mas[i][j] = mas[i][j -1] - 1; + } + } + printArray(mas, n); + } + + public static void printArray(int[][] mas, int n) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + if (mas[i][j] < 0) { + mas[i][j] *= -1; + } + System.out.print(mas[i][j] + "" ""); + } + System.out.println(); + } + } +}","[{""origin_class"": ""EmptyLineSeparatorCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'CLASS_DEF' should be separated from previous line."", ""file_path"": """", ""line_no"": 2, ""column_no"": 1, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""CyclomaticComplexityCheck"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 4, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 5}, {""origin_class"": ""JavaNCSSCheck"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. 
It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 4, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 14}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def' child has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 5, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def' child has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 7, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def' child has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 8, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""ArrayTypeStyleCheck"", ""type"": ""CODE_STYLE"", ""description"": ""Array brackets at illegal position."", ""file_path"": """", ""line_no"": 8, ""column_no"": 24, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""ArrayTypeStyleCheck"", ""type"": ""CODE_STYLE"", ""description"": ""Array brackets at illegal position."", ""file_path"": """", ""line_no"": 8, ""column_no"": 26, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def' child has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 9, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def' child has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 10, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": 
""CODE_STYLE"", ""description"": ""'for' has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 11, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""WhitespaceAfterCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' is not followed by whitespace."", ""file_path"": """", ""line_no"": 11, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' child has incorrect indentation level 24, expected level should be 12."", ""file_path"": """", ""line_no"": 12, ""column_no"": 25, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for rcurly' has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 13, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 14, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' child has incorrect indentation level 24, expected level should be 12."", ""file_path"": """", ""line_no"": 15, ""column_no"": 25, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for rcurly' has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 16, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 17, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": 
""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' has incorrect indentation level 24, expected level should be 12."", ""file_path"": """", ""line_no"": 18, ""column_no"": 25, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' child has incorrect indentation level 32, expected level should be 16."", ""file_path"": """", ""line_no"": 19, ""column_no"": 33, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""WhitespaceAroundCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'-' is not followed by whitespace."", ""file_path"": """", ""line_no"": 19, ""column_no"": 54, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for rcurly' has incorrect indentation level 24, expected level should be 12."", ""file_path"": """", ""line_no"": 20, ""column_no"": 25, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for rcurly' has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 21, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def' child has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 22, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def rcurly' has incorrect indentation level 8, expected level should be 4."", ""file_path"": """", ""line_no"": 23, ""column_no"": 9, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def modifier' has incorrect indentation level 8, expected level should be 4."", ""file_path"": """", ""line_no"": 25, ""column_no"": 9, 
""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""CyclomaticComplexityCheck"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 25, ""column_no"": 9, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 4}, {""origin_class"": ""JavaNCSSCheck"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 25, ""column_no"": 9, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 7}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' has incorrect indentation level 12, expected level should be 8."", ""file_path"": """", ""line_no"": 26, ""column_no"": 13, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' has incorrect indentation level 20, expected level should be 12."", ""file_path"": """", ""line_no"": 27, ""column_no"": 21, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'if' has incorrect indentation level 28, expected level should be 16."", ""file_path"": """", ""line_no"": 28, ""column_no"": 29, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'if' child has incorrect indentation level 36, expected level should be 20."", ""file_path"": """", ""line_no"": 29, ""column_no"": 37, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'if rcurly' has incorrect indentation level 32, expected level should be 16."", ""file_path"": """", ""line_no"": 30, 
""column_no"": 33, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' child has incorrect indentation level 32, expected level should be 16."", ""file_path"": """", ""line_no"": 31, ""column_no"": 33, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for rcurly' has incorrect indentation level 24, expected level should be 12."", ""file_path"": """", ""line_no"": 32, ""column_no"": 25, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for' child has incorrect indentation level 20, expected level should be 12."", ""file_path"": """", ""line_no"": 33, ""column_no"": 21, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'for rcurly' has incorrect indentation level 16, expected level should be 8."", ""file_path"": """", ""line_no"": 34, ""column_no"": 17, ""inspector_type"": ""CHECKSTYLE""}, {""origin_class"": ""IndentationCheck"", ""type"": ""CODE_STYLE"", ""description"": ""'method def rcurly' has incorrect indentation level 8, expected level should be 4."", ""file_path"": """", ""line_no"": 35, ""column_no"": 9, ""inspector_type"": ""CHECKSTYLE""}]" +3,java11,"public class Main { + public static void main(String[] args) { + + int variable = 123456; // Change this line + + System.out.println(variable); + } +} +","[{""origin_class"": ""CyclomaticComplexityCheck"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. 
It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 2, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 1}, {""origin_class"": ""JavaNCSSCheck"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 2, ""column_no"": 5, ""inspector_type"": ""CHECKSTYLE"", ""measure"": 3}]" +4,python3,"class ComplexNumber: + def __init__(self, real_part, im_part): + self.real_part = real_part + self.im_part = im_part + + def __add__(self, other): + real = self.real_part + other.real_part + imaginary = self.im_part + other.im_part + return ComplexNumber(real, imaginary) + + def __mul__(self, other): + real = self.real_part * other.real_part - self.im_part * other.im_part + imaginary = self.real_part * other.im_part + other.real_part * self.im_part + return ComplexNumber(real, imaginary) + + def __eq__(self, other): + return ((self.real_part == other.real_part) and (self.im_part == other.im_part)) + + def __str__(self): + if self.im_part < 0: + sign = ""-"" + else: + sign = ""+"" + return ""{} {} {}i"".format(self.real_part, sign, abs(self.im_part)) + + def __truediv__(self, other): + div = (other.real_part ** 2 + other.im_part ** 2) + other.real_part = (other.real_part / div) + other.im_part = -(other.im_part / div) + result = self.__mul__(other) + return result + + def __sub__(self, other): + real = self.real_part - other.real_part + imaginary = self.im_part - other.im_part + return ComplexNumber(real, imaginary)","[{""origin_class"": ""C0325"", ""type"": ""CODE_STYLE"", ""description"": ""Unnecessary parens after 'return' keyword"", ""file_path"": """", ""line_no"": 17, ""column_no"": 1, ""inspector_type"": ""PYLINT""}, {""origin_class"": ""C001"", ""type"": ""BOOL_EXPR_LEN"", ""description"": ""Too long boolean expression. 
Try to split it into smaller expressions."", ""file_path"": """", ""line_no"": 17, ""column_no"": 16, ""inspector_type"": ""PYTHON_AST"", ""measure"": 1}, {""origin_class"": ""H601"", ""type"": ""COHESION"", ""description"": ""class has low (85.71%) cohesion"", ""file_path"": """", ""line_no"": 1, ""column_no"": 1, ""inspector_type"": ""FLAKE8"", ""measure"": 14}, {""origin_class"": ""RAD100"", ""type"": ""MAINTAINABILITY"", ""description"": ""The maintainability index is too low."", ""file_path"": """", ""line_no"": 1, ""column_no"": 1, ""inspector_type"": ""RADON"", ""measure"": 50}, {""origin_class"": ""C901"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 2, ""column_no"": 5, ""inspector_type"": ""FLAKE8"", ""measure"": 1}, {""origin_class"": ""C002"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 2, ""column_no"": 4, ""inspector_type"": ""PYTHON_AST"", ""measure"": 2}, {""origin_class"": ""C901"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 6, ""column_no"": 5, ""inspector_type"": ""FLAKE8"", ""measure"": 1}, {""origin_class"": ""C002"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. 
It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 6, ""column_no"": 4, ""inspector_type"": ""PYTHON_AST"", ""measure"": 3}, {""origin_class"": ""C901"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 11, ""column_no"": 5, ""inspector_type"": ""FLAKE8"", ""measure"": 1}, {""origin_class"": ""C002"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 11, ""column_no"": 4, ""inspector_type"": ""PYTHON_AST"", ""measure"": 3}, {""origin_class"": ""C901"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 16, ""column_no"": 5, ""inspector_type"": ""FLAKE8"", ""measure"": 1}, {""origin_class"": ""C002"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 16, ""column_no"": 4, ""inspector_type"": ""PYTHON_AST"", ""measure"": 1}, {""origin_class"": ""C901"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. 
It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 19, ""column_no"": 5, ""inspector_type"": ""FLAKE8"", ""measure"": 2}, {""origin_class"": ""C002"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 19, ""column_no"": 4, ""inspector_type"": ""PYTHON_AST"", ""measure"": 5}, {""origin_class"": ""C901"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 26, ""column_no"": 5, ""inspector_type"": ""FLAKE8"", ""measure"": 1}, {""origin_class"": ""C002"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 26, ""column_no"": 4, ""inspector_type"": ""PYTHON_AST"", ""measure"": 5}, {""origin_class"": ""WPS331"", ""type"": ""BEST_PRACTICES"", ""description"": ""Found variables that are only used for `return`: result"", ""file_path"": """", ""line_no"": 31, ""column_no"": 9, ""inspector_type"": ""FLAKE8""}, {""origin_class"": ""R504"", ""type"": ""BEST_PRACTICES"", ""description"": ""you shouldn`t assign value to variable if it will be use only as return value"", ""file_path"": """", ""line_no"": 31, ""column_no"": 16, ""inspector_type"": ""FLAKE8""}, {""origin_class"": ""C901"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. 
It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 33, ""column_no"": 5, ""inspector_type"": ""FLAKE8"", ""measure"": 1}, {""origin_class"": ""C002"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 33, ""column_no"": 4, ""inspector_type"": ""PYTHON_AST"", ""measure"": 3}]" +5,python3,"n = int(input()) +def even(): + i = 0 + while i <= n * 2: + yield i + i += 2 +generator = even() +for _ in range(n): + print(next(generator)) + + +# Don't forget to print out the first n numbers one by one here +","[{""origin_class"": ""C901"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 2, ""column_no"": 1, ""inspector_type"": ""FLAKE8"", ""measure"": 2}, {""origin_class"": ""C002"", ""type"": ""FUNC_LEN"", ""description"": ""Too long function. Try to split it into smaller functions / methods. It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 2, ""column_no"": 0, ""inspector_type"": ""PYTHON_AST"", ""measure"": 4}, {""origin_class"": ""C901"", ""type"": ""CYCLOMATIC_COMPLEXITY"", ""description"": ""Too complex function. You can figure out how to simplify this code or split it into a set of small functions / methods. 
It will make your code easy to understand and less error prone."", ""file_path"": """", ""line_no"": 8, ""column_no"": 1, ""inspector_type"": ""FLAKE8"", ""measure"": 2}, {""origin_class"": ""RAD100"", ""type"": ""MAINTAINABILITY"", ""description"": ""The maintainability index is too low."", ""file_path"": """", ""line_no"": 1, ""column_no"": 1, ""inspector_type"": ""RADON"", ""measure"": 12}]" +6,javascript,"async function rockBand(str) { + return new Promise(function(resolve, reject) { + if (str == 'Linkin Park') { + resolve(""Chester, we miss you!""); + } else { + resolve(""No matter the band we miss him anyway!""); + } + }); +} +","[{""origin_class"": ""no-unused-vars"", ""type"": ""CODE_STYLE"", ""description"": ""'rockBand' is defined but never used. (no-unused-vars)"", ""file_path"": """", ""line_no"": 1, ""column_no"": 16, ""inspector_type"": ""ESLINT""}, {""origin_class"": ""no-unused-vars"", ""type"": ""CODE_STYLE"", ""description"": ""'reject' is defined but never used. (no-unused-vars)"", ""file_path"": """", ""line_no"": 2, ""column_no"": 40, ""inspector_type"": ""ESLINT""}, {""origin_class"": ""eqeqeq"", ""type"": ""BEST_PRACTICES"", ""description"": ""Expected '===' and instead saw '=='. 
(eqeqeq)"", ""file_path"": """", ""line_no"": 3, ""column_no"": 15, ""inspector_type"": ""ESLINT""}]" \ No newline at end of file From 531ee4049a4bec72e4694cdbff16569d98ac5c25 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 17:32:40 +0300 Subject: [PATCH 14/35] Added logger and small code refactoring --- .../statistics/get_raw_issues_statistics.py | 38 +++++++++++++------ .../test_get_raw_issues_statistics.py | 4 +- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/src/python/evaluation/statistics/get_raw_issues_statistics.py b/src/python/evaluation/statistics/get_raw_issues_statistics.py index c6574a1a..93ba4c0f 100644 --- a/src/python/evaluation/statistics/get_raw_issues_statistics.py +++ b/src/python/evaluation/statistics/get_raw_issues_statistics.py @@ -1,5 +1,6 @@ import argparse import json +import logging import sys from collections import Counter from json import JSONDecodeError @@ -32,44 +33,51 @@ TOTAL_LINES = 'total_lines' VALUE = 'value' -STATS_DF_NAME = 'stats' +OUTPUT_DF_NAME = 'stats' DEFAULT_OUTPUT_FOLDER_NAME = 'raw_issues_statistics' +logger = logging.getLogger(__name__) + def configure_arguments(parser: argparse.ArgumentParser) -> None: parser.add_argument( 'solutions_with_raw_issues', type=lambda value: Path(value).absolute(), help=f'Local XLSX-file or CSV-file path. Your file must include column-names: ' - f'"{CODE}", "{LANG}", and "{RAW_ISSUES}"', + f'"{ID}", "{CODE}", "{LANG}", and "{RAW_ISSUES}"', ) parser.add_argument( '-o', '--output', type=lambda value: Path(value).absolute(), - help='Path where datasets with statistics will be saved. ' - 'If not specified, datasets will be saved next to the original one.', + help='Path where the dataset with statistics will be saved. 
' + 'If not specified, the dataset will be saved next to the original one.', ) -def _convert_language_code_to_language(language_code: str) -> str: +def _convert_language_code_to_language(fragment_id: int, language_code: str) -> str: try: language_version = get_language_version(language_code) except KeyError: + logger.warning(f'{fragment_id}: it was not possible to determine the language version from "{language_code}"') return language_code language = Language.from_language_version(language_version) if language == Language.UNKNOWN: + logger.warning(f'{fragment_id}: it was not possible to determine the language from "{language_version}"') return language_code return language.value def _extract_stats_from_issues(row: pd.Series) -> pd.Series: + logger.info(f'{row[ID]}: extracting stats') + try: issues: List[BaseIssue] = json.loads(row[RAW_ISSUES], cls=RawIssueDecoder) except (JSONDecodeError, TypeError): + logger.warning(f'{row[ID]}: failed to decode issues') issues: List[BaseIssue] = [] counter = Counter([issue.type for issue in issues]) @@ -84,8 +92,9 @@ def _extract_stats_from_issues(row: pd.Series) -> pd.Series: row[LINE_LEN_NUMBER] = counter[IssueType.LINE_LEN] row[TOTAL_LINES] = get_total_code_lines_from_code(row[CODE]) - row[LANG] = _convert_language_code_to_language(row[LANG]) + row[LANG] = _convert_language_code_to_language(row[ID], row[LANG]) + logger.info(f'{row[ID]}: extraction of statistics is complete') return row @@ -96,11 +105,15 @@ def _is_python(language_code: str) -> bool: return False -def _get_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, pd.DataFrame]: +def _group_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, pd.DataFrame]: + logger.info('The grouping of statistics by language has started') + result = {} df_grouped_by_lang = df_with_stats.groupby(LANG) for lang in df_grouped_by_lang.groups: + logger.info(f'"{lang}" statistics grouping started') + lang_group = df_grouped_by_lang.get_group(lang) columns_with_stats = [] @@ -140,16 
+153,19 @@ def _get_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, pd.DataFrame]: stats.reset_index(inplace=True) result[str(lang)] = stats + logger.info(f'"{lang}" statistics grouping finished') + + logger.info('The grouping of statistics by language has finished') return result -def inspect_solutions(solutions_with_raw_issues: pd.DataFrame) -> Dict[str, pd.DataFrame]: +def inspect_raw_issues(solutions_with_raw_issues: pd.DataFrame) -> Dict[str, pd.DataFrame]: pandarallel.initialize() solutions_with_raw_issues = solutions_with_raw_issues.parallel_apply(_extract_stats_from_issues, axis=1) - return _get_stats_by_lang(solutions_with_raw_issues) + return _group_stats_by_lang(solutions_with_raw_issues) def _get_output_folder(solutions_file_path: Path, output_folder: Optional[Path]): @@ -166,7 +182,7 @@ def _save_stats(stats_by_lang: Dict[str, pd.DataFrame], solutions_file_path: Pat for lang, stats in stats_by_lang.items(): lang_folder = output_folder / lang lang_folder.mkdir(parents=True, exist_ok=True) - write_df_to_file(stats, lang_folder / f'{STATS_DF_NAME}{output_extension.value}', output_extension) + write_df_to_file(stats, lang_folder / f'{OUTPUT_DF_NAME}{output_extension.value}', output_extension) if __name__ == "__main__": @@ -176,6 +192,6 @@ def _save_stats(stats_by_lang: Dict[str, pd.DataFrame], solutions_file_path: Pat solutions_with_raw_issues = get_solutions_df_by_file_path(args.solutions_with_raw_issues) - stats_by_lang = inspect_solutions(solutions_with_raw_issues) + stats_by_lang = inspect_raw_issues(solutions_with_raw_issues) _save_stats(stats_by_lang, args.solutions_with_raw_issues, args.output) diff --git a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py index b81dd707..d62c6c82 100644 --- a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py +++ b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py @@ -13,7 +13,7 @@ 
_convert_language_code_to_language, _get_output_folder, DEFAULT_OUTPUT_FOLDER_NAME, - inspect_solutions, + inspect_raw_issues, ) from src.python.review.common.language import Language @@ -100,7 +100,7 @@ def test_convert_language_code_to_language(language_code: str, expected_language @pytest.mark.parametrize(('test_file', 'target_file', 'lang'), INSPECT_SOLUTIONS_TEST_DATA) def test_inspect_solutions(test_file: str, target_file: str, lang: str): test_df = get_solutions_df_by_file_path(GET_RAW_ISSUES_STATISTICS_TEST_FILES_FOLDER / test_file) - stats = inspect_solutions(test_df) + stats = inspect_raw_issues(test_df) freq_stats = pd.read_csv(GET_RAW_ISSUES_STATISTICS_TARGET_FILES_FOLDER / target_file) From d7f98bc9974986fefcfed62e54415ab559052875 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 17:52:43 +0300 Subject: [PATCH 15/35] Added some more logging --- .../statistics/get_raw_issues_statistics.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/python/evaluation/statistics/get_raw_issues_statistics.py b/src/python/evaluation/statistics/get_raw_issues_statistics.py index 93ba4c0f..8f61e7f6 100644 --- a/src/python/evaluation/statistics/get_raw_issues_statistics.py +++ b/src/python/evaluation/statistics/get_raw_issues_statistics.py @@ -37,6 +37,7 @@ DEFAULT_OUTPUT_FOLDER_NAME = 'raw_issues_statistics' logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) def configure_arguments(parser: argparse.ArgumentParser) -> None: @@ -44,7 +45,7 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: 'solutions_with_raw_issues', type=lambda value: Path(value).absolute(), help=f'Local XLSX-file or CSV-file path. 
Your file must include column-names: ' - f'"{ID}", "{CODE}", "{LANG}", and "{RAW_ISSUES}"', + f'"{ID}", "{CODE}", "{LANG}", and "{RAW_ISSUES}".', ) parser.add_argument( @@ -59,25 +60,25 @@ def _convert_language_code_to_language(fragment_id: int, language_code: str) -> try: language_version = get_language_version(language_code) except KeyError: - logger.warning(f'{fragment_id}: it was not possible to determine the language version from "{language_code}"') + logger.warning(f'{fragment_id}: it was not possible to determine the language version from "{language_code}".') return language_code language = Language.from_language_version(language_version) if language == Language.UNKNOWN: - logger.warning(f'{fragment_id}: it was not possible to determine the language from "{language_version}"') + logger.warning(f'{fragment_id}: it was not possible to determine the language from "{language_version}".') return language_code return language.value def _extract_stats_from_issues(row: pd.Series) -> pd.Series: - logger.info(f'{row[ID]}: extracting stats') + logger.info(f'{row[ID]}: extracting stats.') try: issues: List[BaseIssue] = json.loads(row[RAW_ISSUES], cls=RawIssueDecoder) except (JSONDecodeError, TypeError): - logger.warning(f'{row[ID]}: failed to decode issues') + logger.warning(f'{row[ID]}: failed to decode issues.') issues: List[BaseIssue] = [] counter = Counter([issue.type for issue in issues]) @@ -94,7 +95,7 @@ def _extract_stats_from_issues(row: pd.Series) -> pd.Series: row[LANG] = _convert_language_code_to_language(row[ID], row[LANG]) - logger.info(f'{row[ID]}: extraction of statistics is complete') + logger.info(f'{row[ID]}: extraction of statistics is complete.') return row @@ -106,13 +107,13 @@ def _is_python(language_code: str) -> bool: def _group_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, pd.DataFrame]: - logger.info('The grouping of statistics by language has started') + logger.info('The grouping of statistics by language has started.') result = {} 
df_grouped_by_lang = df_with_stats.groupby(LANG) for lang in df_grouped_by_lang.groups: - logger.info(f'"{lang}" statistics grouping started') + logger.info(f'"{lang}" statistics grouping started.') lang_group = df_grouped_by_lang.get_group(lang) @@ -153,9 +154,9 @@ def _group_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, pd.DataFrame] stats.reset_index(inplace=True) result[str(lang)] = stats - logger.info(f'"{lang}" statistics grouping finished') + logger.info(f'"{lang}" statistics grouping finished.') - logger.info('The grouping of statistics by language has finished') + logger.info('The grouping of statistics by language has finished.') return result @@ -179,11 +180,15 @@ def _save_stats(stats_by_lang: Dict[str, pd.DataFrame], solutions_file_path: Pat output_folder = _get_output_folder(solutions_file_path, output_path) output_extension = Extension.get_extension_from_file(str(solutions_file_path)) + logger.info(f'Saving statistics to a folder: {output_folder}.') + for lang, stats in stats_by_lang.items(): lang_folder = output_folder / lang lang_folder.mkdir(parents=True, exist_ok=True) write_df_to_file(stats, lang_folder / f'{OUTPUT_DF_NAME}{output_extension.value}', output_extension) + logger.info('Saving statistics is complete.') + if __name__ == "__main__": parser = argparse.ArgumentParser() From 2bb65a8a770d385aca8c818b7891aea28aa298ed Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 18:02:51 +0300 Subject: [PATCH 16/35] Fixed test --- .../evaluation/statistics/test_get_raw_issues_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py index d62c6c82..2c5dffb3 100644 --- a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py +++ b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py @@ -54,7 +54,7 @@ def 
test_get_output_folder(solutions_file_path: Path, output_folder: Optional[Pa @pytest.mark.parametrize(('language_code', 'expected_language'), CONVERT_LANGUAGE_CODE_TO_LANGUAGE_TEST_DATA) def test_convert_language_code_to_language(language_code: str, expected_language: str): - actual_language = _convert_language_code_to_language(language_code) + actual_language = _convert_language_code_to_language(0, language_code) assert actual_language == expected_language From c3bbf455309d032aab16ccfc12786302201626b4 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 18:03:14 +0300 Subject: [PATCH 17/35] Fixed test --- .../evaluation/statistics/test_get_raw_issues_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py index 2c5dffb3..3c01135b 100644 --- a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py +++ b/test/python/evaluation/statistics/test_get_raw_issues_statistics.py @@ -54,7 +54,7 @@ def test_get_output_folder(solutions_file_path: Path, output_folder: Optional[Pa @pytest.mark.parametrize(('language_code', 'expected_language'), CONVERT_LANGUAGE_CODE_TO_LANGUAGE_TEST_DATA) def test_convert_language_code_to_language(language_code: str, expected_language: str): - actual_language = _convert_language_code_to_language(0, language_code) + actual_language = _convert_language_code_to_language(fragment_id=0, language_code=language_code) assert actual_language == expected_language From 117961c41d9ca1f74083cd60f79e44f424b34955 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Mon, 26 Jul 2021 18:12:44 +0300 Subject: [PATCH 18/35] Fixed help message --- src/python/evaluation/statistics/get_raw_issues_statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/evaluation/statistics/get_raw_issues_statistics.py 
b/src/python/evaluation/statistics/get_raw_issues_statistics.py index 8f61e7f6..913a8faa 100644 --- a/src/python/evaluation/statistics/get_raw_issues_statistics.py +++ b/src/python/evaluation/statistics/get_raw_issues_statistics.py @@ -51,8 +51,8 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: parser.add_argument( '-o', '--output', type=lambda value: Path(value).absolute(), - help='Path where the dataset with statistics will be saved. ' - 'If not specified, the dataset will be saved next to the original one.', + help='Path to the folder where datasets with statistics will be saved. ' + 'If not specified, the datasets will be saved in the folder next to the original one.', ) From 32d8dc9deb9444d3031c4cfbe43c5de9c4b65ea9 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov <55441714+GirZ0n@users.noreply.github.com> Date: Mon, 26 Jul 2021 18:13:04 +0300 Subject: [PATCH 19/35] Update README.md --- src/python/evaluation/statistics/README.md | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/python/evaluation/statistics/README.md b/src/python/evaluation/statistics/README.md index 5933edd7..0b7f83d7 100644 --- a/src/python/evaluation/statistics/README.md +++ b/src/python/evaluation/statistics/README.md @@ -27,3 +27,33 @@ Run the [get_raw_issues.py](get_raw_issues.py) with the arguments from command l | **‑‑to‑save‑path** | Allows to save the path to the file where the issue was found. By default, the path is not saved. | | **‑o**, **‑‑output** | Path where the dataset with raw issues will be saved. If not specified, the dataset will be saved next to the original one. | | **‑l**, **‑‑log-output** | Path where logs will be stored. If not specified, then logs will be output to stderr. | + +## Get raw issues statistics +The script takes the dataframe obtained after executing [get_raw_issues.py](get_raw_issues.py) and outputs dataframes with statistics grouped by language. 
+ +The input dataset must have 4 obligatory columns: +- `id` +- `code` +- `lang` +- `raw_issues` + +Possible values for column `lang` are: `python3`, `kotlin`, `javascript`, `java7`, `java8`, `java9`, `java11`, `java15`. + +The output files are new `xlsx` or `csv` files which contain the `value` column and the columns responsible for its category statistics. + +The `value` column shows the metric value (for measurable issue categories), quantity (for quantitative issue categories) or `ratio * 100` (for `CODE_STYLE` and `LINE_LEN`), where `ratio` is calculated as in the corresponding rules (`CodeStyleRule` and `LineLengthRule`). + +The table cells indicate how often a value occurs in one fragment (for quantitative categories) or in all fragments (for measurable categories). + +All output datasets are arranged in folders according to language. + +### Usage +Run the [get_raw_issues_statistics.py](get_raw_issues_statistics.py) with the arguments from command line. + +**Required arguments:** +- `solutions_with_raw_issues` — path to an xlsx- or csv-file with code samples and raw issues, which were received with [get_raw_issues.py](get_raw_issues.py). + +**Optional arguments:** +| Argument | Description | +|----------|-------------| +| **‑o**, **‑‑output** | Path to the folder where datasets with statistics will be saved. If not specified, the datasets will be saved in the folder next to the original dataset. 
| From 39106af8a37ff8295a940d19760000cff2ed5d10 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Tue, 27 Jul 2021 12:16:14 +0300 Subject: [PATCH 20/35] statistics -> issues_statistics --- .../evaluation/{statistics => issues_statistics}/README.md | 0 .../{statistics => issues_statistics}/__init__.py | 0 .../{statistics => issues_statistics}/common/__init__.py | 0 .../common/raw_issue_encoder_decoder.py | 0 .../{statistics => issues_statistics}/get_raw_issues.py | 2 +- .../get_raw_issues_statistics.py | 4 ++-- .../{statistics => issues_statistics}/__init__.py | 6 +++--- .../test_get_raw_issues.py | 4 ++-- .../test_get_raw_issues_statistics.py | 4 ++-- .../test_raw_issue_encoding_decoding.py | 2 +- .../target_files/target_fragment_per_language.csv | 0 .../get_raw_issues/target_files/target_incorrect_code.csv | 0 .../target_files/target_incorrect_language.csv | 0 .../test_files/test_fragment_per_language.csv | 0 .../get_raw_issues/test_files/test_incorrect_code.csv | 0 .../get_raw_issues/test_files/test_incorrect_language.csv | 0 .../target_files/target_df_multi_lang_java.csv | 0 .../target_files/target_df_multi_lang_js.csv | 0 .../target_files/target_df_multi_lang_python.csv | 0 .../target_files/target_df_single_lang.csv | 0 .../target_files/target_df_with_empty_raw_issues.csv | 0 .../target_files/target_df_with_incorrect_language.csv | 0 .../target_files/target_df_with_null_raw_issues.csv | 0 .../test_files/test_df_multi_lang.csv | 0 .../test_files/test_df_single_lang.csv | 0 .../test_files/test_df_with_empty_raw_issues.csv | 0 .../test_files/test_df_with_incorrect_language.csv | 0 .../test_files/test_df_with_null_raw_issues.csv | 0 28 files changed, 11 insertions(+), 11 deletions(-) rename src/python/evaluation/{statistics => issues_statistics}/README.md (100%) rename src/python/evaluation/{statistics => issues_statistics}/__init__.py (100%) rename src/python/evaluation/{statistics => issues_statistics}/common/__init__.py (100%) rename 
src/python/evaluation/{statistics => issues_statistics}/common/raw_issue_encoder_decoder.py (100%) rename src/python/evaluation/{statistics => issues_statistics}/get_raw_issues.py (98%) rename src/python/evaluation/{statistics => issues_statistics}/get_raw_issues_statistics.py (97%) rename test/python/evaluation/{statistics => issues_statistics}/__init__.py (61%) rename test/python/evaluation/{statistics => issues_statistics}/test_get_raw_issues.py (97%) rename test/python/evaluation/{statistics => issues_statistics}/test_get_raw_issues_statistics.py (96%) rename test/python/evaluation/{statistics => issues_statistics}/test_raw_issue_encoding_decoding.py (98%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues/target_files/target_fragment_per_language.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues/target_files/target_incorrect_code.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues/target_files/target_incorrect_language.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues/test_files/test_fragment_per_language.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues/test_files/test_incorrect_code.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues/test_files/test_incorrect_language.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues_statistics/target_files/target_df_multi_lang_java.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues_statistics/target_files/target_df_multi_lang_js.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues_statistics/target_files/target_df_multi_lang_python.csv (100%) rename test/resources/evaluation/{statistics => 
issues_statistics}/get_raw_issues_statistics/target_files/target_df_single_lang.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues_statistics/target_files/target_df_with_incorrect_language.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues_statistics/test_files/test_df_multi_lang.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues_statistics/test_files/test_df_single_lang.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues_statistics/test_files/test_df_with_empty_raw_issues.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues_statistics/test_files/test_df_with_incorrect_language.csv (100%) rename test/resources/evaluation/{statistics => issues_statistics}/get_raw_issues_statistics/test_files/test_df_with_null_raw_issues.csv (100%) diff --git a/src/python/evaluation/statistics/README.md b/src/python/evaluation/issues_statistics/README.md similarity index 100% rename from src/python/evaluation/statistics/README.md rename to src/python/evaluation/issues_statistics/README.md diff --git a/src/python/evaluation/statistics/__init__.py b/src/python/evaluation/issues_statistics/__init__.py similarity index 100% rename from src/python/evaluation/statistics/__init__.py rename to src/python/evaluation/issues_statistics/__init__.py diff --git a/src/python/evaluation/statistics/common/__init__.py b/src/python/evaluation/issues_statistics/common/__init__.py similarity index 100% rename from src/python/evaluation/statistics/common/__init__.py rename to 
src/python/evaluation/issues_statistics/common/__init__.py diff --git a/src/python/evaluation/statistics/common/raw_issue_encoder_decoder.py b/src/python/evaluation/issues_statistics/common/raw_issue_encoder_decoder.py similarity index 100% rename from src/python/evaluation/statistics/common/raw_issue_encoder_decoder.py rename to src/python/evaluation/issues_statistics/common/raw_issue_encoder_decoder.py diff --git a/src/python/evaluation/statistics/get_raw_issues.py b/src/python/evaluation/issues_statistics/get_raw_issues.py similarity index 98% rename from src/python/evaluation/statistics/get_raw_issues.py rename to src/python/evaluation/issues_statistics/get_raw_issues.py index 19791c79..b077f34f 100644 --- a/src/python/evaluation/statistics/get_raw_issues.py +++ b/src/python/evaluation/issues_statistics/get_raw_issues.py @@ -15,7 +15,7 @@ from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path, write_df_to_file from src.python.evaluation.common.util import ColumnName from src.python.evaluation.evaluation_run_tool import get_language_version -from src.python.evaluation.statistics.common.raw_issue_encoder_decoder import RawIssueEncoder +from src.python.evaluation.issues_statistics.common.raw_issue_encoder_decoder import RawIssueEncoder from src.python.review.common.file_system import ( create_file, Extension, diff --git a/src/python/evaluation/statistics/get_raw_issues_statistics.py b/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py similarity index 97% rename from src/python/evaluation/statistics/get_raw_issues_statistics.py rename to src/python/evaluation/issues_statistics/get_raw_issues_statistics.py index 913a8faa..d72ad0f9 100644 --- a/src/python/evaluation/statistics/get_raw_issues_statistics.py +++ b/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py @@ -15,8 +15,8 @@ from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path, write_df_to_file from 
src.python.evaluation.common.util import ColumnName from src.python.evaluation.evaluation_run_tool import get_language_version -from src.python.evaluation.statistics.common.raw_issue_encoder_decoder import RawIssueDecoder -from src.python.evaluation.statistics.get_raw_issues import RAW_ISSUES +from src.python.evaluation.issues_statistics.common.raw_issue_encoder_decoder import RawIssueDecoder +from src.python.evaluation.issues_statistics.get_raw_issues import RAW_ISSUES from src.python.review.common.file_system import Extension, get_parent_folder, get_total_code_lines_from_code from src.python.review.common.language import Language from src.python.review.inspectors.issue import BaseIssue, ISSUE_TYPE_TO_CLASS, IssueType, Measurable diff --git a/test/python/evaluation/statistics/__init__.py b/test/python/evaluation/issues_statistics/__init__.py similarity index 61% rename from test/python/evaluation/statistics/__init__.py rename to test/python/evaluation/issues_statistics/__init__.py index d0a4efd1..604e3d61 100644 --- a/test/python/evaluation/statistics/__init__.py +++ b/test/python/evaluation/issues_statistics/__init__.py @@ -1,14 +1,14 @@ from test.python.evaluation import CURRENT_TEST_DATA_FOLDER -STATISTICS_TEST_DATA_FOLDER = CURRENT_TEST_DATA_FOLDER / 'statistics' +ISSUES_STATISTICS_TEST_DATA_FOLDER = CURRENT_TEST_DATA_FOLDER / 'issues_statistics' -GET_RAW_ISSUES_DATA_FOLDER = STATISTICS_TEST_DATA_FOLDER / 'get_raw_issues' +GET_RAW_ISSUES_DATA_FOLDER = ISSUES_STATISTICS_TEST_DATA_FOLDER / 'get_raw_issues' GET_RAW_ISSUES_TEST_FILES_FOLDER = GET_RAW_ISSUES_DATA_FOLDER / 'test_files' GET_RAW_ISSUES_TARGET_FILES_FOLDER = GET_RAW_ISSUES_DATA_FOLDER / 'target_files' -GET_RAW_ISSUES_STATISTICS_DATA_FOLDER = STATISTICS_TEST_DATA_FOLDER / 'get_raw_issues_statistics' +GET_RAW_ISSUES_STATISTICS_DATA_FOLDER = ISSUES_STATISTICS_TEST_DATA_FOLDER / 'get_raw_issues_statistics' GET_RAW_ISSUES_STATISTICS_TEST_FILES_FOLDER = GET_RAW_ISSUES_STATISTICS_DATA_FOLDER / 'test_files' 
diff --git a/test/python/evaluation/statistics/test_get_raw_issues.py b/test/python/evaluation/issues_statistics/test_get_raw_issues.py similarity index 97% rename from test/python/evaluation/statistics/test_get_raw_issues.py rename to test/python/evaluation/issues_statistics/test_get_raw_issues.py index c11882c2..643ada1c 100644 --- a/test/python/evaluation/statistics/test_get_raw_issues.py +++ b/test/python/evaluation/issues_statistics/test_get_raw_issues.py @@ -1,12 +1,12 @@ from pathlib import Path from test.python.common_util import equal_df -from test.python.evaluation.statistics import GET_RAW_ISSUES_TARGET_FILES_FOLDER, GET_RAW_ISSUES_TEST_FILES_FOLDER +from test.python.evaluation.issues_statistics import GET_RAW_ISSUES_TARGET_FILES_FOLDER, GET_RAW_ISSUES_TEST_FILES_FOLDER from typing import List, Optional import pandas as pd import pytest from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path -from src.python.evaluation.statistics.get_raw_issues import _filter_issues, _get_output_path, inspect_solutions +from src.python.evaluation.issues_statistics.get_raw_issues import _filter_issues, _get_output_path, inspect_solutions from src.python.review.inspectors.inspector_type import InspectorType from src.python.review.inspectors.issue import BaseIssue, CodeIssue, IssueType, LineLenIssue, MaintainabilityLackIssue diff --git a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py b/test/python/evaluation/issues_statistics/test_get_raw_issues_statistics.py similarity index 96% rename from test/python/evaluation/statistics/test_get_raw_issues_statistics.py rename to test/python/evaluation/issues_statistics/test_get_raw_issues_statistics.py index 3c01135b..324be27a 100644 --- a/test/python/evaluation/statistics/test_get_raw_issues_statistics.py +++ b/test/python/evaluation/issues_statistics/test_get_raw_issues_statistics.py @@ -1,6 +1,6 @@ from pathlib import Path from test.python.common_util import equal_df -from 
test.python.evaluation.statistics import ( +from test.python.evaluation.issues_statistics import ( GET_RAW_ISSUES_STATISTICS_TARGET_FILES_FOLDER, GET_RAW_ISSUES_STATISTICS_TEST_FILES_FOLDER, ) @@ -9,7 +9,7 @@ import pandas as pd import pytest from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path -from src.python.evaluation.statistics.get_raw_issues_statistics import ( +from src.python.evaluation.issues_statistics.get_raw_issues_statistics import ( _convert_language_code_to_language, _get_output_folder, DEFAULT_OUTPUT_FOLDER_NAME, diff --git a/test/python/evaluation/statistics/test_raw_issue_encoding_decoding.py b/test/python/evaluation/issues_statistics/test_raw_issue_encoding_decoding.py similarity index 98% rename from test/python/evaluation/statistics/test_raw_issue_encoding_decoding.py rename to test/python/evaluation/issues_statistics/test_raw_issue_encoding_decoding.py index 82c6c901..43c20e08 100644 --- a/test/python/evaluation/statistics/test_raw_issue_encoding_decoding.py +++ b/test/python/evaluation/issues_statistics/test_raw_issue_encoding_decoding.py @@ -3,7 +3,7 @@ from pathlib import Path import pytest -from src.python.evaluation.statistics.common.raw_issue_encoder_decoder import RawIssueDecoder, RawIssueEncoder +from src.python.evaluation.issues_statistics.common.raw_issue_encoder_decoder import RawIssueDecoder, RawIssueEncoder from src.python.review.inspectors.inspector_type import InspectorType from src.python.review.inspectors.issue import ( BaseIssue, diff --git a/test/resources/evaluation/statistics/get_raw_issues/target_files/target_fragment_per_language.csv b/test/resources/evaluation/issues_statistics/get_raw_issues/target_files/target_fragment_per_language.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues/target_files/target_fragment_per_language.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues/target_files/target_fragment_per_language.csv diff --git 
a/test/resources/evaluation/statistics/get_raw_issues/target_files/target_incorrect_code.csv b/test/resources/evaluation/issues_statistics/get_raw_issues/target_files/target_incorrect_code.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues/target_files/target_incorrect_code.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues/target_files/target_incorrect_code.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues/target_files/target_incorrect_language.csv b/test/resources/evaluation/issues_statistics/get_raw_issues/target_files/target_incorrect_language.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues/target_files/target_incorrect_language.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues/target_files/target_incorrect_language.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues/test_files/test_fragment_per_language.csv b/test/resources/evaluation/issues_statistics/get_raw_issues/test_files/test_fragment_per_language.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues/test_files/test_fragment_per_language.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues/test_files/test_fragment_per_language.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues/test_files/test_incorrect_code.csv b/test/resources/evaluation/issues_statistics/get_raw_issues/test_files/test_incorrect_code.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues/test_files/test_incorrect_code.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues/test_files/test_incorrect_code.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues/test_files/test_incorrect_language.csv b/test/resources/evaluation/issues_statistics/get_raw_issues/test_files/test_incorrect_language.csv similarity index 100% rename from 
test/resources/evaluation/statistics/get_raw_issues/test_files/test_incorrect_language.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues/test_files/test_incorrect_language.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_java.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_java.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_java.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_java.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_js.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_js.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_js.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_js.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_python.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_python.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_python.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_multi_lang_python.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_single_lang.csv similarity index 100% rename from 
test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_single_lang.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_single_lang.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_empty_raw_issues.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_incorrect_language.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_multi_lang.csv 
b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_multi_lang.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_multi_lang.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_multi_lang.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_single_lang.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_single_lang.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_single_lang.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_single_lang.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_empty_raw_issues.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_empty_raw_issues.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_empty_raw_issues.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_empty_raw_issues.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_incorrect_language.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_incorrect_language.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_incorrect_language.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_incorrect_language.csv diff --git a/test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_null_raw_issues.csv 
b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_null_raw_issues.csv similarity index 100% rename from test/resources/evaluation/statistics/get_raw_issues_statistics/test_files/test_df_with_null_raw_issues.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_null_raw_issues.csv From 28a9fe6f769406f3a6dec2b3212769892453d9be Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Tue, 27 Jul 2021 12:19:03 +0300 Subject: [PATCH 21/35] Fixed flake8 --- .../evaluation/issues_statistics/test_get_raw_issues.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/python/evaluation/issues_statistics/test_get_raw_issues.py b/test/python/evaluation/issues_statistics/test_get_raw_issues.py index 643ada1c..cafa3b63 100644 --- a/test/python/evaluation/issues_statistics/test_get_raw_issues.py +++ b/test/python/evaluation/issues_statistics/test_get_raw_issues.py @@ -1,6 +1,8 @@ from pathlib import Path from test.python.common_util import equal_df -from test.python.evaluation.issues_statistics import GET_RAW_ISSUES_TARGET_FILES_FOLDER, GET_RAW_ISSUES_TEST_FILES_FOLDER +from test.python.evaluation.issues_statistics import ( + GET_RAW_ISSUES_TARGET_FILES_FOLDER, GET_RAW_ISSUES_TEST_FILES_FOLDER, +) from typing import List, Optional import pandas as pd From 153ec3a3308a88f003db5e38012d2be76157bdf9 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 10:53:03 +0300 Subject: [PATCH 22/35] Added from_value function --- src/python/review/application_config.py | 7 +++++++ src/python/review/common/language.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/src/python/review/application_config.py b/src/python/review/application_config.py index e2d08db1..bc59b621 100644 --- a/src/python/review/application_config.py +++ b/src/python/review/application_config.py @@ -58,3 +58,10 @@ def is_java(self) -> bool: or self == LanguageVersion.JAVA_11 or self == 
LanguageVersion.JAVA_15 ) + + @classmethod + def from_value(cls, value: str, default=None): + try: + return LanguageVersion(value) + except ValueError: + return default diff --git a/src/python/review/common/language.py b/src/python/review/common/language.py index bfe7a34c..c48944ec 100644 --- a/src/python/review/common/language.py +++ b/src/python/review/common/language.py @@ -33,6 +33,13 @@ def from_language_version(language_version: LanguageVersion) -> 'Language': def values(cls) -> List[str]: return [member.value for member in Language] + @classmethod + def from_value(cls, value: str, default=None): + try: + return Language(value) + except ValueError: + return default + EXTENSION_TO_LANGUAGE = { Extension.JAVA: Language.JAVA, From bd14b026e3a7bdc0a71dd2d97a49acc43a36eccc Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 10:54:26 +0300 Subject: [PATCH 23/35] Added comment --- src/python/review/common/file_system.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python/review/common/file_system.py b/src/python/review/common/file_system.py index 9bbf5123..952eed1a 100644 --- a/src/python/review/common/file_system.py +++ b/src/python/review/common/file_system.py @@ -235,6 +235,7 @@ def copy_file(source: Union[str, Path], destination: Union[str, Path]): shutil.copy(source, destination) +# Before using it, check that there are no line breaks in the string def __is_line_empty(line: str) -> bool: return len(line.strip()) == 0 From bf5b64915d2b72e87effc280269d5a48784367e0 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 10:57:51 +0300 Subject: [PATCH 24/35] Added get_ratio --- src/python/review/quality/rules/line_len_scoring.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/python/review/quality/rules/line_len_scoring.py b/src/python/review/quality/rules/line_len_scoring.py index b188576f..50a767bf 100644 --- a/src/python/review/quality/rules/line_len_scoring.py +++ 
b/src/python/review/quality/rules/line_len_scoring.py @@ -31,7 +31,7 @@ def __init__(self, config: LineLengthRuleConfig): # TODO: refactor def apply(self, n_line_len, n_lines): - self.ratio = n_line_len / max(n_lines, 1) + self.ratio = self.get_ratio(n_line_len, n_lines) self.n_line_len = n_line_len self.n_lines = n_lines @@ -60,3 +60,7 @@ def merge(self, other: 'LineLengthRule') -> 'LineLengthRule': result_rule.apply(self.n_line_len + other.n_line_len, self.n_lines + other.n_lines) return result_rule + + @staticmethod + def get_ratio(n_line_len: int, n_lines: int) -> float: + return n_line_len / max(n_lines, 1) From 2bc1d17e7a22c60230e070c9ebf4ac6c70ec8171 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 10:59:32 +0300 Subject: [PATCH 25/35] Small refactoring: added get_ratio --- .../review/quality/rules/code_style_scoring.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/python/review/quality/rules/code_style_scoring.py b/src/python/review/quality/rules/code_style_scoring.py index a1edda09..8fdabd23 100644 --- a/src/python/review/quality/rules/code_style_scoring.py +++ b/src/python/review/quality/rules/code_style_scoring.py @@ -70,7 +70,8 @@ def apply(self, n_code_style_lines, n_code_style, total_lines): self.n_code_style_lines = n_code_style_lines self.n_code_style = n_code_style - self.get_ratio(n_code_style_lines, n_code_style, total_lines) + self.update_quality(n_code_style_lines, n_code_style) + self.ratio = self.get_ratio(n_code_style_lines, total_lines, self.config.language) if self.ratio > self.config.n_code_style_bad: self.save_quality(QualityType.BAD) @@ -84,17 +85,22 @@ def apply(self, n_code_style_lines, n_code_style, total_lines): if n_code_style_lines > self.config.n_code_style_lines_bad: self.quality_type = QualityType.BAD - def get_ratio(self, n_code_style_lines, n_code_style, total_lines): + @staticmethod + def get_ratio(n_code_style_lines: int, total_lines: int, language: Language) -> 
float: + if language == Language.PYTHON: + return n_code_style_lines / max(1, total_lines) + else: + return n_code_style_lines / max(1, total_lines - 4) + + def update_quality(self, n_code_style_lines: int, n_code_style: int): if self.config.language == Language.PYTHON: if n_code_style == 1: self.save_quality(QualityType.MODERATE) - self.ratio = n_code_style_lines / max(1, total_lines) else: if n_code_style_lines == 1: self.save_quality(QualityType.GOOD) elif n_code_style_lines == 2: self.save_quality(QualityType.MODERATE) - self.ratio = n_code_style_lines / max(1, total_lines - 4) def __get_next_quality_type(self) -> QualityType: if self.quality_type == QualityType.BAD: From 000c1cbbb8d69d3d096be94e8126d17e4acc4d4e Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 11:00:11 +0300 Subject: [PATCH 26/35] Fixed PR issues --- .../get_raw_issues_statistics.py | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py b/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py index d72ad0f9..ed0bf05b 100644 --- a/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py +++ b/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py @@ -14,12 +14,14 @@ from pandarallel import pandarallel from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path, write_df_to_file from src.python.evaluation.common.util import ColumnName -from src.python.evaluation.evaluation_run_tool import get_language_version from src.python.evaluation.issues_statistics.common.raw_issue_encoder_decoder import RawIssueDecoder from src.python.evaluation.issues_statistics.get_raw_issues import RAW_ISSUES +from src.python.review.application_config import LanguageVersion from src.python.review.common.file_system import Extension, get_parent_folder, get_total_code_lines_from_code from src.python.review.common.language import Language from 
src.python.review.inspectors.issue import BaseIssue, ISSUE_TYPE_TO_CLASS, IssueType, Measurable +from src.python.review.quality.rules.code_style_scoring import CodeStyleRule +from src.python.review.quality.rules.line_len_scoring import LineLengthRule from src.python.review.reviewers.utils.code_statistics import get_code_style_lines ID = ColumnName.ID.value @@ -57,9 +59,9 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: def _convert_language_code_to_language(fragment_id: int, language_code: str) -> str: - try: - language_version = get_language_version(language_code) - except KeyError: + language_version = LanguageVersion.from_value(language_code) + + if language_version is None: logger.warning(f'{fragment_id}: it was not possible to determine the language version from "{language_code}".') return language_code @@ -106,6 +108,13 @@ def _is_python(language_code: str) -> bool: return False +def _convert_ratio_to_int(ratio: float): + """ + Round the ratio to 2 decimal places, multiply by 100, and take the integer part. 
+ """ + return int((round(ratio, 2) * 100)) + + def _group_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, pd.DataFrame]: logger.info('The grouping of statistics by language has started.') @@ -127,23 +136,21 @@ def _group_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, pd.DataFrame] columns_with_stats.append(lang_group[TOTAL_LINES].value_counts()) - # Calculate line len ratio according to LineLengthRule - line_len_ratio_column = lang_group[LINE_LEN_NUMBER] / lang_group[TOTAL_LINES].apply(lambda elem: max(1, elem)) - line_len_ratio_column = (round(line_len_ratio_column, 2) * 100).apply(int) + line_len_ratio_column = lang_group.apply( + lambda row: LineLengthRule.get_ratio(row[LINE_LEN_NUMBER], row[TOTAL_LINES]), + axis=1, + ) + line_len_ratio_column = line_len_ratio_column.apply(_convert_ratio_to_int) line_len_ratio_column.name = LINE_LEN_RATIO columns_with_stats.append(line_len_ratio_column.value_counts()) - # Calculate code style ratio according to CodeStyleRule - if _is_python(str(lang)): - code_style_ratio_column = lang_group[CODE_STYLE_LINES] / lang_group[TOTAL_LINES].apply( - lambda total_lines: max(1, total_lines), - ) - else: - code_style_ratio_column = lang_group[CODE_STYLE_LINES] / lang_group[TOTAL_LINES].apply( - lambda total_lines: max(1, total_lines - 4), - ) - - code_style_ratio_column = (round(code_style_ratio_column, 2) * 100).apply(int) + code_style_ratio_column = lang_group.apply( + lambda row: CodeStyleRule.get_ratio( + row[CODE_STYLE_LINES], row[TOTAL_LINES], Language.from_value(str(lang), default=Language.UNKNOWN), + ), + axis=1, + ) + code_style_ratio_column = code_style_ratio_column.apply(_convert_ratio_to_int) code_style_ratio_column.name = CODE_STYLE_RATIO columns_with_stats.append(code_style_ratio_column.value_counts()) From 06d687e9704330df1272521f7efc099cddd8c911 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 11:41:40 +0300 Subject: [PATCH 27/35] Small fixes --- 
.../issues_statistics/get_raw_issues_statistics.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py b/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py index ed0bf05b..74709b15 100644 --- a/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py +++ b/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py @@ -58,7 +58,7 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: ) -def _convert_language_code_to_language(fragment_id: int, language_code: str) -> str: +def _convert_language_code_to_language(fragment_id: str, language_code: str) -> str: language_version = LanguageVersion.from_value(language_code) if language_version is None: @@ -77,6 +77,9 @@ def _convert_language_code_to_language(fragment_id: int, language_code: str) -> def _extract_stats_from_issues(row: pd.Series) -> pd.Series: logger.info(f'{row[ID]}: extracting stats.') + if row.isnull().values.any(): + logger.warning(f'{row[ID]}: the row contains Null. 
') + try: issues: List[BaseIssue] = json.loads(row[RAW_ISSUES], cls=RawIssueDecoder) except (JSONDecodeError, TypeError): @@ -98,14 +101,8 @@ def _extract_stats_from_issues(row: pd.Series) -> pd.Series: row[LANG] = _convert_language_code_to_language(row[ID], row[LANG]) logger.info(f'{row[ID]}: extraction of statistics is complete.') - return row - -def _is_python(language_code: str) -> bool: - try: - return Language(language_code) == Language.PYTHON - except ValueError: - return False + return row def _convert_ratio_to_int(ratio: float): From de45c0906f6451a30e7e4903bada18e217d67956 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 11:44:12 +0300 Subject: [PATCH 28/35] Added isnull --- whitelist.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/whitelist.txt b/whitelist.txt index 18fd4cb7..51ce03fa 100644 --- a/whitelist.txt +++ b/whitelist.txt @@ -189,3 +189,4 @@ dropna sublist dyn setdefault +isnull From 5c4e0d994493295815f992d28093a666f3400b0c Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 11:48:17 +0300 Subject: [PATCH 29/35] typo fix --- .../evaluation/issues_statistics/get_raw_issues_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py b/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py index 74709b15..d2b2c5ff 100644 --- a/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py +++ b/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py @@ -78,7 +78,7 @@ def _extract_stats_from_issues(row: pd.Series) -> pd.Series: logger.info(f'{row[ID]}: extracting stats.') if row.isnull().values.any(): - logger.warning(f'{row[ID]}: the row contains Null. 
') + logger.warning(f'{row[ID]}: the row contains null.') try: issues: List[BaseIssue] = json.loads(row[RAW_ISSUES], cls=RawIssueDecoder) From c7f9efca30bab8b46ae9cc63d559ae5788786b17 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 14:32:54 +0300 Subject: [PATCH 30/35] Fixed tests --- .../issues_statistics/test_get_raw_issues_statistics.py | 6 +++--- ..._df_with_null_raw_issues.csv => target_df_with_null.csv} | 2 +- .../test_files/test_df_with_null.csv | 4 ++++ .../test_files/test_df_with_null_raw_issues.csv | 2 -- 4 files changed, 8 insertions(+), 6 deletions(-) rename test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/{target_df_with_null_raw_issues.csv => target_df_with_null.csv} (87%) create mode 100644 test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_null.csv delete mode 100644 test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_null_raw_issues.csv diff --git a/test/python/evaluation/issues_statistics/test_get_raw_issues_statistics.py b/test/python/evaluation/issues_statistics/test_get_raw_issues_statistics.py index 324be27a..f77bd0b5 100644 --- a/test/python/evaluation/issues_statistics/test_get_raw_issues_statistics.py +++ b/test/python/evaluation/issues_statistics/test_get_raw_issues_statistics.py @@ -54,14 +54,14 @@ def test_get_output_folder(solutions_file_path: Path, output_folder: Optional[Pa @pytest.mark.parametrize(('language_code', 'expected_language'), CONVERT_LANGUAGE_CODE_TO_LANGUAGE_TEST_DATA) def test_convert_language_code_to_language(language_code: str, expected_language: str): - actual_language = _convert_language_code_to_language(fragment_id=0, language_code=language_code) + actual_language = _convert_language_code_to_language(fragment_id='0', language_code=language_code) assert actual_language == expected_language INSPECT_SOLUTIONS_TEST_DATA = [ ( - 'test_df_with_null_raw_issues.csv', - 
'target_df_with_null_raw_issues.csv', + 'test_df_with_null.csv', + 'target_df_with_null.csv', Language.PYTHON.value, ), ( diff --git a/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null.csv similarity index 87% rename from test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null.csv index 980efd7f..cc1ae6ea 100644 --- a/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null_raw_issues.csv +++ b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null.csv @@ -1,3 +1,3 @@ value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines,LINE_LEN_ratio,CODE_STYLE_ratio -0,1,1,1,1,1,0,0,0,0,0,0,0,1,1 +0,2,2,2,2,2,0,0,0,0,0,0,1,2,2 1,0,0,0,0,0,0,0,0,0,0,0,1,0,0 \ No newline at end of file diff --git a/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_null.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_null.csv new file mode 100644 index 00000000..eab06807 --- /dev/null +++ b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_null.csv @@ -0,0 +1,4 @@ +id,lang,code,raw_issues +1,python3,"println(""Hello, World!"")", +2,python3,,"[]" +3,,"println(""Hello, World!"")","[]" diff --git a/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_null_raw_issues.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_null_raw_issues.csv deleted 
file mode 100644 index 1dd62893..00000000 --- a/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/test_files/test_df_with_null_raw_issues.csv +++ /dev/null @@ -1,2 +0,0 @@ -id,lang,code,raw_issues -1,python3,"println(""Hello, World!"")", \ No newline at end of file From 38657fa1abecbde8c18152c02dcdd79231f70f47 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 14:35:01 +0300 Subject: [PATCH 31/35] Added --log-output and fixed null checks --- .../get_raw_issues_statistics.py | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py b/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py index d2b2c5ff..786c9cb2 100644 --- a/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py +++ b/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py @@ -39,7 +39,6 @@ DEFAULT_OUTPUT_FOLDER_NAME = 'raw_issues_statistics' logger = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) def configure_arguments(parser: argparse.ArgumentParser) -> None: @@ -57,6 +56,12 @@ def configure_arguments(parser: argparse.ArgumentParser) -> None: 'If not specified, the datasets will be saved in the folder next to the original one.', ) + parser.add_argument( + '-l', '--log-output', + type=lambda value: Path(value).absolute(), + help='Path where logs will be stored. 
If not specified, then logs will be output to stderr.', + ) + def _convert_language_code_to_language(fragment_id: str, language_code: str) -> str: language_version = LanguageVersion.from_value(language_code) @@ -75,10 +80,15 @@ def _convert_language_code_to_language(fragment_id: str, language_code: str) -> def _extract_stats_from_issues(row: pd.Series) -> pd.Series: - logger.info(f'{row[ID]}: extracting stats.') + print(f'{row[ID]}: extracting stats.') + + if pd.isnull(row[CODE]): + logger.warning(f'{row[ID]}: no code.') + row[CODE] = "" - if row.isnull().values.any(): - logger.warning(f'{row[ID]}: the row contains null.') + if pd.isnull(row[CODE]): + logger.warning(f'{row[ID]}: no lang.') + row[LANG] = "" try: issues: List[BaseIssue] = json.loads(row[RAW_ISSUES], cls=RawIssueDecoder) @@ -100,7 +110,7 @@ def _extract_stats_from_issues(row: pd.Series) -> pd.Series: row[LANG] = _convert_language_code_to_language(row[ID], row[LANG]) - logger.info(f'{row[ID]}: extraction of statistics is complete.') + print(f'{row[ID]}: extraction of statistics is complete.') return row @@ -199,8 +209,19 @@ def _save_stats(stats_by_lang: Dict[str, pd.DataFrame], solutions_file_path: Pat configure_arguments(parser) args = parser.parse_args() + if args.log_output is not None: + args.log_output.parent.mkdir(parents=True, exist_ok=True) + + logging.basicConfig( + filename=args.log_output, filemode="w", level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s', + ) + solutions_with_raw_issues = get_solutions_df_by_file_path(args.solutions_with_raw_issues) + logger.info("Dataset inspection started.") + stats_by_lang = inspect_raw_issues(solutions_with_raw_issues) + logger.info("Dataset inspection finished.") + _save_stats(stats_by_lang, args.solutions_with_raw_issues, args.output) From c39599dd2e686492ed51d8882cb48837d59d34a0 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 14:36:08 +0300 Subject: [PATCH 32/35] Added filemode --- whitelist.txt | 1 + 1 file 
changed, 1 insertion(+) diff --git a/whitelist.txt b/whitelist.txt index 51ce03fa..1d4af387 100644 --- a/whitelist.txt +++ b/whitelist.txt @@ -190,3 +190,4 @@ sublist dyn setdefault isnull +filemode From 31c34ed07387d8818460fbe4e5ea33788e48ee22 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 14:44:10 +0300 Subject: [PATCH 33/35] typo fix --- .../evaluation/issues_statistics/get_raw_issues_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py b/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py index 786c9cb2..b32fe4eb 100644 --- a/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py +++ b/src/python/evaluation/issues_statistics/get_raw_issues_statistics.py @@ -86,7 +86,7 @@ def _extract_stats_from_issues(row: pd.Series) -> pd.Series: logger.warning(f'{row[ID]}: no code.') row[CODE] = "" - if pd.isnull(row[CODE]): + if pd.isnull(row[LANG]): logger.warning(f'{row[ID]}: no lang.') row[LANG] = "" From 8780d0e7034298945710ff045eb98761cfadd857 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 14:44:19 +0300 Subject: [PATCH 34/35] Fixed tests --- .../issues_statistics/test_get_raw_issues_statistics.py | 7 ++++++- ...get_df_with_null.csv => target_df_with_null_python.csv} | 0 2 files changed, 6 insertions(+), 1 deletion(-) rename test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/{target_df_with_null.csv => target_df_with_null_python.csv} (100%) diff --git a/test/python/evaluation/issues_statistics/test_get_raw_issues_statistics.py b/test/python/evaluation/issues_statistics/test_get_raw_issues_statistics.py index f77bd0b5..24c10a67 100644 --- a/test/python/evaluation/issues_statistics/test_get_raw_issues_statistics.py +++ b/test/python/evaluation/issues_statistics/test_get_raw_issues_statistics.py @@ -61,9 +61,14 @@ def test_convert_language_code_to_language(language_code: str, 
expected_language INSPECT_SOLUTIONS_TEST_DATA = [ ( 'test_df_with_null.csv', - 'target_df_with_null.csv', + 'target_df_with_null_python.csv', Language.PYTHON.value, ), + ( + 'test_df_with_null.csv', + 'target_df_with_null_unknown.csv', + '', + ), ( 'test_df_with_empty_raw_issues.csv', 'target_df_with_empty_raw_issues.csv', diff --git a/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null_python.csv similarity index 100% rename from test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null.csv rename to test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null_python.csv From 37a939352201f64ff8d5872dbc03831220f259b4 Mon Sep 17 00:00:00 2001 From: Ilya Vlasov Date: Fri, 30 Jul 2021 14:53:11 +0300 Subject: [PATCH 35/35] Fixed tests --- .../target_files/target_df_with_null_unknown.csv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null_unknown.csv diff --git a/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null_unknown.csv b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null_unknown.csv new file mode 100644 index 00000000..980efd7f --- /dev/null +++ b/test/resources/evaluation/issues_statistics/get_raw_issues_statistics/target_files/target_df_with_null_unknown.csv @@ -0,0 +1,3 @@ +value,CODE_STYLE,BEST_PRACTICES,ERROR_PRONE,COMPLEXITY,INFO,LINE_LEN,FUNC_LEN,BOOL_EXPR_LEN,CYCLOMATIC_COMPLEXITY,MAINTAINABILITY,COHESION,total_lines,LINE_LEN_ratio,CODE_STYLE_ratio +0,1,1,1,1,1,0,0,0,0,0,0,0,1,1 +1,0,0,0,0,0,0,0,0,0,0,0,1,0,0 \ No newline at end of file