Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
2ef00d9
Fixed get_output_path test data
GirZ0n Jul 20, 2021
461b511
Moved __get_total_lines to file_system.py
GirZ0n Jul 26, 2021
bec0701
Added get_total_code_lines_from_file and get_total_code_lines_from_code
GirZ0n Jul 26, 2021
e5085cb
Added get_raw_issues_statistics
GirZ0n Jul 26, 2021
71caba1
Renamed main_stats -> freq_stats and other_stats -> ratio_stats
GirZ0n Jul 26, 2021
223c1d1
Small fix
GirZ0n Jul 26, 2021
207e49d
Added new data folders
GirZ0n Jul 26, 2021
df4bef2
Added tests
GirZ0n Jul 26, 2021
63104d8
Removed duplicates
GirZ0n Jul 26, 2021
3fb8b5e
Removed unnecessary line
GirZ0n Jul 26, 2021
2fcfede
Now the script returns only one dataframe
GirZ0n Jul 26, 2021
7152112
Fixed tests
GirZ0n Jul 26, 2021
49fec5e
Added new tests
GirZ0n Jul 26, 2021
531ee40
Added logger and small code refactoring
GirZ0n Jul 26, 2021
d7f98bc
Added some more logging
GirZ0n Jul 26, 2021
2bb65a8
Fixed test
GirZ0n Jul 26, 2021
c3bbf45
Fixed test
GirZ0n Jul 26, 2021
117961c
Fixed help message
GirZ0n Jul 26, 2021
32d8dc9
Update README.md
GirZ0n Jul 26, 2021
39106af
statistics -> issues_statistics
GirZ0n Jul 27, 2021
28a9fe6
Fixed flake8
GirZ0n Jul 27, 2021
153ec3a
Added from_value function
GirZ0n Jul 30, 2021
bd14b02
Added comment
GirZ0n Jul 30, 2021
bf5b649
Added get_ratio
GirZ0n Jul 30, 2021
2bc1d17
Small refactoring: added get_ratio
GirZ0n Jul 30, 2021
000c1cb
Fixed PR issues
GirZ0n Jul 30, 2021
255c309
Merge branch 'develop' into statistics
GirZ0n Jul 30, 2021
06d687e
Small fixes
GirZ0n Jul 30, 2021
b1c988e
Merge remote-tracking branch 'origin/statistics' into statistics
GirZ0n Jul 30, 2021
de45c09
Added isnull
GirZ0n Jul 30, 2021
5c4e0d9
typo fix
GirZ0n Jul 30, 2021
c7f9efc
Fixed tests
GirZ0n Jul 30, 2021
38657fa
Added --log-output and fixed null checks
GirZ0n Jul 30, 2021
c39599d
Added filemode
GirZ0n Jul 30, 2021
31c34ed
typo fix
GirZ0n Jul 30, 2021
8780d0e
Fixed tests
GirZ0n Jul 30, 2021
37a9393
Fixed tests
GirZ0n Jul 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions src/python/evaluation/issues_statistics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,33 @@ Run the [get_raw_issues.py](get_raw_issues.py) with the arguments from command l
| **‑‑to‑save‑path** | Allows to save the path to the file where the issue was found. By default, the path is not saved. |
| **‑o**, **‑‑output** | Path where the dataset with raw issues will be saved. If not specified, the dataset will be saved next to the original one. |
| **‑l**, **‑‑log-output** | Path where logs will be stored. If not specified, then logs will be output to stderr. |

## Get raw issues statistics
The script takes the dataframe obtained after executing [get_raw_issues.py](get_raw_issues.py) and outputs dataframes with statistics grouped by language.

The input dataset must have 4 obligatory columns:
- `id`
- `code`
- `lang`
- `raw_issues`

Possible values for column `lang` are: `python3`, `kotlin`, `javascript`, `java7`, `java8`, `java9`, `java11`, `java15`.

The output files are new `xlsx` or `csv` files which contain the `value` column and the columns responsible for the statistics of each category.

The `value` column shows the metric value (for measurable issue categories), quantity (for quantitative issue categories) or `ratio * 100` (for `CODE_STYLE` and `LINE_LEN`), where `ratio` is calculated as in the corresponding rules (`CodeStyleRule` and `LineLengthRule`).

The table cells indicate how often the value occurs in one fragment (for quantitative categories) or in all fragments (for measurable categories).

All output datasets are arranged in folders according to language.

### Usage
Run the [get_raw_issues_statistics.py](get_raw_issues_statistics.py) with the arguments from command line.

**Required arguments:**
- `solutions_with_raw_issues` — path to an xlsx- or csv-file with code samples and raw issues, which were received with [get_raw_issues.py](get_raw_issues.py).

**Optional arguments:**
| Argument | Description |
|----------|-------------|
| **‑o**, **‑‑output** | Path to the folder where datasets with statistics will be saved. If not specified, the datasets will be saved in the folder next to the original dataset. |
227 changes: 227 additions & 0 deletions src/python/evaluation/issues_statistics/get_raw_issues_statistics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
import argparse
import json
import logging
import sys
from collections import Counter
from json import JSONDecodeError
from pathlib import Path
from typing import Dict, List, Optional

sys.path.append('')
sys.path.append('../../..')

import pandas as pd
from pandarallel import pandarallel
from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path, write_df_to_file
from src.python.evaluation.common.util import ColumnName
from src.python.evaluation.issues_statistics.common.raw_issue_encoder_decoder import RawIssueDecoder
from src.python.evaluation.issues_statistics.get_raw_issues import RAW_ISSUES
from src.python.review.application_config import LanguageVersion
from src.python.review.common.file_system import Extension, get_parent_folder, get_total_code_lines_from_code
from src.python.review.common.language import Language
from src.python.review.inspectors.issue import BaseIssue, ISSUE_TYPE_TO_CLASS, IssueType, Measurable
from src.python.review.quality.rules.code_style_scoring import CodeStyleRule
from src.python.review.quality.rules.line_len_scoring import LineLengthRule
from src.python.review.reviewers.utils.code_statistics import get_code_style_lines

ID = ColumnName.ID.value
LANG = ColumnName.LANG.value
CODE = ColumnName.CODE.value

CODE_STYLE_LINES = f'{IssueType.CODE_STYLE.value}_lines'
CODE_STYLE_RATIO = f'{IssueType.CODE_STYLE.value}_ratio'
LINE_LEN_NUMBER = f'{IssueType.LINE_LEN.value}_number'
LINE_LEN_RATIO = f'{IssueType.LINE_LEN.value}_ratio'
TOTAL_LINES = 'total_lines'
VALUE = 'value'

OUTPUT_DF_NAME = 'stats'
DEFAULT_OUTPUT_FOLDER_NAME = 'raw_issues_statistics'

logger = logging.getLogger(__name__)


def configure_arguments(parser: argparse.ArgumentParser) -> None:
    """
    Register the command-line arguments of the script on the given parser.

    Adds the positional path to the dataset with raw issues and two optional paths:
    the output folder and the log file. Every path is made absolute while parsing.
    """
    def to_absolute_path(value: str) -> Path:
        return Path(value).absolute()

    dataset_help = (
        'Local XLSX-file or CSV-file path. Your file must include column-names: '
        f'"{ID}", "{CODE}", "{LANG}", and "{RAW_ISSUES}".'
    )
    parser.add_argument('solutions_with_raw_issues', type=to_absolute_path, help=dataset_help)

    output_help = (
        'Path to the folder where datasets with statistics will be saved. '
        'If not specified, the datasets will be saved in the folder next to the original one.'
    )
    parser.add_argument('-o', '--output', type=to_absolute_path, help=output_help)

    log_help = 'Path where logs will be stored. If not specified, then logs will be output to stderr.'
    parser.add_argument('-l', '--log-output', type=to_absolute_path, help=log_help)


def _convert_language_code_to_language(fragment_id: str, language_code: str) -> str:
    """
    Convert a language version code into the value of the corresponding language.

    If either the language version or the language cannot be determined,
    a warning mentioning the fragment id is logged and the code is returned as is.
    """
    version = LanguageVersion.from_value(language_code)
    if version is None:
        logger.warning(f'{fragment_id}: it was not possible to determine the language version from "{language_code}".')
        return language_code

    resolved_language = Language.from_language_version(version)
    if resolved_language != Language.UNKNOWN:
        return resolved_language.value

    logger.warning(f'{fragment_id}: it was not possible to determine the language from "{version}".')
    return language_code


def _extract_stats_from_issues(row: pd.Series) -> pd.Series:
    """
    Add statistics about the raw issues of a single fragment to its row.

    For every issue type the row gets either a list of measures (measurable issues)
    or the number of issues of this type. The number of lines with code style issues,
    the number of line length issues and the total number of code lines are stored too,
    and the language code is replaced with the language it belongs to.
    """
    print(f'{row[ID]}: extracting stats.')

    # Missing values are replaced with empty strings before they are used below.
    if pd.isnull(row[CODE]):
        logger.warning(f'{row[ID]}: no code.')
        row[CODE] = ""

    if pd.isnull(row[LANG]):
        logger.warning(f'{row[ID]}: no lang.')
        row[LANG] = ""

    try:
        decoded_issues: List[BaseIssue] = json.loads(row[RAW_ISSUES], cls=RawIssueDecoder)
    except (JSONDecodeError, TypeError):
        # Undecodable issues are treated as if the fragment had no issues at all.
        logger.warning(f'{row[ID]}: failed to decode issues.')
        decoded_issues = []

    issues_per_type = Counter(issue.type for issue in decoded_issues)

    for issue_type, issue_class in ISSUE_TYPE_TO_CLASS.items():
        if not issubclass(issue_class, Measurable):
            row[issue_type.value] = issues_per_type[issue_type]
            continue

        row[issue_type.value] = [issue.measure() for issue in decoded_issues if isinstance(issue, issue_class)]

    # The order of the assignments defines the order of the new entries in the row.
    row[CODE_STYLE_LINES] = get_code_style_lines(decoded_issues)
    row[LINE_LEN_NUMBER] = issues_per_type[IssueType.LINE_LEN]
    row[TOTAL_LINES] = get_total_code_lines_from_code(row[CODE])

    row[LANG] = _convert_language_code_to_language(row[ID], row[LANG])

    print(f'{row[ID]}: extraction of statistics is complete.')

    return row


def _convert_ratio_to_int(ratio: float) -> int:
    """
    Round the ratio to 2 decimal places and return it as an integer number of hundredths.

    The scaled value is rounded to the nearest integer instead of being truncated:
    because of binary floating point, e.g. 0.29 * 100 equals 28.999999999999996,
    so taking the integer part would give 28 instead of 29.
    """
    return int(round(round(ratio, 2) * 100))


def _group_stats_by_lang(df_with_stats: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    """
    Build a table with value frequencies for every language found in the dataframe.

    Each table has the values in a separate column and one frequency column per issue type,
    plus frequency columns for the total number of lines and for the line length
    and code style ratios (converted to integers).
    """
    logger.info('The grouping of statistics by language has started.')

    fragments_by_lang = df_with_stats.groupby(LANG)
    stats_by_lang = {}

    for lang in fragments_by_lang.groups:
        logger.info(f'"{lang}" statistics grouping started.')

        fragments = fragments_by_lang.get_group(lang)

        # Measurable issue types store a list of measures per fragment, so the lists are flattened first.
        frequencies = [
            (
                fragments[issue_type.value].explode()
                if issubclass(issue_class, Measurable)
                else fragments[issue_type.value]
            ).value_counts()
            for issue_type, issue_class in ISSUE_TYPE_TO_CLASS.items()
        ]

        frequencies.append(fragments[TOTAL_LINES].value_counts())

        line_len_ratios = (
            fragments
            .apply(lambda row: LineLengthRule.get_ratio(row[LINE_LEN_NUMBER], row[TOTAL_LINES]), axis=1)
            .apply(_convert_ratio_to_int)
            .rename(LINE_LEN_RATIO)
        )
        frequencies.append(line_len_ratios.value_counts())

        code_style_ratios = (
            fragments
            .apply(
                lambda row: CodeStyleRule.get_ratio(
                    row[CODE_STYLE_LINES], row[TOTAL_LINES], Language.from_value(str(lang), default=Language.UNKNOWN),
                ),
                axis=1,
            )
            .apply(_convert_ratio_to_int)
            .rename(CODE_STYLE_RATIO)
        )
        frequencies.append(code_style_ratios.value_counts())

        # Values missing in a column did not occur there, so their frequency is zero.
        frequency_table = pd.concat(frequencies, axis=1).fillna(0).astype(int)

        # Put values in a separate column
        frequency_table.index.name = VALUE
        stats_by_lang[str(lang)] = frequency_table.reset_index()

        logger.info(f'"{lang}" statistics grouping finished.')

    logger.info('The grouping of statistics by language has finished.')

    return stats_by_lang


def inspect_raw_issues(solutions_with_raw_issues: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    """
    Extract statistics from every row of the dataset and group them by language.

    The rows are processed with pandarallel's ``parallel_apply``.
    """
    pandarallel.initialize()

    df_with_stats = solutions_with_raw_issues.parallel_apply(_extract_stats_from_issues, axis=1)
    stats_by_lang = _group_stats_by_lang(df_with_stats)

    return stats_by_lang


def _get_output_folder(solutions_file_path: Path, output_folder: Optional[Path]):
    """
    Return the folder where the statistics should be saved.

    The explicitly passed folder wins; otherwise the default folder
    inside the parent folder of the dataset is used.
    """
    if output_folder is None:
        return get_parent_folder(solutions_file_path) / DEFAULT_OUTPUT_FOLDER_NAME

    return output_folder


def _save_stats(stats_by_lang: Dict[str, pd.DataFrame], solutions_file_path: Path, output_path: Optional[Path]) -> None:
    """
    Save the statistics of every language to its own folder inside the output folder.

    The files get the same extension as the original dataset.
    """
    root_folder = _get_output_folder(solutions_file_path, output_path)
    extension = Extension.get_extension_from_file(str(solutions_file_path))

    logger.info(f'Saving statistics to a folder: {root_folder}.')

    for lang, stats in stats_by_lang.items():
        target_folder = root_folder / lang
        target_folder.mkdir(parents=True, exist_ok=True)

        target_file = target_folder / f'{OUTPUT_DF_NAME}{extension.value}'
        write_df_to_file(stats, target_file, extension)

    logger.info('Saving statistics is complete.')


if __name__ == "__main__":
    # Script entry point: parse the arguments, set up logging, collect the statistics and save them.
    arguments_parser = argparse.ArgumentParser()
    configure_arguments(arguments_parser)
    args = arguments_parser.parse_args()

    log_path = args.log_output
    if log_path is not None:
        # Make sure the folder for the log file exists before logging is configured.
        log_path.parent.mkdir(parents=True, exist_ok=True)

    logging.basicConfig(
        filename=log_path,
        filemode="w",
        level=logging.INFO,
        format='%(asctime)s | %(levelname)s | %(message)s',
    )

    dataset = get_solutions_df_by_file_path(args.solutions_with_raw_issues)

    logger.info("Dataset inspection started.")
    collected_stats = inspect_raw_issues(dataset)
    logger.info("Dataset inspection finished.")

    _save_stats(collected_stats, args.solutions_with_raw_issues, args.output)
7 changes: 7 additions & 0 deletions src/python/review/application_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,10 @@ def is_java(self) -> bool:
or self == LanguageVersion.JAVA_11
or self == LanguageVersion.JAVA_15
)

@classmethod
def from_value(cls, value: str, default=None):
    """
    Return the member of this class that corresponds to the given value.

    If the lookup raises ValueError, the default is returned instead.
    """
    try:
        # Use cls so that the alternate constructor does not depend on a hard-coded class name.
        return cls(value)
    except ValueError:
        return default
19 changes: 19 additions & 0 deletions src/python/review/common/file_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,3 +233,22 @@ def copy_directory(source: Union[str, Path], destination: Union[str, Path], dirs

def copy_file(source: Union[str, Path], destination: Union[str, Path]):
shutil.copy(source, destination)


# Expects a single line: before using it, check that there are no line breaks in the string
def __is_line_empty(line: str) -> bool:
    return not line.strip()


def __is_comment(line: str) -> bool:
    stripped_line = line.strip()
    return stripped_line.startswith('#') or stripped_line.startswith('//')


def get_total_code_lines_from_file(path: Path) -> int:
    """
    Count the code lines in the content of the given file.
    """
    return get_total_code_lines_from_code(get_content_from_file(path, to_strip_nl=False))


def get_total_code_lines_from_code(code: str) -> int:
    """
    Count the lines that are neither blank nor start with '#' or '//'.
    """
    return sum(
        1
        for line in code.splitlines()
        if not (__is_line_empty(line) or __is_comment(line))
    )
7 changes: 7 additions & 0 deletions src/python/review/common/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ def from_language_version(language_version: LanguageVersion) -> 'Language':
def values(cls) -> List[str]:
return [member.value for member in Language]

@classmethod
def from_value(cls, value: str, default=None):
    """
    Return the member of this class that corresponds to the given value.

    If the lookup raises ValueError, the default is returned instead.
    """
    try:
        # Use cls so that the alternate constructor does not depend on a hard-coded class name.
        return cls(value)
    except ValueError:
        return default


EXTENSION_TO_LANGUAGE = {
Extension.JAVA: Language.JAVA,
Expand Down
14 changes: 10 additions & 4 deletions src/python/review/quality/rules/code_style_scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ def apply(self, n_code_style_lines, n_code_style, total_lines):
self.n_code_style_lines = n_code_style_lines
self.n_code_style = n_code_style

self.get_ratio(n_code_style_lines, n_code_style, total_lines)
self.update_quality(n_code_style_lines, n_code_style)
self.ratio = self.get_ratio(n_code_style_lines, total_lines, self.config.language)

if self.ratio > self.config.n_code_style_bad:
self.save_quality(QualityType.BAD)
Expand All @@ -84,17 +85,22 @@ def apply(self, n_code_style_lines, n_code_style, total_lines):
if n_code_style_lines > self.config.n_code_style_lines_bad:
self.quality_type = QualityType.BAD

def get_ratio(self, n_code_style_lines, n_code_style, total_lines):
@staticmethod
def get_ratio(n_code_style_lines: int, total_lines: int, language: Language) -> float:
    """
    Return the number of lines with code style issues divided by the number of counted lines.

    For Python all lines are counted, for other languages 4 lines are subtracted from the total.
    The divisor is never less than 1.
    """
    # NOTE(review): the 4 subtracted lines presumably stand for boilerplate lines — confirm.
    counted_lines = total_lines if language == Language.PYTHON else total_lines - 4
    return n_code_style_lines / max(1, counted_lines)

def update_quality(self, n_code_style_lines: int, n_code_style: int):
if self.config.language == Language.PYTHON:
if n_code_style == 1:
self.save_quality(QualityType.MODERATE)
self.ratio = n_code_style_lines / max(1, total_lines)
else:
if n_code_style_lines == 1:
self.save_quality(QualityType.GOOD)
elif n_code_style_lines == 2:
self.save_quality(QualityType.MODERATE)
self.ratio = n_code_style_lines / max(1, total_lines - 4)

def __get_next_quality_type(self) -> QualityType:
if self.quality_type == QualityType.BAD:
Expand Down
6 changes: 5 additions & 1 deletion src/python/review/quality/rules/line_len_scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def __init__(self, config: LineLengthRuleConfig):

# TODO: refactor
def apply(self, n_line_len, n_lines):
self.ratio = n_line_len / max(n_lines, 1)
self.ratio = self.get_ratio(n_line_len, n_lines)
self.n_line_len = n_line_len
self.n_lines = n_lines

Expand Down Expand Up @@ -60,3 +60,7 @@ def merge(self, other: 'LineLengthRule') -> 'LineLengthRule':
result_rule.apply(self.n_line_len + other.n_line_len, self.n_lines + other.n_lines)

return result_rule

@staticmethod
def get_ratio(n_line_len: int, n_lines: int) -> float:
    """
    Return the number of line length issues divided by the number of lines.

    The divisor is never less than 1.
    """
    divisor = max(n_lines, 1)
    return n_line_len / divisor
17 changes: 2 additions & 15 deletions src/python/review/reviewers/utils/code_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path
from typing import Dict, List

from src.python.review.common.file_system import get_content_from_file
from src.python.review.common.file_system import get_total_code_lines_from_file
from src.python.review.inspectors.issue import BaseIssue, IssueType


Expand Down Expand Up @@ -53,19 +53,6 @@ def issue_type_to_statistics_dict(self) -> Dict[IssueType, int]:
}


def __get_total_lines(path: Path) -> int:
lines = get_content_from_file(path, to_strip_nl=False).splitlines()
return len(list(filter(lambda line: not __is_empty(line) and not __is_comment(line), lines)))


def __is_empty(line: str) -> bool:
return len(line.strip()) == 0


def __is_comment(line: str) -> bool:
return line.strip().startswith(('#', '//'))


def get_code_style_lines(issues: List[BaseIssue]) -> int:
code_style_issues = filter(lambda issue: issue.type == IssueType.CODE_STYLE, issues)
line_counter = Counter([issue.line_no for issue in code_style_issues])
Expand Down Expand Up @@ -111,6 +98,6 @@ def gather_code_statistics(issues: List[BaseIssue], path: Path) -> CodeStatistic
coupling=couplings,
weighted_method_complexities=weighted_method_complexities,
method_number=method_numbers,
total_lines=__get_total_lines(path),
total_lines=get_total_code_lines_from_file(path),
code_style_lines=get_code_style_lines(issues),
)
Loading