diff --git a/src/python/evaluation/paper_evaluation/issues_statistics/README.md b/src/python/evaluation/paper_evaluation/issues_statistics/README.md new file mode 100644 index 00000000..0c185a79 --- /dev/null +++ b/src/python/evaluation/paper_evaluation/issues_statistics/README.md @@ -0,0 +1,107 @@ +# Raw issue statistics visualization + +This script allows you to visualize raw issue statistics for a paper. + +## Usage +Run the [raw_issues_statistics_visualization.py](./raw_issues_statistics_visualization.py) with the arguments from the command line. + +**Required arguments**: + +- `stats_path` — path to a file with stats that were found by [get_raw_issues_statistics.py](../../issues_statistics/get_raw_issues_statistics.py). Must be an xlsx or csv file. +- `config_path` — path to the yaml file containing information about the charts to be plotted. A description of the config and its example is provided in [this section](#config). +- `save_dir` — directory where the plotted charts will be saved. + +**Optional arguments**: +Argument | Description +--- | --- +**‑‑file‑extension** | Allows you to select the extension of output files. Available extensions: `.png`, `.jpg`, `.jpeg`, `.webp`, `.svg`, `.pdf`, `.eps`, `.json`. Default is `.svg`. + +## Config +The configuration file is a yaml file where each group name has its config. The group config contains `plot_config` and configs for each column of statistics. + +The `plot_config` consists of the following parameters: +- `rows` — number of rows. Default: `1`. +- `cols` — number of cols. Default: `1`. +- `height` — graph height. Default: `800`. +- `width` — graph width. Default: `1600`. +- `x_axis_name` — name of the x-axis. Default: `Value`. +- `y_axis_name` — name of the y-axis. Default: `Quantity`. +- `specs` — configuration of traces on the graph. See [documentation](https://plotly.com/python-api-reference/generated/plotly.subplots.make_subplots.html) for details. Default: `None`. 
+ +The column config consists of the following arguments: +- `range_of_values` — allows you to filter the values. It is an array of two values: a and b. Only values that belong to the range [a, b) are taken into account when plotting. By default, all values are taken into account when plotting. +- `trace_name` — trace name. The default is the name of the column. + +## Examples +### config.yaml +```yaml +measurable: + plot_config: + rows: 2 + cols: 2 + specs: [[{}, {}], [{colspan: 2}, null]] + x_axis_name: Measure + y_axis_name: Number of issues + BOOL_EXPR_LEN: + range_of_values: [1, 11] + trace_name: Boolean Expresion Length + CYCLOMATIC_COMPLEXITY: + range_of_values: [1, 11] + trace_name: Cyclomatic Complexity + FUNC_LEN: + range_of_values: [0, 60] + trace_name: Function Length + +maintainability_and_cohesion: + plot_config: + rows: 2 + width: 1000 + x_axis_name: Lack of measure (%) + y_axis_name: Number of issues + MAINTAINABILITY: + trace_name: Maintainability + COHESION: + trace_name: Cohesion + +ratio: + plot_config: + rows: 2 + width: 1000 + x_axis_name: Ratio (%) + y_axis_name: Number of fragments + CODE_STYLE_ratio: + range_of_values: [ 1, 101 ] + trace_name: Code Style + LINE_LEN_ratio: + range_of_values: [ 1, 101 ] + trace_name: Line Length + +countable: + plot_config: + rows: 2 + cols: 2 + specs: [[{"rowspan": 2}, {}], [null, {}]] + x_axis_name: Number of issues in one fragment + y_axis_name: Number of fragments + ERROR_PRONE: + range_of_values: [ 0, 10 ] + trace_name: Error Prone + BEST_PRACTICES: + range_of_values: [ 0, 10 ] + trace_name: Best Practices + COMPLEXITY: + range_of_values: [ 0, 10 ] + trace_name: Complexity +``` + +### measurable.png + + +### maintainability_and_cohesion.png + + +### ratio.png + + +### countable.png + diff --git a/src/python/evaluation/paper_evaluation/issues_statistics/__init__.py b/src/python/evaluation/paper_evaluation/issues_statistics/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/python/evaluation/paper_evaluation/issues_statistics/examples/countable.png b/src/python/evaluation/paper_evaluation/issues_statistics/examples/countable.png new file mode 100644 index 00000000..28f86363 Binary files /dev/null and b/src/python/evaluation/paper_evaluation/issues_statistics/examples/countable.png differ diff --git a/src/python/evaluation/paper_evaluation/issues_statistics/examples/maintainability_and_cohesion.png b/src/python/evaluation/paper_evaluation/issues_statistics/examples/maintainability_and_cohesion.png new file mode 100644 index 00000000..a037a49a Binary files /dev/null and b/src/python/evaluation/paper_evaluation/issues_statistics/examples/maintainability_and_cohesion.png differ diff --git a/src/python/evaluation/paper_evaluation/issues_statistics/examples/measurable.png b/src/python/evaluation/paper_evaluation/issues_statistics/examples/measurable.png new file mode 100644 index 00000000..ae892783 Binary files /dev/null and b/src/python/evaluation/paper_evaluation/issues_statistics/examples/measurable.png differ diff --git a/src/python/evaluation/paper_evaluation/issues_statistics/examples/ratio.png b/src/python/evaluation/paper_evaluation/issues_statistics/examples/ratio.png new file mode 100644 index 00000000..52d9953c Binary files /dev/null and b/src/python/evaluation/paper_evaluation/issues_statistics/examples/ratio.png differ diff --git a/src/python/evaluation/paper_evaluation/issues_statistics/raw_issues_statistics_visualization.py b/src/python/evaluation/paper_evaluation/issues_statistics/raw_issues_statistics_visualization.py new file mode 100644 index 00000000..10450119 --- /dev/null +++ b/src/python/evaluation/paper_evaluation/issues_statistics/raw_issues_statistics_visualization.py @@ -0,0 +1,207 @@ +import argparse +import logging +import sys +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Dict, List, Optional + +import pandas as pd +import plotly.graph_objects as go 
from plotly.subplots import make_subplots
from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path
from src.python.evaluation.plots.common.utils import get_supported_extensions, save_plot
from src.python.evaluation.plots.plotters.raw_issues_statistics_plotters import prepare_stats
from src.python.review.common.file_system import Extension, parse_yaml

logger = logging.getLogger(__name__)

# Color sequence handed to plotly's ``colorway`` layout option in ``_update_fig``.
COLORWAY = ['rgb(47,22,84)', 'rgb(99,47,177)', 'rgb(153,110,216)']


class _ConfigFields(Enum):
    """Keys that are recognised in the yaml config."""

    # Keys of a group's ``plot_config`` section.
    PLOT_CONFIG = 'plot_config'
    ROWS = 'rows'
    COLS = 'cols'
    SPECS = 'specs'
    HEIGHT = 'height'
    WIDTH = 'width'
    X_AXIS_NAME = 'x_axis_name'
    Y_AXIS_NAME = 'y_axis_name'

    # Keys of a per-column (trace) section.
    RANGE_OF_VALUES = 'range_of_values'
    TRACE_NAME = 'trace_name'


_PLOT_CONFIG = _ConfigFields.PLOT_CONFIG.value
_ROWS = _ConfigFields.ROWS.value
_COLS = _ConfigFields.COLS.value
_SPECS = _ConfigFields.SPECS.value
_HEIGHT = _ConfigFields.HEIGHT.value
_WIDTH = _ConfigFields.WIDTH.value
_X_AXIS_NAME = _ConfigFields.X_AXIS_NAME.value
_Y_AXIS_NAME = _ConfigFields.Y_AXIS_NAME.value
_RANGE_OF_VALUES = _ConfigFields.RANGE_OF_VALUES.value
_TRACE_NAME = _ConfigFields.TRACE_NAME.value


@dataclass
class PlotConfig:
    """Layout settings shared by every subplot of one group (one output figure)."""

    name: str
    rows: int = 1
    cols: int = 1
    height: int = 800
    width: int = 1600
    x_axis_name: str = 'Value'
    y_axis_name: str = 'Quantity'
    # Passed as-is to ``make_subplots(specs=...)``; ``None`` cells are skipped when traces are placed.
    specs: Optional[List] = None

    @staticmethod
    def get_from_dict(plot_name: str, config: Optional[Dict]) -> 'PlotConfig':
        """
        Build a ``PlotConfig`` for the group ``plot_name`` from its ``plot_config`` mapping.

        :param plot_name: name of the group the figure is built for.
        :param config: mapping of dataclass field names to values; ``None`` or an empty mapping
            means "use the defaults for everything".
        :raises TypeError: if ``config`` contains a key that is not a field of ``PlotConfig``.
        """
        params = {'name': plot_name}
        # An empty yaml value is usually loaded as None, so None must not be passed to update().
        params.update(config or {})
        return PlotConfig(**params)


@dataclass
class TraceConfig:
    """Settings of a single trace, i.e. of one statistics column."""

    column: str
    # Values of the statistics that are kept; ``None`` keeps all of them.
    range_of_values: Optional[range] = None
    # Legend name of the trace; the column name is used when it is ``None``.
    trace_name: Optional[str] = None

    @staticmethod
    def get_from_dict(column_name: str, config: Optional[Dict]) -> 'TraceConfig':
        """
        Build a ``TraceConfig`` for ``column_name`` from its config mapping.

        ``range_of_values`` is given in the config as the arguments of ``range`` (e.g. ``[1, 11]``)
        and is converted into a ``range`` object here.

        :param column_name: name of the statistics column.
        :param config: mapping of dataclass field names to values; ``None`` or an empty mapping
            means "use the defaults for everything".
        :raises TypeError: if ``config`` contains an unknown key or ``range_of_values`` is not
            a valid argument list for ``range``.
        """
        params = {'column': column_name}
        params.update(config or {})

        values = params.get(_RANGE_OF_VALUES)
        if values is not None:
            params[_RANGE_OF_VALUES] = range(*values)

        return TraceConfig(**params)


def _absolute_path(value: str) -> Path:
    """Convert a command line argument into an absolute path."""
    return Path(value).absolute()


def configure_arguments(parser: argparse.ArgumentParser) -> None:
    """Register the positional and optional command line arguments of the script on ``parser``."""
    parser.add_argument(
        'stats_path',
        type=_absolute_path,
        help='Path to the statistics file. Must be an xlsx or csv file.',
    )

    parser.add_argument(
        'config_path',
        type=_absolute_path,
        help='Path to the yaml file containing information about the graphs to be plotted.',
    )

    parser.add_argument(
        'save_dir',
        type=_absolute_path,
        help='The directory where the plotted charts will be saved.',
    )

    parser.add_argument(
        '--file-extension',
        type=str,
        default=Extension.SVG.value,
        choices=get_supported_extensions(),
        help='Allows you to select the extension of output files.',
    )


def _update_fig(fig: go.Figure, plot_config: PlotConfig) -> None:
    """Apply the common look (size, font, transparent background, colors, axis frame) to ``fig`` in place."""
    fig.update_layout(
        width=plot_config.width,
        height=plot_config.height,
        font_size=22,
        # Alpha 0 makes both backgrounds fully transparent.
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        colorway=COLORWAY,
    )

    axes_common_params = {
        'showline': True,
        'linewidth': 1,
        'linecolor': 'black',
        'mirror': True,
    }

    # Without a row/col selector the update is applied to the axes of every subplot.
    fig.update_xaxes(title=plot_config.x_axis_name, **axes_common_params)
    fig.update_yaxes(title=plot_config.y_axis_name, **axes_common_params)


def build_subplots(df: pd.DataFrame, plot_config: PlotConfig, trace_configs: List[TraceConfig]) -> go.Figure:
    """
    Build one figure with a scatter trace per non-empty grid cell.

    Traces are placed row by row, left to right, in the order of ``trace_configs``. Neither
    ``plot_config`` nor ``trace_configs`` is modified.

    :param df: statistics; passed to ``prepare_stats`` together with the column of each trace.
    :param plot_config: grid and layout settings of the figure.
    :param trace_configs: one config per non-empty cell of the grid.
    :return: the configured figure.
    :raises IndexError: if there are fewer trace configs than non-empty cells.
    """
    fig = make_subplots(
        rows=plot_config.rows,
        cols=plot_config.cols,
        specs=plot_config.specs,
    )

    # Work on a local grid so that the caller's config is left untouched.
    specs = plot_config.specs
    if specs is None:
        specs = [[{} for _ in range(plot_config.cols)] for _ in range(plot_config.rows)]

    pending = iter(trace_configs)
    for row_index, row in enumerate(specs, start=1):
        for column_index, cell in enumerate(row, start=1):
            # A None cell is covered by a rowspan/colspan of another cell or left empty.
            if cell is None:
                continue

            trace_config = next(pending, None)
            if trace_config is None:
                # IndexError is the exception type the entry point reports as a config mismatch.
                raise IndexError(
                    f'{plot_config.name}: there are fewer traces than cells '
                    f'(no trace for row {row_index}, column {column_index}).',
                )

            stats = prepare_stats(
                df,
                trace_config.column,
                trace_config.range_of_values,
                plot_config.x_axis_name,
                plot_config.y_axis_name,
            )

            fig.add_scatter(
                x=stats[plot_config.x_axis_name],
                y=stats[plot_config.y_axis_name],
                col=column_index,
                row=row_index,
                line={'width': 5},
                marker={'size': 10},
                name=trace_config.trace_name if trace_config.trace_name is not None else trace_config.column,
            )

    unused_columns = [trace_config.column for trace_config in pending]
    if unused_columns:
        logger.warning('%s: no free cell for the traces %s, they are not plotted.', plot_config.name, unused_columns)

    _update_fig(fig, plot_config)

    return fig


def plot_and_save(stats: pd.DataFrame, config: Dict, output_dir: Path, extension: Extension) -> None:
    """
    Build a figure for every group of ``config`` and hand it to ``save_plot``.

    ``config`` is only read, never modified. A group without a ``plot_config`` section is plotted
    with the default ``PlotConfig`` values.

    :param stats: statistics shared by all groups.
    :param config: mapping of group names to group configs; every key of a group config except
        ``plot_config`` is treated as a column name.
    :param output_dir: passed to ``save_plot`` together with the group name and ``extension``.
    :param extension: extension of the output files.
    :raises IndexError: if a group has fewer columns than non-empty cells (see ``build_subplots``).
    """
    for group_name, group_config in config.items():
        group_config = group_config or {}

        plot_config = PlotConfig.get_from_dict(group_name, group_config.get(_PLOT_CONFIG))
        trace_configs = [
            TraceConfig.get_from_dict(column_name, column_config)
            for column_name, column_config in group_config.items()
            if column_name != _PLOT_CONFIG
        ]

        subplots = build_subplots(stats, plot_config, trace_configs)
        save_plot(subplots, output_dir, group_name, extension)


def main() -> int:
    """
    Parse the command line, read the config and the statistics, then plot and save all charts.

    :return: ``0`` on success, ``2`` if anything went wrong (the error is logged).
    """
    parser = argparse.ArgumentParser()
    configure_arguments(parser)

    try:
        args = parser.parse_args()

        config = parse_yaml(args.config_path)
        stats = get_solutions_df_by_file_path(args.stats_path)

        plot_and_save(stats, config, args.save_dir, Extension(args.file_extension))

        return 0

    except IndexError:
        logger.error(
            'The number of traces must be consistent with the number of rows and columns, as well as the specs.',
        )
        return 2

    except Exception:
        logger.exception('An unexpected error.')
        return 2


if __name__ == "__main__":
    sys.exit(main())
result_df = result_df[result_df[VALUE].isin(config.range_of_values)] +def prepare_stats( + stats: pd.DataFrame, + column: str, + range_of_values: Optional[range], + x_axis_name: str, + y_axis_name: str, +) -> pd.DataFrame: + result_df = stats[[VALUE, column]] + + if range_of_values is not None: + result_df = result_df[result_df[VALUE].isin(range_of_values)] result_df.set_index(VALUE, inplace=True) @@ -67,25 +73,31 @@ def _prepare_stats(stats: pd.DataFrame, config: PlotConfig, x_axis_name: str, y_ # Fill in the missing intermediate values with zeros min_index, max_index = result_df.index.min(), result_df.index.max() if pd.isna(min_index) or pd.isna(max_index): - logger.warning(f'{config.column}: no data') + logger.warning(f'{column}: no data') else: result_df = result_df.reindex(range(min_index, max_index + 1), fill_value=0) result_df.reset_index(inplace=True) - return result_df.rename(columns={VALUE: x_axis_name, config.column: y_axis_name}) + return result_df.rename(columns={VALUE: x_axis_name, column: y_axis_name}) -def _get_axis_names(config: PlotConfig, default_x_axis_name: str, default_y_axis_name: str) -> Tuple[str, str]: - x_axis_name = default_x_axis_name - if config.x_axis_name is not None: - x_axis_name = config.x_axis_name +def _get_axis_names( + *, + x_axis_name: Optional[str], + y_axis_name: Optional[str], + default_x_axis_name: str, + default_y_axis_name: str, +) -> Tuple[str, str]: + new_x_axis_name = default_x_axis_name + if x_axis_name is not None: + new_x_axis_name = x_axis_name - y_axis_name = default_y_axis_name - if config.y_axis_name is not None: - y_axis_name = config.y_axis_name + new_y_axis_name = default_y_axis_name + if y_axis_name is not None: + new_y_axis_name = y_axis_name - return x_axis_name, y_axis_name + return new_x_axis_name, new_y_axis_name def plot_line_chart( @@ -93,12 +105,17 @@ def plot_line_chart( config: PlotConfig, group_stats: bool, ) -> Dict[str, go.Figure]: - x_axis_name, y_axis_name = _get_axis_names(config, 
default_x_axis_name='Value', default_y_axis_name='Quantity') + x_axis_name, y_axis_name = _get_axis_names( + x_axis_name=config.x_axis_name, + y_axis_name=config.y_axis_name, + default_x_axis_name='Value', + default_y_axis_name='Quantity', + ) if not group_stats: plots = {} for lang, stats in stats_by_lang.items(): - stats = _prepare_stats(stats, config, x_axis_name, y_axis_name) + stats = prepare_stats(stats, config.column, config.range_of_values, x_axis_name, y_axis_name) plots[lang] = create_line_chart( stats, x_axis=x_axis_name, @@ -111,7 +128,7 @@ def plot_line_chart( plot = go.Figure() for lang, stats in stats_by_lang.items(): - stats = _prepare_stats(stats, config, x_axis_name, y_axis_name) + stats = prepare_stats(stats, config.column, config.range_of_values, x_axis_name, y_axis_name) trace = create_scatter_trace(stats, x_column=x_axis_name, y_column=y_axis_name) trace.name = lang plot.add_trace(trace) @@ -134,7 +151,10 @@ def plot_histogram( group_stats: bool, ) -> Dict[str, go.Figure]: x_axis_name, y_axis_name = _get_axis_names( - config, default_x_axis_name='Value', default_y_axis_name='Quantity', + x_axis_name=config.x_axis_name, + y_axis_name=config.y_axis_name, + default_x_axis_name='Value', + default_y_axis_name='Quantity', ) if group_stats: @@ -142,7 +162,7 @@ def plot_histogram( plots = {} for lang, stats in stats_by_lang.items(): - stats = _prepare_stats(stats, config, x_axis_name, y_axis_name) + stats = prepare_stats(stats, config.column, config.range_of_values, x_axis_name, y_axis_name) plots[lang] = create_histogram( stats, x_axis_name, @@ -172,7 +192,8 @@ def plot_box_plot( group_stats: bool, ) -> Dict[str, go.Figure]: x_axis_name, y_axis_name = _get_axis_names( - config, + x_axis_name=config.x_axis_name, + y_axis_name=config.y_axis_name, default_x_axis_name='Category', default_y_axis_name='Values', )