diff --git a/src/python/evaluation/plots/README.md b/src/python/evaluation/plots/README.md index 6c8c3f11..bacce3d2 100644 --- a/src/python/evaluation/plots/README.md +++ b/src/python/evaluation/plots/README.md @@ -1,7 +1,7 @@ # Hyperstyle evaluation: plots -This module allows you to visualize the data obtained with the [inspectors](../inspectors) module +This module allows you to visualize the data. -## [diffs_plotter.py](diffs_plotter.py) +## Diffs plotter This script allows you to visualize a dataset obtained with [diffs_between_df.py](../inspectors/diffs_between_df.py). The script can build the following charts: @@ -83,3 +83,68 @@ The result will be four graphs (`unique_issues_by_category`, `unique_penalty_iss #### Distribution of influence on penalty by category + +## Raw issues statistics plotter +This script allows you to visualize a dataset obtained with [get_raw_issues_statistics.py](../issues_statistics/get_raw_issues_statistics.py). + +The script can build the following charts: +* Line chart ([Example](#line-chart)) +* Box plot ([Example](#box-plot)) +* Histogram ([Example](#histogram)) + +### Usage +Run the [raw_issues_statistics_plotter.py](raw_issues_statistics_plotter.py) with the arguments from the command line. + +**Required arguments**: +1. `stats` — path to a file with the statistics produced by [get_raw_issues_statistics.py](../issues_statistics/get_raw_issues_statistics.py). +2. `save_dir` — directory where the plotted charts will be saved. +3. `config_path` — path to the yaml file containing information about the charts to be plotted. A description of the config and its example is provided in [this section](#config-1). + +**Optional arguments**: + +Argument | Description +--- | --- +**‑‑file‑extension** | Allows you to select the extension of output files. Available extensions: `.png`, `.jpg`, `.jpeg`, `.webp`, `.svg`, `.pdf`, `.eps`, `.json`. Default is `.svg`. 
+ +### Config +The configuration file is a dictionary in yaml format, where for each column of the original dataset the types of graphs to be plotted are specified. You can also put the common parameters when plotting multiple graphs for one column in a separate `common` group. + +**Possible values of the charts**: +* `line_chart` +* `histogram` +* `box_plot` + +**Possible parameters**: +Parameter | Description +---|--- +**x_axis_name** | Name of the x-axis. The default value depends on the type of chart. +**y_axis_name** | Name of the y-axis. The default value depends on the type of chart. +**boundaries** | Dictionary consisting of pairs `boundary value`: `boundary name` (boundary name may not exist). Allows you to draw vertical or horizontal lines on graphs (depending on the type of plot). By default, the boundaries are not drawn. +**range_of_values** | Allows you to filter the values. It is an array of two values: a and b. Only values that belong to the range [a, b) are taken into account when plotting. By default, all values are taken into account when plotting. +**margin** | Defines the outer margin on all four sides of the chart. The available values are specified in the Enum class `MARGIN` from [plots const file](./common/plotly_consts.py). If not specified, the default value provided by Plotly is used. +**sort_order** | Defines the sorting order of the chart. The available values are specified in the Enum class `SORT_ORDER` from [plots const file](./common/plotly_consts.py). If not specified, the default value provided by Plotly is used. +**color** | Defines the color of the chart. The available values are specified in the Enum class `COLOR` from [plots const file](./common/plotly_consts.py). If not specified, the default value provided by Plotly is used. +**n_bins** | Allows you to adjust the number of bins when plotting a histogram. By default, this value is set by Plotly. 
+ +#### Example of config +```yaml +CYCLOMATIC_COMPLEXITY: + line_chart: + x_axis_name: Cyclomatic complexity value + histogram: + common: + range_of_values: [0, 20] +``` + +The result will be two graphs: line chart and histogram. The values in both charts will be between 0 and 19 inclusive. In the line chart the x-axis will be named "Cyclomatic complexity value". + +### Examples + +#### Line chart + + +#### Box plot + + +#### Histogram + \ No newline at end of file diff --git a/src/python/evaluation/plots/common/utils.py b/src/python/evaluation/plots/common/utils.py index 8fb1673e..5c14d07c 100644 --- a/src/python/evaluation/plots/common/utils.py +++ b/src/python/evaluation/plots/common/utils.py @@ -1,6 +1,6 @@ import os from pathlib import Path -from typing import List, Optional +from typing import Dict, List, Optional import pandas as pd import plotly.express as px @@ -35,9 +35,39 @@ def create_box_plot( margin: Optional[plotly_consts.MARGIN] = None, sort_order: Optional[plotly_consts.SORT_ORDER] = None, color: Optional[plotly_consts.COLOR] = None, + horizontal_lines: Optional[Dict[int, Optional[str]]] = None, ) -> go.Figure: fig = px.box(df, x=x_axis, y=y_axis) - update_figure(fig, margin, sort_order, color) + update_figure(fig, margin=margin, sort_order=sort_order, color=color, horizontal_lines=horizontal_lines) + return fig + + +def create_line_chart( + df: pd.DataFrame, + x_axis: str, + y_axis: str, + margin: Optional[plotly_consts.MARGIN] = None, + color: Optional[plotly_consts.COLOR] = None, + vertical_lines: Optional[Dict[int, Optional[str]]] = None, +) -> go.Figure: + fig = px.line(df, x=x_axis, y=y_axis) + update_figure(fig, margin=margin, color=color, vertical_lines=vertical_lines) + return fig + + +def create_histogram( + df: pd.DataFrame, + x_axis: str, + y_axis: str, + n_bins: Optional[int] = None, + margin: Optional[plotly_consts.MARGIN] = None, + color: Optional[plotly_consts.COLOR] = None, + vertical_lines: Optional[Dict[int, Optional[str]]] = 
None, +) -> go.Figure: + fig = px.histogram(df, x=x_axis, y=y_axis, nbins=n_bins) + update_figure( + fig, margin=margin, color=color, vertical_lines=vertical_lines, x_axis_name=x_axis, y_axis_name=y_axis, + ) return fig @@ -46,6 +76,10 @@ def update_figure( margin: Optional[plotly_consts.MARGIN] = None, sort_order: Optional[plotly_consts.SORT_ORDER] = None, color: Optional[plotly_consts.COLOR] = None, + horizontal_lines: Optional[Dict[int, Optional[str]]] = None, + vertical_lines: Optional[Dict[int, Optional[str]]] = None, + x_axis_name: Optional[str] = None, + y_axis_name: Optional[str] = None, ) -> None: new_layout = {} @@ -55,6 +89,12 @@ def update_figure( if sort_order is not None: new_layout["xaxis"] = {"categoryorder": sort_order.value} + if x_axis_name is not None: + new_layout['xaxis_title'] = x_axis_name + + if y_axis_name is not None: + new_layout['yaxis_title'] = y_axis_name + fig.update_layout(**new_layout) new_trace = {} @@ -64,6 +104,14 @@ def update_figure( fig.update_traces(**new_trace) + if horizontal_lines is not None: + for y, annotation in horizontal_lines.items(): + fig.add_hline(y=y, annotation_text=annotation) + + if vertical_lines is not None: + for x, annotation in vertical_lines.items(): + fig.add_vline(x=x, annotation_text=annotation) + def save_plot( fig: go.Figure, diff --git a/src/python/evaluation/plots/diffs_plotter.py b/src/python/evaluation/plots/diffs_plotter.py index 500b018e..5ecce6f7 100644 --- a/src/python/evaluation/plots/diffs_plotter.py +++ b/src/python/evaluation/plots/diffs_plotter.py @@ -15,13 +15,13 @@ ) from src.python.evaluation.inspectors.print_inspectors_statistics import gather_statistics from src.python.evaluation.plots.common import plotly_consts -from src.python.evaluation.plots.common.plotters import ( +from src.python.evaluation.plots.common.utils import get_supported_extensions, save_plot +from src.python.evaluation.plots.plotters.diffs_plotters import ( get_issues_by_category, 
get_median_penalty_influence_by_category, get_penalty_influence_distribution, get_unique_issues_by_category, ) -from src.python.evaluation.plots.common.utils import get_supported_extensions, save_plot from src.python.review.common.file_system import deserialize_data_from_file, Extension, parse_yaml diff --git a/src/python/evaluation/plots/examples/BEST_PRACTICES_box_plot.png b/src/python/evaluation/plots/examples/BEST_PRACTICES_box_plot.png new file mode 100644 index 00000000..b0108a1a Binary files /dev/null and b/src/python/evaluation/plots/examples/BEST_PRACTICES_box_plot.png differ diff --git a/src/python/evaluation/plots/examples/CODE_STYLE_ratio_histogram.png b/src/python/evaluation/plots/examples/CODE_STYLE_ratio_histogram.png new file mode 100644 index 00000000..6ebce84e Binary files /dev/null and b/src/python/evaluation/plots/examples/CODE_STYLE_ratio_histogram.png differ diff --git a/src/python/evaluation/plots/examples/CYCLOMATIC_COMPLEXITY_line_chart.png b/src/python/evaluation/plots/examples/CYCLOMATIC_COMPLEXITY_line_chart.png new file mode 100644 index 00000000..3cb363e3 Binary files /dev/null and b/src/python/evaluation/plots/examples/CYCLOMATIC_COMPLEXITY_line_chart.png differ diff --git a/src/python/evaluation/plots/plotters/__init__.py b/src/python/evaluation/plots/plotters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/python/evaluation/plots/common/plotters.py b/src/python/evaluation/plots/plotters/diffs_plotters.py similarity index 100% rename from src/python/evaluation/plots/common/plotters.py rename to src/python/evaluation/plots/plotters/diffs_plotters.py diff --git a/src/python/evaluation/plots/plotters/raw_issues_statistics_plotters.py b/src/python/evaluation/plots/plotters/raw_issues_statistics_plotters.py new file mode 100644 index 00000000..c545313d --- /dev/null +++ b/src/python/evaluation/plots/plotters/raw_issues_statistics_plotters.py @@ -0,0 +1,124 @@ +from dataclasses import dataclass +from enum import 
# Plotters for raw issues statistics: each public ``plot_*`` function takes the statistics
# dataframe together with a ``PlotConfig`` and returns a Plotly figure.
from dataclasses import dataclass
from enum import Enum, unique
from typing import Callable, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from src.python.evaluation.issues_statistics.get_raw_issues_statistics import VALUE
from src.python.evaluation.plots.common import plotly_consts
from src.python.evaluation.plots.common.utils import create_box_plot, create_histogram, create_line_chart


@unique
class PlotTypes(Enum):
    """Chart kinds that can be built from raw issues statistics."""

    LINE_CHART = 'line_chart'
    HISTOGRAM = 'histogram'
    BOX_PLOT = 'box_plot'

    def to_plotter_function(self) -> Callable[..., go.Figure]:
        """Return the module-level plotter function that builds this chart kind."""
        # The mapping is built inside the method because the plotter functions
        # are defined further down in this module.
        type_to_function = {
            PlotTypes.LINE_CHART: plot_line_chart,
            PlotTypes.HISTOGRAM: plot_histogram,
            PlotTypes.BOX_PLOT: plot_box_plot,
        }

        return type_to_function[self]


@dataclass
class PlotConfig:
    """Settings of a single chart for a single statistics column."""

    # Name of the statistics column to plot.
    column: str
    # Kind of chart to build.
    type: PlotTypes
    # Axis titles; when None the plotter-specific defaults are used.
    x_axis_name: Optional[str] = None
    y_axis_name: Optional[str] = None
    margin: Optional[plotly_consts.MARGIN] = None
    color: Optional[plotly_consts.COLOR] = None
    # Mapping ``boundary value -> optional annotation``; drawn as vertical lines on
    # line charts and histograms and as horizontal lines on box plots.
    boundaries: Optional[Dict[int, Optional[str]]] = None
    # Only values contained in this range are plotted.
    range_of_values: Optional[range] = None
    # Number of bins; forwarded to the histogram only.
    n_bins: Optional[int] = None


def _prepare_stats(stats: pd.DataFrame, config: PlotConfig, x_axis_name: str, y_axis_name: str) -> pd.DataFrame:
    """
    Build a two-column dataframe (x: value, y: count) for line charts and histograms.

    The ``VALUE`` column and ``config.column`` are selected, optionally filtered by
    ``config.range_of_values``, trailing zero counts are trimmed, gaps between the smallest
    and the largest remaining value are filled with zero counts, and the two columns are
    renamed to ``x_axis_name`` and ``y_axis_name``.

    If nothing is left after filtering and trimming, an empty dataframe with the renamed
    columns is returned.
    """
    # Work on a copy: the selection is a slice of the caller's dataframe and must not be modified in place.
    result_df = stats[[VALUE, config.column]].copy()

    if config.range_of_values is not None:
        result_df = result_df[result_df[VALUE].isin(config.range_of_values)]

    result_df = result_df.set_index(VALUE)

    # Trim trailing zeros
    result_df = result_df.apply(lambda column: np.trim_zeros(column, trim='b')).dropna()

    # Fill in the missing intermediate values with zeros.
    # An empty frame has no min/max index (they are NaN), so there is nothing to fill in.
    if not result_df.empty:
        min_index, max_index = result_df.index.min(), result_df.index.max()
        result_df = result_df.reindex(range(min_index, max_index + 1), fill_value=0)

    result_df = result_df.reset_index()

    return result_df.rename(columns={VALUE: x_axis_name, config.column: y_axis_name})


def _get_axis_names(config: PlotConfig, default_x_axis_name: str, default_y_axis_name: str) -> Tuple[str, str]:
    """Return the (x, y) axis names: the ones from ``config`` when set, the defaults otherwise."""
    x_axis_name = default_x_axis_name if config.x_axis_name is None else config.x_axis_name
    y_axis_name = default_y_axis_name if config.y_axis_name is None else config.y_axis_name
    return x_axis_name, y_axis_name


def plot_line_chart(stats: pd.DataFrame, config: PlotConfig) -> go.Figure:
    """Build a line chart of ``config.column`` against ``VALUE``; boundaries become vertical lines."""
    x_axis_name, y_axis_name = _get_axis_names(
        config, default_x_axis_name='Value', default_y_axis_name='Number of fragments',
    )

    stats = _prepare_stats(stats, config, x_axis_name, y_axis_name)

    return create_line_chart(
        stats, x_axis_name, y_axis_name, margin=config.margin, color=config.color, vertical_lines=config.boundaries,
    )


def plot_histogram(stats: pd.DataFrame, config: PlotConfig) -> go.Figure:
    """Build a histogram of ``config.column`` against ``VALUE``; boundaries become vertical lines."""
    x_axis_name, y_axis_name = _get_axis_names(
        config, default_x_axis_name='Value', default_y_axis_name='Number of fragments',
    )

    stats = _prepare_stats(stats, config, x_axis_name, y_axis_name)

    return create_histogram(
        stats,
        x_axis_name,
        y_axis_name,
        margin=config.margin,
        color=config.color,
        n_bins=config.n_bins,
        vertical_lines=config.boundaries,
    )


def _get_all_values_from_stats(stats: pd.DataFrame, column_name: str) -> List[int]:
    """
    Expand the statistics into a flat list of values.

    Every entry of the ``VALUE`` column is repeated as many times as stated in ``column_name``
    of the same row, keeping the row order.
    """
    # The two columns are read independently (not row by row), so the counts keep their own
    # dtype even if the dataframe contains float columns.
    # NOTE(review): counts are assumed to be non-negative whole numbers - confirm against the dataset.
    counts = stats[column_name].to_numpy().astype(int)
    return np.repeat(stats[VALUE].to_numpy(), counts).tolist()


def plot_box_plot(stats: pd.DataFrame, config: PlotConfig) -> go.Figure:
    """Build a box plot of all values of ``config.column``; boundaries become horizontal lines."""
    x_axis_name, y_axis_name = _get_axis_names(
        config,
        default_x_axis_name="Category",
        default_y_axis_name='Values',
    )

    values = _get_all_values_from_stats(stats, config.column)

    if config.range_of_values is not None:
        values = [value for value in values if value in config.range_of_values]

    values_df = pd.DataFrame.from_dict({x_axis_name: config.column, y_axis_name: values})

    # Margin and color are forwarded just like in the other plotters, so the settings
    # from the config are not silently ignored for box plots.
    return create_box_plot(
        values_df,
        x_axis_name,
        y_axis_name,
        margin=config.margin,
        color=config.color,
        horizontal_lines=config.boundaries,
    )
# Command-line script: reads raw issues statistics and a yaml config, builds the requested
# charts for every configured column and saves them to disk.
import argparse
import sys
from enum import Enum, unique
from pathlib import Path
from typing import Dict, List, Optional

sys.path.append('../../../..')

import pandas as pd
from src.python.evaluation.common.pandas_util import get_solutions_df_by_file_path
from src.python.evaluation.plots.common import plotly_consts
from src.python.evaluation.plots.common.utils import (
    get_supported_extensions,
    save_plot,
)
from src.python.evaluation.plots.plotters.raw_issues_statistics_plotters import PlotConfig, PlotTypes
from src.python.review.common.file_system import Extension, parse_yaml


@unique
class ConfigFields(Enum):
    """Keys that may appear in the yaml config of a column."""

    X_AXIS_NAME = 'x_axis_name'
    Y_AXIS_NAME = 'y_axis_name'
    MARGIN = 'margin'
    COLOR = 'color'
    BOUNDARIES = 'boundaries'
    COMMON = 'common'
    RANGE_OF_VALUES = 'range_of_values'
    N_BINS = 'n_bins'


X_AXIS_NAME = ConfigFields.X_AXIS_NAME.value
Y_AXIS_NAME = ConfigFields.Y_AXIS_NAME.value
MARGIN = ConfigFields.MARGIN.value
COLOR = ConfigFields.COLOR.value
BOUNDARIES = ConfigFields.BOUNDARIES.value
COMMON = ConfigFields.COMMON.value
RANGE_OF_VALUES = ConfigFields.RANGE_OF_VALUES.value
N_BINS = ConfigFields.N_BINS.value


def configure_arguments(parser: argparse.ArgumentParser) -> None:
    """Register the positional arguments and the ``--file-extension`` option on ``parser``."""
    parser.add_argument(
        'stats',
        type=lambda value: Path(value).absolute(),
        help='Path to dataset with statistics.',
    )

    parser.add_argument(
        'save_dir',
        type=lambda value: Path(value).absolute(),
        help='The directory where the plotted charts will be saved.',
    )

    parser.add_argument(
        'config_path',
        type=lambda value: Path(value).absolute(),
        help='Path to the yaml file containing information about the graphs to be plotted.',
    )

    parser.add_argument(
        '--file-extension',
        type=str,
        default=Extension.SVG.value,
        choices=get_supported_extensions(),
        help='Allows you to select the extension of output files.',
    )


def _get_plot_config(
    column_name: str,
    plot_type: str,
    plot_config: Optional[Dict],
    common: Optional[Dict],
) -> PlotConfig:
    """
    Build the ``PlotConfig`` of one chart.

    :param column_name: name of the statistics column the chart is built for.
    :param plot_type: chart kind; matched case-insensitively against ``PlotTypes`` values.
    :param plot_config: chart-specific settings, may be None.
    :param common: settings shared by all charts of the column, may be None.
    :raises ValueError: if ``plot_type`` is not a value of ``PlotTypes``.
    :raises KeyError: if the margin or the color is not a member name of the corresponding enum.
    """
    params = {'column': column_name, 'type': PlotTypes(plot_type.lower())}

    # Chart-specific settings are applied last, so they override the shared ones.
    for settings in (common, plot_config):
        if settings is not None:
            params.update(settings)

    # Textual values are converted to the objects ``PlotConfig`` expects.
    # A key that is present but empty (null in yaml) is treated as "not specified".
    if params.get(MARGIN) is not None:
        params[MARGIN] = plotly_consts.MARGIN[params[MARGIN].upper()]

    if params.get(COLOR) is not None:
        params[COLOR] = plotly_consts.COLOR[params[COLOR].upper()]

    if params.get(RANGE_OF_VALUES) is not None:
        params[RANGE_OF_VALUES] = range(*params[RANGE_OF_VALUES])

    return PlotConfig(**params)


def get_plot_configs(column_name: str, column_config: Optional[Dict]) -> List[PlotConfig]:
    """
    Build the configs of all charts requested for one column.

    Every key of ``column_config`` except the ``common`` group is treated as a chart kind.
    The passed dictionary is left untouched; an empty or missing config yields no charts.
    """
    if not column_config:
        return []

    # Read (instead of pop) the shared settings, so that the caller's config is not modified.
    common = column_config.get(COMMON)

    return [
        _get_plot_config(column_name, plot_type, plot_config, common)
        for plot_type, plot_config in column_config.items()
        if plot_type != COMMON
    ]


def plot_and_save(config: Dict, stats: pd.DataFrame, save_dir: Path, extension: Extension) -> None:
    """Build every chart described in ``config`` and save it to ``save_dir/<column>/<column>_<chart kind>``."""
    for column_name, column_config in config.items():
        for plot_config in get_plot_configs(column_name, column_config):
            plotter_function = plot_config.type.to_plotter_function()
            plot = plotter_function(stats, plot_config)
            subdir = save_dir / plot_config.column
            save_plot(plot, subdir, plot_name=f'{plot_config.column}_{plot_config.type.value}', extension=extension)


def main():
    """Parse the command line, load the statistics and the config, then plot and save the charts."""
    parser = argparse.ArgumentParser()
    configure_arguments(parser)
    args = parser.parse_args()

    stats = get_solutions_df_by_file_path(args.stats)

    extension = Extension(args.file_extension)
    config = parse_yaml(args.config_path)

    plot_and_save(config, stats, args.save_dir, extension)


if __name__ == "__main__":
    main()