diff --git a/requirements-evaluation.txt b/requirements-evaluation.txt index 82f0fb4c..413e363b 100644 --- a/requirements-evaluation.txt +++ b/requirements-evaluation.txt @@ -2,3 +2,6 @@ openpyxl==3.0.7 pandas==1.2.3 pandarallel numpy~=1.20.2 +plotly +kaleido +pyyaml diff --git a/src/python/evaluation/plots/README.md b/src/python/evaluation/plots/README.md new file mode 100644 index 00000000..6c8c3f11 --- /dev/null +++ b/src/python/evaluation/plots/README.md @@ -0,0 +1,85 @@ +# Hyperstyle evaluation: plots +This module allows you to visualize the data obtained with the [inspectors](../inspectors) module + +## [diffs_plotter.py](diffs_plotter.py) +This script allows you to visualize a dataset obtained with [diffs_between_df.py](../inspectors/diffs_between_df.py). + +The script can build the following charts: +* number of unique issues by category ([Example](#number-of-unique-issues-by-category)) +* number of issues by category ([Example](#number-of-issues-by-category)) +* number of unique penalty issues by category ([Example](#number-of-unique-penalty-issues-by-category)) +* number of penalty issues by category ([Example](#number-of-penalty-issues-by-category)) +* median penalty influence by category ([Example](#median-influence-on-penalty-by-category)) +* distribution of penalty influence by category ([Example](#distribution-of-influence-on-penalty-by-category)) + +### Usage +Run the [diffs_plotter.py](diffs_plotter.py) with the arguments from command line. + +**Required arguments**: +1. `diffs_file_path` — path to a file with serialized diffs that were found by [diffs_between_df.py](../inspectors/diffs_between_df.py). +2. `save_dir` — directory where the plotted charts will be saved. +3. `config_path` — path to the yaml file containing information about the charts to be plotted. A description of the config and its example is provided in [this section](#config). 
+ + +**Optional arguments**: + +Argument | Description +--- | --- +**‑‑file‑extension** | Allows you to select the extension of output files. Available extensions: `.png`, `.jpg`, `.jpeg`, `.webp`, `.svg`, `.pdf`, `.eps`, `.json`. Default is `.svg`. + +### Config +The configuration file is a dictionary in yaml format, where each chart you want to build has its parameters. + +**Possible values of the charts**: +* `unique_issues_by_category` to plot the number of unique issues by category +* `issues_by_category` to plot the number of issues by category +* `unique_penalty_issues_by_category` to plot the number of unique penalty issues by category +* `penalty_issues_by_category` to plot the number of penalty issues by category +* `median_penalty_influence_by_category` to plot the median penalty influence by category +* `penalty_influence_distribution` to plot the distribution of penalty influence by category + +**Possible parameters**: +Parameter | Description +---|--- +**x_axis_name** | Name of the x-axis. The default value depends on the type of chart. +**y_axis_name** | Name of the y-axis. The default value depends on the type of chart. +**limit** | A value that allows you to filter the data before displaying them.

For charts `unique_issues_by_category`, `issues_by_category`, `unique_penalty_issues_by_category` and `penalty_issues_by_category` only those categories will be shown where the number of issues is greater than or equal to the limit.

For chart `median_penalty_influence_by_category` only those categories will be shown where the median value is greater than or equal to the limit.

For chart `penalty_influence_distribution` only those categories will be shown where the number of values is greater than or equal to the limit.

The default value depends on the type of chart. +**margin** | Defines the outer margin on all four sides of the chart. The available values are specified in the Enum class `MARGIN` from [plots const file](./common/plotly_consts.py). If not specified, the default value provided by Plotly is used. +**sort_order** | Defines the sorting order of the chart. The available values are specified in the Enum class `SORT_ORDER` from [plots const file](./common/plotly_consts.py). If not specified, the default value provided by Plotly is used. +**color** | Defines the color of the chart. The available values are specified in the Enum class `COLOR` from [plots const file](./common/plotly_consts.py). If not specified, the default value provided by Plotly is used. + +#### Example of config +```yaml +unique_issues_by_category: + margin: "ZERO" + limit: 10 + sort_order: "total descending" + color: "RED" +unique_penalty_issues_by_category: + limit: 30 + sort_order: "category ascending" +median_penalty_influence_by_category: +penalty_influence_distribution: +``` + +The result will be four graphs (`unique_issues_by_category`, `unique_penalty_issues_by_category`, `median_penalty_influence_by_category`, `penalty_influence_distribution`) with the corresponding parameters. 
+ +### Examples + +#### Number of unique issues by category + + +#### Number of issues by category + + +#### Number of unique penalty issues by category + + +#### Number of penalty issues by category + + +#### Median influence on penalty by category + + +#### Distribution of influence on penalty by category + diff --git a/src/python/evaluation/plots/__init__.py b/src/python/evaluation/plots/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/python/evaluation/plots/common/__init__.py b/src/python/evaluation/plots/common/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/python/evaluation/plots/common/plotly_consts.py b/src/python/evaluation/plots/common/plotly_consts.py new file mode 100644 index 00000000..fc4f703a --- /dev/null +++ b/src/python/evaluation/plots/common/plotly_consts.py @@ -0,0 +1,26 @@ +from enum import Enum + + +class MARGIN(Enum): + ZERO = {'l': 0, 'r': 0, 'b': 0, 't': 0} + + +class SORT_ORDER(Enum): # noqa: N801 + CATEGORY_ASCENDING = 'category ascending' + CATEGORY_DESCENDING = 'category descending' + TOTAL_ASCENDING = 'total ascending' + TOTAL_DESCENDING = 'total descending' + + +class COLOR(Enum): + # Colors from px.colors.DEFAULT_PLOTLY_COLORS + BLUE = "rgb(31, 119, 180)" + ORANGE = "rgb(255, 127, 14)" + GREEN = "rgb(44, 160, 44)" + RED = "rgb(214, 39, 40)" + PURPLE = "rgb(148, 103, 189)" + BROWN = "rgb(140, 86, 75)" + PINK = "rgb(227, 119, 194)" + GRAY = "rgb(127, 127, 127)" + YELLOW = "rgb(188, 189, 34)" + CYAN = "rgb(23, 190, 207)" diff --git a/src/python/evaluation/plots/common/plotters.py b/src/python/evaluation/plots/common/plotters.py new file mode 100644 index 00000000..e97ceb94 --- /dev/null +++ b/src/python/evaluation/plots/common/plotters.py @@ -0,0 +1,131 @@ +from statistics import median +from typing import Any, Callable, Dict, Optional + +import pandas as pd +import plotly.graph_objects as go +from src.python.evaluation.inspectors.common.statistics import IssuesStatistics, 
PenaltyInfluenceStatistics +from src.python.evaluation.plots.common import plotly_consts +from src.python.evaluation.plots.common.utils import create_bar_plot, create_box_plot +from src.python.review.inspectors.issue import IssueType + + +def _get_dataframe_from_dict( + data_dict: Dict[Any, Any], + key_name: str, + value_name: str, + key_mapper: Callable = lambda x: x, + value_mapper: Callable = lambda y: y, +): + """ + Converts 'data_dict' to a dataframe consisting of two columns: 'key_name', 'value_name'. + 'key_name' contains all keys of 'data_dict', 'value_name' contains all corresponding + values of 'data_dict'. With the functions 'key_mapper' and 'value_mapper' you can + additionally convert keys and values respectively. + """ + converted_dict = { + key_name: list(map(key_mapper, data_dict.keys())), + value_name: list(map(value_mapper, data_dict.values())), + } + + return pd.DataFrame.from_dict(converted_dict) + + +def _extract_stats_from_issues_statistics( + statistics: IssuesStatistics, limit: int, only_unique: bool, +) -> Dict[IssueType, int]: + categorized_statistics = statistics.get_short_categorized_statistics() + + # If you want to get only unique issues, you should use position 0 of the tuple, otherwise 1. 
+ position = int(not only_unique) + + return { + issue_type: stat[position] for issue_type, stat in categorized_statistics.items() if stat[position] >= limit + } + + +def get_unique_issues_by_category( + statistics: IssuesStatistics, + x_axis_name: str = 'Categories', + y_axis_name: str = 'Number of unique issues', + limit: int = 0, + margin: Optional[plotly_consts.MARGIN] = None, + sort_order: Optional[plotly_consts.SORT_ORDER] = None, + color: Optional[plotly_consts.COLOR] = None, +) -> go.Figure: + filtered_stats = _extract_stats_from_issues_statistics(statistics, limit, only_unique=True) + + df = _get_dataframe_from_dict( + filtered_stats, + key_name=x_axis_name, + value_name=y_axis_name, + key_mapper=lambda issue_type: issue_type.name, + ) + + return create_bar_plot(df, x_axis_name, y_axis_name, margin, sort_order, color) + + +def get_issues_by_category( + statistics: IssuesStatistics, + x_axis_name: str = 'Categories', + y_axis_name: str = 'Number of issues', + limit: int = 0, + margin: Optional[plotly_consts.MARGIN] = None, + sort_order: Optional[plotly_consts.SORT_ORDER] = None, + color: Optional[plotly_consts.COLOR] = None, +) -> go.Figure: + filtered_stats = _extract_stats_from_issues_statistics(statistics, limit, only_unique=False) + + df = _get_dataframe_from_dict( + filtered_stats, + key_name=x_axis_name, + value_name=y_axis_name, + key_mapper=lambda issue_type: issue_type.name, + ) + + return create_bar_plot(df, x_axis_name, y_axis_name, margin, sort_order, color) + + +def get_median_penalty_influence_by_category( + statistics: PenaltyInfluenceStatistics, + x_axis_name: str = 'Categories', + y_axis_name: str = 'Penalty influence (%)', + limit: int = 0, + margin: Optional[plotly_consts.MARGIN] = None, + sort_order: Optional[plotly_consts.SORT_ORDER] = None, + color: Optional[plotly_consts.COLOR] = None, +) -> go.Figure: + stat = statistics.stat + filtered_stats = {issue_type: influence for issue_type, influence in stat.items() if median(influence) >= 
limit} + + df = _get_dataframe_from_dict( + filtered_stats, + key_name=x_axis_name, + value_name=y_axis_name, + key_mapper=lambda issue_type: issue_type.name, + value_mapper=lambda influence: median(influence), + ) + + return create_bar_plot(df, x_axis_name, y_axis_name, margin, sort_order, color) + + +def get_penalty_influence_distribution( + statistics: PenaltyInfluenceStatistics, + x_axis_name: str = 'Categories', + y_axis_name: str = 'Penalty influence (%)', + limit: int = 0, + margin: Optional[plotly_consts.MARGIN] = None, + sort_order: Optional[plotly_consts.SORT_ORDER] = None, + color: Optional[plotly_consts.COLOR] = None, +): + stat = statistics.stat + filtered_stats = {issue_type: influence for issue_type, influence in stat.items() if len(influence) >= limit} + + df = _get_dataframe_from_dict( + filtered_stats, + key_name=x_axis_name, + value_name=y_axis_name, + key_mapper=lambda issue_type: issue_type.name, + ) + df = df.explode(y_axis_name) + + return create_box_plot(df, x_axis_name, y_axis_name, margin, sort_order, color) diff --git a/src/python/evaluation/plots/common/utils.py b/src/python/evaluation/plots/common/utils.py new file mode 100644 index 00000000..8fb1673e --- /dev/null +++ b/src/python/evaluation/plots/common/utils.py @@ -0,0 +1,76 @@ +import os +from pathlib import Path +from typing import List, Optional + +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +from src.python.evaluation.plots.common import plotly_consts +from src.python.review.common.file_system import Extension + + +def get_supported_extensions() -> List[str]: + extensions = Extension.get_image_extensions() + extensions.append(Extension.JSON) + return [extension.value for extension in extensions] + + +def create_bar_plot( + df: pd.DataFrame, + x_axis: str, + y_axis: str, + margin: Optional[plotly_consts.MARGIN] = None, + sort_order: Optional[plotly_consts.SORT_ORDER] = None, + color: Optional[plotly_consts.COLOR] = None, +) -> go.Figure: + 
fig = px.bar(df, x=x_axis, y=y_axis, text=y_axis) + update_figure(fig, margin, sort_order, color) + return fig + + +def create_box_plot( + df: pd.DataFrame, + x_axis: str, + y_axis: str, + margin: Optional[plotly_consts.MARGIN] = None, + sort_order: Optional[plotly_consts.SORT_ORDER] = None, + color: Optional[plotly_consts.COLOR] = None, +) -> go.Figure: + fig = px.box(df, x=x_axis, y=y_axis) + update_figure(fig, margin, sort_order, color) + return fig + + +def update_figure( + fig: go.Figure, + margin: Optional[plotly_consts.MARGIN] = None, + sort_order: Optional[plotly_consts.SORT_ORDER] = None, + color: Optional[plotly_consts.COLOR] = None, +) -> None: + new_layout = {} + + if margin is not None: + new_layout["margin"] = margin.value + + if sort_order is not None: + new_layout["xaxis"] = {"categoryorder": sort_order.value} + + fig.update_layout(**new_layout) + + new_trace = {} + + if color is not None: + new_trace["marker"] = {"color": color.value} + + fig.update_traces(**new_trace) + + +def save_plot( + fig: go.Figure, + dir_path: Path, + plot_name: str = "result_plot", + extension: Extension = Extension.SVG, +) -> None: + os.makedirs(dir_path, exist_ok=True) + file = dir_path / f"{plot_name}{extension.value}" + fig.write_image(str(file)) diff --git a/src/python/evaluation/plots/diffs_plotter.py b/src/python/evaluation/plots/diffs_plotter.py new file mode 100644 index 00000000..500b018e --- /dev/null +++ b/src/python/evaluation/plots/diffs_plotter.py @@ -0,0 +1,172 @@ +import argparse +import sys +from enum import Enum, unique +from pathlib import Path +from typing import Any, Callable, Dict, Union + +sys.path.append('../../../..') + +import plotly.graph_objects as go +from src.python.common.tool_arguments import RunToolArgument +from src.python.evaluation.inspectors.common.statistics import ( + GeneralInspectorsStatistics, + IssuesStatistics, + PenaltyInfluenceStatistics, +) +from src.python.evaluation.inspectors.print_inspectors_statistics import 
gather_statistics +from src.python.evaluation.plots.common import plotly_consts +from src.python.evaluation.plots.common.plotters import ( + get_issues_by_category, + get_median_penalty_influence_by_category, + get_penalty_influence_distribution, + get_unique_issues_by_category, +) +from src.python.evaluation.plots.common.utils import get_supported_extensions, save_plot +from src.python.review.common.file_system import deserialize_data_from_file, Extension, parse_yaml + + +@unique +class ConfigFields(Enum): + X_AXIS_NAME = 'x_axis_name' + Y_AXIS_NAME = 'y_axis_name' + LIMIT = 'limit' + MARGIN = 'margin' + SORT_ORDER = 'sort_order' + COLOR = 'color' + + +X_AXIS_NAME = ConfigFields.X_AXIS_NAME.value +Y_AXIS_NAME = ConfigFields.Y_AXIS_NAME.value +LIMIT = ConfigFields.LIMIT.value +MARGIN = ConfigFields.MARGIN.value +SORT_ORDER = ConfigFields.SORT_ORDER.value +COLOR = ConfigFields.COLOR.value + + +@unique +class PlotTypes(Enum): + UNIQUE_ISSUES_BY_CATEGORY = 'unique_issues_by_category' + ISSUES_BY_CATEGORY = 'issues_by_category' + UNIQUE_PENALTY_ISSUES_BY_CATEGORY = 'unique_penalty_issues_by_category' + PENALTY_ISSUES_BY_CATEGORY = 'penalty_issues_by_category' + MEDIAN_PENALTY_INFLUENCE_BY_CATEGORY = 'median_penalty_influence_by_category' + PENALTY_INFLUENCE_DISTRIBUTION = 'penalty_influence_distribution' + + def to_plotter_function(self) -> Callable[..., go.Figure]: + type_to_function = { + PlotTypes.UNIQUE_ISSUES_BY_CATEGORY: get_unique_issues_by_category, + PlotTypes.ISSUES_BY_CATEGORY: get_issues_by_category, + PlotTypes.UNIQUE_PENALTY_ISSUES_BY_CATEGORY: get_unique_issues_by_category, + PlotTypes.PENALTY_ISSUES_BY_CATEGORY: get_issues_by_category, + PlotTypes.MEDIAN_PENALTY_INFLUENCE_BY_CATEGORY: get_median_penalty_influence_by_category, + PlotTypes.PENALTY_INFLUENCE_DISTRIBUTION: get_penalty_influence_distribution, + } + + return type_to_function[self] + + def extract_statistics( + self, + statistics: GeneralInspectorsStatistics, + ) -> Union[IssuesStatistics, 
PenaltyInfluenceStatistics]: + type_to_statistics = { + PlotTypes.UNIQUE_ISSUES_BY_CATEGORY: statistics.new_issues_stat, + PlotTypes.ISSUES_BY_CATEGORY: statistics.new_issues_stat, + PlotTypes.UNIQUE_PENALTY_ISSUES_BY_CATEGORY: statistics.penalty_issues_stat, + PlotTypes.PENALTY_ISSUES_BY_CATEGORY: statistics.penalty_issues_stat, + PlotTypes.MEDIAN_PENALTY_INFLUENCE_BY_CATEGORY: statistics.penalty_influence_stat, + PlotTypes.PENALTY_INFLUENCE_DISTRIBUTION: statistics.penalty_influence_stat, + } + + return type_to_statistics[self] + + +def configure_arguments(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + RunToolArgument.DIFFS_FILE_PATH.value.long_name, + type=lambda value: Path(value).absolute(), + help=RunToolArgument.DIFFS_FILE_PATH.value.description, + ) + + parser.add_argument( + 'save_dir', + type=lambda value: Path(value).absolute(), + help='The directory where the plotted charts will be saved', + ) + + parser.add_argument( + 'config_path', + type=lambda value: Path(value).absolute(), + help='Path to the yaml file containing information about the graphs to be plotted.', + ) + + parser.add_argument( + '--file-extension', + type=str, + default=Extension.SVG.value, + choices=get_supported_extensions(), + help='Allows you to select the extension of output files', + ) + + +def get_plot_params(config: Dict, plot_type: PlotTypes) -> Dict[str, Any]: + config_params = config.get(plot_type.value) + params = {} + + if config_params is None: + return params + + if config_params.get(MARGIN) is not None: + margin_value = config_params.get(MARGIN).upper() + params[MARGIN] = plotly_consts.MARGIN[margin_value] + + if config_params.get(SORT_ORDER) is not None: + sort_order_value = config_params.get(SORT_ORDER) + params[SORT_ORDER] = plotly_consts.SORT_ORDER(sort_order_value) + + if config_params.get(LIMIT) is not None: + params[LIMIT] = config_params.get(LIMIT) + + if config_params.get(X_AXIS_NAME) is not None: + params[X_AXIS_NAME] = 
config_params.get(X_AXIS_NAME) + + if config_params.get(Y_AXIS_NAME) is not None: + params[Y_AXIS_NAME] = config_params.get(Y_AXIS_NAME) + + if config_params.get(COLOR) is not None: + color_value = config_params.get(COLOR) + params[COLOR] = plotly_consts.COLOR[color_value] + + return params + + +def plot_and_save( + config: Dict, + general_statistics: GeneralInspectorsStatistics, + save_dir: Path, + extension: Extension, +) -> None: + for plot_type in PlotTypes: + if plot_type.value in config: + params = get_plot_params(config, plot_type) + plotter_function = plot_type.to_plotter_function() + statistics = plot_type.extract_statistics(general_statistics) + plot = plotter_function(statistics, **params) + save_plot(plot, save_dir, plot_name=plot_type.value, extension=extension) + + +def main() -> None: + parser = argparse.ArgumentParser() + configure_arguments(parser) + args = parser.parse_args() + + diffs = deserialize_data_from_file(args.diffs_file_path) + general_statistics = gather_statistics(diffs) + + extension = Extension(args.file_extension) + config = parse_yaml(args.config_path) + + plot_and_save(config, general_statistics, args.save_dir, extension) + + +if __name__ == '__main__': + main() diff --git a/src/python/evaluation/plots/examples/issues_by_category.png b/src/python/evaluation/plots/examples/issues_by_category.png new file mode 100644 index 00000000..3e55aa12 Binary files /dev/null and b/src/python/evaluation/plots/examples/issues_by_category.png differ diff --git a/src/python/evaluation/plots/examples/median_penalty_influence_by_category.png b/src/python/evaluation/plots/examples/median_penalty_influence_by_category.png new file mode 100644 index 00000000..94d5baf9 Binary files /dev/null and b/src/python/evaluation/plots/examples/median_penalty_influence_by_category.png differ diff --git a/src/python/evaluation/plots/examples/penalty_influence_distribution.png b/src/python/evaluation/plots/examples/penalty_influence_distribution.png new file mode 
100644 index 00000000..6fbdce22 Binary files /dev/null and b/src/python/evaluation/plots/examples/penalty_influence_distribution.png differ diff --git a/src/python/evaluation/plots/examples/penalty_issues_by_category.png b/src/python/evaluation/plots/examples/penalty_issues_by_category.png new file mode 100644 index 00000000..3e55aa12 Binary files /dev/null and b/src/python/evaluation/plots/examples/penalty_issues_by_category.png differ diff --git a/src/python/evaluation/plots/examples/unique_issues_by_category.png b/src/python/evaluation/plots/examples/unique_issues_by_category.png new file mode 100644 index 00000000..ab69b22f Binary files /dev/null and b/src/python/evaluation/plots/examples/unique_issues_by_category.png differ diff --git a/src/python/evaluation/plots/examples/unique_penalty_issues_by_category.png b/src/python/evaluation/plots/examples/unique_penalty_issues_by_category.png new file mode 100644 index 00000000..ab69b22f Binary files /dev/null and b/src/python/evaluation/plots/examples/unique_penalty_issues_by_category.png differ diff --git a/src/python/review/common/file_system.py b/src/python/review/common/file_system.py index 802ccf83..bbdeeda6 100644 --- a/src/python/review/common/file_system.py +++ b/src/python/review/common/file_system.py @@ -9,6 +9,8 @@ from pathlib import Path from typing import Any, Callable, List, Optional, Tuple, Union +import yaml + @unique class FileSystemItem(Enum): @@ -37,12 +39,33 @@ class Extension(Enum): PICKLE = '.pickle' JSON = '.json' + # Image extensions + PNG = '.png' + JPG = '.jpg' + JPEG = '.jpeg' + WEBP = '.webp' + SVG = '.svg' + PDF = '.pdf' + EPS = '.eps' + # Not empty extensions are returned with a dot, for example, '.txt' # If file has no extensions, an empty one ('') is returned @classmethod def get_extension_from_file(cls, file: str) -> 'Extension': return Extension(os.path.splitext(file)[1]) + @classmethod + def get_image_extensions(cls) -> List['Extension']: + return [ + Extension.PNG, + 
Extension.JPG, + Extension.JPEG, + Extension.WEBP, + Extension.SVG, + Extension.PDF, + Extension.EPS, + ] + ItemCondition = Callable[[str], bool] @@ -100,6 +123,11 @@ def deserialize_data_from_file(path: Path) -> Any: return u.load() +def parse_yaml(path: Union[Path, str]) -> Any: + with open(path) as file: + return yaml.safe_load(file) + + # For getting name of the last folder or file # For example, returns 'folder' for both 'path/data/folder' and 'path/data/folder/' def get_name_from_path(path: Union[Path, str], with_extension: bool = True) -> str: diff --git a/whitelist.txt b/whitelist.txt index d5fed167..cf21e35f 100644 --- a/whitelist.txt +++ b/whitelist.txt @@ -118,6 +118,13 @@ reindex datasets usecols linesep +plotly +JPG +WEBP +SVG +EPS +xaxis + preprocessing num dataloader @@ -143,4 +150,6 @@ idx QodanaDataset cuda f1 -WANDB \ No newline at end of file +WANDB +PNG +consts