diff --git a/.gitignore b/.gitignore
index c9dd281..b3dd7bf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,20 +1,13 @@
-syntax: glob
-.python-version
-.venv
-env/*
-venv/*
-ENV/*
-.idea
-.vscode
+.venv*
+.vscode*
 .DS_Store
-dython.egg*/*
+*.egg-info*
 *__pycache__*
-*run_stuff.py*
+
 build/*
 dist/*
-build_deploy.sh
 site/*
-debug.py
-.coverage
-.hypothesis
-.pytest_cache*
\ No newline at end of file
+
+*.coverage*
+*.hypothesis*
+*.pytest_cache*
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4542b86..c95af8b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,10 @@
 # Change Log
 
-## 0.7.12 (dev)
+## 0.7.12
 * _Dython now officially supports Python 3.14_
 * Added new tests (issue [#172](https://github.com/shakedzy/dython/issues/172))
 * `examples` module removed (all examples exist in the [official documentation](https://shakedzy.xyz/dython/getting_started/examples/))
+* Added [Youden's J](https://en.wikipedia.org/wiki/Youden%27s_J_statistic) statistic to the `model_utils.metric_graph` ROC curve option _(breaking change: the structure of the returned dictionary has changed)_.
 
 ## 0.7.11
 * Fixing dependency issue ([#170](https://github.com/shakedzy/dython/issues/170))
diff --git a/docs/modules/model_utils.md b/docs/modules/model_utils.md
index 335a8e9..0bd6f84 100644
--- a/docs/modules/model_utils.md
+++ b/docs/modules/model_utils.md
@@ -118,10 +118,15 @@ Plots true-positive rate as a function of the false-positive rate of the positiv
 where $TPR = TP / (TP + FN)$ and $FPR = FP / (FP + TN)$. A naive algorithm will display a linear line going from
 (0,0) to (1,1), therefore having an area under-curve (AUC) of 0.5.
+Computes the estimated optimal threshold using two methods:
+* Geometric distance: Finding the closest point to the optimum at (0,1) using Euclidean distance
+* Youden's J: Maximizing $TPR - FPR$ (corresponding to $Y - X$)
+
 **Precision-Recall:**
 Plots precision as a function of recall of the positive label in a binary classification, where
 $Precision = TP / (TP + FP)$ and $Recall = TP / (TP + FN)$. A naive algorithm will display a horizontal linear
 line with precision of the ratio of positive examples in the dataset.
+The estimated optimal threshold is computed using Euclidean (geometric) distance.
 
 Based on [scikit-learn examples](http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html)
 (as was seen on April 2018):
@@ -258,8 +263,20 @@ Based on [scikit-learn examples](http://scikit-learn.org/stable/auto_examples/mo
       consider the data as a multiclass data rather than binary (useful when plotting curves
       of different models one against the other)
 
-**Returns:** A dictionary, one key for each class. Each value is another dictionary,
-holding AUC and eOpT values.
+**Returns:**
+A dictionary with these keys:
+- `ax`: the Matplotlib plot axis
+- `metrics`: each key is a class name from the list of provided classes.
+  For each class, another dict holds the AUC results
+  and the measurement-method results.
+  The AUC key holds both the measured area-under-curve (under `val`)
+  and the AUC of a random-guess classifier (under `naive`) for
+  comparison.
+  Each measurement-method key contains three values: `x`, `y`, `val`,
+  corresponding to the threshold's (x,y) coordinates on the metric
+  graph and its value.
+  If only one class exists, the measurement-method keys and AUC
+  will sit directly under `metrics`.
 
 **Example:** See [examples](../getting_started/examples.md).
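A minimal usage sketch of the structure described above (the toy data is made up; the `geo` and `youden_j` method keys follow the implementation in `dython/model_utils.py` below, assuming the default `eopt=True`):

```python
import numpy as np
from dython.model_utils import metric_graph

y_true = np.array([0, 1, 0, 1, 1, 0])
y_pred = np.array([0.25, 0.85, 0.4, 0.7, 0.9, 0.15])

result = metric_graph(y_true, y_pred, metric="roc", plot=False)

ax = result["ax"]            # the Matplotlib axis holding the plot
metrics = result["metrics"]  # a single binary class, so no per-class nesting

print(metrics["auc"]["val"], metrics["auc"]["naive"])  # measured AUC vs. random-guess AUC
print(metrics["geo"])        # {'x': ..., 'y': ..., 'val': ...}: closest point to (0, 1)
print(metrics["youden_j"])   # threshold maximizing TPR - FPR
```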
diff --git a/dython/_private.py b/dython/_private.py
index 1081136..e229f94 100644
--- a/dython/_private.py
+++ b/dython/_private.py
@@ -2,7 +2,7 @@ import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
-from typing import Any, Literal, cast, overload, Type
+from typing import Any, cast, overload, Type
 
 from .typing import OneDimArray, TwoDimArray
 
@@ -92,7 +92,7 @@ def convert(
             )
         )
     else:
-        return converted
+        return converted  # pyright: ignore[reportReturnType]
 
 
 def remove_incomplete_samples(
@@ -100,8 +100,8 @@ def remove_incomplete_samples(
     y: OneDimArray,
 ) -> tuple[OneDimArray, OneDimArray]:
-    x = [v if v is not None else np.nan for v in x]
-    y = [v if v is not None else np.nan for v in y]
+    x = [v if v is not None else np.nan for v in x]  # pyright: ignore[reportAssignmentType]
+    y = [v if v is not None else np.nan for v in y]  # pyright: ignore[reportAssignmentType]
     arr = np.array([x, y]).transpose()
     arr = arr[~np.isnan(arr).any(axis=1)].transpose()
     if isinstance(x, list):
diff --git a/dython/model_utils.py b/dython/model_utils.py
index 6cb0ca8..05677be 100644
--- a/dython/model_utils.py
+++ b/dython/model_utils.py
@@ -5,7 +5,7 @@ from sklearn.metrics import roc_curve, precision_recall_curve, auc
 from sklearn.preprocessing import LabelEncoder
 from typing import Any, Iterable
 
-from .typing import Number, OneDimArray
+from .typing import Number, OneDimArray, MetricGraphResult, SingleCurveResult, SingleMethodResult
 from ._private import convert, plot_or_not
 
 __all__ = ["random_forest_feature_importance", "metric_graph", "ks_abc"]
@@ -53,29 +53,39 @@ def _draw_estimated_optimal_threshold_mark(
     ms: int,
     fmt: str,
     ax: Axes,
-) -> tuple[Number, Number, Number]:
+) -> list[tuple[Number, Number, Number]]:
    annotation_offset = (-0.027, 0.03)
     a = np.zeros((len(x_axis), 2))
     a[:, 0] = x_axis
     a[:, 1] = y_axis
+    keep = np.where(a[:, 0] != a[:, 1])[0]  # drop diagonal points (x == y), where Youden's J is zero
+    a = a[keep]
 
     if metric == "roc":
-        dist = lambda row: row[0] ** 2 + (1 - row[1]) ** 2  # optimal: (0,1)
+        dists = [  # optimal: (0,1)
+            lambda row: row[0] ** 2 + (1 - row[1]) ** 2,  # geo
+            lambda row: row[0] - row[1]  # negated Youden's J (X - Y rather than Y - X), since the argmin below minimizes while Youden's J is maximized
+        ]
     else:  # metric == 'pr'
-        dist = (
-            lambda row: (1 - row[0]) ** 2 + (1 - row[1]) ** 2
-        )  # optimal: (1,1)
-    amin = np.apply_along_axis(dist, 1, a).argmin()
-    ax.plot(x_axis[amin], y_axis[amin], color=color, marker="o", ms=ms)  # pyright: ignore[reportCallIssue, reportArgumentType]
-    ax.annotate(
-        "{th:{fmt}}".format(th=thresholds[amin], fmt=fmt),  # pyright: ignore[reportCallIssue, reportArgumentType]
-        xy=(x_axis[amin], y_axis[amin]),  # pyright: ignore[reportCallIssue, reportArgumentType]
-        color=color,
-        xytext=(
-            x_axis[amin] + annotation_offset[0],  # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
-            y_axis[amin] + annotation_offset[1],  # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
-        ),
-    )
-    return thresholds[amin], x_axis[amin], y_axis[amin]  # pyright: ignore[reportCallIssue, reportArgumentType, reportReturnType]
+        dists = [  # optimal: (1,1)
+            lambda row: (1 - row[0]) ** 2 + (1 - row[1]) ** 2  # geo
+        ]
+    output_tuples = []
+    for dist, marker in zip(dists, ['o', 'x']):
+        amin = keep[np.apply_along_axis(dist, 1, a).argmin()]  # map back to an index of the unfiltered inputs
+        ax.plot(x_axis[amin], y_axis[amin], color=color, marker=marker, ms=ms)  # pyright: ignore[reportCallIssue, reportArgumentType]
+        ax.annotate(
+            "{th:{fmt}}".format(th=thresholds[amin], fmt=fmt),  # pyright: ignore[reportCallIssue, reportArgumentType]
+            xy=(x_axis[amin], y_axis[amin]),  # pyright: ignore[reportCallIssue, reportArgumentType]
+            color=color,
+            xytext=(
+                x_axis[amin] + annotation_offset[0],  # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
+                y_axis[amin] + annotation_offset[1],  # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
+            ),
+        )
+        output_tuples.append(
+            (thresholds[amin], x_axis[amin], y_axis[amin])  # pyright: ignore[reportArgumentType, reportCallIssue]
+        )
+    return output_tuples
 
 
 def _plot_macro_metric(
@@ -141,26 +150,40 @@ def _binary_metric_graph(
         metric=metric.upper(), class_label=class_label, auc=auc_score, fmt=fmt
     )
     if metric == "pr":
-        label += ", naive = {ytr:{fmt}}".format(ytr=y_t_ratio, fmt=fmt)
+        label += ", naive = {ytr:{fmt}})".format(ytr=y_t_ratio, fmt=fmt)
+    else:
+        label += ")"
     if eoptimal:
-        eopt, eopt_x, eopt_y = _draw_estimated_optimal_threshold_mark(
+        eopts = _draw_estimated_optimal_threshold_mark(
             metric, x_axis, y_axis, th, color, ms, fmt, ax
         )
-        label += ", eOpT = {th:{fmt}})".format(th=eopt, fmt=fmt)
+        if len(eopts) == 1:
+            eopts.append((None, None, None))  # pyright: ignore[reportArgumentType]
     else:
-        eopt = None
-        eopt_x = None
-        eopt_y = None
-        label += ")"
+        eopts = [
+            (None, None, None),
+            (None, None, None)
+        ]
     ax.plot(x_axis, y_axis, color=color, lw=lw, ls=ls, label=label)
     return {
         "x": x_axis,
         "y": y_axis,
         "thresholds": th,
         "auc": auc_score,
-        "eopt": eopt,
-        "eopt_x": eopt_x,
-        "eopt_y": eopt_y,
+        "eopts": [
+            {
+                "eopt": eopts[0][0],
+                "eopt_x": eopts[0][1],
+                "eopt_y": eopts[0][2],
+                "name": "geo"
+            },
+            {
+                "eopt": eopts[1][0],
+                "eopt_x": eopts[1][1],
+                "eopt_y": eopts[1][2],
+                "name": "youden_j"
+            },
+        ],
         "y_t_ratio": y_t_ratio,
     }
@@ -168,12 +189,19 @@ def _build_metric_graph_output_dict(
     metric: str, d: dict[str, Any]
-) -> dict[str, dict[str, Any]]:
+) -> SingleCurveResult:
     naive = d["y_t_ratio"] if metric == "pr" else 0.5
-    return {
-        "auc": {"val": d["auc"], "naive": naive},
-        "eopt": {"val": d["eopt"], "x": d["eopt_x"], "y": d["eopt_y"]},
-    }
+    output: dict = {"auc": {"val": d["auc"], "naive": naive}}
+    for eopt in d["eopts"]:
+        if eopt["eopt"] is None:
+            continue
+        method_result = SingleMethodResult(
+            x=eopt["eopt_x"],
+            y=eopt["eopt_y"],
+            val=eopt["eopt"]
+        )
+        output[eopt["name"]] = method_result
+    return output  # pyright: ignore[reportReturnType]
 
 
 def metric_graph(
@@ -199,15 +227,25 @@ def metric_graph(
     title: str | None = None,
     filename: str | None = None,
     force_multiclass: bool = False,
-) -> dict[str, Any]:
+) -> MetricGraphResult:
     """
-    Plot a ROC graph of predictor's results (including AUC scores), where each
+    Plot a metric graph of predictor's results (including AUC scores), where each
     row of y_true and y_pred represent a single example.
-    If there are 1 or two columns only, the data is treated as a binary
-    classification (see input example below).
-    If there are more then 2 columns, each column is considered a
-    unique class, and a ROC graph and AUC score will be computed for each.
-    A Macro-ROC and Micro-ROC are computed and plotted too by default.
+
+    **ROC:**
+    Plots true-positive rate as a function of the false-positive rate of the positive label in a binary classification,
+    where $TPR = TP / (TP + FN)$ and $FPR = FP / (FP + TN)$. A naive algorithm will display a straight line going from
+    (0,0) to (1,1), therefore having an area under-curve (AUC) of 0.5.
+
+    Computes the estimated optimal threshold using two methods:
+    * Geometric distance: Finding the closest point to the optimum at (0,1) using Euclidean distance
+    * Youden's J: Maximizing $TPR - FPR$ (corresponding to $Y - X$)
+
+    **Precision-Recall:**
+    Plots precision as a function of recall of the positive label in a binary classification, where
+    $Precision = TP / (TP + FP)$ and $Recall = TP / (TP + FN)$. A naive algorithm will display a horizontal
+    line with precision of the ratio of positive examples in the dataset.
+    The estimated optimal threshold is computed using Euclidean (geometric) distance.
 
     Based on sklearn examples (as was seen on April 2018):
     http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
@@ -270,8 +308,20 @@
     Returns:
     --------
-    A dictionary, one key for each class. Each value is another dictionary,
-    holding AUC and eOpT values.
+    A dictionary with these keys:
+    - `ax`: the Matplotlib plot axis
+    - `metrics`: each key is a class name from the list of provided classes.
+      For each class, another dict holds the AUC results
+      and the measurement-method results.
+      The AUC key holds both the measured area-under-curve (under `val`)
+      and the AUC of a random-guess classifier (under `naive`) for
+      comparison.
+      Each measurement-method key contains three values: `x`, `y`, `val`,
+      corresponding to the threshold's (x,y) coordinates on the metric
+      graph and its value.
+      If only one class exists, the measurement-method keys and AUC
+      will sit directly under `metrics`.
+
     Binary Classification Input Example:
     ------------------------------------
@@ -325,7 +375,7 @@
     else:
         colors_list: list[str] = colors or _ROC_PLOT_COLORS
 
-    output_dict = dict()
+    output_dict: dict[str, SingleCurveResult] = {}
     pr_naives = list()
     if (
         len(y_pred_array.shape) == 1
@@ -422,8 +472,11 @@
         filename=filename,
         plot=plot,
     )
-    output_dict["ax"] = axis
-    return output_dict
+    metric_graph_result = MetricGraphResult(
+        ax=axis,
+        metrics=output_dict if len(output_dict) > 1 else next(iter(output_dict.values()))
+    )
+    return metric_graph_result
 
 
 def random_forest_feature_importance(
diff --git a/dython/nominal.py b/dython/nominal.py
index 0d64d9a..b08274a 100644
--- a/dython/nominal.py
+++ b/dython/nominal.py
@@ -13,10 +13,10 @@ from collections import Counter
 from matplotlib.colors import Colormap
 from matplotlib.axes._axes import Axes
-from typing import Any, Callable, Iterable, Literal, TypedDict, cast, overload
+from typing import Any, Callable, Iterable, Literal, cast, overload
 
 from ._private import convert, remove_incomplete_samples, replace_nan_with_value, plot_or_not
 from .data_utils import identify_columns_by_type
-from .typing import Number, OneDimArray, TwoDimArray
+from .typing import Number, OneDimArray, TwoDimArray, AssociationsResult
 
 
 __all__ = [
@@ -53,11 +53,6 @@ NomNomAssocStr = Literal["cramer", "theil"]
 
 
-class AssociationsResult(TypedDict):
-    corr: pd.DataFrame
-    ax: Axes | None
-
-
 def _inf_nan_str(x: Number) -> str:
     if np.isnan(x):
         return "NaN"
diff --git a/dython/typing.py b/dython/typing.py
index cd344cf..69d1cb7 100644
--- a/dython/typing.py
+++ b/dython/typing.py
@@ -1,8 +1,30 @@
 import numpy as np
 import pandas as pd
-from typing import Sequence, Any
+from typing import Sequence, Any, TypedDict, Protocol
+from matplotlib.axes._axes import Axes
 
 Number = int | float
 OneDimArray = Sequence[Number | str] | pd.Series | np.ndarray[Any, np.dtype[np.int64] | np.dtype[np.float64] | np.dtype[np.str_]]
 TwoDimArray = np.ndarray[Any, np.dtype[np.int64] | np.dtype[np.float64] | np.dtype[np.str_]] | pd.DataFrame
+
+
+class AssociationsResult(TypedDict):
+    corr: pd.DataFrame
+    ax: Axes | None
+
+
+class SingleMethodResult(TypedDict):
+    x: float
+    y: float
+    val: float
+
+
+class SingleCurveResult(Protocol):
+    auc: dict[str, float]
+    def __getitem__(self, key: str) -> SingleMethodResult: ...
+
+
+class MetricGraphResult(TypedDict):
+    metrics: dict[str, SingleCurveResult] | SingleCurveResult
+    ax: Axes
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 6b25dd2..f919133 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@ EXTRAS_REQUIRE = {"dev": [s.strip() for s in dev_requirements.split("\n")]}
 
 min_minor = 10
-max_minor = 13
+max_minor = 14
 CLASSIFIERS = [
     f"Programming Language :: Python :: 3.{str(v)}" for v in range(min_minor, max_minor+1)
 ]
diff --git a/tests/test_model_utils/test_metric_graph_advanced.py b/tests/test_model_utils/test_metric_graph_advanced.py
index 282ff0d..d1da955 100644
--- a/tests/test_model_utils/test_metric_graph_advanced.py
+++ b/tests/test_model_utils/test_metric_graph_advanced.py
@@ -28,9 +28,9 @@ def test_metric_graph_multiclass_with_class_names():
     y_pred = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.2, 0.1, 0.7], [0.6, 0.3, 0.1]])
 
     result = metric_graph(y_true, y_pred, metric='roc', class_names=['A', 'B', 'C'], plot=False)
-    assert 'A' in result
-    assert 'B' in result
-    assert 'C' in result
+    assert 'A' in result['metrics']
+    assert 'B' in result['metrics']
+    assert 'C' in result['metrics']
 
 
 def test_metric_graph_multiclass_wrong_class_names_type():
@@ -58,8 +58,8 @@ def test_metric_graph_pr_binary():
     y_pred = [0.1, 0.9, 0.3, 0.8, 0.7, 0.2]
 
     result = metric_graph(y_true, y_pred, metric='pr', plot=False)
-    assert 'auc' in result['0']
-    assert 'naive' in result['0']['auc']
+    assert 'auc' in result['metrics']
+    assert 'naive' in result['metrics']['auc']
 
 
 def test_metric_graph_pr_multiclass():
@@ -68,7 +68,7 @@ def test_metric_graph_pr_multiclass():
     y_pred = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.2, 0.1, 0.7], [0.6, 0.3, 0.1]])
 
     result = metric_graph(y_true, y_pred, metric='pr', plot=False)
-    assert '0' in result
+    assert '0' in result['metrics']
 
 
 def test_metric_graph_with_colors_string():
@@ -86,7 +86,7 @@ def test_metric_graph_multiclass_no_micro():
     y_pred = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.2, 0.1, 0.7], [0.6, 0.3, 0.1]])
 
     result = metric_graph(y_true, y_pred, metric='roc', micro=False, plot=False)
-    assert '0' in result
+    assert '0' in result['metrics']
 
 
 def test_metric_graph_multiclass_no_macro():
@@ -95,7 +95,7 @@ def test_metric_graph_multiclass_no_macro():
     y_pred = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.2, 0.1, 0.7], [0.6, 0.3, 0.1]])
 
     result = metric_graph(y_true, y_pred, metric='roc', macro=False, plot=False)
-    assert '0' in result
+    assert '0' in result['metrics']
 
 
 def test_metric_graph_pr_multiclass_no_macro():
@@ -104,7 +104,7 @@ def test_metric_graph_pr_multiclass_no_macro():
     y_pred = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.2, 0.1, 0.7], [0.6, 0.3, 0.1]])
 
     result = metric_graph(y_true, y_pred, metric='pr', macro=False, plot=False)
-    assert '0' in result
+    assert '0' in result['metrics']
 
 
 def test_metric_graph_binary_no_eopt():
@@ -113,7 +113,7 @@ def test_metric_graph_binary_no_eopt():
     y_pred = [0.1, 0.9, 0.3, 0.8]
 
     result = metric_graph(y_true, y_pred, metric='roc', eopt=False, plot=False)
-    assert result['0']['eopt']['val'] is None
+    assert len(list(result['metrics'].keys())) == 1
 
 
 def test_metric_graph_multiclass_force():
@@ -122,8 +122,8 @@ def test_metric_graph_multiclass_force():
     y_pred = np.array([[0.7, 0.3], [0.2, 0.8], [0.6, 0.4], [0.3, 0.7]])
 
     result = metric_graph(y_true, y_pred, metric='roc', force_multiclass=True, plot=False)
-    assert '0' in result
-    assert '1' in result
+    assert '0' in result['metrics']
+    assert '1' in result['metrics']
 
 
 def test_metric_graph_binary_2d_array():
@@ -132,7 +132,7 @@ def test_metric_graph_binary_2d_array():
     y_pred = np.array([[0.7, 0.3], [0.2, 0.8], [0.6, 0.4], [0.3, 0.7]])
 
     result = metric_graph(y_true, y_pred, metric='roc', plot=False)
-    assert 'auc' in result['0']
+    assert 'auc' in result['metrics']
 
 
 def test_metric_graph_mismatched_shapes():
@@ -161,7 +161,7 @@ def test_metric_graph_binary_1d_arrays():
     y_pred = np.array([0.9, 0.2, 0.7, 0.3])
 
     result = metric_graph(y_true, y_pred, metric='roc', plot=False)
-    assert '0' in result
+    assert '0' not in result['metrics']
 
 
 def test_metric_graph_with_custom_params():
@@ -191,5 +191,5 @@ def test_metric_graph_with_class_name_string():
     y_pred = [0.1, 0.9, 0.3, 0.8]
 
     result = metric_graph(y_true, y_pred, metric='roc', class_names='PositiveClass', plot=False)
-    assert 'PositiveClass' in result
+    assert 'PositiveClass' not in result['metrics']
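The two threshold-selection rules added by this patch can be sanity-checked outside dython with a short standalone sketch (scikit-learn assumed available, toy scores made up; it mirrors the diagonal filter and argmin trick used in `_draw_estimated_optimal_threshold_mark`):

```python
import numpy as np
from sklearn.metrics import roc_curve

y_true = np.array([0, 1, 0, 1, 1, 0, 1, 0])
y_score = np.array([0.1, 0.9, 0.35, 0.8, 0.65, 0.2, 0.55, 0.4])

fpr, tpr, thresholds = roc_curve(y_true, y_score)

# Drop points on the diagonal (TPR == FPR), where Youden's J is zero,
# keeping the original indices so thresholds stay aligned.
keep = np.where(fpr != tpr)[0]

# Geometric rule: closest point to the (0, 1) optimum, by squared Euclidean distance.
geo = keep[np.argmin(fpr[keep] ** 2 + (1 - tpr[keep]) ** 2)]

# Youden's J rule: maximize TPR - FPR; minimizing FPR - TPR is equivalent.
yj = keep[np.argmin(fpr[keep] - tpr[keep])]

print(f"geo:      threshold={thresholds[geo]:.2f} at ({fpr[geo]:.2f}, {tpr[geo]:.2f})")
print(f"youden_j: threshold={thresholds[yj]:.2f} at ({fpr[yj]:.2f}, {tpr[yj]:.2f})")
```

The two rules often pick the same point on well-behaved curves; they diverge when the ROC curve is asymmetric, which is why both markers (`o` for geo, `x` for Youden's J) are drawn on the plot.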