# %% [markdown]
# # Classification of distributions. Second part of the project
# ## Building our own classifier

# %%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from itertools import product
from scipy import stats as st

from graph_common_functions import *
from distibution_functions import *

# %%
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder


class DistribituionClassifier:
    """Tell normal samples from Laplace samples using distance-graph features.

    Synthetic samples of size ``n`` are generated for both distributions, a
    distance graph is built from every sample for each threshold in
    ``distances``, and five graph characteristics per threshold become the
    feature vector handed to the wrapped estimator ``clf``.

    Args:
        n: Size of every generated sample.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        sigma: First argument passed to ``generate_normal``.
        beta: First argument passed to ``generate_laplace``.
            NOTE(review): the default sqrt(1/2) presumably matches the Laplace
            variance 2 * beta**2 to sigma**2 = 1 -- confirm against
            ``generate_laplace``.
        distances: Thresholds passed to ``distance_graph_constructor``; the
            default reproduces the previously hard-coded list.
    """

    # Order matters: it fixes the feature column order seen by ``clf``.
    FEATURES = ('delta', 'mean_degree', 'max_degree', 'dominating_number', 'clique_number')
    DISTRIBUTIONS = ('normal', 'laplace')

    def __init__(self, n, clf, sigma=1, beta=(1/2) ** 0.5, distances=(0.3, 1.0, 2.0, 3.5)):
        self.n = n
        self.sigma = sigma
        self.beta = beta
        self.clf = clf
        self.distances = tuple(distances)
        # Fit the encoder once on the known class names so that training and
        # evaluation always share one label -> integer mapping, independent of
        # which labels happen to occur in a particular data set.
        # LabelEncoder sorts its classes: 'laplace' -> 0, 'normal' -> 1.
        self.encoder = LabelEncoder().fit(list(self.DISTRIBUTIONS))

    def _sample(self, distribution):
        """Generate one sample of size ``self.n`` from the named distribution."""
        if distribution == 'normal':
            return generate_normal(self.sigma, self.n)
        return generate_laplace(self.beta, self.n)

    def _graph_features(self, array):
        """Return ``{f"{feature}_{i}": value}`` for every threshold index ``i``."""
        params = {}
        for i, d in enumerate(self.distances):
            g = distance_graph_constructor(array, d)
            values = (
                g.compute_delta(),
                g.compute_mean_degree(),
                g.compute_max_degree(),
                g.compute_dominating_number(d),
                g.compute_clique_number(d),
            )
            for name, value in zip(self.FEATURES, values):
                params[f"{name}_{i}"] = value
        return params

    def _split(self, data):
        """Split a ``make_data`` frame into the feature frame and encoded labels."""
        X = data.drop('distribution', axis=1)
        y = self.encoder.transform(data['distribution'])
        return X, y

    def make_data(self, number_of_experiments):
        """Build a labelled data set of graph features.

        Args:
            number_of_experiments: Number of samples generated per distribution.

        Returns:
            A DataFrame with one row per sample: the feature columns followed
            by a ``distribution`` column holding ``'normal'`` or ``'laplace'``.
        """
        rows = []
        for distribution in self.DISTRIBUTIONS:
            for _ in range(number_of_experiments):
                row = self._graph_features(self._sample(distribution))
                row["distribution"] = distribution
                rows.append(row)
        # Build the frame in one go instead of concatenating onto an empty one.
        return pd.DataFrame(rows)

    def fit(self, number_of_experiments=1000):
        """Generate training data and fit the wrapped estimator on it."""
        X, y = self._split(self.make_data(number_of_experiments=number_of_experiments))
        self.clf.fit(X, y)

    def predict(self, X):
        """Return the wrapped estimator's predictions (integer-encoded labels)."""
        return self.clf.predict(X)

    def estimate(self, number_of_test=100):
        """Score the fitted estimator on freshly generated data.

        Args:
            number_of_test: Number of test samples generated per distribution.

        Returns:
            A dict with ``accuracy``, ``precision`` and ``recall``. Precision
            and recall use scikit-learn's default positive label 1, which the
            encoder assigns to ``'normal'``.
        """
        X_test, y_test = self._split(self.make_data(number_of_experiments=number_of_test))
        y_pred = np.asarray(self.clf.predict(X_test))
        return {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred)
        }


# Correctly spelled alias; the original (misspelled) name stays available.
DistributionClassifier = DistribituionClassifier

# %% [markdown]
# Let's test with a RandomForest classifier:

# %%
d_clf_25 = DistribituionClassifier(n=25, clf=RandomForestClassifier())
d_clf_25.fit(number_of_experiments=5_000)
print(d_clf_25.estimate(1_000))
# Recorded notebook output:
# {'accuracy': 0.759, 'precision': 0.7549212598425197, 'recall': 0.767}

# %%
d_clf_100 = DistribituionClassifier(n=100, clf=RandomForestClassifier())
d_clf_100.fit(number_of_experiments=5_000)
print(d_clf_100.estimate(1_000))
# Recorded notebook output:
# {'accuracy': 0.9475, 'precision': 0.9461615154536391, 'recall': 0.949}

# %%
d_clf_500 = DistribituionClassifier(n=500, clf=RandomForestClassifier())
d_clf_500.fit(number_of_experiments=1_000)
print(d_clf_500.estimate(500))
# Recorded notebook output:
# {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0}
+ "data": { + "text/plain": [ + "{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0}" + ] + }, + "metadata": {}, + "execution_count": 41 + } + ] + } + ] +} \ No newline at end of file diff --git a/src/normal_laplace/graph_common_functions.py b/src/normal_laplace/graph_common_functions.py index a32ebc7..c612cbc 100644 --- a/src/normal_laplace/graph_common_functions.py +++ b/src/normal_laplace/graph_common_functions.py @@ -23,6 +23,18 @@ def compute_delta(self): delta = min(delta, len(self.graph[v])) return delta + def compute_max_degree(self): + max_degree = 0 + for v in self.vertices: + max_degree = max(max_degree, len(self.graph[v])) + return max_degree + + def compute_mean_degree(self): + s = 0 + for v in self.vertices: + s += len(self.graph[v]) + return s / len(self.vertices) + def find_connected_components(self): visited = set() components = [] @@ -70,6 +82,18 @@ def compute_dominating_number(self, d): return count + def compute_clique_number(self, d): + sorted_points = self.get_sorted_vertices() + n = len(sorted_points) + max_clique = 0 + left = 0 + + for right in range(n): + while sorted_points[right].value - sorted_points[left].value >= d: + left += 1 + max_clique = max(max_clique, right - left + 1) + + return max_clique def knn_graph_constructor(arr, k): points = [Point(name=i, value=val) for i, val in enumerate(arr)] diff --git a/tests/test_dominating_number.py b/tests/test_dominating_number.py index 599008d..cbe2a3a 100644 --- a/tests/test_dominating_number.py +++ b/tests/test_dominating_number.py @@ -2,9 +2,9 @@ import sys import os -sys.path.append(os.path.join(os.path.dirname(__file__), '../src/normal_laplace')) +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -from graph_common_functions import Graph, distance_graph_constructor +from src.normal_laplace.graph_common_functions import Graph, distance_graph_constructor class TestDominatingNumber(unittest.TestCase): def setUp(self): @@ -36,5 +36,42 @@ def 
class TestCliqueNumber(unittest.TestCase):
    """Tests for ``compute_clique_number`` on graphs built from lists of numbers."""

    def setUp(self):
        # Default distance threshold used by the cases that do not set their own.
        self.d = 2.0

    def test_empty_graph(self):
        arr = []
        graph = distance_graph_constructor(arr, self.d)
        self.assertEqual(graph.compute_clique_number(self.d), 0)

    def test_single_node(self):
        arr = [5]
        graph = distance_graph_constructor(arr, self.d)
        self.assertEqual(graph.compute_clique_number(self.d), 1)

    def test_all_isolated(self):
        arr = [1, 3, 5, 7]  # Neighbouring points are exactly d=2 apart, which is not close enough
        graph = distance_graph_constructor(arr, self.d)
        self.assertEqual(graph.compute_clique_number(self.d), 1)

    def test_fully_connected(self):
        arr = [1, 2, 3, 4]  # Every pairwise distance is below d=3.5
        graph = distance_graph_constructor(arr, d=3.5)
        self.assertEqual(graph.compute_clique_number(d=3.5), 4)

    def test_no_triple_within_distance(self):
        # Every run of three consecutive points spans at least d, so the
        # best group is a pair.
        arr = [1.0, 1.5, 2.0, 2.6, 3.1, 4.0]
        d = 1.0
        graph = distance_graph_constructor(arr, d)
        self.assertEqual(graph.compute_clique_number(d), 2)

    def test_optimal_case(self):
        arr = [1.0, 1.9, 2.0, 2.1, 3.0]
        d = 1.0
        graph = distance_graph_constructor(arr, d)
        # Largest group: [1.9, 2.0, 2.1]; 1.0 and 3.0 are too far from its ends.
        self.assertEqual(graph.compute_clique_number(d), 3)


if __name__ == '__main__':
    unittest.main()