239 changes: 239 additions & 0 deletions src/normal_laplace/experiments_second_part.ipynb
@@ -0,0 +1,239 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyMfhI0b9BxEs30LTZmx5w15",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/maxGrigorenko/DistributionClassifier/blob/maxGrigorenko%2Fsecond_part/src/normal_laplace/experiments_second_part.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# Классификация распределений. Второая часть проекта,\n",
"## Построение собственного классификатора"
],
"metadata": {
"id": "BERluZVDPDN9"
}
},
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from tqdm import tqdm\n",
"from itertools import product\n",
"from scipy import stats as st\n",
"\n",
"from graph_common_functions import *\n",
"from distibution_functions import *"
],
"metadata": {
"id": "ghcSALa8Pc8C"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"id": "_ooxQ-1SNLsK"
},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score, precision_score, recall_score\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"class DistribituionClassifier:\n",
" def __init__(self, n, clf, sigma=1, beta=(1/2) ** 0.5):\n",
" self.n = n\n",
" self.sigma = sigma\n",
" self.beta = beta\n",
" self.clf = clf\n",
"\n",
"\n",
" def make_data(self, number_of_experiments):\n",
" n = self.n\n",
"\n",
" data = pd.DataFrame()\n",
"\n",
" sub_data = []\n",
" features = ['delta', 'mean_degree', 'max_degree', 'dominating_number', 'clique_number']\n",
"\n",
" for disrtibution in 'normal', 'laplace':\n",
" for _ in range(number_of_experiments):\n",
" if disrtibution == 'normal':\n",
" array = generate_normal(self.sigma, n)\n",
" else:\n",
" array = generate_laplace(self.beta, n)\n",
"\n",
" features_d = []\n",
"\n",
" for d in [0.3, 1.0, 2.0, 3.5]:\n",
" g = distance_graph_constructor(array, d)\n",
" delta = g.compute_delta()\n",
" mean_degree = g.compute_mean_degree()\n",
" max_degree = g.compute_max_degree()\n",
" dominating_number = g.compute_dominating_number(d)\n",
" clique_number = g.compute_clique_number(d)\n",
" features_d.append([delta, mean_degree, max_degree, dominating_number, clique_number])\n",
"\n",
" params = dict()\n",
" for i in range(len(features_d)):\n",
" for j, f in enumerate(features):\n",
" params.update({f\"{f}_{i}\": features_d[i][j]})\n",
"\n",
" params.update({\"distribution\": disrtibution})\n",
" sub_data.append(params)\n",
"\n",
" data = pd.concat([data, pd.DataFrame(sub_data)], ignore_index=True)\n",
" return data\n",
"\n",
" def fit(self, number_of_experiments=1000):\n",
" data = self.make_data(number_of_experiments=number_of_experiments)\n",
" X = data.drop('distribution', axis=1)\n",
" y = data['distribution']\n",
" encoder = LabelEncoder()\n",
" encoder.fit(y)\n",
" y = encoder.transform(y)\n",
" self.clf.fit(X, y)\n",
"\n",
" def predict(self, X):\n",
" return self.clf.predict(X)\n",
"\n",
" def estimate(self, number_of_test=100):\n",
" data = self.make_data(number_of_experiments=number_of_test)\n",
" X_test = data.drop('distribution', axis=1)\n",
" y = data['distribution']\n",
" encoder = LabelEncoder()\n",
" encoder.fit(y)\n",
" y_test = encoder.transform(y)\n",
" y_pred = np.array(self.clf.predict(X_test))\n",
" acc = accuracy_score(y_test, y_pred)\n",
" precision = precision_score(y_test, y_pred)\n",
" recall = recall_score(y_test, y_pred)\n",
" return {\n",
" 'accuracy': acc,\n",
" 'precision': precision,\n",
" 'recall': recall\n",
" }\n"
]
},
{
"cell_type": "markdown",
"source": [
"Протестируем с классификатором RandomForest:"
],
"metadata": {
"id": "Bq_sFQ4OvQiz"
}
},
{
"cell_type": "code",
"source": [
"d_clf_25 = DistribituionClassifier(n=25, clf=RandomForestClassifier())\n",
"d_clf_25.fit(number_of_experiments=5_000)\n",
"d_clf_25.estimate(1_000)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "a1On32a5NV--",
"outputId": "7f628d4f-6b6f-450b-a07a-ba2310b377a5"
},
"execution_count": 39,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'accuracy': 0.759, 'precision': 0.7549212598425197, 'recall': 0.767}"
]
},
"metadata": {},
"execution_count": 39
}
]
},
{
"cell_type": "code",
"source": [
"d_clf_100 = DistribituionClassifier(n=100, clf=RandomForestClassifier())\n",
"d_clf_100.fit(number_of_experiments=5_000)\n",
"d_clf_100.estimate(1_000)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "x2F7iUl-Znvh",
"outputId": "d5719743-c6f3-4d5a-a0fc-b8205e818c3c"
},
"execution_count": 40,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'accuracy': 0.9475, 'precision': 0.9461615154536391, 'recall': 0.949}"
]
},
"metadata": {},
"execution_count": 40
}
]
},
{
"cell_type": "code",
"source": [
"d_clf_500 = DistribituionClassifier(n=500, clf=RandomForestClassifier())\n",
"d_clf_500.fit(number_of_experiments=1_000)\n",
"d_clf_500.estimate(500)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "IbF8mh7iu7xL",
"outputId": "503e9ab7-6364-497f-9edf-2fdc14bfb479"
},
"execution_count": 41,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0}"
]
},
"metadata": {},
"execution_count": 41
}
]
}
]
}
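
The notebook imports generate_normal and generate_laplace from distibution_functions, a module not shown in this diff. Below is a minimal sketch of what those helpers are assumed to do (the names and signatures come from the notebook; the bodies are an assumption, not the repository's implementation). With the defaults sigma=1 and beta=(1/2)**0.5 both laws are centred with unit variance (the Laplace variance is 2*beta**2), so the classifier has to separate the samples by shape, via the 20 graph features computed at the thresholds d in {0.3, 1.0, 2.0, 3.5}, rather than by scale.

import numpy as np

def generate_normal(sigma, n):
    # assumed helper: n i.i.d. draws from N(0, sigma^2)
    return np.random.normal(loc=0.0, scale=sigma, size=n)

def generate_laplace(beta, n):
    # assumed helper: n i.i.d. draws from Laplace(0, beta); variance is 2 * beta**2
    return np.random.laplace(loc=0.0, scale=beta, size=n)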
24 changes: 24 additions & 0 deletions src/normal_laplace/graph_common_functions.py
@@ -23,6 +23,18 @@ def compute_delta(self):
            delta = min(delta, len(self.graph[v]))
        return delta

    def compute_max_degree(self):
        max_degree = 0
        for v in self.vertices:
            max_degree = max(max_degree, len(self.graph[v]))
        return max_degree

    def compute_mean_degree(self):
        s = 0
        for v in self.vertices:
            s += len(self.graph[v])
        return s / len(self.vertices)

    def find_connected_components(self):
        visited = set()
        components = []
@@ -70,6 +82,18 @@ def compute_dominating_number(self, d):

        return count

    def compute_clique_number(self, d):
        sorted_points = self.get_sorted_vertices()
        n = len(sorted_points)
        max_clique = 0
        left = 0

        for right in range(n):
            while sorted_points[right].value - sorted_points[left].value >= d:
                left += 1
            max_clique = max(max_clique, right - left + 1)

        return max_clique

def knn_graph_constructor(arr, k):
    points = [Point(name=i, value=val) for i, val in enumerate(arr)]
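
In a 1-D distance graph two vertices are adjacent when their values differ by less than d (the convention the sliding window and the tests below use), so a clique is exactly a set of points whose total spread is below d, i.e. a contiguous window of the sorted values; that is why compute_clique_number can use a two-pointer sweep instead of a general (NP-hard) clique search. A standalone sketch of the same idea on raw values, where max_clique_1d is a hypothetical helper and not part of the repository:

def max_clique_1d(values, d):
    pts = sorted(values)
    best, left = 0, 0
    for right in range(len(pts)):
        # shrink the window until its spread is strictly below d
        while pts[right] - pts[left] >= d:
            left += 1
        best = max(best, right - left + 1)
    return best

print(max_clique_1d([1.0, 1.9, 2.0, 2.1, 3.0], 1.0))  # 3, the clique {1.9, 2.0, 2.1}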
41 changes: 39 additions & 2 deletions tests/test_dominating_number.py
@@ -2,9 +2,9 @@
import sys
import os

sys.path.append(os.path.join(os.path.dirname(__file__), '../src/normal_laplace'))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from graph_common_functions import Graph, distance_graph_constructor
from src.normal_laplace.graph_common_functions import Graph, distance_graph_constructor

class TestDominatingNumber(unittest.TestCase):
    def setUp(self):
@@ -36,5 +36,42 @@ def test_optimal_case(self):
        graph = distance_graph_constructor(arr, d)
        self.assertEqual(graph.compute_dominating_number(d), 2)

class TestCliqueNumber(unittest.TestCase):
    def setUp(self):
        self.d = 2.0

    def test_empty_graph(self):
        arr = []
        graph = distance_graph_constructor(arr, self.d)
        self.assertEqual(graph.compute_clique_number(self.d), 0)

    def test_single_node(self):
        arr = [5]
        graph = distance_graph_constructor(arr, self.d)
        self.assertEqual(graph.compute_clique_number(self.d), 1)

    def test_all_isolated(self):
        arr = [1, 3, 5, 7]  # all points are at least 2 apart, so there are no edges at d = 2
        graph = distance_graph_constructor(arr, self.d)
        self.assertEqual(graph.compute_clique_number(self.d), 1)

    def test_fully_connected(self):
        arr = [1, 2, 3, 4]  # every pairwise distance is below d = 3.5
        graph = distance_graph_constructor(arr, d=3.5)
        self.assertEqual(graph.compute_clique_number(d=3.5), 4)

    def test_optimal_case(self):
        # For arr = [1.0, 1.5, 2.0, 2.6, 3.1, 4.0] with d = 1.0 the largest clique
        # only has size 2, so a clearer example with a size-3 clique is used instead.
        arr = [1.0, 1.9, 2.0, 2.1, 3.0]
        d = 1.0
        graph = distance_graph_constructor(arr, d)
        self.assertEqual(graph.compute_clique_number(d), 3)  # the clique {1.9, 2.0, 2.1}

if __name__ == '__main__':
    unittest.main()