239 changes: 239 additions & 0 deletions src/normal_laplace/experiments_second_part.ipynb
@@ -0,0 +1,239 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyMfhI0b9BxEs30LTZmx5w15",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/maxGrigorenko/DistributionClassifier/blob/maxGrigorenko%2Fsecond_part/src/normal_laplace/experiments_second_part.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# Классификация распределений. Второая часть проекта,\n",
"## Построение собственного классификатора"
],
"metadata": {
"id": "BERluZVDPDN9"
}
},
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from tqdm import tqdm\n",
"from itertools import product\n",
"from scipy import stats as st\n",
"\n",
"from graph_common_functions import *\n",
"from distibution_functions import *"
],
"metadata": {
"id": "ghcSALa8Pc8C"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"id": "_ooxQ-1SNLsK"
},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score, precision_score, recall_score\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"class DistribituionClassifier:\n",
" def __init__(self, n, clf, sigma=1, beta=(1/2) ** 0.5):\n",
" self.n = n\n",
" self.sigma = sigma\n",
" self.beta = beta\n",
" self.clf = clf\n",
"\n",
"\n",
" def make_data(self, number_of_experiments):\n",
" n = self.n\n",
"\n",
" data = pd.DataFrame()\n",
"\n",
" sub_data = []\n",
" features = ['delta', 'mean_degree', 'max_degree', 'dominating_number', 'clique_number']\n",
"\n",
" for disrtibution in 'normal', 'laplace':\n",
" for _ in range(number_of_experiments):\n",
" if disrtibution == 'normal':\n",
" array = generate_normal(self.sigma, n)\n",
" else:\n",
" array = generate_laplace(self.beta, n)\n",
"\n",
" features_d = []\n",
"\n",
" for d in [0.3, 1.0, 2.0, 3.5]:\n",
" g = distance_graph_constructor(array, d)\n",
" delta = g.compute_delta()\n",
" mean_degree = g.compute_mean_degree()\n",
" max_degree = g.compute_max_degree()\n",
" dominating_number = g.compute_dominating_number(d)\n",
" clique_number = g.compute_clique_number(d)\n",
" features_d.append([delta, mean_degree, max_degree, dominating_number, clique_number])\n",
"\n",
" params = dict()\n",
" for i in range(len(features_d)):\n",
" for j, f in enumerate(features):\n",
" params.update({f\"{f}_{i}\": features_d[i][j]})\n",
"\n",
" params.update({\"distribution\": disrtibution})\n",
" sub_data.append(params)\n",
"\n",
" data = pd.concat([data, pd.DataFrame(sub_data)], ignore_index=True)\n",
" return data\n",
"\n",
" def fit(self, number_of_experiments=1000):\n",
" data = self.make_data(number_of_experiments=number_of_experiments)\n",
" X = data.drop('distribution', axis=1)\n",
" y = data['distribution']\n",
" encoder = LabelEncoder()\n",
" encoder.fit(y)\n",
" y = encoder.transform(y)\n",
" self.clf.fit(X, y)\n",
"\n",
" def predict(self, X):\n",
" return self.clf.predict(X)\n",
"\n",
" def estimate(self, number_of_test=100):\n",
" data = self.make_data(number_of_experiments=number_of_test)\n",
" X_test = data.drop('distribution', axis=1)\n",
" y = data['distribution']\n",
" encoder = LabelEncoder()\n",
" encoder.fit(y)\n",
" y_test = encoder.transform(y)\n",
" y_pred = np.array(self.clf.predict(X_test))\n",
" acc = accuracy_score(y_test, y_pred)\n",
" precision = precision_score(y_test, y_pred)\n",
" recall = recall_score(y_test, y_pred)\n",
" return {\n",
" 'accuracy': acc,\n",
" 'precision': precision,\n",
" 'recall': recall\n",
" }\n"
]
},
{
"cell_type": "markdown",
"source": [
"Протестируем с классификатором RandomForest:"
],
"metadata": {
"id": "Bq_sFQ4OvQiz"
}
},
{
"cell_type": "code",
"source": [
"d_clf_25 = DistribituionClassifier(n=25, clf=RandomForestClassifier())\n",
"d_clf_25.fit(number_of_experiments=5_000)\n",
"d_clf_25.estimate(1_000)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "a1On32a5NV--",
"outputId": "7f628d4f-6b6f-450b-a07a-ba2310b377a5"
},
"execution_count": 39,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'accuracy': 0.759, 'precision': 0.7549212598425197, 'recall': 0.767}"
]
},
"metadata": {},
"execution_count": 39
}
]
},
{
"cell_type": "code",
"source": [
"d_clf_100 = DistribituionClassifier(n=100, clf=RandomForestClassifier())\n",
"d_clf_100.fit(number_of_experiments=5_000)\n",
"d_clf_100.estimate(1_000)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "x2F7iUl-Znvh",
"outputId": "d5719743-c6f3-4d5a-a0fc-b8205e818c3c"
},
"execution_count": 40,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'accuracy': 0.9475, 'precision': 0.9461615154536391, 'recall': 0.949}"
]
},
"metadata": {},
"execution_count": 40
}
]
},
{
"cell_type": "code",
"source": [
"d_clf_500 = DistribituionClassifier(n=500, clf=RandomForestClassifier())\n",
"d_clf_500.fit(number_of_experiments=1_000)\n",
"d_clf_500.estimate(500)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "IbF8mh7iu7xL",
"outputId": "503e9ab7-6364-497f-9edf-2fdc14bfb479"
},
"execution_count": 41,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0}"
]
},
"metadata": {},
"execution_count": 41
}
]
}
]
}
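
The notebook imports generate_normal and generate_laplace from distibution_functions, a module not shown in this diff. Below is a minimal sketch of what those helpers are assumed to do (the names and signatures come from the notebook; the bodies are an assumption, not the repository's implementation). With the defaults sigma=1 and beta=(1/2)**0.5 both laws are centred with unit variance (the Laplace variance is 2*beta**2), so the classifier has to separate the samples by shape, via the 20 graph features computed at the thresholds d in {0.3, 1.0, 2.0, 3.5}, rather than by scale.

import numpy as np

def generate_normal(sigma, n):
    # assumed helper: n i.i.d. draws from N(0, sigma^2)
    return np.random.normal(loc=0.0, scale=sigma, size=n)

def generate_laplace(beta, n):
    # assumed helper: n i.i.d. draws from Laplace(0, beta); variance is 2 * beta**2
    return np.random.laplace(loc=0.0, scale=beta, size=n)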
24 changes: 24 additions & 0 deletions src/normal_laplace/graph_common_functions.py
@@ -23,6 +23,18 @@ def compute_delta(self):
            delta = min(delta, len(self.graph[v]))
        return delta

    def compute_max_degree(self):
        max_degree = 0
        for v in self.vertices:
            max_degree = max(max_degree, len(self.graph[v]))
        return max_degree

    def compute_mean_degree(self):
        s = 0
        for v in self.vertices:
            s += len(self.graph[v])
        return s / len(self.vertices)

    def find_connected_components(self):
        visited = set()
        components = []
@@ -70,6 +82,18 @@ def compute_dominating_number(self, d):

        return count

    def compute_clique_number(self, d):
        sorted_points = self.get_sorted_vertices()
        n = len(sorted_points)
        max_clique = 0
        left = 0

        for right in range(n):
            while sorted_points[right].value - sorted_points[left].value >= d:
                left += 1
            max_clique = max(max_clique, right - left + 1)

        return max_clique

def knn_graph_constructor(arr, k):
    points = [Point(name=i, value=val) for i, val in enumerate(arr)]
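
In a 1-D distance graph two vertices are adjacent when their values differ by less than d (the convention the sliding window and the tests below use), so a clique is exactly a set of points whose total spread is below d, i.e. a contiguous window of the sorted values; that is why compute_clique_number can use a two-pointer sweep instead of a general (NP-hard) clique search. A standalone sketch of the same idea on raw values, where max_clique_1d is a hypothetical helper and not part of the repository:

def max_clique_1d(values, d):
    pts = sorted(values)
    best, left = 0, 0
    for right in range(len(pts)):
        # shrink the window until its spread is strictly below d
        while pts[right] - pts[left] >= d:
            left += 1
        best = max(best, right - left + 1)
    return best

print(max_clique_1d([1.0, 1.9, 2.0, 2.1, 3.0], 1.0))  # 3, the clique {1.9, 2.0, 2.1}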
41 changes: 39 additions & 2 deletions tests/test_dominating_number.py
@@ -2,9 +2,9 @@
import sys
import os

sys.path.append(os.path.join(os.path.dirname(__file__), '../src/normal_laplace'))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from graph_common_functions import Graph, distance_graph_constructor
from src.normal_laplace.graph_common_functions import Graph, distance_graph_constructor

class TestDominatingNumber(unittest.TestCase):
    def setUp(self):
@@ -36,5 +36,42 @@ def test_optimal_case(self):
        graph = distance_graph_constructor(arr, d)
        self.assertEqual(graph.compute_dominating_number(d), 2)

class TestCliqueNumber(unittest.TestCase):
    def setUp(self):
        self.d = 2.0

    def test_empty_graph(self):
        arr = []
        graph = distance_graph_constructor(arr, self.d)
        self.assertEqual(graph.compute_clique_number(self.d), 0)

    def test_single_node(self):
        arr = [5]
        graph = distance_graph_constructor(arr, self.d)
        self.assertEqual(graph.compute_clique_number(self.d), 1)

    def test_all_isolated(self):
        arr = [1, 3, 5, 7]  # all points are at least 2 apart, so there are no edges at d = 2
        graph = distance_graph_constructor(arr, self.d)
        self.assertEqual(graph.compute_clique_number(self.d), 1)

    def test_fully_connected(self):
        arr = [1, 2, 3, 4]  # every pairwise distance is below d = 3.5
        graph = distance_graph_constructor(arr, d=3.5)
        self.assertEqual(graph.compute_clique_number(d=3.5), 4)

    def test_optimal_case(self):
        # For arr = [1.0, 1.5, 2.0, 2.6, 3.1, 4.0] with d = 1.0 the largest clique
        # only has size 2, so a clearer example with a size-3 clique is used instead.
        arr = [1.0, 1.9, 2.0, 2.1, 3.0]
        d = 1.0
        graph = distance_graph_constructor(arr, d)
        self.assertEqual(graph.compute_clique_number(d), 3)  # the clique {1.9, 2.0, 2.1}

if __name__ == '__main__':
    unittest.main()