# src.py
# (C)Copyright Brian Zheng 2019, all rights reserved
# This file is under the Apache 2.0 license, for more information, visit the LICENSE.txt file
from scipy.spatial.distance import euclidean
import collections
import numpy as np
class main:
    """K-Best-Average algorithm (in development), a lazy-learning predictor.

    ``train`` only stores a mapping from x coordinates to y coordinates.
    ``predict`` measures the Euclidean distance from the query point to every
    stored x, optionally keeps only the cases closer than ``K``, optionally
    keeps only the ``n`` nearest of those, and returns the average of their
    y values.  ``n`` and ``K`` may be used together: the ``n`` nearest cases
    within ``K`` are averaged.
    """

    def __init__(self):
        # y values used by the most recent predict() call.
        self.avg_list = []
        # Training cases; replaced by train().
        self.xydict = {}
        self.dist_list = []

    def find_outlier(self, ds):
        """Return the values of ``ds`` outside the 1.5 * IQR fences.

        Args:
            ds: Iterable of numbers.

        Returns:
            The outliers in ascending order (an empty list when ``ds`` is
            empty or has no outliers).  The sorted data and the result are
            also stored on ``self.ds`` and ``self.o``.
        """
        self.ds = sorted(ds)
        self.o = []
        if not self.ds:
            # np.percentile has no meaningful answer for empty input.
            return self.o
        q1, q3 = np.percentile(self.ds, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        self.o = [v for v in self.ds if v < lower_bound or v > upper_bound]
        return self.o

    def split_data(self, input_data, p):
        """Split ``input_data`` into a leading fraction ``p`` and the rest.

        Args:
            input_data: Mutable sequence (e.g. a list).  It is modified in
                place: the leading items are removed from it.
            p: Fraction (0 <= p <= 1) of the items that go in the first part;
                the count is ``int(len(input_data) * p)``.

        Returns:
            ``(head, input_data)`` where ``head`` holds the leading items and
            ``input_data`` is the same object, now holding only the remainder.

        Raises:
            ValueError: If ``p`` is greater than 1 or negative.
        """
        if p > 1:
            raise ValueError("p value should not be bigger than 1")
        if p < 0:
            raise ValueError("p value should not be negative")
        cut = int(len(input_data) * p)
        head = input_data[0:cut]
        # Dropping the prefix directly gives the same result as removing each
        # head item by value, without the repeated linear searches.
        del input_data[:cut]
        self.ir = [head]
        return head, input_data

    def train(self, xydict):
        """Store the training cases.

        This is a lazy-learning algorithm, so nothing is computed here.

        Args:
            xydict: Mapping from x coordinates (a tuple) to y coordinates
                (a sequence of numbers).
        """
        self.xydict = xydict
        self.dist_list = []

    def predict(self, input_x, n=None, K=None):
        """Predict y for ``input_x`` by averaging the y of nearby cases.

        Args:
            input_x: Either the coordinates of the query point as a flat
                sequence of numbers, or a sequence of such points.  When
                several points are given only the last one is used, which is
                what earlier versions of this method did.
            n: Number of nearest cases to average.  ``None`` means every case
                that passes the ``K`` filter.
            K: Distance limit; only cases with distance strictly less than
                ``K`` are considered.  ``None`` means no limit.

        Returns:
            A tuple holding the element-wise average of the selected y values.

        Raises:
            ValueError: If no cases have been trained, ``input_x`` is empty,
                ``n`` is not positive, ``n`` exceeds the number of cases, or
                too few cases lie within ``K``.
        """
        if not self.xydict:
            raise ValueError("train() must be called with at least one case first")
        if len(input_x) == 0:
            raise ValueError("input_x must hold the coordinates to predict")

        # A scalar first element means input_x itself is one coordinate tuple.
        if np.ndim(input_x[0]) == 0:
            point = input_x
        else:
            point = input_x[-1]

        self.input_x = input_x
        self.n = n
        self.K = K

        if n is not None:
            if n < 1:
                raise ValueError("n must be a positive integer")
            if n > len(self.xydict):
                raise ValueError("n is larger than the total amount of cases")

        target = list(point)
        self.dist_dict = {x: euclidean(list(x), target) for x in self.xydict}

        # Nearest cases first; sorted() is stable, so ties keep training order.
        ranked = sorted(self.dist_dict, key=self.dist_dict.get)
        if K is not None:
            ranked = [x for x in ranked if self.dist_dict[x] < K]

        if n is not None:
            if len(ranked) < n:
                raise ValueError("There are not enough cases to satify n")
            ranked = ranked[:n]
        elif not ranked:
            raise ValueError("There are no cases within K of input_x")

        # Rebuilt on every call so earlier predictions cannot leak into this one.
        self.avg_list = [list(self.xydict[x]) for x in ranked]
        data = np.array(self.avg_list)
        return tuple(np.average(data, axis=0).tolist())