Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 104 additions & 44 deletions BaseML/Classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,21 @@
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
import joblib
import random

class cls:
def __init__(self, algorithm='KNN', n_neighbors=5, n_estimators=100, N_CLUSTERS=5):
self.algorithm = algorithm
self.cwd = os.path.dirname(os.getcwd()) # 获取当前文件的绝对路径
self.file_dirname = os.path.dirname(os.path.abspath(__file__))
self.dataset_path = ' '
self.test_size = ' '
self.test_size = 0.2
self.test_set = ' '
self.x_train, self.x_test, self.y_train, self.y_test = 0, 0, 0, 0
if self.algorithm == 'KNN':
Expand All @@ -34,75 +36,133 @@ def __init__(self, algorithm='KNN', n_neighbors=5, n_estimators=100, N_CLUSTERS=
elif self.algorithm == 'Kmeans':
self.n = N_CLUSTERS
self.model = KMeans(self.n)
elif self.algorithm == 'MLP':
self.model = MLPClassifier(solver='lbfgs')

def train(self, seed=0, data_type='csv'):
if self.algorithm == 'AdaBoost' or 'SVM' or 'NaiveBayes':
if self.algorithm in ['AdaBoost','SVM','NaiveBayes', 'MLP']:
np.random.seed(seed)
if data_type == 'csv':
dataset = pd.read_csv(self.dataset_path, sep=',', header=None).values
np.random.shuffle(dataset)

data, label = dataset[:, :-1], dataset[:, -1]
train_index = int((1 - self.test_size) * len(dataset))
train_data, train_label = data[:train_index, ], label[:train_index]
self.test_set = {
'data': data[train_index:, ],
'label': label[train_index:]
}
self.model.fit(train_data, train_label)
np.random.shuffle(self.dataset)

self.x_train, self.x_test, self.y_train, self.y_test = \
train_test_split(self.x_train, self.y_train, test_size=0.2, random_state=0)
self.model.fit(self.x_train, self.y_train)

elif self.algorithm == 'CART':
self.model.fit(self.dataset)
print(self.model.explained_variance_ratio_)
self.model.fit(self.x_train, self.y_train)
# print(self.model.explained_variance_ratio_)
# 返回所保留的n个成分各自的方差百分比,这里可以理解为单个变量方差贡献率。

elif self.algorithm == 'KNN':
self.x_train, self.x_test, self.y_train, self.y_test = \
train_test_split(self.dataset['data'], self.dataset['target'], test_size=0.2, random_state=0)
train_test_split(self.x_train, self.y_train, test_size=0.2, random_state=0)
self.model.fit(self.x_train, self.y_train)
acc = self.model.score(self.x_test, self.y_test)
print('准确率为:{}%'.format(acc * 100))

elif self.algorithm == 'Kmeans':
self.dataset = pd.read_csv(self.dataset_path)
self.x_train = self.dataset.drop('省级行政区', axis=1)
self.x_train = self.x_train.drop("城市", axis=1)
self.x_np = np.array(self.x_train)
self.model.fit(self.x_np)

def inference(self, data):
if self.algorithm == 'AdaBoost' or 'SVM' or 'NaiveBayes':
pred = self.model.predict(self.test_set['data'])
acc = accuracy_score(self.test_set['label'], pred)
print('准确率为:{}%'.format(acc * 100))
# 对列数据进行文本过滤,只抽取有数据的列
delete_list = []
if self.x_train.ndim >= 2:
for col_idx in range(self.x_train.shape[1]):
col = self.x_train[:,col_idx]
# 随机取一个元素,查看其type
if isinstance(random.choice(col),str):
delete_list.append(col_idx)

elif self.algorithm == 'KNN':
result = self.model.predict(data)
print(result)
print("分类结果:{}".format(self.dataset['target_names'][result]))
self.x_train = np.delete(self.x_train, delete_list, axis=1)
self.model.fit(self.x_train)

elif self.algorithm == 'CART':
self.model.fit_transform(data)
print(self.model.n_features_)
print(self.model.n_samples_)
def inference(self, data = np.nan):
if data is not np.nan: # 对data进行了指定
self.x_test = data

if self.algorithm in ['AdaBoost','SVM','NaiveBayes', 'MLP','KNN','CART']:
pred = self.model.predict(self.x_test)
return pred

elif self.algorithm == 'Kmeans':
labels = self.model.labels_ # 获取聚类标签
print(silhouette_score(self.x_np, labels)) # 获取聚类结果总的轮廓系数
print(silhouette_score(self.x_train, labels)) # 获取聚类结果总的轮廓系数
print(self.model.cluster_centers_) # 输出类簇中心
for i in range(self.n):
print(f" CLUSTER-{i+1} ".center(60, '='))
print(self.dataset[labels == i])

def load_dataset(self, path, test_size=0.2, dataset=''):
self.dataset_path = path
self.test_size = test_size
self.dataset=dataset
pred = self.model.predict(self.x_test)
return pred

def save(self):

# 从文件加载数据集,支持csv文件和txt文件
def load_dataset_from_file(self, path, x_column=None, y_column=None):
    """Load a table from a ``.csv`` or ``.txt`` file and hand it to ``get_data``.

    The file format is chosen from the extension of ``path``. The whole table
    is stored on ``self.dataset`` and passed to ``self.get_data`` as both the
    feature source and the label source, so ``x_column`` and ``y_column`` are
    column indices into the full table.

    Args:
        path: Path of the file to read. ``.csv`` files are read with
            ``pd.read_csv`` (first row is the header); ``.txt`` files are
            read with ``np.loadtxt``.
        x_column: Column indices of the features. ``None`` means no columns.
        y_column: Column indices of the labels. ``None`` means no columns.

    Raises:
        ValueError: If ``path`` does not end in ``.csv`` or ``.txt``.
    """
    x_column = [] if x_column is None else x_column
    y_column = [] if y_column is None else y_column

    lowered = str(path).lower()
    if lowered.endswith('.csv'):
        table = pd.read_csv(path).values  # .values yields a NumPy array
    elif lowered.endswith('.txt'):
        # ndmin=2 keeps a single-row file two-dimensional so that column
        # indexing in get_data still works.
        table = np.loadtxt(path, ndmin=2)
    else:
        raise ValueError("不支持的文件格式,仅支持csv和txt文件: {}".format(path))

    self.dataset = table
    # Column selection happens once, inside get_data; selecting here as well
    # would index the already-reduced array with the original indices.
    self.get_data(table, table, x_column, y_column)

# 从数据加载数据集,支持['numpy','list','DataFrame']
def load_dataset_from_data(self, X, y=None, x_column=None, y_column=None):
    """Load in-memory features (and optional labels) and hand them to ``get_data``.

    ``X`` and ``y`` are converted to NumPy arrays independently, so they may
    be of different container types (for example a DataFrame with a list of
    labels). ``y`` may be omitted for unsupervised data.

    Args:
        X: Two-dimensional features as a list/tuple, ``np.ndarray`` or
            ``pd.DataFrame``.
        y: Labels as a list/tuple, ``np.ndarray``, ``pd.Series`` or
            ``pd.DataFrame``; ``None`` when there are no labels.
        x_column: Column indices of ``X`` to keep. Empty or ``None`` keeps
            every column.
        y_column: Column indices of ``y`` to keep. Empty or ``None`` keeps
            every column.

    Raises:
        TypeError: If ``X`` or ``y`` is not one of the supported containers.
        ValueError: If ``X`` is not two-dimensional.
    """
    def _as_array(data):
        # Normalise every supported container to a NumPy array.
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.values
        if isinstance(data, (list, tuple, np.ndarray)):
            return np.asarray(data)
        raise TypeError("不支持的数据格式,无法加载")

    X = _as_array(X)
    if X.ndim != 2:
        raise ValueError("X must be two-dimensional, got ndim={}".format(X.ndim))
    if x_column is None or len(x_column) == 0:
        x_column = list(range(X.shape[1]))

    if y is None:
        # A zero-column placeholder lets get_data run its normal path and
        # leaves the combined dataset equal to the selected features.
        y = np.empty((X.shape[0], 0))
        y_column = []
    else:
        y = _as_array(y)
        if y_column is None or len(y_column) == 0:
            # get_data reshapes one-dimensional labels into a single column.
            y_column = list(range(1 if y.ndim == 1 else y.shape[1]))

    self.get_data(X, y, x_column, y_column)



# Supported `type` values are ['csv', 'numpy', 'pandas', 'list', 'txt']; the data is always converted to NumPy format afterwards
def load_dataset(self, X, y=None, type=None, x_column=None, y_column=None):
    """Load a dataset from a file path or from in-memory data.

    For the file formats the whole table is stored on ``self.dataset`` and
    passed to ``self.get_data`` as both feature and label source. For the
    in-memory formats ``X`` and ``y`` are converted to NumPy arrays and passed
    to ``self.get_data`` separately.

    Args:
        X: A file path when ``type`` is ``'csv'`` or ``'txt'``; otherwise the
            feature data (list, ``np.ndarray`` or ``pd.DataFrame``).
        y: Label data for the in-memory formats; ``None`` when there are no
            labels. Ignored for the file formats.
        type: One of ``'csv'``, ``'txt'``, ``'numpy'``, ``'pandas'``,
            ``'list'``. When ``None`` it is inferred: a string ``X`` is
            treated as a path and its extension is used, anything else is
            treated as in-memory data.
        x_column: Column indices of the features; must not be empty.
        y_column: Column indices of the labels.

    Raises:
        ValueError: If ``x_column`` is empty or ``type`` is not supported.
    """
    x_column = [] if x_column is None else x_column
    y_column = [] if y_column is None else y_column
    if len(x_column) == 0:
        raise ValueError("请传入数据列号")

    # The parameter name shadows the builtin, so work with a local alias.
    kind = type
    if kind is None:
        kind = X.rsplit('.', 1)[-1].lower() if isinstance(X, str) else 'numpy'

    if kind in ('csv', 'txt'):
        if kind == 'csv':
            table = pd.read_csv(X).values  # .values yields a NumPy array
        else:
            # np.loadtxt already returns an ndarray; ndmin=2 keeps a
            # single-row file two-dimensional for column indexing.
            table = np.loadtxt(X, ndmin=2)
        self.dataset = table
        self.get_data(table, table, x_column, y_column)
        return

    if kind not in ('numpy', 'pandas', 'list'):
        raise ValueError("不支持的type: {}".format(kind))

    features = np.asarray(X)
    if y is None:
        # A zero-column placeholder lets get_data run its normal path when
        # the data has no labels.
        labels = np.empty((len(features), 0))
        y_column = []
    else:
        labels = np.asarray(y)
    self.get_data(features, labels, x_column, y_column)


def save(self, path="checkpoint.pkl"):
    """Write ``self.model`` to ``path`` with ``joblib.dump`` (``compress=3``).

    Args:
        path: Destination file for the serialised model.
    """
    print("Saving model checkpoints...")
    # Write only to the requested path; a second dump to a hard-coded
    # '../checkpoint.pkl' would ignore the argument and create a stray file.
    joblib.dump(self.model, path, compress=3)
    print("Saved successfully!")


def load(self, path):
    """Replace ``self.model`` with the object stored at ``path``.

    Args:
        path: File previously written with ``joblib.dump``.
    """
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code; only load checkpoints from trusted sources.
    # The file is read once and the result is kept; a bare joblib.load(path)
    # would deserialise it a second time and discard the result.
    self.model = joblib.load(path)

def get_data(self, X, y, x_column, y_column):
    """Select feature and label columns and store them on the instance.

    Sets ``self.x_train`` to the ``x_column`` columns of ``X`` and
    ``self.y_train`` to the ``y_column`` columns of ``y``. When both were
    selected in this call, ``self.dataset`` becomes their column-wise
    concatenation.

    Args:
        X: Two-dimensional feature data; skipped when ``None`` or empty.
        y: Label data; one-dimensional labels are reshaped to one column.
            Skipped when ``None`` or empty.
        x_column: Column indices to take from ``X``.
        y_column: Column indices to take from ``y``.
    """
    features = None
    if X is not None and len(X):
        features = np.asarray(X)[:, x_column]
        self.x_train = features

    # No labels (e.g. clustering data): leave y_train and dataset untouched.
    if y is None or not len(y):
        return

    labels = np.asarray(y)
    if labels.ndim == 1:
        labels = labels.reshape(-1, 1)
    labels = labels[:, y_column]
    self.y_train = labels

    # Concatenate only what was selected in this call, so a stale x_train
    # from an earlier load is never paired with new labels.
    if features is not None and labels.shape[0]:
        self.dataset = np.concatenate((features, labels), axis=1)  # join by column
64 changes: 60 additions & 4 deletions BaseML/Regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,66 @@ def inference(self, data):
print(self.model.n_features_)
print(self.model.n_samples_)

def load_dataset(self,path,test_size=0.2, dataset=''):
self.dataset_path = path
self.test_size = test_size
self.dataset = dataset
# 从文件加载数据集,支持csv文件和txt文件
def load_dataset_from_file(self, path, x_column=None, y_column=None):
    """Load a table from a ``.csv`` or ``.txt`` file and hand it to ``get_data``.

    The file format is chosen from the extension of ``path``. The whole table
    is stored on ``self.dataset`` and passed to ``self.get_data`` as both the
    feature source and the label source, so ``x_column`` and ``y_column`` are
    column indices into the full table.

    Args:
        path: Path of the file to read. ``.csv`` files are read with
            ``pd.read_csv`` (first row is the header); ``.txt`` files are
            read with ``np.loadtxt``.
        x_column: Column indices of the features. ``None`` means no columns.
        y_column: Column indices of the labels. ``None`` means no columns.

    Raises:
        ValueError: If ``path`` does not end in ``.csv`` or ``.txt``.
    """
    x_column = [] if x_column is None else x_column
    y_column = [] if y_column is None else y_column

    lowered = str(path).lower()
    if lowered.endswith('.csv'):
        table = pd.read_csv(path).values  # .values yields a NumPy array
    elif lowered.endswith('.txt'):
        # ndmin=2 keeps a single-row file two-dimensional so that column
        # indexing in get_data still works.
        table = np.loadtxt(path, ndmin=2)
    else:
        raise ValueError("不支持的文件格式,仅支持csv和txt文件: {}".format(path))

    self.dataset = table
    # The loaded table is the source for both features and labels; get_data
    # performs the column selection.
    self.get_data(table, table, x_column, y_column)

# 从数据加载数据集,支持['numpy','list','DataFrame']
def load_dataset_from_data(self, X, y=None, x_column=None, y_column=None):
    """Load in-memory features (and optional labels) and hand them to ``get_data``.

    ``X`` and ``y`` are converted to NumPy arrays independently, so they may
    be of different container types (for example a DataFrame with a list of
    targets). ``y`` may be omitted.

    Args:
        X: Two-dimensional features as a list/tuple, ``np.ndarray`` or
            ``pd.DataFrame``.
        y: Targets as a list/tuple, ``np.ndarray``, ``pd.Series`` or
            ``pd.DataFrame``; ``None`` when there are none.
        x_column: Column indices of ``X`` to keep. Empty or ``None`` keeps
            every column.
        y_column: Column indices of ``y`` to keep. Empty or ``None`` keeps
            every column.

    Raises:
        TypeError: If ``X`` or ``y`` is not one of the supported containers.
        ValueError: If ``X`` is not two-dimensional.
    """
    def _as_array(data):
        # Normalise every supported container to a NumPy array.
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.values
        if isinstance(data, (list, tuple, np.ndarray)):
            return np.asarray(data)
        raise TypeError("不支持的数据格式,无法加载")

    X = _as_array(X)
    if X.ndim != 2:
        raise ValueError("X must be two-dimensional, got ndim={}".format(X.ndim))
    if x_column is None or len(x_column) == 0:
        x_column = list(range(X.shape[1]))

    if y is None:
        # A zero-column placeholder lets get_data run its normal path and
        # leaves the combined dataset equal to the selected features.
        y = np.empty((X.shape[0], 0))
        y_column = []
    else:
        y = _as_array(y)
        if y_column is None or len(y_column) == 0:
            # get_data reshapes one-dimensional targets into a single column.
            y_column = list(range(1 if y.ndim == 1 else y.shape[1]))

    self.get_data(X, y, x_column, y_column)



# Supported `type` values are ['csv', 'numpy', 'pandas', 'list', 'txt']; the data is always converted to NumPy format afterwards
def load_dataset(self, X, y=None, type=None, x_column=None, y_column=None):
    """Load a dataset from a file path or from in-memory data.

    For the file formats the whole table is stored on ``self.dataset`` and
    passed to ``self.get_data`` as both feature and target source. For the
    in-memory formats ``X`` and ``y`` are converted to NumPy arrays and passed
    to ``self.get_data`` separately.

    Args:
        X: A file path when ``type`` is ``'csv'`` or ``'txt'``; otherwise the
            feature data (list, ``np.ndarray`` or ``pd.DataFrame``).
        y: Target data for the in-memory formats; ``None`` when there is
            none. Ignored for the file formats.
        type: One of ``'csv'``, ``'txt'``, ``'numpy'``, ``'pandas'``,
            ``'list'``. When ``None`` it is inferred: a string ``X`` is
            treated as a path and its extension is used, anything else is
            treated as in-memory data.
        x_column: Column indices of the features; must not be empty.
        y_column: Column indices of the targets.

    Raises:
        ValueError: If ``x_column`` is empty or ``type`` is not supported.
    """
    x_column = [] if x_column is None else x_column
    y_column = [] if y_column is None else y_column
    if len(x_column) == 0:
        raise ValueError("请传入数据列号")

    # The parameter name shadows the builtin, so work with a local alias.
    kind = type
    if kind is None:
        kind = X.rsplit('.', 1)[-1].lower() if isinstance(X, str) else 'numpy'

    if kind in ('csv', 'txt'):
        if kind == 'csv':
            table = pd.read_csv(X).values  # .values yields a NumPy array
        else:
            # np.loadtxt already returns an ndarray; ndmin=2 keeps a
            # single-row file two-dimensional for column indexing.
            table = np.loadtxt(X, ndmin=2)
        self.dataset = table
        # Pass the loaded table, not the path held in X, to get_data.
        self.get_data(table, table, x_column, y_column)
        return

    if kind not in ('numpy', 'pandas', 'list'):
        raise ValueError("不支持的type: {}".format(kind))

    features = np.asarray(X)
    if y is None:
        # A zero-column placeholder lets get_data run its normal path when
        # no targets are supplied.
        labels = np.empty((len(features), 0))
        y_column = []
    else:
        labels = np.asarray(y)
    self.get_data(features, labels, x_column, y_column)

def get_data(self, X, y, x_column, y_column):
    """Select feature and target columns and store them on the instance.

    Sets ``self.x_train`` to the ``x_column`` columns of ``X`` and
    ``self.y_train`` to the ``y_column`` columns of ``y``. When both were
    selected in this call, ``self.dataset`` becomes their column-wise
    concatenation.

    Args:
        X: Two-dimensional feature data; skipped when ``None`` or empty.
        y: Target data; one-dimensional targets are reshaped to one column.
            Skipped when ``None`` or empty.
        x_column: Column indices to take from ``X``.
        y_column: Column indices to take from ``y``.
    """
    features = None
    if X is not None and len(X):
        features = np.asarray(X)[:, x_column]
        self.x_train = features

    # No targets supplied: leave y_train and dataset untouched.
    if y is None or not len(y):
        return

    labels = np.asarray(y)
    if labels.ndim == 1:
        labels = labels.reshape(-1, 1)
    labels = labels[:, y_column]
    self.y_train = labels

    # Concatenate only what was selected in this call, so a stale x_train
    # from an earlier load is never paired with new targets.
    if features is not None and labels.shape[0]:
        self.dataset = np.concatenate((features, labels), axis=1)  # join by column


def save(self):
Expand Down
Binary file added BaseML/__pycache__/AdaBoost.cpython-37.pyc
Binary file not shown.
Binary file added BaseML/__pycache__/CART.cpython-37.pyc
Binary file not shown.
Binary file added BaseML/__pycache__/Classification.cpython-37.pyc
Binary file not shown.
Binary file added BaseML/__pycache__/GaussianNB.cpython-37.pyc
Binary file not shown.
Binary file added BaseML/__pycache__/KNNClassifier.cpython-37.pyc
Binary file not shown.
Binary file added BaseML/__pycache__/PCA.cpython-37.pyc
Binary file not shown.
Binary file added BaseML/__pycache__/Perceptron.cpython-37.pyc
Binary file not shown.
Binary file added BaseML/__pycache__/Regression.cpython-37.pyc
Binary file not shown.
Binary file added BaseML/__pycache__/SVM.cpython-37.pyc
Binary file not shown.
Binary file added BaseML/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Loading