From 44847b112c1017f446e59d661f274364da2ebd40 Mon Sep 17 00:00:00 2001 From: MadhuTangudu Date: Sat, 25 Sep 2021 20:55:25 +0530 Subject: [PATCH 01/13] v0.4.0 changelog: - added knn.py and test_knn.py modules - added iris.csv dataset --- data/input/iris.csv | 151 +++++++++++++++++++++++++++++++++++++++++++ mllib/lib/knn.py | 153 ++++++++++++++++++++++++++++++++++++++++++++ tests/test_knn.py | 101 +++++++++++++++++++++++++++++ 3 files changed, 405 insertions(+) create mode 100644 data/input/iris.csv create mode 100644 mllib/lib/knn.py create mode 100644 tests/test_knn.py diff --git a/data/input/iris.csv b/data/input/iris.csv new file mode 100644 index 0000000..d93a29c --- /dev/null +++ b/data/input/iris.csv @@ -0,0 +1,151 @@ +x3,x4,x1,x2,y +5.1,3.5,1.4,0.2,0 +4.9,3.0,1.4,0.2,0 +4.7,3.2,1.3,0.2,0 +4.6,3.1,1.5,0.2,0 +5.0,3.6,1.4,0.2,0 +5.4,3.9,1.7,0.4,0 +4.6,3.4,1.4,0.3,0 +5.0,3.4,1.5,0.2,0 +4.4,2.9,1.4,0.2,0 +4.9,3.1,1.5,0.1,0 +5.4,3.7,1.5,0.2,0 +4.8,3.4,1.6,0.2,0 +4.8,3.0,1.4,0.1,0 +4.3,3.0,1.1,0.1,0 +5.8,4.0,1.2,0.2,0 +5.7,4.4,1.5,0.4,0 +5.4,3.9,1.3,0.4,0 +5.1,3.5,1.4,0.3,0 +5.7,3.8,1.7,0.3,0 +5.1,3.8,1.5,0.3,0 +5.4,3.4,1.7,0.2,0 +5.1,3.7,1.5,0.4,0 +4.6,3.6,1.0,0.2,0 +5.1,3.3,1.7,0.5,0 +4.8,3.4,1.9,0.2,0 +5.0,3.0,1.6,0.2,0 +5.0,3.4,1.6,0.4,0 +5.2,3.5,1.5,0.2,0 +5.2,3.4,1.4,0.2,0 +4.7,3.2,1.6,0.2,0 +4.8,3.1,1.6,0.2,0 +5.4,3.4,1.5,0.4,0 +5.2,4.1,1.5,0.1,0 +5.5,4.2,1.4,0.2,0 +4.9,3.1,1.5,0.2,0 +5.0,3.2,1.2,0.2,0 +5.5,3.5,1.3,0.2,0 +4.9,3.6,1.4,0.1,0 +4.4,3.0,1.3,0.2,0 +5.1,3.4,1.5,0.2,0 +5.0,3.5,1.3,0.3,0 +4.5,2.3,1.3,0.3,0 +4.4,3.2,1.3,0.2,0 +5.0,3.5,1.6,0.6,0 +5.1,3.8,1.9,0.4,0 +4.8,3.0,1.4,0.3,0 +5.1,3.8,1.6,0.2,0 +4.6,3.2,1.4,0.2,0 +5.3,3.7,1.5,0.2,0 +5.0,3.3,1.4,0.2,0 +7.0,3.2,4.7,1.4,1 +6.4,3.2,4.5,1.5,1 +6.9,3.1,4.9,1.5,1 +5.5,2.3,4.0,1.3,1 +6.5,2.8,4.6,1.5,1 +5.7,2.8,4.5,1.3,1 +6.3,3.3,4.7,1.6,1 +4.9,2.4,3.3,1.0,1 +6.6,2.9,4.6,1.3,1 +5.2,2.7,3.9,1.4,1 +5.0,2.0,3.5,1.0,1 +5.9,3.0,4.2,1.5,1 +6.0,2.2,4.0,1.0,1 +6.1,2.9,4.7,1.4,1 +5.6,2.9,3.6,1.3,1 +6.7,3.1,4.4,1.4,1 +5.6,3.0,4.5,1.5,1 +5.8,2.7,4.1,1.0,1 +6.2,2.2,4.5,1.5,1 +5.6,2.5,3.9,1.1,1 +5.9,3.2,4.8,1.8,1 +6.1,2.8,4.0,1.3,1 +6.3,2.5,4.9,1.5,1 +6.1,2.8,4.7,1.2,1 +6.4,2.9,4.3,1.3,1 +6.6,3.0,4.4,1.4,1 +6.8,2.8,4.8,1.4,1 +6.7,3.0,5.0,1.7,1 +6.0,2.9,4.5,1.5,1 +5.7,2.6,3.5,1.0,1 +5.5,2.4,3.8,1.1,1 +5.5,2.4,3.7,1.0,1 +5.8,2.7,3.9,1.2,1 +6.0,2.7,5.1,1.6,1 +5.4,3.0,4.5,1.5,1 +6.0,3.4,4.5,1.6,1 +6.7,3.1,4.7,1.5,1 +6.3,2.3,4.4,1.3,1 +5.6,3.0,4.1,1.3,1 +5.5,2.5,4.0,1.3,1 +5.5,2.6,4.4,1.2,1 +6.1,3.0,4.6,1.4,1 +5.8,2.6,4.0,1.2,1 +5.0,2.3,3.3,1.0,1 +5.6,2.7,4.2,1.3,1 +5.7,3.0,4.2,1.2,1 +5.7,2.9,4.2,1.3,1 +6.2,2.9,4.3,1.3,1 +5.1,2.5,3.0,1.1,1 +5.7,2.8,4.1,1.3,1 +6.3,3.3,6.0,2.5,2 +5.8,2.7,5.1,1.9,2 +7.1,3.0,5.9,2.1,2 +6.3,2.9,5.6,1.8,2 +6.5,3.0,5.8,2.2,2 +7.6,3.0,6.6,2.1,2 +4.9,2.5,4.5,1.7,2 +7.3,2.9,6.3,1.8,2 +6.7,2.5,5.8,1.8,2 +7.2,3.6,6.1,2.5,2 +6.5,3.2,5.1,2.0,2 +6.4,2.7,5.3,1.9,2 +6.8,3.0,5.5,2.1,2 +5.7,2.5,5.0,2.0,2 +5.8,2.8,5.1,2.4,2 +6.4,3.2,5.3,2.3,2 +6.5,3.0,5.5,1.8,2 +7.7,3.8,6.7,2.2,2 +7.7,2.6,6.9,2.3,2 +6.0,2.2,5.0,1.5,2 +6.9,3.2,5.7,2.3,2 +5.6,2.8,4.9,2.0,2 +7.7,2.8,6.7,2.0,2 +6.3,2.7,4.9,1.8,2 +6.7,3.3,5.7,2.1,2 +7.2,3.2,6.0,1.8,2 +6.2,2.8,4.8,1.8,2 +6.1,3.0,4.9,1.8,2 +6.4,2.8,5.6,2.1,2 +7.2,3.0,5.8,1.6,2 +7.4,2.8,6.1,1.9,2 +7.9,3.8,6.4,2.0,2 +6.4,2.8,5.6,2.2,2 +6.3,2.8,5.1,1.5,2 +6.1,2.6,5.6,1.4,2 +7.7,3.0,6.1,2.3,2 +6.3,3.4,5.6,2.4,2 +6.4,3.1,5.5,1.8,2 +6.0,3.0,4.8,1.8,2 +6.9,3.1,5.4,2.1,2 +6.7,3.1,5.6,2.4,2 +6.9,3.1,5.1,2.3,2 +5.8,2.7,5.1,1.9,2 +6.8,3.2,5.9,2.3,2 +6.7,3.3,5.7,2.5,2 +6.7,3.0,5.2,2.3,2 +6.3,2.5,5.0,1.9,2 +6.5,3.0,5.2,2.0,2 +6.2,3.4,5.4,2.3,2 +5.9,3.0,5.1,1.8,2 diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py new file mode 100644 index 0000000..855c6bb --- /dev/null +++ b/mllib/lib/knn.py @@ -0,0 +1,153 @@ +""" +Module for commonly used machine learning modelling algorithms. + +**Available routines:** + +- class ``Knn``: Builds K-Nearest Neighnour model sing cross validation. + +Credits +------- +:: + + Authors: + - Diptesh + - Madhu + + Date: Sep 25, 2021 +""" + +# pylint: disable=invalid-name +# pylint: disable=too-many-arguments +# pylint: disable=too-few-public-methods + +from typing import List, Dict, Any + +import re +import sys +from inspect import getsourcefile +from os.path import abspath + +import pandas as pd + +from sklearn import neighbors as sn +from sklearn.preprocessing import scale +from sklearn.model_selection import GridSearchCV + +path = abspath(getsourcefile(lambda: 0)) +path = re.sub(r"(.+\/)(.+.py)", "\\1", path) +sys.path.insert(0, path) + +class Knn(): + """ K-Nearest Neighbour (KNN) module. + + Objective: + - Build KNN model and determine optimal k + + Parameters + ---------- + :df: pandas.DataFrame + + Pandas dataframe containing the `y_var` and `x_var` + + :y_var: str + + Target variable + + :x_var: list + + List containing independant variables + + :method: str, optional + + Can be either `classify` or `regression` (default is 'classify') + + :k_fold: int, optional + + Number of cross validations folds (default is 5) + + :param: dict, optional + + KNN parameters (the default is None). + In case of None, the parameters will default to:: + + n_neighbors: max(int(len(df)/(k_fold * 2)), 1) + weights: ["uniform", "distance"] + metric: ["euclidean", "manhattan"] + + Methods + ------- + predict + + Example + ------- + >>> mod = Knn(df=df_ip, y_var=["y"], x_var=["x1", "x2", "x3"]) + >>> df_op = mod.predict(df_predict) + + """ + + def __init__(self, + df: pd.DataFrame, + y_var: str, + x_var: List[str], + method: str = "classify", + k_fold: int = 5, + param: Dict = None): + """Initialize variables for module ``Knn``.""" + self.df = df.reset_index(drop=True) + self.y_var = y_var + self.x_var = x_var + self.method = method + self.model = None + self.k_fold = k_fold + if param is None: + max_k = max(int(len(self.df)/(self.k_fold * 2)), 1) + param = {"n_neighbors": list(range(1, max_k, 2)), + "weights": ["uniform", "distance"], + "metric": ["euclidean", "manhattan"]} + self.param = param + self._pre_process() + self._fit() + + def _pre_process(self): + """Pre-process the data, one hot encoding and scaling.""" + df_ip_x = pd.get_dummies(self.df[self.x_var]) + self.x_var = list(df_ip_x.columns) + df_ip_x = pd.DataFrame(scale(df_ip_x)) + df_ip_x.columns = self.x_var + self.df = self.df[[self.y_var]].join(df_ip_x) + + def _fit(self) -> Dict[str, Any]: + """Fit KNN model.""" + if self.method == "classify": + gs = GridSearchCV(sn.KNeighborsClassifier(), + self.param, + verbose=0, + cv=self.k_fold, + n_jobs=1) + elif self.method == "regression": + gs = GridSearchCV(sn.KNeighborsRegressor(), + self.param, + verbose=0, + cv=self.k_fold, + n_jobs=1) + gs_op = gs.fit(self.df[self.x_var], + self.df[self.y_var]) + opt_k = gs_op.best_params_.get("n_neighbors") + weight = gs_op.best_params_.get("weights") + metric = gs_op.best_params_.get("metric") + if self.method == "classify": + model = sn.KNeighborsClassifier(n_neighbors=opt_k, + weights=weight, + metric=metric) + elif self.method == "regression": + model = sn.KNeighborsRegressor(n_neighbors=opt_k, + weights=weight, + metric=metric) + self.model = model.fit(self.df[self.x_var], + self.df[self.y_var]) + return gs_op.best_params_ + + def predict(self, x_pred: pd.DataFrame) -> pd.DataFrame: + """Prediction module.""" + x_pred = pd.DataFrame(scale(pd.get_dummies(x_pred))) + return self.model.predict(x_pred) diff --git a/tests/test_knn.py b/tests/test_knn.py new file mode 100644 index 0000000..2813cbf --- /dev/null +++ b/tests/test_knn.py @@ -0,0 +1,101 @@ +""" +Test suite module for ``knn``. + +Credits +------- +:: + + Authors: + - Diptesh + - Madhu + + Date: Sep 25, 2021 +""" + +# pylint: disable=invalid-name +# pylint: disable=wrong-import-position + +import unittest +import warnings +import re +import sys + +from inspect import getsourcefile +from os.path import abspath + +import pandas as pd + +from sklearn.model_selection import train_test_split as split + +# Set base path +path = abspath(getsourcefile(lambda: 0)) +path = re.sub(r"(.+)(\/tests.*)", "\\1", path) + +sys.path.insert(0, path) + +from mllib.lib.knn import Knn # noqa: F841 + +# ============================================================================= +# --- DO NOT CHANGE ANYTHING FROM HERE +# ============================================================================= + +path = path + "/data/input/" + +# ============================================================================= +# --- User defined functions +# ============================================================================= + + +def ignore_warnings(test_func): + """Suppress deprecation warnings.""" + + def do_test(self, *args, **kwargs): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + test_func(self, *args, **kwargs) + return do_test + + +class Test_Knn(unittest.TestCase): + """Test suite for module ``KNN``.""" + + def setUp(self): + """Set up for module ``KNN``.""" + + def test_knn_class(self): + """ Test KNN classification. + """ + df_ip = pd.read_csv(path + "iris.csv") + df_ip = df_ip[["y", "x1", "x2"]] + df_train, df_test = split(df_ip, + stratify=df_ip["y"], + test_size=0.1, + random_state=42) + mod = Knn(df_train, "y", ["x1", "x2"], method="classify") + y_hat = mod.predict(df_test[["x1", "x2"]]).tolist() + y = df_test["y"].values.tolist() + acc = round(len([i for i, j in zip(y, y_hat) if i == j]) / len(y), 2) + self.assertGreaterEqual(acc, 0.93) + + def test_knn_reg(self): + """ Test KNN regression. + """ + df_ip = pd.read_csv(path + "iris.csv") + df_ip = df_ip[["y", "x1", "x2"]] + df_train, df_test = split(df_ip, + stratify=df_ip["y"], + test_size=0.1, + random_state=42) + mod = Knn(df_train, "y", ["x1", "x2"], method="regression") + y_hat = mod.predict(df_test[["x1", "x2"]]).tolist() + y = df_test["y"].values.tolist() + acc = round(len([i for i, j in zip(y, y_hat) if i == j]) / len(y), 2) + self.assertGreaterEqual(acc, 0.87) + + +# ============================================================================= +# --- Main +# ============================================================================= + +if __name__ == '__main__': + unittest.main() From e0c4f237fd9a32c86618258cf358b60b438421bc Mon Sep 17 00:00:00 2001 From: Diptesh Basak Date: Sat, 25 Sep 2021 21:45:10 +0530 Subject: [PATCH 02/13] v0.4.0 changelog: - added parallel processing by default in KNN module --- mllib/__main__.py | 19 +++++++++++++++++-- mllib/lib/knn.py | 47 +++++++++++++++++++++++++++------------------- mllib/lib/model.py | 2 +- tests/test_knn.py | 12 +++++------- 4 files changed, 51 insertions(+), 29 deletions(-) diff --git a/mllib/__main__.py b/mllib/__main__.py index 2fbd285..7875e9e 100644 --- a/mllib/__main__.py +++ b/mllib/__main__.py @@ -28,6 +28,7 @@ from lib import cfg, utils # noqa: F841 from lib.cluster import Cluster # noqa: F841 from lib.model import GLMNet # noqa: F841 +from lib.knn import KNN # noqa: F841 # ============================================================================= # --- DO NOT CHANGE ANYTHING FROM HERE @@ -52,13 +53,13 @@ CLI.add_argument("-f", "--filename", nargs=1, type=str, - default=["store.csv"], + default=["iris.csv"], help="input csv filename") args = CLI.parse_args() fn_ip = args.filename[0] -fn_ip = "store.csv" +fn_ip = "iris.csv" # ============================================================================= # --- Main @@ -84,6 +85,20 @@ print("\nGLMNet\n") for k, v in glm_mod.model_summary.items(): print(k, str(v).rjust(69 - len(k))) + print(elapsed_time("Time", start_t), + sep="\n") + # --- KNN + start_t = time.time_ns() + df_ip = pd.read_csv(path + "input/iris.csv") + df_ip = df_ip[["y", "x1", "x2"]] + df_train = df_ip.sample(frac=0.8, random_state=42) + df_test = df_ip.drop(df_train.index) + mod = KNN(df_train, "y", ["x1", "x2"], method="classify") + print("\nKNN\n") + y_hat = mod.predict(df_test[["x1", "x2"]]).tolist() + y = df_test["y"].values.tolist() + accuracy = round(len([i for i, j in zip(y, y_hat) if i == j]) / len(y), 2) + print("Accuracy:", accuracy) print(elapsed_time("Time", start_t), sep="\n") # --- EOF diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py index 855c6bb..36c7bc0 100644 --- a/mllib/lib/knn.py +++ b/mllib/lib/knn.py @@ -1,9 +1,9 @@ """ -Module for commonly used machine learning modelling algorithms. +k-NN module. **Available routines:** -- class ``Knn``: Builds K-Nearest Neighnour model sing cross validation. +- class ``KNN``: Builds K-Nearest Neighnour model using cross validation. Credits ------- @@ -37,35 +37,38 @@ path = re.sub(r"(.+\/)(.+.py)", "\\1", path) sys.path.insert(0, path) -class Knn(): - """ K-Nearest Neighbour (KNN) module. + +class KNN(): + """K-Nearest Neighbour (KNN) module. Objective: - - Build KNN model and determine optimal k + - Build + `KNN `_ + model and determine optimal k Parameters ---------- - :df: pandas.DataFrame + df : pandas.DataFrame Pandas dataframe containing the `y_var` and `x_var` - :y_var: str + y_var : str - Target variable + Dependant variable - :x_var: list + x_var : List[str] - List containing independant variables + Independant variables. - :method: str, optional + method : str, optional - Can be either `classify` or `regression` (default is 'classify') + Can be either `classify` or `regression` (the default is classify) - :k_fold: int, optional + k_fold : int, optional - Number of cross validations folds (default is 5) + Number of cross validations folds (the default is 5) - :param: dict, optional + param : dict, optional KNN parameters (the default is None). In case of None, the parameters will default to:: @@ -74,13 +77,19 @@ class Knn(): weights: ["uniform", "distance"] metric: ["euclidean", "manhattan"] + Returns + ------- + model : object + + Final optimal model. + Methods ------- predict Example ------- - >>> mod = Knn(df=df_ip, y_var=["y"], x_var=["x1", "x2", "x3"]) + >>> mod = KNN(df=df_ip, y_var=["y"], x_var=["x1", "x2", "x3"]) >>> df_op = mod.predict(df_predict) """ @@ -92,7 +101,7 @@ def __init__(self, method: str = "classify", k_fold: int = 5, param: Dict = None): - """Initialize variables for module ``Knn``.""" + """Initialize variables for module ``KNN``.""" self.df = df.reset_index(drop=True) self.y_var = y_var self.x_var = x_var @@ -123,13 +132,13 @@ def _fit(self) -> Dict[str, Any]: self.param, verbose=0, cv=self.k_fold, - n_jobs=1) + n_jobs=-1) elif self.method == "regression": gs = GridSearchCV(sn.KNeighborsRegressor(), self.param, verbose=0, cv=self.k_fold, - n_jobs=1) + n_jobs=-1) gs_op = gs.fit(self.df[self.x_var], self.df[self.y_var]) opt_k = gs_op.best_params_.get("n_neighbors") diff --git a/mllib/lib/model.py b/mllib/lib/model.py index a90ebcf..e8adf2a 100644 --- a/mllib/lib/model.py +++ b/mllib/lib/model.py @@ -1,5 +1,5 @@ """ -Module for commonly used machine learning modelling algorithms. +GLMNet module. **Available routines:** diff --git a/tests/test_knn.py b/tests/test_knn.py index 2813cbf..b6a55c8 100644 --- a/tests/test_knn.py +++ b/tests/test_knn.py @@ -33,7 +33,7 @@ sys.path.insert(0, path) -from mllib.lib.knn import Knn # noqa: F841 +from mllib.lib.knn import KNN # noqa: F841 # ============================================================================= # --- DO NOT CHANGE ANYTHING FROM HERE @@ -63,30 +63,28 @@ def setUp(self): """Set up for module ``KNN``.""" def test_knn_class(self): - """ Test KNN classification. - """ + """KNN: Test for classification.""" df_ip = pd.read_csv(path + "iris.csv") df_ip = df_ip[["y", "x1", "x2"]] df_train, df_test = split(df_ip, stratify=df_ip["y"], test_size=0.1, random_state=42) - mod = Knn(df_train, "y", ["x1", "x2"], method="classify") + mod = KNN(df_train, "y", ["x1", "x2"], method="classify") y_hat = mod.predict(df_test[["x1", "x2"]]).tolist() y = df_test["y"].values.tolist() acc = round(len([i for i, j in zip(y, y_hat) if i == j]) / len(y), 2) self.assertGreaterEqual(acc, 0.93) def test_knn_reg(self): - """ Test KNN regression. - """ + """KNN: Test for regression.""" df_ip = pd.read_csv(path + "iris.csv") df_ip = df_ip[["y", "x1", "x2"]] df_train, df_test = split(df_ip, stratify=df_ip["y"], test_size=0.1, random_state=42) - mod = Knn(df_train, "y", ["x1", "x2"], method="regression") + mod = KNN(df_train, "y", ["x1", "x2"], method="regression") y_hat = mod.predict(df_test[["x1", "x2"]]).tolist() y = df_test["y"].values.tolist() acc = round(len([i for i, j in zip(y, y_hat) if i == j]) / len(y), 2) From d28f86978d879f96744b2305a1b6ae193ef48a0c Mon Sep 17 00:00:00 2001 From: Diptesh Basak Date: Sat, 25 Sep 2021 21:47:36 +0530 Subject: [PATCH 03/13] v0.4.0 --- logs/cov.out | 17 +++++++++-------- logs/pip.out | 2 +- logs/pylint/lib-knn-py.out | 4 ++++ logs/pylint/tests-test_knn-py.out | 4 ++++ requirements.txt | 2 +- 5 files changed, 19 insertions(+), 10 deletions(-) create mode 100644 logs/pylint/lib-knn-py.out create mode 100644 logs/pylint/tests-test_knn-py.out diff --git a/logs/cov.out b/logs/cov.out index 5aceeff..6b2ba28 100644 --- a/logs/cov.out +++ b/logs/cov.out @@ -1,8 +1,9 @@ -Name Stmts Miss Cover Missing ------------------------------------------------------ -mllib/__init__.py 7 0 100% -mllib/lib/__init__.py 7 0 100% -mllib/lib/cluster.py 103 0 100% -mllib/lib/model.py 45 0 100% ------------------------------------------------------ -TOTAL 162 0 100% +Name Stmts Miss Cover Missing +----------------------------------------------------------------------------------------- +/media/ph33r/Data/Project/mllib/Git/mllib/__init__.py 7 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/__init__.py 7 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/cluster.py 103 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/knn.py 50 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/model.py 45 0 100% +----------------------------------------------------------------------------------------- +TOTAL 212 0 100% diff --git a/logs/pip.out b/logs/pip.out index 03fb79a..f61bf91 100644 --- a/logs/pip.out +++ b/logs/pip.out @@ -1 +1 @@ -./bin/run_tests.sh: line 78: pipreqs: command not found +INFO: Successfully saved requirements file in /media/ph33r/Data/Project/mllib/Git/requirements.txt diff --git a/logs/pylint/lib-knn-py.out b/logs/pylint/lib-knn-py.out new file mode 100644 index 0000000..b96d3cd --- /dev/null +++ b/logs/pylint/lib-knn-py.out @@ -0,0 +1,4 @@ + +------------------------------------ +Your code has been rated at 10.00/10 + diff --git a/logs/pylint/tests-test_knn-py.out b/logs/pylint/tests-test_knn-py.out new file mode 100644 index 0000000..b96d3cd --- /dev/null +++ b/logs/pylint/tests-test_knn-py.out @@ -0,0 +1,4 @@ + +------------------------------------ +Your code has been rated at 10.00/10 + diff --git a/requirements.txt b/requirements.txt index b593d70..45ef809 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -numpy==1.19.5 Cython==0.29.15 pandas==1.1.3 +numpy==1.19.5 scikit_learn==1.0 From 0f63e4314d0aa0003798f9f722e660d6be462ee6 Mon Sep 17 00:00:00 2001 From: MadhuTangudu Date: Sun, 26 Sep 2021 00:30:00 +0530 Subject: [PATCH 04/13] v0.4.0 changelog: - _compute_metrics method added --- logs/cov.out | 18 ++++----- logs/pip.out | 2 +- logs/pylint/lib-knn-py.out | 9 ++++- logs/pylint/tests-test_knn-py.out | 4 +- mllib/lib/knn.py | 65 ++++++++++++++++++++++++++----- tests/test_knn.py | 11 +++--- 6 files changed, 80 insertions(+), 29 deletions(-) diff --git a/logs/cov.out b/logs/cov.out index 6b2ba28..457d257 100644 --- a/logs/cov.out +++ b/logs/cov.out @@ -1,9 +1,9 @@ -Name Stmts Miss Cover Missing ------------------------------------------------------------------------------------------ -/media/ph33r/Data/Project/mllib/Git/mllib/__init__.py 7 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/__init__.py 7 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/cluster.py 103 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/knn.py 50 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/model.py 45 0 100% ------------------------------------------------------------------------------------------ -TOTAL 212 0 100% +Name Stmts Miss Cover Missing +----------------------------------------------------- +mllib/__init__.py 7 0 100% +mllib/lib/__init__.py 7 0 100% +mllib/lib/cluster.py 103 0 100% +mllib/lib/knn.py 67 0 100% +mllib/lib/model.py 45 0 100% +----------------------------------------------------- +TOTAL 229 0 100% diff --git a/logs/pip.out b/logs/pip.out index f61bf91..03fb79a 100644 --- a/logs/pip.out +++ b/logs/pip.out @@ -1 +1 @@ -INFO: Successfully saved requirements file in /media/ph33r/Data/Project/mllib/Git/requirements.txt +./bin/run_tests.sh: line 78: pipreqs: command not found diff --git a/logs/pylint/lib-knn-py.out b/logs/pylint/lib-knn-py.out index b96d3cd..840218f 100644 --- a/logs/pylint/lib-knn-py.out +++ b/logs/pylint/lib-knn-py.out @@ -1,4 +1,9 @@ +************* Module mllib.lib.knn +knn.py:174:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:175:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:176:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:177:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) ------------------------------------- -Your code has been rated at 10.00/10 +-------------------------------------------------------------------- +Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) diff --git a/logs/pylint/tests-test_knn-py.out b/logs/pylint/tests-test_knn-py.out index b96d3cd..d7495ee 100644 --- a/logs/pylint/tests-test_knn-py.out +++ b/logs/pylint/tests-test_knn-py.out @@ -1,4 +1,4 @@ ------------------------------------- -Your code has been rated at 10.00/10 +-------------------------------------------------------------------- +Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py index 36c7bc0..b7d037c 100644 --- a/mllib/lib/knn.py +++ b/mllib/lib/knn.py @@ -19,6 +19,8 @@ # pylint: disable=invalid-name # pylint: disable=too-many-arguments # pylint: disable=too-few-public-methods +# pylint: disable=R0902 +# pylint: disable=wrong-import-position from typing import List, Dict, Any @@ -28,15 +30,19 @@ from os.path import abspath import pandas as pd +import numpy as np from sklearn import neighbors as sn from sklearn.preprocessing import scale +from sklearn import metrics as sk_metrics + from sklearn.model_selection import GridSearchCV path = abspath(getsourcefile(lambda: 0)) path = re.sub(r"(.+\/)(.+.py)", "\\1", path) sys.path.insert(0, path) +import metrics # noqa: F841 class KNN(): """K-Nearest Neighbour (KNN) module. @@ -102,9 +108,9 @@ def __init__(self, k_fold: int = 5, param: Dict = None): """Initialize variables for module ``KNN``.""" - self.df = df.reset_index(drop=True) self.y_var = y_var self.x_var = x_var + self.df = df[[self.y_var] + self.x_var].reset_index(drop=True) self.method = method self.model = None self.k_fold = k_fold @@ -115,7 +121,9 @@ def __init__(self, "metric": ["euclidean", "manhattan"]} self.param = param self._pre_process() - self._fit() + self.best_params_ = self._fit() + self.model_summary = None + self._compute_metrics() def _pre_process(self): """Pre-process the data, one hot encoding and scaling.""" @@ -128,14 +136,16 @@ def _pre_process(self): def _fit(self) -> Dict[str, Any]: """Fit KNN model.""" if self.method == "classify": - gs = GridSearchCV(sn.KNeighborsClassifier(), - self.param, + gs = GridSearchCV(estimator=sn.KNeighborsClassifier(), + param_grid=self.param, + scoring='accuracy', verbose=0, cv=self.k_fold, n_jobs=-1) elif self.method == "regression": - gs = GridSearchCV(sn.KNeighborsRegressor(), - self.param, + gs = GridSearchCV(estimator=sn.KNeighborsRegressor(), + param_grid=self.param, + scoring='neg_root_mean_squared_error', verbose=0, cv=self.k_fold, n_jobs=-1) @@ -156,7 +166,42 @@ def _fit(self) -> Dict[str, Any]: self.df[self.y_var]) return gs_op.best_params_ - def predict(self, x_pred: pd.DataFrame) -> pd.DataFrame: - """Prediction module.""" - x_pred = pd.DataFrame(scale(pd.get_dummies(x_pred))) - return self.model.predict(x_pred) + def _compute_metrics(self): + """Compute commonly used metrics to evaluate the model.""" + y = self.df.iloc[:, 0].values.tolist() + y_hat = list(self.predict(self.df[self.x_var])["y"].values) + if self.method == "regression": + model_summary = {"rsq": np.round(metrics.rsq(y, y_hat), 3), + "mae": np.round(metrics.mae(y, y_hat), 3), + "mape": np.round(metrics.mape(y, y_hat), 3), + "rmse": np.round(metrics.rmse(y, y_hat), 3)} + model_summary["mse"] = np.round(model_summary["rmse"] ** 2, 3) + if self.method == "classify": + model_summary = {"acc": np.round(\ + sk_metrics.accuracy_score(y, y_hat), 3), + "f1": np.round(\ + sk_metrics.f1_score(y, + y_hat, + average='micro'), 3)} + self.model_summary = model_summary + + def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: + """Predict y_var/target variable. + + Parameters + ---------- + df_predict : pd.DataFrame + + Pandas dataframe containing `x_var`. + + Returns + ------- + pd.DataFrame + + Pandas dataframe containing predicted `y_var` and `x_var`. + """ + df_predict = pd.DataFrame(scale(pd.get_dummies(df_predict))) + y_hat = self.model.predict(df_predict) + df_predict = df_predict.copy() + df_predict["y"] = y_hat + return df_predict diff --git a/tests/test_knn.py b/tests/test_knn.py index b6a55c8..ae2f2ce 100644 --- a/tests/test_knn.py +++ b/tests/test_knn.py @@ -26,6 +26,7 @@ import pandas as pd from sklearn.model_selection import train_test_split as split +from sklearn import metrics as sk_metrics # Set base path path = abspath(getsourcefile(lambda: 0)) @@ -71,9 +72,9 @@ def test_knn_class(self): test_size=0.1, random_state=42) mod = KNN(df_train, "y", ["x1", "x2"], method="classify") - y_hat = mod.predict(df_test[["x1", "x2"]]).tolist() + y_hat = mod.predict(df_test[["x1", "x2"]])["y"].tolist() y = df_test["y"].values.tolist() - acc = round(len([i for i, j in zip(y, y_hat) if i == j]) / len(y), 2) + acc = round(sk_metrics.accuracy_score(y, y_hat), 2) self.assertGreaterEqual(acc, 0.93) def test_knn_reg(self): @@ -85,10 +86,10 @@ def test_knn_reg(self): test_size=0.1, random_state=42) mod = KNN(df_train, "y", ["x1", "x2"], method="regression") - y_hat = mod.predict(df_test[["x1", "x2"]]).tolist() + y_hat = mod.predict(df_test[["x1", "x2"]])["y"].tolist() y = df_test["y"].values.tolist() - acc = round(len([i for i, j in zip(y, y_hat) if i == j]) / len(y), 2) - self.assertGreaterEqual(acc, 0.87) + acc = round(sk_metrics.mean_squared_error(y, y_hat), 2) + self.assertLessEqual(acc, 0.1) # ============================================================================= From 69436d313d2e3544d90d5d41a5b465bea3e2c48f Mon Sep 17 00:00:00 2001 From: Diptesh Basak Date: Sun, 26 Sep 2021 01:00:09 +0530 Subject: [PATCH 05/13] v0.4.0 changelog: - minor changes in variables names --- logs/cov.out | 18 +++++++++--------- logs/pip.out | 2 +- logs/pylint/lib-knn-py.out | 12 ++++++------ mllib/lib/knn.py | 13 +++++++------ requirements.txt | 2 +- 5 files changed, 24 insertions(+), 23 deletions(-) diff --git a/logs/cov.out b/logs/cov.out index 457d257..cc34ded 100644 --- a/logs/cov.out +++ b/logs/cov.out @@ -1,9 +1,9 @@ -Name Stmts Miss Cover Missing ------------------------------------------------------ -mllib/__init__.py 7 0 100% -mllib/lib/__init__.py 7 0 100% -mllib/lib/cluster.py 103 0 100% -mllib/lib/knn.py 67 0 100% -mllib/lib/model.py 45 0 100% ------------------------------------------------------ -TOTAL 229 0 100% +Name Stmts Miss Cover Missing +----------------------------------------------------------------------------------------- +/media/ph33r/Data/Project/mllib/Git/mllib/__init__.py 7 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/__init__.py 7 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/cluster.py 103 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/knn.py 69 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/model.py 45 0 100% +----------------------------------------------------------------------------------------- +TOTAL 231 0 100% diff --git a/logs/pip.out b/logs/pip.out index 03fb79a..f61bf91 100644 --- a/logs/pip.out +++ b/logs/pip.out @@ -1 +1 @@ -./bin/run_tests.sh: line 78: pipreqs: command not found +INFO: Successfully saved requirements file in /media/ph33r/Data/Project/mllib/Git/requirements.txt diff --git a/logs/pylint/lib-knn-py.out b/logs/pylint/lib-knn-py.out index 840218f..48851ad 100644 --- a/logs/pylint/lib-knn-py.out +++ b/logs/pylint/lib-knn-py.out @@ -1,9 +1,9 @@ ************* Module mllib.lib.knn -knn.py:174:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -knn.py:175:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -knn.py:176:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -knn.py:177:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:175:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:176:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:177:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:178:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) --------------------------------------------------------------------- -Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) +------------------------------------------------------------------- +Your code has been rated at 10.00/10 (previous run: 9.70/10, +0.30) diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py index b7d037c..8d19d7c 100644 --- a/mllib/lib/knn.py +++ b/mllib/lib/knn.py @@ -44,6 +44,7 @@ import metrics # noqa: F841 + class KNN(): """K-Nearest Neighbour (KNN) module. @@ -177,12 +178,11 @@ def _compute_metrics(self): "rmse": np.round(metrics.rmse(y, y_hat), 3)} model_summary["mse"] = np.round(model_summary["rmse"] ** 2, 3) if self.method == "classify": - model_summary = {"acc": np.round(\ - sk_metrics.accuracy_score(y, y_hat), 3), - "f1": np.round(\ - sk_metrics.f1_score(y, - y_hat, - average='micro'), 3)} + accuracy = np.round(sk_metrics.accuracy_score(y, y_hat), 3) + f1_score = np.round(sk_metrics.f1_score(y, y_hat, + average='micro'), 3) + model_summary = {"accuracy": accuracy, + "f1": f1_score} self.model_summary = model_summary def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: @@ -199,6 +199,7 @@ def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: pd.DataFrame Pandas dataframe containing predicted `y_var` and `x_var`. + """ df_predict = pd.DataFrame(scale(pd.get_dummies(df_predict))) y_hat = self.model.predict(df_predict) diff --git a/requirements.txt b/requirements.txt index 45ef809..cf8b072 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -Cython==0.29.15 pandas==1.1.3 +Cython==0.29.15 numpy==1.19.5 scikit_learn==1.0 From 6a5e16a966a1f16d8bfbc691cf3cf2d8a899bb40 Mon Sep 17 00:00:00 2001 From: Diptesh Basak Date: Sun, 26 Sep 2021 01:11:10 +0530 Subject: [PATCH 06/13] v0.4.0 --- mllib/lib/knn.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py index 8d19d7c..a2f2bd2 100644 --- a/mllib/lib/knn.py +++ b/mllib/lib/knn.py @@ -17,10 +17,7 @@ """ # pylint: disable=invalid-name -# pylint: disable=too-many-arguments -# pylint: disable=too-few-public-methods -# pylint: disable=R0902 -# pylint: disable=wrong-import-position +# pylint: disable=R0902,R0903,R0913,C0413 from typing import List, Dict, Any From bfd1b133a7966b3bc73c1a9ff7bc3453e07c3dbc Mon Sep 17 00:00:00 2001 From: MadhuTangudu Date: Sun, 26 Sep 2021 16:47:34 +0530 Subject: [PATCH 07/13] v0.4.0 changelog: - cross validation metric changed to accuracy and rmse for classification and regresion in knn.py file - test for categorical variable added to test_knn.py --- data/input/iris.csv | 302 ++++++++++++++++++------------------- logs/cov.out | 18 +-- logs/pip.out | 2 +- logs/pylint/lib-knn-py.out | 12 +- mllib/lib/knn.py | 18 ++- tests/test_knn.py | 10 ++ 6 files changed, 190 insertions(+), 172 deletions(-) diff --git a/data/input/iris.csv b/data/input/iris.csv index d93a29c..2e5cab0 100644 --- a/data/input/iris.csv +++ b/data/input/iris.csv @@ -1,151 +1,151 @@ -x3,x4,x1,x2,y -5.1,3.5,1.4,0.2,0 -4.9,3.0,1.4,0.2,0 -4.7,3.2,1.3,0.2,0 -4.6,3.1,1.5,0.2,0 -5.0,3.6,1.4,0.2,0 -5.4,3.9,1.7,0.4,0 -4.6,3.4,1.4,0.3,0 -5.0,3.4,1.5,0.2,0 -4.4,2.9,1.4,0.2,0 -4.9,3.1,1.5,0.1,0 -5.4,3.7,1.5,0.2,0 -4.8,3.4,1.6,0.2,0 -4.8,3.0,1.4,0.1,0 -4.3,3.0,1.1,0.1,0 -5.8,4.0,1.2,0.2,0 -5.7,4.4,1.5,0.4,0 -5.4,3.9,1.3,0.4,0 -5.1,3.5,1.4,0.3,0 -5.7,3.8,1.7,0.3,0 -5.1,3.8,1.5,0.3,0 -5.4,3.4,1.7,0.2,0 -5.1,3.7,1.5,0.4,0 -4.6,3.6,1.0,0.2,0 -5.1,3.3,1.7,0.5,0 -4.8,3.4,1.9,0.2,0 -5.0,3.0,1.6,0.2,0 -5.0,3.4,1.6,0.4,0 -5.2,3.5,1.5,0.2,0 -5.2,3.4,1.4,0.2,0 -4.7,3.2,1.6,0.2,0 -4.8,3.1,1.6,0.2,0 -5.4,3.4,1.5,0.4,0 -5.2,4.1,1.5,0.1,0 -5.5,4.2,1.4,0.2,0 -4.9,3.1,1.5,0.2,0 -5.0,3.2,1.2,0.2,0 -5.5,3.5,1.3,0.2,0 -4.9,3.6,1.4,0.1,0 -4.4,3.0,1.3,0.2,0 -5.1,3.4,1.5,0.2,0 -5.0,3.5,1.3,0.3,0 -4.5,2.3,1.3,0.3,0 -4.4,3.2,1.3,0.2,0 -5.0,3.5,1.6,0.6,0 -5.1,3.8,1.9,0.4,0 -4.8,3.0,1.4,0.3,0 -5.1,3.8,1.6,0.2,0 -4.6,3.2,1.4,0.2,0 -5.3,3.7,1.5,0.2,0 -5.0,3.3,1.4,0.2,0 -7.0,3.2,4.7,1.4,1 -6.4,3.2,4.5,1.5,1 -6.9,3.1,4.9,1.5,1 -5.5,2.3,4.0,1.3,1 -6.5,2.8,4.6,1.5,1 -5.7,2.8,4.5,1.3,1 -6.3,3.3,4.7,1.6,1 -4.9,2.4,3.3,1.0,1 -6.6,2.9,4.6,1.3,1 -5.2,2.7,3.9,1.4,1 -5.0,2.0,3.5,1.0,1 -5.9,3.0,4.2,1.5,1 -6.0,2.2,4.0,1.0,1 -6.1,2.9,4.7,1.4,1 -5.6,2.9,3.6,1.3,1 -6.7,3.1,4.4,1.4,1 -5.6,3.0,4.5,1.5,1 -5.8,2.7,4.1,1.0,1 -6.2,2.2,4.5,1.5,1 -5.6,2.5,3.9,1.1,1 -5.9,3.2,4.8,1.8,1 -6.1,2.8,4.0,1.3,1 -6.3,2.5,4.9,1.5,1 -6.1,2.8,4.7,1.2,1 -6.4,2.9,4.3,1.3,1 -6.6,3.0,4.4,1.4,1 -6.8,2.8,4.8,1.4,1 -6.7,3.0,5.0,1.7,1 -6.0,2.9,4.5,1.5,1 -5.7,2.6,3.5,1.0,1 -5.5,2.4,3.8,1.1,1 -5.5,2.4,3.7,1.0,1 -5.8,2.7,3.9,1.2,1 -6.0,2.7,5.1,1.6,1 -5.4,3.0,4.5,1.5,1 -6.0,3.4,4.5,1.6,1 -6.7,3.1,4.7,1.5,1 -6.3,2.3,4.4,1.3,1 -5.6,3.0,4.1,1.3,1 -5.5,2.5,4.0,1.3,1 -5.5,2.6,4.4,1.2,1 -6.1,3.0,4.6,1.4,1 -5.8,2.6,4.0,1.2,1 -5.0,2.3,3.3,1.0,1 -5.6,2.7,4.2,1.3,1 -5.7,3.0,4.2,1.2,1 -5.7,2.9,4.2,1.3,1 -6.2,2.9,4.3,1.3,1 -5.1,2.5,3.0,1.1,1 -5.7,2.8,4.1,1.3,1 -6.3,3.3,6.0,2.5,2 -5.8,2.7,5.1,1.9,2 -7.1,3.0,5.9,2.1,2 -6.3,2.9,5.6,1.8,2 -6.5,3.0,5.8,2.2,2 -7.6,3.0,6.6,2.1,2 -4.9,2.5,4.5,1.7,2 -7.3,2.9,6.3,1.8,2 -6.7,2.5,5.8,1.8,2 -7.2,3.6,6.1,2.5,2 -6.5,3.2,5.1,2.0,2 -6.4,2.7,5.3,1.9,2 -6.8,3.0,5.5,2.1,2 -5.7,2.5,5.0,2.0,2 -5.8,2.8,5.1,2.4,2 -6.4,3.2,5.3,2.3,2 -6.5,3.0,5.5,1.8,2 -7.7,3.8,6.7,2.2,2 -7.7,2.6,6.9,2.3,2 -6.0,2.2,5.0,1.5,2 -6.9,3.2,5.7,2.3,2 -5.6,2.8,4.9,2.0,2 -7.7,2.8,6.7,2.0,2 -6.3,2.7,4.9,1.8,2 -6.7,3.3,5.7,2.1,2 -7.2,3.2,6.0,1.8,2 -6.2,2.8,4.8,1.8,2 -6.1,3.0,4.9,1.8,2 -6.4,2.8,5.6,2.1,2 -7.2,3.0,5.8,1.6,2 -7.4,2.8,6.1,1.9,2 -7.9,3.8,6.4,2.0,2 -6.4,2.8,5.6,2.2,2 -6.3,2.8,5.1,1.5,2 -6.1,2.6,5.6,1.4,2 -7.7,3.0,6.1,2.3,2 -6.3,3.4,5.6,2.4,2 -6.4,3.1,5.5,1.8,2 -6.0,3.0,4.8,1.8,2 -6.9,3.1,5.4,2.1,2 -6.7,3.1,5.6,2.4,2 -6.9,3.1,5.1,2.3,2 -5.8,2.7,5.1,1.9,2 -6.8,3.2,5.9,2.3,2 -6.7,3.3,5.7,2.5,2 -6.7,3.0,5.2,2.3,2 -6.3,2.5,5.0,1.9,2 -6.5,3.0,5.2,2.0,2 -6.2,3.4,5.4,2.3,2 -5.9,3.0,5.1,1.8,2 +x3,x4,x1,x2,x5,y +5.1,3.5,1.4,0.2,a,0 +4.9,3,1.4,0.2,a,0 +4.7,3.2,1.3,0.2,a,0 +4.6,3.1,1.5,0.2,a,0 +5,3.6,1.4,0.2,a,0 +5.4,3.9,1.7,0.4,a,0 +4.6,3.4,1.4,0.3,a,0 +5,3.4,1.5,0.2,a,0 +4.4,2.9,1.4,0.2,e,0 +4.9,3.1,1.5,0.1,e,0 +5.4,3.7,1.5,0.2,e,0 +4.8,3.4,1.6,0.2,e,0 +4.8,3,1.4,0.1,e,0 +4.3,3,1.1,0.1,e,0 +5.8,4,1.2,0.2,e,0 +5.7,4.4,1.5,0.4,e,0 +5.4,3.9,1.3,0.4,e,0 +5.1,3.5,1.4,0.3,e,0 +5.7,3.8,1.7,0.3,e,0 +5.1,3.8,1.5,0.3,s,0 +5.4,3.4,1.7,0.2,s,0 +5.1,3.7,1.5,0.4,s,0 +4.6,3.6,1,0.2,s,0 +5.1,3.3,1.7,0.5,s,0 +4.8,3.4,1.9,0.2,s,0 +5,3,1.6,0.2,s,0 +5,3.4,1.6,0.4,s,0 +5.2,3.5,1.5,0.2,s,0 +5.2,3.4,1.4,0.2,s,0 +4.7,3.2,1.6,0.2,s,0 +4.8,3.1,1.6,0.2,e,0 +5.4,3.4,1.5,0.4,s,0 +5.2,4.1,1.5,0.1,a,0 +5.5,4.2,1.4,0.2,s,0 +4.9,3.1,1.5,0.2,a,0 +5,3.2,1.2,0.2,s,0 +5.5,3.5,1.3,0.2,a,0 +4.9,3.6,1.4,0.1,e,0 +4.4,3,1.3,0.2,s,0 +5.1,3.4,1.5,0.2,a,0 +5,3.5,1.3,0.3,s,0 +4.5,2.3,1.3,0.3,e,0 +4.4,3.2,1.3,0.2,s,0 +5,3.5,1.6,0.6,s,0 +5.1,3.8,1.9,0.4,s,0 +4.8,3,1.4,0.3,s,0 +5.1,3.8,1.6,0.2,a,0 +4.6,3.2,1.4,0.2,a,0 +5.3,3.7,1.5,0.2,a,0 +5,3.3,1.4,0.2,a,0 +7,3.2,4.7,1.4,e,1 +6.4,3.2,4.5,1.5,e,1 +6.9,3.1,4.9,1.5,e,1 +5.5,2.3,4,1.3,e,1 +6.5,2.8,4.6,1.5,s,1 +5.7,2.8,4.5,1.3,e,1 +6.3,3.3,4.7,1.6,s,1 +4.9,2.4,3.3,1,a,1 +6.6,2.9,4.6,1.3,s,1 +5.2,2.7,3.9,1.4,e,1 +5,2,3.5,1,s,1 +5.9,3,4.2,1.5,a,1 +6,2.2,4,1,s,1 +6.1,2.9,4.7,1.4,e,1 +5.6,2.9,3.6,1.3,s,1 +6.7,3.1,4.4,1.4,a,1 +5.6,3,4.5,1.5,a,1 +5.8,2.7,4.1,1,s,1 +6.2,2.2,4.5,1.5,e,1 +5.6,2.5,3.9,1.1,a,1 +5.9,3.2,4.8,1.8,e,1 +6.1,2.8,4,1.3,e,1 +6.3,2.5,4.9,1.5,s,1 +6.1,2.8,4.7,1.2,e,1 +6.4,2.9,4.3,1.3,s,1 +6.6,3,4.4,1.4,a,1 +6.8,2.8,4.8,1.4,s,1 +6.7,3,5,1.7,e,1 +6,2.9,4.5,1.5,s,1 +5.7,2.6,3.5,1,a,1 +5.5,2.4,3.8,1.1,s,1 +5.5,2.4,3.7,1,e,1 +5.8,2.7,3.9,1.2,s,1 +6,2.7,5.1,1.6,e,1 +5.4,3,4.5,1.5,s,1 +6,3.4,4.5,1.6,a,1 +6.7,3.1,4.7,1.5,a,1 +6.3,2.3,4.4,1.3,s,1 +5.6,3,4.1,1.3,e,1 +5.5,2.5,4,1.3,a,1 +5.5,2.6,4.4,1.2,e,1 +6.1,3,4.6,1.4,e,1 +5.8,2.6,4,1.2,s,1 +5,2.3,3.3,1,e,1 +5.6,2.7,4.2,1.3,s,1 +5.7,3,4.2,1.2,a,1 +5.7,2.9,4.2,1.3,s,1 +6.2,2.9,4.3,1.3,e,1 +5.1,2.5,3,1.1,s,1 +5.7,2.8,4.1,1.3,s,1 +6.3,3.3,6,2.5,s,2 +5.8,2.7,5.1,1.9,s,2 +7.1,3,5.9,2.1,a,2 +6.3,2.9,5.6,1.8,a,2 +6.5,3,5.8,2.2,a,2 +7.6,3,6.6,2.1,a,2 +4.9,2.5,4.5,1.7,e,2 +7.3,2.9,6.3,1.8,e,2 +6.7,2.5,5.8,1.8,e,2 +7.2,3.6,6.1,2.5,e,2 +6.5,3.2,5.1,2,s,2 +6.4,2.7,5.3,1.9,e,2 +6.8,3,5.5,2.1,s,2 +5.7,2.5,5,2,s,2 +5.8,2.8,5.1,2.4,e,2 +6.4,3.2,5.3,2.3,s,2 +6.5,3,5.5,1.8,a,2 +7.7,3.8,6.7,2.2,a,2 +7.7,2.6,6.9,2.3,s,2 +6,2.2,5,1.5,e,2 +6.9,3.2,5.7,2.3,s,2 +5.6,2.8,4.9,2,a,2 +7.7,2.8,6.7,2,s,2 +6.3,2.7,4.9,1.8,a,2 +6.7,3.3,5.7,2.1,s,2 +7.2,3.2,6,1.8,a,2 +6.2,2.8,4.8,1.8,s,2 +6.1,3,4.9,1.8,a,2 +6.4,2.8,5.6,2.1,s,2 +7.2,3,5.8,1.6,e,2 +7.4,2.8,6.1,1.9,e,2 +7.9,3.8,6.4,2,e,2 +6.4,2.8,5.6,2.2,e,2 +6.3,2.8,5.1,1.5,e,2 +6.1,2.6,5.6,1.4,s,2 +7.7,3,6.1,2.3,s,2 +6.3,3.4,5.6,2.4,s,2 +6.4,3.1,5.5,1.8,s,2 +6,3,4.8,1.8,s,2 +6.9,3.1,5.4,2.1,a,2 +6.7,3.1,5.6,2.4,a,2 +6.9,3.1,5.1,2.3,a,2 +5.8,2.7,5.1,1.9,a,2 +6.8,3.2,5.9,2.3,a,2 +6.7,3.3,5.7,2.5,s,2 +6.7,3,5.2,2.3,s,2 +6.3,2.5,5,1.9,s,2 +6.5,3,5.2,2,e,2 +6.2,3.4,5.4,2.3,e,2 +5.9,3,5.1,1.8,e,2 diff --git a/logs/cov.out b/logs/cov.out index cc34ded..a8c7525 100644 --- a/logs/cov.out +++ b/logs/cov.out @@ -1,9 +1,9 @@ -Name Stmts Miss Cover Missing ------------------------------------------------------------------------------------------ -/media/ph33r/Data/Project/mllib/Git/mllib/__init__.py 7 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/__init__.py 7 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/cluster.py 103 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/knn.py 69 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/model.py 45 0 100% ------------------------------------------------------------------------------------------ -TOTAL 231 0 100% +Name Stmts Miss Cover Missing +----------------------------------------------------- +mllib/__init__.py 7 0 100% +mllib/lib/__init__.py 7 0 100% +mllib/lib/cluster.py 103 0 100% +mllib/lib/knn.py 77 0 100% +mllib/lib/model.py 45 0 100% +----------------------------------------------------- +TOTAL 239 0 100% diff --git a/logs/pip.out b/logs/pip.out index f61bf91..03fb79a 100644 --- a/logs/pip.out +++ b/logs/pip.out @@ -1 +1 @@ -INFO: Successfully saved requirements file in /media/ph33r/Data/Project/mllib/Git/requirements.txt +./bin/run_tests.sh: line 78: pipreqs: command not found diff --git a/logs/pylint/lib-knn-py.out b/logs/pylint/lib-knn-py.out index 48851ad..840218f 100644 --- a/logs/pylint/lib-knn-py.out +++ b/logs/pylint/lib-knn-py.out @@ -1,9 +1,9 @@ ************* Module mllib.lib.knn -knn.py:175:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -knn.py:176:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -knn.py:177:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -knn.py:178:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:174:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:175:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:176:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:177:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -------------------------------------------------------------------- -Your code has been rated at 10.00/10 (previous run: 9.70/10, +0.30) +-------------------------------------------------------------------- +Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py index a2f2bd2..521b5f3 100644 --- a/mllib/lib/knn.py +++ b/mllib/lib/knn.py @@ -30,7 +30,7 @@ import numpy as np from sklearn import neighbors as sn -from sklearn.preprocessing import scale +from sklearn.preprocessing import MinMaxScaler from sklearn import metrics as sk_metrics from sklearn.model_selection import GridSearchCV @@ -108,7 +108,7 @@ def __init__(self, """Initialize variables for module ``KNN``.""" self.y_var = y_var self.x_var = x_var - self.df = df[[self.y_var] + self.x_var].reset_index(drop=True) + self.df = df.reset_index(drop=True) self.method = method self.model = None self.k_fold = k_fold @@ -124,10 +124,12 @@ def __init__(self, self._compute_metrics() def _pre_process(self): - """Pre-process the data, one hot encoding and scaling.""" + """Pre-process the data, one hot encoding and Normalizing.""" df_ip_x = pd.get_dummies(self.df[self.x_var]) self.x_var = list(df_ip_x.columns) - df_ip_x = pd.DataFrame(scale(df_ip_x)) + self.norm = MinMaxScaler() + self.norm.fit(df_ip_x) + df_ip_x = pd.DataFrame(self.norm.transform(df_ip_x[self.x_var])) df_ip_x.columns = self.x_var self.df = self.df[[self.y_var]].join(df_ip_x) @@ -198,8 +200,14 @@ def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: Pandas dataframe containing predicted `y_var` and `x_var`. """ - df_predict = pd.DataFrame(scale(pd.get_dummies(df_predict))) + df_predict = pd.get_dummies(df_predict) + df_predict_tmp = pd.DataFrame(columns=self.x_var) + df_predict = pd.concat([df_predict_tmp, df_predict]) + df_predict = df_predict.fillna(0) + df_predict = pd.DataFrame(self.norm.transform(df_predict[self.x_var])) + df_predict.columns = self.x_var y_hat = self.model.predict(df_predict) df_predict = df_predict.copy() df_predict["y"] = y_hat + df_predict = df_predict[[self.y_var] + self.x_var] return df_predict diff --git a/tests/test_knn.py b/tests/test_knn.py index ae2f2ce..7020044 100644 --- a/tests/test_knn.py +++ b/tests/test_knn.py @@ -91,6 +91,16 @@ def test_knn_reg(self): acc = round(sk_metrics.mean_squared_error(y, y_hat), 2) self.assertLessEqual(acc, 0.1) + def test_knn_cat(self): + """KNN: Test for dummies in prediction dataset.""" + df_ip = pd.read_csv(path + "iris.csv") + df_ip = df_ip[["y", "x1", "x5"]] + df_train = df_ip.iloc[1:140] + df_predict = df_ip.iloc[145:150] + mod = KNN(df_train, "y", ["x1", "x5"], method="classify") + df_predict_columns = mod.predict(df_predict).columns.tolist() + df_predict_columns.pop(0) + self.assertGreaterEqual(mod.x_var, df_predict_columns) # ============================================================================= # --- Main From d61fc5d044c6330abf741af3865d77180b6867ad Mon Sep 17 00:00:00 2001 From: Diptesh Basak Date: Sun, 26 Sep 2021 18:22:40 +0530 Subject: [PATCH 08/13] v0.4.0 changelog: -added ignore warnings decorator in unit tests --- mllib/__main__.py | 2 +- mllib/lib/dev_knn.py | 30 ++++++++++++++++++++++++++++++ tests/test_cluster.py | 2 +- tests/test_knn.py | 4 +++- tests/test_metrics.py | 2 +- tests/test_model.py | 2 +- 6 files changed, 37 insertions(+), 5 deletions(-) create mode 100644 mllib/lib/dev_knn.py diff --git a/mllib/__main__.py b/mllib/__main__.py index 7875e9e..4da6d36 100644 --- a/mllib/__main__.py +++ b/mllib/__main__.py @@ -95,7 +95,7 @@ df_test = df_ip.drop(df_train.index) mod = KNN(df_train, "y", ["x1", "x2"], method="classify") print("\nKNN\n") - y_hat = mod.predict(df_test[["x1", "x2"]]).tolist() + y_hat = mod.predict(df_test[["x1", "x2"]])["y"].tolist() y = df_test["y"].values.tolist() accuracy = round(len([i for i, j in zip(y, y_hat) if i == j]) / len(y), 2) print("Accuracy:", accuracy) diff --git a/mllib/lib/dev_knn.py b/mllib/lib/dev_knn.py new file mode 100644 index 0000000..501e83c --- /dev/null +++ b/mllib/lib/dev_knn.py @@ -0,0 +1,30 @@ +import pandas as pd + +from sklearn.preprocessing import MinMaxScaler +from sklearn.neighbors import KNeighborsClassifier +from sklearn.metrics import classification_report + +path = "/media/ph33r/Data/Project/mllib/Git/data/input/" + +fn_ip = "iris.csv" + +df = pd.read_csv(path + fn_ip) + +y_var = ["y"] +x_var = ["x1", "x2", "x3", "x4"] + +scaler = MinMaxScaler() +scaler.fit(df[x_var]) + +df_x_var = scaler.transform(df[x_var]) +df_y_var = df[y_var].values.ravel() + +classifier = KNeighborsClassifier(n_neighbors=3) +classifier.fit(df_x_var, df_y_var) + +y_hat = classifier.predict(df_x_var) + +tmp = classification_report(y_hat, df_y_var, output_dict=True, zero_division=0) +model_summary = tmp["weighted avg"] +model_summary["accuracy"] = tmp["accuracy"] +model_summary diff --git a/tests/test_cluster.py b/tests/test_cluster.py index efd74f4..f14e30b 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -44,7 +44,7 @@ def ignore_warnings(test_func): - """Suppress deprecation warnings of pulp.""" + """Suppress warnings.""" def do_test(self, *args, **kwargs): with warnings.catch_warnings(): diff --git a/tests/test_knn.py b/tests/test_knn.py index 7020044..e72d60c 100644 --- a/tests/test_knn.py +++ b/tests/test_knn.py @@ -48,7 +48,7 @@ def ignore_warnings(test_func): - """Suppress deprecation warnings.""" + """Suppress warnings.""" def do_test(self, *args, **kwargs): with warnings.catch_warnings(): @@ -77,6 +77,7 @@ def test_knn_class(self): acc = round(sk_metrics.accuracy_score(y, y_hat), 2) self.assertGreaterEqual(acc, 0.93) + @ignore_warnings def test_knn_reg(self): """KNN: Test for regression.""" df_ip = pd.read_csv(path + "iris.csv") @@ -102,6 +103,7 @@ def test_knn_cat(self): df_predict_columns.pop(0) self.assertGreaterEqual(mod.x_var, df_predict_columns) + # ============================================================================= # --- Main # ============================================================================= diff --git a/tests/test_metrics.py b/tests/test_metrics.py index d9b7eac..948bec4 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -38,7 +38,7 @@ def ignore_warnings(test_func): - """Suppress deprecation warnings of pulp.""" + """Suppress warnings.""" def do_test(self, *args, **kwargs): with warnings.catch_warnings(): diff --git a/tests/test_model.py b/tests/test_model.py index a73901c..cc05ec4 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -45,7 +45,7 @@ def ignore_warnings(test_func): - """Suppress deprecation warnings.""" + """Suppress warnings.""" def do_test(self, *args, **kwargs): with warnings.catch_warnings(): From 6e26a94594009a3fb1b0f17ca230f1c48ee37812 Mon Sep 17 00:00:00 2001 From: MadhuTangudu Date: Sun, 26 Sep 2021 20:55:31 +0530 Subject: [PATCH 09/13] v0..4.0 changelog: - code cleaning of knn.py, test_knn.py, model.py and test_model.py --- logs/cov.out | 4 ++-- mllib/lib/dev_knn.py | 30 ------------------------------ mllib/lib/knn.py | 30 ++++++++++++++---------------- mllib/lib/model.py | 12 ++++++------ tests/test_knn.py | 34 ++++++++++++++++++++-------------- tests/test_model.py | 4 ++-- 6 files changed, 44 insertions(+), 70 deletions(-) delete mode 100644 mllib/lib/dev_knn.py diff --git a/logs/cov.out b/logs/cov.out index a8c7525..1fbdc4b 100644 --- a/logs/cov.out +++ b/logs/cov.out @@ -3,7 +3,7 @@ Name Stmts Miss Cover Missing mllib/__init__.py 7 0 100% mllib/lib/__init__.py 7 0 100% mllib/lib/cluster.py 103 0 100% -mllib/lib/knn.py 77 0 100% +mllib/lib/knn.py 74 0 100% mllib/lib/model.py 45 0 100% ----------------------------------------------------- -TOTAL 239 0 100% +TOTAL 236 0 100% diff --git a/mllib/lib/dev_knn.py b/mllib/lib/dev_knn.py deleted file mode 100644 index 501e83c..0000000 --- a/mllib/lib/dev_knn.py +++ /dev/null @@ -1,30 +0,0 @@ -import pandas as pd - -from sklearn.preprocessing import MinMaxScaler -from sklearn.neighbors import KNeighborsClassifier -from sklearn.metrics import classification_report - -path = "/media/ph33r/Data/Project/mllib/Git/data/input/" - -fn_ip = "iris.csv" - -df = pd.read_csv(path + fn_ip) - -y_var = ["y"] -x_var = ["x1", "x2", "x3", "x4"] - -scaler = MinMaxScaler() -scaler.fit(df[x_var]) - -df_x_var = scaler.transform(df[x_var]) -df_y_var = df[y_var].values.ravel() - -classifier = KNeighborsClassifier(n_neighbors=3) -classifier.fit(df_x_var, df_y_var) - -y_hat = classifier.predict(df_x_var) - -tmp = classification_report(y_hat, df_y_var, output_dict=True, zero_division=0) -model_summary = tmp["weighted avg"] -model_summary["accuracy"] = tmp["accuracy"] -model_summary diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py index 521b5f3..9943baf 100644 --- a/mllib/lib/knn.py +++ b/mllib/lib/knn.py @@ -31,7 +31,7 @@ from sklearn import neighbors as sn from sklearn.preprocessing import MinMaxScaler -from sklearn import metrics as sk_metrics +from sklearn.metrics import classification_report from sklearn.model_selection import GridSearchCV @@ -62,11 +62,11 @@ class KNN(): x_var : List[str] - Independant variables. + Independant variables method : str, optional - Can be either `classify` or `regression` (the default is classify) + Can be either `classify` or `regression` (the default is regression) k_fold : int, optional @@ -93,7 +93,7 @@ class KNN(): Example ------- - >>> mod = KNN(df=df_ip, y_var=["y"], x_var=["x1", "x2", "x3"]) + >>> mod = KNN(df=df_ip, y_var="y", x_var=["x1", "x2", "x3"]) >>> df_op = mod.predict(df_predict) """ @@ -102,7 +102,7 @@ def __init__(self, df: pd.DataFrame, y_var: str, x_var: List[str], - method: str = "classify", + method: str = "regression", k_fold: int = 5, param: Dict = None): """Initialize variables for module ``KNN``.""" @@ -168,8 +168,8 @@ def _fit(self) -> Dict[str, Any]: def _compute_metrics(self): """Compute commonly used metrics to evaluate the model.""" - y = self.df.iloc[:, 0].values.tolist() - y_hat = list(self.predict(self.df[self.x_var])["y"].values) + y = self.df.loc[:, self.y_var].values.tolist() + y_hat = list(self.predict(self.df[self.x_var])[self.y_var].values) if self.method == "regression": model_summary = {"rsq": np.round(metrics.rsq(y, y_hat), 3), "mae": np.round(metrics.mae(y, y_hat), 3), @@ -177,11 +177,10 @@ def _compute_metrics(self): "rmse": np.round(metrics.rmse(y, y_hat), 3)} model_summary["mse"] = np.round(model_summary["rmse"] ** 2, 3) if self.method == "classify": - accuracy = np.round(sk_metrics.accuracy_score(y, y_hat), 3) - f1_score = np.round(sk_metrics.f1_score(y, y_hat, - average='micro'), 3) - model_summary = {"accuracy": accuracy, - "f1": f1_score} + model_summary = classification_report(y_hat, + y, + output_dict=True, + zero_division=0) self.model_summary = model_summary def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: @@ -200,6 +199,7 @@ def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: Pandas dataframe containing predicted `y_var` and `x_var`. """ + df_op = df_predict.copy(deep=True) df_predict = pd.get_dummies(df_predict) df_predict_tmp = pd.DataFrame(columns=self.x_var) df_predict = pd.concat([df_predict_tmp, df_predict]) @@ -207,7 +207,5 @@ def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: df_predict = pd.DataFrame(self.norm.transform(df_predict[self.x_var])) df_predict.columns = self.x_var y_hat = self.model.predict(df_predict) - df_predict = df_predict.copy() - df_predict["y"] = y_hat - df_predict = df_predict[[self.y_var] + self.x_var] - return df_predict + df_op.insert(loc=0, column=self.y_var, value=y_hat) + return df_op diff --git a/mllib/lib/model.py b/mllib/lib/model.py index e8adf2a..c9057ed 100644 --- a/mllib/lib/model.py +++ b/mllib/lib/model.py @@ -56,7 +56,7 @@ class GLMNet(): Pandas dataframe containing `y_var` and `x_var` variables. - y_var : List[str] + y_var : str Dependant variable. @@ -108,12 +108,12 @@ class GLMNet(): def __init__(self, df: pd.DataFrame, - y_var: List[str], + y_var: str, x_var: List[str], strata: str = None, param: Dict = None): """Initialize variables for module ``GLMNet``.""" - self.df = df[y_var + x_var] + self.df = df[[y_var] + x_var] self.y_var = y_var self.x_var = x_var self.strata = strata @@ -137,7 +137,7 @@ def _fit(self) -> None: """Fit the best GLMNet model.""" train_x, test_x,\ train_y, test_y = split(self.df[self.x_var], - self.df[self.y_var], + self.df[[self.y_var]], test_size=self.param["test_perc"], random_state=self.param["seed"], stratify=self.strata) @@ -161,7 +161,7 @@ def _fit(self) -> None: def _compute_metrics(self): """Compute commonly used metrics to evaluate the model.""" - y = self.df[self.y_var].iloc[:, 0].values.tolist() + y = self.df[[self.y_var]].iloc[:, 0].values.tolist() y_hat = list(self.predict(self.df[self.x_var])["y"].values) model_summary = {"rsq": np.round(metrics.rsq(y, y_hat), 3), "mae": np.round(metrics.mae(y, y_hat), 3), @@ -188,5 +188,5 @@ def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: """ y_hat = self.model.predict(df_predict) df_predict = df_predict.copy() - df_predict["y"] = y_hat + df_predict.insert(loc=0, column=self.y_var, value=y_hat) return df_predict diff --git a/tests/test_knn.py b/tests/test_knn.py index e72d60c..b244270 100644 --- a/tests/test_knn.py +++ b/tests/test_knn.py @@ -65,40 +65,46 @@ def setUp(self): def test_knn_class(self): """KNN: Test for classification.""" + x_var = ["x1", "x2"] + y_var = "y" df_ip = pd.read_csv(path + "iris.csv") - df_ip = df_ip[["y", "x1", "x2"]] + df_ip = df_ip[[y_var] + x_var] df_train, df_test = split(df_ip, - stratify=df_ip["y"], + stratify=df_ip[y_var], test_size=0.1, random_state=42) - mod = KNN(df_train, "y", ["x1", "x2"], method="classify") - y_hat = mod.predict(df_test[["x1", "x2"]])["y"].tolist() - y = df_test["y"].values.tolist() + mod = KNN(df_train, y_var, x_var, method="classify") + y_hat = mod.predict(df_test[x_var])[y_var].tolist() + y = df_test[y_var].values.tolist() acc = round(sk_metrics.accuracy_score(y, y_hat), 2) self.assertGreaterEqual(acc, 0.93) @ignore_warnings def test_knn_reg(self): """KNN: Test for regression.""" + x_var = ["x1", "x2"] + y_var = "y" df_ip = pd.read_csv(path + "iris.csv") - df_ip = df_ip[["y", "x1", "x2"]] + df_ip = df_ip[[y_var] + x_var] df_train, df_test = split(df_ip, - stratify=df_ip["y"], + stratify=df_ip[y_var], test_size=0.1, random_state=42) - mod = KNN(df_train, "y", ["x1", "x2"], method="regression") - y_hat = mod.predict(df_test[["x1", "x2"]])["y"].tolist() - y = df_test["y"].values.tolist() + mod = KNN(df_train, y_var, x_var, method="regression") + y_hat = mod.predict(df_test[x_var])[y_var].tolist() + y = df_test[y_var].values.tolist() acc = round(sk_metrics.mean_squared_error(y, y_hat), 2) self.assertLessEqual(acc, 0.1) def test_knn_cat(self): - """KNN: Test for dummies in prediction dataset.""" + """KNN: Test for one-hot encoding in prediction.""" + x_var = ["x1", "x2"] + y_var = "y" df_ip = pd.read_csv(path + "iris.csv") - df_ip = df_ip[["y", "x1", "x5"]] + df_ip = df_ip[[y_var] + x_var] df_train = df_ip.iloc[1:140] - df_predict = df_ip.iloc[145:150] - mod = KNN(df_train, "y", ["x1", "x5"], method="classify") + df_predict = df_ip.iloc[145:150, 1:] + mod = KNN(df_train, y_var, x_var, method="classify") df_predict_columns = mod.predict(df_predict).columns.tolist() df_predict_columns.pop(0) self.assertGreaterEqual(mod.x_var, df_predict_columns) diff --git a/tests/test_model.py b/tests/test_model.py index cc05ec4..a4c7ac1 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -64,7 +64,7 @@ def test_known_equation(self): """GLMNet: Test a known equation.""" df_ip = pd.read_csv(path + "test_glmnet.csv") mod = GLMNet(df=df_ip, - y_var=["y"], + y_var="y", x_var=["x1", "x2", "x3"]) op = mod.opt self.assertEqual(np.round(op.get('intercept'), 0), 100.0) @@ -76,7 +76,7 @@ def test_predict_target_variable(self): """GLMNet: Test to predict a target variable.""" df_ip = pd.read_csv(path + "test_glmnet.csv") mod = GLMNet(df=df_ip, - y_var=["y"], + y_var="y", x_var=["x1", "x2", "x3"]) df_predict = pd.DataFrame({"x1": [10, 20], "x2": [5, 10], From d66749e1efdcd67df3ee5cb2956778f3c5c8dd5a Mon Sep 17 00:00:00 2001 From: Diptesh Basak Date: Sun, 26 Sep 2021 21:42:36 +0530 Subject: [PATCH 10/13] v0.4.0 --- logs/cov.out | 18 +++++++++--------- logs/pip.out | 2 +- mllib/__main__.py | 10 ++++------ mllib/lib/knn.py | 18 ++++++++++++++---- mllib/lib/model.py | 1 - requirements.txt | 2 +- 6 files changed, 29 insertions(+), 22 deletions(-) diff --git a/logs/cov.out b/logs/cov.out index 1fbdc4b..ecff896 100644 --- a/logs/cov.out +++ b/logs/cov.out @@ -1,9 +1,9 @@ -Name Stmts Miss Cover Missing ------------------------------------------------------ -mllib/__init__.py 7 0 100% -mllib/lib/__init__.py 7 0 100% -mllib/lib/cluster.py 103 0 100% -mllib/lib/knn.py 74 0 100% -mllib/lib/model.py 45 0 100% ------------------------------------------------------ -TOTAL 236 0 100% +Name Stmts Miss Cover Missing +----------------------------------------------------------------------------------------- +/media/ph33r/Data/Project/mllib/Git/mllib/__init__.py 7 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/__init__.py 7 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/cluster.py 103 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/knn.py 77 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/model.py 44 0 100% +----------------------------------------------------------------------------------------- +TOTAL 238 0 100% diff --git a/logs/pip.out b/logs/pip.out index 03fb79a..f61bf91 100644 --- a/logs/pip.out +++ b/logs/pip.out @@ -1 +1 @@ -./bin/run_tests.sh: line 78: pipreqs: command not found +INFO: Successfully saved requirements file in /media/ph33r/Data/Project/mllib/Git/requirements.txt diff --git a/mllib/__main__.py b/mllib/__main__.py index 4da6d36..4077d8e 100644 --- a/mllib/__main__.py +++ b/mllib/__main__.py @@ -80,8 +80,8 @@ start_t = time.time_ns() df_ip = pd.read_csv(path + "input/test_glmnet.csv") glm_mod = GLMNet(df=df_ip, - y_var=["y"], - x_var=["x1", "x3"]) + y_var="y", + x_var=["x1", "x2"]) print("\nGLMNet\n") for k, v in glm_mod.model_summary.items(): print(k, str(v).rjust(69 - len(k))) @@ -95,10 +95,8 @@ df_test = df_ip.drop(df_train.index) mod = KNN(df_train, "y", ["x1", "x2"], method="classify") print("\nKNN\n") - y_hat = mod.predict(df_test[["x1", "x2"]])["y"].tolist() - y = df_test["y"].values.tolist() - accuracy = round(len([i for i, j in zip(y, y_hat) if i == j]) / len(y), 2) - print("Accuracy:", accuracy) + for k, v in mod.model_summary.items(): + print(k, str(v).rjust(69 - len(k))) print(elapsed_time("Time", start_t), sep="\n") # --- EOF diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py index 9943baf..284b718 100644 --- a/mllib/lib/knn.py +++ b/mllib/lib/knn.py @@ -87,6 +87,12 @@ class KNN(): Final optimal model. + model_summary : Dict + + Model summary containing key metrics like R-squared, RMSE, MSE, MAE, + MAPE for regression and Accuracy, Precision, Recall, F1 score for + classification. + Methods ------- predict @@ -177,10 +183,14 @@ def _compute_metrics(self): "rmse": np.round(metrics.rmse(y, y_hat), 3)} model_summary["mse"] = np.round(model_summary["rmse"] ** 2, 3) if self.method == "classify": - model_summary = classification_report(y_hat, - y, - output_dict=True, - zero_division=0) + class_report = classification_report(y_hat, + y, + output_dict=True, + zero_division=0) + model_summary = class_report["weighted avg"] + model_summary["accuracy"] = class_report["accuracy"] + model_summary = {key: round(model_summary[key], 3) + for key in model_summary} self.model_summary = model_summary def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: diff --git a/mllib/lib/model.py b/mllib/lib/model.py index c9057ed..efa6afc 100644 --- a/mllib/lib/model.py +++ b/mllib/lib/model.py @@ -187,6 +187,5 @@ def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: """ y_hat = self.model.predict(df_predict) - df_predict = df_predict.copy() df_predict.insert(loc=0, column=self.y_var, value=y_hat) return df_predict diff --git a/requirements.txt b/requirements.txt index cf8b072..66d1dec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -pandas==1.1.3 Cython==0.29.15 numpy==1.19.5 +pandas==1.1.3 scikit_learn==1.0 From d17f82aa0d137f630732f6b55d9d1ba461c1984e Mon Sep 17 00:00:00 2001 From: Diptesh Basak Date: Mon, 27 Sep 2021 10:59:23 +0530 Subject: [PATCH 11/13] v0.4.0 changelog: - knn classification's default scorer changed to f1_weighted - removed some redundant codes --- logs/cov.out | 4 ++-- logs/pylint/lib-knn-py.out | 8 ++++---- mllib/__main__.py | 5 +---- mllib/lib/knn.py | 26 +++++++++----------------- requirements.txt | 2 +- 5 files changed, 17 insertions(+), 28 deletions(-) diff --git a/logs/cov.out b/logs/cov.out index ecff896..70db958 100644 --- a/logs/cov.out +++ b/logs/cov.out @@ -3,7 +3,7 @@ Name Stmts Miss Cover /media/ph33r/Data/Project/mllib/Git/mllib/__init__.py 7 0 100% /media/ph33r/Data/Project/mllib/Git/mllib/lib/__init__.py 7 0 100% /media/ph33r/Data/Project/mllib/Git/mllib/lib/cluster.py 103 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/knn.py 77 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/knn.py 70 0 100% /media/ph33r/Data/Project/mllib/Git/mllib/lib/model.py 44 0 100% ----------------------------------------------------------------------------------------- -TOTAL 238 0 100% +TOTAL 231 0 100% diff --git a/logs/pylint/lib-knn-py.out b/logs/pylint/lib-knn-py.out index 840218f..28f2b90 100644 --- a/logs/pylint/lib-knn-py.out +++ b/logs/pylint/lib-knn-py.out @@ -1,8 +1,8 @@ ************* Module mllib.lib.knn -knn.py:174:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -knn.py:175:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -knn.py:176:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -knn.py:177:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:172:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:173:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:174:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:175:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -------------------------------------------------------------------- Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) diff --git a/mllib/__main__.py b/mllib/__main__.py index 4077d8e..7cbcca8 100644 --- a/mllib/__main__.py +++ b/mllib/__main__.py @@ -90,10 +90,7 @@ # --- KNN start_t = time.time_ns() df_ip = pd.read_csv(path + "input/iris.csv") - df_ip = df_ip[["y", "x1", "x2"]] - df_train = df_ip.sample(frac=0.8, random_state=42) - df_test = df_ip.drop(df_train.index) - mod = KNN(df_train, "y", ["x1", "x2"], method="classify") + mod = KNN(df_ip, "y", ["x1", "x2", "x3", "x4"], method="classify") print("\nKNN\n") for k, v in mod.model_summary.items(): print(k, str(v).rjust(69 - len(k))) diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py index 284b718..4169387 100644 --- a/mllib/lib/knn.py +++ b/mllib/lib/knn.py @@ -144,8 +144,10 @@ def _fit(self) -> Dict[str, Any]: if self.method == "classify": gs = GridSearchCV(estimator=sn.KNeighborsClassifier(), param_grid=self.param, - scoring='accuracy', + scoring='f1_weighted', verbose=0, + refit=True, + return_train_score=True, cv=self.k_fold, n_jobs=-1) elif self.method == "regression": @@ -153,29 +155,19 @@ def _fit(self) -> Dict[str, Any]: param_grid=self.param, scoring='neg_root_mean_squared_error', verbose=0, + refit=True, + return_train_score=True, cv=self.k_fold, n_jobs=-1) gs_op = gs.fit(self.df[self.x_var], self.df[self.y_var]) - opt_k = gs_op.best_params_.get("n_neighbors") - weight = gs_op.best_params_.get("weights") - metric = gs_op.best_params_.get("metric") - if self.method == "classify": - model = sn.KNeighborsClassifier(n_neighbors=opt_k, - weights=weight, - metric=metric) - elif self.method == "regression": - model = sn.KNeighborsRegressor(n_neighbors=opt_k, - weights=weight, - metric=metric) - self.model = model.fit(self.df[self.x_var], - self.df[self.y_var]) + self.model = gs_op return gs_op.best_params_ def _compute_metrics(self): """Compute commonly used metrics to evaluate the model.""" y = self.df.loc[:, self.y_var].values.tolist() - y_hat = list(self.predict(self.df[self.x_var])[self.y_var].values) + y_hat = list(self.model.predict(self.df[self.x_var])) if self.method == "regression": model_summary = {"rsq": np.round(metrics.rsq(y, y_hat), 3), "mae": np.round(metrics.mae(y, y_hat), 3), @@ -183,8 +175,8 @@ def _compute_metrics(self): "rmse": np.round(metrics.rmse(y, y_hat), 3)} model_summary["mse"] = np.round(model_summary["rmse"] ** 2, 3) if self.method == "classify": - class_report = classification_report(y_hat, - y, + class_report = classification_report(y, + y_hat, output_dict=True, zero_division=0) model_summary = class_report["weighted avg"] diff --git a/requirements.txt b/requirements.txt index 66d1dec..ef333fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -Cython==0.29.15 numpy==1.19.5 pandas==1.1.3 +Cython==0.29.15 scikit_learn==1.0 From d0a5e1e226b3b843a4da8735695b0a93e9835958 Mon Sep 17 00:00:00 2001 From: Diptesh Basak Date: Mon, 27 Sep 2021 11:14:46 +0530 Subject: [PATCH 12/13] v0.4.0 --- mllib/lib/knn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py index 4169387..9e98369 100644 --- a/mllib/lib/knn.py +++ b/mllib/lib/knn.py @@ -87,6 +87,10 @@ class KNN(): Final optimal model. + best_params_ : Dict + + Best parameters amongst the given parameters. + model_summary : Dict Model summary containing key metrics like R-squared, RMSE, MSE, MAE, From 9c67779f9996fec5d5864f74cf57b6387818411f Mon Sep 17 00:00:00 2001 From: MadhuTangudu Date: Mon, 27 Sep 2021 13:36:36 +0530 Subject: [PATCH 13/13] v0.4.0 changelog: - local run test --- logs/cov.out | 18 +++++++++--------- logs/pip.out | 2 +- logs/pylint/lib-knn-py.out | 8 ++++---- mllib/lib/knn.py | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/logs/cov.out b/logs/cov.out index 70db958..f0a3c4b 100644 --- a/logs/cov.out +++ b/logs/cov.out @@ -1,9 +1,9 @@ -Name Stmts Miss Cover Missing ------------------------------------------------------------------------------------------ -/media/ph33r/Data/Project/mllib/Git/mllib/__init__.py 7 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/__init__.py 7 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/cluster.py 103 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/knn.py 70 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/model.py 44 0 100% ------------------------------------------------------------------------------------------ -TOTAL 231 0 100% +Name Stmts Miss Cover Missing +----------------------------------------------------- +mllib/__init__.py 7 0 100% +mllib/lib/__init__.py 7 0 100% +mllib/lib/cluster.py 103 0 100% +mllib/lib/knn.py 70 0 100% +mllib/lib/model.py 44 0 100% +----------------------------------------------------- +TOTAL 231 0 100% diff --git a/logs/pip.out b/logs/pip.out index f61bf91..03fb79a 100644 --- a/logs/pip.out +++ b/logs/pip.out @@ -1 +1 @@ -INFO: Successfully saved requirements file in /media/ph33r/Data/Project/mllib/Git/requirements.txt +./bin/run_tests.sh: line 78: pipreqs: command not found diff --git a/logs/pylint/lib-knn-py.out b/logs/pylint/lib-knn-py.out index 28f2b90..ccf9413 100644 --- a/logs/pylint/lib-knn-py.out +++ b/logs/pylint/lib-knn-py.out @@ -1,8 +1,8 @@ ************* Module mllib.lib.knn -knn.py:172:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -knn.py:173:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -knn.py:174:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -knn.py:175:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:176:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:177:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:178:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +knn.py:179:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -------------------------------------------------------------------- Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py index 9e98369..5b14b05 100644 --- a/mllib/lib/knn.py +++ b/mllib/lib/knn.py @@ -134,7 +134,7 @@ def __init__(self, self._compute_metrics() def _pre_process(self): - """Pre-process the data, one hot encoding and Normalizing.""" + """Pre-process the data, one hot encoding and normalizing.""" df_ip_x = pd.get_dummies(self.df[self.x_var]) self.x_var = list(df_ip_x.columns) self.norm = MinMaxScaler()