diff --git a/logs/cov.out b/logs/cov.out index f0a3c4b..eed6fca 100644 --- a/logs/cov.out +++ b/logs/cov.out @@ -1,9 +1,10 @@ -Name Stmts Miss Cover Missing ------------------------------------------------------ -mllib/__init__.py 7 0 100% -mllib/lib/__init__.py 7 0 100% -mllib/lib/cluster.py 103 0 100% -mllib/lib/knn.py 70 0 100% -mllib/lib/model.py 44 0 100% ------------------------------------------------------ -TOTAL 231 0 100% +Name Stmts Miss Cover Missing +----------------------------------------------------------------------------------------- +/media/ph33r/Data/Project/mllib/Git/mllib/__init__.py 7 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/__init__.py 7 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/cluster.py 103 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/knn.py 70 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/model.py 44 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/tree.py 79 0 100% +----------------------------------------------------------------------------------------- +TOTAL 310 0 100% diff --git a/logs/pip.out b/logs/pip.out index 03fb79a..f61bf91 100644 --- a/logs/pip.out +++ b/logs/pip.out @@ -1 +1 @@ -./bin/run_tests.sh: line 78: pipreqs: command not found +INFO: Successfully saved requirements file in /media/ph33r/Data/Project/mllib/Git/requirements.txt diff --git a/logs/pylint/lib-glmnet_ts-py.out b/logs/pylint/lib-glmnet_ts-py.out deleted file mode 100644 index 77fd809..0000000 --- a/logs/pylint/lib-glmnet_ts-py.out +++ /dev/null @@ -1,9 +0,0 @@ -************* Module mllib.lib.glmnet_ts -glmnet_ts.py:238:41: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -glmnet_ts.py:239:41: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. 
Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -glmnet_ts.py:240:42: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -glmnet_ts.py:241:42: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) - --------------------------------------------------------------------- -Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) - diff --git a/logs/pylint/lib-tree-py.out b/logs/pylint/lib-tree-py.out new file mode 100644 index 0000000..1b9facf --- /dev/null +++ b/logs/pylint/lib-tree-py.out @@ -0,0 +1,9 @@ +************* Module mllib.lib.tree +tree.py:73:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +tree.py:74:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +tree.py:75:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +tree.py:76:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. 
Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) + +-------------------------------------------------------------------- +Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) + diff --git a/logs/pylint/tests-test_glmnet_ts-py.out b/logs/pylint/tests-test_glmnet_ts-py.out deleted file mode 100644 index d7495ee..0000000 --- a/logs/pylint/tests-test_glmnet_ts-py.out +++ /dev/null @@ -1,4 +0,0 @@ - --------------------------------------------------------------------- -Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) - diff --git a/logs/pylint/lib-glmnet-py.out b/logs/pylint/tests-test_tree-py.out similarity index 100% rename from logs/pylint/lib-glmnet-py.out rename to logs/pylint/tests-test_tree-py.out diff --git a/mllib/__main__.py b/mllib/__main__.py index 7cbcca8..d5bc8be 100644 --- a/mllib/__main__.py +++ b/mllib/__main__.py @@ -29,6 +29,8 @@ from lib.cluster import Cluster # noqa: F841 from lib.model import GLMNet # noqa: F841 from lib.knn import KNN # noqa: F841 +from lib.tree import RandomForest # noqa: F841 +from lib.tree import XGBoost # noqa: F841 # ============================================================================= # --- DO NOT CHANGE ANYTHING FROM HERE @@ -92,6 +94,28 @@ df_ip = pd.read_csv(path + "input/iris.csv") mod = KNN(df_ip, "y", ["x1", "x2", "x3", "x4"], method="classify") print("\nKNN\n") + for k, v in mod.model_summary.items(): + print(k, str(v).rjust(69 - len(k))) + print(elapsed_time("Time", start_t), + sep="\n") + # --- Random forest + start_t = time.time_ns() + df_ip = pd.read_csv(path + "input/iris.csv") + x_var = ["x1", "x2", "x3", "x4"] + y_var = "y" + mod = RandomForest(df_ip, y_var, x_var, method="classify") + print("\nRandom forest\n") + for k, v in mod.model_summary.items(): + print(k, str(v).rjust(69 - len(k))) + print(elapsed_time("Time", start_t), + sep="\n") + # --- XGBoost + 
start_t = time.time_ns() + df_ip = pd.read_csv(path + "input/iris.csv") + x_var = ["x1", "x2", "x3", "x4"] + y_var = "y" + mod = XGBoost(df_ip, y_var, x_var, method="classify") + print("\nXGBoost\n") for k, v in mod.model_summary.items(): print(k, str(v).rjust(69 - len(k))) print(elapsed_time("Time", start_t), diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py index 5b14b05..7483e22 100644 --- a/mllib/lib/knn.py +++ b/mllib/lib/knn.py @@ -104,7 +104,7 @@ class KNN(): Example ------- >>> mod = KNN(df=df_ip, y_var="y", x_var=["x1", "x2", "x3"]) - >>> df_op = mod.predict(df_predict) + >>> df_op = mod.predict(x_predict) """ @@ -189,7 +189,7 @@ def _compute_metrics(self): for key in model_summary} self.model_summary = model_summary - def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: + def predict(self, x_predict: pd.DataFrame) -> pd.DataFrame: """Predict y_var/target variable. Parameters @@ -205,8 +205,8 @@ def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: Pandas dataframe containing predicted `y_var` and `x_var`. """ - df_op = df_predict.copy(deep=True) - df_predict = pd.get_dummies(df_predict) + df_op = x_predict.copy(deep=True) + df_predict = pd.get_dummies(x_predict) df_predict_tmp = pd.DataFrame(columns=self.x_var) df_predict = pd.concat([df_predict_tmp, df_predict]) df_predict = df_predict.fillna(0) diff --git a/mllib/lib/tree.py b/mllib/lib/tree.py new file mode 100644 index 0000000..5a33fb9 --- /dev/null +++ b/mllib/lib/tree.py @@ -0,0 +1,309 @@ +""" +Tree based models. + +**Available routines:** + +- class ``RandomForest``: Builds Random Forest model using cross validation. +- class ``XGBoost``: Builds XGBoost model using cross validation. 
+ +Credits +------- +:: + + Authors: + - Diptesh + - Madhu + + Date: Sep 27, 2021 +""" + +# pylint: disable=invalid-name +# pylint: disable=R0902,R0903,R0913,C0413 + +from typing import List, Dict, Any + +import re +import sys +from inspect import getsourcefile +from os.path import abspath + +import pandas as pd +import numpy as np +import sklearn.ensemble as rf +import xgboost as xgb + +from sklearn.model_selection import RandomizedSearchCV +from sklearn.metrics import classification_report + +path = abspath(getsourcefile(lambda: 0)) +path = re.sub(r"(.+\/)(.+.py)", "\\1", path) +sys.path.insert(0, path) + +import metrics # noqa: F841 + + +class Tree(): + """Parent class for tree based models.""" + + def __init__(self, + df: pd.DataFrame, + y_var: str, + x_var: List[str], + method: str = "regression", + k_fold: int = 5, + param: Dict = None): + """Initialize variables.""" + self.y_var = y_var + self.x_var = x_var + self.df = df.reset_index(drop=True) + self.method = method + self.k_fold = k_fold + self.seed = 1 + self.model = None + self.model_summary = None + self.param = param + self.best_params_ = self._fit() + self._compute_metrics() + + def _compute_metrics(self): + """Compute commonly used metrics to evaluate the model.""" + y = self.df.loc[:, self.y_var].values.tolist() + y_hat = list(self.model.predict(self.df[self.x_var])) + if self.method == "regression": + model_summary = {"rsq": np.round(metrics.rsq(y, y_hat), 3), + "mae": np.round(metrics.mae(y, y_hat), 3), + "mape": np.round(metrics.mape(y, y_hat), 3), + "rmse": np.round(metrics.rmse(y, y_hat), 3)} + model_summary["mse"] = np.round(model_summary["rmse"] ** 2, 3) + if self.method == "classify": + class_report = classification_report(y, + y_hat, + output_dict=True, + zero_division=0) + model_summary = class_report["weighted avg"] + model_summary["accuracy"] = class_report["accuracy"] + model_summary = {key: round(model_summary[key], 3) + for key in model_summary} + self.model_summary = model_summary + 
+ def _fit(self) -> Dict[str, Any]: # pragma: no cover + """Fit model.""" + return self.best_params_ + + def predict(self, x_predict: pd.DataFrame) -> pd.DataFrame: + """Predict values.""" + df_op = x_predict.copy(deep=True) + y_hat = self.model.predict(x_predict) + df_op.insert(loc=0, column=self.y_var, value=y_hat) + return df_op + + +class RandomForest(Tree): + """Random forest module. + + Objective: + - Build + `Random forest `_ + model and determine optimal k + + Parameters + ---------- + df : pandas.DataFrame + + Pandas dataframe containing the `y_var` and `x_var` + + y_var : str + + Dependant variable + + x_var : List[str] + + Independant variables + + method : str, optional + + Can be either `classify` or `regression` (the default is regression) + + k_fold : int, optional + + Number of cross validations folds (the default is 5) + + param : dict, optional + + Random forest parameters (the default is None). + In case of None, the parameters will default to:: + + bootstrap: [True] + max_depth: [1, len(x_var)] + n_estimators: [1000] + max_features: ["sqrt", "auto"] + min_samples_leaf: [2, 5] + + Returns + ------- + model : object + + Final optimal model. + + best_params_ : Dict + + Best parameters amongst the given parameters. + + model_summary : Dict + + Model summary containing key metrics like R-squared, RMSE, MSE, MAE, + MAPE for regression and Accuracy, Precision, Recall, F1 score for + classification. 
+ + Methods + ------- + predict + + Example + ------- + >>> mod = RandomForest(df=df_ip, y_var="y", x_var=["x1", "x2", "x3"]) + >>> df_op = mod.predict(x_predict) + + """ + + def _fit(self) -> Dict[str, Any]: + """Fit RandomForest model.""" + if self.param is None: + self.param = {"bootstrap": [True], + "max_depth": list(range(1, len(self.x_var))), + "n_estimators": [100]} + if self.method == "classify": + self.param["max_features"] = ["sqrt"] + self.param["min_samples_leaf"] = [2] + elif self.method == "regression": + self.param["max_features"] = [int(len(self.x_var) / 3)] + self.param["min_samples_leaf"] = [5] + if self.method == "classify": + tmp_model = rf.RandomForestClassifier(oob_score=True, + random_state=self.seed) + elif self.method == "regression": + tmp_model = rf.RandomForestRegressor(oob_score=True, + random_state=self.seed) + gs = RandomizedSearchCV(estimator=tmp_model, + param_distributions=self.param, + n_jobs=-1, + verbose=0, + refit=True, + n_iter=3, + return_train_score=True, + cv=self.k_fold) + gs_op = gs.fit(self.df[self.x_var], + self.df[self.y_var]) + self.model = gs_op + return gs_op.best_params_ + + +class XGBoost(Tree): + """XGBoost module. + + Objective: + - Build + `XGBoost `_ + model and determine optimal k + + Parameters + ---------- + df : pandas.DataFrame + + Pandas dataframe containing the `y_var` and `x_var` + + y_var : str + + Dependant variable + + x_var : List[str] + + Independant variables + + method : str, optional + + Can be either `classify` or `regression` (the default is regression) + + k_fold : int, optional + + Number of cross validations folds (the default is 5) + + param : dict, optional + + XGBoost parameters (the default is None). 
+ In case of None, the parameters will default to:: + + n_estimators: [100] + learning_rate: [0.01, 0.1, 0.2, 0.3] + subsample: [0.5, 0.75, 1.0] + colsample_bytree: [0.5, 1.0] + min_child_weight: [0.5, 1.0, 3.0] + max_depth: [int(len(self.x_var) * 0.8)] + objective: ["reg:squarederror", "binary:logistic"] + + Returns + ------- + model : object + + Final optimal model. + + best_params_ : Dict + + Best parameters amongst the given parameters. + + model_summary : Dict + + Model summary containing key metrics like R-squared, RMSE, MSE, MAE, + MAPE for regression and Accuracy, Precision, Recall, F1 score for + classification. + + Methods + ------- + predict + + Example + ------- + >>> mod = XGBoost(df=df_ip, y_var="y", x_var=["x1", "x2", "x3"]) + >>> df_op = mod.predict(x_predict) + + """ + + def _fit(self) -> Dict[str, Any]: + """Fit XGBoost model.""" + if self.param is None: + self.param = {"n_estimators": [100], + "learning_rate": [0.01, 0.1, 0.2, 0.3], + "subsample": [0.5, 0.75, 1.0], + "colsample_bytree": [0.5, 1.0], + "min_child_weight": [0.5, 1.0, 3.0], + "max_depth": [int(len(self.x_var) * 0.8)]} + if self.method == "classify": + self.param["objective"] = ["binary:logistic"] + elif self.method == "regression": + self.param["objective"] = ["reg:squarederror"] + if self.method == "classify": + tmp_model = xgb.XGBClassifier(n_jobs=1, + verbosity=0, + silent=True, + random_state=self.seed, + seed=self.seed, + use_label_encoder=False) + elif self.method == "regression": + tmp_model = xgb.XGBRegressor(n_jobs=1, + verbosity=0, + silent=True, + random_state=self.seed, + seed=self.seed) + gs = RandomizedSearchCV(estimator=tmp_model, + param_distributions=self.param, + n_jobs=-1, + verbose=0, + refit=True, + n_iter=10, + return_train_score=True, + cv=self.k_fold, + random_state=self.seed) + gs_op = gs.fit(self.df[self.x_var], + self.df[self.y_var]) + self.model = gs_op + return gs_op.best_params_ diff --git a/requirements.txt b/requirements.txt index ef333fe..7715ab9 
100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ +Cython==0.29.15 numpy==1.19.5 pandas==1.1.3 -Cython==0.29.15 +xgboost==1.3.3 scikit_learn==1.0 diff --git a/tests/test_knn.py b/tests/test_knn.py index b244270..a4c0547 100644 --- a/tests/test_knn.py +++ b/tests/test_knn.py @@ -93,8 +93,8 @@ def test_knn_reg(self): mod = KNN(df_train, y_var, x_var, method="regression") y_hat = mod.predict(df_test[x_var])[y_var].tolist() y = df_test[y_var].values.tolist() - acc = round(sk_metrics.mean_squared_error(y, y_hat), 2) - self.assertLessEqual(acc, 0.1) + mse = round(sk_metrics.mean_squared_error(y, y_hat), 2) + self.assertLessEqual(mse, 0.1) def test_knn_cat(self): """KNN: Test for one-hot encoding in prediction.""" diff --git a/tests/test_tree.py b/tests/test_tree.py new file mode 100644 index 0000000..5ae5fc2 --- /dev/null +++ b/tests/test_tree.py @@ -0,0 +1,147 @@ +""" +Test suite module for ``XGBoost``. + +Credits +------- +:: + + Authors: + - Diptesh + - Madhu + + Date: Sep 27, 2021 +""" + +# pylint: disable=invalid-name +# pylint: disable=wrong-import-position + +import unittest +import warnings +import re +import sys + +from inspect import getsourcefile +from os.path import abspath + +import pandas as pd + +from sklearn.model_selection import train_test_split as split +from sklearn import metrics as sk_metrics + +# Set base path +path = abspath(getsourcefile(lambda: 0)) +path = re.sub(r"(.+)(\/tests.*)", "\\1", path) + +sys.path.insert(0, path) + +from mllib.lib.tree import RandomForest # noqa: F841 +from mllib.lib.tree import XGBoost # noqa: F841 + +# ============================================================================= +# --- DO NOT CHANGE ANYTHING FROM HERE +# ============================================================================= + +path = path + "/data/input/" + +# ============================================================================= +# --- User defined functions +# 
# =============================================================================


def ignore_warnings(test_func):
    """Decorator that runs *test_func* with all warnings silenced."""

    def wrapper(self, *args, **kwargs):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            test_func(self, *args, **kwargs)
    return wrapper


def _split_iris(x_var, y_var, seed):
    """Load the iris data and return a stratified 80/20 train/test split."""
    df_ip = pd.read_csv(path + "iris.csv")
    df_ip = df_ip[[y_var] + x_var]
    return split(df_ip,
                 stratify=df_ip[y_var],
                 test_size=0.2,
                 random_state=seed)


class Test_RandomForest(unittest.TestCase):
    """Test suite for module ``RandomForest``."""

    def setUp(self):
        """Set up for module ``RandomForest``."""

    def test_rf_class(self):
        """RandomForest: Test for classification."""
        x_var = ["x1", "x2", "x3", "x4"]
        y_var = "y"
        df_train, df_test = _split_iris(x_var, y_var, 42)
        mod = RandomForest(df_train, y_var, x_var, method="classify")
        y_hat = mod.predict(df_test[x_var])[y_var].tolist()
        y = df_test[y_var].values.tolist()
        acc = round(sk_metrics.accuracy_score(y, y_hat), 2)
        self.assertGreaterEqual(acc, 0.93)

    @ignore_warnings
    def test_rf_reg(self):
        """RandomForest: Test for regression."""
        x_var = ["x1", "x2", "x3", "x4"]
        y_var = "y"
        df_train, df_test = _split_iris(x_var, y_var, 42)
        mod = RandomForest(df_train, y_var, x_var, method="regression")
        y_hat = mod.predict(df_test[x_var])[y_var].tolist()
        y = df_test[y_var].values.tolist()
        mse = round(sk_metrics.mean_squared_error(y, y_hat), 2)
        self.assertLessEqual(mse, 0.1)


class Test_XGBoost(unittest.TestCase):
    """Test suite for module ``XGBoost``."""

    def setUp(self):
        """Set up for module ``XGBoost``."""

    @ignore_warnings
    def test_xgboost_class(self):
        """XGBoost: Test for classification."""
        x_var = ["x1", "x2"]
        y_var = "y"
        df_train, df_test = _split_iris(x_var, y_var, 1)
        mod = XGBoost(df_train, y_var, x_var, method="classify")
        y_hat = mod.predict(df_test[x_var])[y_var].tolist()
        y = df_test[y_var].values.tolist()
        acc = round(sk_metrics.accuracy_score(y, y_hat), 2)
        self.assertGreaterEqual(acc, 0.93)

    @ignore_warnings
    def test_xgboost_reg(self):
        """XGBoost: Test for regression."""
        x_var = ["x1", "x2", "x3", "x4"]
        y_var = "y"
        df_train, df_test = _split_iris(x_var, y_var, 1)
        mod = XGBoost(df_train, y_var, x_var, method="regression")
        y_hat = mod.predict(df_test[x_var])[y_var].tolist()
        y = df_test[y_var].values.tolist()
        mse = round(sk_metrics.mean_squared_error(y, y_hat), 2)
        self.assertLessEqual(mse, 0.5)


# =============================================================================
# --- Main
# =============================================================================

if __name__ == '__main__':
    unittest.main()