From d7c73bbd919200393f4f5080e75481370745f1a8 Mon Sep 17 00:00:00 2001 From: Diptesh Basak Date: Mon, 27 Sep 2021 16:54:53 +0530 Subject: [PATCH 1/4] v0.4.0 changelog: - added RandomForest module with associated tests --- logs/cov.out | 19 +- logs/pip.out | 2 +- logs/pylint/lib-random_forest-py.out | 9 + logs/pylint/tests-test_random_forest-py.out | 4 + mllib/lib/knn.py | 6 +- mllib/lib/random_forest.py | 187 ++++++++++++++++++++ requirements.txt | 2 +- tests/test_knn.py | 4 +- tests/test_random_forest.py | 105 +++++++++++ 9 files changed, 322 insertions(+), 16 deletions(-) create mode 100644 logs/pylint/lib-random_forest-py.out create mode 100644 logs/pylint/tests-test_random_forest-py.out create mode 100644 mllib/lib/random_forest.py create mode 100644 tests/test_random_forest.py diff --git a/logs/cov.out b/logs/cov.out index f0a3c4b..4dbc976 100644 --- a/logs/cov.out +++ b/logs/cov.out @@ -1,9 +1,10 @@ -Name Stmts Miss Cover Missing ------------------------------------------------------ -mllib/__init__.py 7 0 100% -mllib/lib/__init__.py 7 0 100% -mllib/lib/cluster.py 103 0 100% -mllib/lib/knn.py 70 0 100% -mllib/lib/model.py 44 0 100% ------------------------------------------------------ -TOTAL 231 0 100% +Name Stmts Miss Cover Missing +---------------------------------------------------------------------------------------------- +/media/ph33r/Data/Project/mllib/Git/mllib/__init__.py 7 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/__init__.py 7 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/cluster.py 103 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/knn.py 70 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/model.py 44 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/random_forest.py 61 0 100% +---------------------------------------------------------------------------------------------- +TOTAL 292 0 100% diff --git a/logs/pip.out b/logs/pip.out index 03fb79a..f61bf91 100644 --- a/logs/pip.out +++ b/logs/pip.out @@ -1 +1 @@ -./bin/run_tests.sh: line 78: pipreqs: command not found +INFO: Successfully saved requirements file in /media/ph33r/Data/Project/mllib/Git/requirements.txt diff --git a/logs/pylint/lib-random_forest-py.out b/logs/pylint/lib-random_forest-py.out new file mode 100644 index 0000000..ade91ae --- /dev/null +++ b/logs/pylint/lib-random_forest-py.out @@ -0,0 +1,9 @@ +************* Module mllib.lib.random_forest +random_forest.py:146:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +random_forest.py:147:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +random_forest.py:148:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +random_forest.py:149:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) + +-------------------------------------------------------------------- +Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) + diff --git a/logs/pylint/tests-test_random_forest-py.out b/logs/pylint/tests-test_random_forest-py.out new file mode 100644 index 0000000..d7495ee --- /dev/null +++ b/logs/pylint/tests-test_random_forest-py.out @@ -0,0 +1,4 @@ + +-------------------------------------------------------------------- +Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) + diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py index 5b14b05..ba5d08d 100644 --- a/mllib/lib/knn.py +++ b/mllib/lib/knn.py @@ -189,7 +189,7 @@ def _compute_metrics(self): for key in model_summary} self.model_summary = model_summary - def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: + def predict(self, x_predict: pd.DataFrame) -> pd.DataFrame: """Predict y_var/target variable. Parameters @@ -205,8 +205,8 @@ def predict(self, df_predict: pd.DataFrame) -> pd.DataFrame: Pandas dataframe containing predicted `y_var` and `x_var`. """ - df_op = df_predict.copy(deep=True) - df_predict = pd.get_dummies(df_predict) + df_op = x_predict.copy(deep=True) + df_predict = pd.get_dummies(x_predict) df_predict_tmp = pd.DataFrame(columns=self.x_var) df_predict = pd.concat([df_predict_tmp, df_predict]) df_predict = df_predict.fillna(0) diff --git a/mllib/lib/random_forest.py b/mllib/lib/random_forest.py new file mode 100644 index 0000000..33685ab --- /dev/null +++ b/mllib/lib/random_forest.py @@ -0,0 +1,187 @@ +""" +Random Forest module. + +**Available routines:** + +- class ``RandomForest``: Builds Random Forest model using cross validation. + +Credits +------- +:: + + Authors: + - Diptesh + - Madhu + + Date: Sep 27, 2021 +""" + +# pylint: disable=invalid-name +# pylint: disable=R0902,R0903,R0913,C0413 + +from typing import List, Dict, Any + +import re +import sys +from inspect import getsourcefile +from os.path import abspath + +import pandas as pd +import numpy as np +import sklearn.ensemble as rf + +from sklearn.model_selection import GridSearchCV +from sklearn.metrics import classification_report + +path = abspath(getsourcefile(lambda: 0)) +path = re.sub(r"(.+\/)(.+.py)", "\\1", path) +sys.path.insert(0, path) + +import metrics # noqa: F841 + + +class RandomForest(): + """Random forest module. + + Objective: + - Build + `Random forest `_ + model and determine optimal k + + Parameters + ---------- + df : pandas.DataFrame + + Pandas dataframe containing the `y_var` and `x_var` + + y_var : str + + Dependant variable + + x_var : List[str] + + Independant variables + + method : str, optional + + Can be either `classify` or `regression` (the default is regression) + + k_fold : int, optional + + Number of cross validations folds (the default is 5) + + param : dict, optional + + Random forest parameters (the default is None). + In case of None, the parameters will default to:: + + bootstrap: [True] + max_depth: [1, len(x_var)] + n_estimators: [1000] + max_features: ["sqrt", "auto"] + min_samples_leaf: [2, 5] + + Returns + ------- + model : object + + Final optimal model. + + best_params_ : Dict + + Best parameters amongst the given parameters. + + model_summary : Dict + + Model summary containing key metrics like R-squared, RMSE, MSE, MAE, + MAPE for regression and Accuracy, Precision, Recall, F1 score for + classification. + + Methods + ------- + predict + + Example + ------- + >>> mod = RandomForest(df=df_ip, y_var="y", x_var=["x1", "x2", "x3"]) + >>> df_op = mod.predict(df_predict) + + """ + + def __init__(self, + df: pd.DataFrame, + y_var: str, + x_var: List[str], + method: str = "regression", + k_fold: int = 5, + param: Dict = None): + """Initialize variables for module ``RandomForest``.""" + self.y_var = y_var + self.x_var = x_var + self.df = df.reset_index(drop=True) + self.method = method + self.model = None + self.k_fold = k_fold + self.seed = 1 + if param is None: + param = {"bootstrap": [True], + "max_depth": list(range(1, len(x_var))), + "n_estimators": [1000]} + if method == "classify": + param["max_features"] = ["sqrt"] + param["min_samples_leaf"] = [2] + elif method == "regression": + param["max_features"] = [int(len(x_var) / 3)] + param["min_samples_leaf"] = [5] + self.param = param + self.best_params_ = self._fit() + self.model_summary = None + self._compute_metrics() + + def _compute_metrics(self): + """Compute commonly used metrics to evaluate the model.""" + y = self.df.loc[:, self.y_var].values.tolist() + y_hat = list(self.model.predict(self.df[self.x_var])) + if self.method == "regression": + model_summary = {"rsq": np.round(metrics.rsq(y, y_hat), 3), + "mae": np.round(metrics.mae(y, y_hat), 3), + "mape": np.round(metrics.mape(y, y_hat), 3), + "rmse": np.round(metrics.rmse(y, y_hat), 3)} + model_summary["mse"] = np.round(model_summary["rmse"] ** 2, 3) + if self.method == "classify": + class_report = classification_report(y, + y_hat, + output_dict=True, + zero_division=0) + model_summary = class_report["weighted avg"] + model_summary["accuracy"] = class_report["accuracy"] + model_summary = {key: round(model_summary[key], 3) + for key in model_summary} + self.model_summary = model_summary + + def _fit(self) -> Dict[str, Any]: + """Fit RandomForest model.""" + if self.method == "classify": + tmp_model = rf.RandomForestClassifier(oob_score=True, + random_state=self.seed) + elif self.method == "regression": + tmp_model = rf.RandomForestRegressor(oob_score=True, + random_state=self.seed) + gs = GridSearchCV(estimator=tmp_model, + param_grid=self.param, + n_jobs=-1, + verbose=0, + refit=True, + return_train_score=True, + cv=self.k_fold) + gs_op = gs.fit(self.df[self.x_var], + self.df[self.y_var]) + self.model = gs_op + return gs_op.best_params_ + + def predict(self, x_predict: pd.DataFrame) -> pd.DataFrame: + """Predict values.""" + df_op = x_predict.copy(deep=True) + y_hat = self.model.predict(x_predict) + df_op.insert(loc=0, column=self.y_var, value=y_hat) + return df_op diff --git a/requirements.txt b/requirements.txt index ef333fe..86af4d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -numpy==1.19.5 pandas==1.1.3 +numpy==1.19.5 Cython==0.29.15 scikit_learn==1.0 diff --git a/tests/test_knn.py b/tests/test_knn.py index b244270..a4c0547 100644 --- a/tests/test_knn.py +++ b/tests/test_knn.py @@ -93,8 +93,8 @@ def test_knn_reg(self): mod = KNN(df_train, y_var, x_var, method="regression") y_hat = mod.predict(df_test[x_var])[y_var].tolist() y = df_test[y_var].values.tolist() - acc = round(sk_metrics.mean_squared_error(y, y_hat), 2) - self.assertLessEqual(acc, 0.1) + mse = round(sk_metrics.mean_squared_error(y, y_hat), 2) + self.assertLessEqual(mse, 0.1) def test_knn_cat(self): """KNN: Test for one-hot encoding in prediction.""" diff --git a/tests/test_random_forest.py b/tests/test_random_forest.py new file mode 100644 index 0000000..f0127f3 --- /dev/null +++ b/tests/test_random_forest.py @@ -0,0 +1,105 @@ +""" +Test suite module for ``random_forest``. + +Credits +------- +:: + + Authors: + - Diptesh + - Madhu + + Date: Sep 27, 2021 +""" + +# pylint: disable=invalid-name +# pylint: disable=wrong-import-position + +import unittest +import warnings +import re +import sys + +from inspect import getsourcefile +from os.path import abspath + +import pandas as pd + +from sklearn.model_selection import train_test_split as split +from sklearn import metrics as sk_metrics + +# Set base path +path = abspath(getsourcefile(lambda: 0)) +path = re.sub(r"(.+)(\/tests.*)", "\\1", path) + +sys.path.insert(0, path) + +from mllib.lib.random_forest import RandomForest # noqa: F841 + +# ============================================================================= +# --- DO NOT CHANGE ANYTHING FROM HERE +# ============================================================================= + +path = path + "/data/input/" + +# ============================================================================= +# --- User defined functions +# ============================================================================= + + +def ignore_warnings(test_func): + """Suppress warnings.""" + + def do_test(self, *args, **kwargs): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + test_func(self, *args, **kwargs) + return do_test + + +class Test_Knn(unittest.TestCase): + """Test suite for module ``KNN``.""" + + def setUp(self): + """Set up for module ``KNN``.""" + + def test_rf_class(self): + """RandomForest: Test for classification.""" + x_var = ["x1", "x2", "x3", "x4"] + y_var = "y" + df_ip = pd.read_csv(path + "iris.csv") + df_ip = df_ip[[y_var] + x_var] + df_train, df_test = split(df_ip, + stratify=df_ip[y_var], + test_size=0.2, + random_state=42) + mod = RandomForest(df_train, y_var, x_var, method="classify") + y_hat = mod.predict(df_test[x_var])[y_var].tolist() + y = df_test[y_var].values.tolist() + acc = round(sk_metrics.accuracy_score(y, y_hat), 2) + self.assertGreaterEqual(acc, 0.93) + + @ignore_warnings + def test_knn_reg(self): + """RandomForest: Test for regression.""" + x_var = ["x1", "x2", "x3", "x4"] + y_var = "y" + df_ip = pd.read_csv(path + "iris.csv") + df_ip = df_ip[[y_var] + x_var] + df_train, df_test = split(df_ip, + stratify=df_ip[y_var], + test_size=0.2, + random_state=42) + mod = RandomForest(df_train, y_var, x_var, method="regression") + y_hat = mod.predict(df_test[x_var])[y_var].tolist() + y = df_test[y_var].values.tolist() + mse = round(sk_metrics.mean_squared_error(y, y_hat), 2) + self.assertLessEqual(mse, 0.1) + + +# ============================================================================= +# --- Main +# ============================================================================= + +if __name__ == '__main__': + unittest.main() From 38787dfc6c1bf273d3bb3f4e99931b90dbc6dbac Mon Sep 17 00:00:00 2001 From: MadhuTangudu Date: Mon, 27 Sep 2021 19:48:02 +0530 Subject: [PATCH 2/4] v0.4.0 --- mllib/lib/xgboost.py | 199 ++++++++++++++++++++++++++++++++++++++++++ tests/test_xgboost.py | 105 ++++++++++++++++++++++ 2 files changed, 304 insertions(+) create mode 100644 mllib/lib/xgboost.py create mode 100644 tests/test_xgboost.py diff --git a/mllib/lib/xgboost.py b/mllib/lib/xgboost.py new file mode 100644 index 0000000..9d82dff --- /dev/null +++ b/mllib/lib/xgboost.py @@ -0,0 +1,199 @@ +""" +Random Forest module. + +**Available routines:** + +- class ``XGBoost``: Builds XGBoost model using cross validation. + +Credits +------- +:: + + Authors: + - Diptesh + - Madhu + + Date: Sep 27, 2021 +""" + +# pylint: disable=invalid-name +# pylint: disable=R0902,R0903,R0913,C0413 + +from typing import List, Dict, Any + +import re +import sys +from inspect import getsourcefile +from os.path import abspath + +import pandas as pd +import numpy as np +import xgboost as xgb + +from sklearn.model_selection import GridSearchCV +from sklearn.metrics import classification_report + +path = abspath(getsourcefile(lambda: 0)) +path = re.sub(r"(.+\/)(.+.py)", "\\1", path) +sys.path.insert(0, path) + +import metrics # noqa: F841 + + +class XGBoost(): + """Random forest module. + + Objective: + - Build + `XGBoost < https://en.wikipedia.org/wiki/XGBoost >' + model and determine optimal k + + Parameters + ---------- + df : pandas.DataFrame + + Pandas dataframe containing the `y_var` and `x_var` + + y_var : str + + Dependant variable + + x_var : List[str] + + Independant variables + + method : str, optional + + Can be either `classify` or `regression` (the default is regression) + + k_fold : int, optional + + Number of cross validations folds (the default is 5) + + param : dict, optional + + XGBoost parameters (the default is None). + In case of None, the parameters will default to:: + + n_estimators: [1000] + learning_rate: [i/1000 for i in range(2, 11)] + subsample: [i/10 for i in range(5, 10)] + colsample_bytree: [i/10 for i in range(1, 11)] + min_child_weight: list(range(1, 11)) + max_depth: [1, len(x_var)] + gamma: list(range(1, len(x_var))) + objective: ["reg:squarederror"] + + Returns + ------- + model : object + + Final optimal model. + + best_params_ : Dict + + Best parameters amongst the given parameters. + + model_summary : Dict + + Model summary containing key metrics like R-squared, RMSE, MSE, MAE, + MAPE for regression and Accuracy, Precision, Recall, F1 score for + classification. + + Methods + ------- + predict + + Example + ------- + >>> mod = XGBoost(df=df_ip, y_var="y", x_var=["x1", "x2", "x3"]) + >>> df_op = mod.predict(df_predict) + + """ + + def __init__(self, + df: pd.DataFrame, + y_var: str, + x_var: List[str], + method: str = "regression", + k_fold: int = 5, + param: Dict = None): + """Initialize variables for module ``XGBoost``.""" + self.y_var = y_var + self.x_var = x_var + self.df = df.reset_index(drop=True) + self.method = method + self.model = None + self.k_fold = k_fold + self.seed = 1 + if param is None: + param = {"n_estimators": [1000], + "learning_rate": [i/1000 for i in range(2, 11)], + "subsample": [i/10 for i in range(5, 10)], + "colsample_bytree": [i/10 for i in range(1, 11)], + "min_child_weight": list(range(1, 11)), + "max_depth": list(range(1, len(x_var))), + "gamma": list(range(0, 21)), + } + if method == "classify": + param["objective"] = ["binary:logistic"] + elif method == "regression": + param["objective"] = ["reg:squarederror"] + self.param = param + self.best_params_ = self._fit() + self.model_summary = None + self._compute_metrics() + + def _compute_metrics(self): + """Compute commonly used metrics to evaluate the model.""" + y = self.df.loc[:, self.y_var].values.tolist() + y_hat = list(self.model.predict(self.df[self.x_var])) + if self.method == "regression": + model_summary = {"rsq": np.round(metrics.rsq(y, y_hat), 3), + "mae": np.round(metrics.mae(y, y_hat), 3), + "mape": np.round(metrics.mape(y, y_hat), 3), + "rmse": np.round(metrics.rmse(y, y_hat), 3)} + model_summary["mse"] = np.round(model_summary["rmse"] ** 2, 3) + if self.method == "classify": + class_report = classification_report(y, + y_hat, + output_dict=True, + zero_division=0) + model_summary = class_report["weighted avg"] + model_summary["accuracy"] = class_report["accuracy"] + model_summary = {key: round(model_summary[key], 3) + for key in model_summary} + self.model_summary = model_summary + + def _fit(self) -> Dict[str, Any]: + """Fit XGBoost model.""" + if self.method == "classify": + tmp_model = xgb.XGBClassifier(n_jobs=-1, + verbosity=0, + silent=True, + random_state=self.seed, + seed=self.seed) + elif self.method == "regression": + tmp_model = xgb.XGBRegressor(n_jobs=-1, + verbosity=0, + silent=True, + random_state=self.seed, + seed=self.seed) + gs = GridSearchCV(estimator=tmp_model, + param_grid=self.param, + n_jobs=-1, + verbose=0, + refit=True, + return_train_score=True, + cv=self.k_fold) + gs_op = gs.fit(self.df[self.x_var], + self.df[self.y_var]) + self.model = gs_op + return gs_op.best_params_ + + def predict(self, x_predict: pd.DataFrame) -> pd.DataFrame: + """Predict values.""" + df_op = x_predict.copy(deep=True) + y_hat = self.model.predict(x_predict) + df_op.insert(loc=0, column=self.y_var, value=y_hat) + return df_op diff --git a/tests/test_xgboost.py b/tests/test_xgboost.py new file mode 100644 index 0000000..d155db0 --- /dev/null +++ b/tests/test_xgboost.py @@ -0,0 +1,105 @@ +""" +Test suite module for ``XGBoost``. + +Credits +------- +:: + + Authors: + - Diptesh + - Madhu + + Date: Sep 27, 2021 +""" + +# pylint: disable=invalid-name +# pylint: disable=wrong-import-position + +import unittest +import warnings +import re +import sys + +from inspect import getsourcefile +from os.path import abspath + +import pandas as pd + +from sklearn.model_selection import train_test_split as split +from sklearn import metrics as sk_metrics + +# Set base path +path = abspath(getsourcefile(lambda: 0)) +path = re.sub(r"(.+)(\/tests.*)", "\\1", path) + +sys.path.insert(0, path) + +from mllib.lib.xgboost import XGBoost # noqa: F841 + +# ============================================================================= +# --- DO NOT CHANGE ANYTHING FROM HERE +# ============================================================================= + +path = path + "/data/input/" + +# ============================================================================= +# --- User defined functions +# ============================================================================= + + +def ignore_warnings(test_func): + """Suppress warnings.""" + + def do_test(self, *args, **kwargs): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + test_func(self, *args, **kwargs) + return do_test + + +class Test_XGBoost(unittest.TestCase): + """Test suite for module ``XGBoost``.""" + + def setUp(self): + """Set up for module ``XGBoost``.""" + + def test_rf_class(self): + """XGBoost: Test for classification.""" + x_var = ["x1", "x2", "x3", "x4"] + y_var = "y" + df_ip = pd.read_csv(path + "iris.csv") + df_ip = df_ip[[y_var] + x_var] + df_train, df_test = split(df_ip, + stratify=df_ip[y_var], + test_size=0.2, + random_state=42) + mod = XGBoost(df_train, y_var, x_var, method="classify") + y_hat = mod.predict(df_test[x_var])[y_var].tolist() + y = df_test[y_var].values.tolist() + acc = round(sk_metrics.accuracy_score(y, y_hat), 2) + self.assertGreaterEqual(acc, 0.93) + + @ignore_warnings + def test_knn_reg(self): + """XGBoost: Test for regression.""" + x_var = ["x1", "x2", "x3", "x4"] + y_var = "y" + df_ip = pd.read_csv(path + "iris.csv") + df_ip = df_ip[[y_var] + x_var] + df_train, df_test = split(df_ip, + stratify=df_ip[y_var], + test_size=0.2, + random_state=42) + mod = XGBoost(df_train, y_var, x_var, method="regression") + y_hat = mod.predict(df_test[x_var])[y_var].tolist() + y = df_test[y_var].values.tolist() + mse = round(sk_metrics.mean_squared_error(y, y_hat), 2) + self.assertLessEqual(mse, 0.1) + + +# ============================================================================= +# --- Main +# ============================================================================= + +if __name__ == '__main__': + unittest.main() From 352c968d165a4a3fee63bffcdaf86983addd938d Mon Sep 17 00:00:00 2001 From: Diptesh Basak Date: Mon, 27 Sep 2021 23:04:38 +0530 Subject: [PATCH 3/4] v0.4.0 changelog: - added XGBoost module with associated tests --- logs/cov.out | 3 +- logs/pylint/lib-boost-py.out | 9 ++++ logs/pylint/tests-test_boost-py.out | 4 ++ mllib/lib/{xgboost.py => boost.py} | 52 ++++++++++++------------ requirements.txt | 3 +- tests/{test_xgboost.py => test_boost.py} | 21 ++++++---- tests/test_random_forest.py | 8 ++-- 7 files changed, 61 insertions(+), 39 deletions(-) create mode 100644 logs/pylint/lib-boost-py.out create mode 100644 logs/pylint/tests-test_boost-py.out rename mllib/lib/{xgboost.py => boost.py} (77%) rename tests/{test_xgboost.py => test_boost.py} (82%) diff --git a/logs/cov.out b/logs/cov.out index 4dbc976..5117727 100644 --- a/logs/cov.out +++ b/logs/cov.out @@ -2,9 +2,10 @@ Name Stmts Miss C ---------------------------------------------------------------------------------------------- /media/ph33r/Data/Project/mllib/Git/mllib/__init__.py 7 0 100% /media/ph33r/Data/Project/mllib/Git/mllib/lib/__init__.py 7 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/boost.py 53 0 100% /media/ph33r/Data/Project/mllib/Git/mllib/lib/cluster.py 103 0 100% /media/ph33r/Data/Project/mllib/Git/mllib/lib/knn.py 70 0 100% /media/ph33r/Data/Project/mllib/Git/mllib/lib/model.py 44 0 100% /media/ph33r/Data/Project/mllib/Git/mllib/lib/random_forest.py 61 0 100% ---------------------------------------------------------------------------------------------- -TOTAL 292 0 100% +TOTAL 345 0 100% diff --git a/logs/pylint/lib-boost-py.out b/logs/pylint/lib-boost-py.out new file mode 100644 index 0000000..aed4fa6 --- /dev/null +++ b/logs/pylint/lib-boost-py.out @@ -0,0 +1,9 @@ +************* Module mllib.lib.boost +boost.py:152:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +boost.py:153:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +boost.py:154:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +boost.py:155:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) + +-------------------------------------------------------------------- +Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) + diff --git a/logs/pylint/tests-test_boost-py.out b/logs/pylint/tests-test_boost-py.out new file mode 100644 index 0000000..d7495ee --- /dev/null +++ b/logs/pylint/tests-test_boost-py.out @@ -0,0 +1,4 @@ + +-------------------------------------------------------------------- +Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) + diff --git a/mllib/lib/xgboost.py b/mllib/lib/boost.py similarity index 77% rename from mllib/lib/xgboost.py rename to mllib/lib/boost.py index 9d82dff..64dd6cd 100644 --- a/mllib/lib/xgboost.py +++ b/mllib/lib/boost.py @@ -1,5 +1,5 @@ """ -Random Forest module. +XGBoost module. **Available routines:** @@ -30,7 +30,7 @@ import numpy as np import xgboost as xgb -from sklearn.model_selection import GridSearchCV +from sklearn.model_selection import RandomizedSearchCV from sklearn.metrics import classification_report path = abspath(getsourcefile(lambda: 0)) @@ -41,11 +41,11 @@ class XGBoost(): - """Random forest module. + """XGBoost module. Objective: - Build - `XGBoost < https://en.wikipedia.org/wiki/XGBoost >' + `XGBoost `_ model and determine optimal k Parameters @@ -81,8 +81,8 @@ class XGBoost(): colsample_bytree: [i/10 for i in range(1, 11)] min_child_weight: list(range(1, 11)) max_depth: [1, len(x_var)] - gamma: list(range(1, len(x_var))) - objective: ["reg:squarederror"] + gamma: list(np.arange(0.0, 1.1, 0.25)) + objective: ["reg:squarederror", "binary:logistic"] Returns ------- @@ -126,15 +126,15 @@ def __init__(self, self.model = None self.k_fold = k_fold self.seed = 1 - if param is None: + if param is None: # pragma: no cover param = {"n_estimators": [1000], - "learning_rate": [i/1000 for i in range(2, 11)], - "subsample": [i/10 for i in range(5, 10)], - "colsample_bytree": [i/10 for i in range(1, 11)], - "min_child_weight": list(range(1, 11)), - "max_depth": list(range(1, len(x_var))), - "gamma": list(range(0, 21)), - } + "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3], + "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], + "colsample_bytree": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], + "colsample_bylevel": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], + "min_child_weight": [0.5, 1.0, 3.0, 5.0, 7.0, 10.0], + "max_depth": list(range(1, len(x_var))), + "gamma": [0, 0.25, 0.5, 1.0]} if method == "classify": param["objective"] = ["binary:logistic"] elif method == "regression": @@ -169,23 +169,25 @@ def _fit(self) -> Dict[str, Any]: """Fit XGBoost model.""" if self.method == "classify": tmp_model = xgb.XGBClassifier(n_jobs=-1, - verbosity=0, - silent=True, - random_state=self.seed, - seed=self.seed) + verbosity=0, + silent=True, + random_state=self.seed, + seed=self.seed, + use_label_encoder=False) elif self.method == "regression": tmp_model = xgb.XGBRegressor(n_jobs=-1, verbosity=0, silent=True, random_state=self.seed, seed=self.seed) - gs = GridSearchCV(estimator=tmp_model, - param_grid=self.param, - n_jobs=-1, - verbose=0, - refit=True, - return_train_score=True, - cv=self.k_fold) + gs = RandomizedSearchCV(estimator=tmp_model, + param_distributions=self.param, + n_jobs=-1, + verbose=0, + refit=True, + return_train_score=True, + cv=self.k_fold, + random_state=self.seed) gs_op = gs.fit(self.df[self.x_var], self.df[self.y_var]) self.model = gs_op diff --git a/requirements.txt b/requirements.txt index 86af4d0..0070676 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ pandas==1.1.3 -numpy==1.19.5 Cython==0.29.15 +xgboost==1.3.3 +numpy==1.19.5 scikit_learn==1.0 diff --git a/tests/test_xgboost.py b/tests/test_boost.py similarity index 82% rename from tests/test_xgboost.py rename to tests/test_boost.py index d155db0..2532c37 100644 --- a/tests/test_xgboost.py +++ b/tests/test_boost.py @@ -34,7 +34,7 @@ sys.path.insert(0, path) -from mllib.lib.xgboost import XGBoost # noqa: F841 +from mllib.lib.boost import XGBoost # noqa: F841 # ============================================================================= # --- DO NOT CHANGE ANYTHING FROM HERE @@ -63,7 +63,8 @@ class Test_XGBoost(unittest.TestCase): def setUp(self): """Set up for module ``XGBoost``.""" - def test_rf_class(self): + @ignore_warnings + def test_xgboost_class(self): """XGBoost: Test for classification.""" x_var = ["x1", "x2", "x3", "x4"] y_var = "y" @@ -72,15 +73,17 @@ def test_rf_class(self): df_train, df_test = split(df_ip, stratify=df_ip[y_var], test_size=0.2, - random_state=42) - mod = XGBoost(df_train, y_var, x_var, method="classify") + random_state=1) + mod = XGBoost(df_train, y_var, x_var, method="classify", + param={"n_estimators": [1], + "objective": ["binary:logistic"]}) y_hat = mod.predict(df_test[x_var])[y_var].tolist() y = df_test[y_var].values.tolist() acc = round(sk_metrics.accuracy_score(y, y_hat), 2) self.assertGreaterEqual(acc, 0.93) @ignore_warnings - def test_knn_reg(self): + def test_xgboost_reg(self): """XGBoost: Test for regression.""" x_var = ["x1", "x2", "x3", "x4"] y_var = "y" @@ -89,12 +92,14 @@ def test_knn_reg(self): df_train, df_test = split(df_ip, stratify=df_ip[y_var], test_size=0.2, - random_state=42) - mod = XGBoost(df_train, y_var, x_var, method="regression") + random_state=1) + mod = XGBoost(df_train, y_var, x_var, method="regression", + param={"n_estimators": [1], + "objective": ["reg:squarederror"]}) y_hat = mod.predict(df_test[x_var])[y_var].tolist() y = df_test[y_var].values.tolist() mse = round(sk_metrics.mean_squared_error(y, y_hat), 2) - self.assertLessEqual(mse, 0.1) + self.assertLessEqual(mse, 0.5) # ============================================================================= diff --git a/tests/test_random_forest.py b/tests/test_random_forest.py index f0127f3..54bb0f3 100644 --- a/tests/test_random_forest.py +++ b/tests/test_random_forest.py @@ -57,11 +57,11 @@ def do_test(self, *args, **kwargs): return do_test -class Test_Knn(unittest.TestCase): - """Test suite for module ``KNN``.""" +class Test_RandomForest(unittest.TestCase): + """Test suite for module ``RandomForest``.""" def setUp(self): - """Set up for module ``KNN``.""" + """Set up for module ``RandomForest``.""" def test_rf_class(self): """RandomForest: Test for classification.""" @@ -80,7 +80,7 @@ def test_rf_class(self): self.assertGreaterEqual(acc, 0.93) @ignore_warnings - def test_knn_reg(self): + def test_rf_reg(self): """RandomForest: Test for regression.""" x_var = ["x1", "x2", "x3", "x4"] y_var = "y" From af7cfa0ba59eca9470fa47d5d52377ce462fd49b Mon Sep 17 00:00:00 2001 From: Diptesh Basak Date: Tue, 28 Sep 2021 02:09:09 +0530 Subject: [PATCH 4/4] v0.4.0 changelog: - combined RandomForest and XGBoost into one single module namely, tree --- logs/cov.out | 21 +- logs/pylint/lib-boost-py.out | 9 - logs/pylint/lib-glmnet_ts-py.out | 9 - logs/pylint/lib-random_forest-py.out | 9 - logs/pylint/lib-tree-py.out | 9 + logs/pylint/tests-test_boost-py.out | 4 - logs/pylint/tests-test_glmnet_ts-py.out | 4 - logs/pylint/tests-test_random_forest-py.out | 4 - ...b-glmnet-py.out => tests-test_tree-py.out} | 0 mllib/__main__.py | 24 ++ mllib/lib/knn.py | 2 +- mllib/lib/random_forest.py | 187 ------------- mllib/lib/{boost.py => tree.py} | 254 +++++++++++++----- requirements.txt | 4 +- tests/test_boost.py | 110 -------- tests/{test_random_forest.py => test_tree.py} | 46 +++- 16 files changed, 271 insertions(+), 425 deletions(-) delete mode 100644 logs/pylint/lib-boost-py.out delete mode 100644 logs/pylint/lib-glmnet_ts-py.out delete mode 100644 logs/pylint/lib-random_forest-py.out create mode 100644 logs/pylint/lib-tree-py.out delete mode 100644 logs/pylint/tests-test_boost-py.out delete mode 100644 logs/pylint/tests-test_glmnet_ts-py.out delete mode 100644 logs/pylint/tests-test_random_forest-py.out rename logs/pylint/{lib-glmnet-py.out => tests-test_tree-py.out} (100%) delete mode 100644 mllib/lib/random_forest.py rename mllib/lib/{boost.py => tree.py} (55%) delete mode 100644 tests/test_boost.py rename tests/{test_random_forest.py => test_tree.py} (63%) diff --git a/logs/cov.out b/logs/cov.out index 5117727..eed6fca 100644 --- a/logs/cov.out +++ b/logs/cov.out @@ -1,11 +1,10 @@ -Name Stmts Miss Cover Missing ----------------------------------------------------------------------------------------------- -/media/ph33r/Data/Project/mllib/Git/mllib/__init__.py 7 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/__init__.py 7 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/boost.py 53 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/cluster.py 103 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/knn.py 70 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/model.py 44 0 100% -/media/ph33r/Data/Project/mllib/Git/mllib/lib/random_forest.py 61 0 100% ----------------------------------------------------------------------------------------------- -TOTAL 345 0 100% +Name Stmts Miss Cover Missing +----------------------------------------------------------------------------------------- +/media/ph33r/Data/Project/mllib/Git/mllib/__init__.py 7 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/__init__.py 7 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/cluster.py 103 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/knn.py 70 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/model.py 44 0 100% +/media/ph33r/Data/Project/mllib/Git/mllib/lib/tree.py 79 0 100% +----------------------------------------------------------------------------------------- +TOTAL 310 0 100% diff --git a/logs/pylint/lib-boost-py.out b/logs/pylint/lib-boost-py.out deleted file mode 100644 index aed4fa6..0000000 --- a/logs/pylint/lib-boost-py.out +++ /dev/null @@ -1,9 +0,0 @@ -************* Module mllib.lib.boost -boost.py:152:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -boost.py:153:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -boost.py:154:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -boost.py:155:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) - --------------------------------------------------------------------- -Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) - diff --git a/logs/pylint/lib-glmnet_ts-py.out b/logs/pylint/lib-glmnet_ts-py.out deleted file mode 100644 index 77fd809..0000000 --- a/logs/pylint/lib-glmnet_ts-py.out +++ /dev/null @@ -1,9 +0,0 @@ -************* Module mllib.lib.glmnet_ts -glmnet_ts.py:238:41: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -glmnet_ts.py:239:41: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -glmnet_ts.py:240:42: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -glmnet_ts.py:241:42: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) - --------------------------------------------------------------------- -Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) - diff --git a/logs/pylint/lib-random_forest-py.out b/logs/pylint/lib-random_forest-py.out deleted file mode 100644 index ade91ae..0000000 --- a/logs/pylint/lib-random_forest-py.out +++ /dev/null @@ -1,9 +0,0 @@ -************* Module mllib.lib.random_forest -random_forest.py:146:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -random_forest.py:147:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -random_forest.py:148:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) -random_forest.py:149:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) - --------------------------------------------------------------------- -Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) - diff --git a/logs/pylint/lib-tree-py.out b/logs/pylint/lib-tree-py.out new file mode 100644 index 0000000..1b9facf --- /dev/null +++ b/logs/pylint/lib-tree-py.out @@ -0,0 +1,9 @@ +************* Module mllib.lib.tree +tree.py:73:45: I1101: Module 'metrics' has no 'rsq' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +tree.py:74:45: I1101: Module 'metrics' has no 'mae' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +tree.py:75:46: I1101: Module 'metrics' has no 'mape' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) +tree.py:76:46: I1101: Module 'metrics' has no 'rmse' member, but source is unavailable. Consider adding this module to extension-pkg-whitelist if you want to perform analysis based on run-time introspection of living objects. (c-extension-no-member) + +-------------------------------------------------------------------- +Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) + diff --git a/logs/pylint/tests-test_boost-py.out b/logs/pylint/tests-test_boost-py.out deleted file mode 100644 index d7495ee..0000000 --- a/logs/pylint/tests-test_boost-py.out +++ /dev/null @@ -1,4 +0,0 @@ - --------------------------------------------------------------------- -Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) - diff --git a/logs/pylint/tests-test_glmnet_ts-py.out b/logs/pylint/tests-test_glmnet_ts-py.out deleted file mode 100644 index d7495ee..0000000 --- a/logs/pylint/tests-test_glmnet_ts-py.out +++ /dev/null @@ -1,4 +0,0 @@ - --------------------------------------------------------------------- -Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) - diff --git a/logs/pylint/tests-test_random_forest-py.out b/logs/pylint/tests-test_random_forest-py.out deleted file mode 100644 index d7495ee..0000000 --- a/logs/pylint/tests-test_random_forest-py.out +++ /dev/null @@ -1,4 +0,0 @@ - --------------------------------------------------------------------- -Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00) - diff --git a/logs/pylint/lib-glmnet-py.out b/logs/pylint/tests-test_tree-py.out similarity index 100% rename from logs/pylint/lib-glmnet-py.out rename to logs/pylint/tests-test_tree-py.out diff --git a/mllib/__main__.py b/mllib/__main__.py index 7cbcca8..d5bc8be 100644 --- a/mllib/__main__.py +++ b/mllib/__main__.py @@ -29,6 +29,8 @@ from lib.cluster import Cluster # noqa: F841 from lib.model import GLMNet # noqa: F841 from lib.knn import KNN # noqa: F841 +from lib.tree import RandomForest # noqa: F841 +from lib.tree import XGBoost # noqa: F841 # ============================================================================= # --- DO NOT CHANGE ANYTHING FROM HERE @@ -92,6 +94,28 @@ df_ip = pd.read_csv(path + "input/iris.csv") mod = KNN(df_ip, "y", ["x1", "x2", "x3", "x4"], method="classify") print("\nKNN\n") + for k, v in mod.model_summary.items(): + print(k, str(v).rjust(69 - len(k))) + print(elapsed_time("Time", start_t), + sep="\n") + # --- Random forest + start_t = time.time_ns() + df_ip = pd.read_csv(path + "input/iris.csv") + x_var = ["x1", "x2", "x3", "x4"] + y_var = "y" + mod = RandomForest(df_ip, y_var, x_var, method="classify") + print("\nRandom forest\n") + for k, v in mod.model_summary.items(): + print(k, str(v).rjust(69 - len(k))) + print(elapsed_time("Time", start_t), + sep="\n") + # --- XGBoost + start_t = time.time_ns() + df_ip = pd.read_csv(path + "input/iris.csv") + x_var = ["x1", "x2", "x3", "x4"] + y_var = "y" + mod = XGBoost(df_ip, y_var, x_var, method="classify") + print("\nXGBoost\n") for k, v in mod.model_summary.items(): print(k, str(v).rjust(69 - len(k))) print(elapsed_time("Time", start_t), diff --git a/mllib/lib/knn.py b/mllib/lib/knn.py index ba5d08d..7483e22 100644 --- a/mllib/lib/knn.py +++ b/mllib/lib/knn.py @@ -104,7 +104,7 @@ class KNN(): Example ------- >>> mod = KNN(df=df_ip, y_var="y", x_var=["x1", "x2", "x3"]) - >>> df_op = mod.predict(df_predict) + >>> df_op = mod.predict(x_predict) """ diff --git a/mllib/lib/random_forest.py b/mllib/lib/random_forest.py deleted file mode 100644 index 33685ab..0000000 --- a/mllib/lib/random_forest.py +++ /dev/null @@ -1,187 +0,0 @@ -""" -Random Forest module. - -**Available routines:** - -- class ``RandomForest``: Builds Random Forest model using cross validation. - -Credits -------- -:: - - Authors: - - Diptesh - - Madhu - - Date: Sep 27, 2021 -""" - -# pylint: disable=invalid-name -# pylint: disable=R0902,R0903,R0913,C0413 - -from typing import List, Dict, Any - -import re -import sys -from inspect import getsourcefile -from os.path import abspath - -import pandas as pd -import numpy as np -import sklearn.ensemble as rf - -from sklearn.model_selection import GridSearchCV -from sklearn.metrics import classification_report - -path = abspath(getsourcefile(lambda: 0)) -path = re.sub(r"(.+\/)(.+.py)", "\\1", path) -sys.path.insert(0, path) - -import metrics # noqa: F841 - - -class RandomForest(): - """Random forest module. - - Objective: - - Build - `Random forest `_ - model and determine optimal k - - Parameters - ---------- - df : pandas.DataFrame - - Pandas dataframe containing the `y_var` and `x_var` - - y_var : str - - Dependant variable - - x_var : List[str] - - Independant variables - - method : str, optional - - Can be either `classify` or `regression` (the default is regression) - - k_fold : int, optional - - Number of cross validations folds (the default is 5) - - param : dict, optional - - Random forest parameters (the default is None). - In case of None, the parameters will default to:: - - bootstrap: [True] - max_depth: [1, len(x_var)] - n_estimators: [1000] - max_features: ["sqrt", "auto"] - min_samples_leaf: [2, 5] - - Returns - ------- - model : object - - Final optimal model. - - best_params_ : Dict - - Best parameters amongst the given parameters. - - model_summary : Dict - - Model summary containing key metrics like R-squared, RMSE, MSE, MAE, - MAPE for regression and Accuracy, Precision, Recall, F1 score for - classification. - - Methods - ------- - predict - - Example - ------- - >>> mod = RandomForest(df=df_ip, y_var="y", x_var=["x1", "x2", "x3"]) - >>> df_op = mod.predict(df_predict) - - """ - - def __init__(self, - df: pd.DataFrame, - y_var: str, - x_var: List[str], - method: str = "regression", - k_fold: int = 5, - param: Dict = None): - """Initialize variables for module ``RandomForest``.""" - self.y_var = y_var - self.x_var = x_var - self.df = df.reset_index(drop=True) - self.method = method - self.model = None - self.k_fold = k_fold - self.seed = 1 - if param is None: - param = {"bootstrap": [True], - "max_depth": list(range(1, len(x_var))), - "n_estimators": [1000]} - if method == "classify": - param["max_features"] = ["sqrt"] - param["min_samples_leaf"] = [2] - elif method == "regression": - param["max_features"] = [int(len(x_var) / 3)] - param["min_samples_leaf"] = [5] - self.param = param - self.best_params_ = self._fit() - self.model_summary = None - self._compute_metrics() - - def _compute_metrics(self): - """Compute commonly used metrics to evaluate the model.""" - y = self.df.loc[:, self.y_var].values.tolist() - y_hat = list(self.model.predict(self.df[self.x_var])) - if self.method == "regression": - model_summary = {"rsq": np.round(metrics.rsq(y, y_hat), 3), - "mae": np.round(metrics.mae(y, y_hat), 3), - "mape": np.round(metrics.mape(y, y_hat), 3), - "rmse": np.round(metrics.rmse(y, y_hat), 3)} - model_summary["mse"] = np.round(model_summary["rmse"] ** 2, 3) - if self.method == "classify": - class_report = classification_report(y, - y_hat, - output_dict=True, - zero_division=0) - model_summary = class_report["weighted avg"] - model_summary["accuracy"] = class_report["accuracy"] - model_summary = {key: round(model_summary[key], 3) - for key in model_summary} - self.model_summary = model_summary - - def _fit(self) -> Dict[str, Any]: - """Fit RandomForest model.""" - if self.method == "classify": - tmp_model = rf.RandomForestClassifier(oob_score=True, - random_state=self.seed) - elif self.method == "regression": - tmp_model = rf.RandomForestRegressor(oob_score=True, - random_state=self.seed) - gs = GridSearchCV(estimator=tmp_model, - param_grid=self.param, - n_jobs=-1, - verbose=0, - refit=True, - return_train_score=True, - cv=self.k_fold) - gs_op = gs.fit(self.df[self.x_var], - self.df[self.y_var]) - self.model = gs_op - return gs_op.best_params_ - - def predict(self, x_predict: pd.DataFrame) -> pd.DataFrame: - """Predict values.""" - df_op = x_predict.copy(deep=True) - y_hat = self.model.predict(x_predict) - df_op.insert(loc=0, column=self.y_var, value=y_hat) - return df_op diff --git a/mllib/lib/boost.py b/mllib/lib/tree.py similarity index 55% rename from mllib/lib/boost.py rename to mllib/lib/tree.py index 64dd6cd..5a33fb9 100644 --- a/mllib/lib/boost.py +++ b/mllib/lib/tree.py @@ -1,8 +1,9 @@ """ -XGBoost module. +Tree based models. **Available routines:** +- class ``RandomForest``: Builds Random Forest model using cross validation. - class ``XGBoost``: Builds XGBoost model using cross validation. Credits @@ -28,6 +29,7 @@ import pandas as pd import numpy as np +import sklearn.ensemble as rf import xgboost as xgb from sklearn.model_selection import RandomizedSearchCV @@ -40,7 +42,163 @@ import metrics # noqa: F841 -class XGBoost(): +class Tree(): + """Parent class for tree based models.""" + + def __init__(self, + df: pd.DataFrame, + y_var: str, + x_var: List[str], + method: str = "regression", + k_fold: int = 5, + param: Dict = None): + """Initialize variables.""" + self.y_var = y_var + self.x_var = x_var + self.df = df.reset_index(drop=True) + self.method = method + self.k_fold = k_fold + self.seed = 1 + self.model = None + self.model_summary = None + self.param = param + self.best_params_ = self._fit() + self._compute_metrics() + + def _compute_metrics(self): + """Compute commonly used metrics to evaluate the model.""" + y = self.df.loc[:, self.y_var].values.tolist() + y_hat = list(self.model.predict(self.df[self.x_var])) + if self.method == "regression": + model_summary = {"rsq": np.round(metrics.rsq(y, y_hat), 3), + "mae": np.round(metrics.mae(y, y_hat), 3), + "mape": np.round(metrics.mape(y, y_hat), 3), + "rmse": np.round(metrics.rmse(y, y_hat), 3)} + model_summary["mse"] = np.round(model_summary["rmse"] ** 2, 3) + if self.method == "classify": + class_report = classification_report(y, + y_hat, + output_dict=True, + zero_division=0) + model_summary = class_report["weighted avg"] + model_summary["accuracy"] = class_report["accuracy"] + model_summary = {key: round(model_summary[key], 3) + for key in model_summary} + self.model_summary = model_summary + + def _fit(self) -> Dict[str, Any]: # pragma: no cover + """Fit model.""" + return self.best_params_ + + def predict(self, x_predict: pd.DataFrame) -> pd.DataFrame: + """Predict values.""" + df_op = x_predict.copy(deep=True) + y_hat = self.model.predict(x_predict) + df_op.insert(loc=0, column=self.y_var, value=y_hat) + return df_op + + +class RandomForest(Tree): + """Random forest module. + + Objective: + - Build + `Random forest `_ + model and determine optimal k + + Parameters + ---------- + df : pandas.DataFrame + + Pandas dataframe containing the `y_var` and `x_var` + + y_var : str + + Dependant variable + + x_var : List[str] + + Independant variables + + method : str, optional + + Can be either `classify` or `regression` (the default is regression) + + k_fold : int, optional + + Number of cross validations folds (the default is 5) + + param : dict, optional + + Random forest parameters (the default is None). + In case of None, the parameters will default to:: + + bootstrap: [True] + max_depth: [1, len(x_var)] + n_estimators: [1000] + max_features: ["sqrt", "auto"] + min_samples_leaf: [2, 5] + + Returns + ------- + model : object + + Final optimal model. + + best_params_ : Dict + + Best parameters amongst the given parameters. + + model_summary : Dict + + Model summary containing key metrics like R-squared, RMSE, MSE, MAE, + MAPE for regression and Accuracy, Precision, Recall, F1 score for + classification. + + Methods + ------- + predict + + Example + ------- + >>> mod = RandomForest(df=df_ip, y_var="y", x_var=["x1", "x2", "x3"]) + >>> df_op = mod.predict(x_predict) + + """ + + def _fit(self) -> Dict[str, Any]: + """Fit RandomForest model.""" + if self.param is None: + self.param = {"bootstrap": [True], + "max_depth": list(range(1, len(self.x_var))), + "n_estimators": [100]} + if self.method == "classify": + self.param["max_features"] = ["sqrt"] + self.param["min_samples_leaf"] = [2] + elif self.method == "regression": + self.param["max_features"] = [int(len(self.x_var) / 3)] + self.param["min_samples_leaf"] = [5] + if self.method == "classify": + tmp_model = rf.RandomForestClassifier(oob_score=True, + random_state=self.seed) + elif self.method == "regression": + tmp_model = rf.RandomForestRegressor(oob_score=True, + random_state=self.seed) + gs = RandomizedSearchCV(estimator=tmp_model, + param_distributions=self.param, + n_jobs=-1, + verbose=0, + refit=True, + n_iter=3, + return_train_score=True, + cv=self.k_fold) + gs_op = gs.fit(self.df[self.x_var], + self.df[self.y_var]) + self.model = gs_op + return gs_op.best_params_ + + +class XGBoost(Tree): """XGBoost module. Objective: @@ -75,13 +233,12 @@ class XGBoost(): XGBoost parameters (the default is None). In case of None, the parameters will default to:: - n_estimators: [1000] - learning_rate: [i/1000 for i in range(2, 11)] - subsample: [i/10 for i in range(5, 10)] - colsample_bytree: [i/10 for i in range(1, 11)] - min_child_weight: list(range(1, 11)) - max_depth: [1, len(x_var)] - gamma: list(np.arange(0.0, 1.1, 0.25)) + n_estimators: [100] + learning_rate: [0.01, 0.1, 0.2, 0.3] + subsample: [0.5, 0.75, 1.0] + colsample_bytree: [0.5, 1.0] + min_child_weight: [0.5, 1.0, 3.0] + max_depth: [int(len(self.x_var) * 0.8)] objective: ["reg:squarederror", "binary:logistic"] Returns @@ -107,75 +264,32 @@ class XGBoost(): Example ------- >>> mod = XGBoost(df=df_ip, y_var="y", x_var=["x1", "x2", "x3"]) - >>> df_op = mod.predict(df_predict) + >>> df_op = mod.predict(x_predict) """ - def __init__(self, - df: pd.DataFrame, - y_var: str, - x_var: List[str], - method: str = "regression", - k_fold: int = 5, - param: Dict = None): - """Initialize variables for module ``XGBoost``.""" - self.y_var = y_var - self.x_var = x_var - self.df = df.reset_index(drop=True) - self.method = method - self.model = None - self.k_fold = k_fold - self.seed = 1 - if param is None: # pragma: no cover - param = {"n_estimators": [1000], - "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3], - "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], - "colsample_bytree": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], - "colsample_bylevel": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], - "min_child_weight": [0.5, 1.0, 3.0, 5.0, 7.0, 10.0], - "max_depth": list(range(1, len(x_var))), - "gamma": [0, 0.25, 0.5, 1.0]} - if method == "classify": - param["objective"] = ["binary:logistic"] - elif method == "regression": - param["objective"] = ["reg:squarederror"] - self.param = param - self.best_params_ = self._fit() - self.model_summary = None - self._compute_metrics() - - def _compute_metrics(self): - """Compute commonly used metrics to evaluate the model.""" - y = self.df.loc[:, self.y_var].values.tolist() - y_hat = list(self.model.predict(self.df[self.x_var])) - if self.method == "regression": - model_summary = {"rsq": np.round(metrics.rsq(y, y_hat), 3), - "mae": np.round(metrics.mae(y, y_hat), 3), - "mape": np.round(metrics.mape(y, y_hat), 3), - "rmse": np.round(metrics.rmse(y, y_hat), 3)} - model_summary["mse"] = np.round(model_summary["rmse"] ** 2, 3) - if self.method == "classify": - class_report = classification_report(y, - y_hat, - output_dict=True, - zero_division=0) - model_summary = class_report["weighted avg"] - model_summary["accuracy"] = class_report["accuracy"] - model_summary = {key: round(model_summary[key], 3) - for key in model_summary} - self.model_summary = model_summary - def _fit(self) -> Dict[str, Any]: """Fit XGBoost model.""" + if self.param is None: + self.param = {"n_estimators": [100], + "learning_rate": [0.01, 0.1, 0.2, 0.3], + "subsample": [0.5, 0.75, 1.0], + "colsample_bytree": [0.5, 1.0], + "min_child_weight": [0.5, 1.0, 3.0], + "max_depth": [int(len(self.x_var) * 0.8)]} + if self.method == "classify": + self.param["objective"] = ["binary:logistic"] + elif self.method == "regression": + self.param["objective"] = ["reg:squarederror"] if self.method == "classify": - tmp_model = xgb.XGBClassifier(n_jobs=-1, + tmp_model = xgb.XGBClassifier(n_jobs=1, verbosity=0, silent=True, random_state=self.seed, seed=self.seed, use_label_encoder=False) elif self.method == "regression": - tmp_model = xgb.XGBRegressor(n_jobs=-1, + tmp_model = xgb.XGBRegressor(n_jobs=1, verbosity=0, silent=True, random_state=self.seed, @@ -185,6 +299,7 @@ def _fit(self) -> Dict[str, Any]: n_jobs=-1, verbose=0, refit=True, + n_iter=10, return_train_score=True, cv=self.k_fold, random_state=self.seed) @@ -192,10 +307,3 @@ def _fit(self) -> Dict[str, Any]: self.df[self.y_var]) self.model = gs_op return gs_op.best_params_ - - def predict(self, x_predict: pd.DataFrame) -> pd.DataFrame: - """Predict values.""" - df_op = x_predict.copy(deep=True) - y_hat = self.model.predict(x_predict) - df_op.insert(loc=0, column=self.y_var, value=y_hat) - return df_op diff --git a/requirements.txt b/requirements.txt index 0070676..7715ab9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -pandas==1.1.3 Cython==0.29.15 -xgboost==1.3.3 numpy==1.19.5 +pandas==1.1.3 +xgboost==1.3.3 scikit_learn==1.0 diff --git a/tests/test_boost.py b/tests/test_boost.py deleted file mode 100644 index 2532c37..0000000 --- a/tests/test_boost.py +++ /dev/null @@ -1,110 +0,0 @@ -""" -Test suite module for ``XGBoost``. - -Credits -------- -:: - - Authors: - - Diptesh - - Madhu - - Date: Sep 27, 2021 -""" - -# pylint: disable=invalid-name -# pylint: disable=wrong-import-position - -import unittest -import warnings -import re -import sys - -from inspect import getsourcefile -from os.path import abspath - -import pandas as pd - -from sklearn.model_selection import train_test_split as split -from sklearn import metrics as sk_metrics - -# Set base path -path = abspath(getsourcefile(lambda: 0)) -path = re.sub(r"(.+)(\/tests.*)", "\\1", path) - -sys.path.insert(0, path) - -from mllib.lib.boost import XGBoost # noqa: F841 - -# ============================================================================= -# --- DO NOT CHANGE ANYTHING FROM HERE -# ============================================================================= - -path = path + "/data/input/" - -# ============================================================================= -# --- User defined functions -# ============================================================================= - - -def ignore_warnings(test_func): - """Suppress warnings.""" - - def do_test(self, *args, **kwargs): - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - test_func(self, *args, **kwargs) - return do_test - - -class Test_XGBoost(unittest.TestCase): - """Test suite for module ``XGBoost``.""" - - def setUp(self): - """Set up for module ``XGBoost``.""" - - @ignore_warnings - def test_xgboost_class(self): - """XGBoost: Test for classification.""" - x_var = ["x1", "x2", "x3", "x4"] - y_var = "y" - df_ip = pd.read_csv(path + "iris.csv") - df_ip = df_ip[[y_var] + x_var] - df_train, df_test = split(df_ip, - stratify=df_ip[y_var], - test_size=0.2, - random_state=1) - mod = XGBoost(df_train, y_var, x_var, method="classify", - param={"n_estimators": [1], - "objective": ["binary:logistic"]}) - y_hat = mod.predict(df_test[x_var])[y_var].tolist() - y = df_test[y_var].values.tolist() - acc = round(sk_metrics.accuracy_score(y, y_hat), 2) - self.assertGreaterEqual(acc, 0.93) - - @ignore_warnings - def test_xgboost_reg(self): - """XGBoost: Test for regression.""" - x_var = ["x1", "x2", "x3", "x4"] - y_var = "y" - df_ip = pd.read_csv(path + "iris.csv") - df_ip = df_ip[[y_var] + x_var] - df_train, df_test = split(df_ip, - stratify=df_ip[y_var], - test_size=0.2, - random_state=1) - mod = XGBoost(df_train, y_var, x_var, method="regression", - param={"n_estimators": [1], - "objective": ["reg:squarederror"]}) - y_hat = mod.predict(df_test[x_var])[y_var].tolist() - y = df_test[y_var].values.tolist() - mse = round(sk_metrics.mean_squared_error(y, y_hat), 2) - self.assertLessEqual(mse, 0.5) - - -# ============================================================================= -# --- Main -# ============================================================================= - -if __name__ == '__main__': - unittest.main() diff --git a/tests/test_random_forest.py b/tests/test_tree.py similarity index 63% rename from tests/test_random_forest.py rename to tests/test_tree.py index 54bb0f3..5ae5fc2 100644 --- a/tests/test_random_forest.py +++ b/tests/test_tree.py @@ -1,5 +1,5 @@ """ -Test suite module for ``random_forest``. +Test suite module for ``XGBoost``. Credits ------- @@ -34,7 +34,8 @@ sys.path.insert(0, path) -from mllib.lib.random_forest import RandomForest # noqa: F841 +from mllib.lib.tree import RandomForest # noqa: F841 +from mllib.lib.tree import XGBoost # noqa: F841 # ============================================================================= # --- DO NOT CHANGE ANYTHING FROM HERE @@ -97,6 +98,47 @@ def test_rf_reg(self): self.assertLessEqual(mse, 0.1) +class Test_XGBoost(unittest.TestCase): + """Test suite for module ``XGBoost``.""" + + def setUp(self): + """Set up for module ``XGBoost``.""" + + @ignore_warnings + def test_xgboost_class(self): + """XGBoost: Test for classification.""" + x_var = ["x1", "x2"] + y_var = "y" + df_ip = pd.read_csv(path + "iris.csv") + df_ip = df_ip[[y_var] + x_var] + df_train, df_test = split(df_ip, + stratify=df_ip[y_var], + test_size=0.2, + random_state=1) + mod = XGBoost(df_train, y_var, x_var, method="classify") + y_hat = mod.predict(df_test[x_var])[y_var].tolist() + y = df_test[y_var].values.tolist() + acc = round(sk_metrics.accuracy_score(y, y_hat), 2) + self.assertGreaterEqual(acc, 0.93) + + @ignore_warnings + def test_xgboost_reg(self): + """XGBoost: Test for regression.""" + x_var = ["x1", "x2", "x3", "x4"] + y_var = "y" + df_ip = pd.read_csv(path + "iris.csv") + df_ip = df_ip[[y_var] + x_var] + df_train, df_test = split(df_ip, + stratify=df_ip[y_var], + test_size=0.2, + random_state=1) + mod = XGBoost(df_train, y_var, x_var, method="regression") + y_hat = mod.predict(df_test[x_var])[y_var].tolist() + y = df_test[y_var].values.tolist() + mse = round(sk_metrics.mean_squared_error(y, y_hat), 2) + self.assertLessEqual(mse, 0.5) + + # ============================================================================= # --- Main # =============================================================================