From 1ba2b064c0ae7898a0a1f5db6e20af67fb18e603 Mon Sep 17 00:00:00 2001 From: YasmineAmhis Date: Fri, 18 Dec 2020 09:16:50 +0100 Subject: [PATCH] add v0 of xgboost --- analysis/Bc2TauNu/MVA.py | 141 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 analysis/Bc2TauNu/MVA.py diff --git a/analysis/Bc2TauNu/MVA.py b/analysis/Bc2TauNu/MVA.py new file mode 100644 index 0000000..f3dad73 --- /dev/null +++ b/analysis/Bc2TauNu/MVA.py @@ -0,0 +1,141 @@ + +import uproot as ut +import numpy as np +import xgboost +from sklearn.model_selection import train_test_split +from sklearn.metrics import roc_curve, auc + +import matplotlib.pyplot as plt + +def plot_output(y_train, y_test, y_prob_train, y_prob_test, + figname='overtraining.png'): + """Plot train and test BDT output for signal and bkg. + + :param y_train: train labels + :type y_train: array + :param y_test: test labels + :type y_test: array + :param y_prob_train: train predictions + :type y_prob_train: array + :param y_prob_test: test predictions + :type y_prob_test: array + :param figname: title of output figure + :type figname: str + + """ + fig = plt.figure() + nbins = 50 + range = (0,1) + + # train: filled histo + plt.hist(y_prob_train[y_train==1][:,1], bins=nbins, + range=range, + label="Sig Train", density=True, #log=True, + bottom=0, + color="red", histtype="stepfilled", alpha=0.5) + plt.hist(y_prob_train[y_train==0][:,1], bins=nbins, + range=range, + label="Bkg Train", density=True, #log=True, + bottom=0, + color="blue", histtype="stepfilled", alpha=0.5) + + # test: error bars + test_sig = y_prob_test[y_test==1][:,1] + hist_sig, bins_sig = np.histogram(test_sig, + range=range, bins=nbins, + density=True) + cent_sig = (bins_sig[1:] + bins_sig[:-1])/2 + plt.errorbar(cent_sig, hist_sig, fmt='o', + color="red", label="Sig Test") + + test_bkg = y_prob_test[y_test==0][:,1] + hist_bkg, bins_bkg = np.histogram(test_bkg, + range=range, bins=nbins, + density=True) + cent_bkg = (bins_bkg[1:] + bins_bkg[:-1])/2 + plt.errorbar(cent_bkg, hist_bkg, fmt='o', + color="blue", label="Bkg Test") + + # plot settings + plt.ylim(ymin=0) + plt.xlabel("xgboost output") + plt.ylabel("Arbitrary units") + plt.legend(loc="upper center") + plt.savefig(figname) + return fig + + +if __name__ == '__main__': + # read the data + sig = ut.open('Bc2TauNu.root')['events;1'] + bkg = ut.open('inclusive_Zbb.root')['events;1'] + # define the variables we want to use + + vars = ["EVT_thrutshemis_e_min","EVT_thrutshemis_e_max", "echarged_min",\ + "echarged_max", "ncharged_min" ,"ncharged_max",\ + "eneutral_min","eneutral_max","nneutral_min","nneutral_max"] + + ''' + # meaning of the variables + lower hemisphere energy in each event + higher hemisphere energy in each event + charged particle energy in the lower energy hemisphere + charged particle energy in the higher energy hemisphere + number of charged particles in the lower energy hemisphere + number of charged particles in the higher energy hemisphere + same thing as the above 4 but for neutrals (I think this is just photons) + ''' + # create a pandas data frame with these variables only + sig_array = sig.pandas.df(vars, entrystart=0, entrystop=10000).to_numpy() # use only 1000 events for this example! + bkg_array = bkg.pandas.df(vars, entrystart=0, entrystop=10000).to_numpy() # use only 1000 events for this example! + print("Signal shape:", sig_array.shape) + print("Backgr shape:", bkg_array.shape) + + # merge and define signal and background labels + X = np.concatenate((sig_array, bkg_array)) + y = np.concatenate((np.ones(sig_array.shape[0]),# 1 is signal + np.zeros(bkg_array.shape[0]))) # 0 is background + + # split data in train and test samples + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) + print("Train size:", X_train.shape[0]) + print("Test size: ", X_test.shape[0]) + + # define a model, example: xgboost + params_dict = { + "n_estimators": 200, + "learning_rate": 0.3, + "max_depth": 3, + } + + + model = xgboost.XGBClassifier(**params_dict) + + # train it + model.fit(X_train, y_train) + print("Model has been trained") + + # plot features importance + xgboost.plot_importance(model) + plt.savefig("vars_ranking.png") + + # make predictions + y_prob_train = model.predict_proba(X_train) + y_prob_test = model.predict_proba(X_test) + + # plot train and test output + fig1 = plot_output(y_train, y_test, y_prob_train, y_prob_test) + + # evaluate performance with area under the roc + fpr, tpr, thresholds = roc_curve(y_train, y_prob_train[:,1]) + fnr = 1-fpr + roc_auc = auc(tpr, fnr) + fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_prob_test[:,1]) + fnr_test = 1-fpr_test + roc_auc_test = auc(tpr_test, fnr_test) + print("## Performance: ROC area") + print("- train: {}".format(roc_auc)) + print("- test : {}".format(roc_auc_test)) + + +# EOF