26 changes: 19 additions & 7 deletions src/protify/main.py
@@ -77,6 +77,7 @@ def parse_arguments():
     parser.add_argument("--scikit_cv", type=int, default=3, help="Number of cross-validation folds for scikit model.")
     parser.add_argument("--scikit_random_state", type=int, default=None, help="Random state for scikit model (if None, uses global seed).")
     parser.add_argument("--scikit_model_name", type=str, default=None, help="Name of the scikit model to use.")
+    parser.add_argument("--scikit_model_args", type=str, default=None, help="JSON string of hyperparameters to use (skips tuning). E.g. '{\"n_estimators\": 500, \"max_depth\": 7}'")
     parser.add_argument("--use_scikit", action="store_true", default=False, help="Use scikit model (default: False).")
     parser.add_argument("--n_jobs", type=int, default=1, help="Number of processes to use in scikit.") # TODO integrate with GUI and main

@@ -680,14 +681,25 @@ def run_scikit_scheme(self):
         for data_name, dataset in self.datasets.items():
             ### find best scikit model and parameters via cross validation and lazy predict
             X_train, y_train, X_valid, y_valid, X_test, y_test, label_type = self.prepare_scikit_dataset(model_name, dataset)
-            if label_type == 'singlelabel':
-                results = scikit_probe.find_best_classifier(X_train, y_train, X_valid, y_valid)
-            elif label_type == 'regression':
-                results = scikit_probe.find_best_regressor(X_train, y_train, X_valid, y_valid)
+
+            # If a specific model is specified, skip LazyPredict and go straight to that model
+            if self.scikit_args.model_name is not None:
+                print_message(f"Skipping LazyPredict, using specified model: {self.scikit_args.model_name}")
+                results = scikit_probe.run_specific_model(X_train, y_train, X_valid, y_valid, X_test, y_test, model_results=None)
             else:
-                raise ValueError(f'Label type {label_type} not supported')
-            ### train and evaluate best model
-            results = scikit_probe.run_specific_model(X_train, y_train, X_valid, y_valid, X_test, y_test, results)
+                # Find best model via LazyPredict
+                if label_type == 'singlelabel':
+                    results = scikit_probe.find_best_classifier(X_train, y_train, X_valid, y_valid)
+                elif label_type == 'regression':
+                    results = scikit_probe.find_best_regressor(X_train, y_train, X_valid, y_valid)
+                else:
+                    raise ValueError(f'Label type {label_type} not supported')
+                # Train and evaluate best model with optimal hyperparameters
+                results = scikit_probe.run_specific_model(X_train, y_train, X_valid, y_valid, X_test, y_test, results)
+
+            # Log the results for plotting
+            metrics_dict = {'test_mcc': results.final_scores} if isinstance(results.final_scores, (int, float)) else results.final_scores
+            self.log_metrics(data_name, model_name, metrics_dict, split_name='test')
 
     @log_method_calls
     def generate_plots(self):
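The logging step added above normalizes results.final_scores so that log_metrics always receives a mapping: a bare float (e.g. a single test MCC) is wrapped under the 'test_mcc' key, while an existing dict passes through unchanged. A self-contained sketch, assuming a scalar score:

    final_scores = 0.87  # assumed scalar test MCC; may also already be a dict
    metrics_dict = (
        {"test_mcc": final_scores}
        if isinstance(final_scores, (int, float))
        else final_scores
    )
    assert metrics_dict == {"test_mcc": 0.87}
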
123 changes: 93 additions & 30 deletions src/protify/probes/lazy_predict.py
@@ -55,7 +55,29 @@
"LinearSVC",
"Perceptron",
"MLPClassifier",
"SGDClassifier"
"SGDClassifier",
# O(n²) memory models - too slow for large datasets
"LabelPropagation",
"LabelSpreading",
"SVC",
"NuSVC",
# Sequential ensemble models - slow for large datasets
"AdaBoostClassifier",
"BaggingClassifier",
# O(n×m) prediction time - slow for large test sets
"KNeighborsClassifier",
# Unbounded tree depth - very slow on high-dim data
"DecisionTreeClassifier",
"ExtraTreeClassifier",
"ExtraTreesClassifier",
# Fails on negative values after StandardScaler
"CategoricalNB",
# O(d²) or O(d³) - slow on high-dimensional data (4608 features)
"LinearDiscriminantAnalysis",
"QuadraticDiscriminantAnalysis",
# Requires estimator argument
"FixedThresholdClassifier",
"TunedThresholdClassifierCV",
]

removed_regressors = [
@@ -82,7 +104,16 @@
"LassoLarsCV",
"ElasticNetCV",
"LinearSVR",
"LassoLarsIC"
"LassoLarsIC",
# Sequential ensemble models - slow for large datasets
"AdaBoostRegressor",
"BaggingRegressor",
# O(n×m) prediction time - slow for large test sets
"KNeighborsRegressor",
# Unbounded tree depth - very slow on high-dim data
"DecisionTreeRegressor",
"ExtraTreeRegressor",
"ExtraTreesRegressor",
]

# Tuple of (name, class)
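
These name lists act as a blocklist against scikit-learn's estimator registry. A sketch of the filtering they imply, assuming the candidate tuples are built from sklearn.utils.all_estimators (the exact construction in this file may differ):

    from sklearn.utils import all_estimators

    # (name, class) tuples for every estimator not on a blocklist.
    CLASSIFIERS = [
        (name, cls)
        for name, cls in all_estimators(type_filter="classifier")
        if name not in removed_classifiers
    ]
    REGRESSORS = [
        (name, cls)
        for name, cls in all_estimators(type_filter="regressor")
        if name not in removed_regressors
    ]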
@@ -176,6 +207,16 @@
 CLASSIFIERS.append(("LGBMClassifier", lightgbm.LGBMClassifier))
 # CLASSIFIERS.append(('CatBoostClassifier',catboost.CatBoostClassifier))
 
+# Update dicts with XGB and LGBM
+CLASSIFIER_DICT["XGBClassifier"] = xgboost.XGBClassifier
+CLASSIFIER_DICT["LGBMClassifier"] = lightgbm.LGBMClassifier
+REGRESSOR_DICT["XGBRegressor"] = xgboost.XGBRegressor
+REGRESSOR_DICT["LGBMRegressor"] = lightgbm.LGBMRegressor
+ALL_MODEL_DICT["XGBClassifier"] = xgboost.XGBClassifier
+ALL_MODEL_DICT["LGBMClassifier"] = lightgbm.LGBMClassifier
+ALL_MODEL_DICT["XGBRegressor"] = xgboost.XGBRegressor
+ALL_MODEL_DICT["LGBMRegressor"] = lightgbm.LGBMRegressor
+
 numeric_transformer = Pipeline(
     steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
 )
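
With the registries updated above, a user-supplied --scikit_model_name resolves directly to a class, including the boosted models. A hypothetical lookup (the resolution code itself is not part of this diff):

    # Hypothetical: combine --scikit_model_name with parsed --scikit_model_args.
    model_cls = ALL_MODEL_DICT["XGBClassifier"]
    model = model_cls(n_estimators=500, max_depth=7)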
@@ -309,6 +350,13 @@ def fit(self, X_train, X_test, y_train, y_test):
("categorical_high", categorical_transformer_high, categorical_high),
]
)

# Precompute preprocessing once for all models (major optimization for large datasets)
print_message("Preprocessing data once for all models...")
preprocess_start = time.time()
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
print_message(f"Preprocessing completed in {time.time() - preprocess_start:.1f}s")

if self.classifiers == "all":
self.classifiers = CLASSIFIERS
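
The precompute above swaps a per-model Pipeline refit for a single fit_transform on the training split plus a transform on the test split: preprocessing statistics are learned from training data only, so there is no leakage, and the work happens once instead of once per model. The same idea on a toy scaler:

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    X_train_t = scaler.fit_transform(np.array([[1.0], [3.0]]))  # learns mean/std once
    X_test_t = scaler.transform(np.array([[2.0]]))              # reuses train statistics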
@@ -328,23 +376,25 @@ def fit(self, X_train, X_test, y_train, y_test):
         total_start = time.time()
 
         for name, model in tqdm(self.classifiers, desc="Training classifiers"):
+            print_message(f"Starting {name}...")
             start = time.time()
             try:
+                # Build model kwargs
+                model_kwargs = {}
                 if "random_state" in model().get_params().keys():
-                    pipe = Pipeline(
-                        steps=[
-                            ("preprocessor", preprocessor),
-                            ("classifier", model(random_state=self.random_state)),
-                        ]
-                    )
-                else:
-                    pipe = Pipeline(
-                        steps=[("preprocessor", preprocessor), ("classifier", model())]
-                    )
-
-                pipe.fit(X_train, y_train)
-                self.models[name] = pipe
-                y_pred = pipe.predict(X_test)
+                    model_kwargs["random_state"] = self.random_state
+                # Enable parallelization for models that support it
+                if "n_jobs" in model().get_params().keys():
+                    model_kwargs["n_jobs"] = -1
+                # Enable verbose for boosting models to show iteration progress
+                if name in ("XGBClassifier", "LGBMClassifier"):
+                    model_kwargs["verbose"] = 1
+
+                # Train directly on preprocessed data (no Pipeline needed)
+                clf = model(**model_kwargs)
+                clf.fit(X_train_transformed, y_train)
+                self.models[name] = clf
+                y_pred = clf.predict(X_test_transformed)
                 accuracy = accuracy_score(y_test, y_pred, normalize=True)
                 b_accuracy = balanced_accuracy_score(y_test, y_pred)
                 f1 = f1_score(y_test, y_pred, average="weighted")
@@ -362,6 +412,8 @@ def fit(self, X_train, X_test, y_train, y_test):
                 ROC_AUC.append(roc_auc)
                 F1.append(f1)
                 TIME.append(fit_time)
+
+                print_message(f"  {name} completed in {fit_time:.1f}s | Acc: {accuracy:.3f} | F1: {f1:.3f}")
 
                 if self.custom_metric is not None:
                     custom_metric = self.custom_metric(y_test, y_pred)
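
The kwargs-building pattern above probes a default instance's get_params() to decide what each estimator accepts, so unsupported keywords are never passed. The same pattern in isolation (LogisticRegression is just an example estimator):

    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression
    supported = model().get_params().keys()
    kwargs = {}
    if "random_state" in supported:
        kwargs["random_state"] = 42
    if "n_jobs" in supported:
        kwargs["n_jobs"] = -1  # use all cores where the estimator allows it
    clf = model(**kwargs)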
@@ -548,6 +600,13 @@ def fit(self, X_train, X_test, y_train, y_test):
("categorical_high", categorical_transformer_high, categorical_high),
]
)

# Precompute preprocessing once for all models (major optimization for large datasets)
print_message("Preprocessing data once for all models...")
preprocess_start = time.time()
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
print_message(f"Preprocessing completed in {time.time() - preprocess_start:.1f}s")

if self.regressors == "all":
self.regressors = REGRESSORS
@@ -567,23 +626,25 @@
         total_start = time.time()
 
         for name, model in tqdm(self.regressors, desc="Training regressors"):
+            print_message(f"Starting {name}...")
             start = time.time()
             try:
+                # Build model kwargs
+                model_kwargs = {}
                 if "random_state" in model().get_params().keys():
-                    pipe = Pipeline(
-                        steps=[
-                            ("preprocessor", preprocessor),
-                            ("regressor", model(random_state=self.random_state)),
-                        ]
-                    )
-                else:
-                    pipe = Pipeline(
-                        steps=[("preprocessor", preprocessor), ("regressor", model())]
-                    )
-
-                pipe.fit(X_train, y_train)
-                self.models[name] = pipe
-                y_pred = pipe.predict(X_test)
+                    model_kwargs["random_state"] = self.random_state
+                # Enable parallelization for models that support it
+                if "n_jobs" in model().get_params().keys():
+                    model_kwargs["n_jobs"] = -1
+                # Enable verbose for boosting models to show iteration progress
+                if name in ("XGBRegressor", "LGBMRegressor"):
+                    model_kwargs["verbose"] = 1
+
+                # Train directly on preprocessed data (no Pipeline needed)
+                reg = model(**model_kwargs)
+                reg.fit(X_train_transformed, y_train)
+                self.models[name] = reg
+                y_pred = reg.predict(X_test_transformed)
 
                 r_squared = r2_score(y_test, y_pred)
                 adj_rsquared = adjusted_rsquared(
@@ -597,6 +658,8 @@
                 ADJR2.append(adj_rsquared)
                 RMSE.append(rmse)
                 TIME.append(fit_time)
+
+                print_message(f"  {name} completed in {fit_time:.1f}s | R²: {r_squared:.3f} | RMSE: {rmse:.3f}")
 
                 if self.custom_metric:
                     custom_metric = self.custom_metric(y_test, y_pred)
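
The adjusted_rsquared helper called above is assumed to be the textbook adjustment, which penalizes R² by the number of predictors p relative to the sample size n; a sketch under that assumption:

    def adjusted_rsquared(r2: float, n: int, p: int) -> float:
        # Assumed to match the helper used above (standard adjusted R^2).
        return 1 - (1 - r2) * (n - 1) / (n - p - 1)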