Quickstart¶
This notebook gives a 5-minute tour of DeepGBoost: a Deep Gradient Boosting Forest (DGBF) library with an API modelled after XGBoost.
We cover:
- Installing and importing the library
- Regression with the scikit-learn API (DeepGBoostRegressor)
- Binary classification with the scikit-learn API (DeepGBoostClassifier)
- Callbacks: early stopping and learning-rate scheduling
- Feature importances
In [1]:
# If running outside of the repo, install with:
# pip install deepgboost
#
# If running from the repo root:
# pip install -e '.[dev]'
import deepgboost
print("DeepGBoost version:", deepgboost.__version__)
DeepGBoost version: 0.1.0
In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score
from deepgboost import (
    DeepGBoostRegressor,
    DeepGBoostClassifier,
    EarlyStoppingCallback,
    LearningRateSchedulerCallback,
    EvaluationMonitorCallback,
)
1. Regression¶
In [3]:
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
reg = DeepGBoostRegressor(
    n_trees=10,
    n_layers=15,
    max_depth=4,
    learning_rate=0.1,
    random_state=42,
)
reg.fit(X_train, y_train)
preds = reg.predict(X_test)
r2 = r2_score(y_test, preds)
print(f"R² score: {r2:.4f}")
R² score: 0.4645
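As a quick sanity check, the predictions can be plotted against the true targets. This uses only plain Matplotlib and the preds, y_test, and r2 variables from the cell above; nothing DeepGBoost-specific is assumed.
# Predicted vs. actual targets for the held-out split
plt.figure(figsize=(4, 4))
plt.scatter(y_test, preds, s=12, alpha=0.7)
lims = [y_test.min(), y_test.max()]
plt.plot(lims, lims, "k--", lw=1)  # perfect-prediction reference line
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title(f"Diabetes regression (R² = {r2:.2f})")
plt.tight_layout()
plt.show()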
2. Binary Classification¶
In [4]:
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
clf = DeepGBoostClassifier(
    n_trees=5,
    n_layers=10,
    max_depth=4,
    learning_rate=0.15,
    random_state=42,
)
clf.fit(X_train, y_train)
acc = clf.score(X_test, y_test)
print(f"Accuracy: {acc:.4f}")
print(f"Classes: {clf.classes_}")
Accuracy: 0.9649
Classes: [0 1]
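Because the classifier follows the scikit-learn API, it should also expose class probabilities. This is not exercised elsewhere in this tour, so treat the following as a minimal sketch assuming DeepGBoostClassifier implements the usual predict_proba:
# Assumed scikit-learn-style probability output: one column per entry in clf.classes_
proba = clf.predict_proba(X_test[:5])
print(proba.round(3))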
3a. Early Stopping¶
In [5]:
X, y = load_diabetes(return_X_y=True)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)
es = EarlyStoppingCallback(patience=5, restore_best=True)
reg = DeepGBoostRegressor(
    n_trees=10,
    n_layers=100,  # allow up to 100 layers …
    learning_rate=0.1,
    random_state=0,
)
reg.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[es])
actual_layers = len(
    reg.evals_result_.get("eval_0", {"train_loss": []}).get("train_loss", [])
)
print(f"Trained layers (stopped early): {actual_layers}")
print(f"R² on validation: {r2_score(y_val, reg.predict(X_val)):.4f}")
Trained layers (stopped early): 18
R² on validation: 0.3062
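The evals_result_ dictionary used above to count layers also holds the per-layer metric history, so the early-stopped run can be visualised directly. (The EvaluationMonitorCallback imported earlier serves a related purpose, reporting evaluation metrics during training, but its constructor arguments are not shown in this tour, so the sketch below sticks to what the fitted model exposes.)
# Plot the recorded training loss per layer for the first eval set
history = reg.evals_result_.get("eval_0", {}).get("train_loss", [])
plt.figure(figsize=(6, 3))
plt.plot(history, marker="o", ms=4)
plt.xlabel("Layer")
plt.ylabel("Training loss")
plt.title("Early stopping: loss history per layer")
plt.tight_layout()
plt.show()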
3b. Learning-Rate Scheduler¶
In [6]:
from deepgboost.dgbf.dgbf import DGBFModel
rates_seen = []
class LRRecorder(deepgboost.TrainingCallback):
    def before_iteration(self, model, epoch, evals_log):
        rates_seen.append(model.learning_rate)
        return False
scheduler = LearningRateSchedulerCallback(lambda epoch: 0.3 * (0.85**epoch))
model = DGBFModel(n_trees=5, n_layers=10, random_state=0)
model.fit(X_train, y_train, callbacks=[scheduler, LRRecorder()])
plt.figure(figsize=(7, 3))
plt.plot(rates_seen, marker="o", ms=4)
plt.xlabel("Layer")
plt.ylabel("Learning rate")
plt.title("Exponential LR decay via LearningRateScheduler")
plt.tight_layout()
plt.show()
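LearningRateSchedulerCallback takes a callable that maps the layer index to a learning rate, so any Python function works. As a sketch (a hypothetical alternative to the exponential decay above), a step schedule could be written as:
# Step decay: hold a high rate for the first 5 layers, then drop it
step_scheduler = LearningRateSchedulerCallback(
    lambda epoch: 0.3 if epoch < 5 else 0.05
)
model = DGBFModel(n_trees=5, n_layers=10, random_state=0)
model.fit(X_train, y_train, callbacks=[step_scheduler])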
4. Feature Importances¶
In [7]:
from deepgboost import plot_importance
X, y = load_diabetes(return_X_y=True)
feature_names = load_diabetes().feature_names
reg = DeepGBoostRegressor(n_trees=10, n_layers=15, random_state=0)
reg.fit(X, y)
fig, ax = plot_importance(
    reg, feature_names=feature_names, title="Diabetes — feature importances"
)
plt.tight_layout()
plt.show()
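The tuple unpacking above suggests plot_importance returns a Matplotlib Figure and Axes; assuming that is the case, the chart can be tweaked or saved with ordinary Matplotlib calls, for example:
# Persist the importance chart (any output path works)
fig.savefig("diabetes_feature_importances.png", dpi=150, bbox_inches="tight")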