Quickstart¶
This notebook gives a 5-minute tour of DeepGBoost: a Deep Gradient Boosting Forest (DGBF) library with an API modelled after XGBoost.
We cover:
- Installing and importing the library
- Regression with the scikit-learn API (DeepGBoostRegressor)
- Binary classification with the scikit-learn API (DeepGBoostClassifier)
- Callbacks: early stopping and learning-rate scheduling
- Feature importances
In [1]:
# If running outside of the repo, install with:
# pip install deepgboost
#
# If running from the repo root:
# pip install -e '.[dev]'
import deepgboost
print("DeepGBoost version:", deepgboost.__version__)
DeepGBoost version: 0.1.0
In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score
from deepgboost import (
    DeepGBoostRegressor,
    DeepGBoostClassifier,
    EarlyStoppingCallback,
    LearningRateSchedulerCallback,
    EvaluationMonitorCallback,
)
1. Regression¶
In [3]:
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
reg = DeepGBoostRegressor(
    n_trees=10,
    n_layers=15,
    max_depth=4,
    learning_rate=0.1,
    random_state=42,
)
reg.fit(X_train, y_train)
preds = reg.predict(X_test)
r2 = r2_score(y_test, preds)
print(f"R² score: {r2:.4f}")
R² score: 0.4645
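As a quick sanity check, the predictions can be plotted against the true targets. This uses only plain Matplotlib and the preds, y_test, and r2 variables from the cell above; nothing DeepGBoost-specific is assumed.
# Predicted vs. actual targets for the held-out split
plt.figure(figsize=(4, 4))
plt.scatter(y_test, preds, s=12, alpha=0.7)
lims = [y_test.min(), y_test.max()]
plt.plot(lims, lims, "k--", lw=1)  # perfect-prediction reference line
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title(f"Diabetes regression (R² = {r2:.2f})")
plt.tight_layout()
plt.show()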
2. Binary Classification¶
In [4]:
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
clf = DeepGBoostClassifier(
    n_trees=5,
    n_layers=10,
    max_depth=4,
    learning_rate=0.15,
    random_state=42,
)
clf.fit(X_train, y_train)
acc = clf.score(X_test, y_test)
print(f"Accuracy: {acc:.4f}")
print(f"Classes: {clf.classes_}")
Accuracy: 0.9649
Classes: [0 1]
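Because the classifier follows the scikit-learn API, it should also expose class probabilities. This is not exercised elsewhere in this tour, so treat the following as a minimal sketch assuming DeepGBoostClassifier implements the usual predict_proba:
# Assumed scikit-learn-style probability output: one column per entry in clf.classes_
proba = clf.predict_proba(X_test[:5])
print(proba.round(3))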
3a. Early Stopping¶
In [5]:
X, y = load_diabetes(return_X_y=True)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)
es = EarlyStoppingCallback(patience=5, restore_best=True)
reg = DeepGBoostRegressor(
    n_trees=10,
    n_layers=100,  # allow up to 100 layers …
    learning_rate=0.1,
    random_state=0,
)
reg.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[es])
actual_layers = len(
    reg.evals_result_.get("eval_0", {"train_loss": []}).get("train_loss", [])
)
print(f"Trained layers (stopped early): {actual_layers}")
print(f"R² on validation: {r2_score(y_val, reg.predict(X_val)):.4f}")
Trained layers (stopped early): 18
R² on validation: 0.3062
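The evals_result_ dictionary used above to count layers also holds the per-layer metric history, so the early-stopped run can be visualised directly. (The EvaluationMonitorCallback imported earlier serves a related purpose, reporting evaluation metrics during training, but its constructor arguments are not shown in this tour, so the sketch below sticks to what the fitted model exposes.)
# Plot the recorded training loss per layer for the first eval set
history = reg.evals_result_.get("eval_0", {}).get("train_loss", [])
plt.figure(figsize=(6, 3))
plt.plot(history, marker="o", ms=4)
plt.xlabel("Layer")
plt.ylabel("Training loss")
plt.title("Early stopping: loss history per layer")
plt.tight_layout()
plt.show()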
3b. Learning-Rate Scheduler¶
In [6]:
from deepgboost.dgbf.dgbf import DGBFModel
rates_seen = []
class LRRecorder(deepgboost.TrainingCallback):
    def before_iteration(self, model, epoch, evals_log):
        rates_seen.append(model.learning_rate)
        return False
scheduler = LearningRateSchedulerCallback(lambda epoch: 0.3 * (0.85**epoch))
model = DGBFModel(n_trees=5, n_layers=10, random_state=0)
model.fit(X_train, y_train, callbacks=[scheduler, LRRecorder()])
plt.figure(figsize=(7, 3))
plt.plot(rates_seen, marker="o", ms=4)
plt.xlabel("Layer")
plt.ylabel("Learning rate")
plt.title("Exponential LR decay via LearningRateScheduler")
plt.tight_layout()
plt.show()
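LearningRateSchedulerCallback takes a callable that maps the layer index to a learning rate, so any Python function works. As a sketch (a hypothetical alternative to the exponential decay above), a step schedule could be written as:
# Step decay: hold a high rate for the first 5 layers, then drop it
step_scheduler = LearningRateSchedulerCallback(
    lambda epoch: 0.3 if epoch < 5 else 0.05
)
model = DGBFModel(n_trees=5, n_layers=10, random_state=0)
model.fit(X_train, y_train, callbacks=[step_scheduler])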
4. Feature Importances¶
In [7]:
from deepgboost import plot_importance
X, y = load_diabetes(return_X_y=True)
feature_names = load_diabetes().feature_names
reg = DeepGBoostRegressor(n_trees=10, n_layers=15, random_state=0)
reg.fit(X, y)
fig, ax = plot_importance(
    reg, feature_names=feature_names, title="Diabetes — feature importances"
)
plt.tight_layout()
plt.show()
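The tuple unpacking above suggests plot_importance returns a Matplotlib Figure and Axes; assuming that is the case, the chart can be tweaked or saved with ordinary Matplotlib calls, for example:
# Persist the importance chart (any output path works)
fig.savefig("diabetes_feature_importances.png", dpi=150, bbox_inches="tight")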