Classification¶
A detailed walkthrough of DeepGBoostClassifier covering:
- Binary classification — Breast Cancer dataset
- Multiclass classification — Iris dataset
- predict_proba and calibration
- ROC-AUC and confusion matrix
- Scikit-learn pipeline integration
- Feature importances
- String labels
In [1]:
Copied!
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
accuracy_score,
roc_auc_score,
ConfusionMatrixDisplay,
RocCurveDisplay,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone
from deepgboost import DeepGBoostClassifier, plot_importance
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
accuracy_score,
roc_auc_score,
ConfusionMatrixDisplay,
RocCurveDisplay,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone
from deepgboost import DeepGBoostClassifier, plot_importance
1. Binary Classification — Breast Cancer¶
In [2]:
Copied!
# --- Load the binary Breast Cancer dataset and hold out 20% for testing ---
bc = load_breast_cancer()
X, y = bc.data, bc.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the classifier (demo hyperparameters — not tuned).
clf = DeepGBoostClassifier(
    n_trees=5,
    n_layers=10,
    max_depth=4,
    learning_rate=0.15,
    random_state=42,
)
clf.fit(X_train, y_train)

# Hard labels and per-class probabilities for the held-out set.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)

# ROC-AUC scores the probability of class 1 (column 1 of predict_proba).
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba[:, 1])
print(f"Accuracy: {acc:.4f}")
print(f"ROC-AUC: {auc:.4f}")
print(f"Classes: {clf.classes_}")
print(f"n_classes: {clf.n_classes_}")
# --- Load the binary Breast Cancer dataset and hold out 20% for testing ---
bc = load_breast_cancer()
X, y = bc.data, bc.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the classifier (demo hyperparameters — not tuned).
clf = DeepGBoostClassifier(
    n_trees=5,
    n_layers=10,
    max_depth=4,
    learning_rate=0.15,
    random_state=42,
)
clf.fit(X_train, y_train)

# Hard labels and per-class probabilities for the held-out set.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)

# ROC-AUC scores the probability of class 1 (column 1 of predict_proba).
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba[:, 1])
print(f"Accuracy: {acc:.4f}")
print(f"ROC-AUC: {auc:.4f}")
print(f"Classes: {clf.classes_}")
print(f"n_classes: {clf.n_classes_}")
Accuracy: 0.9649 ROC-AUC: 0.9971 Classes: [0 1] n_classes: 2
ROC Curve¶
In [3]:
Copied!
# ROC curve from the held-out predictions computed in the previous cell.
fig, ax = plt.subplots(figsize=(5, 4))
RocCurveDisplay.from_predictions(
    y_test, y_proba[:, 1], ax=ax, name=f"DeepGBoost (AUC={auc:.3f})"
)
ax.set_title("Breast Cancer — ROC Curve")
plt.tight_layout()
plt.show()
# ROC curve from the held-out predictions computed in the previous cell.
fig, ax = plt.subplots(figsize=(5, 4))
RocCurveDisplay.from_predictions(
    y_test, y_proba[:, 1], ax=ax, name=f"DeepGBoost (AUC={auc:.3f})"
)
ax.set_title("Breast Cancer — ROC Curve")
plt.tight_layout()
plt.show()
Confusion Matrix¶
In [4]:
Copied!
# Confusion matrix on the held-out set, labelled with the dataset's class names.
fig, ax = plt.subplots(figsize=(4, 3))
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred, display_labels=bc.target_names, ax=ax
)
ax.set_title("Breast Cancer — Confusion Matrix")
plt.tight_layout()
plt.show()
# Confusion matrix on the held-out set, labelled with the dataset's class names.
fig, ax = plt.subplots(figsize=(4, 3))
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred, display_labels=bc.target_names, ax=ax
)
ax.set_title("Breast Cancer — Confusion Matrix")
plt.tight_layout()
plt.show()
Predicted probabilities¶
In [5]:
Copied!
# Histogram of the predicted probability of class 1, split by the true class.
# In this dataset class 0 = "malignant" and class 1 = "benign", so
# y_proba[:, 1] is P(benign) — the titles reflect that (the original page
# labelled these "P(malignant)", which did not match the plotted column).
fig, axes = plt.subplots(1, 2, figsize=(9, 3), sharey=True)
for cls_idx, (ax, name) in enumerate(zip(axes, bc.target_names)):
    mask = y_test == cls_idx
    ax.hist(y_proba[mask, 1], bins=20, edgecolor="k", linewidth=0.4)
    ax.set_title(f"P(benign | true={name})")
    ax.set_xlabel("Predicted probability")
    ax.set_ylabel("Count")
plt.suptitle("Probability distributions by true class", y=1.02)
plt.tight_layout()
plt.show()
# Histogram of the predicted probability of class 1, split by the true class.
# In this dataset class 0 = "malignant" and class 1 = "benign", so
# y_proba[:, 1] is P(benign) — the titles reflect that (the original page
# labelled these "P(malignant)", which did not match the plotted column).
fig, axes = plt.subplots(1, 2, figsize=(9, 3), sharey=True)
for cls_idx, (ax, name) in enumerate(zip(axes, bc.target_names)):
    mask = y_test == cls_idx
    ax.hist(y_proba[mask, 1], bins=20, edgecolor="k", linewidth=0.4)
    ax.set_title(f"P(benign | true={name})")
    ax.set_xlabel("Predicted probability")
    ax.set_ylabel("Count")
plt.suptitle("Probability distributions by true class", y=1.02)
plt.tight_layout()
plt.show()
2. Multiclass Classification — Iris¶
In [6]:
Copied!
# --- Multiclass: Iris (3 classes) -----------------------------------------
iris = load_iris()
Xi, yi = iris.data, iris.target
Xi_train, Xi_test, yi_train, yi_test = train_test_split(
    Xi, yi, test_size=0.2, random_state=42
)

clf_mc = DeepGBoostClassifier(
    n_trees=5,
    n_layers=8,
    max_depth=3,
    learning_rate=0.1,
    random_state=0,
)
clf_mc.fit(Xi_train, yi_train)

yi_pred = clf_mc.predict(Xi_test)
yi_proba = clf_mc.predict_proba(Xi_test)
acc_mc = accuracy_score(yi_test, yi_pred)
print(f"Accuracy: {acc_mc:.4f}")
print(f"Classes: {clf_mc.classes_}")
# predict_proba returns one column per class; each row sums to 1.
print(
    f"predict_proba shape: {yi_proba.shape} (rows sum to 1: {np.allclose(yi_proba.sum(1), 1)})"
)
# --- Multiclass: Iris (3 classes) -----------------------------------------
iris = load_iris()
Xi, yi = iris.data, iris.target
Xi_train, Xi_test, yi_train, yi_test = train_test_split(
    Xi, yi, test_size=0.2, random_state=42
)

clf_mc = DeepGBoostClassifier(
    n_trees=5,
    n_layers=8,
    max_depth=3,
    learning_rate=0.1,
    random_state=0,
)
clf_mc.fit(Xi_train, yi_train)

yi_pred = clf_mc.predict(Xi_test)
yi_proba = clf_mc.predict_proba(Xi_test)
acc_mc = accuracy_score(yi_test, yi_pred)
print(f"Accuracy: {acc_mc:.4f}")
print(f"Classes: {clf_mc.classes_}")
# predict_proba returns one column per class; each row sums to 1.
print(
    f"predict_proba shape: {yi_proba.shape} (rows sum to 1: {np.allclose(yi_proba.sum(1), 1)})"
)
Accuracy: 1.0000 Classes: [0 1 2] predict_proba shape: (30, 3) (rows sum to 1: True)
In [7]:
Copied!
# Confusion matrix for the 3-class Iris test set.
fig, ax = plt.subplots(figsize=(4, 3))
ConfusionMatrixDisplay.from_predictions(
    yi_test, yi_pred, display_labels=iris.target_names, ax=ax
)
ax.set_title(f"Iris — Confusion Matrix (acc={acc_mc:.3f})")
plt.tight_layout()
plt.show()
# Confusion matrix for the 3-class Iris test set.
fig, ax = plt.subplots(figsize=(4, 3))
ConfusionMatrixDisplay.from_predictions(
    yi_test, yi_pred, display_labels=iris.target_names, ax=ax
)
ax.set_title(f"Iris — Confusion Matrix (acc={acc_mc:.3f})")
plt.tight_layout()
plt.show()
Probability simplex (2-D PCA projection)¶
In [8]:
Copied!
from sklearn.decomposition import PCA

# Project the 4-D iris features to 2-D for visualisation only — the model
# itself was fit on the original feature space.
pca = PCA(n_components=2).fit(Xi)
Xi_2d = pca.transform(Xi_test)

# One panel per class: each test point coloured by its predicted probability
# of belonging to that class.
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
for k, (ax, species) in enumerate(zip(axes, iris.target_names)):
    sc = ax.scatter(
        Xi_2d[:, 0],
        Xi_2d[:, 1],
        c=yi_proba[:, k],
        cmap="RdYlGn",
        vmin=0,
        vmax=1,
        edgecolors="k",
        linewidths=0.3,
        s=40,
    )
    ax.set_title(f"P({species})")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    plt.colorbar(sc, ax=ax)
plt.suptitle("Predicted class probabilities (PCA 2-D)", y=1.02)
plt.tight_layout()
plt.show()
from sklearn.decomposition import PCA

# Project the 4-D iris features to 2-D for visualisation only — the model
# itself was fit on the original feature space.
pca = PCA(n_components=2).fit(Xi)
Xi_2d = pca.transform(Xi_test)

# One panel per class: each test point coloured by its predicted probability
# of belonging to that class.
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
for k, (ax, species) in enumerate(zip(axes, iris.target_names)):
    sc = ax.scatter(
        Xi_2d[:, 0],
        Xi_2d[:, 1],
        c=yi_proba[:, k],
        cmap="RdYlGn",
        vmin=0,
        vmax=1,
        edgecolors="k",
        linewidths=0.3,
        s=40,
    )
    ax.set_title(f"P({species})")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    plt.colorbar(sc, ax=ax)
plt.suptitle("Predicted class probabilities (PCA 2-D)", y=1.02)
plt.tight_layout()
plt.show()
3. Scikit-learn Pipeline¶
In [9]:
Copied!
# DeepGBoostClassifier plugs into a standard sklearn Pipeline, so it composes
# with preprocessing steps and cross-validation like any other estimator.
pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", DeepGBoostClassifier(n_trees=5, n_layers=8, random_state=0)),
    ]
)
cv_scores = cross_val_score(pipe, Xi, yi, cv=5, scoring="accuracy")
print(f"5-fold CV accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
# DeepGBoostClassifier plugs into a standard sklearn Pipeline, so it composes
# with preprocessing steps and cross-validation like any other estimator.
pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", DeepGBoostClassifier(n_trees=5, n_layers=8, random_state=0)),
    ]
)
cv_scores = cross_val_score(pipe, Xi, yi, cv=5, scoring="accuracy")
print(f"5-fold CV accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
5-fold CV accuracy: 0.9133 ± 0.0980
4. Feature Importances¶
In [10]:
Copied!
# Breast Cancer — binary: top-10 feature importances of the fitted model.
fig, ax = plot_importance(
    clf,
    feature_names=list(bc.feature_names),
    max_features=10,
    title="Breast Cancer — Top 10 Feature Importances",
)
plt.tight_layout()
plt.show()
# Breast Cancer — binary: top-10 feature importances of the fitted model.
fig, ax = plot_importance(
    clf,
    feature_names=list(bc.feature_names),
    max_features=10,
    title="Breast Cancer — Top 10 Feature Importances",
)
plt.tight_layout()
plt.show()
In [11]:
Copied!
# Iris — multiclass (importances averaged across OvR models)
fig, ax = plot_importance(
    clf_mc,
    feature_names=list(iris.feature_names),
    title="Iris — Feature Importances (OvR average)",
)
plt.tight_layout()
plt.show()
# Iris — multiclass (importances averaged across OvR models)
fig, ax = plot_importance(
    clf_mc,
    feature_names=list(iris.feature_names),
    title="Iris — Feature Importances (OvR average)",
)
plt.tight_layout()
plt.show()
5. String Labels¶
In [12]:
Copied!
# DeepGBoostClassifier handles arbitrary label types via LabelEncoder internally
# Map the integer iris targets to string species names.
y_str_train = np.array(["setosa", "versicolor", "virginica"])[yi_train]
y_str_test = np.array(["setosa", "versicolor", "virginica"])[yi_test]

clf_str = DeepGBoostClassifier(n_trees=5, n_layers=8, random_state=0)
clf_str.fit(Xi_train, y_str_train)
preds_str = clf_str.predict(Xi_test)

# Predictions come back in the original (string) label space.
print("Predicted labels (sample):", preds_str[:10])
print("Classes stored:", clf_str.classes_)
print(f"Accuracy: {accuracy_score(y_str_test, preds_str):.4f}")
# DeepGBoostClassifier handles arbitrary label types via LabelEncoder internally
# Map the integer iris targets to string species names.
y_str_train = np.array(["setosa", "versicolor", "virginica"])[yi_train]
y_str_test = np.array(["setosa", "versicolor", "virginica"])[yi_test]

clf_str = DeepGBoostClassifier(n_trees=5, n_layers=8, random_state=0)
clf_str.fit(Xi_train, y_str_train)
preds_str = clf_str.predict(Xi_test)

# Predictions come back in the original (string) label space.
print("Predicted labels (sample):", preds_str[:10])
print("Classes stored:", clf_str.classes_)
print(f"Accuracy: {accuracy_score(y_str_test, preds_str):.4f}")
Predicted labels (sample): ['versicolor' 'setosa' 'virginica' 'versicolor' 'versicolor' 'setosa' 'versicolor' 'virginica' 'versicolor' 'versicolor'] Classes stored: ['setosa' 'versicolor' 'virginica'] Accuracy: 1.0000
6. Sklearn get_params / set_params / clone¶
In [13]:
Copied!
# Standard sklearn estimator API: get_params / set_params / clone.
clf_base = DeepGBoostClassifier(
    n_trees=5, n_layers=8, learning_rate=0.1, random_state=0
)
print("get_params:", clf_base.get_params())

clf_base.set_params(n_layers=12, learning_rate=0.05)
print(
    "After set_params — n_layers:",
    clf_base.n_layers,
    " learning_rate:",
    clf_base.learning_rate,
)

# clone() copies hyperparameters only — fitted attributes are not carried over.
clf_cloned = clone(clf_base)
print("Cloned (no fitted attrs):", not hasattr(clf_cloned, "_binary_model"))
# Standard sklearn estimator API: get_params / set_params / clone.
clf_base = DeepGBoostClassifier(
    n_trees=5, n_layers=8, learning_rate=0.1, random_state=0
)
print("get_params:", clf_base.get_params())

clf_base.set_params(n_layers=12, learning_rate=0.05)
print(
    "After set_params — n_layers:",
    clf_base.n_layers,
    " learning_rate:",
    clf_base.learning_rate,
)

# clone() copies hyperparameters only — fitted attributes are not carried over.
clf_cloned = clone(clf_base)
print("Cloned (no fitted attrs):", not hasattr(clf_cloned, "_binary_model"))
get_params: {'early_stopping_rounds': None, 'eval_metric': None, 'hessian_reg': 0.0, 'learning_rate': 0.1, 'linear_alpha': 1.0, 'linear_projection': False, 'max_depth': None, 'max_features': None, 'n_jobs': 1, 'n_layers': 8, 'n_trees': 5, 'objective': None, 'random_state': 0, 'subsample_min_frac': 0.3, 'weight_solver': 'nnls'}
After set_params — n_layers: 12 learning_rate: 0.05
Cloned (no fitted attrs): True