|
import re |
|
import warnings |
|
|
|
import numpy as np |
|
import pytest |
|
from scipy.special import logsumexp |
|
|
|
from sklearn.datasets import load_digits, load_iris |
|
from sklearn.model_selection import cross_val_score, train_test_split |
|
from sklearn.naive_bayes import ( |
|
BernoulliNB, |
|
CategoricalNB, |
|
ComplementNB, |
|
GaussianNB, |
|
MultinomialNB, |
|
) |
|
from sklearn.utils._testing import ( |
|
assert_allclose, |
|
assert_almost_equal, |
|
assert_array_almost_equal, |
|
assert_array_equal, |
|
) |
|
from sklearn.utils.fixes import CSR_CONTAINERS |
|
|
|
DISCRETE_NAIVE_BAYES_CLASSES = [BernoulliNB, CategoricalNB, ComplementNB, MultinomialNB] |
|
ALL_NAIVE_BAYES_CLASSES = DISCRETE_NAIVE_BAYES_CLASSES + [GaussianNB] |
|
|
|
|
|
X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]) |
|
y = np.array([1, 1, 1, 2, 2, 2]) |
|
|
|
|
|
def get_random_normal_x_binary_y(global_random_seed): |
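    """Return random-normal X of shape (10, 3) and a binary y of length 10."""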
|
|
|
rng = np.random.RandomState(global_random_seed) |
|
X1 = rng.normal(size=(10, 3)) |
|
y1 = (rng.normal(size=10) > 0).astype(int) |
|
return X1, y1 |
|
|
|
|
|
def get_random_integer_x_three_classes_y(global_random_seed): |
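    """Return random integer X of shape (6, 100) and a three-class y."""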
|
|
|
|
|
rng = np.random.RandomState(global_random_seed) |
|
X2 = rng.randint(5, size=(6, 100)) |
|
y2 = np.array([1, 1, 2, 2, 3, 3]) |
|
return X2, y2 |
|
|
|
|
|
def test_gnb(): |
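    """Gaussian Naive Bayes classification on a linearly separable toy dataset.

    Check that the training labels are recovered, that predict_proba and
    predict_log_proba are consistent, and that partial_fit rejects target
    labels missing from the declared classes.
    """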
|
|
|
|
|
|
|
|
|
clf = GaussianNB() |
|
y_pred = clf.fit(X, y).predict(X) |
|
assert_array_equal(y_pred, y) |
|
|
|
y_pred_proba = clf.predict_proba(X) |
|
y_pred_log_proba = clf.predict_log_proba(X) |
|
assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) |
|
|
|
|
|
|
|
|
|
    # partial_fit must reject target labels that are not in the declared classes.
    with pytest.raises(
|
ValueError, match="The target label.* in y do not exist in the initial classes" |
|
): |
|
GaussianNB().partial_fit(X, y, classes=[0, 1]) |
|
|
|
|
|
def test_gnb_prior(global_random_seed): |
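    """Test whether class priors are properly set."""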
|
|
|
clf = GaussianNB().fit(X, y) |
|
assert_array_almost_equal(np.array([3, 3]) / 6.0, clf.class_prior_, 8) |
|
X1, y1 = get_random_normal_x_binary_y(global_random_seed) |
|
clf = GaussianNB().fit(X1, y1) |
|
|
|
assert_array_almost_equal(clf.class_prior_.sum(), 1) |
|
|
|
|
|
def test_gnb_sample_weight(global_random_seed): |
|
"""Test whether sample weights are properly used in GNB.""" |
|
|
|
sw = np.ones(6) |
|
clf = GaussianNB().fit(X, y) |
|
clf_sw = GaussianNB().fit(X, y, sw) |
|
|
|
assert_array_almost_equal(clf.theta_, clf_sw.theta_) |
|
assert_array_almost_equal(clf.var_, clf_sw.var_) |
|
|
|
|
|
|
|
    # Two partial_fit calls, each with half the sample weights, should be
    # equivalent to a single fit with the full weights.
    rng = np.random.RandomState(global_random_seed)
|
|
|
sw = rng.rand(y.shape[0]) |
|
clf1 = GaussianNB().fit(X, y, sample_weight=sw) |
|
clf2 = GaussianNB().partial_fit(X, y, classes=[1, 2], sample_weight=sw / 2) |
|
clf2.partial_fit(X, y, sample_weight=sw / 2) |
|
|
|
assert_array_almost_equal(clf1.theta_, clf2.theta_) |
|
assert_array_almost_equal(clf1.var_, clf2.var_) |
|
|
|
|
|
|
|
    # Fitting on repeated samples should be equivalent to passing the
    # corresponding counts as sample weights.
    ind = rng.randint(0, X.shape[0], 20)
|
sample_weight = np.bincount(ind, minlength=X.shape[0]) |
|
|
|
clf_dupl = GaussianNB().fit(X[ind], y[ind]) |
|
clf_sw = GaussianNB().fit(X, y, sample_weight) |
|
|
|
assert_array_almost_equal(clf_dupl.theta_, clf_sw.theta_) |
|
assert_array_almost_equal(clf_dupl.var_, clf_sw.var_) |
|
|
|
|
|
|
|
    # Fitting with weights that zero out one of the classes should not raise.
    sample_weight = (y == 1).astype(np.float64)
|
clf = GaussianNB().fit(X, y, sample_weight=sample_weight) |
|
|
|
|
|
def test_gnb_neg_priors(): |
|
"""Test whether an error is raised in case of negative priors""" |
|
clf = GaussianNB(priors=np.array([-1.0, 2.0])) |
|
|
|
msg = "Priors must be non-negative" |
|
with pytest.raises(ValueError, match=msg): |
|
clf.fit(X, y) |
|
|
|
|
|
def test_gnb_priors(): |
|
"""Test whether the class prior override is properly used""" |
|
clf = GaussianNB(priors=np.array([0.3, 0.7])).fit(X, y) |
|
assert_array_almost_equal( |
|
clf.predict_proba([[-0.1, -0.1]]), |
|
np.array([[0.825303662161683, 0.174696337838317]]), |
|
8, |
|
) |
|
assert_array_almost_equal(clf.class_prior_, np.array([0.3, 0.7])) |
|
|
|
|
|
def test_gnb_priors_sum_isclose(): |
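    """Priors that sum to 1 only within floating point precision are accepted."""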
|
|
|
X = np.array( |
|
[ |
|
[-1, -1], |
|
[-2, -1], |
|
[-3, -2], |
|
[-4, -5], |
|
[-5, -4], |
|
[1, 1], |
|
[2, 1], |
|
[3, 2], |
|
[4, 4], |
|
[5, 5], |
|
] |
|
) |
|
priors = np.array([0.08, 0.14, 0.03, 0.16, 0.11, 0.16, 0.07, 0.14, 0.11, 0.0]) |
|
Y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) |
|
clf = GaussianNB(priors=priors) |
|
|
|
clf.fit(X, Y) |
|
|
|
|
|
def test_gnb_wrong_nb_priors(): |
|
"""Test whether an error is raised if the number of prior is different |
|
from the number of class""" |
|
clf = GaussianNB(priors=np.array([0.25, 0.25, 0.25, 0.25])) |
|
|
|
msg = "Number of priors must match number of classes" |
|
with pytest.raises(ValueError, match=msg): |
|
clf.fit(X, y) |
|
|
|
|
|
def test_gnb_prior_greater_one(): |
|
"""Test if an error is raised if the sum of prior greater than one""" |
|
clf = GaussianNB(priors=np.array([2.0, 1.0])) |
|
|
|
msg = "The sum of the priors should be 1" |
|
with pytest.raises(ValueError, match=msg): |
|
clf.fit(X, y) |
|
|
|
|
|
def test_gnb_prior_large_bias(): |
|
"""Test if good prediction when class prior favor largely one class""" |
|
clf = GaussianNB(priors=np.array([0.01, 0.99])) |
|
clf.fit(X, y) |
|
assert clf.predict([[-0.1, -0.1]]) == np.array([2]) |
|
|
|
|
|
def test_gnb_check_update_with_no_data(): |
|
"""Test when the partial fit is called without any data""" |
|
|
|
prev_points = 100 |
|
mean = 0.0 |
|
var = 1.0 |
|
x_empty = np.empty((0, X.shape[1])) |
|
tmean, tvar = GaussianNB._update_mean_variance(prev_points, mean, var, x_empty) |
|
assert tmean == mean |
|
assert tvar == var |
|
|
|
|
|
def test_gnb_partial_fit(): |
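    """A single partial_fit call, or several chained calls, should match fit."""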
|
clf = GaussianNB().fit(X, y) |
|
clf_pf = GaussianNB().partial_fit(X, y, np.unique(y)) |
|
assert_array_almost_equal(clf.theta_, clf_pf.theta_) |
|
assert_array_almost_equal(clf.var_, clf_pf.var_) |
|
assert_array_almost_equal(clf.class_prior_, clf_pf.class_prior_) |
|
|
|
clf_pf2 = GaussianNB().partial_fit(X[0::2, :], y[0::2], np.unique(y)) |
|
clf_pf2.partial_fit(X[1::2], y[1::2]) |
|
assert_array_almost_equal(clf.theta_, clf_pf2.theta_) |
|
assert_array_almost_equal(clf.var_, clf_pf2.var_) |
|
assert_array_almost_equal(clf.class_prior_, clf_pf2.class_prior_) |
|
|
|
|
|
def test_gnb_naive_bayes_scale_invariance(): |
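    """Scaling the input features by a constant should not change predictions."""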
|
|
|
iris = load_iris() |
|
X, y = iris.data, iris.target |
|
labels = [GaussianNB().fit(f * X, y).predict(f * X) for f in [1e-10, 1, 1e10]] |
|
assert_array_equal(labels[0], labels[1]) |
|
assert_array_equal(labels[1], labels[2]) |
|
|
|
|
|
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) |
|
def test_discretenb_prior(DiscreteNaiveBayes, global_random_seed): |
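    """Test whether class priors are properly set."""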
|
|
|
X2, y2 = get_random_integer_x_three_classes_y(global_random_seed) |
|
clf = DiscreteNaiveBayes().fit(X2, y2) |
|
assert_array_almost_equal( |
|
np.log(np.array([2, 2, 2]) / 6.0), clf.class_log_prior_, 8 |
|
) |
|
|
|
|
|
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) |
|
def test_discretenb_partial_fit(DiscreteNaiveBayes): |
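    """fit and partial_fit, in one or several calls, should yield the same counts."""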
|
clf1 = DiscreteNaiveBayes() |
|
clf1.fit([[0, 1], [1, 0], [1, 1]], [0, 1, 1]) |
|
|
|
clf2 = DiscreteNaiveBayes() |
|
clf2.partial_fit([[0, 1], [1, 0], [1, 1]], [0, 1, 1], classes=[0, 1]) |
|
assert_array_equal(clf1.class_count_, clf2.class_count_) |
|
if DiscreteNaiveBayes is CategoricalNB: |
|
for i in range(len(clf1.category_count_)): |
|
assert_array_equal(clf1.category_count_[i], clf2.category_count_[i]) |
|
else: |
|
assert_array_equal(clf1.feature_count_, clf2.feature_count_) |
|
|
|
clf3 = DiscreteNaiveBayes() |
|
|
|
clf3.partial_fit([[0, 1]], [0], classes=[0, 1]) |
|
clf3.partial_fit([[1, 0]], [1]) |
|
clf3.partial_fit([[1, 1]], [1]) |
|
assert_array_equal(clf1.class_count_, clf3.class_count_) |
|
if DiscreteNaiveBayes is CategoricalNB: |
|
|
|
|
|
|
|
        # For CategoricalNB, compare the per-feature count matrices by shape and
        # per-class totals rather than element-wise.
        for i in range(len(clf1.category_count_)):
|
assert_array_equal( |
|
clf1.category_count_[i].shape, clf3.category_count_[i].shape |
|
) |
|
assert_array_equal( |
|
np.sum(clf1.category_count_[i], axis=1), |
|
np.sum(clf3.category_count_[i], axis=1), |
|
) |
|
|
|
|
|
|
|
        # Counts for clf1, fit on X = [[0, 1], [1, 0], [1, 1]], y = [0, 1, 1]:
        # feature 0, class 0: value 0 appears once.
        assert_array_equal(clf1.category_count_[0][0], np.array([1, 0]))
        # feature 0, class 1: value 1 appears twice.
        assert_array_equal(clf1.category_count_[0][1], np.array([0, 2]))
        # feature 1, class 0: value 1 appears once.
        assert_array_equal(clf1.category_count_[1][0], np.array([0, 1]))
        # feature 1, class 1: values 0 and 1 each appear once.
        assert_array_equal(clf1.category_count_[1][1], np.array([1, 1]))
|
else: |
|
assert_array_equal(clf1.feature_count_, clf3.feature_count_) |
|
|
|
|
|
@pytest.mark.parametrize("NaiveBayes", ALL_NAIVE_BAYES_CLASSES) |
|
def test_NB_partial_fit_no_first_classes(NaiveBayes, global_random_seed): |
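    """partial_fit requires `classes` on the first call and consistent classes later."""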
|
|
|
X2, y2 = get_random_integer_x_three_classes_y(global_random_seed) |
|
|
|
with pytest.raises( |
|
ValueError, match="classes must be passed on the first call to partial_fit." |
|
): |
|
NaiveBayes().partial_fit(X2, y2) |
|
|
|
|
|
clf = NaiveBayes() |
|
clf.partial_fit(X2, y2, classes=np.unique(y2)) |
|
with pytest.raises( |
|
ValueError, match="is not the same as on last call to partial_fit" |
|
): |
|
clf.partial_fit(X2, y2, classes=np.arange(42)) |
|
|
|
|
|
def test_discretenb_predict_proba(): |
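    """Check the shapes and normalization of predict_proba for discrete NB models."""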
|
|
|
|
|
|
|
|
|
X_bernoulli = [[1, 100, 0], [0, 1, 0], [0, 100, 1]] |
|
X_multinomial = [[0, 1], [1, 3], [4, 0]] |
|
|
|
|
|
    # Two observed classes: predict_proba returns one column per class.
    y = [0, 0, 2]
|
for DiscreteNaiveBayes, X in zip( |
|
[BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial] |
|
): |
|
clf = DiscreteNaiveBayes().fit(X, y) |
|
assert clf.predict(X[-1:]) == 2 |
|
assert clf.predict_proba([X[0]]).shape == (1, 2) |
|
assert_array_almost_equal( |
|
clf.predict_proba(X[:2]).sum(axis=1), np.array([1.0, 1.0]), 6 |
|
) |
|
|
|
|
|
    # Three observed classes: probabilities have three columns and sum to one.
    y = [0, 1, 2]
|
for DiscreteNaiveBayes, X in zip( |
|
[BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial] |
|
): |
|
clf = DiscreteNaiveBayes().fit(X, y) |
|
assert clf.predict_proba(X[0:1]).shape == (1, 3) |
|
assert clf.predict_proba(X[:2]).shape == (2, 3) |
|
assert_almost_equal(np.sum(clf.predict_proba([X[1]])), 1) |
|
assert_almost_equal(np.sum(clf.predict_proba([X[-1]])), 1) |
|
assert_almost_equal(np.sum(np.exp(clf.class_log_prior_)), 1) |
|
|
|
|
|
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) |
|
def test_discretenb_uniform_prior(DiscreteNaiveBayes): |
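    """With fit_prior=False and no class_prior, a uniform prior should be used."""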
|
|
|
|
|
|
|
clf = DiscreteNaiveBayes() |
|
clf.set_params(fit_prior=False) |
|
clf.fit([[0], [0], [1]], [0, 0, 1]) |
|
prior = np.exp(clf.class_log_prior_) |
|
assert_array_almost_equal(prior, np.array([0.5, 0.5])) |
|
|
|
|
|
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) |
|
def test_discretenb_provide_prior(DiscreteNaiveBayes): |
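    """A user-provided class_prior should be used and checked against the classes."""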
|
|
|
|
|
clf = DiscreteNaiveBayes(class_prior=[0.5, 0.5]) |
|
clf.fit([[0], [0], [1]], [0, 0, 1]) |
|
prior = np.exp(clf.class_log_prior_) |
|
assert_array_almost_equal(prior, np.array([0.5, 0.5])) |
|
|
|
|
|
msg = "Number of priors must match number of classes" |
|
with pytest.raises(ValueError, match=msg): |
|
clf.fit([[0], [1], [2]], [0, 1, 2]) |
|
|
|
msg = "is not the same as on last call to partial_fit" |
|
with pytest.raises(ValueError, match=msg): |
|
clf.partial_fit([[0], [1]], [0, 1], classes=[0, 1, 1]) |
|
|
|
|
|
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) |
|
def test_discretenb_provide_prior_with_partial_fit(DiscreteNaiveBayes): |
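    """fit and partial_fit should agree on class_log_prior_, with or without a
    user-provided class_prior."""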
|
|
|
|
|
|
|
iris = load_iris() |
|
iris_data1, iris_data2, iris_target1, iris_target2 = train_test_split( |
|
iris.data, iris.target, test_size=0.4, random_state=415 |
|
) |
|
|
|
for prior in [None, [0.3, 0.3, 0.4]]: |
|
clf_full = DiscreteNaiveBayes(class_prior=prior) |
|
clf_full.fit(iris.data, iris.target) |
|
clf_partial = DiscreteNaiveBayes(class_prior=prior) |
|
clf_partial.partial_fit(iris_data1, iris_target1, classes=[0, 1, 2]) |
|
clf_partial.partial_fit(iris_data2, iris_target2) |
|
assert_array_almost_equal( |
|
clf_full.class_log_prior_, clf_partial.class_log_prior_ |
|
) |
|
|
|
|
|
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) |
|
def test_discretenb_sample_weight_multiclass(DiscreteNaiveBayes): |
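    """Sample weights should be taken into account by both fit and partial_fit."""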
|
|
|
X = [ |
|
[0, 0, 1], |
|
[0, 1, 1], |
|
[0, 1, 1], |
|
[1, 0, 0], |
|
] |
|
y = [0, 0, 1, 2] |
|
sample_weight = np.array([1, 1, 2, 2], dtype=np.float64) |
|
sample_weight /= sample_weight.sum() |
|
clf = DiscreteNaiveBayes().fit(X, y, sample_weight=sample_weight) |
|
assert_array_equal(clf.predict(X), [0, 1, 1, 2]) |
|
|
|
|
|
clf = DiscreteNaiveBayes() |
|
clf.partial_fit(X[:2], y[:2], classes=[0, 1, 2], sample_weight=sample_weight[:2]) |
|
clf.partial_fit(X[2:3], y[2:3], sample_weight=sample_weight[2:3]) |
|
clf.partial_fit(X[3:], y[3:], sample_weight=sample_weight[3:]) |
|
assert_array_equal(clf.predict(X), [0, 1, 1, 2]) |
|
|
|
|
|
@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) |
|
@pytest.mark.parametrize("use_partial_fit", [False, True]) |
|
@pytest.mark.parametrize("train_on_single_class_y", [False, True]) |
|
def test_discretenb_degenerate_one_class_case( |
|
DiscreteNaiveBayes, |
|
use_partial_fit, |
|
train_on_single_class_y, |
|
): |
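    """Class-dimensioned attributes should have a leading dimension equal to the
    number of distinct classes in y, even when y contains a single class."""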
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
X = [[1, 0, 0], [0, 1, 0], [0, 0, 1]] |
|
y = [1, 1, 2] |
|
if train_on_single_class_y: |
|
X = X[:-1] |
|
y = y[:-1] |
|
classes = sorted(list(set(y))) |
|
num_classes = len(classes) |
|
|
|
clf = DiscreteNaiveBayes() |
|
if use_partial_fit: |
|
clf.partial_fit(X, y, classes=classes) |
|
else: |
|
clf.fit(X, y) |
|
assert clf.predict(X[:1]) == y[0] |
|
|
|
|
|
attribute_names = [ |
|
"classes_", |
|
"class_count_", |
|
"class_log_prior_", |
|
"feature_count_", |
|
"feature_log_prob_", |
|
] |
|
for attribute_name in attribute_names: |
|
attribute = getattr(clf, attribute_name, None) |
|
if attribute is None: |
|
|
|
            # Not every attribute is defined on every estimator (e.g. CategoricalNB
            # has no feature_count_); skip missing ones.
            continue
|
if isinstance(attribute, np.ndarray): |
|
assert attribute.shape[0] == num_classes |
|
else: |
|
|
|
            # For CategoricalNB, the attribute is a list of per-feature arrays.
            for element in attribute:
|
assert element.shape[0] == num_classes |
|
|
|
|
|
@pytest.mark.parametrize("kind", ("dense", "sparse")) |
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) |
|
def test_mnnb(kind, global_random_seed, csr_container): |
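    """MultinomialNB on dense and sparse input: negative values are rejected,
    the training labels are recovered, and partial_fit matches fit."""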
|
|
|
|
|
|
|
X2, y2 = get_random_integer_x_three_classes_y(global_random_seed) |
|
|
|
if kind == "dense": |
|
X = X2 |
|
elif kind == "sparse": |
|
X = csr_container(X2) |
|
|
|
|
|
    # Check the ability to predict the training set.
    clf = MultinomialNB()
|
|
|
msg = "Negative values in data passed to" |
|
with pytest.raises(ValueError, match=msg): |
|
clf.fit(-X, y2) |
|
y_pred = clf.fit(X, y2).predict(X) |
|
|
|
assert_array_equal(y_pred, y2) |
|
|
|
|
|
|
|
y_pred_proba = clf.predict_proba(X) |
|
y_pred_log_proba = clf.predict_log_proba(X) |
|
assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) |
|
|
|
|
|
    # Incremental fitting over several partial_fit calls should match plain fit.
    clf2 = MultinomialNB()
|
clf2.partial_fit(X[:2], y2[:2], classes=np.unique(y2)) |
|
clf2.partial_fit(X[2:5], y2[2:5]) |
|
clf2.partial_fit(X[5:], y2[5:]) |
|
|
|
y_pred2 = clf2.predict(X) |
|
assert_array_equal(y_pred2, y2) |
|
|
|
y_pred_proba2 = clf2.predict_proba(X) |
|
y_pred_log_proba2 = clf2.predict_log_proba(X) |
|
assert_array_almost_equal(np.log(y_pred_proba2), y_pred_log_proba2, 8) |
|
assert_array_almost_equal(y_pred_proba2, y_pred_proba) |
|
assert_array_almost_equal(y_pred_log_proba2, y_pred_log_proba) |
|
|
|
|
|
    # A single call to partial_fit should also match.
    clf3 = MultinomialNB()
|
clf3.partial_fit(X, y2, classes=np.unique(y2)) |
|
|
|
y_pred3 = clf3.predict(X) |
|
assert_array_equal(y_pred3, y2) |
|
y_pred_proba3 = clf3.predict_proba(X) |
|
y_pred_log_proba3 = clf3.predict_log_proba(X) |
|
assert_array_almost_equal(np.log(y_pred_proba3), y_pred_log_proba3, 8) |
|
assert_array_almost_equal(y_pred_proba3, y_pred_proba) |
|
assert_array_almost_equal(y_pred_log_proba3, y_pred_log_proba) |
|
|
|
|
|
def test_mnb_prior_unobserved_targets(): |
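    """Classes listed in `classes` but not yet seen by partial_fit should not
    produce warnings and should become predictable once observed."""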
|
|
|
|
|
|
|
X = np.array([[0, 1], [1, 0]]) |
|
y = np.array([0, 1]) |
|
|
|
clf = MultinomialNB() |
|
|
|
with warnings.catch_warnings(): |
|
warnings.simplefilter("error", RuntimeWarning) |
|
|
|
clf.partial_fit(X, y, classes=[0, 1, 2]) |
|
|
|
assert clf.predict([[0, 1]]) == 0 |
|
assert clf.predict([[1, 0]]) == 1 |
|
assert clf.predict([[1, 1]]) == 0 |
|
|
|
|
|
with warnings.catch_warnings(): |
|
warnings.simplefilter("error", RuntimeWarning) |
|
|
|
clf.partial_fit([[1, 1]], [2]) |
|
|
|
assert clf.predict([[0, 1]]) == 0 |
|
assert clf.predict([[1, 0]]) == 1 |
|
assert clf.predict([[1, 1]]) == 2 |
|
|
|
|
|
def test_bnb(): |
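    """BernoulliNB with alpha=1.0 should reproduce hand-computed class priors,
    feature probabilities and predicted probabilities on a small toy dataset."""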
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
X = np.array( |
|
[[1, 1, 0, 0, 0, 0], [0, 1, 0, 0, 1, 0], [0, 1, 0, 1, 0, 0], [0, 1, 1, 0, 0, 1]] |
|
) |
|
|
|
|
|
Y = np.array([0, 0, 0, 1]) |
|
|
|
|
|
clf = BernoulliNB(alpha=1.0) |
|
clf.fit(X, Y) |
|
|
|
|
|
    # Class priors: 3 of the 4 training samples belong to class 0.
    class_prior = np.array([0.75, 0.25])
|
assert_array_almost_equal(np.exp(clf.class_log_prior_), class_prior) |
|
|
|
|
|
    # Laplace-smoothed (alpha=1) per-class feature probabilities.
    feature_prob = np.array(
|
[ |
|
[0.4, 0.8, 0.2, 0.4, 0.4, 0.2], |
|
[1 / 3.0, 2 / 3.0, 2 / 3.0, 1 / 3.0, 1 / 3.0, 2 / 3.0], |
|
] |
|
) |
|
assert_array_almost_equal(np.exp(clf.feature_log_prob_), feature_prob) |
|
|
|
|
|
|
|
    # Check the predicted probabilities on a held-out test vector.
    X_test = np.array([[0, 1, 1, 0, 0, 1]])
|
|
|
|
|
unnorm_predict_proba = np.array([[0.005183999999999999, 0.02194787379972565]]) |
|
predict_proba = unnorm_predict_proba / np.sum(unnorm_predict_proba) |
|
assert_array_almost_equal(clf.predict_proba(X_test), predict_proba) |
|
|
|
|
|
def test_bnb_feature_log_prob(): |
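    """With alpha=1, feature_log_prob_ should equal
    log(feature_count_ + 1) - log(class_count_ + 2)."""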
|
|
|
|
|
|
|
|
|
|
|
|
|
X = np.array([[0, 0, 0], [1, 1, 0], [0, 1, 0], [1, 0, 1], [0, 1, 0]]) |
|
Y = np.array([0, 0, 1, 2, 2]) |
|
|
|
|
|
clf = BernoulliNB(alpha=1.0) |
|
clf.fit(X, Y) |
|
|
|
|
|
|
|
num = np.log(clf.feature_count_ + 1.0) |
|
denom = np.tile(np.log(clf.class_count_ + 2.0), (X.shape[1], 1)).T |
|
|
|
|
|
assert_array_almost_equal(clf.feature_log_prob_, (num - denom)) |
|
|
|
|
|
def test_cnb(): |
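    """ComplementNB with alpha=1.0 should reproduce hand-computed counts and
    complement weights, with and without weight normalization."""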
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
X = np.array( |
|
[[1, 1, 0, 0, 0, 0], [0, 1, 0, 0, 1, 0], [0, 1, 0, 1, 0, 0], [0, 1, 1, 0, 0, 1]] |
|
) |
|
|
|
|
|
Y = np.array([0, 0, 0, 1]) |
|
|
|
|
|
|
|
    # Hand-computed complement class probabilities (theta) with alpha=1 smoothing.
    theta = np.array(
|
[ |
|
[ |
|
(0 + 1) / (3 + 6), |
|
(1 + 1) / (3 + 6), |
|
(1 + 1) / (3 + 6), |
|
(0 + 1) / (3 + 6), |
|
(0 + 1) / (3 + 6), |
|
(1 + 1) / (3 + 6), |
|
], |
|
[ |
|
(1 + 1) / (6 + 6), |
|
(3 + 1) / (6 + 6), |
|
(0 + 1) / (6 + 6), |
|
(1 + 1) / (6 + 6), |
|
(1 + 1) / (6 + 6), |
|
(0 + 1) / (6 + 6), |
|
], |
|
] |
|
) |
|
|
|
    # The fitted weights are -log(theta); the normalized variant divides each
    # row by its sum.
    weights = np.zeros(theta.shape)
|
normed_weights = np.zeros(theta.shape) |
|
for i in range(2): |
|
weights[i] = -np.log(theta[i]) |
|
normed_weights[i] = weights[i] / weights[i].sum() |
|
|
|
|
|
clf = ComplementNB(alpha=1.0) |
|
|
|
msg = re.escape("Negative values in data passed to ComplementNB (input X)") |
|
with pytest.raises(ValueError, match=msg): |
|
clf.fit(-X, Y) |
|
|
|
clf.fit(X, Y) |
|
|
|
|
|
    # Check that the learned counts and weights match the hand-computed values.
    feature_count = np.array([[1, 3, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1]])
|
assert_array_equal(clf.feature_count_, feature_count) |
|
class_count = np.array([3, 1]) |
|
assert_array_equal(clf.class_count_, class_count) |
|
feature_all = np.array([1, 4, 1, 1, 1, 1]) |
|
assert_array_equal(clf.feature_all_, feature_all) |
|
assert_array_almost_equal(clf.feature_log_prob_, weights) |
|
|
|
    # With norm=True the weights are normalized per class.
    clf = ComplementNB(alpha=1.0, norm=True)
|
clf.fit(X, Y) |
|
assert_array_almost_equal(clf.feature_log_prob_, normed_weights) |
|
|
|
|
|
def test_categoricalnb(global_random_seed): |
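    """CategoricalNB: training labels are recovered, category counts are inferred,
    negative inputs are rejected, and probabilities match hand-computed values."""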
|
|
|
clf = CategoricalNB() |
|
X2, y2 = get_random_integer_x_three_classes_y(global_random_seed) |
|
|
|
y_pred = clf.fit(X2, y2).predict(X2) |
|
assert_array_equal(y_pred, y2) |
|
|
|
    # The number of categories per feature is inferred from the largest value seen.
    X3 = np.array([[1, 4], [2, 5]])
|
y3 = np.array([1, 2]) |
|
clf = CategoricalNB(alpha=1, fit_prior=False) |
|
|
|
clf.fit(X3, y3) |
|
assert_array_equal(clf.n_categories_, np.array([3, 6])) |
|
|
|
|
|
    # Negative category values must be rejected by both predict and fit.
    X = np.array([[0, -1]])
|
y = np.array([1]) |
|
error_msg = re.escape("Negative values in data passed to CategoricalNB (input X)") |
|
with pytest.raises(ValueError, match=error_msg): |
|
clf.predict(X) |
|
with pytest.raises(ValueError, match=error_msg): |
|
clf.fit(X, y) |
|
|
|
|
|
X3_test = np.array([[2, 5]]) |
|
|
|
|
|
    # Expected posterior obtained by normalizing hand-derived per-class scores.
    bayes_numerator = np.array([[1 / 3 * 1 / 3, 2 / 3 * 2 / 3]])
|
bayes_denominator = bayes_numerator.sum() |
|
assert_array_almost_equal( |
|
clf.predict_proba(X3_test), bayes_numerator / bayes_denominator |
|
) |
|
|
|
|
|
assert len(clf.category_count_) == X3.shape[1] |
|
|
|
|
|
    # Without sample weights, [0, 0] is predicted as class 1.
    X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
|
y = np.array([1, 1, 2, 2]) |
|
clf = CategoricalNB(alpha=1, fit_prior=False) |
|
clf.fit(X, y) |
|
assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([1])) |
|
assert_array_equal(clf.n_categories_, np.array([2, 2])) |
|
|
|
    # Up-weighting the third sample (class 2) flips the prediction for [0, 0]
    # to class 2, regardless of the overall weight scale.
    for factor in [1.0, 0.3, 5, 0.0001]:
|
X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]]) |
|
y = np.array([1, 1, 2, 2]) |
|
sample_weight = np.array([1, 1, 10, 0.1]) * factor |
|
clf = CategoricalNB(alpha=1, fit_prior=False) |
|
clf.fit(X, y, sample_weight=sample_weight) |
|
assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([2])) |
|
assert_array_equal(clf.n_categories_, np.array([2, 2])) |
|
|
|
|
|
@pytest.mark.parametrize( |
|
"min_categories, exp_X1_count, exp_X2_count, new_X, exp_n_categories_", |
|
[ |
|
|
|
( |
|
3, |
|
np.array([[2, 0, 0], [1, 1, 0]]), |
|
np.array([[1, 1, 0], [1, 1, 0]]), |
|
np.array([[0, 2]]), |
|
np.array([3, 3]), |
|
), |
|
|
|
( |
|
[3, 4], |
|
np.array([[2, 0, 0], [1, 1, 0]]), |
|
np.array([[1, 1, 0, 0], [1, 1, 0, 0]]), |
|
np.array([[0, 3]]), |
|
np.array([3, 4]), |
|
), |
|
|
|
        (
            1,
            np.array([[2, 0], [1, 1]]),
            np.array([[1, 1], [1, 1]]),
            np.array([[0, 1]]),
            np.array([2, 2]),
        ),
|
], |
|
) |
|
def test_categoricalnb_with_min_categories( |
|
min_categories, exp_X1_count, exp_X2_count, new_X, exp_n_categories_ |
|
): |
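    """min_categories should control the shapes of category_count_ and n_categories_."""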
|
X_n_categories = np.array([[0, 0], [0, 1], [0, 0], [1, 1]]) |
|
y_n_categories = np.array([1, 1, 2, 2]) |
|
expected_prediction = np.array([1]) |
|
|
|
clf = CategoricalNB(alpha=1, fit_prior=False, min_categories=min_categories) |
|
clf.fit(X_n_categories, y_n_categories) |
|
X1_count, X2_count = clf.category_count_ |
|
assert_array_equal(X1_count, exp_X1_count) |
|
assert_array_equal(X2_count, exp_X2_count) |
|
predictions = clf.predict(new_X) |
|
assert_array_equal(predictions, expected_prediction) |
|
assert_array_equal(clf.n_categories_, exp_n_categories_) |
|
|
|
|
|
@pytest.mark.parametrize( |
|
"min_categories, error_msg", |
|
[ |
|
([[3, 2], [2, 4]], "'min_categories' should have shape"), |
|
], |
|
) |
|
def test_categoricalnb_min_categories_errors(min_categories, error_msg): |
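    """Invalid min_categories values should raise an informative error."""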
|
X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]]) |
|
y = np.array([1, 1, 2, 2]) |
|
|
|
clf = CategoricalNB(alpha=1, fit_prior=False, min_categories=min_categories) |
|
with pytest.raises(ValueError, match=error_msg): |
|
clf.fit(X, y) |
|
|
|
|
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) |
|
def test_alpha(csr_container): |
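    """alpha=0 with force_alpha=False should warn and be clipped to a small
    positive value instead of causing numerical errors."""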
|
|
|
X = np.array([[1, 0], [1, 1]]) |
|
y = np.array([0, 1]) |
|
nb = BernoulliNB(alpha=0.0, force_alpha=False) |
|
msg = "alpha too small will result in numeric errors, setting alpha = 1.0e-10" |
|
with pytest.warns(UserWarning, match=msg): |
|
nb.partial_fit(X, y, classes=[0, 1]) |
|
with pytest.warns(UserWarning, match=msg): |
|
nb.fit(X, y) |
|
prob = np.array([[1, 0], [0, 1]]) |
|
assert_array_almost_equal(nb.predict_proba(X), prob) |
|
|
|
nb = MultinomialNB(alpha=0.0, force_alpha=False) |
|
with pytest.warns(UserWarning, match=msg): |
|
nb.partial_fit(X, y, classes=[0, 1]) |
|
with pytest.warns(UserWarning, match=msg): |
|
nb.fit(X, y) |
|
prob = np.array([[2.0 / 3, 1.0 / 3], [0, 1]]) |
|
assert_array_almost_equal(nb.predict_proba(X), prob) |
|
|
|
nb = CategoricalNB(alpha=0.0, force_alpha=False) |
|
with pytest.warns(UserWarning, match=msg): |
|
nb.fit(X, y) |
|
prob = np.array([[1.0, 0.0], [0.0, 1.0]]) |
|
assert_array_almost_equal(nb.predict_proba(X), prob) |
|
|
|
|
|
    # Repeat the same checks on sparse input.
    X = csr_container(X)
|
nb = BernoulliNB(alpha=0.0, force_alpha=False) |
|
with pytest.warns(UserWarning, match=msg): |
|
nb.fit(X, y) |
|
prob = np.array([[1, 0], [0, 1]]) |
|
assert_array_almost_equal(nb.predict_proba(X), prob) |
|
|
|
nb = MultinomialNB(alpha=0.0, force_alpha=False) |
|
with pytest.warns(UserWarning, match=msg): |
|
nb.fit(X, y) |
|
prob = np.array([[2.0 / 3, 1.0 / 3], [0, 1]]) |
|
assert_array_almost_equal(nb.predict_proba(X), prob) |
|
|
|
|
|
def test_alpha_vector(): |
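    """An array-valued alpha should be applied feature-wise and validated."""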
|
X = np.array([[1, 0], [1, 1]]) |
|
y = np.array([0, 1]) |
|
|
|
|
|
|
|
    # alpha given as an array applies per-feature smoothing.
    alpha = np.array([1, 2])
|
nb = MultinomialNB(alpha=alpha, force_alpha=False) |
|
nb.partial_fit(X, y, classes=[0, 1]) |
|
|
|
|
|
feature_prob = np.array([[1 / 2, 1 / 2], [2 / 5, 3 / 5]]) |
|
assert_array_almost_equal(nb.feature_log_prob_, np.log(feature_prob)) |
|
|
|
|
|
prob = np.array([[5 / 9, 4 / 9], [25 / 49, 24 / 49]]) |
|
assert_array_almost_equal(nb.predict_proba(X), prob) |
|
|
|
|
|
    # Negative entries in an alpha array must be rejected.
    alpha = np.array([1.0, -0.1])
|
m_nb = MultinomialNB(alpha=alpha, force_alpha=False) |
|
expected_msg = "All values in alpha must be greater than 0." |
|
with pytest.raises(ValueError, match=expected_msg): |
|
m_nb.fit(X, y) |
|
|
|
|
|
    # Entries smaller than _ALPHA_MIN are clipped when force_alpha=False.
    ALPHA_MIN = 1e-10
|
alpha = np.array([ALPHA_MIN / 2, 0.5]) |
|
m_nb = MultinomialNB(alpha=alpha, force_alpha=False) |
|
m_nb.partial_fit(X, y, classes=[0, 1]) |
|
assert_array_almost_equal(m_nb._check_alpha(), [ALPHA_MIN, 0.5], decimal=12) |
|
|
|
|
|
    # The length of an alpha array must match the number of features.
    alpha = np.array([1.0, 2.0, 3.0])
|
m_nb = MultinomialNB(alpha=alpha, force_alpha=False) |
|
expected_msg = "When alpha is an array, it should contains `n_features`" |
|
with pytest.raises(ValueError, match=expected_msg): |
|
m_nb.fit(X, y) |
|
|
|
|
|
def test_check_accuracy_on_digits(): |
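    """Sanity check: NB variants keep reasonable cross-validated accuracy on digits."""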
|
|
|
|
|
|
|
X, y = load_digits(return_X_y=True) |
|
binary_3v8 = np.logical_or(y == 3, y == 8) |
|
X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8] |
|
|
|
|
|
    # Multinomial NB on the raw pixel values.
    scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
|
assert scores.mean() > 0.86 |
|
|
|
scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10) |
|
assert scores.mean() > 0.94 |
|
|
|
|
|
    # Bernoulli NB on binarized pixels.
    scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
|
assert scores.mean() > 0.83 |
|
|
|
scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10) |
|
assert scores.mean() > 0.92 |
|
|
|
|
|
    # Gaussian NB on the raw pixel values.
    scores = cross_val_score(GaussianNB(), X, y, cv=10)
|
assert scores.mean() > 0.77 |
|
|
|
scores = cross_val_score(GaussianNB(var_smoothing=0.1), X, y, cv=10) |
|
assert scores.mean() > 0.89 |
|
|
|
scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10) |
|
assert scores.mean() > 0.86 |
|
|
|
|
|
def test_check_alpha(): |
|
"""The provided value for alpha must only be |
|
used if alpha < _ALPHA_MIN and force_alpha is True. |
|
|
|
Non-regression test for: |
|
https://github.com/scikit-learn/scikit-learn/issues/10772 |
|
""" |
|
_ALPHA_MIN = 1e-10 |
|
b = BernoulliNB(alpha=0, force_alpha=True) |
|
assert b._check_alpha() == 0 |
|
|
|
    # Same check with an array-valued alpha.
    alphas = np.array([0.0, 1.0])
|
|
|
b = BernoulliNB(alpha=alphas, force_alpha=True) |
|
|
|
    # Set n_features_in_ so that the length check on the alpha array passes.
    b.n_features_in_ = alphas.shape[0]
|
assert_array_equal(b._check_alpha(), alphas) |
|
|
|
msg = ( |
|
"alpha too small will result in numeric errors, setting alpha = %.1e" |
|
% _ALPHA_MIN |
|
) |
|
b = BernoulliNB(alpha=0, force_alpha=False) |
|
with pytest.warns(UserWarning, match=msg): |
|
assert b._check_alpha() == _ALPHA_MIN |
|
|
|
|
|
|
b = BernoulliNB(alpha=alphas, force_alpha=False) |
|
|
|
b.n_features_in_ = alphas.shape[0] |
|
with pytest.warns(UserWarning, match=msg): |
|
assert_array_equal(b._check_alpha(), np.array([_ALPHA_MIN, 1.0])) |
|
|
|
|
|
@pytest.mark.parametrize("Estimator", ALL_NAIVE_BAYES_CLASSES) |
|
def test_predict_joint_proba(Estimator, global_random_seed): |
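    """predict_joint_log_proba, normalized by its logsumexp, should equal
    predict_log_proba."""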
|
X2, y2 = get_random_integer_x_three_classes_y(global_random_seed) |
|
est = Estimator().fit(X2, y2) |
|
jll = est.predict_joint_log_proba(X2) |
|
log_prob_x = logsumexp(jll, axis=1) |
|
log_prob_x_y = jll - np.atleast_2d(log_prob_x).T |
|
assert_allclose(est.predict_log_proba(X2), log_prob_x_y) |
|
|