# spam-classifier/venv/lib/python3.11/site-packages/sklearn/tests/test_kernel_approximation.py
import re

import numpy as np
import pytest

from sklearn.datasets import make_classification
from sklearn.kernel_approximation import (
    AdditiveChi2Sampler,
    Nystroem,
    PolynomialCountSketch,
    RBFSampler,
    SkewedChi2Sampler,
)
from sklearn.metrics.pairwise import (
    chi2_kernel,
    kernel_metrics,
    polynomial_kernel,
    rbf_kernel,
)
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
)
from sklearn.utils.fixes import CSR_CONTAINERS

# generate data
rng = np.random.RandomState(0)
X = rng.random_sample(size=(300, 50))
Y = rng.random_sample(size=(300, 50))
X /= X.sum(axis=1)[:, np.newaxis]
Y /= Y.sum(axis=1)[:, np.newaxis]

# Make sure X and Y are not writable to avoid introducing dependencies between
# tests.
X.flags.writeable = False
Y.flags.writeable = False
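
# All tests below follow the same pattern: fit an approximate feature map,
# then check that inner products of the mapped data reproduce the exact
# kernel. A minimal sketch of that pattern (illustrative only, never
# collected by pytest; the helper name is ours):
def _example_feature_map_pattern():
    gamma = 1.0
    exact = rbf_kernel(X, Y, gamma=gamma)  # exact Gram matrix, shape (300, 300)
    sampler = RBFSampler(gamma=gamma, n_components=1000, random_state=0)
    Z_X = sampler.fit_transform(X)  # random Fourier features of X
    Z_Y = sampler.transform(Y)
    approx = Z_X @ Z_Y.T  # linear inner products approximate the kernel
    return np.max(np.abs(exact - approx))  # small for large n_components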


@pytest.mark.parametrize("gamma", [0.1, 1, 2.5])
@pytest.mark.parametrize("degree, n_components", [(1, 500), (2, 500), (3, 5000)])
@pytest.mark.parametrize("coef0", [0, 2.5])
def test_polynomial_count_sketch(gamma, degree, coef0, n_components):
    # test that PolynomialCountSketch approximates the polynomial
    # kernel on random data

    # compute exact kernel
    kernel = polynomial_kernel(X, Y, gamma=gamma, degree=degree, coef0=coef0)

    # approximate kernel mapping
    ps_transform = PolynomialCountSketch(
        n_components=n_components,
        gamma=gamma,
        coef0=coef0,
        degree=degree,
        random_state=42,
    )
    X_trans = ps_transform.fit_transform(X)
    Y_trans = ps_transform.transform(Y)
    kernel_approx = np.dot(X_trans, Y_trans.T)

    error = kernel - kernel_approx
    assert np.abs(np.mean(error)) <= 0.05  # close to unbiased
    np.abs(error, out=error)
    assert np.max(error) <= 0.1  # nothing too far off
    assert np.mean(error) <= 0.05  # mean is fairly close
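
# PolynomialCountSketch implements the TensorSketch estimator: each inner
# product of sketched features is, in expectation, the exact polynomial
# kernel value (gamma * <x, y> + coef0) ** degree. That is why the mean
# error above is checked to be close to zero, while the spread (max error)
# shrinks as n_components grows.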


@pytest.mark.parametrize("gamma", [0.1, 1.0])
@pytest.mark.parametrize("degree", [1, 2, 3])
@pytest.mark.parametrize("coef0", [0, 2.5])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_polynomial_count_sketch_dense_sparse(gamma, degree, coef0, csr_container):
    """Check that PolynomialCountSketch results are the same for dense and sparse
    input.
    """
    ps_dense = PolynomialCountSketch(
        n_components=500, gamma=gamma, degree=degree, coef0=coef0, random_state=42
    )
    Xt_dense = ps_dense.fit_transform(X)
    Yt_dense = ps_dense.transform(Y)

    ps_sparse = PolynomialCountSketch(
        n_components=500, gamma=gamma, degree=degree, coef0=coef0, random_state=42
    )
    Xt_sparse = ps_sparse.fit_transform(csr_container(X))
    Yt_sparse = ps_sparse.transform(csr_container(Y))

    assert_allclose(Xt_dense, Xt_sparse)
    assert_allclose(Yt_dense, Yt_sparse)
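
# CSR_CONTAINERS (imported from sklearn.utils.fixes) lists the available CSR
# sparse types; depending on the installed scipy version this covers
# scipy.sparse.csr_matrix and, when available, scipy.sparse.csr_array.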


def _linear_kernel(X, Y):
    # Plain linear kernel, used below to exercise Nystroem's callable-kernel path.
    return np.dot(X, Y.T)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_additive_chi2_sampler(csr_container):
    # test that AdditiveChi2Sampler approximates the kernel on random data

    # compute exact kernel
    # abbreviations for easier formula
    X_ = X[:, np.newaxis, :].copy()
    Y_ = Y[np.newaxis, :, :].copy()
    large_kernel = 2 * X_ * Y_ / (X_ + Y_)

    # reduce to n_samples_x x n_samples_y by summing over features
    kernel = large_kernel.sum(axis=2)

    # approximate kernel mapping
    transform = AdditiveChi2Sampler(sample_steps=3)
    X_trans = transform.fit_transform(X)
    Y_trans = transform.transform(Y)
    kernel_approx = np.dot(X_trans, Y_trans.T)
    assert_array_almost_equal(kernel, kernel_approx, 1)

    X_sp_trans = transform.fit_transform(csr_container(X))
    Y_sp_trans = transform.transform(csr_container(Y))
    assert_array_equal(X_trans, X_sp_trans.toarray())
    assert_array_equal(Y_trans, Y_sp_trans.toarray())

    # test that an error is raised on negative input
    Y_neg = Y.copy()
    Y_neg[0, 0] = -1
    msg = "Negative values in data passed to"
    with pytest.raises(ValueError, match=msg):
        transform.fit(Y_neg)
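
# The exact additive chi-squared kernel computed above is
#     k(x, y) = sum_i 2 * x_i * y_i / (x_i + y_i),
# and AdditiveChi2Sampler approximates it with a deterministic (sampled, not
# random) feature map, so no random_state is involved and the approximation
# quality is controlled by sample_steps alone.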


@pytest.mark.parametrize("method", ["fit", "fit_transform", "transform"])
@pytest.mark.parametrize("sample_steps", range(1, 4))
def test_additive_chi2_sampler_sample_steps(method, sample_steps):
    """Check that the input sample_steps doesn't raise an error
    and that sample_interval doesn't change after fit.
    """
    transformer = AdditiveChi2Sampler(sample_steps=sample_steps)
    getattr(transformer, method)(X)

    sample_interval = 0.5
    transformer = AdditiveChi2Sampler(
        sample_steps=sample_steps,
        sample_interval=sample_interval,
    )
    getattr(transformer, method)(X)
    assert transformer.sample_interval == sample_interval


@pytest.mark.parametrize("method", ["fit", "fit_transform", "transform"])
def test_additive_chi2_sampler_wrong_sample_steps(method):
    """Check that we raise a ValueError on invalid sample_steps."""
    transformer = AdditiveChi2Sampler(sample_steps=4)
    msg = re.escape(
        "If sample_steps is not in [1, 2, 3], you need to provide sample_interval"
    )
    with pytest.raises(ValueError, match=msg):
        getattr(transformer, method)(X)


def test_skewed_chi2_sampler():
    # test that SkewedChi2Sampler approximates the kernel on random data

    # compute exact kernel
    c = 0.03
    # set one negative component, but greater than -c, to ensure that the kernel
    # approximation is valid on the group (-c; +\infty) endowed with the skewed
    # multiplication.
    Y_ = Y.copy()
    Y_[0, 0] = -c / 2.0

    # abbreviations for easier formula
    X_c = (X + c)[:, np.newaxis, :]
    Y_c = (Y_ + c)[np.newaxis, :, :]

    # we do it in log-space in the hope that it's more stable
    # this array is n_samples_x x n_samples_y big x n_features
    log_kernel = (
        (np.log(X_c) / 2.0) + (np.log(Y_c) / 2.0) + np.log(2.0) - np.log(X_c + Y_c)
    )
    # reduce to n_samples_x x n_samples_y by summing over features in log-space
    kernel = np.exp(log_kernel.sum(axis=2))

    # approximate kernel mapping
    transform = SkewedChi2Sampler(skewedness=c, n_components=1000, random_state=42)
    X_trans = transform.fit_transform(X)
    Y_trans = transform.transform(Y_)

    kernel_approx = np.dot(X_trans, Y_trans.T)
    assert_array_almost_equal(kernel, kernel_approx, 1)
    assert np.isfinite(kernel).all(), "NaNs found in the Gram matrix"
    assert np.isfinite(kernel_approx).all(), "NaNs found in the approximate Gram matrix"

    # test that an error is raised when the input contains values smaller than -c
    Y_neg = Y_.copy()
    Y_neg[0, 0] = -c * 2.0
    msg = "X may not contain entries smaller than -skewedness"
    with pytest.raises(ValueError, match=msg):
        transform.transform(Y_neg)
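
# Written without logs, the kernel checked above is the skewed chi-squared
# kernel
#     k(x, y) = prod_i 2 * sqrt(x_i + c) * sqrt(y_i + c) / (x_i + y_i + 2 * c),
# which is well defined only while every entry stays greater than -c, hence
# the negative-input check at the end.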


def test_additive_chi2_sampler_exceptions():
    """Ensure that AdditiveChi2Sampler raises the correct error messages."""
    transformer = AdditiveChi2Sampler()
    X_neg = X.copy()
    X_neg[0, 0] = -1
    with pytest.raises(ValueError, match="X in AdditiveChi2Sampler.fit"):
        transformer.fit(X_neg)
    with pytest.raises(ValueError, match="X in AdditiveChi2Sampler.transform"):
        transformer.fit(X)
        transformer.transform(X_neg)


def test_rbf_sampler():
    # test that RBFSampler approximates the kernel on random data

    # compute exact kernel
    gamma = 10.0
    kernel = rbf_kernel(X, Y, gamma=gamma)

    # approximate kernel mapping
    rbf_transform = RBFSampler(gamma=gamma, n_components=1000, random_state=42)
    X_trans = rbf_transform.fit_transform(X)
    Y_trans = rbf_transform.transform(Y)
    kernel_approx = np.dot(X_trans, Y_trans.T)

    error = kernel - kernel_approx
    assert np.abs(np.mean(error)) <= 0.01  # close to unbiased
    np.abs(error, out=error)
    assert np.max(error) <= 0.1  # nothing too far off
    assert np.mean(error) <= 0.05  # mean is fairly close
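
# A sketch of the Monte Carlo behaviour the assertions above rely on: the
# error of random Fourier features should shrink roughly like
# 1 / sqrt(n_components). Illustrative only (the helper name and the
# component grid are ours, not part of the suite):
def _example_rbf_sampler_convergence():
    gamma = 10.0
    exact = rbf_kernel(X, Y, gamma=gamma)
    errors = []
    for n_components in (100, 1000):
        sampler = RBFSampler(gamma=gamma, n_components=n_components, random_state=0)
        approx = sampler.fit_transform(X) @ sampler.transform(Y).T
        errors.append(np.abs(exact - approx).mean())
    # the mean error with 1000 components is typically well below the one
    # with 100 components, though individual seeds can fluctuate
    return errors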


def test_rbf_sampler_fitted_attributes_dtype(global_dtype):
    """Check that the fitted attributes are stored according to the
    data type of X."""
    rbf = RBFSampler()

    X = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype)
    rbf.fit(X)

    assert rbf.random_offset_.dtype == global_dtype
    assert rbf.random_weights_.dtype == global_dtype


def test_rbf_sampler_dtype_equivalence():
    """Check the equivalence of the results with 32 and 64 bits input."""
    rbf32 = RBFSampler(random_state=42)
    X32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
    rbf32.fit(X32)

    rbf64 = RBFSampler(random_state=42)
    X64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
    rbf64.fit(X64)

    assert_allclose(rbf32.random_offset_, rbf64.random_offset_)
    assert_allclose(rbf32.random_weights_, rbf64.random_weights_)


def test_rbf_sampler_gamma_scale():
    """Check the inner value computed when `gamma='scale'`."""
    X, y = [[0.0], [1.0]], [0, 1]
    rbf = RBFSampler(gamma="scale")
    rbf.fit(X, y)
    assert rbf._gamma == pytest.approx(4)
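
# The expected value follows from the 'scale' heuristic,
# gamma = 1 / (n_features * X.var()): here X = [[0.0], [1.0]] has a single
# feature with variance 0.25, so gamma = 1 / (1 * 0.25) = 4.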


def test_skewed_chi2_sampler_fitted_attributes_dtype(global_dtype):
    """Check that the fitted attributes are stored according to the
    data type of X."""
    skewed_chi2_sampler = SkewedChi2Sampler()

    X = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype)
    skewed_chi2_sampler.fit(X)

    assert skewed_chi2_sampler.random_offset_.dtype == global_dtype
    assert skewed_chi2_sampler.random_weights_.dtype == global_dtype


def test_skewed_chi2_sampler_dtype_equivalence():
    """Check the equivalence of the results with 32 and 64 bits input."""
    skewed_chi2_sampler_32 = SkewedChi2Sampler(random_state=42)
    X_32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
    skewed_chi2_sampler_32.fit(X_32)

    skewed_chi2_sampler_64 = SkewedChi2Sampler(random_state=42)
    X_64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
    skewed_chi2_sampler_64.fit(X_64)

    assert_allclose(
        skewed_chi2_sampler_32.random_offset_, skewed_chi2_sampler_64.random_offset_
    )
    assert_allclose(
        skewed_chi2_sampler_32.random_weights_, skewed_chi2_sampler_64.random_weights_
    )


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_input_validation(csr_container):
    # Regression test: kernel approx. transformers should work on lists.
    # No assertions; the old versions would simply crash.
    X = [[1, 2], [3, 4], [5, 6]]
    AdditiveChi2Sampler().fit(X).transform(X)
    SkewedChi2Sampler().fit(X).transform(X)
    RBFSampler().fit(X).transform(X)

    X = csr_container(X)
    RBFSampler().fit(X).transform(X)


def test_nystroem_approximation():
    # some basic tests
    rnd = np.random.RandomState(0)
    X = rnd.uniform(size=(10, 4))

    # With n_components = n_samples this is exact
    X_transformed = Nystroem(n_components=X.shape[0]).fit_transform(X)
    K = rbf_kernel(X)
    assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)

    trans = Nystroem(n_components=2, random_state=rnd)
    X_transformed = trans.fit(X).transform(X)
    assert X_transformed.shape == (X.shape[0], 2)

    # test callable kernel
    trans = Nystroem(n_components=2, kernel=_linear_kernel, random_state=rnd)
    X_transformed = trans.fit(X).transform(X)
    assert X_transformed.shape == (X.shape[0], 2)

    # test that available kernels fit and transform
    kernels_available = kernel_metrics()
    for kern in kernels_available:
        trans = Nystroem(n_components=2, kernel=kern, random_state=rnd)
        X_transformed = trans.fit(X).transform(X)
        assert X_transformed.shape == (X.shape[0], 2)
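
# Why n_components == n_samples is exact: Nystroem builds features
# Z = K_nm @ K_mm^{-1/2} from the kernel between all n points and the m
# sampled components, so Z @ Z.T = K_nm @ K_mm^{+} @ K_mn; when m == n (all
# points used as components) this reduces to K itself, up to numerical
# precision.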


def test_nystroem_default_parameters():
    rnd = np.random.RandomState(42)
    X = rnd.uniform(size=(10, 4))

    # rbf kernel should behave as gamma=None by default
    # aka gamma = 1 / n_features
    nystroem = Nystroem(n_components=10)
    X_transformed = nystroem.fit_transform(X)
    K = rbf_kernel(X, gamma=None)
    K2 = np.dot(X_transformed, X_transformed.T)
    assert_array_almost_equal(K, K2)

    # chi2 kernel should behave as gamma=1 by default
    nystroem = Nystroem(kernel="chi2", n_components=10)
    X_transformed = nystroem.fit_transform(X)
    K = chi2_kernel(X, gamma=1)
    K2 = np.dot(X_transformed, X_transformed.T)
    assert_array_almost_equal(K, K2)


def test_nystroem_singular_kernel():
    # test that Nystroem works with a singular kernel matrix
    rng = np.random.RandomState(0)
    X = rng.rand(10, 20)
    X = np.vstack([X] * 2)  # duplicate samples

    gamma = 100
    N = Nystroem(gamma=gamma, n_components=X.shape[0]).fit(X)
    X_transformed = N.transform(X)

    K = rbf_kernel(X, gamma=gamma)
    assert_array_almost_equal(K, np.dot(X_transformed, X_transformed.T))
    assert np.all(np.isfinite(X_transformed))
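
# With duplicated samples the Gram matrix is rank-deficient, so this test
# exercises the degenerate path: Nystroem normalizes the component kernel
# through an SVD-based pseudo-inverse rather than a plain matrix inverse,
# which is what keeps the transformed features finite here.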


def test_nystroem_poly_kernel_params():
    # Non-regression: Nystroem should pass other parameters besides gamma.
    rnd = np.random.RandomState(37)
    X = rnd.uniform(size=(10, 4))

    K = polynomial_kernel(X, degree=3.1, coef0=0.1)
    nystroem = Nystroem(
        kernel="polynomial", n_components=X.shape[0], degree=3.1, coef0=0.1
    )
    X_transformed = nystroem.fit_transform(X)
    assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)


def test_nystroem_callable():
    # Test Nystroem on a callable.
    rnd = np.random.RandomState(42)
    n_samples = 10
    X = rnd.uniform(size=(n_samples, 4))

    def logging_histogram_kernel(x, y, log):
        """Histogram kernel that writes to a log."""
        log.append(1)
        return np.minimum(x, y).sum()

    kernel_log = []
    X = list(X)  # test input validation
    Nystroem(
        kernel=logging_histogram_kernel,
        n_components=(n_samples - 1),
        kernel_params={"log": kernel_log},
    ).fit(X)
    # The callable is evaluated once per unordered pair of the n_samples - 1
    # selected components, diagonal included:
    # (n_samples - 1) * n_samples / 2 calls in total.
    assert len(kernel_log) == n_samples * (n_samples - 1) / 2

    # if degree, gamma or coef0 is passed, we raise a ValueError
    msg = "Don't pass gamma, coef0 or degree to Nystroem"
    params = ({"gamma": 1}, {"coef0": 1}, {"degree": 2})
    for param in params:
        ny = Nystroem(kernel=_linear_kernel, n_components=(n_samples - 1), **param)
        with pytest.raises(ValueError, match=msg):
            ny.fit(X)


def test_nystroem_precomputed_kernel():
    # Non-regression: test Nystroem on a precomputed kernel (PR #14706).
    rnd = np.random.RandomState(12)
    X = rnd.uniform(size=(10, 4))

    K = polynomial_kernel(X, degree=2, coef0=0.1)
    nystroem = Nystroem(kernel="precomputed", n_components=X.shape[0])
    X_transformed = nystroem.fit_transform(K)
    assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)

    # if degree, gamma or coef0 is passed, we raise a ValueError
    msg = "Don't pass gamma, coef0 or degree to Nystroem"
    params = ({"gamma": 1}, {"coef0": 1}, {"degree": 2})
    for param in params:
        ny = Nystroem(kernel="precomputed", n_components=X.shape[0], **param)
        with pytest.raises(ValueError, match=msg):
            ny.fit(K)
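
# With kernel="precomputed", fit_transform receives the Gram matrix K itself
# rather than raw samples; since n_components equals the number of samples,
# the factorization reproduces K exactly, by the same argument as in
# test_nystroem_approximation above.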


def test_nystroem_component_indices():
    """Check that `component_indices_` corresponds to the subset of
    training points used to construct the feature map.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20474
    """
    X, _ = make_classification(n_samples=100, n_features=20)
    feature_map_nystroem = Nystroem(
        n_components=10,
        random_state=0,
    )
    feature_map_nystroem.fit(X)
    assert feature_map_nystroem.component_indices_.shape == (10,)


@pytest.mark.parametrize(
    "Estimator", [PolynomialCountSketch, RBFSampler, SkewedChi2Sampler, Nystroem]
)
def test_get_feature_names_out(Estimator):
    """Check get_feature_names_out."""
    est = Estimator().fit(X)
    X_trans = est.transform(X)

    names_out = est.get_feature_names_out()
    class_name = Estimator.__name__.lower()
    expected_names = [f"{class_name}{i}" for i in range(X_trans.shape[1])]
    assert_array_equal(names_out, expected_names)


def test_additivechi2sampler_get_feature_names_out():
    """Check get_feature_names_out for AdditiveChi2Sampler."""
    rng = np.random.RandomState(0)
    X = rng.random_sample(size=(300, 3))

    chi2_sampler = AdditiveChi2Sampler(sample_steps=3).fit(X)
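
    # With sample_steps=3, each input feature expands into 2 * 3 - 1 = 5
    # output features: one sqrt term plus a (cos, sin) pair for each of the
    # two remaining steps. Outputs are grouped term-by-term across input
    # features, which the suffix list below spells out.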
    input_names = ["f0", "f1", "f2"]
    suffixes = [
        "f0_sqrt",
        "f1_sqrt",
        "f2_sqrt",
        "f0_cos1",
        "f1_cos1",
        "f2_cos1",
        "f0_sin1",
        "f1_sin1",
        "f2_sin1",
        "f0_cos2",
        "f1_cos2",
        "f2_cos2",
        "f0_sin2",
        "f1_sin2",
        "f2_sin2",
    ]

    names_out = chi2_sampler.get_feature_names_out(input_features=input_names)
    expected_names = [f"additivechi2sampler_{suffix}" for suffix in suffixes]
    assert_array_equal(names_out, expected_names)