|
import numpy as np |
|
|
|
from libc cimport math |
|
from libc.math cimport INFINITY |
|
|
|
from ..utils._typedefs cimport float32_t, float64_t |
|
|
|
|
|
# Floor substituted for a row sum that underflowed to zero, so the
# subsequent normalization never divides by zero.
# NOTE(review): declared as C `float` despite the `_DBL` suffix — confirm
# whether double precision was intended.
cdef float EPSILON_DBL = 1e-8

# Binary search stops once |entropy - desired_entropy| drops below this.
cdef float PERPLEXITY_TOLERANCE = 1e-5
|
|
|
|
|
|
|
def _binary_search_perplexity(
        const float32_t[:, :] sqdistances,
        float desired_perplexity,
        int verbose):
    """Binary search for sigmas of conditional Gaussians.

    This approximation reduces the computational complexity from O(N^2) to
    O(uN).

    Parameters
    ----------
    sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32
        Distances between training samples and their k nearest neighbors.
        When using the exact method, this is a square (n_samples, n_samples)
        distance matrix. The TSNE default metric is "euclidean" which is
        interpreted as squared euclidean distance.

    desired_perplexity : float
        Desired perplexity (2^entropy) of the conditional Gaussians.

    verbose : int
        Verbosity level.

    Returns
    -------
    P : ndarray of shape (n_samples, n_neighbors), dtype=np.float64
        Probabilities of conditional Gaussian distributions p_i|j
        (n_neighbors equals n_samples when the exact method is used).
    """
    # Maximum number of binary-search iterations per sample.
    cdef long n_steps = 100

    cdef long n_samples = sqdistances.shape[0]
    cdef long n_neighbors = sqdistances.shape[1]
    # True when the input is a k-NN distance matrix rather than the full
    # square pairwise matrix (i.e. the approximate method is in use).
    cdef int using_neighbors = n_neighbors < n_samples

    # beta is the precision of the conditional Gaussian for the current
    # sample: larger beta -> narrower kernel -> lower entropy.
    cdef double beta
    cdef double beta_min
    cdef double beta_max
    cdef double beta_sum = 0.0

    # Search in log scale: match entropy = log(perplexity) instead of the
    # perplexity itself.
    cdef double desired_entropy = math.log(desired_perplexity)
    cdef double entropy_diff

    cdef double entropy
    cdef double sum_Pi
    cdef double sum_disti_Pi
    cdef long i, j, l

    # Output row-stochastic affinity matrix p_{j|i}.
    cdef float64_t[:, :] P = np.zeros(
        (n_samples, n_neighbors), dtype=np.float64)

    for i in range(n_samples):
        # Fresh unbounded bracket for each sample's precision.
        beta_min = -INFINITY
        beta_max = INFINITY
        beta = 1.0

        # Binary search for the beta whose row entropy matches the target.
        for l in range(n_steps):

            # Unnormalized Gaussian affinities for the current beta. The
            # self term j == i is skipped only in the exact (square-matrix)
            # case; with k-NN input column j is not sample j.
            sum_Pi = 0.0
            for j in range(n_neighbors):
                if j != i or using_neighbors:
                    P[i, j] = math.exp(-sqdistances[i, j] * beta)
                    sum_Pi += P[i, j]

            # Guard: if beta is so large every exponential underflowed to
            # zero, avoid dividing by zero below.
            if sum_Pi == 0.0:
                sum_Pi = EPSILON_DBL
            sum_disti_Pi = 0.0

            # Normalize the row and accumulate E[d^2] under P[i, :].
            for j in range(n_neighbors):
                P[i, j] /= sum_Pi
                sum_disti_Pi += sqdistances[i, j] * P[i, j]

            # Shannon entropy of the row: H = log(sum_Pi) + beta * E[d^2].
            entropy = math.log(sum_Pi) + beta * sum_disti_Pi
            entropy_diff = entropy - desired_entropy

            if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
                break

            if entropy_diff > 0.0:
                # Entropy too high -> kernel too wide -> raise beta. Double
                # until an upper bound exists, then bisect.
                beta_min = beta
                if beta_max == INFINITY:
                    beta *= 2.0
                else:
                    beta = (beta + beta_max) / 2.0
            else:
                # Entropy too low -> kernel too narrow -> lower beta.
                beta_max = beta
                if beta_min == -INFINITY:
                    beta /= 2.0
                else:
                    beta = (beta + beta_min) / 2.0

        # Accumulate the converged (or last-tried) precision for reporting.
        beta_sum += beta

        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
            print("[t-SNE] Computed conditional probabilities for sample "
                  "%d / %d" % (i + 1, n_samples))

    if verbose:
        # sigma = sqrt(1 / beta); sqrt(n_samples / beta_sum) is
        # sqrt(1 / mean(beta)), an approximation of the mean sigma.
        # np.mean of this scalar is a no-op.
        print("[t-SNE] Mean sigma: %f"
              % np.mean(math.sqrt(n_samples / beta_sum)))
    return np.asarray(P)
|
|