|
import numpy as np |
|
|
|
from libc cimport math |
|
from libc.math cimport INFINITY |
|
|
|
from ..utils._typedefs cimport float32_t, float64_t |
|
|
|
|
|
# Floor substituted for a row sum that underflowed to zero, so the
# subsequent normalization never divides by zero.
# NOTE(review): declared as C `float` despite the `_DBL` suffix — confirm
# whether double precision was intended.
cdef float EPSILON_DBL = 1e-8

# Binary search stops once |entropy - desired_entropy| drops below this.
cdef float PERPLEXITY_TOLERANCE = 1e-5
|
|
|
|
|
|
|
def _binary_search_perplexity(
        const float32_t[:, :] sqdistances,
        float desired_perplexity,
        int verbose):
    """Binary search for sigmas of conditional Gaussians.

    This approximation reduces the computational complexity from O(N^2) to
    O(uN).

    Parameters
    ----------
    sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32
        Distances between training samples and their k nearest neighbors.
        When using the exact method, this is a square (n_samples, n_samples)
        distance matrix. The TSNE default metric is "euclidean" which is
        interpreted as squared euclidean distance.

    desired_perplexity : float
        Desired perplexity (2^entropy) of the conditional Gaussians.

    verbose : int
        Verbosity level.

    Returns
    -------
    P : ndarray of shape (n_samples, n_neighbors), dtype=np.float64
        Probabilities of conditional Gaussian distributions p_i|j
        (n_neighbors equals n_samples when the exact method is used).
    """
    # Maximum number of binary-search iterations per sample.
    cdef long n_steps = 100

    cdef long n_samples = sqdistances.shape[0]
    cdef long n_neighbors = sqdistances.shape[1]
    # True when the input is a k-NN distance matrix rather than the full
    # square pairwise matrix (i.e. the approximate method is in use).
    cdef int using_neighbors = n_neighbors < n_samples

    # beta is the precision of the conditional Gaussian for the current
    # sample: larger beta -> narrower kernel -> lower entropy.
    cdef double beta
    cdef double beta_min
    cdef double beta_max
    cdef double beta_sum = 0.0

    # Search in log scale: match entropy = log(perplexity) instead of the
    # perplexity itself.
    cdef double desired_entropy = math.log(desired_perplexity)
    cdef double entropy_diff

    cdef double entropy
    cdef double sum_Pi
    cdef double sum_disti_Pi
    cdef long i, j, l

    # Output row-stochastic affinity matrix p_{j|i}.
    cdef float64_t[:, :] P = np.zeros(
        (n_samples, n_neighbors), dtype=np.float64)

    for i in range(n_samples):
        # Fresh unbounded bracket for each sample's precision.
        beta_min = -INFINITY
        beta_max = INFINITY
        beta = 1.0

        # Binary search for the beta whose row entropy matches the target.
        for l in range(n_steps):

            # Unnormalized Gaussian affinities for the current beta. The
            # self term j == i is skipped only in the exact (square-matrix)
            # case; with k-NN input column j is not sample j.
            sum_Pi = 0.0
            for j in range(n_neighbors):
                if j != i or using_neighbors:
                    P[i, j] = math.exp(-sqdistances[i, j] * beta)
                    sum_Pi += P[i, j]

            # Guard: if beta is so large every exponential underflowed to
            # zero, avoid dividing by zero below.
            if sum_Pi == 0.0:
                sum_Pi = EPSILON_DBL
            sum_disti_Pi = 0.0

            # Normalize the row and accumulate E[d^2] under P[i, :].
            for j in range(n_neighbors):
                P[i, j] /= sum_Pi
                sum_disti_Pi += sqdistances[i, j] * P[i, j]

            # Shannon entropy of the row: H = log(sum_Pi) + beta * E[d^2].
            entropy = math.log(sum_Pi) + beta * sum_disti_Pi
            entropy_diff = entropy - desired_entropy

            if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
                break

            if entropy_diff > 0.0:
                # Entropy too high -> kernel too wide -> raise beta. Double
                # until an upper bound exists, then bisect.
                beta_min = beta
                if beta_max == INFINITY:
                    beta *= 2.0
                else:
                    beta = (beta + beta_max) / 2.0
            else:
                # Entropy too low -> kernel too narrow -> lower beta.
                beta_max = beta
                if beta_min == -INFINITY:
                    beta /= 2.0
                else:
                    beta = (beta + beta_min) / 2.0

        # Accumulate the converged (or last-tried) precision for reporting.
        beta_sum += beta

        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
            print("[t-SNE] Computed conditional probabilities for sample "
                  "%d / %d" % (i + 1, n_samples))

    if verbose:
        # sigma = sqrt(1 / beta); sqrt(n_samples / beta_sum) is
        # sqrt(1 / mean(beta)), an approximation of the mean sigma.
        # np.mean of this scalar is a no-op.
        print("[t-SNE] Mean sigma: %f"
              % np.mean(math.sqrt(n_samples / beta_sum)))
    return np.asarray(P)
|
|