"""
Intrinsic dimensionality estimation using nearest neighbor methods.
This module implements methods to estimate the intrinsic dimensionality of datasets
based on the statistics of nearest neighbor distances.
"""
import numpy as np
from sklearn.neighbors import NearestNeighbors
import pynndescent
[docs]
def nn_dimension(data=None, k=2, graph_method="sklearn", precomputed_graph=None):
"""
Estimate intrinsic dimension using the k-NN algorithm.
This method estimates the intrinsic dimensionality by analyzing the ratios
of distances to successive nearest neighbors using maximum likelihood
estimation. When k=2, this implements the TWO-NN algorithm [1], where
"TWO" refers to using the 1st and 2nd nearest neighbors to compute
distance ratios.
The algorithm works by:
1. For each point, computing the ratio r_i = d_2 / d_1 (for k=2), where
d_1 and d_2 are distances to the 1st and 2nd nearest neighbors
2. Using maximum likelihood to estimate dimension from the distribution
of these ratios, which follows a specific form dependent on the
intrinsic dimension
3. For k>2, extending to use multiple successive neighbor ratios
(d_2/d_1, d_3/d_2, ..., d_k/d_{k-1}) for more robust estimation
Parameters
----------
data : array-like of shape (n_samples, n_features), optional
The input data matrix where rows are samples and columns are features.
Either data or precomputed_graph must be provided.
k : int, default=2
The number of nearest neighbors to consider. k=2 gives the TWO-NN algorithm.
Higher values can provide more robust estimates.
graph_method : {'sklearn', 'pynndescent'}, default='sklearn'
Method to use for nearest neighbor graph construction:
- 'sklearn': Uses scikit-learn's NearestNeighbors (exact)
- 'pynndescent': Uses PyNNDescent (approximate, faster for large datasets)
Only used if data is provided.
precomputed_graph : tuple of (indices, distances), optional
Precomputed k-NN graph as a tuple of:
- indices: array of shape (n_samples, k+1) with neighbor indices
- distances: array of shape (n_samples, k+1) with neighbor distances
The first column (index 0) should contain self-references with distance 0.
Either data or precomputed_graph must be provided.
Returns
-------
d : float
The estimated intrinsic dimension of the dataset.
Notes
-----
The method is based on the principle that in a d-dimensional manifold,
the ratio of distances to successive nearest neighbors follows a specific
distribution that depends on d.
References
----------
[1] Facco, E., d'Errico, M., Rodriguez, A., & Laio, A. (2017).
Estimating the intrinsic dimension of datasets by a minimal
neighborhood information. Scientific Reports, 7(1), 12140.
https://doi.org/10.1038/s41598-017-11873-y
Examples
--------
>>> import numpy as np
>>> from driada.dimensionality.intrinsic import nn_dimension
>>>
>>> # Example 1: Clean low-dimensional data
>>> np.random.seed(42)
>>> # Generate 2D Swiss roll
>>> n = 1000
>>> t = 1.5 * np.pi * (1 + 2 * np.random.rand(n))
>>> height = 30 * np.random.rand(n)
>>> X = np.zeros((n, 3))
>>> X[:, 0] = t * np.cos(t)
>>> X[:, 1] = height
>>> X[:, 2] = t * np.sin(t)
>>> d_est = nn_dimension(X, k=10)
>>> print(f"Swiss roll (true dim=2): {d_est:.2f}")
Swiss roll (true dim=2): 1.94
>>>
>>> # Example 2: Impact of ambient noise
>>> # The TWO-NN estimator (k=2) can overestimate dimension when data
>>> # has noise in ambient dimensions. This is because noise affects
>>> # the ratios of nearest neighbor distances.
>>>
>>> # Pure 2D data: accurate estimate
>>> theta = np.random.uniform(0, 2*np.pi, 1000)
>>> r = np.sqrt(np.random.uniform(0, 1, 1000)) # Uniform distribution on disk
>>> data_2d = np.column_stack([r * np.cos(theta), r * np.sin(theta)])
>>> d_est = nn_dimension(data_2d, k=2)
>>> print(f"Pure 2D disk: {d_est:.2f}")
Pure 2D disk: 2.01
>>>
>>> # Same data with small noise in extra dimensions: overestimates
>>> # This happens because even small noise changes neighbor relationships
>>> noise = 0.01 * np.random.randn(1000, 8)
>>> data_10d = np.hstack([data_2d, noise])
>>> d_est = nn_dimension(data_10d, k=2)
>>> print(f"2D disk in 10D with noise: {d_est:.2f}") # Overestimates due to noise
2D disk in 10D with noise: 4.73
>>>
>>> # For noisy high-dimensional data:
>>> # 1. Use higher k values (more robust but may underestimate)
>>> # 2. Consider denoising or PCA preprocessing
>>> # 3. Use alternative methods like correlation_dimension
Raises
------
ValueError
If inputs are invalid, sample size too small, or duplicate points exist."""
# Validate inputs
if data is None and precomputed_graph is None:
raise ValueError("Either data or precomputed_graph must be provided")
if data is not None and precomputed_graph is not None:
raise ValueError("Provide either data or precomputed_graph, not both")
if k < 2:
raise ValueError("k must be at least 2")
# Get nearest neighbor distances
if precomputed_graph is not None:
# Use precomputed graph
indices, distances = precomputed_graph
indices = np.asarray(indices)
distances = np.asarray(distances)
n_samples = len(indices)
# Validate precomputed graph
if indices.shape != distances.shape:
raise ValueError("Indices and distances must have the same shape")
if indices.shape[1] < k + 1:
raise ValueError(
f"Precomputed graph must have at least k+1={k+1} neighbors, "
f"but has {indices.shape[1]}"
)
# Ensure we have enough columns
distances = distances[:, : k + 1]
indices = indices[:, : k + 1]
else:
# Compute from data
data = np.asarray(data)
n_samples = len(data)
if n_samples <= k:
raise ValueError(f"Number of samples ({n_samples}) must be greater than k ({k})")
# Compute nearest neighbor distances
if graph_method == "sklearn":
nbrs = NearestNeighbors(n_neighbors=k + 1, metric="euclidean")
nbrs.fit(data)
distances, indices = nbrs.kneighbors(data)
elif graph_method == "pynndescent":
index = pynndescent.NNDescent(data, metric="euclidean", n_neighbors=k + 1)
indices, distances = index.neighbor_graph
else:
raise ValueError(
f"Unknown graph construction method: {graph_method}. "
"Choose from 'sklearn' or 'pynndescent'."
)
# Compute ratios of successive nearest neighbor distances
# distances[:, 0] is the distance to self (0), so we start from index 1
ratios = np.zeros((k - 1, n_samples))
for i in range(k - 1):
# Ratio of (i+2)-th neighbor distance to (i+1)-th neighbor distance
# Check for zero distances to avoid division by zero
denom = distances[:, i + 1]
if np.any(denom == 0):
n_zero = np.sum(denom == 0)
raise ValueError(
f"Found {n_zero} zero distances to {i+1}-th nearest neighbor. "
"This typically indicates duplicate points in the dataset. "
"Consider removing duplicates or adding small noise."
)
ratios[i, :] = distances[:, i + 2] / denom
# Maximum likelihood estimation for intrinsic dimension
if k == 2:
# TWO-NN estimator: equation (3) from [1]
d = (n_samples - 1) / np.sum(np.log(ratios[0, :]))
else:
# Generalized k-NN estimator
log_sum = 0
for j in range(k - 1):
log_sum += (j + 1) * np.sum(np.log(ratios[j, :]))
d = (n_samples * (k - 1) - 1) / log_sum
return d
[docs]
def correlation_dimension(data, r_min=None, r_max=None, n_bins=20):
"""
Estimate the correlation dimension using the Grassberger-Procaccia algorithm.
The correlation dimension measures how the number of neighbor pairs scales
with distance, providing another estimate of intrinsic dimensionality.
Parameters
----------
data : array-like of shape (n_samples, n_features)
The input data matrix. For large datasets, consider subsampling as
this method requires O(n²) memory.
r_min : float, optional
Minimum distance to consider. If None, uses 1st percentile of distances.
r_max : float, optional
Maximum distance to consider. If None, uses 50th percentile of distances.
n_bins : int, default=20
Number of distance bins for the correlation sum.
Returns
-------
dimension : float
The estimated correlation dimension. Returns NaN if estimation fails.
Raises
------
ValueError
If data has fewer than 2 samples or invalid parameters.
Notes
-----
This method has O(n²) memory complexity. For datasets with more than
10,000 samples, consider using a subsample or alternative methods.
References
----------
Grassberger, P., & Procaccia, I. (1983). Characterization of strange
attractors. Physical Review Letters, 50(5), 346.
Examples
--------
>>> import numpy as np
>>> from driada.dimensionality import correlation_dimension
>>>
>>> # Generate points on a 2D circle embedded in 3D
>>> t = np.linspace(0, 2*np.pi, 500)
>>> data = np.column_stack([np.cos(t), np.sin(t), np.zeros_like(t)])
>>> d_est = correlation_dimension(data, n_bins=15)
>>> print(f"Correlation dimension: {d_est:.2f}") # Should be close to 1
Correlation dimension: 1.07
>>>
>>> # For noisy data, specify scaling range
>>> np.random.seed(42) # For reproducible results
>>> noisy_data = data + 0.01 * np.random.randn(*data.shape)
>>> d_est = correlation_dimension(noisy_data, r_min=0.05, r_max=0.5)
>>> print(f"Correlation dimension: {d_est:.2f}")
Correlation dimension: 1.09
"""
from scipy.spatial.distance import pdist
data = np.asarray(data)
n_samples = len(data)
if n_samples < 2:
raise ValueError(f"Need at least 2 samples, got {n_samples}")
# Warn about memory usage for large datasets
if n_samples > 10000:
import warnings
warnings.warn(
f"Computing pairwise distances for {n_samples} samples requires "
f"~{n_samples * (n_samples - 1) * 8 / 2e9:.1f} GB of memory. "
"Consider using a subsample for large datasets.",
RuntimeWarning,
)
# Compute pairwise distances
distances = pdist(data)
distances = distances[distances > 0] # Remove zero distances
if len(distances) == 0:
raise ValueError("All points are identical")
if r_min is None:
r_min = np.percentile(distances, 1)
if r_max is None:
r_max = np.percentile(distances, 50)
if r_min <= 0:
raise ValueError(f"r_min must be positive, got {r_min}")
if r_max <= r_min:
raise ValueError(f"r_max ({r_max}) must be greater than r_min ({r_min})")
# Create log-spaced bins
r_values = np.logspace(np.log10(r_min), np.log10(r_max), n_bins)
# Compute correlation sum C(r) for each r
correlation_sum = []
for r in r_values:
C_r = np.mean(distances < r)
if C_r > 0:
correlation_sum.append(C_r)
else:
correlation_sum.append(np.nan)
correlation_sum = np.array(correlation_sum)
valid = ~np.isnan(correlation_sum) & (correlation_sum > 0)
if np.sum(valid) < 2:
return np.nan
# Fit line in log-log space
log_r = np.log(r_values[valid])
log_C = np.log(correlation_sum[valid])
# Linear regression to find slope (dimension)
coeffs = np.polyfit(log_r, log_C, 1)
dimension = coeffs[0]
return dimension
[docs]
def geodesic_dimension(data=None, graph=None, k=15, mode="full", factor=2, dim_step=0.1):
"""
Estimate intrinsic dimension using geodesic distances (De Granata et al. 2016).
This method estimates the intrinsic dimensionality by analyzing the distribution
of geodesic distances computed as shortest paths on a k-nearest neighbor graph.
The method focuses on the behavior of the distance distribution around its maximum.
Warning
-------
This method is sensitive to the k parameter. Small k values (k < 15) often lead
to underestimation due to disconnected or sparse graphs that poorly approximate
geodesic distances. For reliable estimates, use k ≥ 15-25, with larger values
needed for complex manifolds like Swiss rolls or S-curves.
Parameters
----------
data : array-like of shape (n_samples, n_features), optional
The input data matrix. Either data or graph must be provided.
graph : sparse matrix of shape (n_samples, n_samples), optional
Precomputed graph adjacency matrix with edge weights as distances.
If provided, data is ignored. Either data or graph must be provided.
k : int, default=15
Number of nearest neighbors for graph construction (if data provided).
mode : {'full', 'fast'}, default='full'
Computation mode:
- 'full': Use all pairwise geodesic distances
- 'fast': Use a random subset (1/factor of points)
factor : int, default=2
Subsampling factor for 'fast' mode. Uses n/factor random points.
dim_step : float, default=0.1
Step size for dimension grid search. Smaller values give more precise
estimates but take longer to compute.
Returns
-------
dimension : float
The estimated intrinsic dimension of the dataset/manifold.
Notes
-----
This method implements the approach from:
Granata, D., Carnevale, V. (2016). Accurate Estimation of the Intrinsic
Dimension Using Graph Distances: Unraveling the Geometric Complexity of
Datasets. Scientific Reports, 6, 31377.
The method is particularly robust for complex manifold geometries and can
handle small sample sizes effectively.
References
----------
[1] Granata, D., Carnevale, V. (2016). Accurate Estimation of the Intrinsic
Dimension Using Graph Distances: Unraveling the Geometric Complexity of
Datasets. Scientific Reports, 6, 31377.
https://doi.org/10.1038/srep31377
Examples
--------
>>> import numpy as np
>>> from driada.dimensionality.intrinsic import geodesic_dimension
>>> # Generate 2D Swiss roll
>>> from sklearn.datasets import make_swiss_roll
>>> np.random.seed(42) # For reproducible results
>>> data, _ = make_swiss_roll(n_samples=1000, noise=0.05, random_state=42)
>>> d_est = geodesic_dimension(data, k=15)
>>> print(f"Estimated dimension: {d_est:.2f}") # Should be close to 2
Estimated dimension: 1.90
Raises
------
ValueError
If inputs are invalid or graph is too disconnected.
RuntimeWarning
If graph is disconnected (infinite distances found)."""
import scipy.sparse as sp
from scipy.sparse.csgraph import shortest_path
from sklearn.neighbors import kneighbors_graph
if data is None and graph is None:
raise ValueError("Either data or graph must be provided")
if data is not None and graph is not None:
raise ValueError("Provide either data or graph, not both")
# Construct graph if data provided
if data is not None:
data = np.asarray(data)
n_samples = len(data)
# Build k-NN graph with distances
graph = kneighbors_graph(data, n_neighbors=k, mode="distance", include_self=False)
# Make symmetric by taking minimum distance
graph_min = graph.minimum(graph.T)
graph = sp.csr_matrix(graph_min)
else:
# Use provided graph
graph = sp.csr_matrix(graph)
n_samples = graph.shape[0]
# Subsample for fast mode
if mode == "fast":
indices = np.random.permutation(n_samples)[: n_samples // factor]
dm = graph[indices, :][:, indices]
else:
dm = graph
# Compute shortest paths (geodesic distances)
spmatrix = shortest_path(dm, method="D", directed=False)
all_dists = spmatrix.flatten()
all_dists = all_dists[all_dists != 0] # Remove self-distances
# Check for disconnected graph
n_infinite = np.sum(np.isinf(all_dists))
if n_infinite > 0:
import warnings
warnings.warn(
f"Graph appears to be disconnected. Found {n_infinite} infinite distances. "
f"These will be excluded from analysis.",
RuntimeWarning,
)
all_dists = all_dists[np.isfinite(all_dists)] # Remove infinite distances
# Check if we have enough finite distances
if len(all_dists) < 100:
raise ValueError(
f"Not enough finite distances for analysis. Got {len(all_dists)} distances. "
"Graph may be too disconnected."
)
# Analyze distance distribution
nbins = 500
hist, bin_edges = np.histogram(all_dists, bins=nbins, density=True)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
# Find maximum of distribution
dmax_idx = np.argmax(hist)
dmax = bin_centers[dmax_idx]
if dmax == 0:
raise ValueError(
"Maximum of distance distribution is at 0. "
"This may indicate all points are identical or graph is degenerate."
)
# Normalize distances by maximum
hist_norm, bin_edges_norm = np.histogram(all_dists / dmax, bins=nbins, density=True)
bin_centers_norm = (bin_edges_norm[:-1] + bin_edges_norm[1:]) / 2
# Analyze left side of distribution near maximum
std_norm = np.std(all_dists / dmax)
mask = (bin_centers_norm > 1 - 2 * std_norm) & (bin_centers_norm <= 1) & (hist_norm > 1e-6)
x_left = bin_centers_norm[mask]
y_left = np.log(hist_norm[mask] / np.max(hist_norm)) # Natural log, not log10
# Fit dimension by minimizing error against theoretical distribution
def theoretical_dist(x, D):
"""Theoretical distribution for D-dimensional hypersphere.
Based on Granata & Carnevale 2016 paper, the distribution of geodesic
distances near the maximum follows: (D-1) * log(sin(x * pi/2))
where x is the normalized distance and D is the intrinsic dimension.
The paper shows that for a D-dimensional hypersphere, the probability
distribution is proportional to sin^(D-1)(π*r/(2*r_max)), which gives
log P(r) ~ (D-1) * log(sin(π*r/(2*r_max))) when taking the logarithm.
Parameters
----------
x : array_like
Normalized distances (should be in range [0, 1]).
D : float
Candidate intrinsic dimension.
Returns
-------
array_like
Log probability values for the theoretical distribution."""
# Clip x to valid domain for sin
x_clipped = np.clip(x, 1e-10, 1 - 1e-10)
return (D - 1) * np.log(np.sin(x_clipped * np.pi / 2))
# Grid search for best dimension
dimensions = np.arange(1.0, 26.0, dim_step)
errors = []
for D in dimensions:
y_theory = theoretical_dist(x_left, D)
error = np.linalg.norm(y_theory - y_left) / np.sqrt(len(y_left))
errors.append(error)
# Find dimension with minimum error
best_idx = np.argmin(errors)
dimension = dimensions[best_idx]
return dimension