Source code for driada.dimensionality.intrinsic

"""
Intrinsic dimensionality estimation using nearest neighbor methods.

This module implements methods to estimate the intrinsic dimensionality of datasets
based on the statistics of nearest neighbor distances.
"""

import numpy as np
from sklearn.neighbors import NearestNeighbors
import pynndescent



[docs]
def nn_dimension(data=None, k=2, graph_method="sklearn", precomputed_graph=None):
    """
    Estimate intrinsic dimension using the k-NN algorithm.

    This method estimates the intrinsic dimensionality by analyzing the ratios
    of distances to successive nearest neighbors using maximum likelihood
    estimation. When k=2, this implements the TWO-NN algorithm [1], where
    "TWO" refers to using the 1st and 2nd nearest neighbors to compute
    distance ratios.

    The algorithm works by:

    1. For each point, computing the ratio r_i = d_2 / d_1 (for k=2), where
       d_1 and d_2 are distances to the 1st and 2nd nearest neighbors
    2. Using maximum likelihood to estimate dimension from the distribution
       of these ratios, which follows a specific form dependent on the
       intrinsic dimension
    3. For k>2, extending to use multiple successive neighbor ratios
       (d_2/d_1, d_3/d_2, ..., d_k/d_{k-1}) for more robust estimation

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features), optional
        The input data matrix where rows are samples and columns are features.
        Either data or precomputed_graph must be provided.

    k : int, default=2
        The number of nearest neighbors to consider. k=2 gives the TWO-NN algorithm.
        Higher values can provide more robust estimates.

    graph_method : {'sklearn', 'pynndescent'}, default='sklearn'
        Method to use for nearest neighbor graph construction:
        - 'sklearn': Uses scikit-learn's NearestNeighbors (exact)
        - 'pynndescent': Uses PyNNDescent (approximate, faster for large datasets)
        Only used if data is provided.

    precomputed_graph : tuple of (indices, distances), optional
        Precomputed k-NN graph as a tuple of:
        - indices: array of shape (n_samples, k+1) with neighbor indices
        - distances: array of shape (n_samples, k+1) with neighbor distances
        The first column (index 0) should contain self-references with distance 0.
        Either data or precomputed_graph must be provided.

    Returns
    -------
    d : float
        The estimated intrinsic dimension of the dataset.

    Notes
    -----
    The method is based on the principle that in a d-dimensional manifold,
    the ratio of distances to successive nearest neighbors follows a specific
    distribution that depends on d.

    References
    ----------
    [1] Facco, E., d'Errico, M., Rodriguez, A., & Laio, A. (2017).
        Estimating the intrinsic dimension of datasets by a minimal
        neighborhood information. Scientific Reports, 7(1), 12140.
        https://doi.org/10.1038/s41598-017-11873-y

    Examples
    --------
    >>> import numpy as np
    >>> from driada.dimensionality.intrinsic import nn_dimension
    >>>
    >>> # Example 1: Clean low-dimensional data
    >>> np.random.seed(42)
    >>> # Generate 2D Swiss roll
    >>> n = 1000
    >>> t = 1.5 * np.pi * (1 + 2 * np.random.rand(n))
    >>> height = 30 * np.random.rand(n)
    >>> X = np.zeros((n, 3))
    >>> X[:, 0] = t * np.cos(t)
    >>> X[:, 1] = height
    >>> X[:, 2] = t * np.sin(t)
    >>> d_est = nn_dimension(X, k=10)
    >>> print(f"Swiss roll (true dim=2): {d_est:.2f}")
    Swiss roll (true dim=2): 1.94
    >>>
    >>> # Example 2: Impact of ambient noise
    >>> # The TWO-NN estimator (k=2) can overestimate dimension when data
    >>> # has noise in ambient dimensions. This is because noise affects
    >>> # the ratios of nearest neighbor distances.
    >>>
    >>> # Pure 2D data: accurate estimate
    >>> theta = np.random.uniform(0, 2*np.pi, 1000)
    >>> r = np.sqrt(np.random.uniform(0, 1, 1000))  # Uniform distribution on disk
    >>> data_2d = np.column_stack([r * np.cos(theta), r * np.sin(theta)])
    >>> d_est = nn_dimension(data_2d, k=2)
    >>> print(f"Pure 2D disk: {d_est:.2f}")
    Pure 2D disk: 2.01
    >>>
    >>> # Same data with small noise in extra dimensions: overestimates
    >>> # This happens because even small noise changes neighbor relationships
    >>> noise = 0.01 * np.random.randn(1000, 8)
    >>> data_10d = np.hstack([data_2d, noise])
    >>> d_est = nn_dimension(data_10d, k=2)
    >>> print(f"2D disk in 10D with noise: {d_est:.2f}")  # Overestimates due to noise
    2D disk in 10D with noise: 4.73
    >>>
    >>> # For noisy high-dimensional data:
    >>> # 1. Use higher k values (more robust but may underestimate)
    >>> # 2. Consider denoising or PCA preprocessing
    >>> # 3. Use alternative methods like correlation_dimension

    Raises
    ------
    ValueError
        If inputs are invalid, sample size too small, or duplicate points exist."""
    # Validate inputs
    if data is None and precomputed_graph is None:
        raise ValueError("Either data or precomputed_graph must be provided")

    if data is not None and precomputed_graph is not None:
        raise ValueError("Provide either data or precomputed_graph, not both")

    if k < 2:
        raise ValueError("k must be at least 2")

    # Get nearest neighbor distances
    if precomputed_graph is not None:
        # Use precomputed graph
        indices, distances = precomputed_graph
        indices = np.asarray(indices)
        distances = np.asarray(distances)
        n_samples = len(indices)

        # Validate precomputed graph
        if indices.shape != distances.shape:
            raise ValueError("Indices and distances must have the same shape")

        if indices.shape[1] < k + 1:
            raise ValueError(
                f"Precomputed graph must have at least k+1={k+1} neighbors, "
                f"but has {indices.shape[1]}"
            )

        # Ensure we have enough columns
        distances = distances[:, : k + 1]
        indices = indices[:, : k + 1]

    else:
        # Compute from data
        data = np.asarray(data)
        n_samples = len(data)

        if n_samples <= k:
            raise ValueError(f"Number of samples ({n_samples}) must be greater than k ({k})")

        # Compute nearest neighbor distances
        if graph_method == "sklearn":
            nbrs = NearestNeighbors(n_neighbors=k + 1, metric="euclidean")
            nbrs.fit(data)
            distances, indices = nbrs.kneighbors(data)
        elif graph_method == "pynndescent":
            index = pynndescent.NNDescent(data, metric="euclidean", n_neighbors=k + 1)
            indices, distances = index.neighbor_graph
        else:
            raise ValueError(
                f"Unknown graph construction method: {graph_method}. "
                "Choose from 'sklearn' or 'pynndescent'."
            )

    # Compute ratios of successive nearest neighbor distances
    # distances[:, 0] is the distance to self (0), so we start from index 1
    ratios = np.zeros((k - 1, n_samples))
    for i in range(k - 1):
        # Ratio of (i+2)-th neighbor distance to (i+1)-th neighbor distance
        # Check for zero distances to avoid division by zero
        denom = distances[:, i + 1]
        if np.any(denom == 0):
            n_zero = np.sum(denom == 0)
            raise ValueError(
                f"Found {n_zero} zero distances to {i+1}-th nearest neighbor. "
                "This typically indicates duplicate points in the dataset. "
                "Consider removing duplicates or adding small noise."
            )
        ratios[i, :] = distances[:, i + 2] / denom

    # Maximum likelihood estimation for intrinsic dimension
    if k == 2:
        # TWO-NN estimator: equation (3) from [1]
        d = (n_samples - 1) / np.sum(np.log(ratios[0, :]))
    else:
        # Generalized k-NN estimator
        log_sum = 0
        for j in range(k - 1):
            log_sum += (j + 1) * np.sum(np.log(ratios[j, :]))
        d = (n_samples * (k - 1) - 1) / log_sum

    return d




[docs]
def correlation_dimension(data, r_min=None, r_max=None, n_bins=20):
    """
    Estimate the correlation dimension using the Grassberger-Procaccia algorithm.

    The correlation dimension measures how the number of neighbor pairs scales
    with distance, providing another estimate of intrinsic dimensionality.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        The input data matrix. For large datasets, consider subsampling as
        this method requires O(n²) memory.

    r_min : float, optional
        Minimum distance to consider. If None, uses 1st percentile of distances.

    r_max : float, optional
        Maximum distance to consider. If None, uses 50th percentile of distances.

    n_bins : int, default=20
        Number of distance bins for the correlation sum.

    Returns
    -------
    dimension : float
        The estimated correlation dimension. Returns NaN if estimation fails.

    Raises
    ------
    ValueError
        If data has fewer than 2 samples or invalid parameters.

    Notes
    -----
    This method has O(n²) memory complexity. For datasets with more than
    10,000 samples, consider using a subsample or alternative methods.

    References
    ----------
    Grassberger, P., & Procaccia, I. (1983). Characterization of strange
    attractors. Physical Review Letters, 50(5), 346.

    Examples
    --------
    >>> import numpy as np
    >>> from driada.dimensionality import correlation_dimension
    >>>
    >>> # Generate points on a 2D circle embedded in 3D
    >>> t = np.linspace(0, 2*np.pi, 500)
    >>> data = np.column_stack([np.cos(t), np.sin(t), np.zeros_like(t)])
    >>> d_est = correlation_dimension(data, n_bins=15)
    >>> print(f"Correlation dimension: {d_est:.2f}")  # Should be close to 1
    Correlation dimension: 1.07
    >>>
    >>> # For noisy data, specify scaling range
    >>> np.random.seed(42)  # For reproducible results
    >>> noisy_data = data + 0.01 * np.random.randn(*data.shape)
    >>> d_est = correlation_dimension(noisy_data, r_min=0.05, r_max=0.5)
    >>> print(f"Correlation dimension: {d_est:.2f}")
    Correlation dimension: 1.09
    """
    from scipy.spatial.distance import pdist

    data = np.asarray(data)
    n_samples = len(data)

    if n_samples < 2:
        raise ValueError(f"Need at least 2 samples, got {n_samples}")

    # Warn about memory usage for large datasets
    if n_samples > 10000:
        import warnings

        warnings.warn(
            f"Computing pairwise distances for {n_samples} samples requires "
            f"~{n_samples * (n_samples - 1) * 8 / 2e9:.1f} GB of memory. "
            "Consider using a subsample for large datasets.",
            RuntimeWarning,
        )

    # Compute pairwise distances
    distances = pdist(data)
    distances = distances[distances > 0]  # Remove zero distances

    if len(distances) == 0:
        raise ValueError("All points are identical")

    if r_min is None:
        r_min = np.percentile(distances, 1)
    if r_max is None:
        r_max = np.percentile(distances, 50)

    if r_min <= 0:
        raise ValueError(f"r_min must be positive, got {r_min}")
    if r_max <= r_min:
        raise ValueError(f"r_max ({r_max}) must be greater than r_min ({r_min})")

    # Create log-spaced bins
    r_values = np.logspace(np.log10(r_min), np.log10(r_max), n_bins)

    # Compute correlation sum C(r) for each r
    correlation_sum = []
    for r in r_values:
        C_r = np.mean(distances < r)
        if C_r > 0:
            correlation_sum.append(C_r)
        else:
            correlation_sum.append(np.nan)

    correlation_sum = np.array(correlation_sum)
    valid = ~np.isnan(correlation_sum) & (correlation_sum > 0)

    if np.sum(valid) < 2:
        return np.nan

    # Fit line in log-log space
    log_r = np.log(r_values[valid])
    log_C = np.log(correlation_sum[valid])

    # Linear regression to find slope (dimension)
    coeffs = np.polyfit(log_r, log_C, 1)
    dimension = coeffs[0]

    return dimension




[docs]
def geodesic_dimension(data=None, graph=None, k=15, mode="full", factor=2, dim_step=0.1):
    """
    Estimate intrinsic dimension using geodesic distances (De Granata et al. 2016).

    This method estimates the intrinsic dimensionality by analyzing the distribution
    of geodesic distances computed as shortest paths on a k-nearest neighbor graph.
    The method focuses on the behavior of the distance distribution around its maximum.

    Warning
    -------
    This method is sensitive to the k parameter. Small k values (k < 15) often lead
    to underestimation due to disconnected or sparse graphs that poorly approximate
    geodesic distances. For reliable estimates, use k ≥ 15-25, with larger values
    needed for complex manifolds like Swiss rolls or S-curves.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features), optional
        The input data matrix. Either data or graph must be provided.

    graph : sparse matrix of shape (n_samples, n_samples), optional
        Precomputed graph adjacency matrix with edge weights as distances.
        If provided, data is ignored. Either data or graph must be provided.

    k : int, default=15
        Number of nearest neighbors for graph construction (if data provided).

    mode : {'full', 'fast'}, default='full'
        Computation mode:
        - 'full': Use all pairwise geodesic distances
        - 'fast': Use a random subset (1/factor of points)

    factor : int, default=2
        Subsampling factor for 'fast' mode. Uses n/factor random points.

    dim_step : float, default=0.1
        Step size for dimension grid search. Smaller values give more precise
        estimates but take longer to compute.

    Returns
    -------
    dimension : float
        The estimated intrinsic dimension of the dataset/manifold.

    Notes
    -----
    This method implements the approach from:
    Granata, D., Carnevale, V. (2016). Accurate Estimation of the Intrinsic
    Dimension Using Graph Distances: Unraveling the Geometric Complexity of
    Datasets. Scientific Reports, 6, 31377.

    The method is particularly robust for complex manifold geometries and can
    handle small sample sizes effectively.

    References
    ----------
    [1] Granata, D., Carnevale, V. (2016). Accurate Estimation of the Intrinsic
        Dimension Using Graph Distances: Unraveling the Geometric Complexity of
        Datasets. Scientific Reports, 6, 31377.
        https://doi.org/10.1038/srep31377

    Examples
    --------
    >>> import numpy as np
    >>> from driada.dimensionality.intrinsic import geodesic_dimension
    >>> # Generate 2D Swiss roll
    >>> from sklearn.datasets import make_swiss_roll
    >>> np.random.seed(42)  # For reproducible results
    >>> data, _ = make_swiss_roll(n_samples=1000, noise=0.05, random_state=42)
    >>> d_est = geodesic_dimension(data, k=15)
    >>> print(f"Estimated dimension: {d_est:.2f}")  # Should be close to 2
    Estimated dimension: 1.90

    Raises
    ------
    ValueError
        If inputs are invalid or graph is too disconnected.
    RuntimeWarning
        If graph is disconnected (infinite distances found)."""
    import scipy.sparse as sp
    from scipy.sparse.csgraph import shortest_path
    from sklearn.neighbors import kneighbors_graph

    if data is None and graph is None:
        raise ValueError("Either data or graph must be provided")

    if data is not None and graph is not None:
        raise ValueError("Provide either data or graph, not both")

    # Construct graph if data provided
    if data is not None:
        data = np.asarray(data)
        n_samples = len(data)

        # Build k-NN graph with distances
        graph = kneighbors_graph(data, n_neighbors=k, mode="distance", include_self=False)
        # Make symmetric by taking minimum distance
        graph_min = graph.minimum(graph.T)
        graph = sp.csr_matrix(graph_min)

    else:
        # Use provided graph
        graph = sp.csr_matrix(graph)
        n_samples = graph.shape[0]

    # Subsample for fast mode
    if mode == "fast":
        indices = np.random.permutation(n_samples)[: n_samples // factor]
        dm = graph[indices, :][:, indices]
    else:
        dm = graph

    # Compute shortest paths (geodesic distances)
    spmatrix = shortest_path(dm, method="D", directed=False)
    all_dists = spmatrix.flatten()
    all_dists = all_dists[all_dists != 0]  # Remove self-distances

    # Check for disconnected graph
    n_infinite = np.sum(np.isinf(all_dists))
    if n_infinite > 0:
        import warnings

        warnings.warn(
            f"Graph appears to be disconnected. Found {n_infinite} infinite distances. "
            f"These will be excluded from analysis.",
            RuntimeWarning,
        )

    all_dists = all_dists[np.isfinite(all_dists)]  # Remove infinite distances

    # Check if we have enough finite distances
    if len(all_dists) < 100:
        raise ValueError(
            f"Not enough finite distances for analysis. Got {len(all_dists)} distances. "
            "Graph may be too disconnected."
        )

    # Analyze distance distribution
    nbins = 500
    hist, bin_edges = np.histogram(all_dists, bins=nbins, density=True)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    # Find maximum of distribution
    dmax_idx = np.argmax(hist)
    dmax = bin_centers[dmax_idx]

    if dmax == 0:
        raise ValueError(
            "Maximum of distance distribution is at 0. "
            "This may indicate all points are identical or graph is degenerate."
        )

    # Normalize distances by maximum
    hist_norm, bin_edges_norm = np.histogram(all_dists / dmax, bins=nbins, density=True)
    bin_centers_norm = (bin_edges_norm[:-1] + bin_edges_norm[1:]) / 2

    # Analyze left side of distribution near maximum
    std_norm = np.std(all_dists / dmax)
    mask = (bin_centers_norm > 1 - 2 * std_norm) & (bin_centers_norm <= 1) & (hist_norm > 1e-6)
    x_left = bin_centers_norm[mask]
    y_left = np.log(hist_norm[mask] / np.max(hist_norm))  # Natural log, not log10

    # Fit dimension by minimizing error against theoretical distribution
    def theoretical_dist(x, D):
        """Theoretical distribution for D-dimensional hypersphere.

        Based on Granata & Carnevale 2016 paper, the distribution of geodesic
        distances near the maximum follows: (D-1) * log(sin(x * pi/2))
        where x is the normalized distance and D is the intrinsic dimension.

        The paper shows that for a D-dimensional hypersphere, the probability
        distribution is proportional to sin^(D-1)(π*r/(2*r_max)), which gives
        log P(r) ~ (D-1) * log(sin(π*r/(2*r_max))) when taking the logarithm.

        Parameters
        ----------
        x : array_like
            Normalized distances (should be in range [0, 1]).
        D : float
            Candidate intrinsic dimension.

        Returns
        -------
        array_like
            Log probability values for the theoretical distribution."""
        # Clip x to valid domain for sin
        x_clipped = np.clip(x, 1e-10, 1 - 1e-10)
        return (D - 1) * np.log(np.sin(x_clipped * np.pi / 2))

    # Grid search for best dimension
    dimensions = np.arange(1.0, 26.0, dim_step)
    errors = []

    for D in dimensions:
        y_theory = theoretical_dist(x_left, D)
        error = np.linalg.norm(y_theory - y_left) / np.sqrt(len(y_left))
        errors.append(error)

    # Find dimension with minimum error
    best_idx = np.argmin(errors)
    dimension = dimensions[best_idx]

    return dimension