Source code for driada.dimensionality.effective

from .utils import correct_cov_spectrum, eigh
import numpy as np
from scipy.stats import entropy
import warnings
from ..utils.data import correlation_matrix

DATA_SHAPE_THR = 0.01  # if n/t in multivariate time series data is more than DATA_SHAPE_THR,
# cov/corr spectrum may be significantly distorted, correction is recommended


def _eff_dim(corr_eigs, q=2):
    """Compute effective dimension using Renyi entropy.

    Parameters
    ----------
    corr_eigs : array-like
        Eigenvalues of correlation/covariance matrix. Must be non-negative.
    q : float, default=2
        Order of Renyi entropy. Common values:
        - q=1: Shannon entropy (standard entropy)
        - q=2: Quadratic entropy (collision entropy)
        - q=inf: Min-entropy

    Returns
    -------
    float
        Effective dimension based on entropy of eigenvalue distribution.

    Raises
    ------
    ValueError
        If q < 0 or if eigenvalues are invalid.

    Notes
    -----
    The effective dimension quantifies the spread of variance across
    principal components. Higher values indicate more distributed variance.

    References
    ----------
    Rényi, A. (1961). On measures of entropy and information. Proceedings
    of the Fourth Berkeley Symposium on Mathematical Statistics and
    Probability, Volume 1: Contributions to the Theory of Statistics.

    Examples
    --------
    >>> import numpy as np
    >>> # Example with eigenvalues from PCA
    >>> eigenvalues = np.array([5.0, 3.0, 1.0, 0.5, 0.3, 0.2])
    >>> eff_d = _eff_dim(eigenvalues, q=2)
    >>> print(f"Effective dimension (q=2): {eff_d:.2f}")
    Effective dimension (q=2): 2.83
    """
    if q < 0:
        raise ValueError("Renyi entropy is undefined for q<0")

    corr_eigs = np.asarray(corr_eigs)

    # Validate eigenvalues
    if len(corr_eigs) == 0:
        raise ValueError("Empty eigenvalue array")

    if np.any(corr_eigs < -1e-10):
        raise ValueError("Eigenvalues must be non-negative")

    # Clean up numerical noise
    corr_eigs = np.maximum(corr_eigs, 0)
    eig_sum = np.sum(corr_eigs)

    if eig_sum == 0:
        raise ValueError("All eigenvalues are zero")

    if q == 1:
        # Shannon entropy
        norm_corr_eigs = corr_eigs / eig_sum
        # Remove zero eigenvalues to avoid log(0)
        norm_corr_eigs = norm_corr_eigs[norm_corr_eigs > 0]
        return 2 ** entropy(norm_corr_eigs, base=2)

    elif q == 2:
        # Quadratic entropy
        return (eig_sum**2) / np.sum(corr_eigs**2)

    elif q == np.inf:
        # Min-entropy
        return eig_sum / np.max(corr_eigs)

    else:
        # General Renyi entropy
        norm_corr_eigs = corr_eigs / eig_sum
        # Remove zeros to avoid 0^q issues
        norm_corr_eigs = norm_corr_eigs[norm_corr_eigs > 0]
        return np.exp((1.0 / (1.0 - q)) * np.log(np.sum(norm_corr_eigs**q)))



[docs]
def eff_dim(data, enable_correction, q=2, **correction_kwargs):
    """Compute effective dimension of multivariate data.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Input data matrix where rows are samples and columns are features.
        This follows the standard scikit-learn convention.
    enable_correction : bool
        Whether to apply finite-sample spectrum correction to eigenvalues.
    q : float, default=2
        Order of Renyi entropy (see _eff_dim for details).
    **correction_kwargs
        Additional arguments for spectrum correction (see correct_cov_spectrum).

    Returns
    -------
    float
        Effective dimension of the data.

    Raises
    ------
    ValueError
        If data has invalid shape or if computation fails.

    Notes
    -----
    When n_features/n_samples > 0.01, spectrum correction is recommended
    to account for finite-sample biases in eigenvalue estimation.

    The data should be organized with samples as rows and features as
    columns, following the standard machine learning convention. For neural
    data, this means timepoints as rows and neurons as columns.

    Examples
    --------
    >>> import numpy as np
    >>> # Generate data with 3 effective dimensions
    >>> n_samples, n_features = 1000, 50
    >>> np.random.seed(42)  # For reproducible results
    >>> latent = np.random.randn(n_samples, 3)
    >>> mixing = np.random.randn(3, n_features)
    >>> data = latent @ mixing + 0.1 * np.random.randn(n_samples, n_features)
    >>>
    >>> # Compute effective dimension
    >>> eff_d = eff_dim(data, enable_correction=False, q=2)
    >>> print(f"Effective dimension: {eff_d:.2f}")  # Should be close to 3
    Effective dimension: 3.00
    """
    data = np.asarray(data)

    if data.ndim != 2:
        raise ValueError(f"Data must be 2D, got shape {data.shape}")

    # Transpose to get (n_variables, n_observations) for internal computation
    data_t = data.T
    n, t = data_t.shape

    if t < 2:
        raise ValueError(f"Need at least 2 samples, got {t}")

    if 1.0 * n / t > DATA_SHAPE_THR and not enable_correction:
        warnings.warn(
            f"n_features/n_samples ratio is {1.0*n/t:.3f}, which is bigger than {DATA_SHAPE_THR}. "
            "Spectrum correction is recommended (set enable_correction=True)"
        )

    cmat = correlation_matrix(data_t)

    # Check for NaN in correlation matrix
    if np.any(np.isnan(cmat)):
        raise ValueError(
            "Correlation matrix contains NaN values. "
            "Check for constant features or insufficient data."
        )

    if enable_correction:
        corrected_eigs = correct_cov_spectrum(n, t, cmat, **correction_kwargs)
        final_eigs = corrected_eigs[-1]
    else:
        final_eigs = eigh(cmat, eigvals_only=True)

    return _eff_dim(final_eigs, q=q)