"""
Linear dimensionality estimation methods.
This module implements dimensionality estimation based on linear methods
such as Principal Component Analysis (PCA).
"""
import numpy as np
from sklearn.decomposition import PCA
from scipy.stats import entropy
[docs]
def pca_dimension(data, threshold=0.95, standardize=True):
"""
Estimate dimensionality using Principal Component Analysis (PCA).
This method determines the number of principal components needed to
explain a specified fraction of the total variance in the data.
Parameters
----------
data : array-like of shape (n_samples, n_features)
The input data matrix where rows are samples and columns are features.
threshold : float, default=0.95
The fraction of total variance that should be explained by the
selected components. Must be between 0 and 1.
standardize : bool, default=True
Whether to standardize the data (zero mean, unit variance) before
applying PCA. Standardization is recommended when features have
different scales.
Returns
-------
n_components : int
The number of principal components needed to explain the specified
fraction of variance.
Notes
-----
This method provides a linear estimate of dimensionality based on variance
explained. It may overestimate the intrinsic dimension for nonlinear
manifolds but is useful for understanding the effective linear dimension
of the data.
Examples
--------
>>> import numpy as np
>>> from driada.dimensionality import pca_dimension
>>> # Generate data with 3 effective dimensions
>>> np.random.seed(42) # For reproducible results
>>> data = np.random.randn(1000, 10)
>>> data[:, 3:] *= 0.1 # Make dimensions 4-10 have small variance
>>> n_dim = pca_dimension(data, threshold=0.95)
>>> print(f"Number of components for 95% variance: {n_dim}")
Number of components for 95% variance: 10
"""
if not 0 < threshold <= 1:
raise ValueError(f"threshold must be between 0 and 1, got {threshold}")
data = np.asarray(data)
if data.shape[0] < 2:
raise ValueError("Need at least 2 samples to estimate dimensionality")
# Standardize data if requested
if standardize:
# Compute mean and std, handling zero variance features
data_mean = np.mean(data, axis=0)
data_std = np.std(data, axis=0)
# Avoid division by zero for constant features
data_std[data_std == 0] = 1.0
data_standardized = (data - data_mean) / data_std
else:
data_standardized = data
# Apply PCA
pca = PCA()
pca.fit(data_standardized)
# Compute cumulative explained variance
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Find number of components exceeding threshold
n_components = np.argmax(cumulative_variance >= threshold) + 1
return n_components
[docs]
def pca_dimension_profile(data, thresholds=None, standardize=True):
"""
Compute PCA dimensionality estimates for multiple variance thresholds.
This function provides a profile of how many components are needed
for different levels of variance explained, which can help in
understanding the distribution of variance across components.
Parameters
----------
data : array-like of shape (n_samples, n_features)
The input data matrix.
thresholds : array-like, optional
Variance thresholds to evaluate. If None, uses
[0.5, 0.8, 0.9, 0.95, 0.99].
standardize : bool, default=True
Whether to standardize the data before PCA.
Returns
-------
profile : dict
Dictionary containing:
- 'thresholds': array of variance thresholds
- 'n_components': array of components needed for each threshold
- 'explained_variance_ratio': variance explained by each component
- 'cumulative_variance': cumulative variance explained
Examples
--------
>>> np.random.seed(42) # For reproducible results
>>> data = np.random.randn(1000, 20)
>>> profile = pca_dimension_profile(data)
>>> for thresh, n_comp in zip(profile['thresholds'], profile['n_components']):
... print(f"{thresh*100:.0f}% variance: {n_comp} components")
50% variance: 9 components
80% variance: 16 components
90% variance: 18 components
95% variance: 19 components
99% variance: 20 components
"""
if thresholds is None:
thresholds = np.array([0.5, 0.8, 0.9, 0.95, 0.99])
else:
thresholds = np.asarray(thresholds)
data = np.asarray(data)
# Standardize if requested
if standardize:
data_mean = np.mean(data, axis=0)
data_std = np.std(data, axis=0)
data_std[data_std == 0] = 1.0
data_standardized = (data - data_mean) / data_std
else:
data_standardized = data
# Apply PCA
pca = PCA()
pca.fit(data_standardized)
# Compute cumulative variance
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Find components needed for each threshold
n_components = []
for threshold in thresholds:
if threshold > cumulative_variance[-1]:
# Can't reach this threshold
n_comp = len(cumulative_variance)
else:
n_comp = np.argmax(cumulative_variance >= threshold) + 1
n_components.append(n_comp)
return {
"thresholds": thresholds,
"n_components": np.array(n_components),
"explained_variance_ratio": pca.explained_variance_ratio_,
"cumulative_variance": cumulative_variance,
}
[docs]
def effective_rank(data, standardize=True):
"""
Compute the effective rank (Roy & Vetterli, 2007) of the data matrix.
The effective rank is a continuous measure of dimensionality based on
the entropy of the normalized eigenvalue distribution.
Parameters
----------
data : array-like of shape (n_samples, n_features)
The input data matrix.
standardize : bool, default=True
Whether to standardize the data before computation.
Returns
-------
eff_rank : float
The effective rank of the data matrix.
References
----------
Roy, O., & Vetterli, M. (2007). The effective rank: A measure of
effective dimensionality. In 2007 15th European Signal Processing
Conference (pp. 606-610). IEEE.
Examples
--------
>>> import numpy as np
>>> from driada.dimensionality import effective_rank
>>> # Full rank matrix
>>> np.random.seed(42) # For reproducible results
>>> data = np.random.randn(100, 10)
>>> eff_r = effective_rank(data)
>>> print(f"Effective rank: {eff_r:.2f}")
Effective rank: 9.88
"""
data = np.asarray(data)
# Standardize if requested
if standardize:
data_mean = np.mean(data, axis=0)
data_std = np.std(data, axis=0)
data_std[data_std == 0] = 1.0
data_standardized = (data - data_mean) / data_std
else:
data_standardized = data
# Compute singular values
_, s, _ = np.linalg.svd(data_standardized, full_matrices=False)
# Remove negligible singular values to avoid numerical issues
tolerance = np.finfo(float).eps * max(data.shape) * s[0] if len(s) > 0 else 0
s_positive = s[s > tolerance]
if len(s_positive) == 0:
return 1.0 # Degenerate case
# Normalize to get probability distribution
s_normalized = s_positive / np.sum(s_positive)
# Compute entropy (base e)
ent = entropy(s_normalized)
# Convert to effective rank
eff_rank = np.exp(ent)
return eff_rank