Source code for driada.dimensionality.linear

"""
Linear dimensionality estimation methods.

This module implements dimensionality estimation based on linear methods
such as Principal Component Analysis (PCA).
"""

import numpy as np
from sklearn.decomposition import PCA
from scipy.stats import entropy


[docs] def pca_dimension(data, threshold=0.95, standardize=True): """ Estimate dimensionality using Principal Component Analysis (PCA). This method determines the number of principal components needed to explain a specified fraction of the total variance in the data. Parameters ---------- data : array-like of shape (n_samples, n_features) The input data matrix where rows are samples and columns are features. threshold : float, default=0.95 The fraction of total variance that should be explained by the selected components. Must be between 0 and 1. standardize : bool, default=True Whether to standardize the data (zero mean, unit variance) before applying PCA. Standardization is recommended when features have different scales. Returns ------- n_components : int The number of principal components needed to explain the specified fraction of variance. Notes ----- This method provides a linear estimate of dimensionality based on variance explained. It may overestimate the intrinsic dimension for nonlinear manifolds but is useful for understanding the effective linear dimension of the data. Examples -------- >>> import numpy as np >>> from driada.dimensionality import pca_dimension >>> # Generate data with 3 effective dimensions >>> np.random.seed(42) # For reproducible results >>> data = np.random.randn(1000, 10) >>> data[:, 3:] *= 0.1 # Make dimensions 4-10 have small variance >>> n_dim = pca_dimension(data, threshold=0.95) >>> print(f"Number of components for 95% variance: {n_dim}") Number of components for 95% variance: 10 """ if not 0 < threshold <= 1: raise ValueError(f"threshold must be between 0 and 1, got {threshold}") data = np.asarray(data) if data.shape[0] < 2: raise ValueError("Need at least 2 samples to estimate dimensionality") # Standardize data if requested if standardize: # Compute mean and std, handling zero variance features data_mean = np.mean(data, axis=0) data_std = np.std(data, axis=0) # Avoid division by zero for constant features data_std[data_std == 0] = 1.0 data_standardized = (data - data_mean) / data_std else: data_standardized = data # Apply PCA pca = PCA() pca.fit(data_standardized) # Compute cumulative explained variance cumulative_variance = np.cumsum(pca.explained_variance_ratio_) # Find number of components exceeding threshold n_components = np.argmax(cumulative_variance >= threshold) + 1 return n_components
[docs] def pca_dimension_profile(data, thresholds=None, standardize=True): """ Compute PCA dimensionality estimates for multiple variance thresholds. This function provides a profile of how many components are needed for different levels of variance explained, which can help in understanding the distribution of variance across components. Parameters ---------- data : array-like of shape (n_samples, n_features) The input data matrix. thresholds : array-like, optional Variance thresholds to evaluate. If None, uses [0.5, 0.8, 0.9, 0.95, 0.99]. standardize : bool, default=True Whether to standardize the data before PCA. Returns ------- profile : dict Dictionary containing: - 'thresholds': array of variance thresholds - 'n_components': array of components needed for each threshold - 'explained_variance_ratio': variance explained by each component - 'cumulative_variance': cumulative variance explained Examples -------- >>> np.random.seed(42) # For reproducible results >>> data = np.random.randn(1000, 20) >>> profile = pca_dimension_profile(data) >>> for thresh, n_comp in zip(profile['thresholds'], profile['n_components']): ... print(f"{thresh*100:.0f}% variance: {n_comp} components") 50% variance: 9 components 80% variance: 16 components 90% variance: 18 components 95% variance: 19 components 99% variance: 20 components """ if thresholds is None: thresholds = np.array([0.5, 0.8, 0.9, 0.95, 0.99]) else: thresholds = np.asarray(thresholds) data = np.asarray(data) # Standardize if requested if standardize: data_mean = np.mean(data, axis=0) data_std = np.std(data, axis=0) data_std[data_std == 0] = 1.0 data_standardized = (data - data_mean) / data_std else: data_standardized = data # Apply PCA pca = PCA() pca.fit(data_standardized) # Compute cumulative variance cumulative_variance = np.cumsum(pca.explained_variance_ratio_) # Find components needed for each threshold n_components = [] for threshold in thresholds: if threshold > cumulative_variance[-1]: # Can't reach this threshold n_comp = len(cumulative_variance) else: n_comp = np.argmax(cumulative_variance >= threshold) + 1 n_components.append(n_comp) return { "thresholds": thresholds, "n_components": np.array(n_components), "explained_variance_ratio": pca.explained_variance_ratio_, "cumulative_variance": cumulative_variance, }
[docs] def effective_rank(data, standardize=True): """ Compute the effective rank (Roy & Vetterli, 2007) of the data matrix. The effective rank is a continuous measure of dimensionality based on the entropy of the normalized eigenvalue distribution. Parameters ---------- data : array-like of shape (n_samples, n_features) The input data matrix. standardize : bool, default=True Whether to standardize the data before computation. Returns ------- eff_rank : float The effective rank of the data matrix. References ---------- Roy, O., & Vetterli, M. (2007). The effective rank: A measure of effective dimensionality. In 2007 15th European Signal Processing Conference (pp. 606-610). IEEE. Examples -------- >>> import numpy as np >>> from driada.dimensionality import effective_rank >>> # Full rank matrix >>> np.random.seed(42) # For reproducible results >>> data = np.random.randn(100, 10) >>> eff_r = effective_rank(data) >>> print(f"Effective rank: {eff_r:.2f}") Effective rank: 9.88 """ data = np.asarray(data) # Standardize if requested if standardize: data_mean = np.mean(data, axis=0) data_std = np.std(data, axis=0) data_std[data_std == 0] = 1.0 data_standardized = (data - data_mean) / data_std else: data_standardized = data # Compute singular values _, s, _ = np.linalg.svd(data_standardized, full_matrices=False) # Remove negligible singular values to avoid numerical issues tolerance = np.finfo(float).eps * max(data.shape) * s[0] if len(s) > 0 else 0 s_positive = s[s > tolerance] if len(s_positive) == 0: return 1.0 # Degenerate case # Normalize to get probability distribution s_normalized = s_positive / np.sum(s_positive) # Compute entropy (base e) ent = entropy(s_normalized) # Convert to effective rank eff_rank = np.exp(ent) return eff_rank