from pynndescent.distances import named_distances
from typing import Dict, Optional, Any
[docs]
class DRMethod:
    """Static description of one dimensionality reduction method.

    Attributes
    ----------
    is_linear : bool
        Whether the method is linear.
    requires_graph : bool
        Whether the method requires a proximity graph.
    requires_distmat : bool
        Whether the method requires a distance matrix.
    nn_based : bool
        Whether the method is neural network based.
    handles_disconnected_graphs : bool
        Whether the method can handle disconnected graphs without
        preprocessing.
    default_params : dict
        Default parameters for the embedding method (never None).
    default_graph_params : dict or None
        Default graph construction parameters, stored as given.
    default_metric_params : dict or None
        Default metric parameters, stored as given.

    Notes
    -----
    The five flag attributes are always stored as ``bool``; the constructor
    accepts 0/1 integers (or any truthy/falsy value) and coerces them.
    The dict arguments are stored by reference, not copied.
    """

    def __init__(
        self,
        is_linear,
        requires_graph,
        requires_distmat,
        nn_based,
        handles_disconnected_graphs=0,
        default_params=None,
        default_graph_params=None,
        default_metric_params=None,
    ):
        """Build a DRMethod configuration object.

        Parameters
        ----------
        is_linear : int or bool
            Linear (1/True) or nonlinear (0/False) method.
        requires_graph : int or bool
            Whether a proximity graph is required.
        requires_distmat : int or bool
            Whether a distance matrix is required.
        nn_based : int or bool
            Whether the method is neural network based.
        handles_disconnected_graphs : int or bool, default=0
            Whether disconnected graphs are handled without preprocessing.
        default_params : dict or None, default=None
            Default embedding parameters. Any falsy value (None or an
            empty dict) is replaced by a fresh empty dict.
        default_graph_params : dict or None, default=None
            Default graph construction parameters.
        default_metric_params : dict or None, default=None
            Default metric parameters.
        """
        # Coerce every flag to a real bool, in declaration order.
        raw_flags = (
            ("is_linear", is_linear),
            ("requires_graph", requires_graph),
            ("requires_distmat", requires_distmat),
            ("nn_based", nn_based),
            ("handles_disconnected_graphs", handles_disconnected_graphs),
        )
        for attr_name, raw_value in raw_flags:
            setattr(self, attr_name, bool(raw_value))

        # A non-empty dict is kept by reference; anything falsy becomes {}.
        self.default_params = default_params if default_params else {}
        self.default_graph_params = default_graph_params
        self.default_metric_params = default_metric_params
# Baseline graph-construction settings reused by the graph-based entries of
# METHODS_DICT (some entries override individual keys such as "nn").
DEFAULT_KNN_GRAPH = dict(
    g_method_name="knn",
    nn=15,
    weighted=0,
    max_deleted_nodes=0.2,
    dist_to_aff="hk",
)

# Baseline metric settings reused by the graph-based entries of METHODS_DICT.
DEFAULT_METRIC = dict(metric_name="l2", sigma=1.0)
# Registry of supported embedding methods, keyed by method name.
# NOTE: entries that pass DEFAULT_KNN_GRAPH / DEFAULT_METRIC directly share
# those dict objects; treat the stored defaults as read-only and copy before
# modifying.
METHODS_DICT = {
    "pca": DRMethod(
        is_linear=True,
        requires_graph=False,
        requires_distmat=False,
        nn_based=False,
        default_params={"dim": 2},
    ),
    "le": DRMethod(
        is_linear=False,
        requires_graph=True,
        requires_distmat=False,
        nn_based=False,
        default_params={"dim": 2},
        default_graph_params=DEFAULT_KNN_GRAPH,
        default_metric_params=DEFAULT_METRIC,
    ),
    "auto_le": DRMethod(
        is_linear=False,
        requires_graph=True,
        requires_distmat=False,
        nn_based=False,
        default_params={"dim": 2},
        default_graph_params=DEFAULT_KNN_GRAPH,
        default_metric_params=DEFAULT_METRIC,
    ),
    "dmaps": DRMethod(
        is_linear=False,
        requires_graph=True,
        requires_distmat=False,
        nn_based=False,
        default_params={"dim": 2, "dm_alpha": 0.5, "dm_t": 1},
        # Same as the baseline graph, but with "weighted" switched to 1.
        default_graph_params=dict(DEFAULT_KNN_GRAPH, weighted=1),
        default_metric_params=DEFAULT_METRIC,
    ),
    "auto_dmaps": DRMethod(
        is_linear=False,
        requires_graph=True,
        requires_distmat=False,
        nn_based=False,
        default_params={"dim": 2, "dm_alpha": 0.5, "dm_t": 1},
        default_graph_params=DEFAULT_KNN_GRAPH,
        default_metric_params=DEFAULT_METRIC,
    ),
    "mds": DRMethod(
        is_linear=False,
        requires_graph=False,
        requires_distmat=True,
        nn_based=False,
        default_params={"dim": 2},
    ),
    "isomap": DRMethod(
        is_linear=False,
        requires_graph=True,
        requires_distmat=False,
        nn_based=False,
        default_params={"dim": 2},
        default_graph_params=dict(DEFAULT_KNN_GRAPH, nn=15),
        default_metric_params=DEFAULT_METRIC,
    ),
    "lle": DRMethod(
        is_linear=False,
        requires_graph=True,
        requires_distmat=False,
        nn_based=False,
        default_params={"dim": 2},
        default_graph_params=dict(DEFAULT_KNN_GRAPH, nn=10),
        default_metric_params=DEFAULT_METRIC,
    ),
    "hlle": DRMethod(
        is_linear=False,
        requires_graph=True,
        requires_distmat=False,
        nn_based=False,
        default_params={"dim": 2},
        default_graph_params=dict(DEFAULT_KNN_GRAPH, nn=10),
        default_metric_params=DEFAULT_METRIC,
    ),
    "mvu": DRMethod(
        is_linear=False,
        requires_graph=True,
        requires_distmat=False,
        nn_based=False,
        default_params={"dim": 2},
        default_graph_params=DEFAULT_KNN_GRAPH,
        default_metric_params=DEFAULT_METRIC,
    ),
    "ae": DRMethod(
        is_linear=False,
        requires_graph=False,
        requires_distmat=False,
        nn_based=True,
        default_params={"dim": 2},
    ),
    "vae": DRMethod(
        is_linear=False,
        requires_graph=False,
        requires_distmat=False,
        nn_based=True,
        default_params={"dim": 2},
    ),
    "flexible_ae": DRMethod(
        is_linear=False,
        requires_graph=False,
        requires_distmat=False,
        nn_based=True,
        default_params={"dim": 2, "architecture": "ae"},
    ),
    "tsne": DRMethod(
        is_linear=False,
        requires_graph=False,
        requires_distmat=False,
        nn_based=False,
        default_params={"dim": 2, "perplexity": 30},
    ),
    "umap": DRMethod(
        is_linear=False,
        requires_graph=True,
        requires_distmat=False,
        nn_based=False,
        # The only entry flagged as coping with disconnected graphs.
        handles_disconnected_graphs=True,
        default_params={"dim": 2, "min_dist": 0.1},
        default_graph_params=dict(DEFAULT_KNN_GRAPH, nn=15),
        default_metric_params=DEFAULT_METRIC,
    ),
}
# Names of the graph construction methods; each of these has a dedicated
# branch in g_param_filter.
GRAPH_CONSTRUCTION_METHODS = [
    "knn", "auto_knn",
    "eps", "eknn",
    "umap", "tsne",
]

# Names of the embedding methods, listed in the same order as the keys of
# METHODS_DICT.
EMBEDDING_CONSTRUCTION_METHODS = [
    "pca",
    "le", "auto_le",
    "dmaps", "auto_dmaps",
    "mds",
    "isomap",
    "lle", "hlle",
    "mvu",
    "ae", "vae", "flexible_ae",
    "tsne",
    "umap",
]
[docs]
def m_param_filter(para: Dict[str, Any]) -> Dict[str, Any]:
    """Keep only the metric parameters relevant to the chosen metric.

    Parameters
    ----------
    para : dict
        Metric parameters. Recognised keys:

        - ``metric_name`` (required): name of the metric, or a callable
          implementing a custom metric.
        - ``sigma``: kept only when present and not None.
        - ``p``: kept only when ``metric_name`` is ``"minkowski"``.

        Every other key is discarded.

    Returns
    -------
    dict
        New dict holding ``metric_name`` plus whichever of ``sigma`` and
        ``p`` apply, in that order.

    Raises
    ------
    KeyError
        If ``para`` has no ``'metric_name'`` key.
    ValueError
        If ``metric_name`` is not callable, not a key of pynndescent's
        ``named_distances`` and not the special name ``"hyperbolic"``.
    """
    metric = para["metric_name"]
    kept = {"metric_name": metric}

    # A None sigma is treated the same as a missing one.
    sigma = para.get("sigma")
    if sigma is not None:
        kept["sigma"] = sigma

    # Callables are accepted as-is; names must be known to pynndescent or
    # be the special-cased "hyperbolic".
    recognised = callable(metric) or metric in named_distances or metric == "hyperbolic"
    if not recognised:
        raise ValueError(
            f'Unknown metric "{metric}". Metric must be one of {list(named_distances.keys())}, '
            f'"hyperbolic", or a callable custom metric function.'
        )

    # The Minkowski exponent is the only metric-specific extra.
    if metric == "minkowski" and "p" in para:
        kept["p"] = para["p"]

    return kept
[docs]
def g_param_filter(para: Dict[str, Any]) -> Dict[str, Any]:
    """Keep only the graph parameters relevant to the chosen graph method.

    Parameters
    ----------
    para : dict
        Graph construction parameters. Must contain ``'g_method_name'``.

    Returns
    -------
    dict
        New dict with the subset of ``para`` that applies to the method.
        Whitelisted keys that are absent from ``para`` are simply omitted.

    Raises
    ------
    KeyError
        If ``para`` has no ``'g_method_name'`` key.

    Notes
    -----
    Keys kept for every method: ``g_method_name``, ``max_deleted_nodes``,
    ``weighted``, ``dist_to_aff``, ``graph_preprocessing``, ``seed``.

    Additional keys per method:

    - ``'knn'``, ``'auto_knn'``, ``'umap'``: ``nn``, ``knn_engine``,
      ``symmetrization``
    - ``'eps'``: ``eps``, ``min_density``
    - ``'eknn'``: ``eps``, ``min_density``, ``nn``
    - ``'tsne'``: ``perplexity``

    An unrecognised method name is not an error; it keeps the common keys
    only.
    """
    method = para["g_method_name"]

    # Keys shared by every graph construction method.
    allowed = [
        "g_method_name",
        "max_deleted_nodes",
        "weighted",
        "dist_to_aff",
        "graph_preprocessing",
        "seed",
    ]

    # (method names, extra keys) pairs; the first matching rule wins.
    method_rules = (
        (("knn", "auto_knn", "umap"), ("nn", "knn_engine", "symmetrization")),
        (("eps",), ("eps", "min_density")),
        (("eknn",), ("eps", "min_density", "nn")),
        (("tsne",), ("perplexity",)),
    )
    for method_names, extra_keys in method_rules:
        if method in method_names:
            allowed.extend(extra_keys)
            break

    filtered = {}
    for key in allowed:
        if key in para:
            filtered[key] = para[key]
    return filtered
[docs]
def e_param_filter(para: Dict[str, Any]) -> Dict[str, Any]:
    """Keep only the embedding parameters relevant to the chosen method.

    Parameters
    ----------
    para : dict
        Embedding parameters. Must contain ``'e_method_name'``.

    Returns
    -------
    dict
        New dict with the subset of ``para`` that applies to the method.
        Whitelisted keys that are absent from ``para`` are simply omitted.

    Raises
    ------
    KeyError
        If ``para`` has no ``'e_method_name'`` key.

    Notes
    -----
    Keys kept for every method: ``e_method``, ``e_method_name``, ``dim``.

    Additional keys per method:

    - ``'umap'``: ``min_dist``
    - ``'dmaps'``, ``'auto_dmaps'``: ``dm_alpha``, ``dm_t``

    An unrecognised method name is not an error; it keeps the common keys
    only.
    """
    method_name = para["e_method_name"]

    wanted = ["e_method", "e_method_name", "dim"]
    if method_name == "umap":
        wanted += ["min_dist"]
    if method_name in ("dmaps", "auto_dmaps"):
        wanted += ["dm_alpha", "dm_t"]
    # NOTE(review): keys such as "perplexity" or "architecture" are not
    # whitelisted here and are therefore dropped - confirm that is intended.

    filtered = {}
    for key in wanted:
        if key in para:
            filtered[key] = para[key]
    return filtered
[docs]
def merge_params_with_defaults(
    method_name: str, user_params: Optional[Dict[str, Any]] = None
) -> Dict[str, Dict[str, Any]]:
    """Combine a method's default parameters with user overrides.

    Parameters
    ----------
    method_name : str
        Name of the DR method; must be a key of METHODS_DICT.
    user_params : dict or None
        User overrides, in either the structured or the flat format
        described in the Notes. None means "defaults only".

    Returns
    -------
    dict
        ``{'e_params': ..., 'g_params': ..., 'm_params': ...}``.
        ``g_params`` / ``m_params`` are None when the method defines no
        graph / metric defaults.

    Raises
    ------
    ValueError
        If ``method_name`` is not a key of METHODS_DICT.

    Notes
    -----
    ``e_params`` always carries ``e_method_name`` (the method name) and
    ``e_method`` (the DRMethod object) in addition to the defaults.

    When graph defaults exist and do not already specify
    ``graph_preprocessing``, it is set to None for methods that handle
    disconnected graphs and to ``'giant_cc'`` otherwise.

    Structured format - used as soon as ``user_params`` contains any of the
    keys ``'e_params'``, ``'g_params'``, ``'m_params'``; each non-empty
    group is merged into the matching defaults. Graph/metric groups are
    ignored when the method has no graph/metric defaults, and any other
    top-level keys are ignored.

    Flat format - each key is routed individually:

    - ``n_neighbors``, ``k``, ``nn`` -> ``g_params['nn']``
    - ``weighted``, ``dist_to_aff``, ``graph_preprocessing``,
      ``g_method_name``, ``max_deleted_nodes`` -> ``g_params`` (same name)
    - ``metric`` -> ``m_params['metric_name']``
    - ``metric_name``, ``sigma`` -> ``m_params`` (same name)
    - ``n_components`` -> ``e_params['dim']``
    - anything else -> ``e_params`` (same name)

    Graph/metric keys fall through to ``e_params`` when the method has no
    graph/metric defaults.
    """
    if method_name not in METHODS_DICT:
        raise ValueError(f"Unknown method: {method_name}")
    method = METHODS_DICT[method_name]

    # --- defaults (shallow copies, so the registry is never mutated) ---
    e_params = method.default_params.copy()
    e_params.update(e_method_name=method_name, e_method=method)

    graph_defaults = method.default_graph_params
    g_params = graph_defaults.copy() if graph_defaults else None
    if g_params is not None:
        # Methods that cannot cope with disconnected graphs get the
        # 'giant_cc' preprocessing unless the defaults already chose one.
        preprocessing = None if method.handles_disconnected_graphs else "giant_cc"
        g_params.setdefault("graph_preprocessing", preprocessing)

    metric_defaults = method.default_metric_params
    m_params = metric_defaults.copy() if metric_defaults else None

    if user_params is None:
        return {"e_params": e_params, "g_params": g_params, "m_params": m_params}

    groups = (("e_params", e_params), ("g_params", g_params), ("m_params", m_params))
    if any(group_name in user_params for group_name, _ in groups):
        # --- structured input: merge each provided, non-empty group ---
        for group_name, target in groups:
            overrides = user_params.get(group_name)
            if overrides and target is not None:
                target.update(overrides)
    else:
        # --- flat input: route every key to the dict it belongs to ---
        graph_key_map = {
            "n_neighbors": "nn",
            "nn": "nn",
            "k": "nn",
            "weighted": "weighted",
            "dist_to_aff": "dist_to_aff",
            "graph_preprocessing": "graph_preprocessing",
            "g_method_name": "g_method_name",
            "max_deleted_nodes": "max_deleted_nodes",
        }
        metric_key_map = {
            "metric": "metric_name",
            "metric_name": "metric_name",
            "sigma": "sigma",
        }
        for key, value in user_params.items():
            if g_params is not None and key in graph_key_map:
                g_params[graph_key_map[key]] = value
            elif m_params is not None and key in metric_key_map:
                m_params[metric_key_map[key]] = value
            elif key == "n_components":
                # sklearn-style alias for the target dimension.
                e_params["dim"] = value
            else:
                e_params[key] = value

    # A user override must not leave e_method missing or None.
    if e_params.get("e_method") is None:
        e_params["e_method"] = method

    return {"e_params": e_params, "g_params": g_params, "m_params": m_params}