# Source code for driada.information.entropy

"""
Entropy calculation functions for discrete, continuous, and mixed variable types.

This module provides various entropy calculation methods including:
- Discrete entropy
- Joint entropy for discrete and mixed variables
- Conditional entropy for different variable type combinations
"""

import numpy as np
from .gcmi import ent_g
from .ksg import nonparam_entropy_c

# Import JIT versions if available.
# When the import fails, _JIT_AVAILABLE is False and entropy_d /
# joint_entropy_dd take their NumPy code paths instead of the JIT helpers.
try:
    from .entropy_jit import entropy_d_jit, joint_entropy_dd_jit

    _JIT_AVAILABLE = True
except ImportError:
    _JIT_AVAILABLE = False

# Performance thresholds based on empirical measurements
# entropy_d dispatches to entropy_d_jit only when x.size is below this value
# (and the JIT import succeeded).
ENTROPY_D_JIT_THRESHOLD = 1000  # Use JIT for arrays smaller than this
# joint_entropy_dd dispatches to joint_entropy_dd_jit whenever this flag is
# True and the JIT import succeeded, regardless of input size.
JOINT_ENTROPY_DD_ALWAYS_JIT = True  # Always use JIT for joint entropy


# Re-export mi_cd_fft from info_fft module (for backward compatibility)
from .info_fft import mi_cd_fft



[docs]
def entropy_d(x):
    """Compute the Shannon entropy, in bits, of a discrete sample.

    The input is converted to a NumPy array and must have a numeric dtype.
    If the JIT helpers were imported successfully and the array holds fewer
    than ``ENTROPY_D_JIT_THRESHOLD`` elements, the computation is delegated
    to ``entropy_d_jit``. Otherwise the occurrences of each distinct value
    are counted with ``np.unique`` and handed to ``probs_to_entropy``.

    Parameters
    ----------
    x : array-like
        Observed discrete states, given as numbers (integers, or floats
        that encode discrete states).

    Returns
    -------
    float
        Entropy in bits.

    Raises
    ------
    ValueError
        If the converted array does not have a numeric dtype.

    Examples
    --------
    >>> entropy_d([1, 1, 2, 2])  # uniform binary distribution
    1.0
    >>> entropy_d([1, 2, 3, 4])  # uniform 4-way distribution
    2.0
    """
    values = np.asarray(x)

    # Reject strings, objects, etc. before any counting is attempted.
    if not np.issubdtype(values.dtype, np.number):
        raise ValueError(f"Input must be numeric, got dtype: {values.dtype}")

    # Small inputs go to the JIT helper when it could be imported.
    use_jit = _JIT_AVAILABLE and values.size < ENTROPY_D_JIT_THRESHOLD
    if use_jit:
        return entropy_d_jit(values)

    # NumPy path: relative frequency of every distinct value.
    counts = np.unique(values, return_counts=True)[1]
    return probs_to_entropy(counts / len(values))



def probs_to_entropy(p):
    """Calculate the Shannon entropy of a discrete probability distribution.

    Parameters
    ----------
    p : array-like
        Weights of the distribution. They are rescaled to sum to 1 before
        the entropy is computed, so unnormalized weights (for example raw
        counts) are accepted.

    Returns
    -------
    float
        Entropy in bits. ``0.0`` for an empty input, ``nan`` when the
        weights sum to zero (the distribution cannot be normalized).

    Examples
    --------
    >>> float(round(probs_to_entropy([0.5, 0.5]), 4))  # uniform binary
    1.0
    >>> float(abs(round(probs_to_entropy([1.0, 0.0]), 4)))  # deterministic
    0.0
    >>> float(round(probs_to_entropy([0.25, 0.25, 0.25, 0.25]), 4))  # uniform 4-way
    2.0

    Notes
    -----
    Entries equal to zero are skipped, following the convention
    ``0 * log2(0) = 0``. No epsilon is added inside the logarithm: an
    additive epsilon biases every term and makes the entropy of a
    deterministic distribution slightly negative. Negative or NaN entries
    are not filtered out and propagate to a ``nan`` result.
    """
    p = np.asarray(p)
    if p.size == 0:
        # Empty sum: no outcomes, zero bits.
        return np.float64(0.0)

    total = np.sum(p)
    if total == 0:
        # Normalization is impossible; return NaN explicitly instead of
        # relying on a 0/0 division (and its RuntimeWarning).
        return np.float64(np.nan)

    p = p / total  # Normalize to sum to 1
    nonzero = p[p != 0]  # zero-probability outcomes contribute nothing
    # "+ 0.0" converts a negative zero (deterministic case) into +0.0.
    return -np.sum(nonzero * np.log2(nonzero)) + 0.0



[docs]
def joint_entropy_dd(x, y):
    """Calculate joint entropy for two discrete variables.

    Dispatches to the JIT-compiled implementation when it was imported
    successfully and ``JOINT_ENTROPY_DD_ALWAYS_JIT`` is set. Otherwise the
    joint distribution is obtained by counting the distinct ``(x, y)``
    value pairs with NumPy.

    Parameters
    ----------
    x : array-like
        First discrete variable. Must have same length as y.
    y : array-like
        Second discrete variable. Must have same length as x.

    Returns
    -------
    float
        Joint entropy H(X,Y) in bits.

    Raises
    ------
    ValueError
        In the NumPy fallback, if x and y do not have the same number of
        elements.

    Examples
    --------
    >>> joint_entropy_dd([1, 1, 2, 2], [1, 2, 1, 2])  # independent
    2.0
    >>> joint_entropy_dd([1, 1, 2, 2], [1, 1, 2, 2])  # perfectly dependent
    1.0

    Notes
    -----
    The fallback deliberately does not use ``np.histogram2d``: its bins are
    equal-width intervals, so asking for "number of unique values" bins
    merges distinct states whenever the values are unevenly spaced
    (e.g. 0, 1, 10) and underestimates the entropy. Counting exact value
    pairs treats every distinct state separately.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Use JIT version if available (always faster)
    if _JIT_AVAILABLE and JOINT_ENTROPY_DD_ALWAYS_JIT:
        return joint_entropy_dd_jit(x, y)

    # Fallback: exact counting of (x, y) pairs.
    if x.size != y.size:
        raise ValueError("x and y must have the same length")

    # Replace every value by the index of its distinct level ...
    _, x_codes = np.unique(x, return_inverse=True)
    y_levels, y_codes = np.unique(y, return_inverse=True)
    # ... and fold both indices into one integer that identifies the pair.
    pair_codes = x_codes.ravel() * y_levels.size + y_codes.ravel()
    _, pair_counts = np.unique(pair_codes, return_counts=True)

    # probs_to_entropy normalizes the counts to probabilities itself.
    return probs_to_entropy(pair_counts)




[docs]
def conditional_entropy_cdd(z, x, y, k=5, estimator="gcmi"):
    """Estimate H(Z|X,Y) for a continuous Z given two discrete variables.

    The samples of ``z`` are grouped by every combination of distinct
    ``x`` and ``y`` values. For each sufficiently large group an entropy
    estimate is computed, and the estimates are summed with weights
    ``group_size / len(z)``.

    Parameters
    ----------
    z : array-like
        Continuous variable. Must have same length as x and y.
    x : array-like
        First discrete variable. Must have same length as z and y.
    y : array-like
        Second discrete variable. Must have same length as z and x.
    k : int, optional
        For KSG: number of nearest neighbors passed to the estimator.
        For both estimators it is also the group-size threshold: groups
        holding ``k`` or fewer samples are skipped. Default: 5.
    estimator : {'gcmi', 'ksg'}, optional
        Entropy estimation method:
        - 'gcmi': Fast, assumes Gaussian distribution
        - 'ksg': Accurate, nonparametric k-nearest neighbor approach
        Any value other than 'ksg' selects the GCMI estimator.
        Default: 'gcmi'.

    Returns
    -------
    float
        Conditional entropy H(Z|X,Y) in bits. ``0.0`` when every group is
        skipped.

    Examples
    --------
    >>> z = [0.1, 0.2, 0.8, 0.9, 0.3, 0.7]
    >>> x = [1, 1, 2, 2, 1, 2]
    >>> y = [1, 2, 1, 2, 1, 1]
    >>> result = conditional_entropy_cdd(z, x, y, k=3)
    >>> isinstance(result, float)
    True

    Notes
    -----
    Skipped groups contribute nothing, and the remaining weights are not
    renormalized. GCMI is faster but assumes Gaussian data; KSG is slower
    but works for arbitrary continuous distributions.
    """
    z = np.asarray(z)
    x = np.asarray(x)
    y = np.asarray(y)

    y_levels = np.unique(y)
    weighted_sum = 0.0

    for x_level in np.unique(x):
        x_mask = x == x_level
        for y_level in y_levels:
            # Samples of z that fall into this (x, y) cell.
            cell = z[x_mask & (y == y_level)]
            cell_size = len(cell)
            if not cell_size > k:
                continue  # too few samples for an estimate

            if estimator == "ksg":
                # KSG expects one sample per row.
                cell_entropy = nonparam_entropy_c(cell.reshape(-1, 1), k=k)
            else:
                # GCMI (default) receives the samples as a single row.
                cell_entropy = ent_g(cell.reshape(1, -1))

            weighted_sum += cell_entropy * (cell_size / len(z))

    return weighted_sum




[docs]
def conditional_entropy_cd(z, x, k=5, estimator="gcmi"):
    """Estimate H(Z|X) for a continuous Z given one discrete variable.

    The samples of ``z`` are grouped by the distinct values of ``x``. For
    each sufficiently large group an entropy estimate is computed, and the
    estimates are summed with weights ``group_size / len(z)``.

    Parameters
    ----------
    z : array-like
        Continuous variable. Must have same length as x.
    x : array-like
        Discrete variable. Must have same length as z.
    k : int, optional
        For KSG: number of nearest neighbors passed to the estimator.
        For both estimators it is also the group-size threshold: groups
        holding ``k`` or fewer samples are skipped. Default: 5.
    estimator : {'gcmi', 'ksg'}, optional
        Entropy estimation method:
        - 'gcmi': Fast, assumes Gaussian distribution
        - 'ksg': Accurate, nonparametric k-nearest neighbor approach
        Any value other than 'ksg' selects the GCMI estimator.
        Default: 'gcmi'.

    Returns
    -------
    float
        Conditional entropy H(Z|X) in bits. ``0.0`` when every group is
        skipped.

    Examples
    --------
    >>> z = [0.1, 0.2, 0.8, 0.9]
    >>> x = [1, 1, 2, 2]
    >>> result = conditional_entropy_cd(z, x, k=1)
    >>> isinstance(result, float)
    True

    Notes
    -----
    Skipped groups contribute nothing, and the remaining weights are not
    renormalized. GCMI is faster but assumes Gaussian data; KSG is slower
    but works for arbitrary continuous distributions.
    """
    z = np.asarray(z)
    x = np.asarray(x)

    weighted_sum = 0.0

    for level in np.unique(x):
        # Samples of z observed together with this value of x.
        group = z[x == level]
        group_size = len(group)
        if not group_size > k:
            continue  # too few samples for an estimate

        if estimator == "ksg":
            # KSG expects one sample per row.
            group_entropy = nonparam_entropy_c(group.reshape(-1, 1), k=k)
        else:
            # GCMI (default) receives the samples as a single row.
            group_entropy = ent_g(group.reshape(1, -1))

        weighted_sum += group_entropy * (group_size / len(z))

    return weighted_sum



def joint_entropy_cdd(x, y, z, k=5, estimator="gcmi"):
    """Estimate the joint entropy H(X,Y,Z) of two discrete and one continuous variable.

    Applies the chain rule ``H(X,Y,Z) = H(X,Y) + H(Z|X,Y)``: the discrete
    term comes from ``joint_entropy_dd`` and the conditional term from
    ``conditional_entropy_cdd``.

    Parameters
    ----------
    x : array-like
        First discrete variable. Must have same length as y and z.
    y : array-like
        Second discrete variable. Must have same length as x and z.
    z : array-like
        Continuous variable. Must have same length as x and y.
    k : int, optional
        Forwarded to ``conditional_entropy_cdd`` (KSG neighbor count and
        group-size threshold). Default: 5.
    estimator : {'gcmi', 'ksg'}, optional
        Entropy estimation method for the continuous component, forwarded
        to ``conditional_entropy_cdd``:
        - 'gcmi': Fast, assumes Gaussian distribution
        - 'ksg': Accurate, nonparametric k-nearest neighbor approach
        Default: 'gcmi'.

    Returns
    -------
    float
        Joint entropy H(X,Y,Z) in bits.

    Examples
    --------
    >>> x = [1, 1, 2, 2]
    >>> y = [1, 2, 1, 2]
    >>> z = [0.1, 0.2, 0.8, 0.9]
    >>> result = joint_entropy_cdd(x, y, z, k=2)
    >>> isinstance(result, float)
    True
    """
    x, y, z = np.asarray(x), np.asarray(y), np.asarray(z)

    discrete_term = joint_entropy_dd(x, y)
    conditional_term = conditional_entropy_cdd(z, x, y, k=k, estimator=estimator)
    return discrete_term + conditional_term


def joint_entropy_cd(x, z, k=5, estimator="gcmi"):
    """Estimate the joint entropy H(X,Z) of one discrete and one continuous variable.

    Applies the chain rule ``H(X,Z) = H(X) + H(Z|X)``: the discrete term
    comes from ``entropy_d`` and the conditional term from
    ``conditional_entropy_cd``.

    Parameters
    ----------
    x : array-like
        Discrete variable. Must have same length as z.
    z : array-like
        Continuous variable. Must have same length as x.
    k : int, optional
        Forwarded to ``conditional_entropy_cd`` (KSG neighbor count and
        group-size threshold). Default: 5.
    estimator : {'gcmi', 'ksg'}, optional
        Entropy estimation method for the continuous component, forwarded
        to ``conditional_entropy_cd``:
        - 'gcmi': Fast, assumes Gaussian distribution
        - 'ksg': Accurate, nonparametric k-nearest neighbor approach
        Default: 'gcmi'.

    Returns
    -------
    float
        Joint entropy H(X,Z) in bits.

    Examples
    --------
    >>> x = [1, 1, 2, 2]
    >>> z = [0.1, 0.2, 0.8, 0.9]
    >>> result = joint_entropy_cd(x, z, k=2)
    >>> isinstance(result, float)
    True
    """
    x, z = np.asarray(x), np.asarray(z)

    discrete_term = entropy_d(x)
    conditional_term = conditional_entropy_cd(z, x, k=k, estimator=estimator)
    return discrete_term + conditional_term