Source code for imputation.sampler

import numpy as np
from scipy.stats import chi2

[docs]
def sym(x):
    """
        Ensures the input square matrix is symmetric by averaging it with its transpose.

        Parameters
        ----------
        x : np.ndarray
            A square numpy matrix.

        Returns
        -------
        np.ndarray
            A symmetric matrix computed as (x + x.T) / 2.

        """
    return (x + x.T) / 2


[docs]
def norm_draw(y, ry, x, rank_adjust=True, rng=None, **kwargs):
    """
        Bayesian linear regression draw of regression coefficients and residual variance,
        based on the least squares parameters from `estimice()`.

        This function replicates the `mice.impute.norm.draw()` algorithm from the R mice package,
        as described in Rubin (1987, p. 167).

        Parameters
        ----------
        y : np.ndarray
            Numeric vector of length n, containing the variable to be imputed.
        ry : np.ndarray of bool
            Boolean mask vector of length n, where True indicates observed values of `y` and False indicates missing values.
        x : np.ndarray
            Numeric design matrix of shape (n, p) with predictors for `y`. Must have no missing values.
        rank_adjust : bool, optional
            If True, replaces any NaN coefficients with zeros. This is relevant only when
            the least squares method is "qr" and the predictor matrix is rank-deficient.
            Default is True.
        rng : np.random.Generator, optional
            Random number generator for reproducibility. If None, a fresh generator is used.
        **kwargs : dict
            Additional keyword arguments passed to `estimice()`, e.g., `ls_meth` to specify the least squares method.

        Returns
        -------
        dict
            Dictionary containing:
            - 'coef': Least squares coefficient estimates (numpy array).
            - 'beta': Bayesian drawn regression coefficients (numpy array).
            - 'sigma': Drawn residual standard deviation (float).
            - 'estimation': Least squares method used (string).

        Notes
        -----
        The residual variance sigma is drawn from a scaled chi-square distribution, and
        the regression coefficients beta are drawn from a multivariate normal centered
        at the least squares estimates with variance scaled by sigma.

        References
        ----------
        Rubin, D. B. (1987). Multiple Imputation for Nonresponse in Surveys.
        Wiley, p. 167.

        Examples
        --------
        >>> import numpy as np
        >>> y = np.array([1.0, 2.0, np.nan, 4.0])
        >>> ry = ~np.isnan(y)
        >>> x = np.array([[1, 0], [1, 1], [1, 2], [1, 3]])
        >>> result = norm_draw(y, ry, x, ls_meth='qr')
        >>> print(result['beta'])
        """
    if rng is None:
        rng = np.random.default_rng()
    #Draw from estimice
    p = estimice(x[ry, :], y[ry], **kwargs)
    #sqrt(sum((p$r)^2) / rchisq(n = 1,df = p$df)) #one random variate with p$df normal noise
    #sqrt because we need sigma not sigma^2 for beta later
    sigma_star = np.sqrt(np.sum(p["r"] ** 2) / chi2.rvs(df = p["df"], size = 1, random_state=rng))
    # #cholesky needs matrix to be symmetrical, must be positive definite -> use sym() (A+A.T)/2
    # #np.linalg.cholesky returns lower triangular matrix
    chol = np.linalg.cholesky(sym(p["v"]))
    # #coef + lower triangular matrix from Cholesky Decomposition * random n draws from standard normal * sigma
    beta_star = p["c"] + (chol.T @ rng.normal(size=x.shape[1])) * sigma_star
    #return list
    parm = {
        "coef": p["c"],
        "beta": beta_star,
        "sigma": sigma_star,
        "estimation": p["ls_meth"]
    }
    #Replaces NaN with 0 if rank_adjust = True
    if np.any(np.isnan(parm["coef"])) and rank_adjust:
        parm["coef"] = np.nan_to_num(parm["coef"], nan=0.0)
        parm["beta"] = np.nan_to_num(parm["beta"], nan=0.0)
    return parm


[docs]
def estimice(x, y, ls_meth="qr", ridge=1e-5):
    """
        Computes least squares estimates, residuals, variance-covariance matrix,
        and degrees of freedom using different methods: ridge regression, QR decomposition, or Singular Value Decomposition.

        Parameters
        ----------
        x : np.ndarray
            Numeric design matrix with shape (n_samples, n_predictors). Must not contain missing values.
        y : np.ndarray
            Numeric vector of responses to be imputed, with possible missing values.
        ls_meth : str, optional
            Least squares method to use. Options are:
            - "qr": QR decomposition (default)
            - "ridge": Ridge regression
            - "svd": Singular Value Decomposition
        ridge : float, optional
            Ridge penalty size for ridge regression. Default is 1e-5, balancing stability and bias.

        Returns
        -------
        dict
            Dictionary containing:
            - 'c': Least squares coefficient estimates (numpy array).
            - 'r': Residuals (numpy array).
            - 'v': Variance-covariance matrix of coefficients (numpy array).
            - 'df': Degrees of freedom (int).
            - 'ls_meth': Method used (str).

        Examples
        --------
        >>> import numpy as np
        >>> x = np.array([[1, 2], [3, 4], [5, 6]])
        >>> y = np.array([7, np.nan, 9])
        >>> # Assuming you handle missing y externally, e.g. ry = ~np.isnan(y)
        >>> result = estimice(x[~np.isnan(y)], y[~np.isnan(y)], ls_meth="qr")
        >>> print(result['c'])
        [-6.   6.5]
        """

    #degrees of freedom = length of y - number of columns of x, mininmum df = 1
    #c = coefficients, f = fitted values, r = residuals
    df = max(len(y) - x.shape[1], 1)
    #QR Decomposition
    if ls_meth == "qr":
        #QR decomposition
        qr = np.linalg.qr(x)
        c = np.linalg.solve(qr.R, (qr.Q).T @ y)
        f = x @ c
        r = y - f
        rr = (qr.R).T @ qr.R
        v = np.linalg.solve(rr, np.eye(rr.shape[1]))

        return {
            "c": c.flatten(),  # transpose to match shape
            "r": r.flatten(),
            "v": v,
            "df": df,
            "ls_meth": ls_meth
        }
    #Ridge Regression
    elif ls_meth == "ridge":
        xx = x.T @ x
        pen = ridge * np.eye(xx.shape[0]) * xx
        v = np.linalg.solve(xx + pen, np.eye(xx.shape[1]))
        c = y.T @ x @ v
        r = y - x @ c.T
        return {
            "c": c.flatten(),
            "r": r.flatten(),
            "v": v,
            "df": df,
            "ls_meth": ls_meth
        }
    #Singular Value Decomposition
    elif ls_meth == "svd":
        svd = np.linalg.svd(x, full_matrices=False)
        c = svd.Vh @ (((svd.U).T @ y) / svd.S)
        f = x @ c
        r = f - y
        v = np.linalg.solve((svd.Vh).T * svd.S ** 2 @ svd.Vh, (np.eye((svd.S).shape[0])))
        return {
            "c": c.flatten(),
            "r": r.flatten(),
            "v": v,
            "df": df,
            "ls_meth": ls_meth
        }