Source code for imputation.sampler

import numpy as np
from scipy.stats import chi2
[docs] def sym(x): """ Ensures the input square matrix is symmetric by averaging it with its transpose. Parameters ---------- x : np.ndarray A square numpy matrix. Returns ------- np.ndarray A symmetric matrix computed as (x + x.T) / 2. """ return (x + x.T) / 2
[docs] def norm_draw(y, ry, x, rank_adjust=True, rng=None, **kwargs): """ Bayesian linear regression draw of regression coefficients and residual variance, based on the least squares parameters from `estimice()`. This function replicates the `mice.impute.norm.draw()` algorithm from the R mice package, as described in Rubin (1987, p. 167). Parameters ---------- y : np.ndarray Numeric vector of length n, containing the variable to be imputed. ry : np.ndarray of bool Boolean mask vector of length n, where True indicates observed values of `y` and False indicates missing values. x : np.ndarray Numeric design matrix of shape (n, p) with predictors for `y`. Must have no missing values. rank_adjust : bool, optional If True, replaces any NaN coefficients with zeros. This is relevant only when the least squares method is "qr" and the predictor matrix is rank-deficient. Default is True. rng : np.random.Generator, optional Random number generator for reproducibility. If None, a fresh generator is used. **kwargs : dict Additional keyword arguments passed to `estimice()`, e.g., `ls_meth` to specify the least squares method. Returns ------- dict Dictionary containing: - 'coef': Least squares coefficient estimates (numpy array). - 'beta': Bayesian drawn regression coefficients (numpy array). - 'sigma': Drawn residual standard deviation (float). - 'estimation': Least squares method used (string). Notes ----- The residual variance sigma is drawn from a scaled chi-square distribution, and the regression coefficients beta are drawn from a multivariate normal centered at the least squares estimates with variance scaled by sigma. References ---------- Rubin, D. B. (1987). Multiple Imputation for Nonresponse in Surveys. Wiley, p. 167. Examples -------- >>> import numpy as np >>> y = np.array([1.0, 2.0, np.nan, 4.0]) >>> ry = ~np.isnan(y) >>> x = np.array([[1, 0], [1, 1], [1, 2], [1, 3]]) >>> result = norm_draw(y, ry, x, ls_meth='qr') >>> print(result['beta']) """ if rng is None: rng = np.random.default_rng() #Draw from estimice p = estimice(x[ry, :], y[ry], **kwargs) #sqrt(sum((p$r)^2) / rchisq(n = 1,df = p$df)) #one random variate with p$df normal noise #sqrt because we need sigma not sigma^2 for beta later sigma_star = np.sqrt(np.sum(p["r"] ** 2) / chi2.rvs(df = p["df"], size = 1, random_state=rng)) # #cholesky needs matrix to be symmetrical, must be positive definite -> use sym() (A+A.T)/2 # #np.linalg.cholesky returns lower triangular matrix chol = np.linalg.cholesky(sym(p["v"])) # #coef + lower triangular matrix from Cholesky Decomposition * random n draws from standard normal * sigma beta_star = p["c"] + (chol.T @ rng.normal(size=x.shape[1])) * sigma_star #return list parm = { "coef": p["c"], "beta": beta_star, "sigma": sigma_star, "estimation": p["ls_meth"] } #Replaces NaN with 0 if rank_adjust = True if np.any(np.isnan(parm["coef"])) and rank_adjust: parm["coef"] = np.nan_to_num(parm["coef"], nan=0.0) parm["beta"] = np.nan_to_num(parm["beta"], nan=0.0) return parm
[docs] def estimice(x, y, ls_meth="qr", ridge=1e-5): """ Computes least squares estimates, residuals, variance-covariance matrix, and degrees of freedom using different methods: ridge regression, QR decomposition, or Singular Value Decomposition. Parameters ---------- x : np.ndarray Numeric design matrix with shape (n_samples, n_predictors). Must not contain missing values. y : np.ndarray Numeric vector of responses to be imputed, with possible missing values. ls_meth : str, optional Least squares method to use. Options are: - "qr": QR decomposition (default) - "ridge": Ridge regression - "svd": Singular Value Decomposition ridge : float, optional Ridge penalty size for ridge regression. Default is 1e-5, balancing stability and bias. Returns ------- dict Dictionary containing: - 'c': Least squares coefficient estimates (numpy array). - 'r': Residuals (numpy array). - 'v': Variance-covariance matrix of coefficients (numpy array). - 'df': Degrees of freedom (int). - 'ls_meth': Method used (str). Examples -------- >>> import numpy as np >>> x = np.array([[1, 2], [3, 4], [5, 6]]) >>> y = np.array([7, np.nan, 9]) >>> # Assuming you handle missing y externally, e.g. ry = ~np.isnan(y) >>> result = estimice(x[~np.isnan(y)], y[~np.isnan(y)], ls_meth="qr") >>> print(result['c']) [-6. 6.5] """ #degrees of freedom = length of y - number of columns of x, mininmum df = 1 #c = coefficients, f = fitted values, r = residuals df = max(len(y) - x.shape[1], 1) #QR Decomposition if ls_meth == "qr": #QR decomposition qr = np.linalg.qr(x) c = np.linalg.solve(qr.R, (qr.Q).T @ y) f = x @ c r = y - f rr = (qr.R).T @ qr.R v = np.linalg.solve(rr, np.eye(rr.shape[1])) return { "c": c.flatten(), # transpose to match shape "r": r.flatten(), "v": v, "df": df, "ls_meth": ls_meth } #Ridge Regression elif ls_meth == "ridge": xx = x.T @ x pen = ridge * np.eye(xx.shape[0]) * xx v = np.linalg.solve(xx + pen, np.eye(xx.shape[1])) c = y.T @ x @ v r = y - x @ c.T return { "c": c.flatten(), "r": r.flatten(), "v": v, "df": df, "ls_meth": ls_meth } #Singular Value Decomposition elif ls_meth == "svd": svd = np.linalg.svd(x, full_matrices=False) c = svd.Vh @ (((svd.U).T @ y) / svd.S) f = x @ c r = f - y v = np.linalg.solve((svd.Vh).T * svd.S ** 2 @ svd.Vh, (np.eye((svd.S).shape[0]))) return { "c": c.flatten(), "r": r.flatten(), "v": v, "df": df, "ls_meth": ls_meth }