Source code for imputation.sample

import numpy as np
import pandas as pd
from typing import Union, Optional


[docs]
def sample(
    y: Union[pd.Series, np.ndarray],
    id_obs: np.ndarray,
    x: Union[pd.DataFrame, np.ndarray],
    id_mis: Optional[np.ndarray] = None,
    rng: Optional[np.random.Generator] = None,
    **kwargs
) -> np.ndarray:
    """
    Impute missing values by random sampling from observed values.

    This function is designed to be compatible with the MICE framework and
    keeps the same call signature as the other imputation methods, which is
    why it accepts predictors and extra keyword arguments it never reads.

    Parameters
    ----------
    y : Union[pd.Series, np.ndarray]
        Target variable with missing values.
    id_obs : np.ndarray
        Boolean mask of observed values in y (True for observed, False for
        missing). Applied positionally.
    x : Union[pd.DataFrame, np.ndarray]
        Predictor variables (not used in this method, kept for interface
        consistency).
    id_mis : np.ndarray, optional
        Boolean mask of the positions to impute. If None, uses ``~id_obs``.
        Only the number of True entries is used.
    rng : np.random.Generator, optional
        Random number generator for reproducibility. If None, a fresh
        generator is created with ``np.random.default_rng()``.
    **kwargs : dict
        Additional arguments (not used in this method).

    Returns
    -------
    np.ndarray
        One imputed value per True entry of ``id_mis`` (values for the
        missing positions only, not the completed vector).

    Notes
    -----
    The observed values of ``y`` are sampled uniformly with replacement; no
    model is fitted.

    Edge cases:

    - No observed values: standard-normal draws are returned for numeric
      (including boolean) dtypes, and an object array of ``None`` for any
      other dtype (object, categorical, string, ...).
    - Exactly one observed value: that value is repeated for every position
      to impute; no random numbers are consumed.
    """
    id_obs = np.asarray(id_obs, dtype=bool)
    id_mis = ~id_obs if id_mis is None else np.asarray(id_mis, dtype=bool)

    if rng is None:
        rng = np.random.default_rng()

    n_mis = int(np.count_nonzero(id_mis))

    # Work on a plain ndarray so that indexing below is always positional.
    # Indexing a pandas Series with [0] is label-based and fails when the
    # observed rows do not include index label 0.
    values = np.asarray(y)
    y_obs = values[id_obs]

    if y_obs.size == 0:
        # Nothing to sample from: fall back on a dtype-dependent placeholder.
        # Prefer y's own dtype so pandas extension dtypes (e.g. categorical)
        # are recognised as non-numeric.
        dtype = getattr(y, "dtype", values.dtype)
        if pd.api.types.is_numeric_dtype(dtype):
            return rng.normal(0.0, 1.0, n_mis)
        # No meaningful value can be invented for non-numeric data; the
        # caller has to deal with the None placeholders.
        return np.full(n_mis, None, dtype=object)

    if y_obs.size == 1:
        # A single donor: every imputation is that value. Keep the donor
        # array's dtype so the result matches the general branch.
        return np.full(n_mis, y_obs[0], dtype=y_obs.dtype)

    # General case: draw donors uniformly with replacement.
    return rng.choice(y_obs, size=n_mis, replace=True)