# Source code for plotting.diagnostics

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import logging
import os
from pathlib import Path

# Module-level logger; the logger name is spelled out explicitly here.
logger = logging.getLogger('imputation.plotting.diagnostics')

# NOTE: the three statements below run at import time and change global state
# (NumPy's legacy global RNG seed and pandas options) for the importing process.
np.random.seed(42)
# Turn off pandas' chained-assignment check (None = no warning, no error).
pd.set_option('mode.chained_assignment', None)
# Do not truncate the number of columns when DataFrames are displayed.
pd.set_option('display.max_columns', None)


def _ensure_directory(file_path):
    """
    Make sure the parent directory of a file path is present.

    Missing parent directories are created. Nothing is done when the path
    has no directory component or when the directory already exists.

    Parameters
    ----------
    file_path : str
        Path to a file (not a directory)
    """
    # The head of os.path.split is exactly what os.path.dirname returns.
    parent, _ = os.path.split(file_path)

    # Bare file name: there is no directory to create.
    if not parent:
        return

    # Already on disk: leave it untouched and stay silent.
    if os.path.exists(parent):
        return

    os.makedirs(parent, exist_ok=True)
    logger.debug(f"Created directory: {parent}")

def stripplot(imputed_datasets, missing_pattern, columns=None, merge_imputations=False,
              observed_color='blue', imputed_color='red', save_path=None):
    """
    Draw one strip plot per column comparing observed and imputed values.

    Every selected column gets its own panel. Observed values are read from
    the first imputed dataset; imputed values are read, for each dataset,
    from the rows flagged as missing in ``missing_pattern``.

    Parameters
    ----------
    imputed_datasets : list of pandas.DataFrame
        List of DataFrames containing imputed values
    missing_pattern : pandas.DataFrame
        DataFrame indicating missing values (0 where missing, 1 where observed)
    columns : list of str, optional
        Column names to plot. If None, every column with missing values is
        plotted. Names that are unknown or have no missing values are dropped.
    merge_imputations : bool, default False
        If True, each panel has two groups: observed values only, and observed
        values overlaid with the imputed values of all datasets pooled.
        If False, each panel has an observed group followed by one group per
        imputation containing observed plus that imputation's values.
    observed_color : str, default 'blue'
        Color for observed values
    imputed_color : str, default 'red'
        Color for imputed values
    save_path : str, optional
        If provided, save the plot to this path instead of displaying it
    """
    def _labelled(values, kind, group):
        # Long-format frame in which every row shares one type and one x-group.
        size = len(values)
        return pd.DataFrame({
            'value': values,
            'type': [kind] * size,
            'imputation': [group] * size
        })

    # Default selection: every column that has at least one missing entry.
    if columns is None:
        candidates = missing_pattern.columns[missing_pattern.eq(0).any()]
    else:
        candidates = columns

    # Keep only known columns that actually contain missing values.
    columns = [
        name for name in candidates
        if name in missing_pattern.columns and missing_pattern[name].eq(0).any()
    ]
    if len(columns) == 0:
        print("No columns with missing values to plot")
        return

    # One row of panels; squeeze=False keeps a 2-D array even for one column.
    n_cols = len(columns)
    fig, axes_grid = plt.subplots(1, n_cols, figsize=(5*n_cols, 5), squeeze=False)
    colour_by_type = {'Observed': observed_color, 'Imputed': imputed_color}

    for ax, col in zip(axes_grid[0], columns):
        # Rows where the pattern equals 1 are observed; the rest were imputed.
        observed_mask = missing_pattern[col] == 1
        observed = imputed_datasets[0].loc[observed_mask, col]
        observed_values = observed.values

        if merge_imputations:
            # Pool the imputed values of every dataset into one flat list.
            pooled = [
                value
                for df in imputed_datasets
                for value in df.loc[~observed_mask, col].values
            ]
            frames = [
                _labelled(observed_values, 'Observed', 'Observed Only'),
                _labelled(observed_values, 'Observed', 'Observed + Imputed'),
                _labelled(pooled, 'Imputed', 'Observed + Imputed'),
            ]
        else:
            frames = [_labelled(observed_values, 'Observed', 'Observed')]
            for number, df in enumerate(imputed_datasets, 1):
                imp_values = df.loc[~observed_mask, col]
                n_obs, n_imp = len(observed), len(imp_values)
                # The same observed values are repeated inside every imputation group.
                frames.append(pd.DataFrame({
                    'value': np.concatenate([observed_values, imp_values.values]),
                    'type': ['Observed'] * n_obs + ['Imputed'] * n_imp,
                    'imputation': [f'Imp {number}'] * (n_obs + n_imp)
                }))

        plot_data = pd.concat(frames, ignore_index=True)

        sns.stripplot(
            data=plot_data,
            x='imputation',
            y='value',
            hue='type',
            ax=ax,
            jitter=True,
            palette=colour_by_type,
            legend=False,
            alpha=0.6,
            linewidth=0.5
        )

        ax.set_title(f'{col}')
        ax.set_ylabel('Value')
        ax.set_xlabel('')

        # Slanted tick labels stay readable when there are many imputations.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    plt.tight_layout()

    if not save_path:
        plt.show()
        return

    _ensure_directory(save_path)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
def bwplot(imputed_datasets, missing_pattern, columns=None, merge_imputations=False,
           observed_color='blue', imputed_color='red', save_path=None):
    """
    Create box-and-whisker plots for imputed data showing observed and imputed values.

    Each selected column gets its own panel. The first box shows the observed
    values (taken from the first imputed dataset); the following boxes show
    imputed values only. Medians are drawn as round markers instead of lines,
    and whiskers are dashed.

    Parameters
    ----------
    imputed_datasets : list of pandas.DataFrame
        List of DataFrames containing imputed values
    missing_pattern : pandas.DataFrame
        DataFrame indicating missing values (0 where missing, 1 where observed)
    columns : list of str, optional
        List of column names to plot. If None, plots all columns with missing
        values. Names that are unknown or have no missing values are dropped.
    merge_imputations : bool, default False
        If True, combines all imputed values into a single boxplot.
        If False, shows separate boxplots for each imputation.
    observed_color : str, default 'blue'
        Color for observed values
    imputed_color : str, default 'red'
        Color for imputed values
    save_path : str, optional
        If provided, save the plot to this path instead of displaying it

    Returns
    -------
    None
        The figure is either shown or written to ``save_path``. When no
        column qualifies a message is printed and nothing is drawn.
    """
    # If no columns specified, use all columns with missing values
    if columns is None:
        columns = missing_pattern.columns[missing_pattern.eq(0).any()]

    # Filter columns to only those with missing values
    columns = [col for col in columns
               if col in missing_pattern.columns and missing_pattern[col].eq(0).any()]

    if not columns:
        print("No columns with missing values to plot")
        return

    # squeeze=False always yields a 2-D axes array, so a single column needs
    # no special-casing.
    n_cols = len(columns)
    fig, axes = plt.subplots(1, n_cols, figsize=(5 * n_cols, 5), squeeze=False)
    palette = {'Observed': observed_color, 'Imputed': imputed_color}

    for ax, col in zip(axes[0], columns):
        # Rows where the pattern equals 1 are observed; the rest were imputed.
        observed_mask = missing_pattern[col] == 1
        observed = imputed_datasets[0].loc[observed_mask, col]

        # The observed box always comes first.
        frames = [pd.DataFrame({
            'value': observed.values,
            'type': ['Observed'] * len(observed),
            'imputation': ['Observed'] * len(observed)
        })]

        if merge_imputations:
            # Pool the imputed values of every dataset into a single box.
            all_imp_values = [
                value
                for df in imputed_datasets
                for value in df.loc[~observed_mask, col].values
            ]
            frames.append(pd.DataFrame({
                'value': all_imp_values,
                'type': ['Imputed'] * len(all_imp_values),
                'imputation': ['Imputed'] * len(all_imp_values)
            }))
        else:
            # One box per imputation, imputed values only.
            for i, df in enumerate(imputed_datasets, 1):
                imp_values = df.loc[~observed_mask, col]
                frames.append(pd.DataFrame({
                    'value': imp_values.values,
                    'type': ['Imputed'] * len(imp_values),
                    'imputation': [f'Imp {i}'] * len(imp_values)
                }))

        plot_data = pd.concat(frames, ignore_index=True)

        # Box order = order of first appearance. Passing it explicitly lets the
        # median markers below reuse exactly the positions seaborn draws at,
        # even when a group (e.g. 'Observed') has no rows and is absent.
        order = list(dict.fromkeys(plot_data['imputation']))

        sns.boxplot(
            data=plot_data,
            x='imputation',
            y='value',
            hue='type',
            order=order,
            # Each x category holds a single hue level. Without dodge=False
            # older seaborn releases shift the boxes sideways to make room for
            # the other hue level, so they no longer sit on the integer
            # positions used for the median markers.
            dodge=False,
            ax=ax,
            palette=palette,
            width=0.8,
            showfliers=False,
            showcaps=True,
            showmeans=False,
            medianprops={'visible': False},
            boxprops={'alpha': 0.6},
            # Dash the whiskers through their own props. Restyling every
            # 2-point line afterwards would also dash the caps.
            whiskerprops={'alpha': 0.6, 'linestyle': '--'},
            capprops={'alpha': 0.6}
        )

        # Median markers, one per box, coloured by the group's type.
        for position, label in enumerate(order):
            group = plot_data[plot_data['imputation'] == label]
            ax.plot(position, group['value'].median(), 'o',
                    color=palette[group['type'].iloc[0]],
                    alpha=0.6, markersize=6, zorder=3)

        # Customize plot
        ax.set_title(f'{col}')
        ax.set_ylabel('Value')
        ax.set_xlabel('')

        # Rotate x-axis labels for better readability
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    plt.tight_layout()

    if save_path:
        _ensure_directory(save_path)
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close(fig)
    else:
        plt.show()
def densityplot(imputed_datasets, missing_pattern, columns=None,
                observed_color='blue', imputed_color='red', save_path=None):
    """
    Overlay kernel density estimates of observed and imputed values.

    Each selected column gets one panel with a thicker curve for the observed
    values (read from the first imputed dataset) and one curve per imputed
    dataset for the values that were filled in.

    Parameters
    ----------
    imputed_datasets : list of pandas.DataFrame
        List of DataFrames containing imputed values
    missing_pattern : pandas.DataFrame
        DataFrame indicating missing values (0 where missing, 1 where observed)
    columns : list of str, optional
        Column names to plot. If None, every column with missing values is
        plotted. Names that are unknown or have no missing values are dropped.
    observed_color : str, default 'blue'
        Color for observed values
    imputed_color : str, default 'red'
        Color for imputed values
    save_path : str, optional
        If provided, save the plot to this path instead of displaying it
    """
    # Default selection: every column that has at least one missing entry.
    if columns is None:
        candidates = missing_pattern.columns[missing_pattern.eq(0).any()]
    else:
        candidates = columns

    # Keep only known columns that actually contain missing values.
    columns = [
        name for name in candidates
        if name in missing_pattern.columns and missing_pattern[name].eq(0).any()
    ]
    if len(columns) == 0:
        print("No columns with missing values to plot")
        return

    # One row of panels; squeeze=False keeps a 2-D array even for one column.
    n_cols = len(columns)
    fig, axes_grid = plt.subplots(1, n_cols, figsize=(5*n_cols, 5), squeeze=False)

    for ax, col in zip(axes_grid[0], columns):
        # Rows where the pattern equals 1 are observed; the rest were imputed.
        observed_mask = missing_pattern[col] == 1
        imputed_rows = ~observed_mask
        observed = imputed_datasets[0].loc[observed_mask, col]

        # Observed distribution, drawn with a heavier line.
        sns.kdeplot(data=observed, ax=ax, color=observed_color,
                    label='Observed', alpha=0.6, linewidth=2.5)

        # One curve per imputation; only the first carries a legend label so
        # the legend shows a single 'Imputed' entry.
        for index, df in enumerate(imputed_datasets):
            curve_label = None if index else 'Imputed'
            sns.kdeplot(data=df.loc[imputed_rows, col], ax=ax,
                        color=imputed_color, label=curve_label, alpha=0.6)

        ax.set_title(f'{col}')
        ax.set_xlabel('Value')
        ax.set_ylabel('Density')

        # The legend is drawn only on the panel of the first column.
        if col == columns[0]:
            handles, labels = ax.get_legend_handles_labels()
            if handles:
                ax.legend(handles, labels, title='')

    plt.tight_layout()

    if not save_path:
        plt.show()
        return

    _ensure_directory(save_path)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
def densityplot_split(imputed_datasets, missing_pattern, column,
                      observed_color='blue', imputed_color='red', save_path=None):
    """
    Draw the observed density and each imputation's density in separate panels.

    The first panel shows the KDE of the observed values (read from the first
    imputed dataset). Every following panel shows the KDE of one dataset's
    imputed values; when those values are all identical a dashed vertical
    line is drawn at that value instead of a KDE.

    Parameters
    ----------
    imputed_datasets : list of pandas.DataFrame
        List of DataFrames containing imputed values
    missing_pattern : pandas.DataFrame
        DataFrame indicating missing values (0 where missing, 1 where observed)
    column : str
        Name of the column to plot
    observed_color : str, default 'blue'
        Color for observed values
    imputed_color : str, default 'red'
        Color for imputed values
    save_path : str, optional
        If provided, save the plot to this path instead of displaying it
    """
    # Guard clauses: unknown column, or nothing was imputed in it.
    if column not in missing_pattern.columns:
        print(f"Column {column} not found in the data")
        return
    if not missing_pattern[column].eq(0).any():
        print(f"No missing values in column {column}")
        return

    # Rows where the pattern equals 1 are observed; the rest were imputed.
    observed_mask = missing_pattern[column] == 1
    imputed_rows = ~observed_mask
    observed = imputed_datasets[0].loc[observed_mask, column]

    # One panel for the observed data plus one per imputation.
    n_plots = len(imputed_datasets) + 1

    # Grid shape: 1 row up to 3 panels, 2 rows up to 6, 3 rows up to 9,
    # otherwise 4 rows; columns = ceil(n_plots / n_rows).
    n_rows = min(4, (n_plots + 2) // 3)
    n_cols = (n_plots + n_rows - 1) // n_rows

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 5*n_rows))
    axes = axes.ravel()  # 1-D view so panels can be addressed by position

    # Panel 0: observed distribution.
    observed_panel = axes[0]
    sns.kdeplot(data=observed, ax=observed_panel, color=observed_color,
                alpha=0.6, linewidth=2.5)
    observed_panel.set_title('Observed')
    observed_panel.set_xlabel('Value')
    observed_panel.set_ylabel('Density')

    # Panels 1..n: one per imputed dataset.
    for panel, (number, df) in zip(axes[1:], enumerate(imputed_datasets, 1)):
        imp_values = df.loc[imputed_rows, column]

        if imp_values.nunique() > 1:
            # Enough spread for a density estimate.
            sns.kdeplot(data=imp_values, ax=panel, color=imputed_color, alpha=0.6)
            panel.set_title(f'Imp {number}')
        else:
            # All imputed values coincide: mark the value with a vertical line.
            value = imp_values.iloc[0]
            panel.axvline(x=value, color=imputed_color, alpha=0.6, linestyle='--')
            panel.set_title(f'Imp {number} (constant value: {value:.2f})')

        panel.set_xlabel('Value')
        panel.set_ylabel('Density')

    # Grid cells beyond the last used panel are hidden.
    for spare in axes[n_plots:]:
        spare.set_visible(False)

    fig.suptitle(f'Density plots for {column}', y=1.02)
    plt.tight_layout()

    if not save_path:
        plt.show()
        return

    _ensure_directory(save_path)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
def xyplot(imputed_datasets, missing_pattern, x, y, merge_imputations=False,
           observed_color='blue', imputed_color='red', save_path=None):
    """
    Create scatter plots of two columns, showing observed and imputed values.

    Rows are split by the missingness of ``y``: rows where ``y`` was observed
    are drawn in ``observed_color``, rows where ``y`` was imputed are drawn in
    ``imputed_color``.

    Parameters
    ----------
    imputed_datasets : list of pandas.DataFrame
        List of DataFrames containing imputed values
    missing_pattern : pandas.DataFrame
        DataFrame indicating missing values (0 where missing, 1 where observed)
    x : str
        Name of the column to plot on x-axis
    y : str
        Name of the column to plot on y-axis
    merge_imputations : bool, default False
        If True, shows all imputations on a single plot.
        If False, shows n+1 plots: first plot with only observed data,
        followed by one plot for each imputation.
    observed_color : str, default 'blue'
        Color for observed values
    imputed_color : str, default 'red'
        Color for imputed values
    save_path : str, optional
        If provided, save the plot to this path instead of displaying it

    Returns
    -------
    None
        The figure is either shown or written to ``save_path``. When ``x`` or
        ``y`` is not a column of ``missing_pattern`` a message is printed and
        nothing is drawn.
    """
    # Check if columns exist and provide specific error messages
    missing_cols = []
    if x not in missing_pattern.columns:
        missing_cols.append(f"x-axis column '{x}'")
    if y not in missing_pattern.columns:
        missing_cols.append(f"y-axis column '{y}'")

    if missing_cols:
        print(f"Error: The following columns are not found in the data: {', '.join(missing_cols)}")
        # Column labels are not guaranteed to be strings (e.g. integer labels),
        # so convert them before joining; otherwise this diagnostic itself
        # raises TypeError.
        available = ', '.join(str(name) for name in missing_pattern.columns)
        print(f"Available columns are: {available}")
        return

    # Rows where y was observed (pattern == 1); the complement was imputed.
    observed_mask = missing_pattern[y] == 1

    if merge_imputations:
        # Create a single plot
        fig, ax = plt.subplots(figsize=(8, 6))

        # Plot observed data (taken from the first dataset)
        observed_x = imputed_datasets[0].loc[observed_mask, x]
        observed_y = imputed_datasets[0].loc[observed_mask, y]
        ax.scatter(observed_x, observed_y, color=observed_color, alpha=0.6, label='Observed')

        # Plot imputed data from all imputations
        for i, df in enumerate(imputed_datasets):
            imp_x = df.loc[~observed_mask, x]
            imp_y = df.loc[~observed_mask, y]
            # Only the first imputation is labelled, so the legend has one entry
            label = f'Imputed ({y})' if i == 0 else None
            ax.scatter(imp_x, imp_y, color=imputed_color, alpha=0.6, label=label)

        # Customize plot
        ax.set_xlabel(x)
        ax.set_ylabel(y)
        ax.legend()
        ax.set_title(f'Scatter plot of {x} vs {y} (all imputations)')

    else:
        # One panel for the observed data plus one per imputation
        n_plots = len(imputed_datasets) + 1

        # Determine number of rows and columns
        if n_plots <= 3:
            n_rows, n_cols = 1, n_plots
        elif n_plots <= 6:
            n_rows, n_cols = 2, (n_plots + 1) // 2
        elif n_plots <= 9:
            n_rows, n_cols = 3, (n_plots + 2) // 3
        else:
            n_rows, n_cols = 4, (n_plots + 3) // 4

        # squeeze=False guarantees an array of axes regardless of grid size,
        # so it can always be flattened for positional indexing.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 5*n_rows), squeeze=False)
        axes = axes.ravel()

        # Plot observed data only in the first subplot
        observed_x = imputed_datasets[0].loc[observed_mask, x]
        observed_y = imputed_datasets[0].loc[observed_mask, y]
        axes[0].scatter(observed_x, observed_y, color=observed_color, alpha=0.6, label='Observed')
        axes[0].set_xlabel(x)
        axes[0].set_ylabel(y)
        axes[0].legend()
        axes[0].set_title('Observed')

        # Plot each imputation: observed rows first, imputed rows on top
        for i, df in enumerate(imputed_datasets, 1):
            observed_x = df.loc[observed_mask, x]
            observed_y = df.loc[observed_mask, y]
            axes[i].scatter(observed_x, observed_y, color=observed_color, alpha=0.6,
                            label=f'Observed ({y})')

            imp_x = df.loc[~observed_mask, x]
            imp_y = df.loc[~observed_mask, y]
            axes[i].scatter(imp_x, imp_y, color=imputed_color, alpha=0.6,
                            label=f'Imputed ({y})')

            # Customize plot
            axes[i].set_xlabel(x)
            axes[i].set_ylabel(y)
            axes[i].legend()
            axes[i].set_title(f'Imp {i}')

        # Hide any unused subplots
        for spare in axes[n_plots:]:
            spare.set_visible(False)

        # Add overall title
        fig.suptitle(f'Scatter plots of {x} vs {y}', y=1.02)

    plt.tight_layout()

    if save_path:
        _ensure_directory(save_path)
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close(fig)
    else:
        plt.show()
def plot_chain_stats(chain_mean, chain_var, columns=None, figsize=(10, 5), save_path=None):
    """
    Plot per-iteration chain means and variances for the given columns.

    One row of two panels is drawn per column: the left panel shows the mean
    of each chain across iterations, the right panel shows the variance.

    Parameters
    ----------
    chain_mean : Dict[str, np.ndarray]
        Dictionary where each key is a column name and each value is a 2-D array
        of shape (n_iter, n_imputations) containing the means of the newly
        imputed values.
    chain_var : Dict[str, np.ndarray]
        Same structure as `chain_mean` but for the variance of the imputed values.
    columns : list of str, optional
        Columns to plot. If None, plots all keys present in `chain_mean`.
        Columns missing from either dictionary are skipped with a warning.
    figsize : tuple, default (10, 5)
        Base size of a single row (width, height). The final figure will be
        scaled according to the number of rows.
    save_path : str, optional
        If provided, save the plot to this path instead of displaying it.

    Returns
    -------
    None
        The figure is either shown or written to ``save_path``. When there is
        nothing valid to plot a warning is logged and nothing is drawn.
    """
    logger.info("Generating chain statistics plots.")

    # If columns are not specified, use all available from chain_mean.
    # list() also makes the emptiness check below safe for array-like input.
    columns = list(chain_mean.keys()) if columns is None else list(columns)

    if not columns:
        logger.warning("No columns specified or found to plot.")
        return

    # Keep only columns that have statistics in both dictionaries
    valid_columns = []
    for col in columns:
        if col not in chain_mean or col not in chain_var:
            logger.warning(f"Statistics for column '{col}' not found in both chain_mean and chain_var. Skipping.")
            continue
        valid_columns.append(col)

    if not valid_columns:
        logger.warning("No valid columns found to plot.")
        return

    # Convert once so list-like inputs work with the [:, chain] indexing below.
    matrices = {
        col: (np.asarray(chain_mean[col]), np.asarray(chain_var[col]))
        for col in valid_columns
    }

    n_rows = len(valid_columns)
    fig, axes = plt.subplots(n_rows, 2, figsize=(figsize[0], figsize[1] * n_rows), squeeze=False)

    # Size the palette for the widest matrix of any column. Sizing it from the
    # first column alone makes palette[chain_idx] fail when another column, or
    # a variance matrix, has more chains.
    n_chains = max(
        matrix.shape[1]
        for pair in matrices.values()
        for matrix in pair
    )
    palette = sns.color_palette("husl", n_colors=n_chains)

    panel_specs = (("Mean", 0), ("Variance", 1))

    for row_idx, col in enumerate(valid_columns):
        for (stat_name, col_idx), matrix in zip(panel_specs, matrices[col]):
            ax = axes[row_idx, col_idx]

            # Each matrix gets its own iteration axis, so mean and variance
            # matrices of different lengths can still be plotted.
            iterations = np.arange(1, matrix.shape[0] + 1)

            for chain_idx in range(matrix.shape[1]):
                ax.plot(iterations, matrix[:, chain_idx], marker='o', alpha=0.7,
                        label=f'Chain {chain_idx + 1}', color=palette[chain_idx])

            ax.set_title(f"{col} – {stat_name}")
            ax.set_xlabel("Iteration")
            ax.set_ylabel(stat_name)

            # The chain legend appears once, on the first mean panel.
            if row_idx == 0 and col_idx == 0:
                ax.legend(loc='best')

    plt.tight_layout()

    if save_path:
        _ensure_directory(save_path)
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close(fig)
    else:
        plt.show()