Module likelihood.tools.impute
Classes
class SimpleImputer (n_features: None | int = None, use_scaler: bool = False)
-
Expand source code
class SimpleImputer: """Multiple imputation using simulation engine.""" def __init__(self, n_features: int | None = None, use_scaler: bool = False): """ Initialize the imputer. Parameters ---------- n_features: int | None Number of features to be used in the imputer. Default is None. use_scaler: bool Whether to use a scaler. Default is False. """ self.n_features = n_features self.sim = SimulationEngine(use_scaler=use_scaler) self.params = {} self.cols_transf = pd.Series([]) def fit(self, X: pd.DataFrame) -> None: """ Fit the imputer to the data. Parameters ---------- X: pd.DataFrame Dataframe to fit the imputer to. """ X_impute = X.copy() self.params = self._get_dict_params(X_impute) X_impute = self.sim._clean_data(X_impute) if X_impute.empty: raise ValueError( "The dataframe is empty after cleaning, it is not possible to train the imputer." ) self.n_features = self.n_features or X_impute.shape[1] - 1 self.sim.fit(X_impute, self.n_features) def transform( self, X: pd.DataFrame, boundary: bool = True, inplace: bool = True ) -> pd.DataFrame: """ Impute missing values in the data. Parameters ----------- X: pd.DataFrame Dataframe to impute missing values. boundary: bool Whether to use the boundaries of the data to impute missing values. Default is True. inplace: bool Whether to modify the columns of the original dataframe or return new ones. Default is True. 
""" X_impute = X.copy() self.cols_transf = X_impute.columns for column in X_impute.columns: if X_impute[column].isnull().sum() > 0: if not X_impute[column].dtype == "object": min_value = self.params[column]["min"] max_value = self.params[column]["max"] to_compare = self.params[column]["to_compare"] for row in X_impute.index: if pd.isnull(X_impute.loc[row, column]): value_impute = self._check_dtype_convert( self.sim.predict( self._set_zero(X_impute.loc[row, :], column), column, )[0], to_compare, ) if not X_impute[column].dtype == "object" and boundary: if value_impute < min_value: value_impute = min_value if value_impute > max_value: value_impute = max_value X_impute.loc[row, column] = value_impute else: self.cols_transf = self.cols_transf.drop(column) if not inplace: X_impute = X_impute[self.cols_transf].copy() X_impute = X_impute.rename( columns={column: column + "_imputed" for column in self.cols_transf} ) X_impute = X.join(X_impute, rsuffix="_imputed") order_cols = [] for column in X.columns: if column + "_imputed" in X_impute.columns: order_cols.append(column) order_cols.append(column + "_imputed") else: order_cols.append(column) X_impute = X_impute[order_cols] return X_impute def fit_transform( self, X: pd.DataFrame, boundary: bool = True, inplace: bool = True ) -> pd.DataFrame: """ Fit and transform the data. Parameters ----------- X: pd.DataFrame Dataframe to fit and transform. boundary: bool Whether to use the boundaries of the data to impute missing values. Default is True. inplace: bool Whether to modify the columns of the original dataframe or return new ones. Default is True. """ X_train = X.copy() self.fit(X_train) return self.transform(X, boundary, inplace) def _set_zero(self, X: pd.Series, column_exception) -> pd.DataFrame: """ Set missing values to zero, except for `column_exception`. Parameters ----------- X: pd.Series Series to set missing values to zero. 
""" X = X.copy() for column in X.index: if pd.isnull(X[column]) and column != column_exception: X[column] = 0 data = X.to_frame().T return data def _check_dtype_convert(self, value: Union[int, float], to_compare: Union[int, float]) -> None: """ Check if the value is an integer and convert it to float if it is. Parameters ----------- value: Union[int, float] Value to check and convert. to_compare: Union[int, float] Value to compare to. """ if isinstance(to_compare, int) and isinstance(value, float): value = int(round(value, 0)) if isinstance(to_compare, float) and isinstance(value, float): value = round(value, len(str(to_compare).split(".")[1])) return value def _get_dict_params(self, df: pd.DataFrame) -> dict: """ Get the parameters for the imputer. Parameters ----------- df: pd.DataFrame Dataframe to get the parameters from. """ params = {} for column in df.columns: if df[column].isnull().sum() > 0: if not df[column].dtype == "object": to_compare = df[column].dropna().sample().values[0] params[column] = { "min": df[column].min(), "to_compare": to_compare, "max": df[column].max(), } return params def eval(self, X: pd.DataFrame) -> None: """ Create a histogram of the imputed values. Parameters ----------- X: pd.DataFrame Dataframe to create the histogram from. 
""" if not isinstance(X, pd.DataFrame): raise ValueError("Input X must be a pandas DataFrame.") df = X.copy() imputed_cols = [col for col in df.columns if col.endswith("_imputed")] num_impute = len(imputed_cols) if num_impute == 0: print("No imputed columns found in the DataFrame.") return try: ncols, nrows = find_multiples(num_impute) except ValueError as e: print(f"Error finding multiples for {num_impute}: {e}") ncols = 1 nrows = num_impute _, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 5 * nrows)) axes = axes.flatten() if isinstance(axes, np.ndarray) else [axes] for i, col in enumerate(imputed_cols): original_col = col.replace("_imputed", "") if original_col in df.columns: original_col_data = df[original_col].dropna() ax = axes[i] # Plot the original data sns.histplot( original_col_data, kde=True, color="blue", label=f"Original", bins=10, ax=ax, ) # Plot the imputed data sns.histplot( df[col], kde=True, color="red", label=f"Imputed", bins=10, ax=ax, ) ax.set_xlabel(original_col) ax.set_ylabel("Frequency" if i % ncols == 0 else "") ax.legend(loc="upper right") plt.suptitle("Histogram Comparison", fontsize=16, fontweight="bold") plt.tight_layout() plt.subplots_adjust(top=0.9) plt.show() def save(self, filename: str = "./imputer") -> None: """ Save the state of the SimpleImputer to a file. Parameters ----------- filename: str Name of the file to save the imputer to. Default is "./imputer". """ filename = filename if filename.endswith(".pkl") else filename + ".pkl" with open(filename, "wb") as f: pickle.dump(self, f) @staticmethod def load(filename: str = "./imputer"): """ Load the state of a SimpleImputer from a file. Parameters ----------- filename: str Name of the file to load the imputer from. Default is "./imputer". """ filename = filename + ".pkl" if not filename.endswith(".pkl") else filename with open(filename, "rb") as f: return pickle.load(f)
Multiple imputation using a simulation engine.
Initialize the imputer.
Parameters
n_features
:int | None
- Number of features to be used in the imputer. Default is None.
use_scaler
:bool
- Whether to use a scaler. Default is False.
Static methods
def load(filename: str = './imputer')
-
Expand source code
@staticmethod
def load(filename: str = "./imputer"):
    """
    Restore a previously saved SimpleImputer from disk.

    NOTE: the file is read with `pickle`; only load files from a trusted
    source, since unpickling can execute arbitrary code.

    Parameters
    -----------
    filename: str
        Path of the pickle file. ".pkl" is appended when the name does not
        already end with it. Default is "./imputer".

    Returns
    -------
    The object stored in the pickle file.
    """
    if not filename.endswith(".pkl"):
        filename += ".pkl"
    with open(filename, "rb") as handle:
        restored = pickle.load(handle)
    return restored
Load the state of a SimpleImputer from a file.
Parameters
filename
:str
- Name of the file to load the imputer from. Default is "./imputer".
Methods
def eval(self, X: pandas.core.frame.DataFrame) ‑> None
-
Expand source code
def eval(self, X: pd.DataFrame) -> None:
    """
    Create a histogram of the imputed values.

    Every column whose name ends in `_imputed` is paired with the column of
    the same name without that suffix, and both are drawn as overlaid
    histograms on one axis.

    Parameters
    -----------
    X: pd.DataFrame
        Dataframe to create the histogram from.

    Raises
    ------
    ValueError
        If `X` is not a pandas DataFrame.
    """
    if not isinstance(X, pd.DataFrame):
        raise ValueError("Input X must be a pandas DataFrame.")

    df = X.copy()
    suffix = "_imputed"
    # Non-string labels cannot carry the suffix; skip them.
    imputed_cols = [
        col for col in df.columns if isinstance(col, str) and col.endswith(suffix)
    ]
    num_impute = len(imputed_cols)

    if num_impute == 0:
        print("No imputed columns found in the DataFrame.")
        return

    try:
        ncols, nrows = find_multiples(num_impute)
    except ValueError as e:
        print(f"Error finding multiples for {num_impute}: {e}")
        ncols = 1
        nrows = num_impute

    _, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 5 * nrows))
    # `plt.subplots` returns a bare Axes for a 1x1 grid.
    axes = axes.flatten() if isinstance(axes, np.ndarray) else [axes]

    for i, col in enumerate(imputed_cols):
        # Strip only the trailing suffix, not inner occurrences.
        original_col = col[: -len(suffix)]
        if original_col not in df.columns:
            continue

        original_col_data = df[original_col].dropna()
        ax = axes[i]

        # Plot the original data
        sns.histplot(
            original_col_data,
            kde=True,
            color="blue",
            label="Original",
            bins=10,
            ax=ax,
        )

        # Plot the imputed data
        sns.histplot(
            df[col],
            kde=True,
            color="red",
            label="Imputed",
            bins=10,
            ax=ax,
        )

        ax.set_xlabel(original_col)
        # Only the first column of the grid gets a y-label.
        ax.set_ylabel("Frequency" if i % ncols == 0 else "")
        ax.legend(loc="upper right")

    plt.suptitle("Histogram Comparison", fontsize=16, fontweight="bold")
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
Create a histogram of the imputed values.
Parameters
X
:pd.DataFrame
- Dataframe to create the histogram from.
def fit(self, X: pandas.core.frame.DataFrame) ‑> None
-
Expand source code
def fit(self, X: pd.DataFrame) -> None:
    """
    Train the imputer on a dataframe.

    Parameters
    ----------
    X: pd.DataFrame
        Training data. A copy is used, so `X` itself is not modified here.

    Raises
    ------
    ValueError
        If the dataframe is empty after the engine's cleaning step.
    """
    working = X.copy()
    # Column parameters come from the raw data, before cleaning.
    self.params = self._get_dict_params(working)
    cleaned = self.sim._clean_data(working)

    if cleaned.empty:
        raise ValueError(
            "The dataframe is empty after cleaning, it is not possible to train the imputer."
        )

    # Fall back to "all columns but one" when no feature count was given.
    if not self.n_features:
        self.n_features = cleaned.shape[1] - 1
    self.sim.fit(cleaned, self.n_features)
Fit the imputer to the data.
Parameters
X
:pd.DataFrame
- Dataframe to fit the imputer to.
def fit_transform(self, X: pandas.core.frame.DataFrame, boundary: bool = True, inplace: bool = True) ‑> pandas.core.frame.DataFrame
-
Expand source code
def fit_transform(
    self, X: pd.DataFrame, boundary: bool = True, inplace: bool = True
) -> pd.DataFrame:
    """
    Fit the imputer on `X`, then impute `X`.

    Parameters
    -----------
    X: pd.DataFrame
        Dataframe to fit on and to impute.
    boundary: bool
        Passed to `transform`: whether to use the boundaries of the data to
        impute missing values. Default is True.
    inplace: bool
        Passed to `transform`: whether to modify the columns of the original
        dataframe or return new ones. Default is True.

    Returns
    -------
    pd.DataFrame
        The result of `transform` on `X`.
    """
    # Fit on a copy; the untouched `X` is what gets transformed.
    self.fit(X.copy())
    imputed = self.transform(X, boundary, inplace)
    return imputed
Fit and transform the data.
Parameters
X
:pd.DataFrame
- Dataframe to fit and transform.
boundary
:bool
- Whether to use the boundaries of the data to impute missing values. Default is True.
inplace
:bool
- Whether to modify the columns of the original dataframe or return new ones. Default is True.
def save(self, filename: str = './imputer') ‑> None
-
Expand source code
def save(self, filename: str = "./imputer") -> None:
    """
    Persist this SimpleImputer to disk with pickle.

    Parameters
    -----------
    filename: str
        Destination path. ".pkl" is appended when the name does not already
        end with it. Default is "./imputer".
    """
    if not filename.endswith(".pkl"):
        filename += ".pkl"
    with open(filename, "wb") as handle:
        pickle.dump(self, handle)
Save the state of the SimpleImputer to a file.
Parameters
filename
:str
- Name of the file to save the imputer to. Default is "./imputer".
def transform(self, X: pandas.core.frame.DataFrame, boundary: bool = True, inplace: bool = True) ‑> pandas.core.frame.DataFrame
-
Expand source code
def transform(
    self, X: pd.DataFrame, boundary: bool = True, inplace: bool = True
) -> pd.DataFrame:
    """
    Impute missing values in the data.

    Parameters
    -----------
    X: pd.DataFrame
        Dataframe to impute missing values. It is copied, not modified.
    boundary: bool
        Whether to clip imputed numeric values to the min/max recorded by
        `fit`. Default is True.
    inplace: bool
        If True, return the dataframe with missing cells filled in their own
        columns. If False, return the original columns plus a
        `<column>_imputed` column next to each column that had missing
        values. Default is True.

    Returns
    -------
    pd.DataFrame
        The imputed dataframe.

    Raises
    ------
    KeyError
        If a non-object column has missing values but no parameters were
        recorded for it by `fit`.
    """
    X_impute = X.copy()
    self.cols_transf = X_impute.columns

    for column in X_impute.columns:
        if not X_impute[column].isnull().any():
            # Nothing to impute: not reported as a transformed column.
            self.cols_transf = self.cols_transf.drop(column)
            continue

        if X_impute[column].dtype != "object":
            column_params = self.params[column]
            min_value = column_params["min"]
            max_value = column_params["max"]
            to_compare = column_params["to_compare"]
            clip = boundary
        else:
            # Object columns have no recorded bounds. Reset explicitly so
            # values from a previously processed column are never reused.
            min_value = max_value = to_compare = None
            clip = False

        for row in X_impute.index:
            if not pd.isnull(X_impute.loc[row, column]):
                continue
            features = self._set_zero(X_impute.loc[row, :], column)
            prediction = self.sim.predict(features, column)[0]
            value_impute = self._check_dtype_convert(prediction, to_compare)
            if clip:
                if value_impute < min_value:
                    value_impute = min_value
                if value_impute > max_value:
                    value_impute = max_value
            X_impute.loc[row, column] = value_impute

    if not inplace:
        imputed = X_impute[self.cols_transf].copy()
        imputed = imputed.rename(
            columns={column: column + "_imputed" for column in self.cols_transf}
        )
        X_impute = X.join(imputed, rsuffix="_imputed")
        # Place each imputed column right after its original.
        order_cols = []
        for column in X.columns:
            order_cols.append(column)
            if column + "_imputed" in X_impute.columns:
                order_cols.append(column + "_imputed")
        X_impute = X_impute[order_cols]

    return X_impute
Impute missing values in the data.
Parameters
X
:pd.DataFrame
- Dataframe to impute missing values.
boundary
:bool
- Whether to use the boundaries of the data to impute missing values. Default is True.
inplace
:bool
- Whether to modify the columns of the original dataframe or return new ones. Default is True.