Module likelihood.tools.cat_embed
Classes
class CategoricalEmbedder (embedding_dim=32)
-
Expand source code
class CategoricalEmbedder:
    """
    Encode categorical DataFrame columns as dense embedding vectors and back.

    For every fitted column the embedder keeps a `LabelEncoder` (category to
    integer index) and a float32 `tf.Variable` of shape
    `(vocab_size, embedding_dim)` initialised with `np.random.rand`.
    `transform` replaces a column by its embedding components and
    `inverse_transform` maps vectors back to the label whose embedding row is
    nearest in Euclidean distance.

    Parameters
    ----------
    embedding_dim : `int`
        Number of components of every embedding vector. Default is 32.
    """

    # Cap on the number of elements of the temporary difference array built per
    # chunk by `_nearest_rows`, so the nearest-row search never allocates
    # n_rows * vocab_size * embedding_dim values in one go.
    _MAX_CHUNK_ELEMENTS = 2 ** 22

    def __init__(self, embedding_dim=32):
        self.embedding_dim = embedding_dim
        self.label_encoders = {}
        self.embeddings = {}
        # Per-column replacement for missing values, learned by fit() and
        # reused by transform() so both encode missing entries identically.
        self.fill_values = {}

    def fit(self, df: DataFrame, categorical_cols: List):
        """
        Fit the embeddings on the given data.

        Missing values of each column are replaced by the column's most
        frequent value before the label encoder is fitted; that value is
        remembered in `self.fill_values` for later use by `transform`.

        Parameters
        ----------
        df : `DataFrame`
            Pandas DataFrame containing the tabular data. It is not modified.
        categorical_cols : `List`
            List of column names representing categorical features.

        Returns
        -------
        `None`

        Raises
        ------
        ValueError
            If a name in `categorical_cols` is not a column of `df`.
        """
        # Validate everything first so a bad name cannot leave the embedder
        # fitted on only some of the requested columns.
        for col in categorical_cols:
            if col not in df.columns:
                raise ValueError(f"Column {col} not found in DataFrame")

        # Instances created before `fill_values` existed may lack the attribute.
        if not hasattr(self, "fill_values"):
            self.fill_values = {}

        for col in categorical_cols:
            # Always read from the untouched input: encoding in place would make
            # a repeated column name fit its second encoder on integer codes.
            values = df[col]
            mode_val = values.mode()
            if mode_val.empty:
                # Nothing to learn (no non-missing value); forget stale state.
                self.fill_values.pop(col, None)
            else:
                self.fill_values[col] = mode_val[0]
                values = values.fillna(mode_val[0])

            le = LabelEncoder()
            le.fit(values)
            self.label_encoders[col] = le

            vocab_size = len(le.classes_)
            embedding_matrix = np.random.rand(vocab_size, self.embedding_dim)
            self.embeddings[col] = tf.Variable(embedding_matrix, dtype=tf.float32)

    def transform(self, df: DataFrame, categorical_cols: List[str]):
        """
        Transform the data using the fitted embeddings.

        Parameters
        ----------
        df : `DataFrame`
            Pandas DataFrame containing the tabular data. It is not modified.
        categorical_cols : `List[str]`
            List of column names representing categorical features.

        Returns
        -------
        New Pandas DataFrame in which every column of `categorical_cols` is
        replaced by `embedding_dim` columns named `{col}_embed_{i}`, appended
        after the remaining original columns.

        Raises
        ------
        ValueError
            If a column of `categorical_cols` has not been fitted.
        """
        fill_values = getattr(self, "fill_values", {})

        encoded = {}
        for col in categorical_cols:
            if col not in self.label_encoders:
                raise ValueError(
                    f"Column {col} has not been fitted. Please call fit() on this column first."
                )
            values = df[col]
            if col in fill_values:
                # Use the value learned at fit time so missing entries map to
                # the same category regardless of the batch being transformed.
                values = values.fillna(fill_values[col])
            else:
                # No learned value available: fall back to the batch's own mode.
                mode_val = values.mode()
                if not mode_val.empty:
                    values = values.fillna(mode_val[0])
            encoded[col] = self.label_encoders[col].transform(values)

        if not encoded:
            return df.copy()

        # Build every embedding block first and join them in one concat;
        # inserting embedding_dim columns one by one fragments the DataFrame.
        pieces = [df.drop(columns=list(encoded))]
        for col, codes in encoded.items():
            indices_tensor = tf.constant(codes, dtype=tf.int32)
            vectors = tf.nn.embedding_lookup(
                params=self.embeddings[col], ids=indices_tensor
            )
            names = [f"{col}_embed_{i}" for i in range(self.embedding_dim)]
            pieces.append(
                DataFrame(
                    vectors.numpy()[:, : self.embedding_dim],
                    index=df.index,  # identical index keeps concat positional
                    columns=names,
                )
            )
        return pd.concat(pieces, axis=1)

    @staticmethod
    def _nearest_rows(embedding_matrix, vectors, max_elements):
        """Return, for each row of `vectors`, the index of the closest row of `embedding_matrix`."""
        rows_per_chunk = max(1, max_elements // max(1, embedding_matrix.size))
        nearest = np.empty(len(vectors), dtype=np.intp)
        for start in range(0, len(vectors), rows_per_chunk):
            stop = start + rows_per_chunk
            # Shape (chunk_rows, vocab_size): Euclidean distance to every row.
            distances = np.linalg.norm(
                embedding_matrix - vectors[start:stop, np.newaxis], axis=2
            )
            nearest[start:stop] = np.argmin(distances, axis=1)
        return nearest

    def inverse_transform(self, df: pd.DataFrame, categorical_cols: List[str]):
        """
        Inverse transform the data using the fitted embeddings.

        Each row's `{col}_embed_{i}` values are matched to the nearest row
        (Euclidean distance) of the column's embedding matrix and replaced by
        the corresponding label.

        Parameters
        ----------
        df : `DataFrame`
            Pandas DataFrame containing the tabular data with embedded representations.
        categorical_cols : `List[str]`
            List of column names representing categorical features.

        Returns
        -------
        Copy of `df` in which the embedding columns of every requested column
        are dropped and the column itself is added back with its labels.

        Raises
        ------
        ValueError
            If a column of `categorical_cols` has not been fitted.
        """
        df_processed = df.copy()

        for col in categorical_cols:
            if col not in self.label_encoders:
                raise ValueError(
                    f"Column {col} has not been fitted. Please call fit() on this column first."
                )

            embedding_matrix = self.embeddings[col].numpy()
            label_encoder = self.label_encoders[col]

            embedded_columns = [f"{col}_embed_{i}" for i in range(self.embedding_dim)]
            embeddings = df_processed[embedded_columns].values

            original_indices = self._nearest_rows(
                embedding_matrix, embeddings, self._MAX_CHUNK_ELEMENTS
            )
            df_processed[col] = label_encoder.inverse_transform(original_indices)
            df_processed.drop(columns=embedded_columns, inplace=True)

        return df_processed

    def save_embeddings(self, path: str):
        """
        Save the embeddings to a directory.

        One file named `{col}_embedding.npy` is written per fitted column. The
        label encoders are not saved.

        Parameters
        ----------
        path : `str`
            Path to the directory where embeddings will be saved. It is
            created if it does not exist.
        """
        os.makedirs(path, exist_ok=True)
        for col, embedding in self.embeddings.items():
            np.save(os.path.join(path, f"{col}_embedding.npy"), embedding.numpy())

    def load_embeddings(self, path: str):
        """
        Load the embeddings from a directory.

        A file `{col}_embedding.npy` is read for every column that currently
        has a label encoder; the embedder must therefore be fitted first.
        Nothing is replaced unless every file loads and validates.

        Parameters
        ----------
        path : `str`
            Path to the directory where embeddings are saved.

        Raises
        ------
        FileNotFoundError
            If the file of a fitted column is missing.
        ValueError
            If a stored matrix does not have shape
            `(number of fitted classes, embedding_dim)`.
        """
        loaded = {}
        for col, le in self.label_encoders.items():
            embedding_path = os.path.join(path, f"{col}_embedding.npy")
            if not os.path.exists(embedding_path):
                raise FileNotFoundError(f"Embedding file {embedding_path} not found.")

            embedding_matrix = np.load(embedding_path)
            expected_shape = (len(le.classes_), self.embedding_dim)
            if embedding_matrix.shape != expected_shape:
                # A mismatched matrix would only fail later, inside transform()
                # or inverse_transform(), with a much less helpful error.
                raise ValueError(
                    f"Embedding file {embedding_path} has shape {embedding_matrix.shape}, "
                    f"expected {expected_shape}."
                )
            loaded[col] = tf.Variable(embedding_matrix, dtype=tf.float32)

        # Commit only after every column succeeded.
        self.embeddings.update(loaded)
Methods
def fit(self, df: pandas.core.frame.DataFrame, categorical_cols: List)
-
Expand source code
def fit(self, df: DataFrame, categorical_cols: List):
    """
    Fit the embeddings on the given data.

    For every requested column, missing values are replaced by the column's
    most frequent value, a `LabelEncoder` is fitted on the result and a
    float32 `tf.Variable` of shape `(vocab_size, embedding_dim)` is created
    from `np.random.rand`.

    Parameters
    ----------
    df : `DataFrame`
        Pandas DataFrame containing the tabular data. It is not modified.
    categorical_cols : `List`
        List of column names representing categorical features.

    Returns
    -------
    `None`

    Raises
    ------
    ValueError
        If a name in `categorical_cols` is not a column of `df`.
    """
    # Validate everything first so a bad name cannot leave the embedder
    # fitted on only some of the requested columns.
    for col in categorical_cols:
        if col not in df.columns:
            raise ValueError(f"Column {col} not found in DataFrame")

    for col in categorical_cols:
        # Read from the untouched input instead of a frame that is encoded in
        # place: with a repeated column name the second encoder would otherwise
        # be fitted on integer codes rather than on the original categories.
        values = df[col]
        mode_val = values.mode()
        if not mode_val.empty:
            values = values.fillna(mode_val[0])

        # Only the fitted classes are needed here, not the encoded column.
        le = LabelEncoder()
        le.fit(values)
        self.label_encoders[col] = le

        vocab_size = len(le.classes_)
        embedding_matrix = np.random.rand(vocab_size, self.embedding_dim)
        self.embeddings[col] = tf.Variable(embedding_matrix, dtype=tf.float32)
Fit the embeddings on the given data.
Parameters
df
:DataFrame
- Pandas DataFrame containing the tabular data.
categorical_cols
:List
- List of column names representing categorical features.
Returns
None
def inverse_transform(self, df: pandas.core.frame.DataFrame, categorical_cols: List[str])
-
Expand source code
def inverse_transform(self, df: pd.DataFrame, categorical_cols: List[str]):
    """
    Inverse transform the data using the fitted embeddings.

    Each row's `{col}_embed_{i}` values are matched to the nearest row
    (Euclidean distance) of the column's embedding matrix and replaced by the
    corresponding label.

    Parameters
    ----------
    df : `DataFrame`
        Pandas DataFrame containing the tabular data with embedded representations.
    categorical_cols : `List[str]`
        List of column names representing categorical features.

    Returns
    -------
    Copy of `df` in which the embedding columns of every requested column are
    dropped and the column itself is added back with its categorical labels.

    Raises
    ------
    ValueError
        If a column of `categorical_cols` has not been fitted.
    """
    # Cap on the elements of the temporary (rows, vocab, dim) difference array;
    # computing all rows at once needs n_rows * vocab_size * embedding_dim
    # values, which does not fit in memory for large frames.
    max_chunk_elements = 2 ** 22

    df_processed = df.copy()

    for col in categorical_cols:
        if col not in self.label_encoders:
            raise ValueError(
                f"Column {col} has not been fitted. Please call fit() on this column first."
            )

        embedding_matrix = self.embeddings[col].numpy()
        label_encoder = self.label_encoders[col]

        embedded_columns = [f"{col}_embed_{i}" for i in range(self.embedding_dim)]
        embeddings = df_processed[embedded_columns].values

        rows_per_chunk = max(1, max_chunk_elements // max(1, embedding_matrix.size))
        original_indices = np.empty(len(embeddings), dtype=np.intp)
        for start in range(0, len(embeddings), rows_per_chunk):
            stop = start + rows_per_chunk
            # Shape (chunk_rows, vocab_size): distance of each row to every
            # embedding; rows are independent, so chunking changes no result.
            distances = np.linalg.norm(
                embedding_matrix - embeddings[start:stop, np.newaxis], axis=2
            )
            original_indices[start:stop] = np.argmin(distances, axis=1)

        original_labels = label_encoder.inverse_transform(original_indices)

        df_processed[col] = original_labels
        df_processed.drop(columns=embedded_columns, inplace=True)

    return df_processed
Inverse transform the data using the fitted embeddings.
Parameters
df
:DataFrame
- Pandas DataFrame containing the tabular data with embedded representations.
categorical_cols
:List[str]
- List of column names representing categorical features.
Returns
Transformed Pandas DataFrame with the embedding columns replaced by their original categorical labels.
def load_embeddings(self, path: str)
-
Expand source code
def load_embeddings(self, path: str):
    """
    Load the embeddings from a directory.

    A file `{col}_embedding.npy` is read for every column that currently has a
    label encoder, so the embedder must be fitted before loading. Nothing is
    replaced unless every file loads and validates.

    Parameters
    ----------
    path : `str`
        Path to the directory where embeddings are saved.

    Raises
    ------
    FileNotFoundError
        If the file of a fitted column is missing.
    ValueError
        If a stored matrix does not have shape
        `(number of fitted classes, embedding_dim)`.
    """
    loaded = {}
    for col, le in self.label_encoders.items():
        embedding_path = os.path.join(path, f"{col}_embedding.npy")
        if not os.path.exists(embedding_path):
            raise FileNotFoundError(f"Embedding file {embedding_path} not found.")

        embedding_matrix = np.load(embedding_path)
        expected_shape = (len(le.classes_), self.embedding_dim)
        if embedding_matrix.shape != expected_shape:
            # A mismatched matrix would only fail later, inside transform() or
            # inverse_transform(), with a much less helpful error.
            raise ValueError(
                f"Embedding file {embedding_path} has shape {embedding_matrix.shape}, "
                f"expected {expected_shape}."
            )
        loaded[col] = tf.Variable(embedding_matrix, dtype=tf.float32)

    # Commit only after every column succeeded, so a failure part-way through
    # cannot leave a mix of old and newly loaded embeddings.
    self.embeddings.update(loaded)
Load the embeddings from a directory.
Parameters
path
:str
- Path to the directory where embeddings are saved.
def save_embeddings(self, path: str)
-
Expand source code
def save_embeddings(self, path: str):
    """
    Write every fitted embedding matrix to a directory.

    Each column's matrix is stored as `{col}_embedding.npy` inside `path`.
    Only the embeddings are written.

    Parameters
    ----------
    path : `str`
        Path to the directory where embeddings will be saved. The directory
        is created when it does not exist yet.
    """
    os.makedirs(path, exist_ok=True)
    for col in self.embeddings:
        target = os.path.join(path, f"{col}_embedding.npy")
        matrix = self.embeddings[col].numpy()
        np.save(target, matrix)
Save the embeddings to a directory.
Parameters
path
:str
- Path to the directory where embeddings will be saved.
def transform(self, df: pandas.core.frame.DataFrame, categorical_cols: List[str])
-
Expand source code
def transform(self, df: DataFrame, categorical_cols: List[str]):
    """
    Transform the data using the fitted embeddings.

    Missing values of each requested column are replaced by the most frequent
    value of that column in `df`, the column is label-encoded, and the codes
    are looked up in the column's embedding matrix.

    Parameters
    ----------
    df : `DataFrame`
        Pandas DataFrame containing the tabular data. It is not modified.
    categorical_cols : `List[str]`
        List of column names representing categorical features.

    Returns
    -------
    New Pandas DataFrame in which every column of `categorical_cols` is
    replaced by `embedding_dim` columns named `{col}_embed_{i}`, appended
    after the remaining original columns.

    Raises
    ------
    ValueError
        If a column of `categorical_cols` has not been fitted.
    """
    encoded = {}
    for col in categorical_cols:
        if col not in self.label_encoders:
            raise ValueError(
                f"Column {col} has not been fitted. Please call fit() on this column first."
            )
        values = df[col]
        mode_val = values.mode()
        if not mode_val.empty:
            values = values.fillna(mode_val[0])
        encoded[col] = self.label_encoders[col].transform(values)

    if not encoded:
        return df.copy()

    # Build every embedding block first and join them in a single concat:
    # inserting embedding_dim columns one at a time per categorical column
    # fragments the DataFrame and gets slower with every insert.
    pieces = [df.drop(columns=list(encoded))]
    for col, codes in encoded.items():
        indices_tensor = tf.constant(codes, dtype=tf.int32)
        # `codes` is one-dimensional, so the lookup result is always
        # (n_rows, embedding width); no rank-1 special case is needed.
        vectors = tf.nn.embedding_lookup(
            params=self.embeddings[col], ids=indices_tensor
        )
        names = [f"{col}_embed_{i}" for i in range(self.embedding_dim)]
        pieces.append(
            DataFrame(
                vectors.numpy()[:, : self.embedding_dim],
                index=df.index,  # identical index keeps the concat positional
                columns=names,
            )
        )
    return pd.concat(pieces, axis=1)
Transform the data using the fitted embeddings.
Parameters
df
:DataFrame
- Pandas DataFrame containing the tabular data.
categorical_cols
:List[str]
- List of column names representing categorical features.
Returns
Transformed Pandas DataFrame in which every column listed in
categorical_cols
is replaced by its embedding representation; all other columns are kept.