Module likelihood.tools.cat_embed

Classes

class CategoricalEmbedder (embedding_dim=32)
class CategoricalEmbedder:
    def __init__(self, embedding_dim=32):
        self.embedding_dim = embedding_dim
        self.label_encoders = {}
        self.embeddings = {}

    def fit(self, df: DataFrame, categorical_cols: List):
        """
        Fit the embeddings on the given data.

        Parameters
        ----------
        df : `DataFrame`
            Pandas DataFrame containing the tabular data.
        categorical_cols : `List`
            List of column names representing categorical features.

        Returns
        -------
        `None`
        """
        df_processed = df.copy()
        for col in categorical_cols:
            if col not in df_processed.columns:
                raise ValueError(f"Column {col} not found in DataFrame")

        for col in categorical_cols:
            mode_val = df_processed[col].mode()
            if not mode_val.empty:
                df_processed[col] = df_processed[col].fillna(mode_val[0])

        for col in categorical_cols:
            le = LabelEncoder()
            df_processed[col] = le.fit_transform(df_processed[col])
            self.label_encoders[col] = le

            vocab_size = len(le.classes_)
            embedding_matrix = np.random.rand(vocab_size, self.embedding_dim)
            self.embeddings[col] = tf.Variable(embedding_matrix, dtype=tf.float32)

    def transform(self, df: DataFrame, categorical_cols: List[str]):
        """
        Transform the data using the fitted embeddings.

        Parameters
        ----------
        df : `DataFrame`
            Pandas DataFrame containing the tabular data.
        categorical_cols : `List[str]`
            List of column names representing categorical features.

        Returns
        -------
        `DataFrame`
            Pandas DataFrame in which each column in `categorical_cols` is replaced by `embedding_dim` columns named `{col}_embed_{i}`; all other columns are unchanged.
        """

        df_processed = df.copy()

        for col in categorical_cols:
            if col not in self.label_encoders:
                raise ValueError(
                    f"Column {col} has not been fitted. Please call fit() on this column first."
                )
            mode_val = df_processed[col].mode()
            if not mode_val.empty:
                df_processed[col] = df_processed[col].fillna(mode_val[0])
            le = self.label_encoders[col]
            df_processed[col] = le.transform(df_processed[col])

        for col in categorical_cols:
            indices_tensor = tf.constant(df_processed[col], dtype=tf.int32)
            embedding_layer = tf.nn.embedding_lookup(
                params=self.embeddings[col], ids=indices_tensor
            )
            if len(embedding_layer.shape) == 1:
                embedding_layer = tf.expand_dims(embedding_layer, axis=0)

            for i in range(self.embedding_dim):
                df_processed[f"{col}_embed_{i}"] = embedding_layer[:, i]
            df_processed.drop(columns=[col], inplace=True)

        return df_processed

    def inverse_transform(self, df: pd.DataFrame, categorical_cols: List[str]):
        """
        Inverse transform the data using the fitted embeddings.

        Parameters
        ----------
        df : `DataFrame`
            Pandas DataFrame containing the tabular data with embedded representations.
        categorical_cols : `List[str]`
            List of column names representing categorical features.

        Returns
        -------
        `DataFrame`
            Pandas DataFrame in which the `{col}_embed_{i}` columns of each entry in `categorical_cols` are replaced by the categorical labels recovered via nearest-neighbor lookup in the fitted embeddings; all other columns are unchanged.
        """

        df_processed = df.copy()

        for col in categorical_cols:
            if col not in self.label_encoders:
                raise ValueError(
                    f"Column {col} has not been fitted. Please call fit() on this column first."
                )

            embedding_matrix = self.embeddings[col].numpy()
            label_encoder = self.label_encoders[col]

            embedded_columns = [f"{col}_embed_{i}" for i in range(self.embedding_dim)]
            embeddings = df_processed[embedded_columns].values

            # Nearest-neighbor lookup: distance from each row's embedding to every stored embedding vector.
            distances = np.linalg.norm(embedding_matrix - embeddings[:, np.newaxis], axis=2)
            original_indices = np.argmin(distances, axis=1)
            original_labels = label_encoder.inverse_transform(original_indices)

            df_processed[col] = original_labels
            df_processed.drop(columns=embedded_columns, inplace=True)

        return df_processed

    def save_embeddings(self, path: str):
        """
        Save the embeddings to a directory.

        Parameters
        ----------
        path : `str`
            Path to the directory where embeddings will be saved.
        """

        os.makedirs(path, exist_ok=True)
        for col, embedding in self.embeddings.items():
            np.save(os.path.join(path, f"{col}_embedding.npy"), embedding.numpy())

    def load_embeddings(self, path: str):
        """
        Load the embeddings from a directory.

        Parameters
        ----------
        path : `str`
            Path to the directory where embeddings are saved.
        """

        for col in self.label_encoders.keys():
            embedding_path = os.path.join(path, f"{col}_embedding.npy")
            if not os.path.exists(embedding_path):
                raise FileNotFoundError(f"Embedding file {embedding_path} not found.")
            embedding_matrix = np.load(embedding_path)
            self.embeddings[col] = tf.Variable(embedding_matrix, dtype=tf.float32)
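
CategoricalEmbedder label-encodes each categorical column and assigns it a randomly initialised embedding matrix stored as a tf.Variable. A minimal quick-start sketch, assuming the class is imported from this module; the column names and values below are illustrative only:

import pandas as pd
from likelihood.tools.cat_embed import CategoricalEmbedder

df = pd.DataFrame(
    {
        "city": ["Lima", "Quito", "Lima", "Bogota"],
        "price": [10.0, 12.5, 9.9, 11.2],
    }
)

embedder = CategoricalEmbedder(embedding_dim=4)
embedder.fit(df, categorical_cols=["city"])                    # build encoder and embedding matrix
embedded = embedder.transform(df, categorical_cols=["city"])   # "city" -> city_embed_0 ... city_embed_3
restored = embedder.inverse_transform(embedded, categorical_cols=["city"])  # recover the original labels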

Methods

def fit(self, df: pandas.core.frame.DataFrame, categorical_cols: List)
def fit(self, df: DataFrame, categorical_cols: List):
    """
    Fit the embeddings on the given data.

    Parameters
    ----------
    df : `DataFrame`
        Pandas DataFrame containing the tabular data.
    categorical_cols : `List`
        List of column names representing categorical features.

    Returns
    -------
    `None`
    """
    df_processed = df.copy()
    for col in categorical_cols:
        if col not in df_processed.columns:
            raise ValueError(f"Column {col} not found in DataFrame")

    for col in categorical_cols:
        mode_val = df_processed[col].mode()
        if not mode_val.empty:
            df_processed[col] = df_processed[col].fillna(mode_val[0])

    for col in categorical_cols:
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col])
        self.label_encoders[col] = le

        vocab_size = len(le.classes_)
        embedding_matrix = np.random.rand(vocab_size, self.embedding_dim)
        self.embeddings[col] = tf.Variable(embedding_matrix, dtype=tf.float32)

Fit the embeddings on the given data.

Parameters

df : DataFrame
Pandas DataFrame containing the tabular data.
categorical_cols : List
List of column names representing categorical features.

Returns

None
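
A small sketch of what fit produces, using throwaway data; the attribute names label_encoders and embeddings are taken from the source above:

import pandas as pd
from likelihood.tools.cat_embed import CategoricalEmbedder

df = pd.DataFrame({"color": ["red", "blue", "green", "blue"]})

embedder = CategoricalEmbedder(embedding_dim=8)
embedder.fit(df, categorical_cols=["color"])

# fit stores one LabelEncoder and one (vocab_size, embedding_dim) tf.Variable per column.
print(embedder.label_encoders["color"].classes_)  # ['blue' 'green' 'red']
print(embedder.embeddings["color"].shape)         # (3, 8)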

def inverse_transform(self, df: pandas.core.frame.DataFrame, categorical_cols: List[str])
def inverse_transform(self, df: pd.DataFrame, categorical_cols: List[str]):
    """
    Inverse transform the data using the fitted embeddings.

    Parameters
    ----------
    df : `DataFrame`
        Pandas DataFrame containing the tabular data with embedded representations.
    categorical_cols : `List[str]`
        List of column names representing categorical features.

    Returns
    -------
    `DataFrame`
        Pandas DataFrame in which the `{col}_embed_{i}` columns of each entry in `categorical_cols` are replaced by the categorical labels recovered via nearest-neighbor lookup in the fitted embeddings; all other columns are unchanged.
    """

    df_processed = df.copy()

    for col in categorical_cols:
        if col not in self.label_encoders:
            raise ValueError(
                f"Column {col} has not been fitted. Please call fit() on this column first."
            )

        embedding_matrix = self.embeddings[col].numpy()
        label_encoder = self.label_encoders[col]

        embedded_columns = [f"{col}_embed_{i}" for i in range(self.embedding_dim)]
        embeddings = df_processed[embedded_columns].values

        # Nearest-neighbor lookup: distance from each row's embedding to every stored embedding vector.
        distances = np.linalg.norm(embedding_matrix - embeddings[:, np.newaxis], axis=2)
        original_indices = np.argmin(distances, axis=1)
        original_labels = label_encoder.inverse_transform(original_indices)

        df_processed[col] = original_labels
        df_processed.drop(columns=embedded_columns, inplace=True)

    return df_processed

Inverse transform the data using the fitted embeddings.

Parameters

df : DataFrame
Pandas DataFrame containing the tabular data with embedded representations.
categorical_cols : List[str]
List of column names representing categorical features.

Returns

DataFrame
Pandas DataFrame in which the {col}_embed_{i} columns of each entry in categorical_cols are replaced by the categorical labels recovered via nearest-neighbor lookup in the fitted embeddings; all other columns are unchanged.
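
A round-trip sketch with illustrative data: because each row's embedding vector is mapped back to the nearest stored embedding, the labels survive a transform/inverse_transform cycle:

import pandas as pd
from likelihood.tools.cat_embed import CategoricalEmbedder

df = pd.DataFrame({"color": ["red", "blue", "green", "blue"], "price": [1.0, 2.0, 3.0, 4.0]})

embedder = CategoricalEmbedder(embedding_dim=4)
embedder.fit(df, categorical_cols=["color"])

embedded = embedder.transform(df, categorical_cols=["color"])
recovered = embedder.inverse_transform(embedded, categorical_cols=["color"])

# Each embedding row is its own nearest neighbor, so the labels come back unchanged.
print((recovered["color"].values == df["color"].values).all())  # True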

def load_embeddings(self, path: str)
def load_embeddings(self, path: str):
    """
    Load the embeddings from a directory.

    Parameters
    ----------
    path : `str`
        Path to the directory where embeddings are saved.
    """

    for col in self.label_encoders.keys():
        embedding_path = os.path.join(path, f"{col}_embedding.npy")
        if not os.path.exists(embedding_path):
            raise FileNotFoundError(f"Embedding file {embedding_path} not found.")
        embedding_matrix = np.load(embedding_path)
        self.embeddings[col] = tf.Variable(embedding_matrix, dtype=tf.float32)

Load the embeddings from a directory.

Parameters

path : str
Path to the directory where embeddings are saved.
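
A usage sketch; the directory name embeddings_dir is illustrative and must already contain files written by save_embeddings. Because load_embeddings iterates over the fitted label encoders, fit is called before loading:

import pandas as pd
from likelihood.tools.cat_embed import CategoricalEmbedder

df = pd.DataFrame({"color": ["red", "blue", "green"]})

# Rebuild the label encoders first; only the embedding matrices are restored from disk.
embedder = CategoricalEmbedder(embedding_dim=4)
embedder.fit(df, categorical_cols=["color"])
embedder.load_embeddings("embeddings_dir")  # reads embeddings_dir/color_embedding.npy
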
def save_embeddings(self, path: str)
def save_embeddings(self, path: str):
    """
    Save the embeddings to a directory.

    Parameters
    ----------
    path : `str`
        Path to the directory where embeddings will be saved.
    """

    os.makedirs(path, exist_ok=True)
    for col, embedding in self.embeddings.items():
        np.save(os.path.join(path, f"{col}_embedding.npy"), embedding.numpy())

Save the embeddings to a directory.

Parameters

path : str
Path to the directory where embeddings will be saved.
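
A usage sketch with an illustrative directory name; one .npy file is written per fitted column:

import os
import pandas as pd
from likelihood.tools.cat_embed import CategoricalEmbedder

df = pd.DataFrame({"color": ["red", "blue", "green"]})

embedder = CategoricalEmbedder(embedding_dim=4)
embedder.fit(df, categorical_cols=["color"])

embedder.save_embeddings("embeddings_dir")
print(os.listdir("embeddings_dir"))  # ['color_embedding.npy']
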
def transform(self, df: pandas.core.frame.DataFrame, categorical_cols: List[str])
def transform(self, df: DataFrame, categorical_cols: List[str]):
    """
    Transform the data using the fitted embeddings.

    Parameters
    ----------
    df : `DataFrame`
        Pandas DataFrame containing the tabular data.
    categorical_cols : `List[str]`
        List of column names representing categorical features.

    Returns
    -------
    `DataFrame`
        Pandas DataFrame in which each column in `categorical_cols` is replaced by `embedding_dim` columns named `{col}_embed_{i}`; all other columns are unchanged.
    """

    df_processed = df.copy()

    for col in categorical_cols:
        if col not in self.label_encoders:
            raise ValueError(
                f"Column {col} has not been fitted. Please call fit() on this column first."
            )
        mode_val = df_processed[col].mode()
        if not mode_val.empty:
            df_processed[col] = df_processed[col].fillna(mode_val[0])
        le = self.label_encoders[col]
        df_processed[col] = le.transform(df_processed[col])

    for col in categorical_cols:
        indices_tensor = tf.constant(df_processed[col], dtype=tf.int32)
        embedding_layer = tf.nn.embedding_lookup(
            params=self.embeddings[col], ids=indices_tensor
        )
        if len(embedding_layer.shape) == 1:
            embedding_layer = tf.expand_dims(embedding_layer, axis=0)

        for i in range(self.embedding_dim):
            df_processed[f"{col}_embed_{i}"] = embedding_layer[:, i]
        df_processed.drop(columns=[col], inplace=True)

    return df_processed

Transform the data using the fitted embeddings.

Parameters

df : DataFrame
Pandas DataFrame containing the tabular data.
categorical_cols : List[str]
List of column names representing categorical features.

Returns

DataFrame
Pandas DataFrame in which each column in categorical_cols is replaced by embedding_dim columns named {col}_embed_{i}; all other columns are unchanged.
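
A sketch of the column expansion performed by transform, with illustrative data. Missing values are imputed with the mode of the incoming column before encoding; note that a category not seen during fit will cause scikit-learn's LabelEncoder to raise a ValueError:

import pandas as pd
from likelihood.tools.cat_embed import CategoricalEmbedder

train = pd.DataFrame({"color": ["red", "blue", "green", "blue"], "price": [1.0, 2.0, 3.0, 4.0]})

embedder = CategoricalEmbedder(embedding_dim=3)
embedder.fit(train, categorical_cols=["color"])

# The NaN row is filled with the mode of the incoming column ("blue") before encoding.
new = pd.DataFrame({"color": ["red", None, "blue", "blue"], "price": [5.0, 6.0, 7.0, 8.0]})
out = embedder.transform(new, categorical_cols=["color"])

# "color" is dropped and replaced by three numeric columns; "price" passes through unchanged.
print(out.columns.tolist())  # ['price', 'color_embed_0', 'color_embed_1', 'color_embed_2']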