Module pydbsmgr.lightest

Functions

def process_dates(x: str, format_type: str, auxiliary_type: str = None, errors: str = 'ignore') ‑> str
Expand source code
def process_dates(
    x: str, format_type: str, auxiliary_type: str = None, errors: str = "ignore"
) -> str:
    """Auxiliary function in date type string processing."""
    x = str(x)
    if format_type in ["dayfirst", "monthfirst"] and len(x) < 10:
        separator = "/" if "/" in x else "-"
        parts = x.split(separator)
        if format_type == "dayfirst":
            day, month, year = parts[0], parts[1], parts[-1]
        elif format_type == "monthfirst":
            month, day, year = parts[0], parts[1], parts[-1]

        day = f"{int(day):02d}"
        month = f"{int(month):02d}"
        try:
            date = pd.to_datetime(f"{year}{month}{day}", format="%Y%m%d", errors="coerce")
        except ValueError:
            if auxiliary_type:
                date = pd.to_datetime(x, format=auxiliary_type, errors="coerce")
            elif errors == "raise":
                raise ValueError("Date value does not match the expected format.")
    else:
        x = x.replace("/", "").replace("-", "")

        if len(x) == 8:
            try:
                date = pd.to_datetime(x, format=format_type, errors="coerce")
            except ValueError:
                if auxiliary_type:
                    date = pd.to_datetime(x, format=auxiliary_type, errors="coerce")
                elif errors == "raise":
                    raise ValueError("Date value does not match the expected format.")
        else:
            try:
                date = pd.to_datetime(x[:8], format=format_type, errors="coerce")
            except ValueError:
                if auxiliary_type:
                    date = pd.to_datetime(x[:8], format=auxiliary_type, errors="coerce")
                elif errors == "raise":
                    raise ValueError("Date value does not match the expected format.")

    if not pd.isnull(date):
        return date.strftime("%Y-%m-%d")
    else:
        return x  # Return original string if no valid date is found

Auxiliary function for processing date-type strings. Returns the date normalized to YYYY-MM-DD, or the original string when no valid date is found.
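
A minimal usage sketch (the format strings and values below are illustrative; inside clean_frame the format is normally inferred with get_date_format):

from pydbsmgr.lightest import process_dates

# Short day-first string: the parts are split on the separator and
# reassembled as YYYYMMDD before being parsed with pandas.
process_dates("1/2/2023", format_type="dayfirst")    # -> "2023-02-01"

# Explicit strptime format: separators are stripped first, then the
# eight-digit string is parsed with the given format.
process_dates("31-12-2023", format_type="%d%m%Y")    # -> "2023-12-31"

The errors keyword ("ignore" by default) controls whether a value that does not match the expected format raises a ValueError ("raise") or the string is passed through unchanged.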

Classes

class LightCleaner (df_: pandas.core.frame.DataFrame)
Expand source code
class LightCleaner:
    """Performs a light cleaning on the table."""

    __slots__ = ["df", "dict_dtypes"]

    def __init__(self, df_: pd.DataFrame):
        self.df = pl.from_pandas(df_)
        self.dict_dtypes = {"float": pl.Float64, "int": pl.Int64, "str": pl.String}

    def clean_frame(
        self,
        sample_frac: float = 0.1,
        fast_execution: bool = True,
        two_date_formats: bool = True,
        **kwargs,
    ) -> pd.DataFrame:
        """DataFrame cleaning main function

        Parameters
        ----------
        sample_frac : `float`
            The fraction of rows to use for date type inference. Default is 0.1 i.e., 10%.
        fast_execution : `bool`
            If `False` use `applymap` pandas for extra text cleanup. Default is `True`.

        Keyword Arguments:
        ----------
        no_emoji : `bool`
            By default it is set to `False`. If `True`, removes all emojis from text data. Works only when `fast_execution` = `False`.
        title_mode : `bool`
            By default it is set to `True`. If `False`, converts the text to lowercase. Works only when `fast_execution` = `False`. By default, converts everything to `title`.
        """
        table = self.df.clone()
        cols = table.columns
        errors = kwargs.get("errors", "ignore")

        if sample_frac != 1.0:
            table_sample = table.sample(frac=sample_frac, with_replacement=False)
        else:
            table_sample = table.clone()

        for column_index, datatype in enumerate(table.dtypes):
            if datatype == pl.String:
                datetype_column = (
                    table_sample[cols[column_index]]
                    .map_elements(check_if_contains_dates, return_dtype=pl.Boolean)
                    .any()
                )

                if datetype_column:
                    main_type, auxiliary_type = most_repeated_item(
                        list(
                            filter(
                                lambda item: item is not None,
                                table_sample[cols[column_index]].map_elements(
                                    get_date_format, return_dtype=pl.String
                                ),
                            )
                        ),
                        two_date_formats,
                    )

                    format_type = auxiliary_type or main_type

                    partial_dates = partial(
                        process_dates,
                        format_type=format_type,
                        auxiliary_type=None,
                        errors=errors,
                    )
                    vpartial_dates = np.vectorize(partial_dates)

                    _serie = pl.Series(
                        cols[column_index], vpartial_dates(table[cols[column_index]].to_list())
                    )
                    table = table.with_columns(_serie)
                    table = table.with_columns(
                        pl.col(cols[column_index]).str.strptime(
                            pl.Datetime, format="%Y-%m-%d", strict=False
                        )
                    )

                else:
                    try:
                        partial_clean = partial(clean)
                        vpartial_clean = np.vectorize(partial_clean)

                        table = table.with_columns(
                            pl.Series(
                                cols[column_index],
                                vpartial_clean(table[cols[column_index]].to_list()),
                            )
                        )
                    except AttributeError as e:
                        msg = f"It was not possible to perform the cleaning, the column {cols[column_index]} is duplicated. Error: {e}"
                        logging.warning(msg)
                        sys.exit("Perform correction manually")

                    if not fast_execution:
                        no_emoji = kwargs.get("no_emoji", False)
                        title_mode = kwargs.get("title_mode", True)

                        partial_clean = partial(clean, no_emoji=no_emoji, title_mode=title_mode)
                        vpartial_clean = np.vectorize(partial_clean)

                        table = table.with_columns(
                            pl.Series(
                                cols[column_index],
                                vpartial_clean(table[cols[column_index]].to_list()),
                            )
                        )

        table = self._remove_duplicate_columns(table)
        self.df = table.clone()
        return self.df.to_pandas()

    def _correct_type(self, value, datatype):
        """General type correction function."""
        val_type = type(value).__name__
        if self.dict_dtypes[val_type] != datatype:
            try:
                return {"float": float, "int": int, "str": str}[datatype](value)
            except ValueError:
                return np.nan if datatype in ["float", "int"] else ""
        return value

    def _remove_duplicate_columns(self, df: pl.DataFrame) -> pl.DataFrame:
        """Remove duplicate columns based on column name."""
        seen = set()
        unique_cols = [col for col in df.columns if not (col in seen or seen.add(col))]
        return df.select(unique_cols)

Performs a light cleaning on the table.
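
LightCleaner converts the input pandas DataFrame to polars internally; clean_frame returns the cleaned result as a pandas DataFrame again. A short usage sketch (the column names and values are illustrative only):

import pandas as pd

from pydbsmgr.lightest import LightCleaner

raw = pd.DataFrame(
    {
        "customer_name": ["  alice smith ", "BOB JONES"],
        "signup_date": ["1/2/2023", "15/3/2023"],
    }
)

cleaner = LightCleaner(raw)

# sample_frac=1.0 uses every row when inferring date formats; string
# columns are cleaned and date-like columns are converted to datetimes.
clean_df = cleaner.clean_frame(sample_frac=1.0)
print(clean_df.dtypes)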

Instance variables

var df
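The working table as a polars.DataFrame, created from the input pandas DataFrame in __init__.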
var dict_dtypes
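Mapping from Python type names ("float", "int", "str") to the corresponding polars dtypes, used by the internal type-correction helper _correct_type.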

Methods

def clean_frame(self,
sample_frac: float = 0.1,
fast_execution: bool = True,
two_date_formats: bool = True,
**kwargs) ‑> pandas.core.frame.DataFrame

Main DataFrame cleaning function.

Parameters

sample_frac : float
The fraction of rows used to infer date formats. Default is 0.1, i.e. 10%.
fast_execution : bool
If False, an additional (slower) text-cleanup pass is applied to string columns. Default is True.

Keyword Arguments

no_emoji : bool
Default is False. If True, removes all emojis from text data. Only used when fast_execution = False.
title_mode : bool
Default is True, which converts text to title case. If False, converts text to lowercase. Only used when fast_execution = False.
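
A sketch of the slower path. fast_execution=False is the only case in which no_emoji and title_mode are consulted; the errors keyword is forwarded to process_dates for date-like columns:

import pandas as pd

from pydbsmgr.lightest import LightCleaner

cleaner = LightCleaner(pd.DataFrame({"note": ["Hello WORLD 🙂", "  second row  "]}))

result = cleaner.clean_frame(
    sample_frac=1.0,
    fast_execution=False,
    no_emoji=True,      # strip emojis from text columns
    title_mode=False,   # lowercase text instead of title-casing it
    errors="ignore",    # forwarded to process_dates for date columns
)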