Module pydbsmgr.lightest
Functions
def process_dates(x: str, format_type: str, auxiliary_type: str = None, errors: str = 'ignore') ‑> str
-
Auxiliary function in date type string processing.
Classes
class LightCleaner (df_: pandas.core.frame.DataFrame)
-
Performs a light cleaning on the table.
Expand source code
class LightCleaner:
    """Performs a light cleaning on the table.

    The instance keeps a private copy of the input frame in ``df``.
    ``clean_frame`` rewrites the ``object`` columns of that copy (date parsing
    or ASCII/title-case text normalisation) and stores the result back in ``df``.
    """

    __slots__ = ["df", "dict_dtypes"]

    def __init__(self, df_: pd.DataFrame):
        # Work on a copy so the caller's frame is never mutated.
        self.df = df_.copy()
        # Python type name -> pandas dtype name; consulted by `_correct_type`.
        self.dict_dtypes = {"float": "float64", "int": "int64", "str": "object"}

    def clean_frame(
        self,
        sample_frac: float = 0.1,
        fast_execution: bool = True,
        two_date_formats: bool = True,
        **kwargs,
    ) -> pd.DataFrame:
        """DataFrame cleaning main function

        Each ``object`` column is inspected on a row sample. If any sampled
        value is flagged by `check_if_contains_dates`, the whole column is
        converted to normalised datetimes; otherwise it is cleaned as text
        (NaN -> "", NFKD normalisation, non-ASCII characters dropped, title
        case). Columns of any other dtype are left untouched. Finally,
        repeated column names are collapsed to their first occurrence.

        Parameters
        ----------
        sample_frac : `float`
            The fraction of rows to use for date type inference. Default is 0.1 i.e., 10%.
        fast_execution : `bool`
            If `False`, additionally runs `clean` element-wise (via `np.vectorize`)
            on every text column for extra text cleanup. Default is `True`.
        two_date_formats : `bool`
            Forwarded unchanged to `most_repeated_item` when inferring the date
            format of a column. Default is `True`.

        Keyword Arguments:
        ----------
        errors : `str`
            Forwarded to `process_dates`. By default it is set to `"ignore"`.
        no_emoji : `bool`
            By default it is set to `False`. If `True`, removes all emojis from text data. Works only when `fast_execution` = `False`.
        title_mode : `bool`
            By default it is set to `True`. If `False`, converts the text to lowercase. Works only when `fast_execution` = `False`. By default, converts everything to `title`.

        Returns
        -------
        `pd.DataFrame`
            The cleaned frame. The same object is also stored in ``self.df``.
        """
        table = self.df.copy()
        cols = table.columns

        # NOTE(review): for very small frames `frac=sample_frac` can round down
        # to zero sampled rows, in which case no column is detected as a date.
        # Confirm this is acceptable for callers.
        if sample_frac != 1.0:
            table_sample = table.sample(frac=sample_frac, replace=False)
        else:
            table_sample = table.copy()

        # Loop-invariant options are read once, up front.
        errors = kwargs.get("errors", "ignore")
        vpartial_clean = None
        if not fast_execution:
            no_emoji = kwargs.get("no_emoji", False)
            title_mode = kwargs.get("title_mode", True)
            vpartial_clean = np.vectorize(
                partial(clean, no_emoji=no_emoji, title_mode=title_mode)
            )

        for column_index, datatype in enumerate(table.dtypes):
            if datatype != "object":
                continue

            column = cols[column_index]
            sampled_values = table_sample[column]
            datetype_column = (
                sampled_values.apply(check_if_contains_dates).isin([True]).any()
            )

            if datetype_column:
                detected_formats = [
                    fmt
                    for fmt in sampled_values.apply(get_date_format)
                    if fmt is not None
                ]
                main_type, auxiliary_type = most_repeated_item(
                    detected_formats, two_date_formats
                )
                # The auxiliary format, when present, takes precedence over the
                # main one; `process_dates` is then called with a single format.
                format_type = auxiliary_type or main_type
                partial_dates = partial(
                    process_dates,
                    format_type=format_type,
                    auxiliary_type=None,
                    errors=errors,
                )
                vpartial_dates = np.vectorize(partial_dates)
                table[column] = pd.to_datetime(
                    vpartial_dates(table[column]),
                    format="%Y-%m-%d",
                    errors="coerce",
                ).normalize()
            else:
                try:
                    table[column] = (
                        table[column]
                        .replace(np.nan, "")
                        .astype(str)
                        .str.normalize("NFKD")
                        .str.encode("ascii", errors="ignore")
                        .str.decode("ascii")
                        .str.title()
                    )
                except AttributeError as e:
                    # A repeated label makes `table[column]` a DataFrame, which
                    # has no `.str` accessor.
                    msg = f"It was not possible to perform the cleaning, the column {column} is duplicated. Error: {e}"
                    logging.warning(msg)
                    sys.exit("Perform correction manually")
                if vpartial_clean is not None:
                    table[column] = vpartial_clean(table[column])

        table = self._remove_duplicate_columns(table)
        self.df = table.copy()
        return self.df

    def _correct_type(self, value, datatype):
        """General type correction function.

        Parameters
        ----------
        value
            The scalar to coerce.
        datatype : `str`
            Target type, given either as a short name (``"float"``, ``"int"``,
            ``"str"``) or as the matching dtype name from ``dict_dtypes``
            (``"float64"``, ``"int64"``, ``"object"``).

        Returns
        -------
        `value` unchanged when it already has the target Python type, otherwise
        the converted value. If the conversion fails, returns ``np.nan`` for
        numeric targets and ``""`` for string targets.

        Raises
        ------
        KeyError
            If `datatype` is not one of the supported names.
        """
        converters = {"float": float, "int": int, "str": str}
        # Accept both spellings of a target: the short name and its dtype name.
        aliases = {name: name for name in converters}
        aliases.update(
            {
                dtype: name
                for name, dtype in self.dict_dtypes.items()
                if name in converters
            }
        )
        target = aliases[datatype]

        if type(value).__name__ == target:
            return value
        try:
            return converters[target](value)
        except (ValueError, TypeError, OverflowError):
            # e.g. int("abc"), float(None), int(float("inf"))
            return np.nan if target in ("float", "int") else ""

    def _remove_duplicate_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Remove duplicate columns based on column name.

        Only the first column for each label is kept and the original column
        order is preserved. The selection is done with a positional boolean
        mask because selecting by a list of labels would return every column
        that carries a repeated label.
        """
        return df.loc[:, ~df.columns.duplicated()]
Instance variables
var df
var dict_dtypes
Methods
def clean_frame(self,
sample_frac: float = 0.1,
fast_execution: bool = True,
two_date_formats: bool = True,
**kwargs) -> pandas.core.frame.DataFrame
DataFrame cleaning main function
Parameters
sample_frac : float — The fraction of rows to use for date type inference. Default is 0.1, i.e., 10%.
fast_execution : bool — If False, run an extra element-wise text cleanup pass (the docstring describes this as pandas `applymap`). Default is True.
Keyword Arguments:
no_emoji : bool — By default it is set to False. If True, removes all emojis from text data. Works only when fast_execution = False.
title_mode : bool — By default it is set to True. If False, converts the text to lowercase. Works only when fast_execution = False. By default, converts everything to title.