Module pydbsmgr.main

Functions

def check_dtypes(dataframe: pandas.core.frame.DataFrame, datatypes: pandas.core.series.Series) ‑> pandas.core.frame.DataFrame
Expand source code
def check_dtypes(dataframe: DataFrame, datatypes: Series) -> DataFrame:
    """
    Checks and updates the data types of columns in a `DataFrame`.

    Every column whose declared type is ``"object"`` or ``"datetime64[ns]"``
    is passed value by value through `clean_and_convert_to` and `correct_nan`.
    The column is then stripped of surrounding whitespace; if that fails
    (for instance because the column holds non-string values) a conversion
    to ``datetime64[ns]`` is attempted instead. If this also fails, a message
    is printed and the cleaned column is left as it is.

    Parameters
    ----------
    dataframe : `DataFrame`
        The `DataFrame` to check and update the data types. It is modified
        in place.
    datatypes : `Series`
        The `Series` containing the desired data types for each column in the
        `DataFrame`, matched to the columns by position. Entries beyond the
        number of columns are ignored.

    Returns
    -------
    dataframe : `DataFrame`
        The same `DataFrame` object, with updated columns.
    """
    # zip() pairs columns and types positionally and stops at the shorter of
    # the two, so a surplus entry in `datatypes` can no longer raise IndexError.
    for column, datatype in zip(dataframe.columns, datatypes):
        if not (datatype == "object" or datatype == "datetime64[ns]"):
            continue

        dataframe[column] = dataframe[column].apply(clean_and_convert_to).apply(correct_nan)
        try:
            # str.strip raises as soon as it meets a non-string value.
            dataframe[column] = dataframe[column].map(str.strip)
        except Exception:
            # Best effort: `Exception` (instead of a bare except) keeps
            # KeyboardInterrupt / SystemExit from being swallowed.
            try:
                dataframe[column] = dataframe[column].astype("datetime64[ns]")
            except Exception:
                warning_type = "UserWarning"
                # The 1-tuple keeps %-formatting safe for tuple column labels.
                msg = "It was not possible to convert the column {%s} to datetime64[ns] type" % (
                    column,
                )
                print(f"{warning_type}: {msg}")
    return dataframe

Checks and updates the data types of columns in a DataFrame.

Parameters

dataframe : DataFrame
The DataFrame to check and update the data types.
datatypes : Series
The Series containing the desired data types for each column in the DataFrame.

Returns

dataframe : DataFrame
The DataFrame with updated data types.
def check_if_contains_dates(input_string: str) ‑> bool
Expand source code
def check_if_contains_dates(input_string: str) -> bool:
    """
    Check if a string contains date.

    Two layouts are recognised anywhere in the text, with ``-`` or ``/`` as
    separators: a four-digit year first (``YYYY-M-D``) or a four-digit year
    last (``D-M-YYYY``). An empty string never contains a date.
    """
    if input_string == "":
        return False

    text = str(input_string)
    year_first = re.search(r"\d{4}(-|/)\d{1,2}(-|/)\d{1,2}", text)
    if year_first:
        return True
    year_last = re.search(r"\d{1,2}(-|/)\d{1,2}(-|/)\d{4}", text)
    return bool(year_last)

Check if a string contains date.

def check_if_isemail(check_email: str) ‑> Tuple[str, bool]
Expand source code
def check_if_isemail(check_email: str) -> Tuple[str, bool]:
    """
    Detects an e-mail address by the presence of ``@`` and cleans it.

    Parameters
    ----------
    check_email : `str`
        The input value to be checked. It is converted with `str` for the
        check itself.

    Returns
    -------
    check_email, found_email : `str`, `bool`
        When an ``@`` is present: the result of `clean` applied to the input
        and `True` (a notice is printed as well). Otherwise the input is
        returned untouched together with `False`.
    """
    found_email = "@" in str(check_email)
    if found_email:
        check_email = str(clean(check_email))
        print("An e-mail has been detected.")
    return check_email, found_email

Checks if a string is an email address and returns the cleaned string and a flag indicating if the string is an email.

Parameters

check_email : str
The input string to be checked for an email address.

Returns

check_email, found_email : str, bool
A tuple containing the cleaned string and a boolean flag indicating if an email address was found.
def clean(dirty_string: str,
pattern: str = '[a-zA-Zñáéíóú_@.0-9]+\\b',
no_emoji: bool = False,
title_mode: bool = False) ‑> str
Expand source code
def clean(
    dirty_string: str,
    pattern: str = r"[a-zA-Zñáéíóú_@.0-9]+\b",
    no_emoji: bool = False,
    title_mode: bool = False,
) -> str:
    """
    Receive a string and clean it of special characters

    The text is lower-cased and split on whitespace; within each word only
    the fragments matching `pattern` are kept, and the words are joined back
    with single spaces. A word that is removed entirely still leaves its
    separating space behind, so the result may contain consecutive spaces
    in the middle (only the ends are stripped).

    Parameters
    ----------
    dirty_string : `str`
        string of characters
    pattern : `str`
        regular expression string describing the fragments to keep
    no_emoji : `bool`
        if `True`, characters in the listed emoji ranges are removed before
        any other processing. By default it is set to `False`.
    title_mode : `bool`
        if `True`, the result is returned in title case. By default it is
        set to `False`.

    Returns
    -------
    result : `str`
        clean character string
    """
    text = dirty_string
    if no_emoji:
        emoji_ranges = (
            "["
            "\U0001F600-\U0001F64F"
            "\U0001F300-\U0001F5FF"
            "\U0001F680-\U0001F6FF"
            "\U0001F1E0-\U0001F1FF"
            "]+"
        )
        text = re.sub(emoji_ranges, r"", text, flags=re.UNICODE)

    kept_fragments = []
    for word in text.lower().split():
        kept_fragments.append("".join(re.findall(pattern, word)))

    # Only leading/trailing spaces left behind by emptied words are removed.
    result = " ".join(kept_fragments).strip()
    return result.title() if title_mode else result

Receive a string and clean it of special characters

Parameters

dirty_string : str
string of characters
pattern : str
regular expression string

Returns

result : str
clean character string
def clean_and_convert_to(x: str) ‑> str
Expand source code
def clean_and_convert_to(x: str) -> str:
    """
    Performs cleaning and some conversions on a `str`.

    The value is handled in this order:

    1. Text accepted by `is_number_regex` is returned as `float` (when it
       contains a dot) or `int`; if the conversion fails `np.nan` is returned.
    2. Other text containing a dot is returned as `float` when `float()`
       accepts it.
    3. Text made only of ASCII letters and digits is returned unchanged
       (as a `str`).
    4. Anything else is passed through `remove_char` and `check_if_isemail`,
       and then either converted with `convert_date` or cleaned with `clean`
       and title-cased.

    Parameters
    ----------
    x : `str`
        The input value to be cleaned and converted. Non-string values are
        inspected through `str(x)`.

    Returns
    -------
    x : `str`
        The cleaned and converted value. Despite the annotation this may be
        an `int`, a `float` or `np.nan` for numeric input (steps 1 and 2).
    """
    text = str(x)

    # Step 1: a number passed as a `str`.
    if is_number_regex(text):
        try:
            return float(x) if "." in text else int(x)
        except (TypeError, ValueError, OverflowError):
            # Could not convert to a number, converted to `np.nan`.
            return np.nan

    if "." in text:
        # Step 2: a `float` that `is_number_regex` did not recognise.
        try:
            return float(x)
        except (TypeError, ValueError, OverflowError):
            x = text
    else:
        # Step 3: an identifier with numbers and letters only.
        identifier = re.search(r"^[A-Za-z0-9]+$", text)
        if identifier is not None:
            return identifier.group(0)
        x = text

    # Step 4: free text, e-mails, dates.
    x = remove_char(x)
    try:
        x, find_ = check_if_isemail(x)
        has_date_separator = "/" in x or "-" in x
        # The original condition had misplaced parentheses, so the backslash
        # was only examined when "//" sat at index 0 and a leading "//" was
        # treated as absent. Both markers are now tested explicitly.
        looks_like_path = "//" in x or "\\" in x
        if has_date_separator and not looks_like_path:
            x_ = x.replace("/", "").replace("-", "")
            if len(x_) == 8:
                x = convert_date(x_)
            elif ":" in x_:
                # A time part follows the date; keep the first 8 characters.
                x = convert_date(x_[:8])
            else:
                # No date found.
                x = clean(x).title()
        elif not find_:
            if "." in x:
                x_ = x.replace(".", "")
                # NOTE(review): this measures `x` (dots included) while the
                # "/" and "-" branch above measures the stripped `x_`;
                # possibly `len(x_)` was intended - confirm before changing.
                if len(x) == 8:
                    x = convert_date(x_)
                elif "//" not in x:
                    x_ = " ".join(x.replace(".", " ").split())
                    x = clean(x_).title()
            else:
                x = " ".join(clean(x).split()).title()
    except Exception:
        # Best effort: keep the value produced so far instead of failing.
        print("No transformation has been performed, the character will be returned as it came.")
    return x

Performs cleaning and some conversions on a str.

Parameters

x : str
The input string to be cleaned and converted.

Returns

x : str
The cleaned and converted string.
def clean_transform(col_index: pandas.core.indexes.base.Index,
mode: bool = True,
remove_spaces: bool = True,
remove_numeric: bool = True) ‑> List[str]
Expand source code
def clean_transform(
    col_index: Index,
    mode: bool = True,
    remove_spaces: bool = True,
    remove_numeric: bool = True,
) -> List[str]:
    """
    Transforms a column index by applying `clean_transform_helper` to each name.

    Parameters
    ----------
    col_index : `Index`
        The column index to be transformed.
    mode : `bool`
        Indicates if names will be capitalized. By default it is set to `True`.
    remove_spaces : `bool`
        Forwarded to `clean_transform_helper`. By default it is set to `True`.
    remove_numeric : `bool`
        Forwarded to `clean_transform_helper`. By default it is set to `True`.

    Returns
    -------
    col_name_list : `List[str]`
        The transformed column names, in the order of `col_index`.
    """

    def _transform(col):
        return clean_transform_helper(
            col, mode=mode, remove_spaces=remove_spaces, remove_numeric=remove_numeric
        )

    return list(map(_transform, col_index))

Transforms a column index by cleaning the column names and if needed makes them capital.

Parameters

col_index : Index
The column index to be transformed.
mode : bool
Indicates if names will be capitalized. By default it is set to True.

Returns

col_name_list : List[str]
The transformed column names as a list of strings.
def clean_transform_helper(col: str,
mode: bool = True,
remove_numeric: bool = True,
remove_spaces: bool = True) ‑> str
Expand source code
def clean_transform_helper(
    col: str, mode: bool = True, remove_numeric: bool = True, remove_spaces: bool = True
) -> str:
    """
    Transforms a single column name.

    The name is first passed through `clean` and `remove_char`; the optional
    steps below are then applied in the order listed.

    Parameters
    ----------
    col : `str`
        The column name to be transformed.
    mode : `bool`
        Indicates if the name will be converted to title case. By default it
        is set to `True`.
    remove_numeric : `bool`
        Indicates if numeric characters will be removed (the result is also
        stripped of surrounding whitespace). By default it is set to `True`.
    remove_spaces : `bool`
        Indicates if spaces, ``-`` and ``/`` will be replaced by ``_``.
        By default it is set to `True`.

    Returns
    -------
    col_name : `str`
        The transformed column name.
    """
    cleaned = clean(col)
    col_name = remove_char(str(cleaned))

    if mode:
        col_name = col_name.title()
    if remove_numeric:
        without_digits = remove_numeric_char(col_name)
        col_name = without_digits.strip()
    if remove_spaces:
        for separator in (" ", "-", "/"):
            col_name = col_name.replace(separator, "_")

    return col_name

Transforms a column name by cleaning the column name and if needed makes it capital.

Parameters

col : str
The column name to be transformed.
mode : bool
Indicates if names will be capitalized. By default it is set to True.
remove_numeric : bool
Indicates if numeric characters will be removed. By default it is set to True.
remove_spaces : bool
Indicates if spaces will be removed. By default it is set to True.

Returns

col_name : str
The transformed column name.
def clearConsole()
Expand source code
def clearConsole():
    """Run the console-clearing shell command: ``cls`` when `os.name` is "nt" or "dos", ``clear`` otherwise."""
    os.system("cls" if os.name in ("nt", "dos") else "clear")
def convert_date(date_string: str) ‑> str
Expand source code
def convert_date(date_string: str) -> str:
    """
    Converts a compact date `str` to the ``YYYY-MM-DD`` text form.

    The input is parsed with the format ``%Y%m%d`` first and ``%d%m%Y``
    second. If neither format applies, no conversion is made.

    Parameters
    ----------
    date_string : `str`
        The input string representing a date, without separators.

    Returns
    -------
    proper_date : `str`
        The first 10 characters of the parsed timestamp's text form
        (``YYYY-MM-DD``), or the first 10 characters of `date_string`
        when it could not be parsed.
    """
    for date_format in ("%Y%m%d", "%d%m%Y"):
        try:
            parsed = pd.to_datetime(date_string, format=date_format, errors="raise")
        except (ValueError, TypeError, OverflowError):
            continue
        return str(parsed)[:10]

    # Both formats failed. The former fallback re-parsed with the deprecated
    # errors="ignore", which hands the input back when parsing fails; return
    # the input directly instead.
    return str(date_string)[:10]

Converts a str of a date to a proper datetime64[ns] format.

Parameters

date_string : str
The input string representing a date.

Returns

proper_date : str
The date string in the proper format YYYY-MM-DD.
def correct_nan(check_missing: str) ‑> str
Expand source code
def correct_nan(check_missing: str) -> str:
    """
    Replaces a "nan" marker by the empty `str`.

    Parameters
    ----------
    check_missing : `str`
        The value to be checked. It is compared through its lower-cased
        `str` form, so a float NaN and the text "NaN" are both matched.

    Returns
    -------
    check_missing : `str`
        The empty `str` when the value reads as "nan", otherwise the value
        exactly as received.
    """
    reads_as_nan = str(check_missing).lower() == "nan"
    return "" if reads_as_nan else check_missing

Corrects the format of missing values in a str to the correct empty str.

Parameters

check_missing : str
The string to be checked for incorrect missing value format.

Returns

check_missing : str
The corrected string format or empty str.
def drop_empty_columns(df_: pandas.core.frame.DataFrame) ‑> pandas.core.frame.DataFrame
Expand source code
def drop_empty_columns(df_: DataFrame) -> DataFrame:
    """
    Function that removes empty columns

    A column is considered empty when its number of null values equals its
    length. The input is not modified; a copy restricted to the remaining
    columns is returned.
    """
    cols_to_keep = [
        col for col in df_.columns if pd.isnull(df_[col]).sum() != len(df_[col])
    ]
    non_empty = df_[cols_to_keep]
    return non_empty.copy()

Function that removes empty columns

def get_date_format(input_string: str) ‑> str
Expand source code
def get_date_format(input_string: str) -> str:
    """
    Infer the date format from a given string.

    The text is tested against a fixed list of patterns (separators ``-`` or
    ``/``); the label of the first pattern found anywhere in the text is
    returned.

    Parameters
    ----------
    input_string : `str`
        The text to inspect. Non-string values are inspected through
        `str(input_string)`.

    Returns
    -------
    `str`
        One of ``"%Y%m%d"``, ``"%Y%d%m"``, ``"%d%m%Y"``, ``"%m%d%Y"``,
        ``"dayfirst"``, ``"monthfirst"``, or the empty `str` when no pattern
        is found.
    """
    regex_formats = [
        r"\d{4}(-|/)[0-1]+[0-9](-|/)[0-3]+[0-9]",
        r"\d{4}(-|/)[0-3]+[0-9](-|/)[0-1]+[0-9]",
        r"[0-3]+[0-9](-|/)[0-1]+[0-9](-|/)\d{4}",
        r"[0-1]+[0-9](-|/)[0-3]+[0-9](-|/)\d{4}",
        r"([1-9]|[12][0-9]|3[01])(-|/)([1-9]|1[0-2])(-|/)\d{4}",
        r"([1-9]|1[0-2])(-|/)([1-9]|[12][0-9]|3[01])(-|/)\d{4}",
    ]
    formats = ["%Y%m%d", "%Y%d%m", "%d%m%Y", "%m%d%Y", "dayfirst", "monthfirst"]

    text = str(input_string)
    for date_format, regex in zip(formats, regex_formats):
        match = re.search(regex, text)
        if match is None:
            continue
        if date_format == "%d%m%Y":
            # The day-first pattern also accepts 13-19 in the middle field;
            # such a value cannot be a month, so the date is month-first.
            # The field is read from the match (between the two separator
            # groups) rather than from fixed offsets 3:5, which failed for
            # dates not at the start of the text and for a "00" field.
            middle_field = text[match.end(1) : match.start(2)]
            if int(middle_field) > 12:
                return "%m%d%Y"
        return date_format

    return ""

Infer the date format from a given string.

def intersection_cols(dfs_: List[pandas.core.frame.DataFrame]) ‑> pandas.core.frame.DataFrame
Expand source code
def intersection_cols(dfs_: List[DataFrame]) -> List[DataFrame]:
    """
    Function that restricts a `list` of dataframes to their common columns

    Parameters
    ----------
    dfs_ : List[DataFrame]
        The `list` of dataframes whose columns are to be aligned. The list
        is updated in place: each entry is replaced by a copy holding only
        the shared columns.

    Returns
    -------
    dfs_ : List[DataFrame]
        The same `list`, where every dataframe keeps only the columns present
        in all of them (intersection), in the order in which they appear in
        the dataframe with the fewest columns. An empty `list` is returned
        unchanged.
    """
    if not dfs_:
        return dfs_

    # The narrowest frame bounds the intersection and fixes the column order,
    # which keeps the result deterministic (a set would scramble it).
    narrowest = min(dfs_, key=lambda frame: len(frame.columns))
    shared_cols = [
        col for col in narrowest.columns if all(col in frame.columns for frame in dfs_)
    ]

    for position, frame in enumerate(dfs_):
        dfs_[position] = frame[shared_cols].copy()

    return dfs_

Function that resolves column issues in a list of dataframes

Parameters

dfs_ : List[DataFrame]
The list of dataframes with columns to be resolved.

Returns

dfs_ : List[DataFrame]
The list of dataframes with the corrections in their columns (intersection).
def is_number_regex(s: str) ‑> bool
Expand source code
def is_number_regex(s: str) -> bool:
    """
    Returns `True` if string is a number.

    A number is either digits, a dot and digits (``"12.5"``) or a string for
    which `str.isdigit` is true. Signs and exponents are not accepted.
    """
    has_decimal_form = re.match(r"^\d+?\.\d+?$", s) is not None
    return has_decimal_form or s.isdigit()

Returns True if string is a number.

def remove_char(input_string: str) ‑> str
Expand source code
def remove_char(input_string: str) -> str:
    """
    Removes special characters from a string.

    The characters ``# $ * ? ! ( ) & %`` are deleted, and the result is then
    passed through `correct_nan`.

    Parameters
    ----------
    input_string : `str`
        The input string from which characters will be removed. A value that
        is not a `str` skips the character removal and is only passed to
        `correct_nan`.

    Returns
    -------
    input_string : `str`
        The string with specified characters removed.
    """
    # An explicit type check replaces the former bare `except: pass`, which
    # hid every error; one translate() pass replaces nine replace() calls.
    if isinstance(input_string, str):
        input_string = input_string.translate(str.maketrans("", "", "#$*?!()&%"))
    return correct_nan(input_string)

Removes special characters from a string.

Parameters

input_string : str
The input string from which characters will be removed.

Returns

input_string : str
The string with specified characters removed.
def remove_numeric_char(input_string: str) ‑> str
Expand source code
def remove_numeric_char(input_string: str) -> str:
    """Strip every digit from a string.

    Parameters
    ----------
    input_string : `str`
        character string to be cleaned of numeric characters

    Returns
    -------
    `str`
        the input with all characters matching ``\\d`` deleted
    """
    digit = re.compile(r"\d")
    return digit.sub("", input_string)

Remove all numeric characters from a string.

Parameters

input_string : str
character string to be cleaned of numeric characters

Returns

str clean character string