Module pydbsmgr.main
Functions
def check_dtypes(dataframe: pandas.core.frame.DataFrame, datatypes: pandas.core.series.Series) ‑> pandas.core.frame.DataFrame
-
Expand source code
def check_dtypes(dataframe: DataFrame, datatypes: Series) -> DataFrame:
    """
    Checks and updates the data types of columns in a `DataFrame`.

    Every column whose declared type is `object` or `datetime64[ns]` is passed
    element-wise through `clean_and_convert_to` and `correct_nan`, and then
    stripped of surrounding whitespace. If stripping fails (the column holds
    values that are not `str`), a conversion to `datetime64[ns]` is attempted;
    if that fails too, a message is printed and the column is left as is.

    Parameters
    ----------
    dataframe : `DataFrame`
        The `DataFrame` to check and update the data types. It is modified in place.
    datatypes : `Series`
        The `Series` containing the desired data types for each column in the
        `DataFrame`, matched to `dataframe.columns` by position.

    Returns
    -------
    dataframe : `DataFrame`
        The `DataFrame` with updated data types (the same object that was passed in).
    """
    cols = dataframe.columns

    for column_index, datatype in enumerate(datatypes):
        if not (datatype == "object" or datatype == "datetime64[ns]"):
            continue

        column = cols[column_index]
        dataframe[column] = dataframe[column].apply(clean_and_convert_to)
        dataframe[column] = dataframe[column].apply(correct_nan)
        try:
            dataframe[column] = dataframe[column].map(str.strip)
        except Exception:
            # `str.strip` rejects non-string values; fall back to a datetime cast.
            # `Exception` (not a bare `except`) so Ctrl-C / SystemExit still propagate.
            try:
                dataframe[column] = dataframe[column].astype("datetime64[ns]")
            except Exception:
                warning_type = "UserWarning"
                # The one-element tuple keeps `%` formatting safe for tuple labels.
                msg = (
                    "It was not possible to convert the column {%s} to datetime64[ns] type"
                    % (column,)
                )
                print(f"{warning_type}: {msg}")
    return dataframe
Checks and updates the data types of columns in a
DataFrame
.
Parameters
dataframe
:DataFrame
- The
DataFrame
to check and update the data types. datatypes
:Series
- The
Series
containing the desired data types for each column in the DataFrame
.
Returns
dataframe
:DataFrame
- The
DataFrame
with updated data types.
def check_if_contains_dates(input_string: str) ‑> bool
-
Expand source code
def check_if_contains_dates(input_string: str) -> bool:
    """Tell whether the given value contains a year-first or year-last numeric date."""
    if input_string == "":
        return False

    text = str(input_string)
    # Year-first (YYYY-M-D) is tried before year-last (D-M-YYYY); "-" and "/" both separate.
    if re.search(r"\d{4}(-|/)\d{1,2}(-|/)\d{1,2}", text):
        return True
    if re.search(r"\d{1,2}(-|/)\d{1,2}(-|/)\d{4}", text):
        return True
    return False
Check if a string contains date.
def check_if_isemail(check_email: str) ‑> Tuple[str, bool]
-
Expand source code
def check_if_isemail(check_email: str) -> Tuple[str, bool]:
    """
    Detects an e-mail address by the presence of `@` and cleans it when found.

    Parameters
    ----------
    check_email : `str`
        The value to inspect. It is converted with `str` before searching for `@`.

    Returns
    -------
    check_email, found_email : `str`, `bool`
        When `@` is present: the value passed through `clean` (as a `str`) and `True`.
        Otherwise: the input exactly as received and `False`.
    """
    if "@" not in str(check_email):
        return check_email, False

    cleaned_email = str(clean(check_email))
    print(f"An e-mail has been detected.")
    return cleaned_email, True
Checks if a string is an email address and returns the cleaned string and a flag indicating if the string is an email.
Parameters
check_email
:str
- The input string to be checked for an email address.
Returns
check_email
,found_email
:str
,
bool
- A tuple containing the cleaned string and a boolean flag indicating if an email address was found.
def clean(dirty_string: str,
pattern: str = '[a-zA-Zñáéíóú_@.0-9]+\\b',
no_emoji: bool = False,
title_mode: bool = False) ‑> str-
Expand source code
def clean(
    dirty_string: str,
    pattern: str = r"[a-zA-Zñáéíóú_@.0-9]+\b",
    no_emoji: bool = False,
    title_mode: bool = False,
) -> str:
    """
    Receive a string and clean it of special characters

    The string is lower-cased and split on whitespace; within every word only
    the fragments matching `pattern` are kept. Words left empty by the filter
    are dropped so that no double spaces appear in the result.

    Parameters
    ----------
    dirty_string : `str`
        string of characters
    pattern : `str`
        regular expression string describing the fragments to keep in each word
    no_emoji : `bool`
        if `True`, characters in the listed emoji ranges are removed before
        any other processing. By default it is set to `False`.
    title_mode : `bool`
        if `True`, the result is returned in title case. By default it is set to `False`.

    Returns
    -------
    result : `str`
        clean character string
    """
    if no_emoji:
        emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"
            "\U0001F300-\U0001F5FF"
            "\U0001F680-\U0001F6FF"
            "\U0001F1E0-\U0001F1FF"
            "]+",
            flags=re.UNICODE,
        )
        dirty_string = emoji_pattern.sub(r"", dirty_string)

    processed_words = (
        "".join(re.findall(pattern, word)) for word in dirty_string.lower().split()
    )
    # Skip words that were reduced to nothing: joining them would leave
    # consecutive spaces inside the result, which `strip` cannot remove.
    result = " ".join(word for word in processed_words if word)

    return result.title() if title_mode else result
Receive a string and clean it of special characters
Parameters
dirty_string
:str
- string of characters
pattern
:str
- regular expression string
Returns
result
:str
- clean character string
def clean_and_convert_to(x: str) ‑> str
-
Expand source code
def clean_and_convert_to(x: str) -> str: """ Performs cleaning and some conversions on a `str`. Parameters ---------- x : `str` The input string to be cleaned and converted. Returns ------- x : `str` The cleaned and converted string. """ # Consider cases where a number is passed as a `str` if is_number_regex(str(x)): if str(x).find(".") != -1: try: return float(x) except: # Could not convert to float, converted to `np.nan`. return np.nan else: try: return int(x) except: # Could not convert to `int`, converted to `np.nan`. return np.nan else: # Consider cases in which a `float` number is passed as a `str` and is erroneous if str(x).find(".") != -1: try: return float(x) except: # Could not convert {x} to float, converting to `str`... x = str(x) # Successfully converted {x} to `str`. # Cases in which we have an identifier with numbers and letters else: result = re.findall(r"^[A-Za-z0-9]+$", str(x)) try: return result[0] # Case in which none of the above applies except: x = str(x) x = remove_char(x) try: x, find_ = check_if_isemail(x) if (x.find("/") != -1 or x.find("-")) != -1 and not (x.find("//") or x.find("\\")) != -1: x_ = x.replace("/", "") x_ = x_.replace("-", "") if len(x_) == 8: x = convert_date(x_) else: if str(x_).find(":") != -1: x = convert_date(x_[:8]) else: # No date found. x = clean(x) x = x.title() else: if not find_: if str(x).find(".") != -1: x_ = x.replace(".", "") if len(x) == 8: x = convert_date(x_) else: if x.find("//") == -1: x_ = x.replace(".", " ") x_ = " ".join(x_.split()) x_ = clean(x_) x = x_.title() else: x = clean(x) x = " ".join(x.split()) x = x.title() except: print(f"No transformation has been performed, the character will be returned as it came.") return x
Performs cleaning and some conversions on a
str
.
Parameters
x
:str
- The input string to be cleaned and converted.
Returns
x
:str
- The cleaned and converted string.
def clean_transform(col_index: pandas.core.indexes.base.Index,
mode: bool = True,
remove_spaces: bool = True,
remove_numeric: bool = True) ‑> List[str]-
Expand source code
def clean_transform(
    col_index: Index,
    mode: bool = True,
    remove_spaces: bool = True,
    remove_numeric: bool = True,
) -> List[str]:
    """
    Cleans every name of a column index by delegating to `clean_transform_helper`.

    Parameters
    ----------
    col_index : `Index`
        The column index to be transformed.
    mode : `bool`
        Forwarded to `clean_transform_helper` as `mode`. By default it is set to `True`.
    remove_spaces : `bool`
        Forwarded to `clean_transform_helper` as `remove_spaces`. By default it is set to `True`.
    remove_numeric : `bool`
        Forwarded to `clean_transform_helper` as `remove_numeric`. By default it is set to `True`.

    Returns
    -------
    col_name_list : `List[str]`
        The transformed column names, in the order of `col_index`.
    """
    col_name_list = []
    for column_name in col_index:
        transformed = clean_transform_helper(
            column_name,
            mode=mode,
            remove_spaces=remove_spaces,
            remove_numeric=remove_numeric,
        )
        col_name_list.append(transformed)
    return col_name_list
Transforms a column index by cleaning the column names and if needed makes them capital.
Parameters
col_index
:Index
- The column index to be transformed.
mode
:bool
- Indicates if names will be capitalized. By default it is set to
True
.
Returns
col_name_list
:List[str]
- The transformed column names as a
list
of strings.
def clean_transform_helper(col: str,
mode: bool = True,
remove_numeric: bool = True,
remove_spaces: bool = True) ‑> str-
Expand source code
def clean_transform_helper(
    col: str, mode: bool = True, remove_numeric: bool = True, remove_spaces: bool = True
) -> str:
    """
    Transforms a column name by cleaning the column name and if needed makes it capital.

    Parameters
    ----------
    col : `str`
        The column name to be transformed. Non-string labels (for example
        integer column labels) are converted with `str` first.
    mode : `bool`
        Indicates if names will be capitalized. By default it is set to `True`.
    remove_numeric : `bool`
        Indicates if numeric characters will be removed. By default it is set to `True`.
    remove_spaces : `bool`
        Indicates if spaces will be removed. By default it is set to `True`.

    Returns
    -------
    col_name : `str`
        The transformed column name.
    """
    # Coerce before cleaning: `clean` calls `.lower()` on its argument, so a
    # non-string label would fail there if the conversion happened afterwards.
    col_name = remove_char(clean(str(col)))
    if mode:
        col_name = col_name.title()
    if remove_numeric:
        # Stripping digits can expose leading/trailing blanks, hence `strip`.
        col_name = remove_numeric_char(col_name).strip()
    if remove_spaces:
        col_name = col_name.replace(" ", "_").replace("-", "_").replace("/", "_")
    return col_name
Transforms a column name by cleaning the column name and if needed makes it capital.
Parameters
col
:str
- The column name to be transformed.
mode
:bool
- Indicates if names will be capitalized. By default it is set to
True
. remove_numeric
:bool
- Indicates if numeric characters will be removed. By default it is set to
True
. remove_spaces
:bool
- Indicates if spaces will be removed. By default it is set to
True
.
Returns
col_name
:str
- The transformed column name.
def clearConsole()
-
Expand source code
def clearConsole():
    """Clear the terminal by running `cls` when `os.name` is "nt" or "dos", else `clear`."""
    os.system("cls" if os.name in ("nt", "dos") else "clear")
def convert_date(date_string: str) ‑> str
-
Expand source code
def convert_date(date_string: str) -> str:
    """
    Converts a `str` of a date to a proper `datetime64[ns]` format.

    The value is parsed as `%Y%m%d` first and as `%d%m%Y` second. When neither
    format applies, the input is returned as text, truncated to ten characters.

    Parameters
    ----------
    date_string : `str`
        The input string representing a date, without separators.

    Returns
    -------
    proper_date : `str`
        The date string in the proper format `YYYY-MM-DD`, or the first ten
        characters of `str(date_string)` if it could not be parsed.
    """
    for date_format in ("%Y%m%d", "%d%m%Y"):
        try:
            parsed = pd.to_datetime(date_string, format=date_format, errors="raise")
        except (ValueError, TypeError, OverflowError):
            continue
        # The text form starts with `YYYY-MM-DD`; drop the time part.
        return str(parsed)[:10]

    # Same result `errors="ignore"` used to give (the untouched input),
    # without relying on that deprecated option.
    return str(date_string)[:10]
Converts a
str
of a date to a proper datetime64[ns]
format.
Parameters
date_string
:str
- The input string representing a date.
Returns
proper_date
:str
- The date string in the proper format
YYYY-MM-DD
.
def correct_nan(check_missing: str) ‑> str
-
Expand source code
def correct_nan(check_missing: str) -> str:
    """
    Replaces a missing-value marker with the empty `str`.

    Parameters
    ----------
    check_missing : `str`
        The value to inspect. It counts as missing when its text form,
        lower-cased, equals `"nan"`.

    Returns
    -------
    check_missing : `str`
        An empty `str` for a missing value, otherwise the input unchanged.
    """
    looks_missing = str(check_missing).lower() == "nan"
    return "" if looks_missing else check_missing
Corrects the format of missing values in a
str
to the correct empty str
.
Parameters
check_missing
:str
- The string to be checked for incorrect missing value format.
Returns
check_missing
:str
- The corrected string format or empty
str
.
def drop_empty_columns(df_: pandas.core.frame.DataFrame) ‑> pandas.core.frame.DataFrame
-
Expand source code
def drop_empty_columns(df_: DataFrame) -> DataFrame:
    """
    Returns a copy of the dataframe without the columns whose values are all null.
    """
    # A column is kept unless its null count equals its length.
    cols_to_keep = [
        name for name in df_.columns if pd.isnull(df_[name]).sum() != len(df_[name])
    ]
    return df_[cols_to_keep].copy()
Function that removes empty columns
def get_date_format(input_string: str) ‑> str
-
Expand source code
def get_date_format(input_string: str) -> str:
    """
    Infer the date format from a given string.

    The patterns are tried in order and the label of the first one that
    matches anywhere in the text is returned.

    Parameters
    ----------
    input_string : `str`
        The text that may contain a date using `-` or `/` as separators.

    Returns
    -------
    `str`
        One of `%Y%m%d`, `%Y%d%m`, `%d%m%Y`, `%m%d%Y`, `dayfirst`,
        `monthfirst`, or an empty `str` when no pattern matches.
    """
    regex_formats = [
        r"\d{4}(-|/)[0-1]+[0-9](-|/)[0-3]+[0-9]",
        r"\d{4}(-|/)[0-3]+[0-9](-|/)[0-1]+[0-9]",
        r"[0-3]+[0-9](-|/)[0-1]+[0-9](-|/)\d{4}",
        r"[0-1]+[0-9](-|/)[0-3]+[0-9](-|/)\d{4}",
        r"([1-9]|[12][0-9]|3[01])(-|/)([1-9]|1[0-2])(-|/)\d{4}",
        r"([1-9]|1[0-2])(-|/)([1-9]|[12][0-9]|3[01])(-|/)\d{4}",
    ]
    formats = ["%Y%m%d", "%Y%d%m", "%d%m%Y", "%m%d%Y", "dayfirst", "monthfirst"]

    text = str(input_string)
    for position, regex in enumerate(regex_formats):
        match = re.search(regex, text)
        if match is None:
            continue
        if position == 2:
            # The day-first pattern also accepts 13-19 in the month slot; such a
            # value can only be a day, so the date is really month-first.
            # The field is read from the matched text (not from fixed offsets of
            # the input) so dates embedded in longer text are handled too, and
            # `int` copes with leading zeros on its own.
            middle_field = re.split(r"[-/]", match.group(0))[1]
            if int(middle_field) > 12:
                return formats[3]
        return formats[position]
    return ""
Infer the date format from a given string.
def intersection_cols(dfs_: List[pandas.core.frame.DataFrame]) ‑> pandas.core.frame.DataFrame
-
Expand source code
def intersection_cols(dfs_: List[DataFrame]) -> List[DataFrame]:
    """
    Function that resolves columns issues of a `list` of dataframes

    Every dataframe is reduced to the columns that are present in all of them.
    The columns keep the order they have in the dataframe with the fewest columns.

    Parameters
    ----------
    dfs_ : List[DataFrame]
        The `list` of dataframes with columns to be resolved. The list is
        updated in place; the dataframes themselves are not modified.

    Returns
    -------
    dfs_ : List[DataFrame]
        The `list` of dataframes with the corrections in their columns (intersection).
    """
    if not dfs_:
        return dfs_

    # The narrowest frame bounds the intersection and fixes the column order.
    reference = min(dfs_, key=lambda frame: len(frame.columns))
    common = set(reference.columns)
    for frame in dfs_:
        common.intersection_update(frame.columns)

    # Walk the reference columns instead of the set to keep a stable order.
    shared_cols = [col for col in reference.columns if col in common]

    for i, frame in enumerate(dfs_):
        dfs_[i] = frame[shared_cols].copy()
    return dfs_
Function that resolves column issues of a
list
of dataframes.
Parameters
dfs_
:List[DataFrame]
- The
list
of dataframes whose columns are to be resolved.
Returns
dfs_
:List[DataFrame]
- The
list
of dataframes with the corrections in their columns (intersection).
def is_number_regex(s: str) ‑> bool
-
Expand source code
def is_number_regex(s: str) -> bool:
    """Returns `True` if string is a number: digits only, or digits, one dot and digits."""
    looks_decimal = re.match(r"^\d+?\.\d+?$", s) is not None
    return looks_decimal or s.isdigit()
Returns
True
if string is a number.
def remove_char(input_string: str) ‑> str
-
Expand source code
def remove_char(input_string: str) -> str:
    """
    Removes special characters from a string.

    The characters removed are `# $ * ? ! ( ) & %`. The result is then passed
    through `correct_nan`.

    Parameters
    ----------
    input_string : `str`
        The input string from which characters will be removed. A value that
        is not a `str` is left untouched by the removal step.

    Returns
    -------
    input_string : `str`
        The string with specified characters removed.
    """
    # Explicit type check instead of swallowing every error from `.replace`:
    # non-string values skip the removal step on purpose.
    if isinstance(input_string, str):
        # One pass over the text deletes all listed characters at once.
        input_string = input_string.translate(str.maketrans("", "", "#$*?!()&%"))
    return correct_nan(input_string)
Removes special characters from a string.
Parameters
input_string
:str
- The input string from which characters will be removed.
Returns
input_string
:str
- The string with specified characters removed.
def remove_numeric_char(input_string: str) ‑> str
-
Expand source code
def remove_numeric_char(input_string: str) -> str:
    """Strip every digit from a string.

    Parameters
    ----------
    input_string : `str`
        character string to be cleaned of numeric characters

    Returns
    -------
    `str`
        the same text with all characters matching `\\d` deleted
    """
    digit_pattern = re.compile(r"\d")
    return digit_pattern.sub("", input_string)
Remove all numeric characters from a string.
Parameters
input_string
:str
- character string to be cleaned of numeric characters
Returns
str
clean character string