Module likelihood.models.simulation
Functions
def categories_by_quartile(df: pandas.core.frame.DataFrame, column: str) ‑> Tuple[str, str]
-
Expand source code
def categories_by_quartile(df: DataFrame, column: str) -> Tuple[str, str]:
    """Return the (least frequent, most frequent) category of *column*.

    The value counts of *column* define a frequency distribution; counts at or
    below its first quartile form the "rare" pool and counts at or above its
    third quartile form the "common" pool. The rarest category of the rare
    pool and the commonest category of the common pool are returned. Either
    element is None when its pool is empty.
    """
    counts = df[column].value_counts()
    lower_cut = counts.quantile(0.25)
    upper_cut = counts.quantile(0.75)
    rare = counts[counts <= lower_cut]
    common = counts[counts >= upper_cut]
    rare_category = None if rare.empty else rare.idxmin()
    common_category = None if common.empty else common.idxmax()
    return rare_category, common_category
Classes
class SimulationEngine (use_scaler: bool = False, **kwargs)
-
Expand source code
class SimulationEngine(FeatureSelection):
    """
    Predictive model using multiple linear regression for numerical target
    variables and multiple logistic regression for categorical target
    variables.

    The class provides methods for training the model on a given dataset,
    making predictions, and evaluating the model's performance.

    Key features:
    - Supports both numerical and categorical target variables, automatically
      selecting the appropriate regression method.
    - Includes methods for data preprocessing, model fitting, prediction, and
      evaluation metrics.
    - Designed to be flexible and user-friendly, allowing for easy integration
      with various datasets.

    Usage:
    - Instantiate the class, then call ``fit`` with the training data to train
      the model.
    - Use ``predict`` to generate predictions on new data.
    - Use ``get_proba`` / ``pred_outliers`` for probability estimates and
      outlier labelling.
    """

    def __init__(self, use_scaler: bool = False, **kwargs):
        # df: training data, set by fit(). n_importances: number of feature
        # importances, set by fit(). proba_dict: per-column probability
        # metadata built by fit(). Remaining kwargs go to FeatureSelection.
        self.df = pd.DataFrame()
        self.n_importances = None
        self.use_scaler = use_scaler
        self.proba_dict = {}
        super().__init__(**kwargs)

    def predict(self, df: DataFrame, column: str) -> np.ndarray | list:
        """Predict *column* for the rows of *df* using the fitted weights.

        Returns an ndarray for numeric targets, or a list of decoded labels
        for categorical targets.
        """
        # Let us assign the dictionary entries corresponding to the column.
        # w_dict is populated by the inherited feature-selection step.
        w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
        df = df[names_cols].copy()
        # Change the scale of the DataFrame: re-fit the scaler on the
        # training numerics, then rescale the incoming rows the same way.
        dataset = self.df.copy()
        dataset.drop(columns=column, inplace=True)
        numeric_df = dataset.select_dtypes(include="number")
        if self.use_scaler:
            # DataScaler is a project helper; assumed to normalise columns
            # (transposed layout) — TODO confirm its contract.
            scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
            _ = scaler.rescale()
            dataset_ = df.copy()
            numeric_df = dataset_.select_dtypes(include="number")
            numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
            numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
            for col in numeric_df.columns:
                df[col] = numeric_df[col].values
        # Encoding the DataFrame: map remaining object columns through the
        # per-column encoding dictionaries learned at fit time.
        for num, colname in enumerate(dfe._encode_columns):
            if df[colname].dtype == "object":
                encode_dict = dfe.encoding_list[num]
                df[colname] = df[colname].apply(
                    dfe._code_transformation_to, dictionary_list=encode_dict
                )
        # Prediction: single linear layer (features @ weights).
        y = df.to_numpy() @ w
        # Categorical column: decode one-hot scores back to labels.
        if quick_encoder != None:
            one_hot = OneHotEncoder()
            y = one_hot.decode(y)
            encoding_dic = quick_encoder.decoding_list[0]
            y = [encoding_dic[item] for item in y]
        # Numeric column
        else:
            if self.use_scaler:
                # scale output back from the scaler's range to data units
                # (assumes scaled range [-1, 1] — TODO confirm).
                y += 1
                y /= 2
                y = y * (self.df[column].max() - self.df[column].min())
        return y[:]

    def _encode(self, df: DataFrame) -> Dict[str, float]:
        """Map each category of the (single-column) *df* to its relative
        frequency, with keys ordered from rarest to most common."""
        df = df.copy()
        column = df.columns[0]
        frec = df[column].value_counts() / len(df)
        df.loc[:, "frec"] = df[column].map(frec)
        df.sort_values("frec", inplace=True)
        keys = df[column].to_list()
        values = df["frec"].to_list()
        return dict(zip(keys, values))

    def fit(self, df: DataFrame, n_importances: int, **kwargs) -> None:
        """Train the model and build per-column probability metadata.

        Parameters:
            df (DataFrame): Training data.
            n_importances (int): Number of feature importances, forwarded to
                the inherited feature-selection step.
            **kwargs: Optional ``poly`` (polynomial degree for the CDF fit,
                default 9) and ``plot`` (bool, default False).
        """
        self.df = df
        self.n_importances = n_importances
        # We run the feature selection algorithm (inherited; populates
        # self.w_dict with per-target weights and encoders).
        self.get_digraph(self.df, self.n_importances, self.use_scaler)
        proba_dict_keys = list(self.w_dict.keys())
        # Placeholder values; every key is overwritten in the loop below.
        self.proba_dict = dict(zip(proba_dict_keys, [i for i in range(len(proba_dict_keys))]))
        for key in proba_dict_keys:
            # x = (raw values, frequency dict for object columns else None)
            x = (
                self.df[key].values,
                None if self.df[key].dtype != "object" else self._encode(self.df[[key]]),
            )
            poly = kwargs.get("poly", 9)
            plot = kwargs.get("plot", False)
            if not x[1]:
                # Numeric column: limits at mean +/- 1.5 standard deviations,
                # then fit a polynomial CDF (cdf is a project helper that
                # returns (callable, cdf values, support grid) — TODO confirm).
                media = self.df[key].mean()
                standard_deviation = self.df[key].std()
                lower_limit = media - 1.5 * standard_deviation
                upper_limit = media + 1.5 * standard_deviation
                if plot:
                    print(f"Cumulative Distribution Function ({key})")
                f, cdf_, ox = cdf(x[0].flatten(), poly=poly, plot=plot)
            else:
                # Categorical column: limits are the frequencies of the rarest
                # and most common categories (0 when the category is missing).
                f, ox = None, None
                least_frequent_category, most_frequent_category = categories_by_quartile(
                    self.df[[key]], key
                )
                lower_limit = x[1].get(least_frequent_category, 0)
                upper_limit = x[1].get(most_frequent_category, 0)
            # Tuple layout: (cdf callable or None, frequency dict or None,
            # half of mean grid spacing or None, lower bound, upper bound).
            self.proba_dict[key] = (
                f if f else None,
                x[1],
                (np.mean(np.abs(np.diff(ox))) / 2.0 if isinstance(ox, np.ndarray) else None),
                f(lower_limit) if f else lower_limit,
                f(upper_limit) if f else upper_limit,
            )

    def get_proba(self, value: Union[Union[float, int], str] | list, colname: str) -> List[float]:
        """Approximate the probability of each value in *value* for *colname*.

        Numeric values use the fitted CDF: F(v) - F(v - step). Categorical
        values are looked up in the frequency table (0 when unseen).
        """
        # Normalise the input into a plain list of scalars.
        value = (
            value
            if isinstance(value, list)
            else value.flatten().tolist() if isinstance(value, np.ndarray) else [value]
        )
        return [
            (
                self.proba_dict[colname][0](val)
                - self.proba_dict[colname][0](val - self.proba_dict[colname][2])
                if (isinstance(val, float) or isinstance(val, int))
                else self.proba_dict[colname][1].get(val, 0)
            )
            for val in value
        ]

    def pred_outliers(self, value: Union[Union[float, int], str] | list, colname: str) -> List[str]:
        """Label each value "inlier" when its estimated probability lies
        strictly between the bounds computed at fit time, else "outlier"."""
        return [
            (
                "inlier"
                if (self.proba_dict[colname][3] < val < self.proba_dict[colname][4])
                else "outlier"
            )
            for val in self.get_proba(value, colname)
        ]

    def _clean_data(self, df: DataFrame) -> DataFrame:
        """Replace infinities and blank strings with NaN, run the project's
        NaN/inf check, and reset the index. NOTE(review): the inplace replace
        calls mutate the caller's DataFrame."""
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.replace(" ", np.nan, inplace=True)
        df = check_nan_inf(df)
        df = df.reset_index()
        df = df.drop(columns=["index"])
        return df

    def save(self, filename: str = "./simulation_model") -> None:
        """
        Save the state of the SimulationEngine to a file.

        Parameters:
            filename (str): The name of the file where the object will be saved.
        """
        filename = filename if filename.endswith(".pkl") else filename + ".pkl"
        with open(filename, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(filename: str = "./simulation_model"):
        """
        Load the state of a SimulationEngine from a file.

        Parameters:
            filename (str): The name of the file containing the saved object.

        Returns:
            SimulationEngine: A new instance of SimulationEngine with the loaded state.
        """
        # NOTE(review): pickle.load on untrusted files executes arbitrary code.
        filename = filename + ".pkl" if not filename.endswith(".pkl") else filename
        with open(filename, "rb") as f:
            return pickle.load(f)
This class implements a predictive model that utilizes multiple linear regression for numerical target variables and multiple logistic regression for categorical target variables.
The class provides methods for training the model on a given dataset, making predictions, and evaluating the model's performance.
Key features: - Supports both numerical and categorical target variables, automatically selecting the appropriate regression method. - Includes methods for data preprocessing, model fitting, prediction, and evaluation metrics. - Designed to be flexible and user-friendly, allowing for easy integration with various datasets.
Usage: - Instantiate the class with the training data and target variable. - Call the fit method to train the model. - Use the predict method to generate predictions on new data. - Evaluate the model using built-in metrics for accuracy and error.
This class is suitable for applications in data analysis and machine learning, enabling users to leverage regression techniques for both numerical and categorical outcomes efficiently.
The initializer of the class. The use_scaler flag controls whether numeric features are rescaled during fitting and prediction; any additional keyword arguments are forwarded to the FeatureSelection base class.
Ancestors
Static methods
def load(filename: str = './simulation_model')
-
Expand source code
@staticmethod def load(filename: str = "./simulation_model"): """ Load the state of a SimulationEngine from a file. Parameters: filename (str): The name of the file containing the saved object. Returns: SimulationEngine: A new instance of SimulationEngine with the loaded state. """ filename = filename + ".pkl" if not filename.endswith(".pkl") else filename with open(filename, "rb") as f: return pickle.load(f)
Load the state of a SimulationEngine from a file.
Parameters
filename (str): The name of the file containing the saved object.
Returns
SimulationEngine
- A new instance of SimulationEngine with the loaded state.
Methods
def fit(self, df: pandas.core.frame.DataFrame, n_importances: int, **kwargs) ‑> None
-
Expand source code
def fit(self, df: DataFrame, n_importances: int, **kwargs) -> None:
    """Train the model and build per-column probability metadata.

    Parameters:
        df (DataFrame): Training data.
        n_importances (int): Number of feature importances, forwarded to
            the inherited feature-selection step.
        **kwargs: Optional ``poly`` (polynomial degree for the CDF fit,
            default 9) and ``plot`` (bool, default False).
    """
    self.df = df
    self.n_importances = n_importances
    # We run the feature selection algorithm (inherited; populates
    # self.w_dict with per-target weights and encoders).
    self.get_digraph(self.df, self.n_importances, self.use_scaler)
    proba_dict_keys = list(self.w_dict.keys())
    # Placeholder values; every key is overwritten in the loop below.
    self.proba_dict = dict(zip(proba_dict_keys, [i for i in range(len(proba_dict_keys))]))
    for key in proba_dict_keys:
        # x = (raw values, frequency dict for object columns else None)
        x = (
            self.df[key].values,
            None if self.df[key].dtype != "object" else self._encode(self.df[[key]]),
        )
        poly = kwargs.get("poly", 9)
        plot = kwargs.get("plot", False)
        if not x[1]:
            # Numeric column: limits at mean +/- 1.5 standard deviations,
            # then fit a polynomial CDF (cdf is a project helper that
            # returns (callable, cdf values, support grid) — TODO confirm).
            media = self.df[key].mean()
            standard_deviation = self.df[key].std()
            lower_limit = media - 1.5 * standard_deviation
            upper_limit = media + 1.5 * standard_deviation
            if plot:
                print(f"Cumulative Distribution Function ({key})")
            f, cdf_, ox = cdf(x[0].flatten(), poly=poly, plot=plot)
        else:
            # Categorical column: limits are the frequencies of the rarest
            # and most common categories (0 when the category is missing).
            f, ox = None, None
            least_frequent_category, most_frequent_category = categories_by_quartile(
                self.df[[key]], key
            )
            lower_limit = x[1].get(least_frequent_category, 0)
            upper_limit = x[1].get(most_frequent_category, 0)
        # Tuple layout: (cdf callable or None, frequency dict or None,
        # half of mean grid spacing or None, lower bound, upper bound).
        self.proba_dict[key] = (
            f if f else None,
            x[1],
            (np.mean(np.abs(np.diff(ox))) / 2.0 if isinstance(ox, np.ndarray) else None),
            f(lower_limit) if f else lower_limit,
            f(upper_limit) if f else upper_limit,
        )
def get_proba(self, value: float | int | str | list, colname: str) ‑> List[float]
-
Expand source code
def get_proba(self, value: Union[Union[float, int], str] | list, colname: str) -> List[float]: value = ( value if isinstance(value, list) else value.flatten().tolist() if isinstance(value, np.ndarray) else [value] ) return [ ( self.proba_dict[colname][0](val) - self.proba_dict[colname][0](val - self.proba_dict[colname][2]) if (isinstance(val, float) or isinstance(val, int)) else self.proba_dict[colname][1].get(val, 0) ) for val in value ]
def pred_outliers(self, value: float | int | str | list, colname: str) ‑> List[str]
-
Expand source code
def pred_outliers(self, value: Union[Union[float, int], str] | list, colname: str) -> List[str]: return [ ( "inlier" if (self.proba_dict[colname][3] < val < self.proba_dict[colname][4]) else "outlier" ) for val in self.get_proba(value, colname) ]
def predict(self, df: pandas.core.frame.DataFrame, column: str) ‑> numpy.ndarray | list
-
Expand source code
def predict(self, df: DataFrame, column: str) -> np.ndarray | list:
    """Predict *column* for the rows of *df* using the fitted weights.

    Returns an ndarray for numeric targets, or a list of decoded labels
    for categorical targets.
    """
    # Let us assign the dictionary entries corresponding to the column.
    # w_dict is populated by the inherited feature-selection step.
    w, quick_encoder, names_cols, dfe, numeric_dict = self.w_dict[column]
    df = df[names_cols].copy()
    # Change the scale of the DataFrame: re-fit the scaler on the
    # training numerics, then rescale the incoming rows the same way.
    dataset = self.df.copy()
    dataset.drop(columns=column, inplace=True)
    numeric_df = dataset.select_dtypes(include="number")
    if self.use_scaler:
        # DataScaler is a project helper; assumed to normalise columns
        # (transposed layout) — TODO confirm its contract.
        scaler = DataScaler(numeric_df.copy().to_numpy().T, n=None)
        _ = scaler.rescale()
        dataset_ = df.copy()
        numeric_df = dataset_.select_dtypes(include="number")
        numeric_scaled = scaler.rescale(dataset_=numeric_df.to_numpy())
        numeric_df = pd.DataFrame(numeric_scaled.T, columns=numeric_df.columns)
        for col in numeric_df.columns:
            df[col] = numeric_df[col].values
    # Encoding the DataFrame: map remaining object columns through the
    # per-column encoding dictionaries learned at fit time.
    for num, colname in enumerate(dfe._encode_columns):
        if df[colname].dtype == "object":
            encode_dict = dfe.encoding_list[num]
            df[colname] = df[colname].apply(
                dfe._code_transformation_to, dictionary_list=encode_dict
            )
    # Prediction: single linear layer (features @ weights).
    y = df.to_numpy() @ w
    # Categorical column: decode one-hot scores back to labels.
    if quick_encoder != None:
        one_hot = OneHotEncoder()
        y = one_hot.decode(y)
        encoding_dic = quick_encoder.decoding_list[0]
        y = [encoding_dic[item] for item in y]
    # Numeric column
    else:
        if self.use_scaler:
            # scale output back from the scaler's range to data units
            # (assumes scaled range [-1, 1] — TODO confirm).
            y += 1
            y /= 2
            y = y * (self.df[column].max() - self.df[column].min())
    return y[:]
def save(self, filename: str = './simulation_model') ‑> None
-
Expand source code
def save(self, filename: str = "./simulation_model") -> None:
    """
    Save the state of the SimulationEngine to a file.

    Parameters:
        filename (str): The name of the file where the object will be saved.
    """
    # Append the .pkl extension when missing, then pickle the whole engine.
    if not filename.endswith(".pkl"):
        filename = filename + ".pkl"
    with open(filename, "wb") as handle:
        pickle.dump(self, handle)
Save the state of the SimulationEngine to a file.
Parameters
filename (str): The name of the file where the object will be saved.
Inherited members