# sharkpy/learning.py
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import optuna
import optuna.logging
import lightgbm as lgb
import catboost as cb
from typing import Union, Optional, Any
import random
# Prediction intros for the learn function
[docs]
PREDICTION_INTROS = [
"π¦ Diving into {project_name}! Time to make some waves! π",
"π¦ Sharpening teeth on {project_name}! Ready to take a bite out of prediction! π¦",
"π¦ Commanding the seas of {project_name}! Prepare for precision strikes! π―",
"π¦ Swimming through {project_name}... hunting for patterns! π",
"π¦ Unleashing the predator on {project_name}! No pattern is safe! β‘",
"π¦ Lurking in the data streams of {project_name}... ready to strike! π",
"π¦ Circling {project_name} like the apex predictor I am! π",
"π¦ Scenting blood in the data waters of {project_name}! π©Έ",
"π¦ Preparing to feast on the patterns in {project_name}! π½οΈ",
"π¦ Navigating the deep data trenches of {project_name}! πΊοΈ"
]
[docs]
def learn(
self,
data: Union[str, pd.DataFrame],
project_name: str = "your data",
target: Optional[str] = None,
problem_type: Optional[str] = None,
model: Optional[Any] = None,
model_choice: Optional[str] = None,
detailed_stats: bool = False,
n_trials: int = 30,
verbose: bool = False
) -> 'Shark':
"""
Train a machine learning model using the provided data and parameters.
Parameters
----------
self : Shark
The Shark instance.
data : str or pandas.DataFrame
The dataset to use for training. Can be a file path (CSV) or a DataFrame.
project_name : str, optional
Name of the project for tracking and reporting.
target : str, optional
Name of the column to predict. If None, uses the last column.
problem_type : str, optional
Type of problem: "regression" or "classification". If None, tries to infer automatically.
model : sklearn.base.BaseEstimator, optional
A custom scikit-learn compatible model instance to use. If provided, overrides model_choice.
model_choice : str, optional
String identifier for built-in model selection. Options:
- "random_forest": RandomForestRegressor or RandomForestClassifier
- "svm": SVR or SVC
- "ridge": Ridge Regression (L2 regularization)
- "lasso": Lasso Regression (L1 regularization)
- "knn": K-Nearest Neighbors
- "xgboost": XGBoost with Optuna optimization
- "lightgbm": LightGBM with Optuna optimization
- "catboost": CatBoost with Optuna optimization
- None: LinearRegression or LogisticRegression (default)
detailed_stats : bool, optional
If True, uses statsmodels for detailed statistical analysis
n_trials : int, optional
Number of optimization trials for boosting models (default: 30)
verbose : bool, optional
If True, enables verbose logging for Optuna optimization (default: False)
Notes
-----
- Encodes categorical features and target automatically for classification.
- Performs K-Fold cross-validation and prints mean and std of scores.
- Fits the selected model on the entire dataset after cross-validation.
- Sets self.model, self.problem_type, self.features, self.target, and self.encoders.
- Warning: Avoid loading untrusted CSV files, as they may contain malicious data.
"""
print(random.choice(PREDICTION_INTROS).format(project_name=project_name))
# Set Optuna verbosity
optuna.logging.set_verbosity(optuna.logging.INFO if verbose else optuna.logging.WARNING)
# Load data if string path provided
if isinstance(data, str):
try:
data = pd.read_csv(data)
except Exception as e:
raise ValueError(f"π¦ Could not read data from {data}: {str(e)}")
if not isinstance(data, pd.DataFrame):
raise ValueError("π¦ Data must be a pandas DataFrame or a valid CSV file path")
# Set project name
self.project_name = project_name
# Select target and features
if target is None:
print("π¦ No target specified, using last column as target")
target = data.columns[-1]
if target not in data.columns:
raise ValueError(f"π¦ Target column '{target}' not found in data")
self.features = data.drop(columns=[target])
self.target = data[target]
# Infer problem type if not specified
if problem_type is None:
if pd.api.types.is_numeric_dtype(self.target):
self.problem_type = "regression"
print(f"π¦ Looks like a regression problem (numeric target: {target})")
else:
self.problem_type = "classification"
print(f"π¦ Looks like a classification problem (non-numeric target: {target})")
else:
self.problem_type = problem_type.lower()
if self.problem_type not in ["regression", "classification"]:
raise ValueError("π¦ Problem type must be 'regression' or 'classification'")
# Encode target for classification if necessary
if self.problem_type == "classification" and not pd.api.types.is_numeric_dtype(self.target):
print(f"π¦ Encoding categorical target '{target}' to numeric labels")
self.target_encoder = LabelEncoder()
self.target = pd.Series(self.target_encoder.fit_transform(self.target), index=self.target.index, name=self.target.name)
# Encode categorical features
self.feature_names = self.features.columns
for col in self.features.columns:
if self.features[col].dtype == "object":
print(f"π¦ Encoding categorical feature '{col}'")
self.features[col] = self.features[col].astype("category")
self.encoders[col] = dict(enumerate(self.features[col].cat.categories))
self.features[col] = self.features[col].cat.codes
# Model selection
if model is not None:
print(f"π¦ Using custom model: {type(model).__name__}")
self.model = model.fit(self.features, self.target)
else:
if model_choice is None:
model_choice = "linear_regression" if self.problem_type == "regression" else "logistic_regression"
print(f"π¦ No model specified, defaulting to {model_choice}")
model_choice = model_choice.lower()
if model_choice == "random_forest":
self.model = RandomForestRegressor(random_state=42).fit(self.features, self.target) if self.problem_type == "regression" else RandomForestClassifier(random_state=42).fit(self.features, self.target)
elif model_choice == "svm":
self.model = SVR().fit(self.features, self.target) if self.problem_type == "regression" else SVC().fit(self.features, self.target)
elif model_choice == "ridge":
self.model = Ridge(alpha=1.0, random_state=42).fit(self.features, self.target) if self.problem_type == "regression" else LogisticRegression(penalty='l2').fit(self.features, self.target)
elif model_choice == "lasso":
self.model = Lasso(alpha=1.0, random_state=42).fit(self.features, self.target) if self.problem_type == "regression" else LogisticRegression(penalty='l1', solver='liblinear').fit(self.features, self.target)
elif model_choice == "knn":
self.model = KNeighborsRegressor(n_neighbors=5).fit(self.features, self.target) if self.problem_type == "regression" else KNeighborsClassifier(n_neighbors=5).fit(self.features, self.target)
elif model_choice == "xgboost":
print("π¦ Optimizing XGBoost with Optuna...")
self.model = _create_optimized_xgboost(self.features, self.target, self.problem_type, n_trials)
elif model_choice == "lightgbm":
print("π¦ Optimizing LightGBM with Optuna...")
self.model = _create_optimized_lightgbm(self.features, self.target, self.problem_type, n_trials)
elif model_choice == "catboost":
print("π¦ Optimizing CatBoost with Optuna...")
self.model = _create_optimized_catboost(self.features, self.target, self.problem_type, n_trials)
else:
self.model = LinearRegression().fit(self.features, self.target) if self.problem_type == "regression" else LogisticRegression().fit(self.features, self.target)
# Detailed statistical analysis for regression
if detailed_stats and self.problem_type == "regression":
try:
X_with_const = sm.add_constant(self.features)
self.stats_model = sm.OLS(self.target, X_with_const).fit()
self.statistical_summary = self.stats_model.summary().as_text()
self.p_values = self.stats_model.pvalues
self.conf_intervals = self.stats_model.conf_int()
except Exception as e:
print(f"β οΈ Could not compute statistical details: {str(e)}")
print(f"π¦ Model training complete! Ready to make predictions.")
return self
[docs]
def _create_optimized_xgboost(X: pd.DataFrame, y: pd.Series, problem_type: str = "regression", n_trials: int = 30) -> Any:
"""
Create and optimize an XGBoost model using Optuna.
Parameters
----------
X : pd.DataFrame
Features DataFrame
y : pd.Series
Target series
problem_type : str
Type of problem: "regression" or "classification"
n_trials : int
Number of optimization trials (default: 30)
Returns
-------
model : Any
Trained XGBoost model with optimized parameters
"""
def objective(trial):
params = {
'max_depth': trial.suggest_int('max_depth', 3, 10),
'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
'n_estimators': trial.suggest_int('n_estimators', 50, 300),
'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
'subsample': trial.suggest_float('subsample', 0.6, 1.0),
'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
'gamma': trial.suggest_float('gamma', 0, 5),
'random_state': 42
}
if problem_type == "regression":
model = xgb.XGBRegressor(**params)
else:
model = xgb.XGBClassifier(**params)
return model.fit(X, y).score(X, y)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials)
if problem_type == "regression":
final_model = xgb.XGBRegressor(**study.best_params)
else:
final_model = xgb.XGBClassifier(**study.best_params)
print(f"π¦ Best XGBoost parameters found: {study.best_params}")
return final_model.fit(X, y)
[docs]
def _create_optimized_lightgbm(X: pd.DataFrame, y: pd.Series, problem_type: str = "regression", n_trials: int = 30) -> Any:
"""
Create and optimize a LightGBM model using Optuna.
Parameters
----------
X : pd.DataFrame
Features DataFrame
y : pd.Series
Target series
problem_type : str
Type of problem: "regression" or "classification"
n_trials : int
Number of optimization trials (default: 30)
Returns
-------
model : Any
Trained LightGBM model with optimized parameters
"""
def objective(trial):
params = {
'num_leaves': trial.suggest_int('num_leaves', 20, 100),
'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
'n_estimators': trial.suggest_int('n_estimators', 50, 300),
'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
'subsample': trial.suggest_float('subsample', 0.6, 1.0),
'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
'random_state': 42
}
if problem_type == "regression":
model = lgb.LGBMRegressor(**params)
else:
model = lgb.LGBMClassifier(**params)
return model.fit(X, y).score(X, y)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials)
if problem_type == "regression":
final_model = lgb.LGBMRegressor(**study.best_params)
else:
final_model = lgb.LGBMClassifier(**study.best_params)
print(f"π¦ Best LightGBM parameters found: {study.best_params}")
return final_model.fit(X, y)
[docs]
def _create_optimized_catboost(X: pd.DataFrame, y: pd.Series, problem_type: str = "regression", n_trials: int = 30) -> Union[xgb.XGBRegressor, xgb.XGBClassifier]:
"""
Create and optimize a CatBoost model using Optuna.
Parameters
----------
X : pd.DataFrame
Features DataFrame
y : pd.Series
Target series
problem_type : str
Type of problem: "regression" or "classification"
n_trials : int
Number of optimization trials (default: 30)
Returns
-------
model : Any
Trained CatBoost model with optimized parameters
"""
def objective(trial):
params = {
'depth': trial.suggest_int('depth', 4, 10),
'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
'iterations': trial.suggest_int('iterations', 50, 300),
'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
'random_strength': trial.suggest_float('random_strength', 0, 1),
'random_seed': 42,
'verbose': False
}
if problem_type == "regression":
model = cb.CatBoostRegressor(**params)
else:
model = cb.CatBoostClassifier(**params)
return model.fit(X, y).score(X, y)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials)
if problem_type == "regression":
final_model = cb.CatBoostRegressor(**study.best_params)
else:
final_model = cb.CatBoostClassifier(**study.best_params)
print(f"π¦ Best CatBoost parameters found: {study.best_params}")
return final_model.fit(X, y)