# Source code for sharkpy.core

# sharkpy/core.py

from typing import List, Union, Optional, Dict
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from .learning import learn
from .predicting import predict, predict_baseline
from .reporting import report
from .plotting import plot_model
from .saving import save_model, load_model
from .explaining import explain_model
from .battle import battle, MODEL_DETAILS

# Shapash support is optional. When the integration module (or one of its own
# imports) cannot be loaded, the name is bound to None so callers can detect
# the missing feature instead of failing at import time.
try:
    from .shapash_integration import explain_with_shapash
except ImportError:
    explain_with_shapash = None
class Shark:
    """
    A machine learning model manager that simplifies training, prediction, and analysis.

    Attributes
    ----------
    model : object or None
        The trained machine learning model (e.g., LogisticRegression,
        RandomForestClassifier).
    problem_type : str or None
        Type of ML problem ('classification' or 'regression').
    features : pd.DataFrame or None
        Input features used for training.
    target : pd.Series or np.ndarray or None
        Target variable (encoded for classification, original for regression).
    target_name : str or None
        Name of the target column in the input data.
    data : pd.DataFrame or None
        Original input DataFrame, including features and target.
    project_name : str or None
        Name of the current project for tracking and reporting.
    feature_names : list of str or None
        Names of feature columns.
    encoders : dict
        Dictionary storing feature encoders (e.g., for categorical features).
    label_encoder : LabelEncoder or None
        Encoder for categorical target variable (for classification).
    stats_model : object or None
        Statistical model for detailed analysis (optional).
    statistical_summary : str or None
        Summary of statistical analysis (optional).
    p_values : pd.Series or None
        P-values from statistical analysis (optional).
    conf_intervals : pd.DataFrame or None
        Confidence intervals from statistical analysis (optional).

    Examples
    --------
    >>> from sharkpy import Shark
    >>> import pandas as pd
    >>> shark = Shark()
    >>> data = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv', header=None)
    >>> data.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
    >>> shark.learn(data=data, target='species', model_choice='logistic_regression')
    >>> predictions = shark.predict(data)
    >>> shark.explain(export_path='explanation.pdf', format='pdf', depth='simple')
    >>> cv_results, train_metrics = shark.report(cv_folds=5)
    """

    def __init__(self):
        """Initialize Shark with empty attributes."""
        # Core attributes
        self.model = None
        self.features = None
        self.target = None
        self.problem_type = None
        self.target_name = None
        self.data = None
        self.label_encoder = None
        # Metadata
        self.project_name = None
        self.feature_names = None
        self.encoders = {}
        # Statistical analysis
        self.stats_model = None
        self.statistical_summary = None
        self.p_values = None
        self.conf_intervals = None

    @staticmethod
    def _is_numeric_target(values) -> bool:
        """Return True when ``values.dtype`` is a numeric (non-boolean) dtype."""
        try:
            return bool(np.issubdtype(values.dtype, np.number))
        except TypeError:
            # pandas extension dtypes (category, string, nullable integers, ...)
            # are not NumPy dtypes and make np.issubdtype raise; fall back to
            # the pandas check while still treating booleans as non-numeric.
            return bool(
                pd.api.types.is_numeric_dtype(values)
                and not pd.api.types.is_bool_dtype(values)
            )

    def learn(self,
              data: Union[str, pd.DataFrame],
              project_name: str = "your data",
              target: Optional[str] = None,
              problem_type: Optional[str] = None,
              model: Optional[object] = None,
              model_choice: Optional[str] = None,
              detailed_stats: bool = False,
              n_trials: int = 30,
              verbose: bool = False) -> 'Shark':
        """
        Train a machine learning model on the provided data.

        Parameters
        ----------
        data : str or pd.DataFrame
            Dataset for training. Can be a file path (CSV) or a pandas DataFrame.
        project_name : str, optional
            Name of the project for tracking and reporting (default: "your data").
        target : str, optional
            Name of the target column to predict. Although the parameter
            defaults to None, a column name is required.
        problem_type : str, optional
            Type of problem: 'regression', 'classification', or None for
            auto-detection (default: None).
        model : object, optional
            Custom model instance to use (default: None).
        model_choice : str, optional
            Built-in model to use (e.g., 'logistic_regression',
            'random_forest', 'xgboost') (default: None).
        detailed_stats : bool, optional
            Whether to compute detailed statistical analysis (e.g., p-values,
            confidence intervals) (default: False).
        n_trials : int, optional
            Number of optimization trials for boosting models (e.g., XGBoost)
            (default: 30).
        verbose : bool, optional
            Whether to print detailed output during training (default: False).

        Returns
        -------
        Shark
            Whatever ``sharkpy.learning.learn`` returns; documented as the
            current Shark instance with trained model and updated attributes.

        Raises
        ------
        ValueError
            If ``target`` is None.
        KeyError
            If ``target`` is not a column of the data (raised by pandas).

        Notes
        -----
        - Stores the original DataFrame in `self.data` and the target name in
          `self.target_name`.
        - For classification (explicit, or auto-detected from a non-numeric
          target), fits a `LabelEncoder` on the target and stores it in
          `self.label_encoder`; the encoder is reset to None on every call so
          a previous classification run cannot leak into a regression run.
        - Training itself (cross-validation, fitting, reporting) is delegated
          to ``sharkpy.learning.learn``.
        - Warning: Avoid loading untrusted CSV files, as they may contain
          malicious data.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'a']})
        >>> shark.learn(data, target='y', model_choice='logistic_regression')
        🦈 Looks like a classification problem (non-numeric target: y)
        🦈 Encoding categorical target 'y' to numeric labels
        ...
        >>> shark.target_name
        'y'
        >>> shark.label_encoder.classes_
        array(['a', 'b'], dtype=object)
        """
        # Fail early with a clear message: the default target=None would
        # otherwise surface as an opaque pandas KeyError from drop().
        if target is None:
            raise ValueError("🦈 Please provide the name of the target column via `target`!")

        # pd.read_csv always yields a DataFrame here (or raises), so no
        # further type check on the loaded data is needed.
        frame = data if isinstance(data, pd.DataFrame) else pd.read_csv(data)
        feature_frame = frame.drop(columns=[target])  # KeyError if target is missing

        # Store metadata
        self.project_name = project_name
        self.target_name = target
        self.data = frame

        # Process feature names and target
        self.feature_names = [col for col in frame.columns if col != target]
        self.features = feature_frame
        self.target = frame[target]

        # Drop any encoder left over from an earlier learn() call.
        self.label_encoder = None

        # Encode categorical target for classification
        wants_classification = problem_type == 'classification' or (
            problem_type is None and not self._is_numeric_target(self.target)
        )
        if wants_classification:
            print(f"🦈 Looks like a classification problem (non-numeric target: {target})")
            self.label_encoder = LabelEncoder()
            self.target = self.label_encoder.fit_transform(self.target)
            print(f"🦈 Encoding categorical target '{target}' to numeric labels")

        return learn(self, self.data, project_name, target, problem_type, model,
                     model_choice, detailed_stats, n_trials, verbose)

    def predict(self,
                X: Optional[Union[Dict, pd.DataFrame, List[Dict], np.ndarray]] = None
                ) -> Union[float, str, np.ndarray]:
        """
        Make predictions using the trained model.

        Thin wrapper around ``sharkpy.predicting.predict``.

        Parameters
        ----------
        X : dict, pd.DataFrame, list of dict, np.ndarray, or None, optional
            Input samples to predict. If None, predicts on training data.
            Options:

            - dict: Single prediction (e.g., {'feature1': value1, 'feature2': value2}).
            - list of dict: Multiple scenarios
              (e.g., [{'feature1': value1}, {'feature1': value2}]).
            - pd.DataFrame: Multiple samples with feature columns.
            - np.ndarray: Raw feature values (must match training feature count).

        Returns
        -------
        float, str, or np.ndarray
            Predicted values. For classification, returns original category
            names if `label_encoder` is available.

        Raises
        ------
        ValueError
            If no model is trained or input data is invalid.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x1': [1, 2], 'x2': [3, 4], 'y': ['cat', 'dog']})
        >>> shark.learn(data, target='y')
        >>> shark.predict({'x1': 1, 'x2': 3})
        'cat'
        >>> shark.predict(data[['x1', 'x2']])
        array(['cat', 'dog'], dtype=object)
        """
        return predict(self, X)

    def predict_baseline(self) -> Union[float, str]:
        """
        Return the baseline prediction from ``sharkpy.predicting.predict_baseline``.

        Returns
        -------
        float or str
            Baseline prediction.
            NOTE(review): earlier documentation described this both as a
            prediction made from the minimum values of the training features
            and as the target mean (regression) / most frequent class
            (classification); confirm against the helper's implementation.

        Raises
        ------
        ValueError
            If no model is trained.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': [10, 20, 30]})
        >>> shark.learn(data, target='y')
        >>> shark.predict_baseline()
        20.0
        """
        return predict_baseline(self)

    def plot(self,
             kind: str = "prediction",
             show: bool = True,
             save_path: Optional[str] = None,
             colors: Optional[Dict[str, str]] = None):
        """
        Visualize model behavior based on the specified plot type.

        Parameters
        ----------
        kind : str, optional
            Type of plot: 'prediction', 'residuals', 'confusion_matrix',
            'roc', 'pr_curve', 'proba_hist', or 'feature_importance'
            (default: 'prediction').
        show : bool, optional
            Whether to display the plot (default: True).
        save_path : str, optional
            Path to save the plot (default: None).
        colors : dict, optional
            Custom color specifications for the plot. If None, uses default
            SharkPy colors. Available keys: 'primary', 'secondary', 'accent',
            'background', 'grid', 'text', 'bars'.

        Returns
        -------
        object
            Whatever ``sharkpy.plotting.plot_model`` returns (documented as
            None).

        Raises
        ------
        ValueError
            If no model is trained, no feature/target data is stored, or the
            plot type is invalid.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 1, 0]})
        >>> shark.learn(data, target='y')
        >>> shark.plot(kind='confusion_matrix')
        >>> # Custom colors example
        >>> custom_colors = {
        ...     'primary': '#FF6B6B',     # Coral red
        ...     'secondary': '#4ECDC4',   # Turquoise
        ...     'background': '#F7FFF7',  # Light green
        ... }
        >>> shark.plot(kind='feature_importance', colors=custom_colors)
        """
        # getattr() keeps this tolerant of instances that never ran __init__.
        if getattr(self, 'model', None) is None:
            raise ValueError("No model trained yet. Call learn() first.")
        if getattr(self, 'features', None) is None:
            raise ValueError("No feature data available.")
        if getattr(self, 'target', None) is None:
            raise ValueError("No target data available.")

        return plot_model(
            model=self.model,
            X=self.features,
            y=self.target,
            kind=kind,
            show=show,
            save_path=save_path,
            feature_names=getattr(self, 'feature_names', None),
            colors=colors,
        )

    def report(self,
               cv_folds: int = 5,
               export_path: Optional[str] = None,
               format: str = 'txt') -> tuple:
        """
        Generate a comprehensive performance report with cross-validation metrics.

        Thin wrapper around ``sharkpy.reporting.report``.

        Parameters
        ----------
        cv_folds : int, optional
            Number of cross-validation folds (default: 5).
        export_path : str, optional
            Path to export the report (txt, docx, or pdf) (default: None).
        format : str, optional
            Export format: 'txt', 'docx', or 'pdf' (default: 'txt').

        Returns
        -------
        tuple
            (cv_results, train_metrics), where cv_results is a dict of
            cross-validation metrics and train_metrics is a dict of training
            metrics.

        Raises
        ------
        ValueError
            If no model is trained or the format is invalid.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 1, 0]})
        >>> shark.learn(data, target='y')
        >>> cv_results, train_metrics = shark.report(cv_folds=5)
        >>> print(cv_results['test_accuracy'].mean())
        """
        return report(self, cv_folds, export_path, format)

    def explain(self,
                cv_results=None,
                train_metrics=None,
                export_path: Optional[str] = None,
                format: str = 'txt',
                depth: str = 'deep',
                verbose: int = 1) -> Optional[pd.DataFrame]:
        """
        Explain the model's behavior and performance with customizable depth and export options.

        Parameters
        ----------
        cv_results : dict, optional
            Cross-validation results from report(), containing metrics like
            test_r2 or test_accuracy.
        train_metrics : dict, optional
            Training metrics from report(), containing metrics like r2 or
            accuracy.
        export_path : str, optional
            Path to export the explanation (txt, docx, or pdf) (default: None).
        format : str, optional
            Export format: 'txt', 'docx', or 'pdf' (default: 'txt').
        depth : str, optional
            Explanation depth: 'simple' (beginner overview), 'mechanics'
            (technical details), 'interpretation' (performance analysis),
            'actionable' (recommendations), 'deep' (all levels, default), or
            'shapash' (interactive SHAP dashboard).
        verbose : int, optional
            Accepted for API compatibility; currently not used by this method
            and not forwarded to ``explain_model`` (default: 1).

        Returns
        -------
        pd.DataFrame or None
            Feature importance DataFrame if available, else None. Also None
            when no model has been trained yet.

        Notes
        -----
        - Requires a trained model (call `learn` first); otherwise a message
          is printed and None is returned.
        - For classification, uses `label_encoder` to display original
          category names (e.g., 'Iris-setosa' instead of 0).
        - If `export_path` is provided, saves the explanation in the
          specified format.
        - 'shapash' depth requires the `shapash` package to be installed.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x1': [1, 2], 'x2': [3, 4], 'y': ['cat', 'dog']})
        >>> shark.learn(data, target='y')
        >>> shark.explain(depth='simple', export_path='explanation.txt')
        🦈 Sharky is diving into the LogisticRegression model explanation...
        ...
        >>> # explanation.txt contains: "This model predicts one of 2 categories (cat, dog)..."
        """
        if self.model is None:
            print("🦈 Oops! Sharky can't explain a model that hasn't been trained yet! Call .learn() first.")
            return None

        print(f"🦈 Sharky is diving into the {type(self.model).__name__} model explanation...")

        # Call explain_model with all parameters, including target_name, data,
        # and label_encoder
        feature_df = explain_model(
            model=self.model,
            features=self.features,
            target=self.target,
            target_name=self.target_name,
            data=self.data,
            label_encoder=self.label_encoder,
            cv_results=cv_results,
            train_metrics=train_metrics,
            export_path=export_path,
            format=format,
            depth=depth,
        )

        if feature_df is not None:
            print("\n🦈 Sharky found some key features driving the model! Check the output above.")
        else:
            print("\n🦈 Sharky couldn't extract feature importance for this model type.")

        return feature_df

    def save_model(self, name: str = "shark_model", directory: str = "models") -> str:
        """
        Save the trained model to a .joblib file.

        Thin wrapper around ``sharkpy.saving.save_model``.

        Parameters
        ----------
        name : str, optional
            Filename without extension (default: "shark_model").
        directory : str, optional
            Folder where the model will be saved (default: "models").

        Returns
        -------
        str
            Path to the saved model file.

        Raises
        ------
        ValueError
            If no model is trained.
        OSError
            If directory creation or file writing fails.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': [10, 20, 30]})
        >>> shark.learn(data, target='y')
        >>> shark.save_model(name='my_model')
        'models/my_model.joblib'
        """
        return save_model(self, self.model, name, directory)

    def load_model(self, model_path: str) -> object:
        """
        Load a saved SharkPy model from a .joblib file.

        Thin wrapper around ``sharkpy.saving.load_model``.

        Parameters
        ----------
        model_path : str
            Path to the saved .joblib model file.

        Returns
        -------
        object
            The loaded model object.

        Raises
        ------
        FileNotFoundError
            If the model file does not exist.
        ValueError
            If the file is not a valid model.

        Examples
        --------
        >>> shark = Shark()
        >>> shark.load_model('models/my_model.joblib')
        <sklearn.linear_model.LinearRegression object at ...>
        """
        return load_model(self, model_path)

    def battle(self,
               data: pd.DataFrame,
               target: str,
               models: Optional[List[str]] = None,
               metric: str = 'r2',
               n_trials: int = 30,
               early_stopping: bool = False,
               min_score: float = 0.5,
               verbose: int = 0) -> Dict:
        """
        Compare multiple models and select the best performer.

        Parameters
        ----------
        data : pd.DataFrame
            Input data for training.
        target : str
            Name of the target column.
        models : list of str, optional
            List of model names to compare (e.g., ['linear_regression',
            'random_forest']). When None, defaults to
            ['linear_regression', 'random_forest', 'xgboost'].
        metric : str, optional
            Metric to compare models (e.g., 'r2', 'accuracy') (default: 'r2').
        n_trials : int, optional
            Number of optimization trials for boosting models (default: 30).
        early_stopping : bool, optional
            If True, stops training if any model exceeds `min_score`. Not
            recommended as it may miss better models later (default: False).
        min_score : float, optional
            Minimum score to trigger early stopping (default: 0.5).
        verbose : int, optional
            Verbosity level for model training (default: 0).

        Returns
        -------
        dict
            Dictionary containing champion model name, model object, score,
            all results, details, and comparison plot.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': [10, 20, 30]})
        >>> result = shark.battle(data, target='y', models=['linear_regression', 'random_forest'])
        >>> print(result['champion'])
        'linear_regression'
        """
        # A fresh list per call: a list literal in the signature would be
        # shared (and mutable) across every invocation.
        if models is None:
            models = ['linear_regression', 'random_forest', 'xgboost']
        return battle(self, data, target, models, metric, n_trials,
                      early_stopping, min_score, verbose)

    def explain_with_shapash(self, title_story: Optional[str] = None, display: bool = True):
        """
        Create an interactive Shapash dashboard for model interpretation.

        Parameters
        ----------
        title_story : str, optional
            Title for the Shapash dashboard (default: None).
        display : bool, optional
            Whether to display the dashboard (default: True).

        Returns
        -------
        object
            Whatever ``sharkpy.shapash_integration.explain_with_shapash``
            returns (documented as None).

        Raises
        ------
        ImportError
            If the Shapash integration could not be imported (typically
            because the `shapash` package is not installed).
        ValueError
            If no model is trained.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 1, 0]})
        >>> shark.learn(data, target='y')
        >>> shark.explain_with_shapash(title_story='My Model Analysis')
        """
        # Inside a method body this name resolves to the module-level import,
        # which is None when the optional integration failed to import.
        if explain_with_shapash is None:
            raise ImportError(
                "🦈 Shapash integration is not available. "
                "Install the `shapash` package to use explain_with_shapash()."
            )
        return explain_with_shapash(self, title_story, display)

    def available_models(self) -> Dict:
        """
        List all available models with their details and print a comparison table.

        Returns
        -------
        dict
            Dictionary of available models and their details. This is the
            shared ``MODEL_DETAILS`` mapping itself, not a copy.

        Examples
        --------
        >>> shark = Shark()
        >>> models = shark.available_models()
        🦈 Available Models in SharkPy 🦈
        ...
        >>> print(models.keys())
        dict_keys(['linear_regression', 'random_forest', 'xgboost', ...])
        """
        print("\n🦈 Available Models in SharkPy 🦈")
        overview = pd.DataFrame.from_dict(MODEL_DETAILS, orient='index')
        print(overview.to_string())
        return MODEL_DETAILS