# Source code for sharkpy.core

# sharkpy/core.py

from typing import List, Union, Optional, Dict
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from .learning import learn
from .predicting import predict, predict_baseline
from .reporting import report
from .plotting import plot_model
from .saving import save_model, load_model
from .explaining import explain_model
from .battle import battle, MODEL_DETAILS

# Optional integration: if the Shapash helper module (or anything it imports)
# cannot be imported, fall back to None so the rest of the package still loads.
# Callers must check for None before invoking the helper.
try:
    from .shapash_integration import explain_with_shapash
except ImportError:
    explain_with_shapash = None


class Shark:
    """
    A machine learning model manager that simplifies training, prediction, and analysis.

    Every public method is a thin front-end over a helper imported from a
    sibling module (``learning``, ``predicting``, ``reporting``, ``plotting``,
    ``saving``, ``explaining``, ``battle``); the instance itself only carries
    the shared state listed below.

    Attributes
    ----------
    model : object or None
        The trained machine learning model (e.g., LogisticRegression, RandomForestClassifier).
    problem_type : str or None
        Type of ML problem ('classification' or 'regression').
    features : pd.DataFrame or None
        Input features used for training.
    target : pd.Series or np.ndarray or None
        Target variable (encoded for classification, original for regression).
    target_name : str or None
        Name of the target column in the input data.
    data : pd.DataFrame or None
        Original input DataFrame, including features and target.
    project_name : str or None
        Name of the current project for tracking and reporting.
    feature_names : list of str or None
        Names of feature columns.
    encoders : dict
        Dictionary storing feature encoders (e.g., for categorical features).
    label_encoder : LabelEncoder or None
        Encoder for categorical target variable (for classification).
    stats_model : object or None
        Statistical model for detailed analysis (optional).
    statistical_summary : str or None
        Summary of statistical analysis (optional).
    p_values : pd.Series or None
        P-values from statistical analysis (optional).
    conf_intervals : pd.DataFrame or None
        Confidence intervals from statistical analysis (optional).

    Examples
    --------
    >>> from sharkpy import Shark
    >>> import pandas as pd
    >>> shark = Shark()
    >>> data = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv', header=None)
    >>> data.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
    >>> shark.learn(data=data, target='species', model_choice='logistic_regression')
    >>> predictions = shark.predict(data)
    >>> shark.explain(export_path='explanation.pdf', format='pdf', depth='simple')
    >>> cv_results, train_metrics = shark.report(cv_folds=5)
    """

    # Default contenders for battle(). Kept as an immutable tuple and copied
    # into a fresh list per call so no caller can mutate a shared default.
    _DEFAULT_BATTLE_MODELS = ('linear_regression', 'random_forest', 'xgboost')

    def __init__(self):
        """Initialize Shark with empty attributes."""
        # Core attributes
        self.model = None
        self.features = None
        self.target = None
        self.problem_type = None
        self.target_name = None
        self.data = None
        self.label_encoder = None

        # Metadata
        self.project_name = None
        self.feature_names = None
        self.encoders = {}

        # Statistical analysis
        self.stats_model = None
        self.statistical_summary = None
        self.p_values = None
        self.conf_intervals = None

    @staticmethod
    def _is_numeric_target(target) -> bool:
        """Return True when *target*'s dtype is numeric (booleans count as non-numeric)."""
        try:
            return bool(np.issubdtype(target.dtype, np.number))
        except TypeError:
            # pandas extension dtypes (category, string, nullable Int64/Float64,
            # ...) are not numpy dtypes, so np.issubdtype cannot interpret them.
            # Fall back to pandas' own check, excluding booleans to mirror the
            # numpy rule above (np.bool_ is not a subtype of np.number).
            return bool(
                pd.api.types.is_numeric_dtype(target)
                and not pd.api.types.is_bool_dtype(target)
            )

    def learn(self, data: Union[str, pd.DataFrame], project_name: str = "your data", target: Optional[str] = None,
              problem_type: Optional[str] = None, model: Optional[object] = None, model_choice: Optional[str] = None,
              detailed_stats: bool = False, n_trials: int = 30, verbose: bool = False) -> 'Shark':
        """
        Train a machine learning model on the provided data.

        Parameters
        ----------
        data : str or pd.DataFrame
            Dataset for training. A DataFrame is used as-is; anything else is
            handed to ``pd.read_csv`` (typically a CSV file path).
        project_name : str, optional
            Name of the project for tracking and reporting (default: "your data").
        target : str
            Name of the target column to predict. The signature keeps the
            historical ``None`` default, but a column name is required.
        problem_type : str, optional
            Type of problem: 'regression', 'classification', or None for auto-detection (default: None).
        model : object, optional
            Custom model instance to use (default: None).
        model_choice : str, optional
            Built-in model to use (e.g., 'logistic_regression', 'random_forest', 'xgboost') (default: None).
        detailed_stats : bool, optional
            Whether to compute detailed statistical analysis (e.g., p-values, confidence intervals) (default: False).
        n_trials : int, optional
            Number of optimization trials for boosting models (e.g., XGBoost) (default: 30).
        verbose : bool, optional
            Whether to print detailed output during training (default: False).

        Returns
        -------
        Shark
            Whatever ``sharkpy.learning.learn`` returns; annotated as the
            current Shark instance with trained model and updated attributes.

        Raises
        ------
        ValueError
            If ``target`` is None.
        KeyError
            If ``target`` is not a column of the data (raised by pandas).

        Notes
        -----
        - Stores the DataFrame in `self.data`, the target name in `self.target_name`,
          and splits `self.features` / `self.target` / `self.feature_names`.
        - The target is label-encoded when `problem_type == 'classification'`, or when
          `problem_type` is None and the target dtype is not numeric. The fitted
          `LabelEncoder` is stored in `self.label_encoder` to preserve category names.
        - `self.label_encoder` is reset on every call, so re-using an instance for a
          regression run does not keep the encoder of an earlier classification run.
        - The actual model fitting is delegated to `sharkpy.learning.learn`; per the
          original documentation it encodes categorical features, performs K-Fold
          cross-validation and fits the selected model on the entire dataset.
        - Warning: Avoid loading untrusted CSV files, as they may contain malicious data.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'a']})
        >>> shark.learn(data, target='y', model_choice='logistic_regression')
        🦈 Looks like a classification problem (non-numeric target: y)
        🦈 Encoding categorical target 'y' to numeric labels
        ...
        >>> shark.target_name
        'y'
        >>> shark.label_encoder.classes_
        array(['a', 'b'], dtype=object)
        """
        # Fail early with a readable message: the None default could never
        # work, it previously surfaced as an opaque KeyError from pandas.
        if target is None:
            raise ValueError("🦈 Please pass target='<column name>' so Sharky knows which column to predict!")

        # pd.read_csv always yields a DataFrame here, so no further type check is needed.
        frame = data if isinstance(data, pd.DataFrame) else pd.read_csv(data)

        # Store metadata
        self.project_name = project_name
        self.target_name = target
        self.data = frame

        # Split features and target
        self.feature_names = [col for col in frame.columns if col != target]
        self.features = frame.drop(columns=[target])
        self.target = frame[target]

        # Drop any encoder left over from a previous learn() call on this instance.
        self.label_encoder = None

        # Encode categorical target for classification
        numeric_target = self._is_numeric_target(self.target)
        if problem_type == 'classification' or (problem_type is None and not numeric_target):
            if not numeric_target:
                # Only claim "non-numeric" when that is actually the case.
                print(f"🦈 Looks like a classification problem (non-numeric target: {target})")
            self.label_encoder = LabelEncoder()
            self.target = self.label_encoder.fit_transform(self.target)
            print(f"🦈 Encoding categorical target '{target}' to numeric labels")

        return learn(self, self.data, project_name, target, problem_type,
                     model, model_choice, detailed_stats, n_trials, verbose)

    def predict(self, X: Optional[Union[Dict, pd.DataFrame, List[Dict], np.ndarray]] = None) -> Union[float, str, np.ndarray]:
        """
        Make predictions using the trained model.

        Delegates to ``sharkpy.predicting.predict``; the input handling described
        below is implemented there.

        Parameters
        ----------
        X : dict, pd.DataFrame, list of dict, np.ndarray, or None, optional
            Input samples to predict. If None, predicts on training data. Options:
            - dict: Single prediction (e.g., {'feature1': value1, 'feature2': value2}).
            - list of dict: Multiple scenarios (e.g., [{'feature1': value1}, {'feature1': value2}]).
            - pd.DataFrame: Multiple samples with feature columns.
            - np.ndarray: Raw feature values (must match training feature count).

        Returns
        -------
        float, str, or np.ndarray
            Predicted values. For classification, returns original category names if `label_encoder` is available.

        Raises
        ------
        ValueError
            If no model is trained or input data is invalid.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x1': [1, 2], 'x2': [3, 4], 'y': ['cat', 'dog']})
        >>> shark.learn(data, target='y')
        >>> shark.predict({'x1': 1, 'x2': 3})
        'cat'
        >>> shark.predict(data[['x1', 'x2']])
        array(['cat', 'dog'], dtype=object)
        """
        return predict(self, X)

    def predict_baseline(self) -> Union[float, str]:
        """
        Make a baseline prediction for the trained model.

        Delegates to ``sharkpy.predicting.predict_baseline``.
        NOTE(review): earlier documentation described the baseline both as
        "using the minimum values of the training features" and as the target
        mean / most frequent class -- confirm against the helper.

        Returns
        -------
        float or str
            Baseline prediction for regression (mean) or classification (most frequent class).

        Raises
        ------
        ValueError
            If no model is trained.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': [10, 20, 30]})
        >>> shark.learn(data, target='y')
        >>> shark.predict_baseline()
        20.0
        """
        return predict_baseline(self)

    def plot(self, kind: str = "prediction", show: bool = True,
             save_path: Optional[str] = None, colors: Optional[Dict[str, str]] = None):
        """
        Visualize model behavior based on the specified plot type.

        Parameters
        ----------
        kind : str, optional
            Type of plot: 'prediction', 'residuals', 'confusion_matrix', 'roc',
            'pr_curve', 'proba_hist', or 'feature_importance' (default: 'prediction').
        show : bool, optional
            Whether to display the plot (default: True).
        save_path : str, optional
            Path to save the plot (default: None).
        colors : dict, optional
            Custom color specifications for the plot. If None, uses default SharkPy colors.
            Available keys: 'primary', 'secondary', 'accent', 'background', 'grid', 'text', 'bars'

        Returns
        -------
        object
            Whatever ``sharkpy.plotting.plot_model`` returns (documented as None).

        Raises
        ------
        ValueError
            If no model is trained, or feature/target data is missing.
            An invalid plot type is reported by the plotting helper.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 1, 0]})
        >>> shark.learn(data, target='y')
        >>> shark.plot(kind='confusion_matrix')

        >>> # Custom colors example
        >>> custom_colors = {
        ...     'primary': '#FF6B6B',    # Coral red
        ...     'secondary': '#4ECDC4',  # Turquoise
        ...     'background': '#F7FFF7'  # Light green
        ... }
        >>> shark.plot(kind='feature_importance', colors=custom_colors)
        """
        # getattr with a default also covers instances that never ran __init__.
        if getattr(self, 'model', None) is None:
            raise ValueError("No model trained yet. Call learn() first.")
        if getattr(self, 'features', None) is None:
            raise ValueError("No feature data available.")
        if getattr(self, 'target', None) is None:
            raise ValueError("No target data available.")

        return plot_model(
            model=self.model,
            X=self.features,
            y=self.target,
            kind=kind,
            show=show,
            save_path=save_path,
            feature_names=getattr(self, 'feature_names', None),
            colors=colors,
        )

    def report(self, cv_folds: int = 5, export_path: Optional[str] = None, format: str = 'txt') -> tuple:
        """
        Generate a comprehensive performance report with cross-validation metrics.

        Delegates to ``sharkpy.reporting.report``.

        Parameters
        ----------
        cv_folds : int, optional
            Number of cross-validation folds (default: 5).
        export_path : str, optional
            Path to export the report (txt, docx, or pdf) (default: None).
        format : str, optional
            Export format: 'txt', 'docx', or 'pdf' (default: 'txt').

        Returns
        -------
        tuple
            (cv_results, train_metrics), where cv_results is a dict of cross-validation metrics and train_metrics is a dict of training metrics.

        Raises
        ------
        ValueError
            If no model is trained or the format is invalid.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 1, 0]})
        >>> shark.learn(data, target='y')
        >>> cv_results, train_metrics = shark.report(cv_folds=5)
        >>> print(cv_results['test_accuracy'].mean())
        """
        return report(self, cv_folds, export_path, format)

    def explain(self, cv_results=None, train_metrics=None, export_path: Optional[str] = None, format: str = 'txt', depth: str = 'deep', verbose: int = 1) -> Optional[pd.DataFrame]:
        """
        Explain the model's behavior and performance with customizable depth and export options.

        Parameters
        ----------
        cv_results : dict, optional
            Cross-validation results from report(), containing metrics like test_r2 or test_accuracy.
        train_metrics : dict, optional
            Training metrics from report(), containing metrics like r2 or accuracy.
        export_path : str, optional
            Path to export the explanation (txt, docx, or pdf) (default: None).
        format : str, optional
            Export format: 'txt', 'docx', or 'pdf' (default: 'txt').
        depth : str, optional
            Explanation depth: 'simple' (beginner overview), 'mechanics' (technical details),
            'interpretation' (performance analysis), 'actionable' (recommendations),
            'deep' (all levels, default), or 'shapash' (interactive SHAP dashboard).
        verbose : int, optional
            Accepted for API compatibility; currently not used by this method
            and not forwarded to the explanation helper (default: 1).

        Returns
        -------
        pd.DataFrame or None
            Feature importance DataFrame if available, else None. Also None
            (after printing a hint) when no model has been trained yet.

        Notes
        -----
        - Requires a trained model (call `learn` first).
        - For classification, uses `label_encoder` to display original category names (e.g., 'Iris-setosa' instead of 0).
        - If `export_path` is provided, saves the explanation in the specified format.
        - 'shapash' depth requires the `shapash` package to be installed.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x1': [1, 2], 'x2': [3, 4], 'y': ['cat', 'dog']})
        >>> shark.learn(data, target='y')
        >>> shark.explain(depth='simple', export_path='explanation.txt')
        🦈 Sharky is diving into the LogisticRegression model explanation...
        ...
        >>> # explanation.txt contains: "This model predicts one of 2 categories (cat, dog)..."
        """
        if self.model is None:
            print("🦈 Oops! Sharky can't explain a model that hasn't been trained yet! Call .learn() first.")
            return None

        print(f"🦈 Sharky is diving into the {type(self.model).__name__} model explanation...")

        # Hand the helper everything it needs to name the target and decode class labels.
        feature_df = explain_model(
            model=self.model,
            features=self.features,
            target=self.target,
            target_name=self.target_name,
            data=self.data,
            label_encoder=self.label_encoder,
            cv_results=cv_results,
            train_metrics=train_metrics,
            export_path=export_path,
            format=format,
            depth=depth,
        )

        if feature_df is not None:
            print("\n🦈 Sharky found some key features driving the model! Check the output above.")
        else:
            print("\n🦈 Sharky couldn't extract feature importance for this model type.")

        return feature_df

    def save_model(self, name: str = "shark_model", directory: str = "models") -> str:
        """
        Save the trained model to a .joblib file.

        Delegates to ``sharkpy.saving.save_model``, passing both this instance
        and ``self.model``.

        Parameters
        ----------
        name : str, optional
            Filename without extension (default: "shark_model").
        directory : str, optional
            Folder where the model will be saved (default: "models").

        Returns
        -------
        str
            Path to the saved model file.

        Raises
        ------
        ValueError
            If no model is trained.
        OSError
            If directory creation or file writing fails.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': [10, 20, 30]})
        >>> shark.learn(data, target='y')
        >>> shark.save_model(name='my_model')
        'models/my_model.joblib'
        """
        return save_model(self, self.model, name, directory)

    def load_model(self, model_path: str) -> object:
        """
        Load a saved SharkPy model from a .joblib file.

        Delegates to ``sharkpy.saving.load_model``.
        Warning: only load model files from sources you trust.

        Parameters
        ----------
        model_path : str
            Path to the saved .joblib model file.

        Returns
        -------
        object
            The loaded model object.

        Raises
        ------
        FileNotFoundError
            If the model file does not exist.
        ValueError
            If the file is not a valid model.

        Examples
        --------
        >>> shark = Shark()
        >>> shark.load_model('models/my_model.joblib')
        <sklearn.linear_model.LinearRegression object at ...>
        """
        return load_model(self, model_path)

    def battle(self, data: pd.DataFrame, target: str, models: Optional[List[str]] = None,
               metric: str = 'r2', n_trials: int = 30, early_stopping: bool = False, min_score: float = 0.5, verbose: int = 0) -> Dict:
        """
        Compare multiple models and select the best performer.

        Delegates to ``sharkpy.battle.battle``.

        Parameters
        ----------
        data : pd.DataFrame
            Input data for training.
        target : str
            Name of the target column.
        models : list of str, optional
            List of model names to compare (e.g., ['linear_regression', 'random_forest']).
            None (the default) means ['linear_regression', 'random_forest', 'xgboost'].
        metric : str, optional
            Metric to compare models (e.g., 'r2', 'accuracy') (default: 'r2').
        n_trials : int, optional
            Number of optimization trials for boosting models (default: 30).
        early_stopping : bool, optional
            If True, stops training if any model exceeds `min_score`. Not recommended as it may miss better models later (default: False).
        min_score : float, optional
            Minimum score to trigger early stopping (default: 0.5).
        verbose : int, optional
            Verbosity level for model training (default: 0)

        Returns
        -------
        dict
            Dictionary containing champion model name, model object, score, all results, details, and comparison plot.

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': [10, 20, 30]})
        >>> result = shark.battle(data, target='y', models=['linear_regression', 'random_forest'])
        >>> print(result['champion'])
        'linear_regression'
        """
        # A fresh list per call: a list literal as the default would be shared
        # (and mutable) across every invocation.
        if models is None:
            models = list(self._DEFAULT_BATTLE_MODELS)
        return battle(self, data, target, models, metric, n_trials, early_stopping, min_score, verbose)

    def explain_with_shapash(self, title_story: Optional[str] = None, display: bool = True):
        """
        Create an interactive Shapash dashboard for model interpretation.

        Parameters
        ----------
        title_story : str, optional
            Title for the Shapash dashboard (default: None).
        display : bool, optional
            Whether to display the dashboard (default: True).

        Returns
        -------
        object
            Whatever the Shapash integration helper returns (documented as None).

        Raises
        ------
        ImportError
            If the Shapash integration could not be imported (e.g. the
            `shapash` package is not installed).
        ValueError
            If no model is trained (reported by the integration helper).

        Examples
        --------
        >>> shark = Shark()
        >>> data = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 1, 0]})
        >>> shark.learn(data, target='y')
        >>> shark.explain_with_shapash(title_story='My Model Analysis')
        """
        # Inside a method this name resolves to the module-level helper, which
        # is set to None when the optional import at the top of the file failed.
        if explain_with_shapash is None:
            raise ImportError(
                "🦈 Shapash integration is unavailable. "
                "Install the `shapash` package to use explain_with_shapash()."
            )
        return explain_with_shapash(self, title_story, display)

    def available_models(self) -> Dict:
        """
        List all available models with their details and print a comparison table.

        Returns
        -------
        dict
            Dictionary of available models and their details. This is the
            shared module-level ``MODEL_DETAILS`` object, not a copy, so it
            should be treated as read-only.

        Examples
        --------
        >>> shark = Shark()
        >>> models = shark.available_models()
        🦈 Available Models in SharkPy 🦈
        ...
        >>> print(models.keys())
        dict_keys(['linear_regression', 'random_forest', 'xgboost', ...])
        """
        print("\n🦈 Available Models in SharkPy 🦈")
        # One row per model name, one column per detail key.
        overview = pd.DataFrame.from_dict(MODEL_DETAILS, orient='index')
        print(overview.to_string())
        return MODEL_DETAILS