Source code for sharkpy.predicting

#sharkpy/predicting.py

import pandas as pd
import numpy as np


[docs]
def predict(self, X=None):
    """
    Run the trained model on new samples, or on the stored training features.

    Parameters:
        X: optional
           Samples to predict. When None, ``self.features`` is used as-is.
           Any other value (for example a dict describing one scenario or a
           DataFrame of samples) is first passed through
           ``_validate_and_process_input``, which converts it to a DataFrame
           and encodes categorical columns.

    Returns:
        The single predicted value when exactly one prediction is produced,
        otherwise the full collection of predictions. If ``self`` has a
        ``target_encoder``, predictions are passed through its
        ``inverse_transform`` before being returned.

    Raises:
        AttributeError: if ``self`` has no ``model`` attribute.
        Exception: anything raised while predicting or formatting the result
            is reported with a printed message and then re-raised unchanged.
    """
    print("🦈 Sharky is analyzing the data!")

    # Guard clause: without a fitted model there is nothing to predict with.
    if not hasattr(self, 'model'):
        raise AttributeError("🦈 Oops! Sharky needs training first. Call learn() before predict().")

    if X is not None:
        # Caller supplied data: validate it and encode categoricals first.
        samples = _validate_and_process_input(self, X)
    else:
        print("🦈 No data provided! Using training data for prediction...")
        samples = self.features

    try:
        results = self.model.predict(samples)

        # Translate encoded targets back to their original labels.
        if hasattr(self, 'target_encoder'):
            results = self.target_encoder.inverse_transform(results)

        count = len(results)
        if count != 1:
            print(f"🦈 Made {count} predictions!")
            return results

        # A lone prediction is unwrapped so callers get a scalar/label.
        single = results[0]
        print(f"🦈 Prediction: {single}")
        return single

    except Exception as err:
        # Report in Sharky's voice, then let the original error propagate.
        print(f"🦈 Uh-oh! Sharky encountered an error: {err!s}")
        raise



[docs]
def predict_baseline(self):
    """
    Predict for a single synthetic "baseline" sample.

    The sample has one entry per column of ``self.features``:
    - columns that have an entry in ``self.encoders`` get the first value
      stored in that column's encoder mapping;
    - every other column is set to 0.

    The resulting dict is handed to ``predict``.

    Returns:
        Whatever ``predict`` returns for that single baseline sample.
    """
    print("🦈 Making baseline prediction (all features at minimum values)...")

    def _baseline_value(column):
        # Encoded (categorical) columns start from their first known value.
        if column in self.encoders:
            return list(self.encoders[column].values())[0]
        # Everything else is treated as numeric and starts from zero.
        return 0

    scenario = {column: _baseline_value(column) for column in self.features.columns}
    return predict(self, scenario)



[docs]
def _validate_and_process_input(self, X):
    """
    Convert prediction input to a DataFrame laid out like the training features.

    Parameters:
        X: dict, list/tuple, DataFrame, or numpy array
           - dict: one sample, ``{feature: value}``
           - list/tuple of dicts: several samples
           - any other list/tuple: converted with ``np.asarray`` and handled
             like a numpy array
           - numpy array: raw values whose columns follow
             ``self.feature_names``; a 1-D array is read as one sample when
             there is more than one feature
           - DataFrame: samples with feature columns

    Returns:
        DataFrame: a copy restricted to ``self.feature_names`` (in that
        order), with every column that has an entry in ``self.encoders``
        converted to integer codes.

    Raises:
        ValueError: unsupported input type, feature names unavailable for
            array input, missing feature columns, or a categorical value
            that is neither a known label nor a known code.
    """
    # A list/tuple is either a batch of scenario dicts or raw array-like rows.
    if isinstance(X, (list, tuple)):
        if X and all(isinstance(row, dict) for row in X):
            X = pd.DataFrame(list(X))
        else:
            X = np.asarray(X)

    # Convert input to DataFrame
    if isinstance(X, dict):
        X = pd.DataFrame([X])
    elif isinstance(X, np.ndarray):
        if not hasattr(self, 'feature_names'):
            raise ValueError("🦈 Sharky doesn't know the feature names for numpy arrays!")
        array_columns = list(self.feature_names)
        # With several features a 1-D array can only be a single sample; with
        # one feature it stays a column of samples, as it always was.
        if X.ndim == 1 and len(array_columns) != 1:
            X = X.reshape(1, -1)
        X = pd.DataFrame(X, columns=array_columns)
    elif not isinstance(X, pd.DataFrame):
        raise ValueError("🦈 Input must be a dictionary, list, DataFrame, or numpy array!")

    # Validate features
    feature_names = list(self.feature_names)
    missing_features = set(feature_names) - set(X.columns)
    if missing_features:
        raise ValueError(f"🦈 Missing features: {missing_features}")

    # Keep exactly the trained features, in training order, so dict key order
    # or extra columns in the input do not change what the model receives.
    X_processed = X[feature_names].copy()

    # Process categorical features
    for col in X_processed.columns:
        if col not in self.encoders:
            continue

        # Encoders are stored as {code: label}; invert to look labels up.
        label_to_code = {label: code for code, label in self.encoders[col].items()}
        known_codes = set(label_to_code.values())

        column = X_processed[col]
        mapped = column.map(label_to_code)
        # Values that are already numeric codes are accepted unchanged.
        already_codes = column.isin(known_codes)

        # Flag only truly unseen values (neither a label nor a known code).
        unseen = mapped.isnull() & ~already_codes
        if unseen.any():
            unseen_values = column[unseen].unique()
            raise ValueError(f"🦈 Unseen categories in '{col}': {unseen_values}")

        # Prefer the mapped code, fall back to the original (already a code),
        # and make sure the result is an integer column.
        X_processed[col] = mapped.where(mapped.notnull(), column).astype(int)

    return X_processed