Source code for sharkpy.reporting

import os
import tempfile
import atexit
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import (
    make_scorer, r2_score, mean_absolute_error, mean_squared_error, 
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score
)
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx2pdf import convert
from typing import Optional, Tuple, List, Dict, Any

# Module-wide plotting defaults, applied once at import time:
# seaborn's "whitegrid" theme, then a default figure size and font size.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

def _create_temp_plot(shark: Any, kind: str, width: int = 8, height: int = 6) -> Optional[str]:
    """Render one plot of the given kind to a temporary PNG and return its path.

    Parameters
    ----------
    shark : Any
        Object exposing ``model``, ``features`` and ``target``; these are
        forwarded to ``plot_model``.
    kind : str
        Plot kind, passed through to ``plot_model`` unchanged.
    width, height : int, optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    str or None
        Path of the PNG file on success. ``None`` if anything went wrong, in
        which case a warning is printed and the temporary file is deleted.
        On success the caller owns the file and must delete it.
    """
    temp_path = None
    try:
        # mkstemp returns an open descriptor; close it right away because the
        # plot is written through the path, not the descriptor.
        fd, temp_path = tempfile.mkstemp(suffix='.png')
        os.close(fd)

        # Remember the active backend so it can be put back afterwards.
        original_backend = matplotlib.get_backend()
        try:
            # Switch to the Agg backend while the plot is generated.
            matplotlib.use('Agg')

            from .plotting import plot_model
            plot_model(shark.model, shark.features, shark.target,
                       kind=kind, show=False, save_path=temp_path)
        finally:
            matplotlib.use(original_backend)

        return temp_path
    except Exception as e:
        print(f"⚠️ Could not create {kind} plot: {str(e)}")
        # The caller only receives None on failure, so nobody else can clean
        # up the file created above; remove it here to avoid leaking it.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
        return None
def _get_feature_importance_section(shark: Any) -> Tuple[List[str], Optional[pd.DataFrame]]:
    """Build the feature-importance text lines and the matching table.

    The model's ``feature_importances_`` attribute is used when present,
    otherwise ``coef_``. A ``coef_`` with more than one dimension is reduced
    to one value per feature by averaging absolute values over axis 0.
    Features are listed in descending order (by importance, or by absolute
    coefficient).

    Returns
    -------
    tuple
        ``(lines, table)`` where ``lines`` starts with the section header and
        ``table`` is a DataFrame with one row per feature, or ``None`` when
        the model exposes neither attribute or there are no rows.
    """
    section = ["\n📊 Feature Importance:"]
    model = shark.model

    if hasattr(model, 'feature_importances_'):
        scores = model.feature_importances_
        ranking = np.argsort(scores)[::-1]
        records = [
            {'Feature': shark.feature_names[pos], 'Importance': scores[pos]}
            for pos in ranking
        ]
        section.extend(
            f" - {rec['Feature']}: {rec['Importance']:.3f}" for rec in records
        )
    elif hasattr(model, 'coef_'):
        weights = model.coef_
        if len(weights.shape) > 1:
            # e.g. multi-class logistic regression: one row of coefficients per class
            weights = np.abs(weights).mean(axis=0)
        ranking = np.argsort(np.abs(weights))[::-1]
        records = [
            {'Feature': shark.feature_names[pos], 'Coefficient': weights[pos]}
            for pos in ranking
        ]
        section.extend(
            f" - {rec['Feature']}: {rec['Coefficient']:.3f} (abs. coefficient)"
            for rec in records
        )
    else:
        section.append(" - Feature importance not available for this model.")
        records = None

    table = pd.DataFrame(records) if records else None
    return section, table
def _get_statistical_details_section(shark: Any) -> List[str]:
    """Return the statistical-details section: a header line followed by
    the string form of ``shark.statistical_summary``."""
    return ["\n📈 Statistical Details:", str(shark.statistical_summary)]
def _add_table_to_doc(doc: Document, df: pd.DataFrame, title: str):
    """Append a level-1 heading and a centred 'Table Grid' table to ``doc``.

    The first table row holds the DataFrame's column names in bold; every
    following row holds one DataFrame row. ``float`` values are written with
    three decimals, everything else via ``str()``. All cell text is 12 pt.
    """
    cell_font_size = Pt(12)

    doc.add_heading(title, level=1)

    column_names = list(df.columns)
    table = doc.add_table(rows=1, cols=len(column_names))
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    table.style = 'Table Grid'

    # Header row
    header_cells = table.rows[0].cells
    for col, name in enumerate(column_names):
        cell = header_cells[col]
        cell.text = name
        header_run = cell.paragraphs[0].runs[0]
        header_run.bold = True
        header_run.font.size = cell_font_size

    # Data rows
    for _, record in df.iterrows():
        cells = table.add_row().cells
        for col, value in enumerate(record):
            if isinstance(value, float):
                rendered = f"{value:.3f}"
            else:
                rendered = str(value)
            cell = cells[col]
            cell.text = rendered
            cell.paragraphs[0].runs[0].font.size = cell_font_size
def _export_docx_report(path: str, shark: Any, cv_metrics_df: pd.DataFrame, train_metrics_df: pd.DataFrame, problem_type: str):
    """Export report as Word document with enhanced formatting, tables, and plots.

    Parameters
    ----------
    path : str
        Destination of the ``.docx`` file; passed to ``Document.save``.
    shark : Any
        Object providing ``project_name``, ``model``, ``problem_type``,
        ``target``, ``feature_names`` and ``features``; optionally
        ``stats_model``, ``p_values`` and ``statistical_summary``.
    cv_metrics_df, train_metrics_df : pandas.DataFrame
        Metric tables rendered as 'Cross-Validation Metrics' and
        'Training Set Metrics'.
    problem_type : str
        ``"regression"`` selects prediction/residual plots; any other value
        is treated as classification.

    Notes
    -----
    Each plot is rendered to a temporary PNG by ``_create_temp_plot``. The
    file is removed even if inserting it into the document fails.
    """
    doc = Document()

    # Title
    heading = doc.add_heading('🦈 SharkPy Model Report', level=0)
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
    heading.runs[0].font.size = Pt(16)

    # Metadata
    doc.add_heading('Model Summary', level=1)
    metadata = [
        f"Project: {shark.project_name}",
        f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"Model Type: {type(shark.model).__name__}",
        f"Problem Type: {shark.problem_type.capitalize()}",
        f"Target Variable: {shark.target.name}",
        f"Features: {', '.join(shark.feature_names)}",
        f"Training Set Shape: {shark.features.shape}"
    ]
    for line in metadata:
        p = doc.add_paragraph(line)
        p.runs[0].font.size = Pt(12)

    # Metric tables
    _add_table_to_doc(doc, cv_metrics_df, 'Cross-Validation Metrics')
    _add_table_to_doc(doc, train_metrics_df, 'Training Set Metrics')

    # Feature importance table (only when the model exposes one)
    _, feature_df = _get_feature_importance_section(shark)
    if feature_df is not None:
        _add_table_to_doc(doc, feature_df, 'Feature Importance')

    # Statistical details (for regression)
    if hasattr(shark, 'stats_model') and shark.p_values is not None:
        doc.add_heading('Statistical Details', level=1)
        p = doc.add_paragraph(str(shark.statistical_summary))
        p.runs[0].font.size = Pt(10)

    # Visualizations
    doc.add_heading('Visualizations', level=1)
    plots = [('Feature Importance', 'feature_importance')]
    if problem_type == "regression":
        plots += [
            ('Prediction Plot', 'prediction'),
            ('Residuals Plot', 'residuals')
        ]
    else:  # classification
        plots += [('Confusion Matrix', 'confusion_matrix')]
        if len(np.unique(shark.target)) == 2:  # Binary classification
            plots += [
                ('ROC Curve', 'roc'),
                ('Precision-Recall Curve', 'pr_curve'),
                ('Probability Histogram', 'proba_hist')
            ]

    for plot_title, kind in plots:
        temp_path = _create_temp_plot(shark, kind)
        if not temp_path:
            continue
        try:
            if os.path.exists(temp_path):
                doc.add_heading(plot_title, level=2)
                doc.add_picture(temp_path, width=Inches(5.5))
                # add_picture puts the image in a new paragraph at the end of
                # the document; centre that paragraph so the image itself is
                # centred rather than only the empty paragraph after it.
                doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                spacer = doc.add_paragraph()
                spacer.alignment = WD_ALIGN_PARAGRAPH.CENTER
        finally:
            # Always drop the temporary PNG, including when insertion failed.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    # Save document
    doc.save(path)
def _export_txt_report(path: str, lines: List[str]):
    """Export report as a UTF-8 text file, one entry of ``lines`` per line.

    The encoding is fixed to UTF-8 because the report text contains
    non-ASCII characters (emoji, "±"); relying on the platform's default
    encoding can raise ``UnicodeEncodeError`` on systems whose locale
    encoding cannot represent them.
    """
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f"{line}\n" for line in lines)
def _convert_docx_to_pdf(docx_path: str, pdf_path: str) -> None:
    """Convert DOCX to PDF using available tools.

    ``docx2pdf.convert`` is tried first. If it raises, the function falls
    back to running ``libreoffice --headless --convert-to pdf``.

    Parameters
    ----------
    docx_path : str
        Path of the existing Word document.
    pdf_path : str
        Path the PDF should end up at.

    Raises
    ------
    Exception
        If both converters fail. The message carries the fallback's error,
        and the fallback error is attached as ``__cause__``.
    """
    try:
        # Try docx2pdf first (requires MS Word)
        convert(docx_path, pdf_path)
    except Exception:
        try:
            # Fallback to LibreOffice if available
            import subprocess

            # Use an absolute directory so a bare file name such as
            # "report.pdf" does not turn into an empty --outdir argument.
            out_dir = os.path.dirname(os.path.abspath(pdf_path))
            subprocess.run(
                ['libreoffice', '--headless', '--convert-to', 'pdf',
                 '--outdir', out_dir, docx_path],
                check=True, capture_output=True
            )

            # LibreOffice names its output after the input document, not
            # after pdf_path, so locate that file and move it if necessary.
            stem = os.path.splitext(os.path.basename(docx_path))[0]
            produced = os.path.join(out_dir, stem + '.pdf')
            if not os.path.exists(produced):
                raise FileNotFoundError(
                    f"LibreOffice did not produce the expected file: {produced}"
                )
            if os.path.abspath(produced) != os.path.abspath(pdf_path):
                os.replace(produced, pdf_path)
        except Exception as e2:
            raise Exception(
                f"Could not convert to PDF. Please install Microsoft Word or LibreOffice.\nError: {str(e2)}"
            ) from e2
def report(self, cv_folds: int = 5, export_path: Optional[str] = None, format: str = 'txt') -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
    """
    Generate comprehensive model performance report including cross-validation.

    Parameters
    ----------
    self : Any
        The Shark instance
    cv_folds : int, optional
        Number of folds for K-Fold cross-validation (default: 5)
    export_path : str, optional
        Path to export the report. If None, report is only printed.
        If an existing directory is provided, a timestamped file
        (``sharkpy_report_<YYYYmmdd_HHMMSS>.<format>``) is created inside it.
    format : str, optional
        Export format: 'txt', 'pdf', or 'docx' (default: 'txt')

    Returns
    -------
    tuple
        - cv_results : dict
            Dictionary containing cross-validation results
        - train_metrics : dict
            Dictionary containing training set metrics

    Raises
    ------
    ValueError
        If no model has been trained, or ``format`` is not supported.
    Exception
        Any error raised while exporting is printed and re-raised.

    Notes
    -----
    - For PDF export, ensure Microsoft Word or LibreOffice is installed for
      docx2pdf conversion. The PDF is produced from an intermediate DOCX
      that is deleted after a successful conversion.
    - Visualizations include feature importance, predictions/residuals (regression),
      or confusion matrix/ROC/PR curves (classification).
    """
    if not hasattr(self, 'model'):
        raise ValueError("🦈 No model has been trained yet. Call learn() first.")

    # Store original backend
    original_backend = matplotlib.get_backend()

    try:
        is_regression = self.problem_type == "regression"

        # Header
        report_lines = [
            "🦈 SharkPy Model Report 🦈",
            f"Project: {self.project_name}",
            f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"Model Type: {type(self.model).__name__}",
            f"Problem Type: {self.problem_type.capitalize()}",
            f"Target Variable: {self.target.name}",
            f"Features: {', '.join(self.feature_names)}",
            f"Training Set Shape: {self.features.shape}",
        ]

        # Define scoring metrics
        if is_regression:
            scoring = {
                'r2': 'r2',
                'mae': make_scorer(mean_absolute_error),
                'mse': make_scorer(mean_squared_error),
                'rmse': make_scorer(lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred))),
            }
            cv_metric_names = ['r2', 'mae', 'mse', 'rmse']
        else:
            scoring = {
                'accuracy': make_scorer(accuracy_score),
                'precision': make_scorer(precision_score, average='weighted', zero_division=0),
                'recall': make_scorer(recall_score, average='weighted', zero_division=0),
                'f1': make_scorer(f1_score, average='weighted', zero_division=0),
            }
            if hasattr(self.model, 'predict_proba'):
                # Predefined scorer equivalent to roc_auc_score on predicted
                # probabilities with multi_class='ovr'. Using the name avoids
                # make_scorer's `needs_proba` argument, which newer
                # scikit-learn releases no longer accept.
                scoring['roc_auc'] = 'roc_auc_ovr'
            cv_metric_names = ['accuracy', 'precision', 'recall', 'f1']

        # Cross-validation
        kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_results = cross_validate(
            self.model, self.features, self.target,
            cv=kf, scoring=scoring, n_jobs=-1
        )

        # Format CV results for text and table
        report_lines.append("\n📊 Cross-Validation Metrics:")
        cv_entries = [(name.upper(), f'test_{name}') for name in cv_metric_names]
        if not is_regression and 'test_roc_auc' in cv_results:
            cv_entries.append(('ROC AUC', 'test_roc_auc'))

        cv_metrics_list = []
        for label, key in cv_entries:
            mean = cv_results[key].mean()
            std = cv_results[key].std()
            report_lines.append(f" - {label}: {mean:.3f} ± {std:.3f}")
            cv_metrics_list.append({'Metric': label, 'Mean': mean, 'Std': std})
        cv_metrics_df = pd.DataFrame(cv_metrics_list)

        # Training metrics (reported in the original label space when a
        # target encoder is attached)
        y_pred = self.model.predict(self.features)
        if hasattr(self, 'target_encoder'):
            y_pred = self.target_encoder.inverse_transform(y_pred)
            target = self.target_encoder.inverse_transform(self.target)
        else:
            target = self.target

        report_lines.append("\n📊 Training Set Metrics:")
        if is_regression:
            train_metrics = {
                'R2': r2_score(target, y_pred),
                'MAE': mean_absolute_error(target, y_pred),
                'MSE': mean_squared_error(target, y_pred),
                'RMSE': np.sqrt(mean_squared_error(target, y_pred)),
            }
        else:
            train_metrics = {
                'Accuracy': accuracy_score(target, y_pred),
                'Precision': precision_score(target, y_pred, average='weighted', zero_division=0),
                'Recall': recall_score(target, y_pred, average='weighted', zero_division=0),
                'F1': f1_score(target, y_pred, average='weighted', zero_division=0)
            }
            if hasattr(self.model, 'predict_proba'):
                try:
                    proba = np.asarray(self.model.predict_proba(self.features))
                    if proba.ndim == 2 and proba.shape[1] == 2:
                        # Binary case: roc_auc_score expects a 1-D score for
                        # the positive class, not the two-column matrix.
                        train_metrics['ROC AUC'] = roc_auc_score(target, proba[:, 1])
                    else:
                        train_metrics['ROC AUC'] = roc_auc_score(
                            target, proba, multi_class='ovr'
                        )
                except Exception as e:
                    # Best effort: ROC AUC is optional, but say why it is missing.
                    print(f"⚠️ Could not compute training ROC AUC: {str(e)}")

        train_metrics_list = []
        for metric, value in train_metrics.items():
            report_lines.append(f" - {metric}: {value:.3f}")
            train_metrics_list.append({'Metric': metric, 'Value': value})
        train_metrics_df = pd.DataFrame(train_metrics_list)

        # Feature importance section: append to the report instead of
        # replacing everything collected so far.
        importance_lines, _ = _get_feature_importance_section(self)
        report_lines.extend(importance_lines)

        # Statistical details section
        if hasattr(self, 'stats_model') and getattr(self, 'p_values', None) is not None:
            report_lines.extend(_get_statistical_details_section(self))

        # Export report
        if export_path:
            try:
                fmt = format.lower()
                if fmt not in ('txt', 'pdf', 'docx'):
                    raise ValueError(f"🦈 Unsupported format: {format}")

                if os.path.isdir(export_path):
                    # A directory was given: create a timestamped file in it.
                    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                    export_path = os.path.join(export_path, f"sharkpy_report_{stamp}.{fmt}")

                export_dir = os.path.dirname(os.path.abspath(export_path))
                os.makedirs(export_dir, exist_ok=True)

                if fmt == 'txt':
                    _export_txt_report(export_path, report_lines)
                    print(f"\n🦈 Text report exported to: {export_path}")
                elif fmt == 'docx':
                    _export_docx_report(export_path, self, cv_metrics_df, train_metrics_df, self.problem_type)
                    print(f"\n🦈 Word document exported to: {export_path}")
                else:  # pdf
                    # Derive the intermediate DOCX from the file stem. It must
                    # never coincide with export_path, otherwise removing it
                    # afterwards would delete the finished PDF.
                    stem = os.path.splitext(export_path)[0]
                    docx_path = stem + '.docx'
                    if docx_path == export_path:
                        docx_path = stem + '_tmp.docx'
                    _export_docx_report(docx_path, self, cv_metrics_df, train_metrics_df, self.problem_type)
                    print("⚠️ Note: PDF export requires Microsoft Word or LibreOffice installed.")
                    _convert_docx_to_pdf(docx_path, export_path)
                    os.remove(docx_path)
                    print(f"\n🦈 PDF report exported to: {export_path}")
            except Exception as e:
                print(f"\n⚠️ Failed to export report: {str(e)}")
                raise

        # Print report to console
        for line in report_lines:
            print(line)

        return cv_results, train_metrics

    finally:
        # Restore original backend
        matplotlib.use(original_backend)