import os
import tempfile
import atexit
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import (
make_scorer, r2_score, mean_absolute_error, mean_squared_error,
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score
)
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx2pdf import convert
from typing import Optional, Tuple, List, Dict, Any
# Module-wide plotting defaults: seaborn "whitegrid" theme, then a 10x6 inch
# default figure and 12 pt base font for every figure created afterwards.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})
[docs]
def _create_temp_plot(shark: Any, kind: str, width: int = 8, height: int = 6) -> Optional[str]:
    """Render one plot into a temporary PNG file and return the file's path.

    Parameters
    ----------
    shark : Any
        Object whose ``model``, ``features`` and ``target`` attributes are
        passed to ``plot_model``.
    kind : str
        Plot identifier forwarded to ``plot_model`` as ``kind``.
    width, height : int, optional
        Accepted for interface compatibility; they are not forwarded to
        ``plot_model`` by this function.

    Returns
    -------
    str or None
        Path of the PNG written by ``plot_model`` (the caller is responsible
        for deleting it), or None if anything went wrong. On failure a
        warning is printed and the temporary file is removed, so no stray
        files are left in the temp directory.
    """
    temp_path = None
    try:
        # mkstemp returns an open descriptor; close it right away because
        # plot_model writes to the path itself.
        fd, temp_path = tempfile.mkstemp(suffix='.png')
        os.close(fd)

        original_backend = matplotlib.get_backend()
        try:
            # Render off-screen so no GUI window is needed.
            matplotlib.use('Agg')
            from .plotting import plot_model
            plot_model(shark.model, shark.features, shark.target,
                       kind=kind, show=False, save_path=temp_path)
        finally:
            # Always hand the caller back the backend they started with.
            matplotlib.use(original_backend)
        return temp_path
    except Exception as e:
        print(f"⚠️ Could not create {kind} plot: {str(e)}")
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup: the file may already be gone.
                pass
        return None
[docs]
def _get_feature_importance_section(shark: Any) -> Tuple[List[str], Optional[pd.DataFrame]]:
    """Build the feature-importance section as text lines plus a table.

    The model's ``feature_importances_`` are used when present; otherwise
    its ``coef_`` (multi-row coefficient arrays are collapsed to the mean
    absolute value per feature). Features are listed from largest to
    smallest score (by absolute value for coefficients).

    Returns
    -------
    tuple
        ``(lines, table)`` where ``lines`` starts with the section heading
        and ``table`` is a DataFrame with a ``Feature`` column and either an
        ``Importance`` or ``Coefficient`` column. ``table`` is None when the
        model exposes neither attribute or there are no rows.
    """
    estimator = shark.model
    lines = ["\n📊 Feature Importance:"]

    if hasattr(estimator, 'feature_importances_'):
        scores = estimator.feature_importances_
        ranking = np.argsort(scores)[::-1]
        value_column, suffix = 'Importance', ''
    elif hasattr(estimator, 'coef_'):
        scores = estimator.coef_
        if len(scores.shape) > 1:
            # One coefficient row per class/output: average their magnitudes.
            scores = np.abs(scores).mean(axis=0)
        ranking = np.argsort(np.abs(scores))[::-1]
        value_column, suffix = 'Coefficient', ' (abs. coefficient)'
    else:
        lines.append(" - Feature importance not available for this model.")
        return lines, None

    rows = []
    for position in ranking:
        name = shark.feature_names[position]
        score = scores[position]
        lines.append(f" - {name}: {score:.3f}{suffix}")
        rows.append({'Feature': name, value_column: score})

    table = pd.DataFrame(rows) if rows else None
    return lines, table
[docs]
def _get_statistical_details_section(shark: Any) -> List[str]:
    """Return the statistical-details section as a list of report lines.

    The result is the section heading followed by the string form of
    ``shark.statistical_summary``.
    """
    summary_text = str(shark.statistical_summary)
    return ["\n📈 Statistical Details:", summary_text]
[docs]
def _add_table_to_doc(doc: Document, df: pd.DataFrame, title: str):
    """Append a level-1 heading and a grid table holding ``df`` to ``doc``.

    The first table row carries the column labels in bold; every following
    row is one DataFrame row. Float values are rendered with three decimals,
    everything else with ``str()``. All cell text is set to 12 pt.
    """
    cell_font_size = Pt(12)

    doc.add_heading(title, level=1)
    table = doc.add_table(rows=1, cols=len(df.columns))
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    table.style = 'Table Grid'

    # Header row: one bold cell per DataFrame column.
    for header_cell, label in zip(table.rows[0].cells, df.columns):
        header_cell.text = label
        header_run = header_cell.paragraphs[0].runs[0]
        header_run.bold = True
        header_run.font.size = cell_font_size

    # Body rows: one table row per DataFrame row.
    for _, record in df.iterrows():
        new_cells = table.add_row().cells
        for body_cell, value in zip(new_cells, record):
            if isinstance(value, float):
                body_cell.text = f"{value:.3f}"
            else:
                body_cell.text = str(value)
            body_cell.paragraphs[0].runs[0].font.size = cell_font_size
[docs]
def _export_docx_report(path: str, shark: Any, cv_metrics_df: pd.DataFrame, train_metrics_df: pd.DataFrame, problem_type: str):
    """Export report as Word document with enhanced formatting, tables, and plots.

    Parameters
    ----------
    path : str
        Destination of the ``.docx`` file (passed to ``doc.save``).
    shark : Any
        Object supplying ``project_name``, ``model``, ``problem_type``,
        ``target``, ``feature_names`` and ``features``; ``p_values`` and
        ``statistical_summary`` are read only when it has a ``stats_model``
        attribute.
    cv_metrics_df : pandas.DataFrame
        Cross-validation metrics rendered as the first table.
    train_metrics_df : pandas.DataFrame
        Training-set metrics rendered as the second table.
    problem_type : str
        ``"regression"`` selects prediction/residual plots; any other value
        is treated as classification.

    Notes
    -----
    Each plot is rendered to a temporary PNG by ``_create_temp_plot``. The
    temporary file is deleted even if embedding it fails, and plots that
    could not be created are skipped.
    """
    doc = Document()

    # Title
    heading = doc.add_heading('🦈 SharkPy Model Report', level=0)
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
    heading.runs[0].font.size = Pt(16)

    # Metadata
    doc.add_heading('Model Summary', level=1)
    metadata = [
        f"Project: {shark.project_name}",
        f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"Model Type: {type(shark.model).__name__}",
        f"Problem Type: {shark.problem_type.capitalize()}",
        f"Target Variable: {shark.target.name}",
        f"Features: {', '.join(shark.feature_names)}",
        f"Training Set Shape: {shark.features.shape}"
    ]
    for line in metadata:
        p = doc.add_paragraph(line)
        p.runs[0].font.size = Pt(12)

    # Metric tables
    _add_table_to_doc(doc, cv_metrics_df, 'Cross-Validation Metrics')
    _add_table_to_doc(doc, train_metrics_df, 'Training Set Metrics')

    # Feature importance table (omitted when the model exposes none)
    _, feature_df = _get_feature_importance_section(shark)
    if feature_df is not None:
        _add_table_to_doc(doc, feature_df, 'Feature Importance')

    # Statistical details (for regression)
    if hasattr(shark, 'stats_model') and shark.p_values is not None:
        doc.add_heading('Statistical Details', level=1)
        p = doc.add_paragraph(str(shark.statistical_summary))
        p.runs[0].font.size = Pt(10)

    # Visualizations
    doc.add_heading('Visualizations', level=1)
    plots = [('Feature Importance', 'feature_importance')]
    if problem_type == "regression":
        plots += [
            ('Prediction Plot', 'prediction'),
            ('Residuals Plot', 'residuals')
        ]
    else:  # classification
        plots += [('Confusion Matrix', 'confusion_matrix')]
        if len(np.unique(shark.target)) == 2:  # Binary classification
            plots += [
                ('ROC Curve', 'roc'),
                ('Precision-Recall Curve', 'pr_curve'),
                ('Probability Histogram', 'proba_hist')
            ]

    for plot_title, kind in plots:
        temp_path = _create_temp_plot(shark, kind)
        if not temp_path:
            continue
        try:
            if os.path.exists(temp_path):
                doc.add_heading(plot_title, level=2)
                doc.add_picture(temp_path, width=Inches(5.5))
                # add_picture places the image in its own paragraph at the
                # end of the document; centre that paragraph so the image
                # itself is centred.
                doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
                # Empty centred paragraph kept as a spacer after the image.
                spacer = doc.add_paragraph()
                spacer.alignment = WD_ALIGN_PARAGRAPH.CENTER
        finally:
            # Remove the temporary image even when embedding it raised.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    doc.save(path)
[docs]
def _export_txt_report(path: str, lines: List[str]):
    """Write the report lines to ``path`` as a UTF-8 text file.

    Each entry of ``lines`` is written followed by a newline. The file is
    opened explicitly as UTF-8 because the report contains emoji and other
    non-ASCII characters that the platform default encoding may not be able
    to represent.
    """
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in lines)
[docs]
def _convert_docx_to_pdf(docx_path: str, pdf_path: str) -> None:
    """Convert a DOCX file to PDF, trying docx2pdf first and LibreOffice second.

    Parameters
    ----------
    docx_path : str
        Existing Word document to convert.
    pdf_path : str
        Where the resulting PDF should be written.

    Raises
    ------
    RuntimeError
        If both converters fail. The message contains the error of each
        attempt and the LibreOffice failure is chained as the cause.
    """
    try:
        # Try docx2pdf first (requires MS Word)
        convert(docx_path, pdf_path)
        return
    except Exception as word_error:
        # Keep the first failure so it can be reported if the fallback
        # fails as well.
        first_error = word_error

    import shutil
    import subprocess

    try:
        # The LibreOffice launcher is installed under either name depending
        # on the platform/package.
        office = shutil.which('libreoffice') or shutil.which('soffice')
        if office is None:
            raise FileNotFoundError("LibreOffice executable not found on PATH")

        # abspath guarantees a non-empty --outdir even for bare file names.
        out_dir = os.path.dirname(os.path.abspath(pdf_path))
        subprocess.run([office, '--headless', '--convert-to', 'pdf',
                        '--outdir', out_dir, docx_path],
                       check=True, capture_output=True)

        # LibreOffice names its output after the input document, not after
        # pdf_path, so move the result when the two differ.
        stem = os.path.splitext(os.path.basename(docx_path))[0]
        produced = os.path.join(out_dir, stem + '.pdf')
        if not os.path.exists(produced):
            raise FileNotFoundError(f"LibreOffice did not produce {produced}")
        if os.path.abspath(produced) != os.path.abspath(pdf_path):
            os.replace(produced, pdf_path)
    except Exception as fallback_error:
        raise RuntimeError(
            "Could not convert to PDF. Please install Microsoft Word or LibreOffice.\n"
            f"Error: {str(fallback_error)}\n"
            f"docx2pdf error: {str(first_error)}"
        ) from fallback_error
[docs]
def report(self, cv_folds: int = 5, export_path: Optional[str] = None, format: str = 'txt') -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
    """
    Generate comprehensive model performance report including cross-validation.

    Parameters
    ----------
    self : Any
        The Shark instance
    cv_folds : int, optional
        Number of folds for K-Fold cross-validation (default: 5)
    export_path : str, optional
        Path to export the report. If None, report is only printed.
        If an existing directory is provided, a timestamped file named
        ``sharkpy_report_<YYYYmmdd_HHMMSS>.<format>`` is created inside it.
    format : str, optional
        Export format: 'txt', 'pdf', or 'docx' (default: 'txt')

    Returns
    -------
    tuple
        - cv_results : dict
            Dictionary containing cross-validation results
        - train_metrics : dict
            Dictionary containing training set metrics

    Raises
    ------
    ValueError
        If no model is attached to the instance, or if ``export_path`` is
        given together with an unsupported ``format``.

    Notes
    -----
    - For PDF export, ensure Microsoft Word or LibreOffice is installed for docx2pdf conversion.
    - Visualizations include feature importance, predictions/residuals (regression), or confusion matrix/ROC/PR curves (classification).
    - Any exception raised while exporting is reported on the console and re-raised.
    """
    if not hasattr(self, 'model'):
        raise ValueError("🦈 No model has been trained yet. Call learn() first.")

    # Remember the active matplotlib backend so it can be restored on exit.
    original_backend = matplotlib.get_backend()
    try:
        is_regression = self.problem_type == "regression"

        # Report header
        report_lines = [
            "🦈 SharkPy Model Report 🦈",
            f"Project: {self.project_name}",
            f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"Model Type: {type(self.model).__name__}",
            f"Problem Type: {self.problem_type.capitalize()}",
            f"Target Variable: {self.target.name}",
            f"Features: {', '.join(self.feature_names)}",
            f"Training Set Shape: {self.features.shape}",
        ]

        # Scoring metrics; insertion order is also the display order.
        if is_regression:
            scoring = {
                'r2': 'r2',
                'mae': make_scorer(mean_absolute_error),
                'mse': make_scorer(mean_squared_error),
                'rmse': make_scorer(lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred))),
            }
        else:
            scoring = {
                'accuracy': make_scorer(accuracy_score),
                'precision': make_scorer(precision_score, average='weighted', zero_division=0),
                'recall': make_scorer(recall_score, average='weighted', zero_division=0),
                'f1': make_scorer(f1_score, average='weighted', zero_division=0),
            }
            if hasattr(self.model, 'predict_proba'):
                # Built-in probability-based one-vs-rest ROC AUC scorer;
                # avoids make_scorer's version-dependent keyword for
                # requesting probabilities.
                scoring['roc_auc'] = 'roc_auc_ovr'

        # Cross-validation
        kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_results = cross_validate(
            self.model,
            self.features,
            self.target,
            cv=kf,
            scoring=scoring,
            n_jobs=-1
        )

        # Format CV results for text and table
        report_lines.append("\n📊 Cross-Validation Metrics:")
        cv_metrics_list = []
        for metric in scoring:
            fold_scores = cv_results[f'test_{metric}']
            mean = fold_scores.mean()
            std = fold_scores.std()
            label = 'ROC AUC' if metric == 'roc_auc' else metric.upper()
            report_lines.append(f" - {label}: {mean:.3f} ± {std:.3f}")
            cv_metrics_list.append({'Metric': label, 'Mean': mean, 'Std': std})
        cv_metrics_df = pd.DataFrame(cv_metrics_list)

        # Training metrics (reported on the original label scale when a
        # target encoder is attached)
        y_pred = self.model.predict(self.features)
        if hasattr(self, 'target_encoder'):
            y_pred = self.target_encoder.inverse_transform(y_pred)
            target = self.target_encoder.inverse_transform(self.target)
        else:
            target = self.target

        report_lines.append("\n📊 Training Set Metrics:")
        if is_regression:
            train_metrics = {
                'R2': r2_score(target, y_pred),
                'MAE': mean_absolute_error(target, y_pred),
                'MSE': mean_squared_error(target, y_pred),
                'RMSE': np.sqrt(mean_squared_error(target, y_pred)),
            }
        else:
            train_metrics = {
                'Accuracy': accuracy_score(target, y_pred),
                'Precision': precision_score(target, y_pred, average='weighted', zero_division=0),
                'Recall': recall_score(target, y_pred, average='weighted', zero_division=0),
                'F1': f1_score(target, y_pred, average='weighted', zero_division=0)
            }
            if hasattr(self.model, 'predict_proba'):
                try:
                    proba = np.asarray(self.model.predict_proba(self.features))
                    if proba.ndim == 2 and proba.shape[1] == 2:
                        # Binary problem: roc_auc_score expects the score of
                        # the class with the greater label, not both columns.
                        train_metrics['ROC AUC'] = roc_auc_score(target, proba[:, 1])
                    else:
                        train_metrics['ROC AUC'] = roc_auc_score(target, proba, multi_class='ovr')
                except Exception as exc:
                    # ROC AUC is optional; report why it is missing instead
                    # of dropping it silently.
                    print(f"⚠️ Could not compute training ROC AUC: {str(exc)}")

        train_metrics_list = []
        for metric, value in train_metrics.items():
            report_lines.append(f" - {metric}: {value:.3f}")
            train_metrics_list.append({'Metric': metric, 'Value': value})
        train_metrics_df = pd.DataFrame(train_metrics_list)

        # Feature importance section: append to the report rather than
        # replacing the lines collected so far.
        feature_lines, _ = _get_feature_importance_section(self)
        report_lines.extend(feature_lines)

        # Statistical details section
        if hasattr(self, 'stats_model') and self.p_values is not None:
            report_lines.extend(_get_statistical_details_section(self))

        # Export report
        if export_path:
            fmt = format.lower()
            try:
                if fmt not in ('txt', 'pdf', 'docx'):
                    raise ValueError(f"🦈 Unsupported format: {format}")

                if os.path.isdir(export_path):
                    # A directory was given: create a timestamped file in it.
                    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                    export_path = os.path.join(export_path, f"sharkpy_report_{stamp}.{fmt}")

                export_dir = os.path.dirname(os.path.abspath(export_path))
                os.makedirs(export_dir, exist_ok=True)

                if fmt == 'txt':
                    _export_txt_report(export_path, report_lines)
                    print(f"\n🦈 Text report exported to: {export_path}")
                elif fmt == 'docx':
                    _export_docx_report(export_path, self, cv_metrics_df, train_metrics_df, self.problem_type)
                    print(f"\n🦈 Word document exported to: {export_path}")
                else:  # pdf
                    # Intermediate DOCX sits next to the PDF with the same
                    # stem; it must never be the PDF path itself because it
                    # is deleted after conversion.
                    stem, _ext = os.path.splitext(export_path)
                    docx_path = stem + '.docx'
                    if docx_path == export_path:
                        docx_path = stem + '_intermediate.docx'
                    _export_docx_report(docx_path, self, cv_metrics_df, train_metrics_df, self.problem_type)
                    print("⚠️ Note: PDF export requires Microsoft Word or LibreOffice installed.")
                    _convert_docx_to_pdf(docx_path, export_path)
                    os.remove(docx_path)
                    print(f"\n🦈 PDF report exported to: {export_path}")
            except Exception as e:
                print(f"\n⚠️ Failed to export report: {str(e)}")
                raise

        # Print report to console
        for line in report_lines:
            print(line)

        return cv_results, train_metrics
    finally:
        # Restore original backend
        matplotlib.use(original_backend)