Analysis Task¶
This project aims to analyze data from the NBA Draft Combine for the years 2000-2020. The objective is to understand the relationship between player metrics measured at the combine and draft outcomes across all positions.
In doing so, this analysis asks the question: how and to what extent does combine data correlate with draft decisions among NBA prospects (2000-2020)? More specifically:
1. Which input variables does the trained model rely on most heavily when making predictions of draft outcome?
2. Do the input variables contain statistically significant information that a Random Forest learning algorithm can exploit to make useful predictions?
Data Source and Preparation¶
The data for this analysis is derived from official NBA stats and compiled here: https://www.kaggle.com/datasets/marcusfern/nba-draft-combine
After external preprocessing and cleaning, the source material contains 1,249 entries covering the following metrics: height, weight, wingspan, standing reach, standing vertical, max vertical, bench reps, lane agility time, three-quarter court sprint time, modified lane agility time, hand length, hand width, and body fat percentage, along with player position and draft outcome.
Analysis¶
Import Packages¶
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import permutation_test_score
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import base64
from IPython.display import HTML, display
from rich.console import Console
from rich.table import Table
from rich.text import Text
from tabulate import tabulate
Import and Clean Files¶
# Load CSV
data = pd.read_csv("NBA Draft (2000-2020).csv")
# Replace empty strings with NaN
data = data.replace("", np.nan)
# Identify numeric columns
numeric_columns = data.select_dtypes(include=[np.number]).columns
# Fill missing values with column median
for col in numeric_columns:
    data[col] = data[col].fillna(data[col].median())
Set Up and Train AI (Random Forest Classification)¶
# Define features and target
X = data.drop(columns=["drafted", "yearCombine"])
y = data["drafted"]
# Convert categorical variables to numerical
X = pd.get_dummies(X)
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=49)
# Train a Random Forest (100 trees, fixed random seed) on the training set
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
Test AI Model on Reserved Test Data¶
# Predict the test-set targets from their features
y_pred = rf.predict(X_test)
# Calculate accuracy and F1 metrics
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# Rich table display
console = Console()
table = Table(title="Overall Metrics")
table.add_column("Metric", style="black", no_wrap=True)
table.add_column("Value", justify="right")
table.add_row("Accuracy", f"{acc:.3f}")
table.add_row("F1 Score", f"{f1:.3f}")
console.print(table)
Overall Metrics ┏━━━━━━━━━━┳━━━━━━━┓ ┃ Metric ┃ Value ┃ ┡━━━━━━━━━━╇━━━━━━━┩ │ Accuracy │ 0.608 │ │ F1 Score │ 0.660 │ └──────────┴───────┘
# Print the confusion matrix as a Rich table
console = Console()
cm = confusion_matrix(y_test, y_pred)
table = Table(title="Confusion Matrix")
table.add_column(" ", justify="right")
table.add_column("Predicted 0", justify="center")
table.add_column("Predicted 1", justify="center")
table.add_row("Actual 0", str(cm[0, 0]), str(cm[0, 1]))
table.add_row("Actual 1", str(cm[1, 0]), str(cm[1, 1]))
console.print(table)
Confusion Matrix ┏━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ ┃ Predicted 0 ┃ Predicted 1 ┃ ┡━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ │ Actual 0 │ 57 │ 57 │ │ Actual 1 │ 41 │ 95 │ └──────────┴─────────────┴─────────────┘
AI Model Evaluation Results¶
The results of this evaluation are encouraging. The model was tested on data from 250 prospects (one-fifth of the total entries) and correctly predicted their draft status 152 times: 60.8% accuracy. Had one simply guessed "drafted" for every prospect in the test group (as shown above in the confusion matrix), one would have been right 54.4% of the time (136 drafted of 250 tested). Taking this as the baseline, the model performs 6.4 percentage points above it.
The confusion matrix shows the model tends to slightly overpredict drafting (152 predicted drafted versus 136 actually drafted) but does well overall.
The F1 score (0.660) balances the model’s ability to correctly identify drafted players (recall) with how often players predicted as drafted actually were drafted (precision). On a scale from 0 to 1, this moderate score indicates the model is reasonably effective at predicting drafted players.
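These figures can be verified directly from the confusion-matrix counts reported above (true negatives 57, false positives 57, false negatives 41, true positives 95):

```python
# Recompute the headline metrics from the confusion-matrix counts above.
tn, fp, fn, tp = 57, 57, 41, 95
total = tn + fp + fn + tp

accuracy = (tn + tp) / total        # correct predictions: 152 / 250
baseline = (fn + tp) / total        # always guess "drafted": 136 / 250
precision = tp / (tp + fp)          # predicted drafted who were drafted
recall = tp / (tp + fn)             # drafted players the model found
f1 = 2 * precision * recall / (precision + recall)

print(f"Accuracy: {accuracy:.3f}")  # 0.608
print(f"Baseline: {baseline:.3f}")  # 0.544
print(f"F1 score: {f1:.3f}")        # 0.660
```

This hand check confirms that the accuracy and F1 figures in the text follow from the confusion matrix alone.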
Feature Importance¶
Given that the model has the ability to predict drafting better than the baseline, we return to our first question:
Which input variables does the trained model rely on most heavily when making predictions of draft outcome?
# Extract feature importances
rf_importances = rf.feature_importances_
# Create a DataFrame sorted by importance
feature_importance_df = pd.DataFrame({
"Feature": X_train.columns,
"Importance": rf_importances
}).sort_values("Importance", ascending=False).reset_index(drop=True)
console = Console()
# Limit to top N features
top_n = 10
df = feature_importance_df.head(top_n)
max_importance = df["Importance"].max()
bar_width = 30
# Create a horizontal bar
def importance_bar(value, max_value, width=30):
filled = int((value / max_value) * width)
return Text("█" * filled + " " * (width - filled), style="blue")
# Create a Rich table
table = Table(title="Random Forest Feature Importance (Top 10)")
table.add_column("Feature", style="black", no_wrap=True)
table.add_column("Importance", justify="right")
table.add_column("")
# Add each feature as a row with a horizontal bar
for _, row in df.iterrows():
table.add_row(
row["Feature"],
f"{row['Importance']:.4f}",
importance_bar(row["Importance"], max_importance, bar_width))
# Print the table
console.print(table)
Random Forest Feature Importance (Top 10) ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ Feature ┃ Importance ┃ ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ timeLaneAgility │ 0.1130 │ ██████████████████████████████ │ │ body_fat_pct │ 0.1062 │ ████████████████████████████ │ │ weight │ 0.0968 │ █████████████████████████ │ │ max_vertical │ 0.0950 │ █████████████████████████ │ │ wingspan │ 0.0899 │ ███████████████████████ │ │ height │ 0.0871 │ ███████████████████████ │ │ timeThreeQuarterCourtSprint │ 0.0860 │ ██████████████████████ │ │ reach_standing │ 0.0833 │ ██████████████████████ │ │ standing_vertical │ 0.0830 │ ██████████████████████ │ │ bench_reps │ 0.0753 │ ███████████████████ │ └─────────────────────────────┴────────────┴────────────────────────────────┘
These results reveal that lane agility time carries the most weight in the model’s decision of whether a player is drafted across all positions, followed by body fat percentage and weight. Max vertical jump, wingspan, and height also play notable roles in the model’s predictions.
Still, the distribution is relatively flat: bench reps, the least important combine metric, carries only about one-third less weight than lane agility time (0.075 versus 0.113).
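This flatness can be quantified from the top-10 importances printed above:

```python
# Compare the least and most important combine metrics from the table above.
importances = {
    "timeLaneAgility": 0.1130, "body_fat_pct": 0.1062, "weight": 0.0968,
    "max_vertical": 0.0950, "wingspan": 0.0899, "height": 0.0871,
    "timeThreeQuarterCourtSprint": 0.0860, "reach_standing": 0.0833,
    "standing_vertical": 0.0830, "bench_reps": 0.0753,
}
lo, hi = min(importances.values()), max(importances.values())
print(f"bench_reps / timeLaneAgility = {lo / hi:.2f}")  # 0.67
```

A ratio of roughly two-thirds means no single metric dominates the model's decisions.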
Data Predictability¶
Finally, we can ask the question:
Do the input variables contain statistically significant information that a Random Forest learning algorithm can exploit to make useful predictions?
To answer this question, we'll look at the p-value:
score, perm_scores, p_value = permutation_test_score(
rf,
X,
y,
scoring="f1",
cv=5,
n_permutations=500,
n_jobs=-1
)
# Print the results
print("=== Permutation Test Results ===")
print(f"Permutation test p-value: {p_value:.4f}")
=== Permutation Test Results === Permutation test p-value: 0.5968
A p-value of 0.05 or lower would indicate that the model’s predictions are statistically significant. Our p-value (0.5968) is far above this threshold. Although the model’s observed accuracy and F1 score were encouraging, this high p-value suggests that its apparent predictive power is likely attributable to random chance rather than genuine signal.
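For intuition, `permutation_test_score` refits the model on randomly shuffled labels and computes the p-value as the fraction of permuted scores that match or beat the real score, with a +1 correction in both numerator and denominator. A minimal sketch of that calculation, using illustrative numbers rather than the actual run:

```python
import numpy as np

# Illustrative values only: real_score and perm_scores are synthetic stand-ins,
# not the scores from the analysis above.
rng = np.random.default_rng(0)
real_score = 0.62                            # score on the true labels
perm_scores = rng.normal(0.60, 0.03, 500)    # scores after shuffling labels

# sklearn's formula: p = (count of permuted scores >= real score + 1) / (n + 1)
p_value = (np.sum(perm_scores >= real_score) + 1) / (len(perm_scores) + 1)
print(f"p-value: {p_value:.4f}")
```

A real score that barely separates from the shuffled-label scores, as here, yields a large p-value, which is exactly the pattern our model exhibits.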
This answers our question: the input variables from the NBA combine likely do not contain enough statistically significant information to train a learning algorithm that predicts the draft better than chance. In other words, the NBA combine alone is not predictive of draft outcomes, even though some correlations may still exist.
Conclusion¶
This analysis explored the predictive value of NBA Draft Combine metrics for draft outcomes from 2000–2020 using a Random Forest classifier. While the model achieved an accuracy of 60.8% and an F1 score of 0.660, permutation testing revealed a high p-value (0.5968), indicating that these predictions are not statistically significant. The combine data alone does not reliably predict draft outcomes.
Key Insights:¶
Most influential metrics: Lane agility time, body fat percentage, and weight were the features the model relied on most heavily, followed by max vertical jump, wingspan, and height.
Overall predictability: Despite some correlations between combine metrics and draft likelihood, the dataset lacks sufficient statistical signal to consistently inform draft decisions.
Final Remarks:¶
The NBA Draft Combine offers a snapshot of player athleticism, but these measurements were not shown to be predictive of draft outcome. Draft decisions are also shaped by factors not captured at the combine, including in-game performance, skill development, and team strategy. Integrating these elements could improve future predictive models.