---
license: bigscience-openrail-m
datasets:
- Abdelrahma12/heart_failure_clinical_records_dataset.csv
language:
- ar
metrics:
- accuracy
base_model:
- HuggingFaceTB/SmolLM3-3B
new_version: HuggingFaceTB/SmolLM3-3B
library_name: adapter-transformers
tags:
- medical
---

### Importing Libraries

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import warnings

warnings.filterwarnings('ignore')

# Optional: only needed if the CSV is stored on Google Drive
from google.colab import drive
```

### Data Load

```python
# Assumes the CSV has been uploaded to the Colab session under /content
data = pd.read_csv(r'/content/heart_failure_clinical_records_dataset - Copy.csv')
```

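A quick shape check helps confirm the load; the publicly available version of this dataset has 299 rows and 13 columns:

```python
print(data.shape)  # expected (299, 13) for the public heart-failure dataset
```
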
### Exploratory Data Analysis

```python
data.head()
data.info()
data.duplicated().sum()

# Bin ages into groups for visualization; include_lowest=True keeps
# patients aged exactly 40 from falling outside every bin
labels = ["40-45", "46-50", "51-55", "56-60", "61-65", "66-70", "71-75", "76-80", "81-95"]
data['age_group'] = pd.cut(data['age'], bins=[40, 45, 50, 55, 60, 65, 70, 75, 80, 95],
                           labels=labels, include_lowest=True)

data.isnull().sum()
```

### Data Visualization

```python
# Death counts by age group
plt.figure(figsize=(10, 6))
sns.countplot(data=data, x='age_group', hue='DEATH_EVENT', palette=["lightblue", "red"])
plt.title("Death Count by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Patient Count")
plt.legend(["Survived", "Died"])
plt.show()

# Correlation matrix (age_group is categorical, so it is excluded)
corr_matrix = data.drop(columns=['age_group']).corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Heart Failure Clinical Records')
plt.show()

# Class distribution of the target; value_counts() puts the survivor
# majority (DEATH_EVENT=0) first, so the labels match that order
death_counts = data['DEATH_EVENT'].value_counts()
plt.figure(figsize=(6, 6))
plt.pie(death_counts, labels=['Survived', 'Died'], autopct='%1.1f%%', startangle=90,
        colors=['skyblue', 'lightcoral'])
plt.title('Distribution of DEATH_EVENT')
plt.show()

# Pairplot of the numerical features that showed some correlation with DEATH_EVENT
selected_features = ['time', 'serum_creatinine', 'ejection_fraction', 'age', 'serum_sodium', 'DEATH_EVENT']
sns.pairplot(data[selected_features], hue='DEATH_EVENT', diag_kind='kde')
plt.suptitle('Pairplot of Selected Numerical Features by DEATH_EVENT', y=1.02)
plt.show()
```

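The pairplot features above were hand-picked. A quick way to check that they really are the strongest signals is to rank every numeric column by the absolute value of its correlation with `DEATH_EVENT`; a minimal sketch, assuming `data` and `age_group` are as defined above:

```python
# Rank features by |Pearson correlation| with the target
target_corr = (
    data.drop(columns=['age_group'])
        .corr()['DEATH_EVENT']
        .drop('DEATH_EVENT')
        .abs()
        .sort_values(ascending=False)
)
print(target_corr.head())  # the pairplot features should sit near the top
```
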
# Data Preprocessing

### Data Split

```python
# age_group was only needed for visualization; drop it before modeling
data.drop(columns=['age_group'], inplace=True)

X = data.drop('DEATH_EVENT', axis=1)
y = data['DEATH_EVENT']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
```

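Because the pie chart above shows roughly a 2:1 survivor-to-death split, stratified splitting matters here. A quick sanity check, sketched below, confirms both splits keep that class ratio:

```python
# Both splits should show (approximately) the same class proportions
print("train:", y_train.value_counts(normalize=True).round(3).to_dict())
print("test: ", y_test.value_counts(normalize=True).round(3).to_dict())
```
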
### Feature Scaling

```python
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
continuous_features = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
                       'serum_creatinine', 'serum_sodium', 'time']

# Fit the scaler on the training split only, then apply it to the test split,
# so no test-set statistics leak into training
X_train[continuous_features] = scaler.fit_transform(X_train[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])
```

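An alternative worth noting, sketched here rather than used by the notebook: bundling the scaler and classifier in a scikit-learn `Pipeline` with a `ColumnTransformer`. This sketch assumes the raw, unscaled splits (i.e. run instead of, not after, the cell above) and guarantees the scaler is refit inside each cross-validation fold:

```python
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Scale only the continuous columns; the binary flags pass through unchanged
preprocess = ColumnTransformer(
    [('scale', StandardScaler(), continuous_features)],
    remainder='passthrough'
)
pipe = Pipeline([
    ('prep', preprocess),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])
pipe.fit(X_train, y_train)  # wrapped in GridSearchCV, scaling is refit per fold
```
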
# Modeling

### Logistic Regression

```python
# Not every penalty/solver pair is valid in scikit-learn (lbfgs supports only
# l2 or no penalty, and elasticnet requires saga plus an l1_ratio), so the
# search space is split into compatible sub-grids; l1_ratio=0.5 is a single
# midpoint value chosen for illustration
log_params = [
    {'penalty': ['l2', None], 'C': [0.01, 0.1, 1, 10, 100],
     'solver': ['lbfgs'], 'max_iter': [1000]},
    {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10, 100],
     'solver': ['saga'], 'max_iter': [1000]},
    {'penalty': ['elasticnet'], 'C': [0.01, 0.1, 1, 10, 100],
     'l1_ratio': [0.5], 'solver': ['saga'], 'max_iter': [1000]}
]

log_grid = GridSearchCV(LogisticRegression(random_state=42), log_params, cv=5)
log_grid.fit(X_train, y_train)

print("Logistic Regression Best Params:", log_grid.best_params_)
```

#### Evaluation

```python
log_model = LogisticRegression(
    penalty='l2',
    C=0.1,
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
print("Logistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log):.4f}")
print(classification_report(y_test, y_pred_log))
```

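Retyping the best parameters by hand works, but it can drift out of sync with the search. Since `GridSearchCV` refits the winning model on the full training set by default, an equivalent evaluation reads directly from the grid; the same pattern applies to every model below:

```python
# best_estimator_ is the already-refitted winner of the grid search
y_pred_best = log_grid.best_estimator_.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_best):.4f}")
```
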
### Random Forest

```python
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5)
rf_grid.fit(X_train, y_train)

print("Random Forest Best Params:", rf_grid.best_params_)
```

#### Evaluation

```python
rf_model = RandomForestClassifier(
    n_estimators=50, max_depth=5,
    min_samples_leaf=2, min_samples_split=5,
    random_state=42
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Random Forest")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(classification_report(y_test, y_pred_rf))
```

### SVM

```python
svm_params = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']  # gamma only affects the rbf kernel
}

svm_grid = GridSearchCV(SVC(probability=True, random_state=42), svm_params, cv=5)
svm_grid.fit(X_train, y_train)

print("SVM Best Params:", svm_grid.best_params_)
```

#### Evaluation

```python
svm_model = SVC(
    C=0.1, gamma='scale', kernel='linear',
    probability=True, random_state=42
)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("\nSVM")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}")
print(classification_report(y_test, y_pred_svm))
```

### MLP

```python
mlp_params = {
    'hidden_layer_sizes': [(64,), (64, 32), (128, 64)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant', 'adaptive']
}

mlp_grid = GridSearchCV(MLPClassifier(max_iter=1000, random_state=42), mlp_params, cv=5)
mlp_grid.fit(X_train, y_train)

print("MLP Best Params:", mlp_grid.best_params_)
```

#### Evaluation

```python
mlp_model = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='tanh',
    alpha=0.0001,
    learning_rate='constant',
    max_iter=1000,
    random_state=42
)
mlp_model.fit(X_train, y_train)
y_pred_mlp = mlp_model.predict(X_test)
print("\nMLP Neural Network")
print(f"Accuracy: {accuracy_score(y_test, y_pred_mlp):.4f}")
print(classification_report(y_test, y_pred_mlp))
```

### XGBoost

```python
xgb_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2]
}

# use_label_encoder is deprecated in recent XGBoost releases and can be
# dropped there; it is kept for compatibility with older versions
xgb_grid = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    xgb_params, cv=5
)
xgb_grid.fit(X_train, y_train)

print("XGBoost Best Params:", xgb_grid.best_params_)
```

#### Evaluation

```python
xgb_model = XGBClassifier(
    n_estimators=50,
    max_depth=4,
    learning_rate=0.2,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
print("\nXGBoost")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(classification_report(y_test, y_pred_xgb))
```

### KNN

```python
knn_params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5)
knn_grid.fit(X_train, y_train)

print("KNN Best Params:", knn_grid.best_params_)
```

#### Evaluation

```python
knn_model = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    metric='euclidean'
)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
print("\nKNN")
print(f"Accuracy: {accuracy_score(y_test, y_pred_knn):.4f}")
print(classification_report(y_test, y_pred_knn))
```

### Model Accuracies

```python
# Test-set accuracies recorded from the evaluation runs above
models = [
    'Random Forest', 'SVM', 'MLP',
    'XGBoost', 'KNN', 'Logistic Regression'
]
accuracies = [
    0.85, 0.8333, 0.6833,
    0.8333, 0.7167, 0.8333
]

plt.figure(figsize=(10, 6))
plt.bar(models, accuracies, color=['blue', 'green', 'purple', 'orange', 'red', 'cyan'])
plt.ylim(0, 1)
plt.ylabel('Accuracy')
plt.title('Model Accuracy Comparison')
plt.xticks(rotation=30)
plt.show()
```

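Hardcoding the scores invites drift whenever a cell above is re-run. A small sketch that collects them directly from the predictions already in memory:

```python
# Build the comparison from the predictions computed in the evaluation cells
results = {
    'Random Forest': y_pred_rf, 'SVM': y_pred_svm, 'MLP': y_pred_mlp,
    'XGBoost': y_pred_xgb, 'KNN': y_pred_knn, 'Logistic Regression': y_pred_log
}
models = list(results)
accuracies = [accuracy_score(y_test, pred) for pred in results.values()]
```
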
```python
import gradio as gr
import joblib

# Persist the trained Random Forest and the fitted scaler for the demo app
joblib.dump(rf_model, "heart_model.pkl")
joblib.dump(scaler, "scaler.pkl")
print("Model and scaler saved successfully")

model = joblib.load("heart_model.pkl")
scaler = joblib.load("scaler.pkl")
```

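A quick sanity check that the round trip through disk preserved behavior:

```python
# The reloaded model should reproduce the in-memory model's predictions exactly
assert np.array_equal(model.predict(X_test), rf_model.predict(X_test))
```
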
```python
def predict_heart_risk(age, cpk, ef, platelets, sc, ss, time, anaemia, diabetes, high_bp, sex, smoking):
    # Assemble a single-row frame in the exact column order used during training
    data = pd.DataFrame([[
        age, anaemia, cpk, diabetes, ef, high_bp,
        platelets, sc, ss, sex, smoking, time
    ]], columns=[
        'age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
        'ejection_fraction', 'high_blood_pressure', 'platelets',
        'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time'
    ])

    # Apply the scaler fitted on the training data
    continuous_features = ['age', 'creatinine_phosphokinase', 'ejection_fraction',
                           'platelets', 'serum_creatinine', 'serum_sodium', 'time']
    data[continuous_features] = scaler.transform(data[continuous_features])

    prediction = model.predict(data)[0]
    return "At Risk" if prediction == 1 else "Not At Risk"
```

```python
inputs = [
    gr.Number(label="Age"),
    gr.Number(label="Creatinine Phosphokinase, Range [0, 100000]"),
    gr.Number(label="Ejection Fraction, Range [5, 85]"),
    gr.Number(label="Platelets, Range [5000, 2000000]"),
    gr.Number(label="Serum Creatinine, Range [0.1, 60]"),
    gr.Number(label="Serum Sodium, Range [95, 255]"),
    gr.Number(label="Follow-up Time (days)"),
    gr.Radio([0, 1], label="Anaemia (0=No, 1=Yes)"),
    gr.Radio([0, 1], label="Diabetes (0=No, 1=Yes)"),
    gr.Radio([0, 1], label="High Blood Pressure (0=No, 1=Yes)"),
    gr.Radio([0, 1], label="Sex (0=Female, 1=Male)"),
    gr.Radio([0, 1], label="Smoking (0=No, 1=Yes)")
]
```

```python
gr.Interface(
    fn=predict_heart_risk,
    inputs=inputs,
    outputs="text",
    title="Heart Failure Risk Predictor",
    description="Enter patient data to predict whether they are at risk of heart failure.",
    allow_flagging="never"
).launch()
```

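The prediction function can also be smoke-tested directly, without the UI. The values below are hypothetical, chosen only to fall inside the ranges shown on the input labels:

```python
# Hypothetical patient; values are illustrative, not drawn from the dataset
print(predict_heart_risk(
    age=60, cpk=250, ef=38, platelets=262000,
    sc=1.1, ss=137, time=115,
    anaemia=0, diabetes=0, high_bp=1, sex=1, smoking=0
))  # prints "At Risk" or "Not At Risk"
```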