heart

File size: 9,575 Bytes

---
license: bigscience-openrail-m
datasets:
- Abdelrahma12/heart_failure_clinical_records_dataset.csv
language:
- ar
metrics:
- accuracy
base_model:
- HuggingFaceTB/SmolLM3-3B
new_version: HuggingFaceTB/SmolLM3-3B
library_name: adapter-transformers
tags:
- medical
---
### Importing Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

from google.colab import drive


### Data Load

# Location of the CSV inside the Colab runtime.
DATA_PATH = r'/content/heart_failure_clinical_records_dataset - Copy.csv'
data = pd.read_csv(DATA_PATH)


### Data Exploration

# The bare expressions below only render output when they are the last
# statement of a notebook cell; run as a plain script they are evaluated and
# the result is discarded.
data

data.head()

# Prints the column names, dtypes and non-null counts.
data.info()

# Count of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

# Bucket patients into labelled age bands (used by the plots further down).
# pd.cut builds right-closed intervals and leaves the lowest edge open by
# default, so an age of exactly 40 would match no bin and become NaN.
# include_lowest=True closes the first interval so "40-45" really contains 40.
age_bins = [40, 45, 50, 55, 60, 65, 70, 75, 80, 95]
labels = ["40-45", "46-50", "51-55", "56-60", "61-65", "66-70", "71-75", "76-80", "81-95"]
data['age_group'] = pd.cut(
    data['age'],
    bins=age_bins,
    labels=labels,
    include_lowest=True,
)

# Missing values per column; age_group is NaN only for ages outside 40-95.
data.isnull().sum()

### Data Visualization

# Patient counts per age band, split by outcome.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=data,
    x='age_group',
    hue='DEATH_EVENT',
    palette=["lightblue", "red"],
    ax=ax,
)
ax.set_title("Death Count by Age Group")
ax.set_xlabel("Age Group")
ax.set_ylabel("Patient Count")
# NOTE(review): legend labels are positional - this assumes the DEATH_EVENT=0
# bars are drawn before the DEATH_EVENT=1 bars; confirm against the plot.
ax.legend(["Survived", "Died"])
plt.show()

# Pairwise correlations of every column except the categorical age_group
# helper, rendered as an annotated heatmap.
without_age_group = data.drop(columns=['age_group'])
corr_matrix = without_age_group.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt=".2f")
plt.title('Correlation Matrix of Heart Failure Clinical Records')
plt.show()

# Share of each outcome in the dataset.
# value_counts() orders its result by frequency, so positional labels and
# colours would silently swap if deaths ever outnumbered survivors.
# Reindexing pins slot 0 to DEATH_EVENT=0 ('Not Died') and slot 1 to
# DEATH_EVENT=1 ('Died'); fill_value=0 covers a class that is absent.
death_counts = data['DEATH_EVENT'].value_counts().reindex([0, 1], fill_value=0)
plt.figure(figsize=(6, 6))
plt.pie(
    death_counts,
    labels=['Not Died', 'Died'],
    autopct='%1.1f%%',
    startangle=90,
    colors=['skyblue', 'lightcoral'],
)
plt.title('Distribution of DEATH_EVENT')
plt.show()

# Numerical columns that showed some correlation with DEATH_EVENT, plus the
# target itself so it can drive the colouring.
selected_features = [
    'time',
    'serum_creatinine',
    'ejection_fraction',
    'age',
    'serum_sodium',
    'DEATH_EVENT',
]

pairplot_frame = data[selected_features]
sns.pairplot(pairplot_frame, diag_kind='kde', hue='DEATH_EVENT')
# y > 1 places the title above the grid of subplots.
plt.suptitle('Pairplot of Selected Numerical Features by DEATH_EVENT', y=1.02)
plt.show()

# Data Preprocessing

### Data Split


# age_group was only a plotting aid; remove it before modelling.
data.drop(columns='age_group', inplace=True)

# DEATH_EVENT is the target; every remaining column is a feature.
y = data['DEATH_EVENT']
X = data.drop(columns=['DEATH_EVENT'])

from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

### Feature Scaling

from sklearn.preprocessing import StandardScaler

# Only these columns are standardised; all other features are left as-is.
continuous_features = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]

# Fit on the training split only, then apply the same statistics to both
# splits so nothing from the test set influences the scaling.
scaler = StandardScaler().fit(X_train[continuous_features])
for split in (X_train, X_test):
    split[continuous_features] = scaler.transform(split[continuous_features])


# Modeling

### Logistic Regression

# Hyper-parameter search for logistic regression.
# Not every penalty works with every solver: lbfgs supports only 'l2' or no
# penalty, while 'l1' and 'elasticnet' need saga, and elasticnet also needs
# l1_ratio. A single crossed grid therefore contains candidates that fail to
# fit and are scored NaN. The warning is hidden by
# warnings.filterwarnings('ignore'). Listing one sub-grid per valid pairing
# keeps every candidate meaningful.
log_C_values = [0.01, 0.1, 1, 10, 100]
log_params = [
    {
        'penalty': ['l2'],
        'C': log_C_values,
        'solver': ['lbfgs', 'saga'],
        'max_iter': [1000],
    },
    {
        'penalty': ['l1'],
        'C': log_C_values,
        'solver': ['saga'],
        'max_iter': [1000],
    },
    {
        'penalty': ['elasticnet'],
        'C': log_C_values,
        'solver': ['saga'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'max_iter': [1000],
    },
    {
        # No regularisation: C has no effect, so it is not searched here.
        # None replaces the string 'none', which newer scikit-learn rejects
        # (None itself needs scikit-learn >= 1.2).
        'penalty': [None],
        'solver': ['lbfgs', 'saga'],
        'max_iter': [1000],
    },
]

log_grid = GridSearchCV(LogisticRegression(random_state=42), log_params, cv=5)
log_grid.fit(X_train, y_train)

print(" Logistic Regression Best Params:", log_grid.best_params_)

#### Evaluation

# NOTE(review): these hyper-parameters are hard-coded rather than read from
# log_grid.best_params_ - presumably copied from an earlier search run; confirm
# they still match the printed best parameters.
log_model = LogisticRegression(
    penalty='l2',
    C=0.1,
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
print(" Logistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log):.4f}")
print(classification_report(y_test, y_pred_log))

### Random Forest

# Hyper-parameter search for the random forest (5-fold cross-validation).
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    cv=5,
)
rf_grid.fit(X_train, y_train)

print(" Random Forest Best Params:", rf_grid.best_params_)


#### Evaluation

# NOTE(review): hyper-parameters are hard-coded rather than taken from
# rf_grid.best_params_ - presumably copied from an earlier run; confirm.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=50,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=2,
)
rf_model.fit(X_train, y_train)

# Score on the held-out split.
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(" Random Forest")
print(f"Accuracy: {rf_accuracy:.4f}")
print(classification_report(y_test, y_pred_rf))


### SVM

# Hyper-parameter search for the support vector classifier (5-fold CV).
svm_params = dict(
    kernel=['linear', 'rbf'],
    C=[0.1, 1, 10],
    gamma=['scale', 'auto'],
)

svm_grid = GridSearchCV(
    estimator=SVC(probability=True, random_state=42),
    param_grid=svm_params,
    cv=5,
)
svm_grid.fit(X_train, y_train)

print(" SVM Best Params:", svm_grid.best_params_)

#### Evaluation

# NOTE(review): hyper-parameters are hard-coded rather than taken from
# svm_grid.best_params_ - presumably copied from an earlier run; confirm.
svm_model = SVC(
    kernel='linear',
    C=0.1,
    gamma='scale',
    probability=True,
    random_state=42,
)
svm_model.fit(X_train, y_train)

# Score on the held-out split.
y_pred_svm = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("\n SVM")
print(f"Accuracy: {svm_accuracy:.4f}")
print(classification_report(y_test, y_pred_svm))

### MLP

# Hyper-parameter search for the multi-layer perceptron (5-fold CV).
mlp_params = dict(
    hidden_layer_sizes=[(64,), (64, 32), (128, 64)],
    activation=['relu', 'tanh'],
    alpha=[0.0001, 0.001],
    learning_rate=['constant', 'adaptive'],
)

mlp_grid = GridSearchCV(
    estimator=MLPClassifier(max_iter=1000, random_state=42),
    param_grid=mlp_params,
    cv=5,
)
mlp_grid.fit(X_train, y_train)

print(" MLP Best Params:", mlp_grid.best_params_)


#### Evaluation

# NOTE(review): hyper-parameters are hard-coded rather than taken from
# mlp_grid.best_params_ - presumably copied from an earlier run; confirm.
mlp_model = MLPClassifier(
    max_iter=1000,
    random_state=42,
    hidden_layer_sizes=(64, 32),
    activation='tanh',
    alpha=0.0001,
    learning_rate='constant',
)
mlp_model.fit(X_train, y_train)

# Score on the held-out split.
y_pred_mlp = mlp_model.predict(X_test)
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
print("\n MLP Neural Network")
print(f"Accuracy: {mlp_accuracy:.4f}")
print(classification_report(y_test, y_pred_mlp))

### XGBoost

# Hyper-parameter search for XGBoost (5-fold cross-validation).
xgb_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2]
}

# use_label_encoder is intentionally not passed: the option was deprecated and
# later removed from XGBoost, and newer releases report it as an unused
# parameter on every fit. The target is already encoded as 0/1, so no label
# encoding is needed.
xgb_grid = GridSearchCV(
    XGBClassifier(eval_metric='logloss', random_state=42),
    xgb_params, cv=5
)
xgb_grid.fit(X_train, y_train)

print(" XGBoost Best Params:", xgb_grid.best_params_)


#### Evaluation

# NOTE(review): hyper-parameters are hard-coded rather than taken from
# xgb_grid.best_params_ - presumably copied from an earlier run; confirm.
xgb_model = XGBClassifier(
    n_estimators=50,
    max_depth=4,
    learning_rate=0.2,
    eval_metric='logloss',
    random_state=42
)
xgb_model.fit(X_train, y_train)

# Score on the held-out split.
y_pred_xgb = xgb_model.predict(X_test)
print("\n XGBoost")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(classification_report(y_test, y_pred_xgb))

### KNN

# Hyper-parameter search for k-nearest neighbours (5-fold cross-validation).
knn_params = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    cv=5,
)
knn_grid.fit(X_train, y_train)

print(" KNN Best Params:", knn_grid.best_params_)

#### Evaluation

# NOTE(review): hyper-parameters are hard-coded rather than taken from
# knn_grid.best_params_ - presumably copied from an earlier run; confirm.
knn_model = KNeighborsClassifier(
    metric='euclidean',
    weights='uniform',
    n_neighbors=5,
)
knn_model.fit(X_train, y_train)

# Score on the held-out split.
y_pred_knn = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("\n KNN")
print(f"Accuracy: {knn_accuracy:.4f}")
print(classification_report(y_test, y_pred_knn))

### Models Accuracies

# Bar chart comparing test-set accuracy across all trained models.
# The accuracies are computed from the predictions made above instead of being
# typed in by hand, so the chart cannot drift from the models actually trained.
model_predictions = {
    'Random Forest': y_pred_rf,
    'SVM': y_pred_svm,
    'MLP': y_pred_mlp,
    'XGBoost': y_pred_xgb,
    'KNN': y_pred_knn,
    'Logistic Regression': y_pred_log,
}
models = list(model_predictions)
accuracies = [
    accuracy_score(y_test, predictions)
    for predictions in model_predictions.values()
]

plt.figure(figsize=(10, 6))
plt.bar(models, accuracies, color=['blue', 'green', 'purple', 'orange', 'red', 'cyan'])
plt.ylim(0, 1)  # accuracy is a fraction, so show the full 0-1 range
plt.ylabel('Accuracy')
plt.title('Model Accuracy Comparison')

plt.xticks(rotation=30)
plt.show()

import gradio as gr
from sklearn.preprocessing import StandardScaler
import joblib

# Persist the random forest and the fitted scaler so the app below can load
# them without retraining.
MODEL_PATH = "heart_model.pkl"
SCALER_PATH = "scaler.pkl"

joblib.dump(rf_model, MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)
print("Model and scaler saved successfully")

# Reload from disk; `scaler` is rebound to the deserialised copy.
# joblib files are pickle-based, so only load artefacts you produced yourself.
model = joblib.load(MODEL_PATH)
scaler = joblib.load(SCALER_PATH)

def predict_heart_risk(age, cpk, ef, platelets, sc, ss, time, anaemia, diabetes, high_bp, sex, smoking):
    """Classify a single patient record with the saved model.

    The arguments are assembled into a one-row DataFrame whose columns are in
    the order used for training, the continuous columns are standardised with
    the module-level ``scaler``, and the module-level ``model`` predicts the
    class.

    Args:
        age, cpk, ef, platelets, sc, ss, time: Values for the continuous
            columns ``age``, ``creatinine_phosphokinase``,
            ``ejection_fraction``, ``platelets``, ``serum_creatinine``,
            ``serum_sodium`` and ``time``.
        anaemia, diabetes, high_bp, sex, smoking: Values for the columns
            ``anaemia``, ``diabetes``, ``high_blood_pressure``, ``sex`` and
            ``smoking`` (the UI offers 0 or 1 for each).

    Returns:
        ``" At Risk"`` when the model predicts class 1 and ``" Not At Risk"``
        otherwise. If any argument is ``None`` (a field left blank in the
        form), a ``"Missing input(s): ..."`` message naming the affected
        columns is returned instead and no prediction is made.
    """
    # Key order is the training column order; the DataFrame keeps it.
    record = {
        'age': age,
        'anaemia': anaemia,
        'creatinine_phosphokinase': cpk,
        'diabetes': diabetes,
        'ejection_fraction': ef,
        'high_blood_pressure': high_bp,
        'platelets': platelets,
        'serum_creatinine': sc,
        'serum_sodium': ss,
        'sex': sex,
        'smoking': smoking,
        'time': time,
    }

    # Blank form fields arrive as None and would otherwise fail inside the
    # scaler or the model with an unhelpful error.
    missing = [name for name, value in record.items() if value is None]
    if missing:
        return f"Missing input(s): {', '.join(missing)}"

    features = pd.DataFrame([record])

    # Apply the same standardisation that was fitted on the training data.
    continuous_features = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium', 'time']
    features[continuous_features] = scaler.transform(features[continuous_features])

    prediction = model.predict(features)[0]
    return " At Risk" if prediction == 1 else " Not At Risk"

# Form fields, listed in the positional order of predict_heart_risk's
# parameters: seven numeric boxes followed by five 0/1 radio groups.
number_labels = [
    "Age",
    "Creatinine Phosphokinase, Range [0,100000]",
    "Ejection Fraction, Range [5,85] ",
    "Platelets, Range [5000,2000000]",
    "Serum Creatinine, Range [0.1,60]",
    "Serum Sodium, Range [95,255]",
    "Follow-up Time (days)",
]
flag_labels = [
    "Anaemia (0=No, 1=Yes)",
    "Diabetes (0=No, 1=Yes)",
    "High Blood Pressure (0=No, 1=Yes)",
    "Sex (0=Female, 1=Male)",
    "Smoking (0=No, 1=Yes)",
]

inputs = [gr.Number(label=text) for text in number_labels]
inputs.extend(gr.Radio([0, 1], label=text) for text in flag_labels)

# Build the web form around the prediction function and start serving it.
demo = gr.Interface(
    fn=predict_heart_risk,
    inputs=inputs,
    outputs="text",
    title=" Heart Failure Risk Predictor",
    description="Enter patient data to predict if they are at risk of heart failure.",
    allow_flagging="never",
)
demo.launch()