Adapters
Arabic
medical
heart / README.md
Abdelrahma12's picture
Update README.md
ad22b81 verified
---
license: bigscience-openrail-m
datasets:
- Abdelrahma12/heart_failure_clinical_records_dataset.csv
language:
- ar
metrics:
- accuracy
base_model:
- HuggingFaceTB/SmolLM3-3B
new_version: HuggingFaceTB/SmolLM3-3B
library_name: adapter-transformers
tags:
- medical
---
### Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')
from google.colab import drive
### Data Load
# Read the clinical-records CSV into a DataFrame. The path is hard-coded
# (presumably a Colab session's /content directory, given the google.colab
# import above), so the file must exist under exactly this name.
data = pd.read_csv(r'/content/heart_failure_clinical_records_dataset - Copy.csv')
### Exploratory Data Analysis
# Notebook-style inspection. A bare expression is only rendered when it is the
# last statement of a notebook cell; when run as a plain script, data.info()
# is the only line here that prints anything.
data
data.head()
data.info()
data.duplicated().sum()
# Bucket ages into labelled bands for the count plot further down.
# pd.cut builds right-closed intervals and, by default, leaves the left-most
# edge open, so a patient aged exactly 40 would fall outside "40-45" and get
# NaN. include_lowest=True closes that first interval.
labels = ["40-45", "46-50", "51-55", "56-60", "61-65", "66-70", "71-75", "76-80", "81-95"]
data['age_group'] = pd.cut(
    data['age'],
    bins=[40, 45, 50, 55, 60, 65, 70, 75, 80, 95],
    labels=labels,
    include_lowest=True,
)
data.isnull().sum()
### Data Visualization
# Patients per age band, split by the DEATH_EVENT flag.
_, age_ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=data,
    x='age_group',
    hue='DEATH_EVENT',
    palette=["lightblue", "red"],
    ax=age_ax,
)
age_ax.set_title("Death Count by Age Group")
age_ax.set_xlabel("Age Group")
age_ax.set_ylabel("Patient Count")
age_ax.legend(["Survived", "Died"])
plt.show()

# Correlation heatmap over every column except the categorical age band,
# which was added only for plotting.
numeric_frame = data.drop(columns=['age_group'])
corr_matrix = numeric_frame.corr()
_, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title('Correlation Matrix of Heart Failure Clinical Records')
plt.show()
# Share of each outcome class.
# value_counts() orders by frequency, not by class value, so the fixed
# ['Not Died', 'Died'] labels would be swapped whenever class 1 is the
# majority. Reindexing pins the order to 0 then 1 (a missing class counts as 0).
death_counts = data['DEATH_EVENT'].value_counts().reindex([0, 1], fill_value=0)
plt.figure(figsize=(6, 6))
plt.pie(
    death_counts,
    labels=['Not Died', 'Died'],
    autopct='%1.1f%%',
    startangle=90,
    colors=['skyblue', 'lightcoral'],
)
plt.title('Distribution of DEATH_EVENT')
plt.show()
# Pairwise scatter/KDE plots for a hand-picked subset of numeric columns,
# coloured by DEATH_EVENT (which must therefore be part of the subset).
selected_features = [
    'time',
    'serum_creatinine',
    'ejection_fraction',
    'age',
    'serum_sodium',
    'DEATH_EVENT',
]
pair_frame = data[selected_features]
sns.pairplot(pair_frame, hue='DEATH_EVENT', diag_kind='kde')
# y > 1 lifts the title above the top row of subplots.
plt.suptitle('Pairplot of Selected Numerical Features by DEATH_EVENT', y=1.02)
plt.show()
# Data Preprocessing
### Data Split
from sklearn.model_selection import train_test_split

# The age band was a plotting aid only; remove it before building features.
data.drop(columns=['age_group'], inplace=True)

# Target is the DEATH_EVENT flag; every remaining column is a feature.
y = data['DEATH_EVENT']
X = data.drop(columns='DEATH_EVENT')

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
### Feature Scaling
from sklearn.preprocessing import StandardScaler

# train_test_split hands back frames derived from X; assigning columns on them
# raises pandas' SettingWithCopyWarning (currently hidden by the global
# warnings filter). Take explicit copies so the writes are unambiguous.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
# Only the continuous columns are standardised; the 0/1 flags are left as-is.
continuous_features = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium', 'time']
# Fit on the training rows only, then reuse those statistics for the test rows.
X_train[continuous_features] = scaler.fit_transform(X_train[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])
# Modeling
### Logistic Regression
# The search space is split into sub-grids so that every candidate is a
# combination the solver actually supports. A single flat grid would pair
# lbfgs with l1/elasticnet and use elasticnet without l1_ratio; those fits
# fail and are silently scored as NaN by GridSearchCV.
log_c_values = [0.01, 0.1, 1, 10, 100]
log_params = [
    # lbfgs handles only the l2 penalty (or no penalty at all).
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': log_c_values, 'max_iter': [1000]},
    # saga supports both l1 and l2.
    {'solver': ['saga'], 'penalty': ['l1', 'l2'], 'C': log_c_values, 'max_iter': [1000]},
    # elasticnet is saga-only and requires an explicit l1_ratio.
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5],
     'C': log_c_values, 'max_iter': [1000]},
    # Unpenalised fit: spelled None (scikit-learn >= 1.2; the string 'none'
    # was later removed). C is ignored here, so it is not searched.
    {'solver': ['lbfgs', 'saga'], 'penalty': [None], 'max_iter': [1000]},
]
log_grid = GridSearchCV(LogisticRegression(random_state=42), log_params, cv=5)
log_grid.fit(X_train, y_train)
print(" Logistic Regression Best Params:", log_grid.best_params_)

#### Evaluation
# The evaluated model uses a fixed, hard-coded configuration rather than
# reading log_grid.best_params_.
log_model = LogisticRegression(
    penalty='l2',
    C=0.1,
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
)
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
print(" Logistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log):.4f}")
print(classification_report(y_test, y_pred_log))
### Random Forest
# 5-fold grid search over forest size, tree depth and split/leaf minimums.
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    cv=5,
)
rf_grid.fit(X_train, y_train)
print(" Random Forest Best Params:", rf_grid.best_params_)

#### Evaluation
# Fixed configuration (hard-coded, not read back from rf_grid), trained on the
# training split and scored on the held-out test split.
rf_settings = {
    'n_estimators': 50,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'min_samples_split': 5,
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(" Random Forest")
print(f"Accuracy: {rf_accuracy:.4f}")
print(classification_report(y_test, y_pred_rf))
### SVM
# 5-fold grid search over kernel type, regularisation strength and gamma.
svm_params = dict(
    kernel=['linear', 'rbf'],
    C=[0.1, 1, 10],
    gamma=['scale', 'auto'],
)
svm_grid = GridSearchCV(
    estimator=SVC(probability=True, random_state=42),
    param_grid=svm_params,
    cv=5,
)
svm_grid.fit(X_train, y_train)
print(" SVM Best Params:", svm_grid.best_params_)

#### Evaluation
# Fixed configuration (hard-coded, not read back from svm_grid), scored on
# the held-out test split.
svm_settings = {
    'C': 0.1,
    'gamma': 'scale',
    'kernel': 'linear',
    'probability': True,
    'random_state': 42,
}
svm_model = SVC(**svm_settings)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("\n SVM")
print(f"Accuracy: {svm_accuracy:.4f}")
print(classification_report(y_test, y_pred_svm))
### MLP
# 5-fold grid search over layer layout, activation, L2 term (alpha) and
# learning-rate schedule.
mlp_params = dict(
    hidden_layer_sizes=[(64,), (64, 32), (128, 64)],
    activation=['relu', 'tanh'],
    alpha=[0.0001, 0.001],
    learning_rate=['constant', 'adaptive'],
)
mlp_grid = GridSearchCV(
    estimator=MLPClassifier(max_iter=1000, random_state=42),
    param_grid=mlp_params,
    cv=5,
)
mlp_grid.fit(X_train, y_train)
print(" MLP Best Params:", mlp_grid.best_params_)

#### Evaluation
# Fixed configuration (hard-coded, not read back from mlp_grid), scored on
# the held-out test split.
mlp_settings = {
    'hidden_layer_sizes': (64, 32),
    'activation': 'tanh',
    'alpha': 0.0001,
    'learning_rate': 'constant',
    'max_iter': 1000,
    'random_state': 42,
}
mlp_model = MLPClassifier(**mlp_settings)
mlp_model.fit(X_train, y_train)
y_pred_mlp = mlp_model.predict(X_test)
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
print("\n MLP Neural Network")
print(f"Accuracy: {mlp_accuracy:.4f}")
print(classification_report(y_test, y_pred_mlp))
### XGBoost
# 5-fold grid search over number of boosting rounds, tree depth and step size.
xgb_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.1, 0.2],
)
xgb_base = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_grid = GridSearchCV(estimator=xgb_base, param_grid=xgb_params, cv=5)
xgb_grid.fit(X_train, y_train)
print(" XGBoost Best Params:", xgb_grid.best_params_)

#### Evaluation
# Fixed configuration (hard-coded, not read back from xgb_grid), scored on
# the held-out test split.
# NOTE(review): use_label_encoder looks like a legacy XGBoost argument —
# confirm it is still accepted by the installed version.
xgb_settings = {
    'n_estimators': 50,
    'max_depth': 4,
    'learning_rate': 0.2,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'random_state': 42,
}
xgb_model = XGBClassifier(**xgb_settings)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print("\n XGBoost")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(classification_report(y_test, y_pred_xgb))
### KNN
# 5-fold grid search over neighbourhood size, vote weighting and distance metric.
knn_params = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    cv=5,
)
knn_grid.fit(X_train, y_train)
print(" KNN Best Params:", knn_grid.best_params_)

#### Evaluation
# Fixed configuration (hard-coded, not read back from knn_grid), scored on
# the held-out test split.
knn_settings = {
    'n_neighbors': 5,
    'weights': 'uniform',
    'metric': 'euclidean',
}
knn_model = KNeighborsClassifier(**knn_settings)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("\n KNN")
print(f"Accuracy: {knn_accuracy:.4f}")
print(classification_report(y_test, y_pred_knn))
### Models Accuracies
# Derive each bar from the predictions made above instead of typing the
# numbers in by hand, so the chart cannot drift from what the models actually
# scored in this run. Dict order fixes the left-to-right bar order.
model_predictions = {
    'Random Forest': y_pred_rf,
    'SVM': y_pred_svm,
    'MLP': y_pred_mlp,
    'XGBoost': y_pred_xgb,
    'KNN': y_pred_knn,
    'Logistic Regression': y_pred_log,
}
models = list(model_predictions)
accuracies = [
    accuracy_score(y_test, predictions)
    for predictions in model_predictions.values()
]
plt.figure(figsize=(10, 6))
plt.bar(models, accuracies, color=['blue', 'green', 'purple', 'orange', 'red', 'cyan'])
# Accuracy is a fraction, so show the full 0-1 range for a fair comparison.
plt.ylim(0, 1)
plt.ylabel('Accuracy')
plt.title('Model Accuracy Comparison')
plt.xticks(rotation=30)
plt.show()
import gradio as gr
from sklearn.preprocessing import StandardScaler
import joblib
# Persist the random-forest model and the fitted scaler, then load both back
# from disk; the loaded objects are bound to `model` and `scaler`.
MODEL_FILE = "heart_model.pkl"
SCALER_FILE = "scaler.pkl"
for artifact, target in ((rf_model, MODEL_FILE), (scaler, SCALER_FILE)):
    joblib.dump(artifact, target)
print("Model and scaler saved successfully")
model = joblib.load(MODEL_FILE)
scaler = joblib.load(SCALER_FILE)
def predict_heart_risk(age, cpk, ef, platelets, sc, ss, time, anaemia, diabetes, high_bp, sex, smoking):
    """Classify one patient record with the module-level ``model``.

    The arguments arrive in the order of the UI inputs: the seven continuous
    measurements first (age, creatinine phosphokinase, ejection fraction,
    platelets, serum creatinine, serum sodium, follow-up time), then the five
    0/1 flags (anaemia, diabetes, high blood pressure, sex, smoking).

    Returns:
        ``" At Risk"`` when the model predicts class 1, ``" Not At Risk"``
        otherwise, or a prompt to complete the form when any field is empty.
    """
    supplied = (age, cpk, ef, platelets, sc, ss, time,
                anaemia, diabetes, high_bp, sex, smoking)
    # An unfilled form field arrives as None; answer with a readable message
    # instead of letting the scaler/model raise on missing values.
    if any(value is None for value in supplied):
        return "Please provide a value for every field."

    # Build a one-row frame with named columns. The order here is presumably
    # the training frame's column order — verify if the dataset changes.
    # Named `features` so it does not shadow the module-level `data` frame.
    features = pd.DataFrame([[
        age, anaemia, cpk, diabetes, ef, high_bp,
        platelets, sc, ss, sex, smoking, time
    ]], columns=[
        'age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
        'ejection_fraction', 'high_blood_pressure', 'platelets',
        'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time'
    ])
    # Apply the saved scaler to the continuous columns only; the 0/1 flags
    # are passed through unscaled.
    continuous_features = ['age', 'creatinine_phosphokinase', 'ejection_fraction','platelets', 'serum_creatinine', 'serum_sodium', 'time']
    features[continuous_features] = scaler.transform(features[continuous_features])
    prediction = model.predict(features)[0]
    return " At Risk" if prediction == 1 else " Not At Risk"
# Form fields, in the exact positional order predict_heart_risk expects:
# seven numeric boxes followed by five 0/1 radio groups.
numeric_labels = [
    "Age",
    "Creatinine Phosphokinase, Range [0,100000]",
    "Ejection Fraction, Range [5,85] ",
    "Platelets, Range [5000,2000000]",
    "Serum Creatinine, Range [0.1,60]",
    "Serum Sodium, Range [95,255]",
    "Follow-up Time (days)",
]
binary_labels = [
    "Anaemia (0=No, 1=Yes)",
    "Diabetes (0=No, 1=Yes)",
    "High Blood Pressure (0=No, 1=Yes)",
    "Sex (0=Female, 1=Male)",
    "Smoking (0=No, 1=Yes)",
]
inputs = [gr.Number(label=text) for text in numeric_labels]
inputs += [gr.Radio([0, 1], label=text) for text in binary_labels]

# Single-function text-output app with flagging turned off; launch() starts it.
# NOTE(review): the model is trained on DEATH_EVENT, so the wording of the
# description below may not match what is actually predicted — confirm.
demo = gr.Interface(
    fn=predict_heart_risk,
    inputs=inputs,
    outputs="text",
    title=" Heart Failure Risk Predictor",
    description="Enter patient data to predict if they are at risk of heart failure.",
    allow_flagging="never",
)
demo.launch()