---
license: bigscience-openrail-m
datasets:
- Abdelrahma12/heart_failure_clinical_records_dataset.csv
language:
- ar
metrics:
- accuracy
base_model:
- HuggingFaceTB/SmolLM3-3B
new_version: HuggingFaceTB/SmolLM3-3B
library_name: adapter-transformers
tags:
- medical
---

### Importing Libraries

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import warnings

warnings.filterwarnings('ignore')

# Optional: only needed if the CSV is stored on Google Drive
from google.colab import drive
```

### Data Load

```python
# Assumes the CSV has been uploaded to the Colab session under /content
data = pd.read_csv(r'/content/heart_failure_clinical_records_dataset - Copy.csv')
```

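A quick shape check helps confirm the load; the publicly available version of this dataset has 299 rows and 13 columns:

```python
print(data.shape)  # expected (299, 13) for the public heart-failure dataset
```
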
### Exploratory Data Analysis

```python
data.head()
data.info()
data.duplicated().sum()

# Bin ages into groups for visualization; include_lowest=True keeps
# patients aged exactly 40 from falling outside every bin
labels = ["40-45", "46-50", "51-55", "56-60", "61-65", "66-70", "71-75", "76-80", "81-95"]
data['age_group'] = pd.cut(data['age'], bins=[40, 45, 50, 55, 60, 65, 70, 75, 80, 95],
                           labels=labels, include_lowest=True)

data.isnull().sum()
```

### Data Visualization

```python
# Death counts by age group
plt.figure(figsize=(10, 6))
sns.countplot(data=data, x='age_group', hue='DEATH_EVENT', palette=["lightblue", "red"])
plt.title("Death Count by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Patient Count")
plt.legend(["Survived", "Died"])
plt.show()

# Correlation matrix (age_group is categorical, so it is excluded)
corr_matrix = data.drop(columns=['age_group']).corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Heart Failure Clinical Records')
plt.show()

# Class distribution of the target; value_counts() puts the survivor
# majority (DEATH_EVENT=0) first, so the labels match that order
death_counts = data['DEATH_EVENT'].value_counts()
plt.figure(figsize=(6, 6))
plt.pie(death_counts, labels=['Survived', 'Died'], autopct='%1.1f%%', startangle=90,
        colors=['skyblue', 'lightcoral'])
plt.title('Distribution of DEATH_EVENT')
plt.show()

# Pairplot of the numerical features that showed some correlation with DEATH_EVENT
selected_features = ['time', 'serum_creatinine', 'ejection_fraction', 'age', 'serum_sodium', 'DEATH_EVENT']
sns.pairplot(data[selected_features], hue='DEATH_EVENT', diag_kind='kde')
plt.suptitle('Pairplot of Selected Numerical Features by DEATH_EVENT', y=1.02)
plt.show()
```

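The pairplot features above were hand-picked. A quick way to check that they really are the strongest signals is to rank every numeric column by the absolute value of its correlation with `DEATH_EVENT`; a minimal sketch, assuming `data` and `age_group` are as defined above:

```python
# Rank features by |Pearson correlation| with the target
target_corr = (
    data.drop(columns=['age_group'])
        .corr()['DEATH_EVENT']
        .drop('DEATH_EVENT')
        .abs()
        .sort_values(ascending=False)
)
print(target_corr.head())  # the pairplot features should sit near the top
```
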
# Data Preprocessing

### Data Split

```python
# age_group was only needed for visualization; drop it before modeling
data.drop(columns=['age_group'], inplace=True)

X = data.drop('DEATH_EVENT', axis=1)
y = data['DEATH_EVENT']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
```

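Because the pie chart above shows roughly a 2:1 survivor-to-death split, stratified splitting matters here. A quick sanity check, sketched below, confirms both splits keep that class ratio:

```python
# Both splits should show (approximately) the same class proportions
print("train:", y_train.value_counts(normalize=True).round(3).to_dict())
print("test: ", y_test.value_counts(normalize=True).round(3).to_dict())
```
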
### Feature Scaling

```python
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
continuous_features = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
                       'serum_creatinine', 'serum_sodium', 'time']

# Fit the scaler on the training split only, then apply it to the test split,
# so no test-set statistics leak into training
X_train[continuous_features] = scaler.fit_transform(X_train[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])
```

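An alternative worth noting, sketched here rather than used by the notebook: bundling the scaler and classifier in a scikit-learn `Pipeline` with a `ColumnTransformer`. This sketch assumes the raw, unscaled splits (i.e. run instead of, not after, the cell above) and guarantees the scaler is refit inside each cross-validation fold:

```python
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Scale only the continuous columns; the binary flags pass through unchanged
preprocess = ColumnTransformer(
    [('scale', StandardScaler(), continuous_features)],
    remainder='passthrough'
)
pipe = Pipeline([
    ('prep', preprocess),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])
pipe.fit(X_train, y_train)  # wrapped in GridSearchCV, scaling is refit per fold
```
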
# Modeling

### Logistic Regression

```python
# Not every penalty/solver pair is valid in scikit-learn (lbfgs supports only
# l2 or no penalty, and elasticnet requires saga plus an l1_ratio), so the
# search space is split into compatible sub-grids; l1_ratio=0.5 is a single
# midpoint value chosen for illustration
log_params = [
    {'penalty': ['l2', None], 'C': [0.01, 0.1, 1, 10, 100],
     'solver': ['lbfgs'], 'max_iter': [1000]},
    {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10, 100],
     'solver': ['saga'], 'max_iter': [1000]},
    {'penalty': ['elasticnet'], 'C': [0.01, 0.1, 1, 10, 100],
     'l1_ratio': [0.5], 'solver': ['saga'], 'max_iter': [1000]}
]

log_grid = GridSearchCV(LogisticRegression(random_state=42), log_params, cv=5)
log_grid.fit(X_train, y_train)

print("Logistic Regression Best Params:", log_grid.best_params_)
```

#### Evaluation

```python
log_model = LogisticRegression(
    penalty='l2',
    C=0.1,
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
print("Logistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log):.4f}")
print(classification_report(y_test, y_pred_log))
```

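Retyping the best parameters by hand works, but it can drift out of sync with the search. Since `GridSearchCV` refits the winning model on the full training set by default, an equivalent evaluation reads directly from the grid; the same pattern applies to every model below:

```python
# best_estimator_ is the already-refitted winner of the grid search
y_pred_best = log_grid.best_estimator_.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_best):.4f}")
```
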
### Random Forest

```python
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5)
rf_grid.fit(X_train, y_train)

print("Random Forest Best Params:", rf_grid.best_params_)
```

#### Evaluation

```python
rf_model = RandomForestClassifier(
    n_estimators=50, max_depth=5,
    min_samples_leaf=2, min_samples_split=5,
    random_state=42
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Random Forest")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(classification_report(y_test, y_pred_rf))
```

### SVM

```python
svm_params = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']  # gamma only affects the rbf kernel
}

svm_grid = GridSearchCV(SVC(probability=True, random_state=42), svm_params, cv=5)
svm_grid.fit(X_train, y_train)

print("SVM Best Params:", svm_grid.best_params_)
```

#### Evaluation

```python
svm_model = SVC(
    C=0.1, gamma='scale', kernel='linear',
    probability=True, random_state=42
)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("\nSVM")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}")
print(classification_report(y_test, y_pred_svm))
```

### MLP

```python
mlp_params = {
    'hidden_layer_sizes': [(64,), (64, 32), (128, 64)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant', 'adaptive']
}

mlp_grid = GridSearchCV(MLPClassifier(max_iter=1000, random_state=42), mlp_params, cv=5)
mlp_grid.fit(X_train, y_train)

print("MLP Best Params:", mlp_grid.best_params_)
```

#### Evaluation

```python
mlp_model = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='tanh',
    alpha=0.0001,
    learning_rate='constant',
    max_iter=1000,
    random_state=42
)
mlp_model.fit(X_train, y_train)
y_pred_mlp = mlp_model.predict(X_test)
print("\nMLP Neural Network")
print(f"Accuracy: {accuracy_score(y_test, y_pred_mlp):.4f}")
print(classification_report(y_test, y_pred_mlp))
```

### XGBoost

```python
xgb_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2]
}

# use_label_encoder is deprecated in recent XGBoost releases and can be
# dropped there; it is kept for compatibility with older versions
xgb_grid = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    xgb_params, cv=5
)
xgb_grid.fit(X_train, y_train)

print("XGBoost Best Params:", xgb_grid.best_params_)
```

#### Evaluation

```python
xgb_model = XGBClassifier(
    n_estimators=50,
    max_depth=4,
    learning_rate=0.2,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
print("\nXGBoost")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(classification_report(y_test, y_pred_xgb))
```

### KNN

```python
knn_params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5)
knn_grid.fit(X_train, y_train)

print("KNN Best Params:", knn_grid.best_params_)
```

#### Evaluation

```python
knn_model = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    metric='euclidean'
)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
print("\nKNN")
print(f"Accuracy: {accuracy_score(y_test, y_pred_knn):.4f}")
print(classification_report(y_test, y_pred_knn))
```

### Model Accuracies

```python
# Test-set accuracies recorded from the evaluation runs above
models = [
    'Random Forest', 'SVM', 'MLP',
    'XGBoost', 'KNN', 'Logistic Regression'
]
accuracies = [
    0.85, 0.8333, 0.6833,
    0.8333, 0.7167, 0.8333
]

plt.figure(figsize=(10, 6))
plt.bar(models, accuracies, color=['blue', 'green', 'purple', 'orange', 'red', 'cyan'])
plt.ylim(0, 1)
plt.ylabel('Accuracy')
plt.title('Model Accuracy Comparison')
plt.xticks(rotation=30)
plt.show()
```

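Hardcoding the scores invites drift whenever a cell above is re-run. A small sketch that collects them directly from the predictions already in memory:

```python
# Build the comparison from the predictions computed in the evaluation cells
results = {
    'Random Forest': y_pred_rf, 'SVM': y_pred_svm, 'MLP': y_pred_mlp,
    'XGBoost': y_pred_xgb, 'KNN': y_pred_knn, 'Logistic Regression': y_pred_log
}
models = list(results)
accuracies = [accuracy_score(y_test, pred) for pred in results.values()]
```
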
```python
import gradio as gr
import joblib

# Persist the trained Random Forest and the fitted scaler for the demo app
joblib.dump(rf_model, "heart_model.pkl")
joblib.dump(scaler, "scaler.pkl")
print("Model and scaler saved successfully")

model = joblib.load("heart_model.pkl")
scaler = joblib.load("scaler.pkl")
```

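A quick sanity check that the round trip through disk preserved behavior:

```python
# The reloaded model should reproduce the in-memory model's predictions exactly
assert np.array_equal(model.predict(X_test), rf_model.predict(X_test))
```
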
```python
def predict_heart_risk(age, cpk, ef, platelets, sc, ss, time, anaemia, diabetes, high_bp, sex, smoking):
    # Assemble a single-row frame in the exact column order used during training
    data = pd.DataFrame([[
        age, anaemia, cpk, diabetes, ef, high_bp,
        platelets, sc, ss, sex, smoking, time
    ]], columns=[
        'age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
        'ejection_fraction', 'high_blood_pressure', 'platelets',
        'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time'
    ])

    # Apply the scaler fitted on the training data
    continuous_features = ['age', 'creatinine_phosphokinase', 'ejection_fraction',
                           'platelets', 'serum_creatinine', 'serum_sodium', 'time']
    data[continuous_features] = scaler.transform(data[continuous_features])

    prediction = model.predict(data)[0]
    return "At Risk" if prediction == 1 else "Not At Risk"
```

```python
inputs = [
    gr.Number(label="Age"),
    gr.Number(label="Creatinine Phosphokinase, Range [0, 100000]"),
    gr.Number(label="Ejection Fraction, Range [5, 85]"),
    gr.Number(label="Platelets, Range [5000, 2000000]"),
    gr.Number(label="Serum Creatinine, Range [0.1, 60]"),
    gr.Number(label="Serum Sodium, Range [95, 255]"),
    gr.Number(label="Follow-up Time (days)"),
    gr.Radio([0, 1], label="Anaemia (0=No, 1=Yes)"),
    gr.Radio([0, 1], label="Diabetes (0=No, 1=Yes)"),
    gr.Radio([0, 1], label="High Blood Pressure (0=No, 1=Yes)"),
    gr.Radio([0, 1], label="Sex (0=Female, 1=Male)"),
    gr.Radio([0, 1], label="Smoking (0=No, 1=Yes)")
]
```

```python
gr.Interface(
    fn=predict_heart_risk,
    inputs=inputs,
    outputs="text",
    title="Heart Failure Risk Predictor",
    description="Enter patient data to predict whether they are at risk of heart failure.",
    allow_flagging="never"
).launch()
```

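The prediction function can also be smoke-tested directly, without the UI. The values below are hypothetical, chosen only to fall inside the ranges shown on the input labels:

```python
# Hypothetical patient; values are illustrative, not drawn from the dataset
print(predict_heart_risk(
    age=60, cpk=250, ef=38, platelets=262000,
    sc=1.1, ss=137, time=115,
    anaemia=0, diabetes=0, high_bp=1, sex=1, smoking=0
))  # prints "At Risk" or "Not At Risk"
```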