This research aims to develop an artificial intelligence-based system that identifies patients who are more likely to develop heart disease based on their medical history. The heart disease dataset from the UCI Machine Learning Repository was used for training and validation.
In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
import random
Install Required Libraries¶
In [2]:
%%capture
!pip install -q hvplot
!pip install pytorch-tabnet
Import Libraries¶
In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import scikitplot as skplt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier,StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from scipy import stats
from numpy import isnan
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import pytorch_tabnet
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import KFold
from matplotlib.pyplot import figure
Get Cleveland Data from UCI Repository¶
In [4]:
# Fetch the processed Cleveland file from the UCI repository. The file is read
# without a header row; column names are assigned further down.
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data",
    header=None,
)

# "?" is treated as the missing-value marker: turn it into NaN, discard every
# row that contains one, and renumber the index from zero.
data = data.replace("?", np.nan).dropna().reset_index(drop=True)

data.columns = [
    'age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol',
    'fasting blood sugar', 'resting ecg', 'max heart rate',
    'exercise angina', 'oldpeak', 'ST slope', 'ca', 'thal', 'target',
]

# Every column except 'oldpeak' is stored as an integer. The cast goes through
# float first (presumably because some raw values are text such as "3.0" —
# TODO confirm against the raw file).
integer_columns = [name for name in data.columns if name != 'oldpeak']
for column in integer_columns:
    data[column] = data[column].astype('float').astype('int')
data['oldpeak'] = data['oldpeak'].astype('float')

# Binarise the label: any value above 0 becomes 1, everything else becomes 0.
data['target'] = np.where(data['target'] > 0, 1, 0)

# Keep a copy that still carries the numeric category codes, taken before the
# following cells replace those codes with text labels.
dataTab = data.copy()
data.head()
Out[4]:
age | sex | chest pain type | resting bp s | cholesterol | fasting blood sugar | resting ecg | max heart rate | exercise angina | oldpeak | ST slope | ca | thal | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63 | 1 | 1 | 145 | 233 | 1 | 2 | 150 | 0 | 2.3 | 3 | 0 | 6 | 0 |
1 | 67 | 1 | 4 | 160 | 286 | 0 | 2 | 108 | 1 | 1.5 | 2 | 3 | 3 | 1 |
2 | 67 | 1 | 4 | 120 | 229 | 0 | 2 | 129 | 1 | 2.6 | 2 | 2 | 7 | 1 |
3 | 37 | 1 | 3 | 130 | 250 | 0 | 0 | 187 | 0 | 3.5 | 3 | 0 | 3 | 0 |
4 | 41 | 0 | 2 | 130 | 204 | 0 | 2 | 172 | 0 | 1.4 | 1 | 0 | 3 | 0 |
Convert Nominal Variables - chest pain type, resting ecg and thal¶
In [5]:
# Text labels for the integer codes of the three multi-level nominal features.
CP_Dict = {
    1: 'typical angina',
    2: 'atypical angina',
    3: 'non-anginal',
    4: 'asymptomatic',
}
ECG_Dict = {
    0: 'normal',
    1: 'ST-T wave abnormality',
    2: 'left ventricular hypertrophy',
}
thal_Dict = {
    3: 'normal',
    6: 'fixed defect',
    7: 'reversable defect',
}

# Relabel the three columns of `data` in place; each column name maps to its
# own code -> label dictionary, so the columns are translated independently.
data.replace(
    {
        "chest pain type": CP_Dict,
        "resting ecg": ECG_Dict,
        "thal": thal_Dict,
    },
    inplace=True,
)
data.head()
Out[5]:
age | sex | chest pain type | resting bp s | cholesterol | fasting blood sugar | resting ecg | max heart rate | exercise angina | oldpeak | ST slope | ca | thal | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63 | 1 | typical angina | 145 | 233 | 1 | left ventricular hypertrophy | 150 | 0 | 2.3 | 3 | 0 | fixed defect | 0 |
1 | 67 | 1 | asymptomatic | 160 | 286 | 0 | left ventricular hypertrophy | 108 | 1 | 1.5 | 2 | 3 | normal | 1 |
2 | 67 | 1 | asymptomatic | 120 | 229 | 0 | left ventricular hypertrophy | 129 | 1 | 2.6 | 2 | 2 | reversable defect | 1 |
3 | 37 | 1 | non-anginal | 130 | 250 | 0 | normal | 187 | 0 | 3.5 | 3 | 0 | normal | 0 |
4 | 41 | 0 | atypical angina | 130 | 204 | 0 | left ventricular hypertrophy | 172 | 0 | 1.4 | 1 | 0 | normal | 0 |
In [6]:
# Text labels for the remaining coded features (two-level flags and ST slope).
Sex_Dict = {1: 'male', 0: 'female'}
FS_Dict = {0: 'under 120mgdl', 1: 'over 120mgdl'}
exang_Dict = {0: 'not induced', 1: 'induced'}
slope_Dict = {
    1: 'upsloping',
    2: 'flat',
    3: 'downsloping',
}

# Relabel the four columns of `data` in place, one mapping per column.
data.replace(
    {
        "sex": Sex_Dict,
        "fasting blood sugar": FS_Dict,
        "exercise angina": exang_Dict,
        "ST slope": slope_Dict,
    },
    inplace=True,
)

# Snapshot of the fully relabelled frame under a second name.
dataset = data.copy()
data.head()
Out[6]:
age | sex | chest pain type | resting bp s | cholesterol | fasting blood sugar | resting ecg | max heart rate | exercise angina | oldpeak | ST slope | ca | thal | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63 | male | typical angina | 145 | 233 | over 120mgdl | left ventricular hypertrophy | 150 | not induced | 2.3 | downsloping | 0 | fixed defect | 0 |
1 | 67 | male | asymptomatic | 160 | 286 | under 120mgdl | left ventricular hypertrophy | 108 | induced | 1.5 | flat | 3 | normal | 1 |
2 | 67 | male | asymptomatic | 120 | 229 | under 120mgdl | left ventricular hypertrophy | 129 | induced | 2.6 | flat | 2 | reversable defect | 1 |
3 | 37 | male | non-anginal | 130 | 250 | under 120mgdl | normal | 187 | not induced | 3.5 | downsloping | 0 | normal | 0 |
4 | 41 | female | atypical angina | 130 | 204 | under 120mgdl | left ventricular hypertrophy | 172 | not induced | 1.4 | upsloping | 0 | normal | 0 |
In [7]:
# Class balance: number of rows per target value (a NaN bucket would be listed too).
data.target.value_counts(dropna=False)
Out[7]:
0 160 1 137 Name: target, dtype: int64
Exploratory Data Analysis¶
Target Variables¶
In [8]:
# Single-panel bar chart of how many rows fall into each target class.
f, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 6))
# countplot returns the Axes it drew on, so the title can be set on the result.
sns.countplot(
    data=data, x='target', palette=['green', 'orange'], ax=axes
).set_title("Target Distribution", fontsize=20)
Out[8]:
Text(0.5, 1.0, 'Target Distribution')
Categorical Binaries - Sex, FBS and Exang¶
In [9]:
# One panel per two-level categorical feature, bars split by target class.
binary_features = ['sex', 'fasting blood sugar', 'exercise angina']
f, axes = plt.subplots(1, len(binary_features), figsize=(15, 5))
for panel, feature in zip(axes, binary_features):
    sns.countplot(ax=panel, x=feature, data=data, palette=['green', 'orange'], hue="target")
    # Title every panel through its own Axes object instead of plt.title(),
    # which acts on whichever axes happens to be current.
    panel.set_title(feature, fontsize=20)
Out[9]:
Text(0.5, 1.0, 'exercise angina')
Categorical Variables - CP, ST slope, ECG, ca and thal¶
In [10]:
# Chest-pain category counts, split by target class.
plt.figure(figsize=(12, 5))
# countplot draws on the current axes and returns it; title that axes directly.
sns.countplot(
    data=data, x='chest pain type', hue="target", palette=['green', 'orange']
).set_title("chest pain type", fontsize=20)
Out[10]:
Text(0.5, 1.0, 'chest pain type')
In [11]:
# ST-slope category counts, split by target class.
plt.figure(figsize=(12, 5))
sns.countplot(data=data, x='ST slope', hue="target", palette=['green', 'orange'])
# The axes that countplot just drew on is the current one.
plt.gca().set_title("ST slope", fontsize=20)
Out[11]:
Text(0.5, 1.0, 'ST slope')
In [12]:
# Resting-ECG category counts, split by target class.
plt.figure(figsize=(12, 5))
ax = sns.countplot(
    data=data,
    x='resting ecg',
    hue="target",
    palette=['green', 'orange'],
)
# `ax` is the axes countplot drew on, so this titles the same plot.
ax.set_title("resting ecg", fontsize=20)
Out[12]:
Text(0.5, 1.0, 'resting ecg')
In [13]:
# Counts per value of 'ca', split by target class.
plt.figure(figsize=(12, 5))
sns.countplot(
    data=data, x='ca', hue="target", palette=['green', 'orange']
).set_title("ca", fontsize=20)
Out[13]:
Text(0.5, 1.0, 'ca')
In [14]:
# Counts per 'thal' category, split by target class.
plt.figure(figsize=(12, 5))
sns.countplot(data=data, x='thal', hue="target", palette=['green', 'orange'])
# The axes that countplot just drew on is the current one.
plt.gca().set_title("thal", fontsize=20)
Out[14]:
Text(0.5, 1.0, 'thal')
Numeric Variables - Age, Cholesterol, Resting BP and Max heart rate¶
In [15]:
# Split the rows by label so each class can be drawn separately in later plots.
data_disease = data.loc[data["target"].eq(1)]
data_normal = data.loc[data["target"].eq(0)]
Age¶
In [16]:
# Overlay the age distribution (histogram plus density curve) of the two classes:
# green = rows with target 0, red = rows with target 1.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11. It is kept because
# the installed seaborn version is not visible here; histplot(kde=True,
# stat="density") is the documented replacement.
plt.figure(figsize=(8,5))
sns.distplot(data_normal["age"], bins=24, color='g', label="No Disease")
sns.distplot(data_disease["age"], bins=24, color='r', label="Disease")
plt.title("Distribution and density by Age",fontsize=20)
plt.xlabel("Age",fontsize=15)
# Without a legend the two overlaid distributions cannot be told apart.
plt.legend()
plt.show()
Cholesterol¶
In [17]:
# Overlay the cholesterol distribution (histogram plus density curve) of the two
# classes: green = rows with target 0, red = rows with target 1.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11. It is kept because
# the installed seaborn version is not visible here; histplot(kde=True,
# stat="density") is the documented replacement.
plt.figure(figsize=(8,5))
sns.distplot(data_normal["cholesterol"], bins=24, color='g', label="No Disease")
sns.distplot(data_disease["cholesterol"], bins=24, color='r', label="Disease")
plt.title("Distribution and density by cholesterol",fontsize=20)
plt.xlabel("cholesterol",fontsize=15)
# Without a legend the two overlaid distributions cannot be told apart.
plt.legend()
plt.show()
Resting BP¶
In [18]:
# Overlay the resting blood pressure distribution (histogram plus density curve)
# of the two classes: green = rows with target 0, red = rows with target 1.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11. It is kept because
# the installed seaborn version is not visible here; histplot(kde=True,
# stat="density") is the documented replacement.
plt.figure(figsize=(8,5))
sns.distplot(data_normal["resting bp s"], bins=24, color='g', label="No Disease")
sns.distplot(data_disease["resting bp s"], bins=24, color='r', label="Disease")
plt.title("Distribution and density by resting bp",fontsize=20)
plt.xlabel("resting bp",fontsize=15)
# Without a legend the two overlaid distributions cannot be told apart.
plt.legend()
plt.show()
Max Heart Rate¶
In [19]:
# Overlay the maximum heart rate distribution (histogram plus density curve) of
# the two classes: green = rows with target 0, red = rows with target 1.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11. It is kept because
# the installed seaborn version is not visible here; histplot(kde=True,
# stat="density") is the documented replacement.
plt.figure(figsize=(8,5))
sns.distplot(data_normal["max heart rate"], bins=24, color='g', label="No Disease")
sns.distplot(data_disease["max heart rate"], bins=24, color='r', label="Disease")
plt.title("Distribution and density by max heart rate",fontsize=20)
plt.xlabel("max heart rate",fontsize=15)
# Without a legend the two overlaid distributions cannot be told apart.
plt.legend()
plt.show()
Oldpeak¶
In [20]:
# Overlay the oldpeak distribution (histogram plus density curve) of the two
# classes: green = rows with target 0, red = rows with target 1.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11. It is kept because
# the installed seaborn version is not visible here; histplot(kde=True,
# stat="density") is the documented replacement.
plt.figure(figsize=(8,5))
sns.distplot(data_normal["oldpeak"], bins=24, color='g', label="No Disease")
sns.distplot(data_disease["oldpeak"], bins=24, color='r', label="Disease")
plt.title("Distribution and density by old peak",fontsize=20)
plt.xlabel("oldpeak",fontsize=15)
# Without a legend the two overlaid distributions cannot be told apart.
plt.legend()
plt.show()
Age vs Max HR¶
In [21]:
# Age against maximum heart rate, one colour per target class. Each scatter
# carries its own legend label, so the legend is built from the artists.
plt.figure(figsize=(9, 7))
plt.scatter(
    x=data_disease["age"],
    y=data_disease["max heart rate"],
    c="salmon",
    label="Disease",
)
plt.scatter(
    x=data_normal["age"],
    y=data_normal["max heart rate"],
    c="lightblue",
    label="No Disease",
)
plt.title("Heart Disease in function of Age and Max Heart Rate")
plt.xlabel("Age")
plt.ylabel("Max Heart Rate")
plt.legend();
Correlations¶
In [22]:
# Importing hvplot.pandas makes the `.hvplot` accessor used below available.
import hvplot.pandas

# Earlier cells replaced the codes in several columns of `data` with text
# labels, so only the numeric columns can be correlated with the target.
# Select them explicitly: older pandas dropped non-numeric columns from
# corrwith() silently, but pandas >= 2.0 no longer does so by default and
# fails on the text columns.
numeric_features = data.drop('target', axis=1).select_dtypes(include='number')
numeric_features.corrwith(data['target']).hvplot.barh(
    width=600, height=400,
    title="Correlation between Heart Disease and Numeric Features",
    ylabel='Correlation', xlabel='Numerical Features'
)