## Python Titanic Model
# Import the required libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import seaborn as sns
# Load the Titanic dataset
titanic_data = sns.load_dataset('titanic')
# Display the column names and selected columns
print("Titanic Data")
print(titanic_data.columns)
display(titanic_data[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'alone']])
# Preprocess the data
td = titanic_data.copy() # Create a copy to avoid modifying the original dataset
td.drop(['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'], axis=1, inplace=True)
td.dropna(inplace=True) # Drop rows with missing values
td['sex'] = td['sex'].apply(lambda x: 1 if x == 'male' else 0)
td['alone'] = td['alone'].apply(lambda x: 1 if x else 0)
# Encode categorical variables
# Note: newer scikit-learn releases removed OneHotEncoder.get_feature_names;
# get_feature_names_out is the current method name.
enc = OneHotEncoder(handle_unknown='ignore')
embarked_encoded = pd.DataFrame(
    enc.fit_transform(td[['embarked']]).toarray(),
    columns=enc.get_feature_names_out(['embarked']),
    index=td.index  # keep td's index so the concat below aligns row-for-row after dropna()
)
td = pd.concat([td, embarked_encoded], axis=1)
td.drop(['embarked'], axis=1, inplace=True)
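# (Optional) Quick sanity check that the new one-hot columns were added alongside the
# original features; the names (e.g. embarked_C, embarked_Q, embarked_S) come from the encoder.
print(td.filter(like='embarked_').head())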
# Split data into features (X) and target (y)
X = td.drop('survived', axis=1)
y = td['survived']
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Train a decision tree classifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
# Test the decision tree model
y_pred_dt = dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print('Decision Tree Classifier Accuracy: {:.2%}'.format(accuracy_dt))
# Train a logistic regression model
logreg = LogisticRegression(random_state=42, max_iter=1000)  # extra iterations so lbfgs converges on the unscaled features
logreg.fit(X_train, y_train)
# Test the logistic regression model
y_pred_logreg = logreg.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print('Logistic Regression Accuracy: {:.2%}'.format(accuracy_logreg))
# Feature importance using the decision tree model
importances = dt.feature_importances_
for feature, importance in zip(X.columns, importances):
    print(f'The importance of {feature} is: {importance}')
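# (Optional) The same importances as a sorted pandas Series for easier reading;
# this simply reuses the `importances` array and feature matrix `X` defined above.
print(pd.Series(importances, index=X.columns).sort_values(ascending=False))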
# Define the TitanicRegression global variable
titanic_regression = None
# Define the TitanicRegression class
class TitanicRegression:
    def __init__(self):
        self.dt = None
        self.logreg = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.encoder = None
    def initTitanic(self):
        # Load and preprocess the data the same way as the standalone code above,
        # so the feature columns match what predictSurvival() expects
        titanic_data = sns.load_dataset('titanic')
        td = titanic_data.copy()
        td.drop(['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'], axis=1, inplace=True)
        td.dropna(inplace=True)
        td['sex'] = td['sex'].apply(lambda x: 1 if x == 'male' else 0)
        td['alone'] = td['alone'].apply(lambda x: 1 if x else 0)
        # Initialize the encoder and one-hot encode only the 'embarked' column,
        # keeping X as a DataFrame so its column names remain available
        self.encoder = OneHotEncoder(handle_unknown='ignore')
        embarked_encoded = pd.DataFrame(
            self.encoder.fit_transform(td[['embarked']]).toarray(),
            columns=self.encoder.get_feature_names_out(['embarked']),
            index=td.index
        )
        td = pd.concat([td, embarked_encoded], axis=1)
        td.drop(['embarked'], axis=1, inplace=True)
        X = td.drop('survived', axis=1)
        y = td['survived']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        self.dt = DecisionTreeClassifier()
        self.dt.fit(self.X_train, self.y_train)
        self.logreg = LogisticRegression(max_iter=1000)
        self.logreg.fit(self.X_train, self.y_train)
    def runDecisionTree(self):
        if self.dt is None:
            print("Decision Tree model is not initialized. Please run initTitanic() first.")
            return
        y_pred_dt = self.dt.predict(self.X_test)
        accuracy_dt = accuracy_score(self.y_test, y_pred_dt)
        print('Decision Tree Classifier Accuracy: {:.2%}'.format(accuracy_dt))

    def runLogisticRegression(self):
        if self.logreg is None:
            print("Logistic Regression model is not initialized. Please run initTitanic() first.")
            return
        y_pred_logreg = self.logreg.predict(self.X_test)
        accuracy_logreg = accuracy_score(self.y_test, y_pred_logreg)
        print('Logistic Regression Accuracy: {:.2%}'.format(accuracy_logreg))
def initTitanic():
    global titanic_regression
    titanic_regression = TitanicRegression()
    titanic_regression.initTitanic()
    titanic_regression.runDecisionTree()
    titanic_regression.runLogisticRegression()
def predictSurvival(passenger):
    passenger_df = pd.DataFrame(passenger, index=[0])
    passenger_df.drop(['name'], axis=1, inplace=True)
    # Apply the same preprocessing that was used on the training data
    passenger_df['sex'] = passenger_df['sex'].apply(lambda x: 1 if x == 'male' else 0)
    passenger_df['alone'] = passenger_df['alone'].apply(lambda x: 1 if x else 0)
    embarked_encoded = pd.DataFrame(
        titanic_regression.encoder.transform(passenger_df[['embarked']]).toarray(),
        columns=titanic_regression.encoder.get_feature_names_out(['embarked']),
        index=passenger_df.index
    )
    passenger_df = pd.concat([passenger_df.drop(['embarked'], axis=1), embarked_encoded], axis=1)
    # Add any columns seen in training but missing here, and fill them with default values
    missing_cols = set(titanic_regression.X_train.columns) - set(passenger_df.columns)
    for col in missing_cols:
        passenger_df[col] = 0
    # Ensure the column order matches the order in the training data
    passenger_df = passenger_df[titanic_regression.X_train.columns]
    # Predict survival (1 = survived, 0 = did not survive)
    predict = titanic_regression.logreg.predict(passenger_df)
    return predict
# Sample usage
if __name__ == "__main__":
    # Initialize the Titanic model
    initTitanic()

    # Predict the survival of a passenger
    passenger = {
        'name': ['John Mortensen'],
        'pclass': [2],
        'sex': ['male'],
        'age': [64],
        'sibsp': [1],
        'parch': [1],
        'fare': [16.00],
        'embarked': ['S'],
        'alone': [False]
    }
    print(predictSurvival(passenger))

## Output
Titanic Data
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
'alive', 'alone'],
dtype='object')
|     | survived | pclass | sex    | age  | sibsp | parch | fare    | embarked | alone |
|-----|----------|--------|--------|------|-------|-------|---------|----------|-------|
| 0   | 0        | 3      | male   | 22.0 | 1     | 0     | 7.2500  | S        | False |
| 1   | 1        | 1      | female | 38.0 | 1     | 0     | 71.2833 | C        | False |
| 2   | 1        | 3      | female | 26.0 | 0     | 0     | 7.9250  | S        | True  |
| 3   | 1        | 1      | female | 35.0 | 1     | 0     | 53.1000 | S        | False |
| 4   | 0        | 3      | male   | 35.0 | 0     | 0     | 8.0500  | S        | True  |
| ... | ...      | ...    | ...    | ...  | ...   | ...   | ...     | ...      | ...   |
| 886 | 0        | 2      | male   | 27.0 | 0     | 0     | 13.0000 | S        | True  |
| 887 | 1        | 1      | female | 19.0 | 0     | 0     | 30.0000 | S        | True  |
| 888 | 0        | 3      | female | NaN  | 1     | 2     | 23.4500 | S        | False |
| 889 | 1        | 1      | male   | 26.0 | 0     | 0     | 30.0000 | C        | True  |
| 890 | 0        | 3      | male   | 32.0 | 0     | 0     | 7.7500  | Q        | True  |

891 rows × 9 columns
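The sample usage prints only a hard 0/1 label and the accuracy scores. Logistic regression can also report a confidence score through `predict_proba`; below is a minimal sketch, assuming `initTitanic()` from the code above has already run so that `titanic_regression` holds the fitted model and test split:

```python
# Sketch: inspect predicted survival probabilities on the held-out test set.
# predict_proba returns [P(did not survive), P(survived)] for each row.
proba = titanic_regression.logreg.predict_proba(titanic_regression.X_test)
print(proba[:5])           # probabilities for the first five test passengers
print(proba[:, 1].mean())  # average predicted survival probability on the test set
```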