## Python Titanic Model

# Import the required libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import seaborn as sns

# Load the Titanic dataset
titanic_data = sns.load_dataset('titanic')

# Display the column names and selected columns
# NOTE: display() is supplied by IPython/Jupyter; it is not defined in a plain interpreter.
print("Titanic Data")
print(titanic_data.columns)
display(titanic_data[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'alone']])

# Preprocess the data
td = titanic_data.copy()  # Create a copy to avoid modifying the original dataset
# These columns duplicate information already carried by other columns
# ('alive' mirrors the target itself) or are mostly empty ('deck').
td.drop(['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'], axis=1, inplace=True)
td.dropna(inplace=True)  # Drop rows with missing values
td['sex'] = (td['sex'] == 'male').astype(int)  # male -> 1, anything else -> 0
td['alone'] = td['alone'].astype(bool).astype(int)  # truthy -> 1, falsy -> 0

# Encode categorical variables
enc = OneHotEncoder(handle_unknown='ignore')
embarked_encoded = pd.DataFrame(
    enc.fit_transform(td[['embarked']]).toarray(),
    # get_feature_names() was removed from scikit-learn; get_feature_names_out() replaces it.
    columns=enc.get_feature_names_out(['embarked']),
    # dropna() left gaps in td's index. Without reusing that index, concat would
    # align on a fresh 0..n-1 index and fill the mismatched rows with NaN.
    index=td.index,
)
td = pd.concat([td, embarked_encoded], axis=1)
td.drop(['embarked'], axis=1, inplace=True)

# Split data into features (X) and target (y)
X = td.drop('survived', axis=1)
y = td['survived']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train a decision tree classifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Test the decision tree model
y_pred_dt = dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print('Decision Tree Classifier Accuracy: {:.2%}'.format(accuracy_dt))

# Train a logistic regression model
# The features are unscaled (age, fare), so the solver may need more than the
# default number of iterations to converge.
logreg = LogisticRegression(random_state=42, max_iter=1000)
logreg.fit(X_train, y_train)

# Test the logistic regression model
y_pred_logreg = logreg.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print('Logistic Regression Accuracy: {:.2%}'.format(accuracy_logreg))

# Feature importance using the decision tree model
importances = dt.feature_importances_
for feature, importance in zip(X.columns, importances):
    print(f'The importance of {feature} is: {importance}')


# Module-level handle to the shared TitanicRegression instance.
# It starts as None; the module-level initTitanic() assigns it and
# predictSurvival() reads it.
titanic_regression = None

# Define the TitanicRegression class
class TitanicRegression:
    """Train and evaluate survival classifiers on the seaborn Titanic dataset.

    After ``initTitanic()`` has run:

    * ``X_train`` / ``X_test`` are DataFrames of the raw feature columns
      listed in ``FEATURES`` (so ``X_train.columns`` names the expected
      inputs),
    * ``y_train`` / ``y_test`` hold the matching ``survived`` labels,
    * ``encoder`` is a fitted transformer that turns such a raw frame into
      the numeric matrix the models were trained on,
    * ``dt`` and ``logreg`` are the fitted classifiers, which expect the
      output of ``encoder.transform(...)``.
    """

    # Raw passenger attributes used as model inputs. Columns such as 'alive'
    # (a copy of the target), 'who', 'class' and 'embark_town' are left out on
    # purpose: they duplicate other columns or leak the label.
    FEATURES = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'alone']
    # Subset of FEATURES that is one-hot encoded; the rest pass through as numbers.
    CATEGORICAL = ['sex', 'embarked', 'alone']
    TARGET = 'survived'

    def __init__(self):
        """Create an untrained holder; call ``initTitanic()`` to populate it."""
        self.dt = None
        self.logreg = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.encoder = None

    def initTitanic(self):
        """Load the data, fit the encoder and train both classifiers."""
        # Imported here so the block is self-contained; scikit-learn is
        # already a dependency of this file.
        from sklearn.compose import ColumnTransformer

        titanic_data = sns.load_dataset('titanic')

        # Keep only the modelling columns and drop rows with missing values
        # in any of them before splitting.
        td = titanic_data[self.FEATURES + [self.TARGET]].dropna()
        X = td[self.FEATURES]
        y = td[self.TARGET]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, stratify=y
        )

        # One-hot encode only the categorical columns; numeric columns are
        # passed through unchanged. sparse_threshold=0 forces a dense result.
        self.encoder = ColumnTransformer(
            [('categorical', OneHotEncoder(handle_unknown='ignore'), self.CATEGORICAL)],
            remainder='passthrough',
            sparse_threshold=0,
        )
        X_train_encoded = self.encoder.fit_transform(self.X_train)

        self.dt = DecisionTreeClassifier(random_state=42)
        self.dt.fit(X_train_encoded, self.y_train)

        # Unscaled numeric features may need more than the default number of
        # solver iterations to converge.
        self.logreg = LogisticRegression(max_iter=1000, random_state=42)
        self.logreg.fit(X_train_encoded, self.y_train)

    def runDecisionTree(self):
        """Print the decision tree's accuracy on the held-out test split."""
        if self.dt is None:
            print("Decision Tree model is not initialized. Please run initTitanic() first.")
            return
        # X_test is kept in raw form, so encode it before predicting.
        y_pred_dt = self.dt.predict(self.encoder.transform(self.X_test))
        accuracy_dt = accuracy_score(self.y_test, y_pred_dt)
        print('Decision Tree Classifier Accuracy: {:.2%}'.format(accuracy_dt))

    def runLogisticRegression(self):
        """Print the logistic regression's accuracy on the held-out test split."""
        if self.logreg is None:
            print("Logistic Regression model is not initialized. Please run initTitanic() first.")
            return
        # X_test is kept in raw form, so encode it before predicting.
        y_pred_logreg = self.logreg.predict(self.encoder.transform(self.X_test))
        accuracy_logreg = accuracy_score(self.y_test, y_pred_logreg)
        print('Logistic Regression Accuracy: {:.2%}'.format(accuracy_logreg))

def initTitanic():
    """Build the shared TitanicRegression instance, train it, and print both accuracies."""
    global titanic_regression
    model = TitanicRegression()
    # Publish the instance before training, so the global is set even if training fails.
    titanic_regression = model
    model.initTitanic()
    for evaluate in (model.runDecisionTree, model.runLogisticRegression):
        evaluate()

def predictSurvival(passenger):
    """Predict survival for a single passenger with the logistic regression model.

    Args:
        passenger: Mapping of column name to value (scalar or one-element
            list) describing one passenger. A 'name' entry is optional and
            is ignored. Columns the model expects but the mapping lacks are
            filled with 0; columns the model does not expect are dropped.

    Returns:
        The result of ``titanic_regression.logreg.predict`` for the encoded
        passenger row.

    Raises:
        RuntimeError: If ``initTitanic()`` has not been called yet.
    """
    if titanic_regression is None:
        raise RuntimeError("Titanic model is not initialized. Please run initTitanic() first.")

    # errors='ignore' keeps this working when the caller supplies no 'name'.
    passenger_df = pd.DataFrame(passenger, index=[0]).drop(columns=['name'], errors='ignore')

    # Align with the training columns in one step: add any that are missing
    # (filled with 0), drop extras, and match the training column order.
    feature_columns = titanic_regression.X_train.columns
    passenger_df = passenger_df.reindex(columns=feature_columns, fill_value=0)

    # Preprocess the passenger data the same way the training data was encoded
    encoded = titanic_regression.encoder.transform(passenger_df)

    return titanic_regression.logreg.predict(encoded)


# Sample usage
if __name__ == "__main__":
    # Train both models and print their test accuracy
    initTitanic()

    # One example passenger; each value is a one-element list so the mapping
    # converts directly into a single-row DataFrame.
    passenger = dict(
        name=['John Mortensen'],
        pclass=[2],
        sex=['male'],
        age=[64],
        sibsp=[1],
        parch=[1],
        fare=[16.00],
        embarked=['S'],
        alone=[False],
    )

    # Predict the survival of that passenger
    print(predictSurvival(passenger))
Titanic Data
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')
survived pclass sex age sibsp parch fare embarked alone
0 0 3 male 22.0 1 0 7.2500 S False
1 1 1 female 38.0 1 0 71.2833 C False
2 1 3 female 26.0 0 0 7.9250 S True
3 1 1 female 35.0 1 0 53.1000 S False
4 0 3 male 35.0 0 0 8.0500 S True
... ... ... ... ... ... ... ... ... ...
886 0 2 male 27.0 0 0 13.0000 S True
887 1 1 female 19.0 0 0 30.0000 S True
888 0 3 female NaN 1 2 23.4500 S False
889 1 1 male 26.0 0 0 30.0000 C True
890 0 3 male 32.0 0 0 7.7500 Q True

891 rows × 9 columns

---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

/tmp/ipykernel_178840/4147884662.py in <module>
     27 # Encode categorical variables
     28 enc = OneHotEncoder(handle_unknown='ignore')
---> 29 embarked_encoded = pd.DataFrame(enc.fit_transform(td[['embarked']]).toarray(), columns=enc.get_feature_names(['embarked']))
     30 td = pd.concat([td, embarked_encoded], axis=1)
     31 td.drop(['embarked'], axis=1, inplace=True)


AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names'