In this Kaggle challenge, we need to complete the analysis of what sorts of people were likely to survive. We apply machine learning tools to predict which passengers survived the Titanic tragedy.
Data Sets: Train (includes the Survived flag) & Test (data for which the target, Survived, must be predicted)
train <- read.csv("C:/Users/Liew Keong Han/Desktop/train.csv", na.strings=c("",".","NA"))
test <- read.csv("C:/Users/Liew Keong Han/Desktop/test.csv", na.strings=c("",".","NA"))
1) Tool: RStudio
Packages used: stringr, rpart, rattle, rpart.plot, RColorBrewer, randomForest.
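If any of these packages are missing, a one-time setup along the following lines should work (the package names are taken from the list above):
# One-time install of the required packages (skip any that are already installed)
install.packages(c("stringr", "rpart", "rattle", "rpart.plot", "RColorBrewer", "randomForest"))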
Exploring the data patterns and identifying potential independent variables
str(train)
str(test)
head(train, 10)
head(test, 10)
tail(train, 10)
tail(test, 10)
library(stringr)
# Use a regular expression to extract the passenger's "Title" from the Name field
str_extract(train$Name, '[ ][a-zA-Z]+[.][ ]')
train$Title <- str_extract(train$Name, '[ ][a-zA-Z]+[.][ ]')
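As an optional check (not part of the original steps), a quick frequency count shows which titles were extracted and how often, before they are collapsed into a common/uncommon flag:
# Optional check: frequency of each extracted title, including non-matches
table(train$Title, useNA = "ifany")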
# Flag whether the title is one of the three most common titles
train$Title <- train$Title %in% c(" Mr. ", " Mrs. ", " Miss. ")
test$Title <- str_extract(test$Name, '[ ][a-zA-Z]+[.][ ]')
test$Title <- test$Title %in% c(" Mr. ", " Mrs. ", " Miss. ")
# Passengers that survived vs passengers that passed away
table(train$Survived)
table(train$Survived, train$Sex)
prop.table(table(train$Survived))
prop.table(table(train$Survived, train$Sex))
# Males & females that survived vs males & females that passed away
table(train$Sex, train$Survived)
prop.table(table(train$Sex, train$Survived),1)
# Create a Child flag; rows with missing Age stay NA
train$Child <- NA
train$Child[train$Age >=18] <- 0
train$Child[train$Age < 18] <- 1
# Two-way comparison
prop.table(table(train$Child, train$Survived),1)
train$familysize <- train$SibSp + train$Parch + 1
test$familysize <- test$SibSp + test$Parch + 1
prop.table(table(train$familysize, train$Survived),1)
train$WOCabin <- is.na(train$Cabin)
test$WOCabin <- is.na(test$Cabin)
prop.table(table(train$WOCabin, train$Survived),1)
prop.table(table(train$Title, train$Survived),1)
Decision Tree
# Build the decision tree
library(rpart)
decision_tree <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title + familysize + WOCabin, data = train, method = "class")
# Load in the packages to create a fancified version of your tree
library(rattle)
library(rpart.plot)
library(RColorBrewer)
# Time to plot your fancified tree
fancyRpartPlot(decision_tree)
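As an optional step not in the original walkthrough, the tree can be scored on the training data itself; this gives a rough, in-sample accuracy figure (optimistic, because the same data built the tree), using a helper object train_fit introduced here just for the check:
# Optional: in-sample check of the tree on the training data
train_fit <- predict(decision_tree, train, type = "class")
table(train$Survived, train_fit)
mean(train_fit == train$Survived)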
# Make your prediction using the test set
my_prediction <- predict(decision_tree, test, type = "class")
# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
my_solution <- data.frame(PassengerId = test$PassengerId, Survived = my_prediction)
# Check that your data frame has 418 entries
nrow(my_solution) == 418
# Write your solution to a csv file with the name my_solution.csv
write.csv(my_solution, row.names = FALSE, file = "my_solution.csv")
# Create a new decision tree my_tree_three
my_tree_three <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked, train, method ="class", control = rpart.control(cp = 0, minsplit = 50))
# Visualize your new decision tree
fancyRpartPlot(my_tree_three)
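In the call above, cp = 0 removes the complexity-parameter penalty and minsplit = 50 requires at least 50 observations in a node before it is split, so the tree grows deeper than the default. As an optional check, rpart's complexity table shows where extra splits stop reducing the cross-validated error:
# Optional: inspect the complexity-parameter table and its cross-validated error
printcp(my_tree_three)
plotcp(my_tree_three)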
# create a new train set with the new variable
train_two <- train
train_two$family_size <- train$SibSp + train$Parch + 1
# Create a new decision tree my_tree_four
my_tree_four <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + family_size, train_two, method = "class")
# Visualize your new decision tree
fancyRpartPlot(my_tree_four)
# Create a new model `my_tree_five`
my_tree_five <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title, data = train, method = "class")
# Visualize your new decision tree
fancyRpartPlot(my_tree_five)
# Make your prediction using `my_tree_five` and the test set
my_prediction <- predict(my_tree_five, test, type = "class")
# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
my_solution <- data.frame(PassengerId = test$PassengerId, Survived = my_prediction)
# Write your solution away to a csv file with the name my_solution.csv
write.csv(my_solution, row.names = FALSE, file = "my_solution.csv")
Random Forest
# Load in the package
library(randomForest)
# Train set and test set
str(train)
str(test)
# Set seed for reproducibility
set.seed(111)
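randomForest() stops with an error when any predictor contains missing values, so Age, Fare and Embarked have to be filled in first. The simple median/most-frequent imputation below is only an illustrative assumption, not part of the original workflow:
# Assumed imputation step: randomForest() cannot handle NA predictors
train$Age[is.na(train$Age)] <- median(train$Age, na.rm = TRUE)
test$Age[is.na(test$Age)] <- median(test$Age, na.rm = TRUE)
test$Fare[is.na(test$Fare)] <- median(test$Fare, na.rm = TRUE)
train$Embarked[is.na(train$Embarked)] <- "S"  # "S" (Southampton) is the most common port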
# Apply the Random Forest Algorithm
my_forest <- randomForest(as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title, data= train, importance = TRUE, ntree = 1000)
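Because the forest is trained with importance = TRUE, the relative importance of each predictor can be inspected; the two calls below are an optional addition:
# Optional: see which variables the forest relies on most
importance(my_forest)
varImpPlot(my_forest)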
# Make your prediction using the test set
my_prediction <- predict(my_forest, test, type="class")
# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
my_solution <- data.frame(PassengerId = test$PassengerId, Survived = my_prediction)
# Write your solution away to a csv file with the name my_solution.csv
write.csv(my_solution, row.names = FALSE, file = "my_solution.csv")
2) Tool: IBM SPSS Modeler
Model: AutoClassifier
After the model has finished running, you should get a prediction result for each passenger. Reformat the prediction result according to the Kaggle submission requirements.
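For reference, the Kaggle submission file is a plain CSV with exactly two columns, PassengerId and Survived; the 0/1 values below are placeholders:
PassengerId,Survived
892,0
893,1
894,0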