Seek Accuracy

3
# Data preparation for the "Seek Accuracy" cross-sell model.
# Reads the transaction file, derives per-transaction features, builds
# per-client aggregates and a per-client/per-year transaction-count table,
# and assembles the training and scoring data frames.

# NOTE(review): machine-specific working directory and a workspace wipe;
# both kept because the relative file paths below depend on them.
setwd("C:/Users/Omar/Downloads/Seek Accuracy")
rm(list = ls())

# Loading libraries
library(sqldf)
library(data.table)
library(plyr)
library(caret)

# Loading the data
total <- read.csv("Train_seers_accuracy.csv", stringsAsFactors = FALSE)

# Formatting the data
str(total)
total$Client_ID <- as.factor(total$Client_ID)
total$Gender <- as.factor(total$Gender)
# total$Transaction_Date = as.Date(total$Transaction_Date,'%d-%b-%y')

# The third "-"-separated field of the date strings is taken as the year.
total$year <- as.numeric(
  vapply(strsplit(total$Transaction_Date, "-"), function(x) x[3], character(1))
)
# NOTE(review): 106 - yy presumably means "age in 2006 for a 19yy birth
# year" -- confirm against the data dictionary.
total$age <- 106 - as.numeric(
  vapply(strsplit(total$DOB, "-"), function(x) x[3], character(1))
)
total$Quarter <- lubridate::quarter(
  as.Date(total$Transaction_Date, format = "%d-%b-%y"),
  with_year = TRUE
)

# Training rows: every transaction whose year value is below 6.
train <- total[total$year < 6, ]

# Per-client aggregates. The same select list is run against `train` and
# `total`; the template reproduces the original SQL text exactly.
client_agg_sql <- paste0(
  "select Client_ID,count(*) cross_sell,avg(age),",
  "sum(Transaction_Amount),avg(Transaction_Amount),",
  "sum(Var1),avg(Var1),sum(Var2),avg(Var2),sum(Var3),avg(Var3),",
  "avg(Number_of_EMI),count(Store_ID) from %s group by Client_ID "
)
Client_train <- sqldf(sprintf(client_agg_sql, "train"))
Client_total <- sqldf(sprintf(client_agg_sql, "total"))
# Client_train <- sqldf("select Client_ID,Store_ID,count(*) Cross_Sell from total group by Client_ID,Store_ID ")
# Client_train <- sqldf("select Client_ID,Store_ID,count(*) Cross_Sell from total where Transaction_Date > '2003-12-31' group by Client_ID " )
str(Client_train)
# Client_train$Cross_Sell <- ifelse(Client_train$Cross_Sell>1,1,0)

# New data: one row per Client_ID/Gender, one column per year value holding
# the number of transactions. setDT() converts `total` to a data.table by
# reference; data.frame() then makes the year column names syntactic
# (referenced below as X3 ... X6).
New_data <- data.frame(
  data.table::dcast(
    data.table::setDT(total),
    Client_ID + Gender ~ year,
    value.var = "Transaction_ID",
    fun.aggregate = length
  )
)
# New_data <- data.frame(data.table::dcast(setDT(data), Client_ID+Gender ~ Quarter,value.var = c('Transaction_ID'),fun=length))

# NOTE(review): assumes the first Gender level is the blank/unknown one.
levels(New_data$Gender)[1] <- "missing"
# testdata <- New_data[,c(-3,-4,-5,-6)]
# colnames(testdata) <- colnames(New_data)[1:14]

train_data <- merge(New_data, Client_train, by = "Client_ID", all.x = TRUE)
test_data <- merge(New_data, Client_total, by = "Client_ID", all.x = TRUE)
train_data$Client_ID <- NULL

# Drop the oldest year column from the scoring frame, then give its 3rd-5th
# columns the names of train_data's 2nd-4th columns so the two frames share
# predictor names.
test_data$X3 <- NULL
# testdata$X2003.1 = NULL
# testdata$X2003.2 = NULL
# testdata$X2003.3 = NULL
# testdata$X2003.4 = NULL
# colnames(testdata)[3:14] <- colnames(New_data)[2:13]
colnames(test_data)[3:5] <- colnames(train_data)[2:4]

# Fit the indicator X6 > 0 against the remaining columns of New_data.
anova <- aov((X6 > 0) ~ ., data = New_data)
# anova <- aov((X2006.1+X2006.2+X2006.3+X2006.4)>0~.,data=New_data)

Transcript of Seek Accuracy

Page 1: Seek Accuracy

# Session setup.
# NOTE(review): machine-specific working directory and a workspace wipe;
# both kept because later relative file paths depend on them.
setwd("C:/Users/Omar/Downloads/Seek Accuracy")
rm(list = ls())

# Loading libraries
library(sqldf)
library(data.table)
library(plyr)
library(caret)

# Loading the data
# Strings are kept as character; selected columns are converted to factors
# explicitly elsewhere in the script.
total <- read.csv("Train_seers_accuracy.csv", stringsAsFactors = FALSE)

# Formatting the data
str(total)
total$Client_ID <- as.factor(total$Client_ID)
total$Gender <- as.factor(total$Gender)
# total$Transaction_Date = as.Date(total$Transaction_Date,'%d-%b-%y')

# The third "-"-separated field of the date strings is taken as the year.
# vapply() guarantees one character value per row before the numeric
# conversion.
total$year <- as.numeric(
  vapply(strsplit(total$Transaction_Date, "-"), function(x) x[3], character(1))
)

# NOTE(review): 106 - yy presumably means "age in 2006 for a 19yy birth
# year" -- confirm against the data dictionary.
total$age <- 106 - as.numeric(
  vapply(strsplit(total$DOB, "-"), function(x) x[3], character(1))
)

# Year.quarter of each transaction; "%b" parsing depends on the session
# locale's month abbreviations.
total$Quarter <- lubridate::quarter(
  as.Date(total$Transaction_Date, format = "%d-%b-%y"),
  with_year = TRUE
)

# Training rows: every transaction whose year value is below 6.
train <- total[total$year < 6, ]

# Per-client aggregates computed with SQL over the transaction rows.
# The same select list is run against two tables:
#   Client_train - rows of `train`
#   Client_total - rows of `total`
# The template reproduces the original SQL text exactly once the table name
# is substituted, so the queries sent to sqldf are unchanged.
client_agg_sql <- paste0(
  "select Client_ID,count(*) cross_sell,avg(age),",
  "sum(Transaction_Amount),avg(Transaction_Amount),",
  "sum(Var1),avg(Var1),sum(Var2),avg(Var2),sum(Var3),avg(Var3),",
  "avg(Number_of_EMI),count(Store_ID) from %s group by Client_ID "
)
Client_train <- sqldf(sprintf(client_agg_sql, "train"))
Client_total <- sqldf(sprintf(client_agg_sql, "total"))
# Client_train <- sqldf("select Client_ID,Store_ID,count(*) Cross_Sell from total group by Client_ID,Store_ID ")
# Client_train <- sqldf("select Client_ID,Store_ID,count(*) Cross_Sell from total where Transaction_Date > '2003-12-31' group by Client_ID " )
str(Client_train)
# Client_train$Cross_Sell <- ifelse(Client_train$Cross_Sell>1,1,0)

# New data
# One row per Client_ID/Gender, one column per year value holding the number
# of transactions. setDT() converts `total` to a data.table by reference;
# data.frame() then makes the year column names syntactic (they are
# referenced below as X3 ... X6).
New_data <- data.frame(
  data.table::dcast(
    data.table::setDT(total),
    Client_ID + Gender ~ year,
    value.var = "Transaction_ID",
    fun.aggregate = length  # spelled out; `fun =` relied on partial matching
  )
)
# New_data <- data.frame(data.table::dcast(setDT(data), Client_ID+Gender ~ Quarter,value.var = c('Transaction_ID'),fun=length))

# NOTE(review): assumes the first Gender level is the blank/unknown one.
levels(New_data$Gender)[1] <- "missing"
# testdata <- New_data[,c(-3,-4,-5,-6)]
# colnames(testdata) <- colnames(New_data)[1:14]

# Attach the per-client aggregates; all.x keeps clients that have no
# aggregate row (their aggregate columns become NA).
train_data <- merge(New_data, Client_train, by = "Client_ID", all.x = TRUE)
test_data <- merge(New_data, Client_total, by = "Client_ID", all.x = TRUE)
train_data$Client_ID <- NULL

# Drop the oldest year column from the scoring frame, then give its 3rd-5th
# columns the names of train_data's 2nd-4th columns so the two frames share
# predictor names.
test_data$X3 <- NULL
# testdata$X2003.1 = NULL
# testdata$X2003.2 = NULL
# testdata$X2003.3 = NULL
# testdata$X2003.4 = NULL
# colnames(testdata)[3:14] <- colnames(New_data)[2:13]
colnames(test_data)[3:5] <- colnames(train_data)[2:4]

# Analysis of variance with the indicator X6 > 0 as the response and the
# remaining columns of New_data as explanatory terms.
# NOTE(review): the object name masks stats::anova for the rest of the script.
anova <- aov(
  (X6 > 0) ~ .,
  data = New_data
)
# anova <- aov((X2006.1+X2006.2+X2006.3+X2006.4)>0~.,data=New_data)

Page 2: Seek Accuracy

# Print the summary of the fitted `anova` object.
summary(anova)

# Turn every column name into a syntactically valid R name so the columns
# can be used in model formulas.
colnames(train_data) <- make.names(names(train_data))
colnames(test_data) <- make.names(names(test_data))

# 5-fold cross-validation.
# Class probabilities and twoClassSummary are requested so the models below
# can be tuned on metric = "ROC".
fitControl <- trainControl(
  method = "cv",
  number = 5,
  allowParallel = TRUE,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Decision tree (rpart) fitted on train_data with a single fixed complexity
# parameter. The target is the two-level factor built from X6 > 0
# ("XFALSE" / "XTRUE"); the "X" prefix gives valid class names, which
# classProbs = TRUE requires.
# NOTE(review): the object name masks the rpart() function if that package
# is attached.
# system.time(rpart <-train(as.factor(paste('X',!(X2006.1+X2006.2+X2006.3+X2006.4>0),sep=""))~.,
system.time(
  rpart <- train(
    as.factor(paste0("X", X6 > 0)) ~ .,
    data = train_data,
    method = "rpart",
    metric = "ROC",
    # minbucket = 1,
    tuneGrid = expand.grid(cp = 0.0001),
    trControl = fitControl
  )
)
rpart

# C5.0 model fitted on New_data. The target is the two-level factor built
# from X6 > 0; X3, X4 and X5 are removed from the predictors.
system.time(
  C50 <- train(
    as.factor(paste0("X", X6 > 0)) ~ . - X3 - X4 - X5,
    data = New_data,
    method = "C5.0",
    metric = "ROC",
    tuneLength = 2,
    trControl = fitControl
  )
)
C50

# Random forest fitted on New_data. The target is the two-level factor built
# from X6 > 0.
# Fixes relative to the original call:
#   * the formula was `~ -X3-X4-X5`, which has no predictor terms at all;
#     the leading `.` is restored to match the C5.0 model's formula
#   * `tuneLenth` was a misspelling and never reached train() as tuneLength
# ntree and do.trace are not train() arguments; they are forwarded to the
# underlying model fit.
system.time(
  rf <- train(
    as.factor(paste0("X", X6 > 0)) ~ . - X3 - X4 - X5,
    data = New_data,
    method = "rf",
    metric = "ROC",
    tuneLength = 2,
    ntree = 100,
    do.trace = 10,
    trControl = fitControl
  )
)
rf

# Gradient-boosted trees (xgbTree) fitted on New_data. The target is the
# two-level factor built from X6 > 0; every other column of New_data is a
# predictor.
system.time(
  xgb <- train(
    as.factor(paste0("X", X6 > 0)) ~ .,
    data = New_data,
    method = "xgbTree",
    tuneLength = 1,
    metric = "ROC",
    trControl = fitControl
  )
)
xgb

###########################Prediction#####################

# Score test_data with the rpart model and keep one row per client.
# Cross_Sell is the second column of the class-probability table.
# NOTE(review): presumably the "XTRUE" class, since factor levels sort
# alphabetically -- confirm against the fitted model's levels.
submission <- data.frame(
  Client_ID = test_data$Client_ID,
  Cross_Sell = predict(rpart, newdata = test_data, type = "prob")[, 2]
)

#submission$Cross_Sell = 0
#submission = data.frame(submission)
#submission[submission$Client_ID %in% Client_train$Client_ID,'Cross_Sell'] = Client_train$Cross_Sell

Page 3: Seek Accuracy

# Write the predictions to submission.csv in the working directory, without
# a row-name column.
write.csv(
  submission,
  file = "submission.csv",
  row.names = FALSE
)