Using the models, prediction, deciding
Peter Fox
Data Analytics – ITWS-4963/ITWS-6965
Week 7b, March 7, 2014
Using the models, prediction, deciding
scatterplotMatrix
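The slide shows only the resulting plot; a minimal sketch of how such a scatterplot matrix might be produced with the car package on the swiss data used below (the variable selection is an assumption, not from the slide):
require(car)   # provides scatterplotMatrix()
scatterplotMatrix(~ Fertility + Agriculture + Education + Catholic, data = swiss)   # all pairwise scatterplots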
Hierarchical clustering
> dswiss <- dist(as.matrix(swiss))
> hs <- hclust(dswiss)
> plot(hs)
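To use the clustering for deciding, the dendrogram can be cut into a set number of groups; a sketch (the choice of k = 3 is an assumption, not on the slide):
> cutree(hs, k = 3)   # cluster membership (1, 2 or 3) for each province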
ctree
require(party)
swiss_ctree <- ctree(Fertility ~ Agriculture + Education + Catholic, data = swiss)
plot(swiss_ctree)
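Once fitted, the tree can be used for prediction directly; a sketch (not on the slide):
> predict(swiss_ctree)   # fitted Fertility value for each observation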
pairs(iris[1:4], main = "Anderson's Iris Data -- 3 species", pch = 21, bg = c("red", "green3", "blue")[unclass(iris$Species)])
splom extra!
require(lattice)
super.sym <- trellis.par.get("superpose.symbol")
splom(~iris[1:4], groups = Species, data = iris,
      panel = panel.superpose,
      key = list(title = "Three Varieties of Iris",
                 columns = 3,
                 points = list(pch = super.sym$pch[1:3],
                               col = super.sym$col[1:3]),
                 text = list(c("Setosa", "Versicolor", "Virginica"))))
splom(~iris[1:3] | Species, data = iris,
      layout = c(2,2), pscales = 0,
      varnames = c("Sepal\nLength", "Sepal\nWidth", "Petal\nLength"),
      page = function(...) {
          ltext(x = seq(.6, .8, length.out = 4),
                y = seq(.9, .6, length.out = 4),
                labels = c("Three", "Varieties", "of", "Iris"),
                cex = 2)
      })
parallelplot(~iris[1:4] | Species, iris)
parallelplot(~iris[1:4], iris, groups = Species, horizontal.axis = FALSE, scales = list(x = list(rot = 90)))
hclust for iris
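The slide shows only the dendrogram; a sketch of code that could produce it, mirroring the swiss example above (restricting to the four numeric columns is an assumption):
diris <- dist(as.matrix(iris[, 1:4]))   # distances on the numeric measurements only
hiris <- hclust(diris)
plot(hiris)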
plot(iris_ctree)
ctree
> iris_ctree <- ctree(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data=iris)
> print(iris_ctree)
Conditional inference tree with 4 terminal nodes
Response: Species
Inputs: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
Number of observations: 150
1) Petal.Length <= 1.9; criterion = 1, statistic = 140.264
  2)* weights = 50
1) Petal.Length > 1.9
  3) Petal.Width <= 1.7; criterion = 1, statistic = 67.894
    4) Petal.Length <= 4.8; criterion = 0.999, statistic = 13.865
      5)* weights = 46
    4) Petal.Length > 4.8
      6)* weights = 8
  3) Petal.Width > 1.7
    7)* weights = 46
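To see how well this tree classifies, its predictions can be cross-tabulated against the true species; a sketch (not on the slide):
> table(predict(iris_ctree), iris$Species)   # confusion matrix for the fitted tree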
> plot(iris_ctree, type="simple")
New dataset to work with trees
library(rpart)   # rpart() and the kyphosis data live here
fitK <- rpart(Kyphosis ~ Age + Number + Start, method="class", data=kyphosis)
printcp(fitK) # display the results
plotcp(fitK) # visualize cross-validation results
summary(fitK) # detailed summary of splits
# plot tree
plot(fitK, uniform=TRUE, main="Classification Tree for Kyphosis")
text(fitK, use.n=TRUE, all=TRUE, cex=.8)
# create attractive postscript plot of tree
post(fitK, file = "kyphosistree.ps", title = "Classification Tree for Kyphosis") # might need to convert to PDF (distill)
> pfitK <- prune(fitK, cp = fitK$cptable[which.min(fitK$cptable[,"xerror"]),"CP"])
> plot(pfitK, uniform=TRUE, main="Pruned Classification Tree for Kyphosis")
> text(pfitK, use.n=TRUE, all=TRUE, cex=.8)
> post(pfitK, file = "ptree.ps", title = "Pruned Classification Tree for Kyphosis")
> fitK <- ctree(Kyphosis ~ Age + Number + Start, data=kyphosis)
> plot(fitK, main="Conditional Inference Tree for Kyphosis")
> plot(fitK, main="Conditional Inference Tree for Kyphosis",type="simple")
randomForest
> require(randomForest)
> fitKF <- randomForest(Kyphosis ~ Age + Number + Start, data=kyphosis)
> print(fitKF) # view results
Call:
randomForest(formula = Kyphosis ~ Age + Number + Start, data = kyphosis)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 1
OOB estimate of error rate: 20.99%
Confusion matrix:
        absent present class.error
absent      59       5   0.0781250
present     12       5   0.7058824
> importance(fitKF) # importance of each predictor
       MeanDecreaseGini
Age            8.654112
Number         5.584019
Start         10.168591
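A quick way to visualize these importances (not on the slide) is randomForest's built-in dotchart:
> varImpPlot(fitKF)   # MeanDecreaseGini per predictor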
Random forests improve predictive accuracy by generating a large number of bootstrapped trees (each grown on a random sample of the cases, considering a random subset of the variables at each split), classifying a case with each tree in this new "forest", and deciding a final predicted outcome by combining the results across all of the trees (an average in regression, a majority vote in classification).
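Prediction then works through the usual predict() interface; a sketch (the choice of rows is arbitrary):
> predict(fitKF, kyphosis[1:5, ])   # majority-vote class for the first five cases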
More on another dataset
# Regression Tree Example
library(rpart)
# build the tree
fitM <- rpart(Mileage~Price + Country + Reliability + Type, method="anova", data=cu.summary)
printcp(fitM) # display the results
Root node error: 1354.6/60 = 22.576
n=60 (57 observations deleted due to missingness)
        CP nsplit rel error  xerror     xstd
1 0.622885      0   1.00000 1.03165 0.176920
2 0.132061      1   0.37711 0.51693 0.102454
3 0.025441      2   0.24505 0.36063 0.079819
4 0.011604      3   0.21961 0.34878 0.080273
5 0.010000      4   0.20801 0.36392 0.075650
Mileage…
plotcp(fitM) # visualize cross-validation results
summary(fitM) # detailed summary of splits
<we will leave this for Friday to look at>
par(mfrow=c(1,2))
rsq.rpart(fitM) # visualize cross-validation results
# plot tree
plot(fitM, uniform=TRUE, main="Regression Tree for Mileage ")
text(fitM, use.n=TRUE, all=TRUE, cex=.8)
# prune the tree
pfitM<- prune(fitM, cp=0.01160389) # from cptable
# plot the pruned tree
plot(pfitM, uniform=TRUE, main="Pruned Regression Tree for Mileage")
text(pfitM, use.n=TRUE, all=TRUE, cex=.8)
post(pfitM, file = "ptree2.ps", title = "Pruned Regression Tree for Mileage")
# Conditional Inference Tree for Mileage
fit2M <- ctree(Mileage~Price + Country + Reliability + Type, data=na.omit(cu.summary))
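The slide presumably shows the resulting plot; drawing it follows the same pattern as before (the title string is an assumption):
plot(fit2M, main="Conditional Inference Tree for Mileage")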
Enough of trees!
Bayes
> cl <- kmeans(iris[,1:4], 3)
> table(cl$cluster, iris[,5])
    setosa versicolor virginica
  2      0          2        36
  1      0         48        14
  3     50          0         0
> require(e1071)   # naiveBayes() lives in e1071
> m <- naiveBayes(iris[,1:4], iris[,5])
> table(predict(m, iris[,1:4]), iris[,5])
             setosa versicolor virginica
  setosa         50          0         0
  versicolor      0         47         3
  virginica       0          3        47
pairs(iris[1:4],main="Iris Data (red=setosa,green=versicolor,blue=virginica)", pch=21, bg=c("red","green3","blue")[unclass(iris$Species)])
Digging into iris
classifier <- naiveBayes(iris[,1:4], iris[,5])
table(predict(classifier, iris[,-5]), iris[,5], dnn=list('predicted','actual'))
            actual
predicted    setosa versicolor virginica
  setosa         50          0         0
  versicolor      0         47         3
  virginica       0          3        47
Digging into iris
> classifier$apriori
iris[, 5]
setosa versicolor virginica
50 50 50
> classifier$tables$Petal.Length
Petal.Length
iris[, 5]     [,1]      [,2]
  setosa     1.462 0.1736640
  versicolor 4.260 0.4699110
  virginica  5.552 0.5518947
Digging into iris
plot(function(x) dnorm(x, 1.462, 0.1736640), 0, 8, col="red", main="Petal length distribution for the 3 different species")
curve(dnorm(x, 4.260, 0.4699110), add=TRUE, col="blue")
curve(dnorm(x, 5.552, 0.5518947 ), add=TRUE, col = "green")
http://www.ugrad.stat.ubc.ca/R/library/mlbench/html/HouseVotes84.html
> require(mlbench)
> data(HouseVotes84)
> model <- naiveBayes(Class ~ ., data = HouseVotes84)
> predict(model, HouseVotes84[1:10,-1])
[1] republican republican republican democrat democrat democrat republican republican republican
[10] democrat
Levels: democrat republican
House Votes 1984
> predict(model, HouseVotes84[1:10,-1], type = "raw")
democrat republican
[1,] 1.029209e-07 9.999999e-01
[2,] 5.820415e-08 9.999999e-01
[3,] 5.684937e-03 9.943151e-01
[4,] 9.985798e-01 1.420152e-03
[5,] 9.666720e-01 3.332802e-02
[6,] 8.121430e-01 1.878570e-01
[7,] 1.751512e-04 9.998248e-01
[8,] 8.300100e-06 9.999917e-01
[9,] 8.277705e-08 9.999999e-01
[10,] 1.000000e+00 5.029425e-11
House Votes 1984
> pred <- predict(model, HouseVotes84[,-1])
> table(pred, HouseVotes84$Class)
pred         democrat republican
  democrat        238         13
  republican       29        155
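From this confusion matrix the overall accuracy is (238 + 155)/435 ≈ 0.90; a one-liner to compute it (not on the slide):
> sum(pred == HouseVotes84$Class) / nrow(HouseVotes84)   # overall accuracy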
So now you could complete this:
> data(HairEyeColor)
> mosaicplot(HairEyeColor)
> margin.table(HairEyeColor,3)
Sex
  Male Female
   279    313
> margin.table(HairEyeColor,c(1,3))
         Sex
Hair      Male Female
  Black     56     52
  Brown    143    143
  Red       34     37
  Blond     46     81
Construct a naïve Bayes classifier and test.
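One possible route, purely as a sketch (choosing Sex as the response and expanding the contingency table into individual cases are assumptions; the slide leaves the target open):
hec <- as.data.frame(HairEyeColor)            # columns Hair, Eye, Sex, Freq
hec <- hec[rep(1:nrow(hec), hec$Freq), 1:3]   # one row per person
mHEC <- naiveBayes(Sex ~ Hair + Eye, data = hec)
table(predict(mHEC, hec), hec$Sex)            # confusion matrix on the training data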
Assignments to come…
• Term project (A6). Due ~ week 13. 30% (25% written, 5% oral; individual).
• Assignment 7: Predictive and Prescriptive Analytics. Due ~ week 9/10. 20% (15% written and 5% oral; individual);
Coming weeks
• I will be out of town Friday March 21 and 28.
• On March 21 you will have a lab – attendance will be taken – to work on assignments (term project (6) and assignment 7). Your project proposals (Assignment 5) are due on March 18.
• On March 28 you will have a lecture on SVM; thus Tuesday March 25 will be a lab.
• Back to the regular schedule in April (except the 18th).
Admin info (keep/print this slide)
• Class: ITWS-4963/ITWS-6965
• Hours: 12:00pm-1:50pm Tuesday/Friday
• Location: SAGE 3101
• Instructor: Peter Fox
• Instructor contact: [email protected], 518.276.4862 (do not leave a msg)
• Contact hours: Monday** 3:00-4:00pm (or by email appt)
• Contact location: Winslow 2120 (sometimes Lally 207A, announced by email)
• TA: Lakshmi Chenicheri [email protected]
• Web site: http://tw.rpi.edu/web/courses/DataAnalytics/2014
  – Schedule, lectures, syllabus, reading, assignments, etc.