Data Visualization with R Models and relationship between ...€¦ · Outline 2 Qualitative...

Post on 07-Jun-2020

2 views 0 download

Transcript of Data Visualization with R Models and relationship between ...€¦ · Outline 2 Qualitative...

Data Visualization with R, Models andrelationship between variables

Dhafer Malouche

http://dhafermalouche.net

Outline

2 Qualitative variables

2 Quantitative variables, corrplot package

ggfortify package, visualizing models

tabplot package

2 Qualitative variables

Contingency table> require(graphics)

> M <- as.table( cbind( c( 18,28,14), c( 20,51,28) , c( 12,25,9)))

> dimnames( M) <- list( Husband = c(" Tall", "Medium", "Short"), Wife = c(" Tall"," Medium", "Short"))

> M

Wife

Husband Tall Medium Short

Tall 18 20 12

Medium 28 51 25

Short 14 28 9

Mosaic plot> mosaicplot( M, col = c(" green", "red"),main = "Husband x Wife")

Mosaic plot

Mosaic plot> library(vcd)

> mosaic(M, shade=T,main = "Husband x Wife")

Mosaic plot

Radial or Radar chartsRadial or Radar charts are

called Spider or Web or Polar charts.

a way of comparing multiple quantitative variables.

are also useful for seeing which variables are scoring high or low within a dataset.

Method 1We use the following R function

> source('CreatRadialPlot.R') > CreateRadialPlot function (plot.data, axis.labels = colnames(plot.data)[-1], grid.min = -0.5, grid.mid = 0, grid.max = 0.5, centre.y = grid.min - ((1/9) * (grid.max - grid.min)), plot.extent.x.sf = 1.2, plot.extent.y.sf = 1.2, x.centre.range = 0.02 * (grid.max - centre.y), label.centre.y = FALSE, grid.line.width = 0.5, gridline.min.linetype = "longdash", gridline.mid.linetype = "longdash", gridline.max.linetype = "longdash", gridline.min.colour = "grey", gridline.mid.colour = "blue", gridline.max.colour = "grey", grid.label.size = 4, gridline.label.offset = -0.02 * (grid.max - centre.y), label.gridline.min = TRUE, axis.label.offset = 1.15, axis.label.size = 3, axis.line.colour = "grey", group.line.width = 1, group.point.size = 4, background.circle.colour = "yellow", background.circle.transparency = 0.2, plot.legend = if (nrow(plot.data) > 1) TRUE else FALSE, legend.title = "Cluster", legend.text.size = grid.label.size) { var.names <- colnames(plot.data)[-1] plot.extent.x = (grid.max + abs(centre.y)) * plot.extent.x.sf plot.extent.y = (grid.max + abs(centre.y)) * plot.extent.y.sf if (length(axis.labels) != ncol(plot.data) - 1) return("Error: 'axis.labels' contains the wrong number of axis labels") if (min(plot.data[, -1]) < centre.y) return("Error: plot.data' contains value(s) < centre.y") if (max(plot.data[, -1]) > grid.max) return("Error: 'plot.data' contains value(s) > grid.max") CalculateGroupPath <- function(df) { path <- as.factor(as.character(df[, 1])) angles = seq(from = 0, to = 2 * pi, by = (2 * pi)/(ncol(df) - 1)) graphData = data.frame(seg = "", x = 0, y = 0) graphData = graphData[-1, ] for (i in levels(path)) { pathData = subset(df, df[, 1] == i) for (j in c(2:ncol(df))) { graphData = rbind(graphData, data.frame(group = i, x = pathData[, j] * sin(angles[j - 1]), y = pathData[, j] * cos(angles[j - 1]))) } graphData = rbind(graphData, data.frame(group = i, x = pathData[, 2] * sin(angles[1]), y = pathData[, 2] * cos(angles[1]))) } colnames(graphData)[1] <- colnames(df)[1] graphData } CaclulateAxisPath = function(var.names, min, max) { n.vars <- length(var.names) angles <- seq(from = 0, to = 2 * pi, by = (2 * pi)/n.vars) min.x <- min * sin(angles) min.y <- min * cos(angles) max.x <- max * sin(angles)

max.y <- max * cos(angles) axisData <- NULL for (i in 1:n.vars) { a <- c(i, min.x[i], min.y[i]) b <- c(i, max.x[i], max.y[i]) axisData <- rbind(axisData, a, b) }

i " i " " " " "

colnames(axisData) <- c("axis.no", "x", "y") rownames(axisData) <- seq(1:nrow(axisData)) as.data.frame(axisData) } funcCircleCoords <- function(center = c(0, 0), r = 1, npoints = 100) { tt <- seq(0, 2 * pi, length.out = npoints) xx <- center[1] + r * cos(tt) yy <- center[2] + r * sin(tt) return(data.frame(x = xx, y = yy)) } plot.data.offset <- plot.data plot.data.offset[, 2:ncol(plot.data)] <- plot.data[, 2:ncol(plot.data)] + abs(centre.y) group <- NULL group$path <- CalculateGroupPath(plot.data.offset) axis <- NULL axis$path <- CaclulateAxisPath(var.names, grid.min + abs(centre.y), grid.max + abs(centre.y)) axis$label <- data.frame(text = axis.labels, x = NA, y = NA) n.vars <- length(var.names) angles = seq(from = 0, to = 2 * pi, by = (2 * pi)/n.vars) axis$label$x <- sapply(1:n.vars, function(i, x) { ((grid.max + abs(centre.y)) * axis.label.offset) * sin(angles[i]) }) axis$label$y <- sapply(1:n.vars, function(i, x) { ((grid.max + abs(centre.y)) * axis.label.offset) * cos(angles[i]) }) gridline <- NULL gridline$min$path <- funcCircleCoords(c(0, 0), grid.min + abs(centre.y), npoints = 360) gridline$mid$path <- funcCircleCoords(c(0, 0), grid.mid + abs(centre.y), npoints = 360) gridline$max$path <- funcCircleCoords(c(0, 0), grid.max + abs(centre.y), npoints = 360) gridline$min$label <- data.frame(x = gridline.label.offset, y = grid.min + abs(centre.y), text = as.character(grid.min)) gridline$max$label <- data.frame(x = gridline.label.offset, y = grid.max + abs(centre.y), text = as.character(grid.max)) gridline$mid$label <- data.frame(x = gridline.label.offset, y = grid.mid + abs(centre.y), text = as.character(grid.mid)) theme_clear <- theme_bw() + theme(legend.position = "bottom", axis.text.y = element_blank(), axis.text.x = element_blank(), axis.ticks = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank(), legend.key = element_rect(linetype = "blank")) if (plot.legend == FALSE) theme_clear <- theme_clear + theme(legend.position = "none") base <- ggplot(axis$label) + xlab(NULL) + ylab(NULL) + coord_equal() + geom_text(data = subset(axis$label, axis$label$x < (-x.centre.range)), aes(x = x, y = y, label = text), size = axis.label.size, hjust = 1) + scale_x_continuous(limits = c(-plot.extent.x, plot.extent.x)) + scale_y_continuous(limits = c(-plot.extent.y, plot.extent.y)) base <- base + geom_text(data = subset(axis$label, abs(axis$label$x) <= x.centre.range), aes(x = x, y = y, label = text), size = axis.label.size, hjust = 0.5) base <- base + geom_text(data = subset(axis$label, axis$label$x > x.centre.range), aes(x = x, y = y, label = text), size = axis.label.size, hjust = 0)

base <- base + theme_clear base <- base + geom_polygon(data = gridline$max$path, aes(x, y), fill = background.circle.colour, alpha = background.circle.transparency) base <- base + geom_path(data = axis$path, aes(x = x, y = y, group = axis.no), colour = axis.line.colour) base <- base + geom_path(data = group$path, aes(x = x, y = y, group = group, colour = group), size = group.line.width)

i $

base <- base + geom_point(data = group$path, aes(x = x, y = y, group = group, colour = group), size = group.point.size) if (plot.legend == TRUE) base <- base + labs(colour = legend.title, size = legend.text.size) base <- base + geom_path(data = gridline$min$path, aes(x = x, y = y), lty = gridline.min.linetype, colour = gridline.min.colour, size = grid.line.width) base <- base + geom_path(data = gridline$mid$path, aes(x = x, y = y), lty = gridline.mid.linetype, colour = gridline.mid.colour, size = grid.line.width) base <- base + geom_path(data = gridline$max$path, aes(x = x, y = y), lty = gridline.max.linetype, colour = gridline.max.colour, size = grid.line.width) if (label.gridline.min == TRUE) { base <- base + geom_text(aes(x = x, y = y, label = text), data = gridline$min$label, fontface = "bold", size = grid.label.size, hjust = 1) } base <- base + geom_text(aes(x = x, y = y, label = text), data = gridline$mid$label, fontface = "bold", size = grid.label.size, hjust = 1) base <- base + geom_text(aes(x = x, y = y, label = text), data = gridline$max$label, fontface = "bold", size = grid.label.size, hjust = 1) if (label.centre.y == TRUE) { centre.y.label <- data.frame(x = 0, y = 0, text = as.character(centre.y)) base <- base + geom_text(aes(x = x, y = y, label = text), data = centre.y.label, fontface = "bold", size = grid.label.size, hjust = 0.5) } return(base) }

ExampleSchool dropout in Tunisia

> library(ggplot2) > source('CreatRadialPlot.R') > df <- read.csv("drop_out_school_tunisia.csv") > df X group bizerte siliana monastir mahdia tunis sfax National 1 xFemale Female 7.575758 5.952381 11.83432 6.569343 12.328767 3.960396 8.253968 2 xmale Male 14.285714 6.363636 23.00000 6.206897 6.024096 5.343511 11.473272 3 xAll All 11.585366 6.185567 17.88618 6.382979 8.974359 4.741379 10.021475 > df <- df[,-1]

Example> CreateRadialPlot(df,grid.label.size = 5, + axis.label.size = 4,group.line.width = 2, + plot.extent.x.sf = 1.5, + background.circle.colour = 'gray', + grid.max = 26, + grid.mid = round(df[3,8],1), + grid.min = 4.5, + axis.line.colour = 'black', + legend.title = '')

Example

Method 2 with fmsb package> library(fmsb) > > # Create data: note in High school for several students > set.seed(99) > data=as.data.frame(matrix( sample( 0:20 , 15 , replace=F) , ncol=5)) > colnames(data)=c("math" , "english" , "biology" , + "music" , "R-coding" ) > rownames(data)=paste("mister" , letters[1:3] , sep="-") > # We add 2 lines to the dataframe: the max and min of each > # topic to show on the plot! > data=rbind(rep(20,5) , rep(0,5) , data)

> data math english biology music R-coding 1 20 20 20 20 20 2 0 0 0 0 0 mister-a 12 17 10 19 1 mister-b 2 9 4 6 16 mister-c 13 15 18 5 20

Radar> colors_border=c( rgb(0.2,0.5,0.5,0.9), + rgb(0.8,0.2,0.5,0.9) , + rgb(0.7,0.5,0.1,0.9) ) > colors_in=c( rgb(0.2,0.5,0.5,0.4), + rgb(0.8,0.2,0.5,0.4) , + rgb(0.7,0.5,0.1,0.4) ) > radarchart( data , axistype=1 , + #custom polygon + pcol=colors_border , pfcol=colors_in , plwd=4 , plty=1, + #custom the grid + cglcol="grey", cglty=1, axislabcol="grey", + caxislabels=seq(0,20,5), cglwd=0.8, + #custom labels + vlcex=0.8 + ) > legend(x=0.7, y=1, + legend = rownames(data[-c(1,2),]), + bty = "n", pch=20 , + col=colors_in , text.col = "grey", cex=1.2, pt.cex=3)

Radar

googleVis package

Preparing data> library(googleVis) > op <- options(gvis.plot.tag="chart") > df1=t(df[,-1]) > df1=cbind.data.frame(colnames(df[,-1]),df1) > > colnames(df1)=c("Gouvernorat","Female","Male","All") > df1[,-1]=round(df1[,-1],2) > df1 Gouvernorat Female Male All bizerte bizerte 7.58 14.29 11.59 siliana siliana 5.95 6.36 6.19 monastir monastir 11.83 23.00 17.89 mahdia mahdia 6.57 6.21 6.38 tunis tunis 12.33 6.02 8.97 sfax sfax 3.96 5.34 4.74 National National 8.25 11.47 10.02

Preparing data> Bar <- gvisBarChart(df1,xvar="Gouvernorat", + yvar=c("Female","Male","All"), + options=list(width=1250, height=700, + title="Drop out School Rate", + titleTextStyle="{color:'red',fontName:'Courier',fontSize:16}", + bar="{groupWidth:'100%'}", + hAxis="{format:'#,#%'}")) > plot(Bar)

Preparing data

With some options> Bar <- gvisLineChart(df1[,-4],xvar="Gouvernorat", + options=list(width=1250, height=700, + title="Drop out School Rate", + titleTextStyle="{color:'red',fontName:'Courier',fontSize:16}", + bar="{groupWidth:'100%'}", + vAxis="{format:'#,#%'}")) > plot(Bar)

With some options

2 Quantitative variables, corrplotpackage

Displaying correlation matrix

CI of correlations

Test of Independences between a set of variables

With circles> library(corrplot) > data(mtcars) > head(mtcars) > M <- cor(mtcars) > corrplot(M, method = "circle")

With circles

With squares> corrplot(M, method = "square")

With squares

With ellipses> corrplot(M, method = "ellipse")

With ellipses

With numbers> corrplot(M, method = "number")

With numbers

With pies> corrplot(M, method = "pie")

With pies

Only upper matrix

> corrplot(M, type = "upper")

Only upper matrix

Ellipses and numbers> corrplot.mixed(M, lower = "ellipse", upper = "number")

Ellipses and numbers

Reordering variablesCharacter, the ordering method of the correlation matrix.

“original” for original order (default).

“AOE” for the angular order of the eigenvectors.

“FPC” for the first principal component order.

“hclust” for the hierarchical clustering order.

“alphabet” for alphabetical order.

h-clust> corrplot(M, order = "hclust")

h-clust

Showing clusters with rectangles> corrplot(M, order = "hclust",addrect = 3)

Showing clusters with rectangles

Customizing the plot

Colors> mycol <- colorRampPalette(c("red", "white", "blue")) > corrplot(M, order = "hclust",addrect = 2,col=mycol(50))

Colors

Background> wb <- c("white", "black") > corrplot(M, order = "hclust", + addrect = 2, + col = wb, bg = "gold2")

Background

An R code for Independence hypothesistesting

> cor.mtest <- function(mat, conf.level = 0.95) { + mat <- as.matrix(mat) + n <- ncol(mat) + p.mat <- lowCI.mat <- uppCI.mat <- matrix(NA, n, n) + diag(p.mat) <- 0 + diag(lowCI.mat) <- diag(uppCI.mat) <- 1 + for (i in 1:(n - 1)) { + for (j in (i + 1):n) { + tmp <- cor.test(mat[, i], mat[, j], conf.level = conf.level) + p.mat[i, j] <- p.mat[j, i] <- tmp$p.value + lowCI.mat[i, j] <- lowCI.mat[j, i] <- tmp$conf.int[1] + uppCI.mat[i, j] <- uppCI.mat[j, i] <- tmp$conf.int[2] + } + } + return(list(p.mat, lowCI.mat, uppCI.mat)) + }

Independence hypothesis testing> res1 <- cor.mtest(mtcars, 0.95) > res1[[1]][1:3,1:3] [,1] [,2] [,3] [1,] 0.000000e+00 6.112687e-10 9.380327e-10 [2,] 6.112687e-10 0.000000e+00 1.802838e-12 [3,] 9.380327e-10 1.802838e-12 0.000000e+00 > res1[[2]][1:3,1:3] [,1] [,2] [,3] [1,] 1.0000000 -0.9257694 -0.9233594 [2,] -0.9257694 1.0000000 0.8072442 [3,] -0.9233594 0.8072442 1.0000000

Adding p-values> corrplot(M, p.mat = res1[[1]], sig.level = 0.1)

Non-significant independencecorrelations with a X

Or

> corrplot(M, p.mat = res1[[1]], sig.level = 0.01)

Or

Writing p-values> corrplot(M, p.mat = res1[[1]], sig.level = 0.01,insig = "p-value")

Writing p-values

Displaying white squares instead of p-values

> corrplot(M, p.mat = res1[[1]], sig.level = 0.01,insig = "blank")

Displaying squares instead of p-values

Use xtable R package to display nicecorrelation table in html format

> library(xtable) > mcor<-round(cor(mtcars),2) > upper<-mcor > upper[upper.tri(mcor)]<-"" > upper<-as.data.frame(upper)

Use xtable R package to display nicecorrelation table in html format

> print(xtable(upper), type="html")

Use xtable R package to display nicecorrelation table in html format

mpg cyl disp hp drat wt qsec vs am gear carbmpg 1

cyl -0.85 1

disp -0.85 0.9 1

hp -0.78 0.83 0.79 1

drat 0.68 -0.7 -0.71 -0.45 1

wt -0.87 0.78 0.89 0.66 -0.71 1

qsec 0.42 -0.59 -0.43 -0.71 0.09 -0.17 1

vs 0.66 -0.81 -0.71 -0.72 0.44 -0.55 0.74 1

am 0.6 -0.52 -0.59 -0.24 0.71 -0.69 -0.23 0.17 1

gear 0.48 -0.49 -0.56 -0.13 0.7 -0.58 -0.21 0.21 0.79 1

carb -0.55 0.53 0.39 0.75 -0.09 0.43 -0.66 -0.57 0.06 0.27 1

Combine matrix of correlationcoefficients and significance levels

We use corstar function

> # x is a matrix containing the data > # method : correlation method. "pearson"" or "spearman"" is supported > # removeTriangle : remove upper or lower triangle > # results : if "html" or "latex" > # the results will be displayed in html or latex format > corstars <-function(x, method=c("pearson", "spearman"),

+ removeTriangle=c("upper", "lower"),

+ result=c("none", "html", "latex")){

+ #Compute correlation matrix + require(Hmisc)

+ x <- as.matrix(x)

+ correlation_matrix<-rcorr(x, type=method[1])

+ R <- correlation_matrix$r # Matrix of correlation coeficients + p <- correlation_matrix$P # Matrix of p-value +

+ # Define notions for significance levels; spacing is important. + mystars <- ifelse(p < .0001, "****", ifelse(p < .001, "*** ", ifelse(p < .01, "** ", ifelse(p < .05, "* ", " "

+

+ # trunctuate the correlation matrix to two decimal + R <- format(round(cbind(rep(-1.11, ncol(x)), R), 2))[,-1]

+

+ # build a new matrix that includes the correlations with their apropriate stars + Rnew <- matrix(paste(R, mystars, sep=""), ncol=ncol(x))

+ diag(Rnew) <- paste(diag(R), " ", sep="")

+ rownames(Rnew) <- colnames(x)

+ colnames(Rnew) <- paste(colnames(x), "", sep="")

+

+ # remove upper triangle of correlation matrix + if(removeTriangle[1]=="upper"){

+ Rnew <- as.matrix(Rnew)

+ Rnew[upper.tri(Rnew, diag = TRUE)] <- ""

+ Rnew <- as.data.frame(Rnew)

+ }

+

+ # remove lower triangle of correlation matrix + else if(removeTriangle[1]=="lower"){

+ Rnew <- as.matrix(Rnew)

+ Rnew[lower.tri(Rnew, diag = TRUE)] <- ""

+ Rnew <- as.data.frame(Rnew)

+ }

+

+ # remove last column and return the correlation matrix + Rnew <- cbind(Rnew[1:length(Rnew)-1])

+ if (result[1]=="none") return(Rnew)

+ else{

+ if(result[1]=="html") print(xtable(Rnew), type="html")

+ else print(xtable(Rnew), type="latex")

+ }

+ }

>

Combine matrix of correlationcoefficients and significance levels

> corstars(mtcars[,1:7], + result="html")

Combine matrix of correlationcoefficients and significance levels

mpg cyl disp hp drat wtmpg

cyl -0.85****

disp -0.85**** 0.90****

hp -0.78**** 0.83**** 0.79****

drat 0.68**** -0.70**** -0.71**** -0.45**

wt -0.87**** 0.78**** 0.89**** 0.66**** -0.71****

qsec 0.42* -0.59*** -0.43* -0.71**** 0.09 -0.17

My Shiny app: Visulazing Correlationmatrix

https://dhafer.shinyapps.io/CorrMatrixViz

Correlation Matrix: Visualization and Independence testsUpload your CSV File

Variables to use:

Browse...Visualization Pearson-Indepence Test

Active Data

Single represensation Mixed represensation

Customizingthe graphCorrelation type

Method

Type

Ordering

Position of text

labels.

pearson

circle

full

original

left and top

No file selected

ggfortify package, visualizing models

Time series> library(ggfortify) > head(AirPassengers) [1] 112 118 132 129 121 135 > class(AirPassengers) [1] "ts"

> autoplot(AirPassengers)

Time serie

Times series, customizing> p <- autoplot(AirPassengers) > p + ggtitle('AirPassengers') + xlab('Year') + ylab('Passengers')

Times series, customizing

Clustering> set.seed(1) > head(iris) Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 5.1 3.5 1.4 0.2 setosa 2 4.9 3.0 1.4 0.2 setosa 3 4.7 3.2 1.3 0.2 setosa 4 4.6 3.1 1.5 0.2 setosa 5 5.0 3.6 1.4 0.2 setosa 6 5.4 3.9 1.7 0.4 setosa

> p <- autoplot(kmeans(iris[-5], 3), data = iris) > p

Clustering

PCA> df <- iris[c(1, 2, 3, 4)] > autoplot(prcomp(df))

PCA

PCA, by showing groups! Convexes> autoplot(prcomp(df), + data = iris, + colour = 'Species', + shape = FALSE, + label.size = 3, frame=T)

PCA, by showing groups! Convexes

Biplot for a PCA> autoplot(prcomp(df), data = iris, colour = 'Species', + loadings = TRUE, loadings.colour = 'blue', + loadings.label = TRUE, loadings.label.size = 3)

Biplot for a PCA

Regression diagnostic> m <- lm(Petal.Width ~ Petal.Length, data = iris) > autoplot(m, which = 1:6, colour = 'dodgerblue3', + smooth.colour = 'black', + smooth.linetype = 'dashed', + ad.colour = 'blue', + label.size = 3, label.n = 5, label.colour = 'blue', + ncol = 3)

Regression diagnostic

Local Fisher Discriminant Analysis> library(lfda) > model <- lfda(x = iris[-5], y = iris[, 5], r = 3, metric="plain") > autoplot(model, + data = iris, + frame = TRUE, + frame.colour = 'Species')

Local Fisher Discriminant Analysis

tabplot package

Data> require(ggplot2)

> data(diamonds)

> head(diamonds)

# A tibble: 6 x 10 carat cut color clarity depth table price x y z

<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>

1 0.230 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43

2 0.210 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31

3 0.230 Good E VS1 56.9 65.0 327 4.05 4.07 2.31

4 0.290 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63

5 0.310 Good J SI2 63.3 58.0 335 4.34 4.35 2.75

6 0.240 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48

> summary(diamonds)

carat cut color clarity depth table price x

Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065 Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.0

1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.7

Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194 Median :61.80 Median :57.00 Median : 2401 Median : 5.7

Mean :0.7979 Premium :13791 G:11292 VS1 : 8171 Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.7

3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.5

Max. :5.0100 I: 5422 VVS1 : 3655 Max. :79.00 Max. :95.00 Max. :18823 Max. :10.7

J: 2808 (Other): 2531

Exploring Data> require(tabplot) > tableplot(diamonds)

Exploring Data

Exploring Data, how it works?> tableplot(diamonds, nBins=2,select =c(carat,color))

Exploring Data, how it works?

Exploring Data, how it works?> x=tableplot(diamonds, nBins=2,select =c(carat,color),decreasing = T) > names(x) > x$columns$carat$mean > z=sort(diamonds$carat,d=T) > dim(diamonds) > mean(z[1:26970]) > mean(z[26971:53940]) > x$columns$color$widths > y=diamonds$color[order(diamonds$carat,decreasing = T)] > prop.table(table(y[1:26940])) > prop.table(table(y[26971:53940]))

Exploring Data, how it works?

[1] "dataset" "select" "subset" "nBins" "binSizes" "sortCol" "decreasing" "fro

[1] 1.1723063 0.4235732

[1] 53940 10

[1] 1.172306

[1] 0.4235732

[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]

[1,] 0.09492028 0.1389692 0.1665925 0.2012236 0.1852799 0.13388951 0.07912495 0

[2,] 0.15628476 0.2242862 0.1872080 0.2174638 0.1226177 0.06714868 0.02499073 0

D E F G H I J

0.09561990 0.13938382 0.16674091 0.20100223 0.18511507 0.13333333 0.07880475

D E F G H I J

0.15550612 0.22376715 0.18694846 0.21757508 0.12298851 0.06781609 0.02539859

Missing values> # add some NA's > diamonds2=diamonds > diamonds2$price[which(diamonds2$cut == "Ideal")]<-NA > diamonds2$cut[diamonds2$depth>65]=NA > tableplot(diamonds2,colorNA = "black")

Missing values

Zooming on data,> tableplot(diamonds, nBins=5, select = c(carat, price, cut, color, clarity), sortCol = price, + from = 0, to = 5)

Zooming on data,

Filtering data> tableplot(diamonds, subset = price < 5000 & cut == "Premium")

Filtering data

Change colors> tableplot(diamonds, pals = list(cut="Set1(6)", color="Set5", clarity=rainbow(8)))

Change colors

Preprocessing of Large data> # create large dataset > large_diamonds <- diamonds[rep(seq.int(nrow(diamonds)), 10),] > > system.time({ + p <- tablePrepare(large_diamonds) + }) user system elapsed 1.287 0.758 2.301