rm(list=ls()) library(grid) library(vcd) library(gnm) library(plotrix) library(TeachingDemos) library(lattice) library(ggplot2) library(rpart) library(rpart.plot) library(vcdExtra) library(plotmo) library(Matrix) library(RWeka) library(C50) library(randomForest) library(Formula) library(partykit) library(arules) library(caret) library(psych) source("library.R") generateData <- function(case, xs) { ys <- NULL switch(case, "1"={ ys <- xs }, "2"={ ys <- ifelse(xs<50, 0, 50) }, "3"={ ys <- jitter(xs, amount=3) }, "4"={ mid <- mean(xs) ys <- sqrt(mid^2 + (xs - mid)^2) - mid }, "5"={ ys <- xs * xs } ) ys } exploreLinearData <- function() { x <- seq(1:100) cases <- c(1) form <- respondents ~ x readline(prompt) for (case in cases) { respondents <- generateData(case, x) dat <- data.frame(x, respondents) model <- lm(form, data=dat) plot(x, respondents) abline(model) readline(prompt) rp0 <- rpart(form, data=dat) rpart.plot(rp0, type=0, ## extra=2, cex=1.5, box.palett=colors, main=sprintf("Case #%.0f", case)) readline(prompt) temp <- plotmo(rp0, ## nresponse=2, ## degree1=0, type2="image", col.image="red", main=sprintf("Case #%.0f", case) ) usr <- par("usr") points(rescale(x, c(usr[1], usr[2])), rescale(respondents, c(usr[3], usr[4])), col="green", cex=1) } } createImage <- function(model, imageFile) { tempFile <- tempfile() print(tempFile) write_to_dot(model, con=file(tempFile), "w") command <- sprintf("dot -Tpng %s -o %s", tempFile, imageFile) dumpObject(command) system(command) unlink(tempFile) } main <- function() { colors <- gray(seq(.6, 1,.05)) prompt <- "Press RETURN for next plot." data(Titanicp) form <- survived ~ pclass + age rp0 <- rpart(form, data=Titanicp) rpart.plot(rp0, type=0, extra=2, cex=1.5, box.palett=colors, main="Modeled likelihood of survival based on class and age. (Titanicp)") readline(prompt) plotmo(rp0, nresponse="survived", degree1=0, type2="image", col.image=colors, col.response=ifelse(Titanicp$survived=="died", "red", "blue"), pch=ifelse(Titanicp$survived=="died", 15, 16), main="Modeled likelihood of survival based on class and age. (Titanicp)") Titanic.Weights <- as.data.frame(Titanic) Titanic.df <- Titanic.Weights[rep(1:nrow(Titanic.Weights), Titanic.Weights$Freq),] Titanic.df <- Titanic.df[,1:4] dumpObject(table(Titanic.df$Survived)) EntropAll <- -(1490/2201) * log2(1490/2201) -(711/2201) * log2(711/2201) #= 0.9076514 dumpObject(table(Titanic.df$Sex,Titanic.df$Survived)) EntropM <- -(1364/1731) * log2(1364/1731) -(367/1731) * log2(367/1731) # = 0.745319 EntropF <- -(126/470) * log2(126/470) -(344/470) * log2(344/470) # = 0.8387034 EntropSex <- ( (1731/ 2201) * EntropM ) + ((470/ 2201) * EntropF ) # = 0.7652602 InformationGain <- EntropAll - EntropSex # = 0.1423912 dumpObject(InformationGain) data(AdultUCI) dumpObject(summary(AdultUCI)) ADULT = na.omit(AdultUCI)[,-c(3,5)] set.seed(123) TrainCases <- createDataPartition(ADULT$income, p = .5, list=F) TrainTemp <- ADULT[TrainCases,] AdultTrainSmallIncome <- TrainTemp[TrainTemp$income == "small",] AdultTrainLargeIncome <- TrainTemp[TrainTemp$income == "large",] Oversample <- sample(nrow(AdultTrainLargeIncome), nrow(AdultTrainSmallIncome), replace = TRUE) AdultTrain <- rbind(AdultTrainSmallIncome, AdultTrainLargeIncome[Oversample,]) AdultTest <- ADULT[-TrainCases,] # The unpruned tree, from p. 204 C45tree <- J48(income ~ . , data= AdultTrain, control= Weka_control(U=TRUE)) dumpObject(C45tree) dumpObject(summary(C45tree)) ## createImage(C45tree, "/home/chuck/Teaching/DataAnalysis/2017/Images/c45tree.png") Predictions <- data.frame(matrix(nrow = nrow(AdultTest), ncol=0)) Predictions$C45 <- predict(C45tree, AdultTest) # The pruned tree, from p. 205 C45pruned <- J48(income ~ . , data= AdultTrain, control= Weka_control(U=FALSE)) dumpObject(summary(C45pruned)) Predictions$C45pr <- predict(C45pruned, AdultTest) C50tree <- C5.0(y = AdultTrain$income, x = AdultTrain[,-13], Trials = 10) dumpObject(summary(C50tree)) TabC5.0<- as.table(matrix(c(10061,1980,714,10613), byrow=T, nrow = 2)) dumpObject(cohen.kappa(TabC5.0)) Predictions$C5.0 <- predict(C50tree, AdultTest) # CART, from p. 207 CARTtree <- rpart(income ~. , data= AdultTrain) dumpObject(summary(CARTtree)) readline(prompt) rpart.plot(CARTtree, extra = 1, box.palett=colors) ProbsCART <- predict(CARTtree, AdultTrain) PredictCART <- rep(0, nrow(ProbsCART)) PredictCART[ProbsCART[,1] <=.5] = "small" PredictCART[ProbsCART[,1] >.5] = "large" TabCART <- table(AdultTrain$income, PredictCART) dumpObject(TabCART) dumpObject(cohen.kappa(TabCART)) # Pruning, from p. 208 CARTtreePruned <- prune(CARTtree, cp=0.03) readline(prompt) rpart.plot(CARTtreePruned, extra = 1, box.palett=colors) ProbsCARTtest <- predict(CARTtreePruned, AdultTest) Predictions$CART[ProbsCARTtest[,1] <=.5] = "small" Predictions$CART[ProbsCARTtest[,1] >.5] = "large" # Random forests in R from p. 210 RF <- randomForest(y = AdultTrain$income, x = AdultTrain[,-13]) dumpObject(RF) Predictions$RF <- predict(RF, AdultTest) # Examining the predictions on the testing set, from p. 211 values <- data.frame(matrix(ncol = ncol(Predictions), nrow = 6)) rownames(values) <- c("True +", "True -", "False +", "False -", "Accuracy", "Kappa") names(values) <- names(Predictions) for (i in 1:ncol(Predictions)) { tab <- table(AdultTest$income,Predictions[,i]) values[1,i] <- tab[1,1] values[2,i] <- tab[2,2] values[3,i] <- tab[1,2] values[4,i] <- tab[2,1] values[5,i] <- sum(diag(tab))/sum(tab) values[6,i] <- cohen.kappa(tab)[1] } round(values,2) dumpObject(values) # Conditional inference trees in R, from p. 212 set.seed(999) TitanicRandom <- Titanic.df[sample(nrow(Titanic.df)),] TitanicTrain <- TitanicRandom[1:1100,] TitanicTest <- TitanicRandom[1101:2201,] CItree <- ctree(Survived ~ Class + Sex + Age, data=TitanicTrain) readline(prompt) plot(CItree) CIpredictTrain <- predict(CItree, TitanicTrain) CIpredictTest <- predict(CItree, TitanicTest) TabCI_Train <- table(TitanicTrain$Survived,CIpredictTrain) TabCI_Test <- table(TitanicTest$Survived,CIpredictTest) dumpObject(TabCI_Train) dumpObject(TabCI_Test) print("The program has ended.") } main()