Machine Learning
preparing data
factors to dummy variables
https://www.youtube.com/watch?v=7rgzCjrIA-o
# Convert factor/character columns to dummy (one-hot) variables
# with caret::dummyVars.
library(caret)

customers <- data.frame(
  id = c(10, 20, 30, 40, 50),
  gender = c("male", "female", "female", "male", "female"),
  mood = c("happy", "sad", "happy", "sad", "happy"),
  outcome = c(1, 1, 0, 0, 0)
)

# fullRank = TRUE drops one level per factor so the resulting design
# matrix has no perfectly correlated (linearly dependent) columns.
dmy <- dummyVars(formula = "~ .", data = customers, fullRank = TRUE)
trsf <- data.frame(predict(dmy, newdata = customers))
trsf
sparse model matrix
# Build a small, mostly-zero example data set (10 predictors + outcome)
# and show its sparse-matrix representation.
some_dataframe <- read.table(text = "c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
2 7 0 0 0 0 0 0 0 0 0
0 0 3 0 0 0 0 0 0 0 0
0 0 0 6 1 0 0 0 0 0 0
0 0 0 2 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 12 0 1
0 0 0 0 0 25 0 0 0 0 1
1 0 0 0 2 0 0 0 0 0 0
0 0 0 2 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 14 0 1
0 0 0 0 0 21 0 0 0 0 1
0 0 0 0 0 0 28 0 0 0 1
0 0 0 0 0 0 0 35 0 0 1
0 0 0 0 0 0 0 0 42 0 1
0 0 0 0 0 0 0 0 0 49 1",
header = TRUE, sep = "")

library(Matrix)

# Coerce the predictor columns (all but 'outcome') to a numeric matrix.
some_matrix <- data.matrix(some_dataframe[1:10])

# Show the sparse (dgCMatrix) representation of the data set:
# only the non-zero entries are stored.
Matrix(some_matrix, sparse = TRUE)
# Split the data set into train (70%) and test (30%) portions.
set.seed(2)
split <- sample(nrow(some_dataframe), floor(0.7 * nrow(some_dataframe)))
train <- some_dataframe[split, ]
test <- some_dataframe[-split, ]

# Transform both sets into sparse matrices using sparse.model.matrix.
train_sparse <- sparse.model.matrix(~ ., train[1:10])
test_sparse <- sparse.model.matrix(~ ., test[1:10])

# Model the sparse sets using glmnet (accepts dgCMatrix input directly).
library(glmnet)
fit <- glmnet(train_sparse, train[, 11])

# Use cv.glmnet to find the best lambda/penalty;
# 's' is the penalty parameter passed to predict() below.
cv <- cv.glmnet(train_sparse, train[, 11], nfolds = 3)
pred <- predict(fit, test_sparse, type = "response", s = cv$lambda.min)

# Receiver operating characteristic (ROC) curve and AUC.
library(pROC)
# predict() returns a one-column matrix; roc() expects a numeric vector.
auc <- roc(test[, 11], as.vector(pred))
print(auc$auc)
# Example with a categorical column ('mood'): the model.matrix machinery
# coerces character columns to factors and binarizes them into dummy columns.
cat_dataframe <- read.table(text = "c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 mood outcome
2 7 0 0 0 0 0 0 0 0 happy 0
0 0 3 0 0 0 0 0 0 0 happy 0
0 0 0 6 1 0 0 0 0 0 happy 0
0 0 0 2 0 0 0 0 0 0 happy 0
0 0 0 0 0 0 0 0 12 0 sad 1
0 0 0 0 0 25 0 0 0 0 sad 1
1 0 0 0 2 0 0 0 0 0 happy 0
0 0 0 2 0 0 0 0 0 0 happy 0
0 0 0 0 0 0 0 14 0 0 sad 1
0 0 0 0 0 21 0 0 0 0 sad 1
0 0 0 0 0 0 28 0 0 0 sad 1
0 0 0 0 0 0 0 35 0 0 sad 1
0 0 0 0 0 0 0 0 42 0 sad 1
0 0 0 0 0 0 0 0 0 49 sad 1",
header = TRUE, sep = "")

print(sparse.model.matrix(~ ., cat_dataframe))
# increasing the number of levels in the mood variable
cat_dataframe <- read.table(text = "c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 mood outcome
2 7 0 0 0 0 0 0 0 0 angry 0
0 0 3 0 0 0 0 0 0 0 neutral 0
0 0 0 6 1 0 0 0 0 0 happy 0
0 0 0 2 0 0 0 0 0 0 happy 0
0 0 0 0 0 0 0 0 12 0 sad 1
0 0 0 0 0 25 0 0 0 0 sad 1
1 0 0 0 2 0 0 0 0 0 happy 0
0 0 0 2 0 0 0 0 0 0 happy 0
0 0 0 0 0 0 0 0 14 0 sad 1
0 0 0 0 0 21 0 0 0 0 neutral 1
0 0 0 0 0 0 28 0 0 0 sad 1
0 0 0 0 0 0 0 35 0 0 sad 1
0 0 0 0 0 0 0 0 42 0 sad 1
0 0 0 0 0 0 0 0 0 49 sad 1", header = TRUE, sep = "")

# Since R 4.0, read.table keeps strings as character (stringsAsFactors
# defaults to FALSE), so coerce to factor before asking for levels.
print(levels(factor(cat_dataframe$mood)))
dim(cat_dataframe)

# sparse.model.matrix adds one extra column per additional mood level
# when binarizing the categorical variable.
dim(sparse.model.matrix(~ ., cat_dataframe))
# '-1' removes the intercept so every factor level gets its own column.
colnames(sparse.model.matrix(~ . - 1, cat_dataframe))
partition with maxDissim
https://r-forge.r-project.org/scm/viewvc.php/*checkout*/www/splitting.html?revision=828&root=caret
# Maximum-dissimilarity sampling with caret::maxDissim, demonstrated
# on two scaled predictors from the BostonHousing data set.
library(caret)
library(mlbench)
data(BostonHousing)

testing <- scale(BostonHousing[, c("age", "nox")])

set.seed(11)
# Draw 5 random row indices as the starting set.
startSet <- sample(1:dim(testing)[1], 5)
start <- testing[startSet, ]
samplePool <- testing[-startSet, ]

# Grow the starting set by 20 points chosen to be maximally
# dissimilar from those already selected. randomFrac is in (0, 1];
# smaller values pull the selection towards the interior of the data.
newSamp <- maxDissim(start, samplePool, n = 20, randomFrac = 0.5)
head(newSamp)
Feature hashing example.
# Feature Hashing
# http://amunategui.github.io/feature-hashing/#sourcecode

# Download the diabetes data set. Base download.file with mode = "wb"
# (binary) replaces the RCurl getBinaryURL + writeBin + close dance.
download.file(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip",
  destfile = "dataset_diabetes.zip", mode = "wb")

# Unzip and read the diabetes file.
files <- unzip("dataset_diabetes.zip")
diabetes <- read.csv(files[1], stringsAsFactors = FALSE)

# Quick look at the data.
str(diabetes)

# Drop identifier variables that carry no predictive signal.
diabetes <- subset(diabetes, select = -c(encounter_id, patient_nbr))

# Mark all "?" entries as missing (they are recoded to 0 further down).
diabetes[diabetes == "?"] <- NA

# Remove zero-variance columns - ty James
# http://stackoverflow.com/questions/8805298/quickly-remove-zero-variance-variables-from-a-data-frame
diabetes <- diabetes[sapply(diabetes,
                            function(x) length(levels(factor(x, exclude = NULL))) > 1)]

# Binarize the outcome: readmitted within 30 days -> 1, otherwise 0.
diabetes$readmitted <- ifelse(diabetes$readmitted == "<30", 1, 0)

# Generalize outcome name.
outcomeName <- "readmitted"

# Large factors to deal with (hundreds of diagnosis codes).
length(unique(diabetes$diag_1))
length(unique(diabetes$diag_2))
length(unique(diabetes$diag_3))

diabetes_hash <- diabetes
predictorNames <- setdiff(names(diabetes_hash), outcomeName)

# Change all NAs to 0.
diabetes_hash[is.na(diabetes_hash)] <- 0

# 50/50 train/test split.
set.seed(1234)
split <- sample(nrow(diabetes_hash), floor(0.5 * nrow(diabetes_hash)))
objTrain <- diabetes_hash[split, ]
objTest <- diabetes_hash[-split, ]

# install.packages("FeatureHashing")  # run once, BEFORE library()
library(FeatureHashing)

# Hash all predictors into a fixed 2^12-column sparse design matrix.
objTrain_hashed <- hashed.model.matrix(~ ., data = objTrain[, predictorNames],
                                       hash.size = 2^12, transpose = FALSE)
objTrain_hashed <- as(objTrain_hashed, "dgCMatrix")
objTest_hashed <- hashed.model.matrix(~ ., data = objTest[, predictorNames],
                                      hash.size = 2^12, transpose = FALSE)
objTest_hashed <- as(objTest_hashed, "dgCMatrix")

library(glmnet)
glmnetModel <- cv.glmnet(objTrain_hashed, objTrain[, outcomeName],
                         family = "binomial", type.measure = "auc")

# Score the hashed model on the held-out set.
glmnetPredict <- predict(glmnetModel, objTest_hashed, s = "lambda.min")
library(pROC)
# predict() returns a one-column matrix; auc() expects a numeric vector.
auc(objTest[, outcomeName], as.vector(glmnetPredict))
https://youtu.be/vG3-yCyPNDQ, https://youtu.be/oHSMZk3Ynzg https://en.wikipedia.org/wiki/Feature_hashing http://amunategui.github.io/feature-hashing/
xgboost and caret
random search
# Random-search hyperparameter tuning of xgboost via caret,
# using adaptive resampling to drop poor candidates early.
data(iris)

# NOTE: caret's method string is case-sensitive: "adaptive_cv".
fitControl <- trainControl(method = "adaptive_cv",
                           number = 10,
                           repeats = 10,
                           search = "random",
                           adaptive = list(min = 5,     # must be < number of resamples
                                           alpha = 0.05,
                                           method = "gls",  # "gls" or "BT"
                                           complete = TRUE))

model <- train(factor(Species) ~ .,
               data = iris,
               method = "xgbTree",
               preProc = c("center", "scale"),
               trControl = fitControl,
               metric = "Kappa",
               stratified = TRUE,
               tuneLength = 50,
               allowParallel = TRUE)
print(model)

# predict.train takes 'newdata' (a data frame of predictors), not 'data';
# a 'data =' argument is silently swallowed by '...' and the call would
# return training-set predictions by accident.
prediction <- predict(model, newdata = iris)
table(prediction, iris$Species)
plot(prediction, iris$Species)
plot(model)

# Plot the relationship between tuning parameters and performance.
require(ggplot2)
ggplot(model) +
  geom_smooth(se = TRUE, span = .8, method = loess) +
  theme(legend.position = "top")

plot(varImp(model))

modelvalues <- data.frame(obs = iris$Species, pred = prediction)
defaultSummary(modelvalues)

# If the outcome were numerical (regression), residual diagnostics
# would look like this. Note: 'dev' is not defined in this script
# (leftover from an mtcars-style example), so the plots are commented out.
residuals <- resid(model)
predictedValues <- predict(model)
# plot(dev$mpg, residuals)
# abline(0, 0)
# plot(dev$mpg, predictedValues)
using an explicit tuning grid
explore how to set up ensembles http://www.r-bloggers.com/caretensemble-classification-example/ explore adaptive resampling http://topepo.github.io/caret/adaptive.html
# Grid-search hyperparameter tuning of xgboost via caret.
require(BradleyTerry2) # install.packages("BradleyTerry2") - needed when adaptive method = "BT"
library(caret); library(xgboost)

data(iris)

# caret's method string is case-sensitive: "repeatedcv".
# search = "random" is ignored whenever an explicit tuneGrid is supplied,
# so it is omitted here.
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 5)

# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
xgbGrid <- expand.grid(
  eta = 0.3,                  # typical 0.01-0.2, default 0.3
  max_depth = c(2, 3, 4),     # typical 3-10, default 6. IMPORTANT2
  nrounds = seq(100, 250, 25),
  gamma = 0,                  # default = 0
  colsample_bytree = 0.8,     # typical values 0.5-1, default = 1, start 0.8
  min_child_weight = 2        # default = 1; too high -> underfitting.
                              # use scale_pos_weight = 1 if class imbalance. IMPORTANT3
)

model <- train(factor(Species) ~ .,
               data = iris,
               method = "xgbTree",
               preProc = c("center", "scale"),
               trControl = fitControl,
               metric = "Kappa",
               stratified = TRUE,
               tuneGrid = xgbGrid,
               allowParallel = TRUE)
print(model)

# predict.train takes 'newdata' (a data frame of predictors), not 'data'.
prediction <- predict(model, newdata = iris)
table(prediction, iris$Species)
plot(prediction, iris$Species)
plot(model)
plot(varImp(model, scale = FALSE))
https://www.r-bloggers.com/r-setup-a-grid-search-for-xgboost/ https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd https://cran.r-project.org/web/packages/xgboost/vignettes/xgboostPresentation.html http://www.analyticskhoj.com/data-mining/xgboost-algorithm/ https://github.com/amunategui/BetterCrossValidation/blob/master/CrossValidation2.R https://www.analyticsvidhya.com/blog/2016/01/xgboost-algorithm-easy-steps/ https://github.com/rachar1/DataAnalysis/blob/master/xgboost_Classification.R http://stats.stackexchange.com/questions/171043/how-to-tune-hyperparameters-of-xgboost-trees http://stats.stackexchange.com/questions/171043/how-to-tune-hyperparameters-of-xgboost-trees http://stats.stackexchange.com/questions/171043/how-to-tune-hyperparameters-of-xgboost-trees http://stackoverflow.com/questions/33949735/tuning-xgboost-parameters-in-r
https://www.kaggle.com/steves/springleaf-marketing-response/xgboost-with-caret/run/78586/code https://www.kaggle.com/steves/springleaf-marketing-response/xgboost-example-0-76178/output https://github.com/topepo/caret/issues/147
model tuning:https://rstudio-pubs-static.s3.amazonaws.com/123946_a0509ae50ca74fc8ad4392fc29326357.html
Example preparation of data: Easily convert categorical data to sparse model matrix sparse_matrix <- sparse.model.matrix(response ~ .-1, data = campaign)
Convert response to numerical as well: output_vector = df[,response] == "Responder"