Machine Learning
preparing data
factors to dummy variables
https://www.youtube.com/watch?v=7rgzCjrIA-o
# Convert factor/character columns to dummy (one-hot) variables
# with caret::dummyVars.
library(caret)

customers <- data.frame(
  id = c(10, 20, 30, 40, 50),
  gender = c("male", "female", "female", "male", "female"),
  mood = c("happy", "sad", "happy", "sad", "happy"),
  outcome = c(1, 1, 0, 0, 0)
)

# fullRank = TRUE drops one level per factor so the resulting design
# matrix has no perfectly correlated (linearly dependent) columns.
dmy <- dummyVars(formula = "~ .", data = customers, fullRank = TRUE)
trsf <- data.frame(predict(dmy, newdata = customers))
trsf
sparse model matrix
# Build a small, mostly-zero example data set (10 predictors + outcome)
# and show its sparse-matrix representation.
some_dataframe <- read.table(text = "c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
2 7 0 0 0 0 0 0 0 0 0
0 0 3 0 0 0 0 0 0 0 0
0 0 0 6 1 0 0 0 0 0 0
0 0 0 2 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 12 0 1
0 0 0 0 0 25 0 0 0 0 1
1 0 0 0 2 0 0 0 0 0 0
0 0 0 2 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 14 0 1
0 0 0 0 0 21 0 0 0 0 1
0 0 0 0 0 0 28 0 0 0 1
0 0 0 0 0 0 0 35 0 0 1
0 0 0 0 0 0 0 0 42 0 1
0 0 0 0 0 0 0 0 0 49 1",
header = TRUE, sep = "")

library(Matrix)

# Coerce the predictor columns (all but 'outcome') to a numeric matrix.
some_matrix <- data.matrix(some_dataframe[1:10])

# Show the sparse (dgCMatrix) representation of the data set:
# only the non-zero entries are stored.
Matrix(some_matrix, sparse = TRUE)
# Split the data set into train (70%) and test (30%) portions.
set.seed(2)
split <- sample(nrow(some_dataframe), floor(0.7 * nrow(some_dataframe)))
train <- some_dataframe[split, ]
test <- some_dataframe[-split, ]

# Transform both sets into sparse matrices using sparse.model.matrix.
train_sparse <- sparse.model.matrix(~ ., train[1:10])
test_sparse <- sparse.model.matrix(~ ., test[1:10])

# Model the sparse sets using glmnet (accepts dgCMatrix input directly).
library(glmnet)
fit <- glmnet(train_sparse, train[, 11])

# Use cv.glmnet to find the best lambda/penalty;
# 's' is the penalty parameter passed to predict() below.
cv <- cv.glmnet(train_sparse, train[, 11], nfolds = 3)
pred <- predict(fit, test_sparse, type = "response", s = cv$lambda.min)

# Receiver operating characteristic (ROC) curve and AUC.
library(pROC)
# predict() returns a one-column matrix; roc() expects a numeric vector.
auc <- roc(test[, 11], as.vector(pred))
print(auc$auc)
# Example with a categorical column ('mood'): the model.matrix machinery
# coerces character columns to factors and binarizes them into dummy columns.
cat_dataframe <- read.table(text = "c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 mood outcome
2 7 0 0 0 0 0 0 0 0 happy 0
0 0 3 0 0 0 0 0 0 0 happy 0
0 0 0 6 1 0 0 0 0 0 happy 0
0 0 0 2 0 0 0 0 0 0 happy 0
0 0 0 0 0 0 0 0 12 0 sad 1
0 0 0 0 0 25 0 0 0 0 sad 1
1 0 0 0 2 0 0 0 0 0 happy 0
0 0 0 2 0 0 0 0 0 0 happy 0
0 0 0 0 0 0 0 14 0 0 sad 1
0 0 0 0 0 21 0 0 0 0 sad 1
0 0 0 0 0 0 28 0 0 0 sad 1
0 0 0 0 0 0 0 35 0 0 sad 1
0 0 0 0 0 0 0 0 42 0 sad 1
0 0 0 0 0 0 0 0 0 49 sad 1",
header = TRUE, sep = "")

print(sparse.model.matrix(~ ., cat_dataframe))
# increasing the number of levels in the mood variable
cat_dataframe <- read.table(text = "c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 mood outcome
2 7 0 0 0 0 0 0 0 0 angry 0
0 0 3 0 0 0 0 0 0 0 neutral 0
0 0 0 6 1 0 0 0 0 0 happy 0
0 0 0 2 0 0 0 0 0 0 happy 0
0 0 0 0 0 0 0 0 12 0 sad 1
0 0 0 0 0 25 0 0 0 0 sad 1
1 0 0 0 2 0 0 0 0 0 happy 0
0 0 0 2 0 0 0 0 0 0 happy 0
0 0 0 0 0 0 0 0 14 0 sad 1
0 0 0 0 0 21 0 0 0 0 neutral 1
0 0 0 0 0 0 28 0 0 0 sad 1
0 0 0 0 0 0 0 35 0 0 sad 1
0 0 0 0 0 0 0 0 42 0 sad 1
0 0 0 0 0 0 0 0 0 49 sad 1", header = TRUE, sep = "")

# Since R 4.0, read.table keeps strings as character (stringsAsFactors
# defaults to FALSE), so coerce to factor before asking for levels.
print(levels(factor(cat_dataframe$mood)))
dim(cat_dataframe)

# sparse.model.matrix adds one extra column per additional mood level
# when binarizing the categorical variable.
dim(sparse.model.matrix(~ ., cat_dataframe))
# '-1' removes the intercept so every factor level gets its own column.
colnames(sparse.model.matrix(~ . - 1, cat_dataframe))
partition with maxDissim
https://r-forge.r-project.org/scm/viewvc.php/*checkout*/www/splitting.html?revision=828&root=caret
# Maximum-dissimilarity sampling with caret::maxDissim, demonstrated
# on two scaled predictors from the BostonHousing data set.
library(caret)
library(mlbench)
data(BostonHousing)

testing <- scale(BostonHousing[, c("age", "nox")])

set.seed(11)
# Draw 5 random row indices as the starting set.
startSet <- sample(1:dim(testing)[1], 5)
start <- testing[startSet, ]
samplePool <- testing[-startSet, ]

# Grow the starting set by 20 points chosen to be maximally
# dissimilar from those already selected. randomFrac is in (0, 1];
# smaller values pull the selection towards the interior of the data.
newSamp <- maxDissim(start, samplePool, n = 20, randomFrac = 0.5)
head(newSamp)
Feature hashing example.
# Feature Hashing
# http://amunategui.github.io/feature-hashing/#sourcecode

# Download the diabetes data set. Base download.file with mode = "wb"
# (binary) replaces the RCurl getBinaryURL + writeBin + close dance.
download.file(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip",
  destfile = "dataset_diabetes.zip", mode = "wb")

# Unzip and read the diabetes file.
files <- unzip("dataset_diabetes.zip")
diabetes <- read.csv(files[1], stringsAsFactors = FALSE)

# Quick look at the data.
str(diabetes)

# Drop identifier variables that carry no predictive signal.
diabetes <- subset(diabetes, select = -c(encounter_id, patient_nbr))

# Mark all "?" entries as missing (they are recoded to 0 further down).
diabetes[diabetes == "?"] <- NA

# Remove zero-variance columns - ty James
# http://stackoverflow.com/questions/8805298/quickly-remove-zero-variance-variables-from-a-data-frame
diabetes <- diabetes[sapply(diabetes,
                            function(x) length(levels(factor(x, exclude = NULL))) > 1)]

# Binarize the outcome: readmitted within 30 days -> 1, otherwise 0.
diabetes$readmitted <- ifelse(diabetes$readmitted == "<30", 1, 0)

# Generalize outcome name.
outcomeName <- "readmitted"

# Large factors to deal with (hundreds of diagnosis codes).
length(unique(diabetes$diag_1))
length(unique(diabetes$diag_2))
length(unique(diabetes$diag_3))

diabetes_hash <- diabetes
predictorNames <- setdiff(names(diabetes_hash), outcomeName)

# Change all NAs to 0.
diabetes_hash[is.na(diabetes_hash)] <- 0

# 50/50 train/test split.
set.seed(1234)
split <- sample(nrow(diabetes_hash), floor(0.5 * nrow(diabetes_hash)))
objTrain <- diabetes_hash[split, ]
objTest <- diabetes_hash[-split, ]

# install.packages("FeatureHashing")  # run once, BEFORE library()
library(FeatureHashing)

# Hash all predictors into a fixed 2^12-column sparse design matrix.
objTrain_hashed <- hashed.model.matrix(~ ., data = objTrain[, predictorNames],
                                       hash.size = 2^12, transpose = FALSE)
objTrain_hashed <- as(objTrain_hashed, "dgCMatrix")
objTest_hashed <- hashed.model.matrix(~ ., data = objTest[, predictorNames],
                                      hash.size = 2^12, transpose = FALSE)
objTest_hashed <- as(objTest_hashed, "dgCMatrix")

library(glmnet)
glmnetModel <- cv.glmnet(objTrain_hashed, objTrain[, outcomeName],
                         family = "binomial", type.measure = "auc")

# Score the hashed model on the held-out set.
glmnetPredict <- predict(glmnetModel, objTest_hashed, s = "lambda.min")
library(pROC)
# predict() returns a one-column matrix; auc() expects a numeric vector.
auc(objTest[, outcomeName], as.vector(glmnetPredict))
https://youtu.be/vG3-yCyPNDQ, https://youtu.be/oHSMZk3Ynzg https://en.wikipedia.org/wiki/Feature_hashing http://amunategui.github.io/feature-hashing/
xgboost and caret
random search
# Random-search hyperparameter tuning of xgboost via caret,
# using adaptive resampling to drop poor candidates early.
data(iris)

# NOTE: caret's method string is case-sensitive: "adaptive_cv".
fitControl <- trainControl(method = "adaptive_cv",
                           number = 10,
                           repeats = 10,
                           search = "random",
                           adaptive = list(min = 5,     # must be < number of resamples
                                           alpha = 0.05,
                                           method = "gls",  # "gls" or "BT"
                                           complete = TRUE))

model <- train(factor(Species) ~ .,
               data = iris,
               method = "xgbTree",
               preProc = c("center", "scale"),
               trControl = fitControl,
               metric = "Kappa",
               stratified = TRUE,
               tuneLength = 50,
               allowParallel = TRUE)
print(model)

# predict.train takes 'newdata' (a data frame of predictors), not 'data';
# a 'data =' argument is silently swallowed by '...' and the call would
# return training-set predictions by accident.
prediction <- predict(model, newdata = iris)
table(prediction, iris$Species)
plot(prediction, iris$Species)
plot(model)

# Plot the relationship between tuning parameters and performance.
require(ggplot2)
ggplot(model) +
  geom_smooth(se = TRUE, span = .8, method = loess) +
  theme(legend.position = "top")

plot(varImp(model))

modelvalues <- data.frame(obs = iris$Species, pred = prediction)
defaultSummary(modelvalues)

# If the outcome were numerical (regression), residual diagnostics
# would look like this. Note: 'dev' is not defined in this script
# (leftover from an mtcars-style example), so the plots are commented out.
residuals <- resid(model)
predictedValues <- predict(model)
# plot(dev$mpg, residuals)
# abline(0, 0)
# plot(dev$mpg, predictedValues)
using an explicit tuning grid
explore how to set up ensembles http://www.r-bloggers.com/caretensemble-classification-example/ explore adaptive resampling http://topepo.github.io/caret/adaptive.html
# Grid-search hyperparameter tuning of xgboost via caret.
require(BradleyTerry2) # install.packages("BradleyTerry2") - needed when adaptive method = "BT"
library(caret); library(xgboost)

data(iris)

# caret's method string is case-sensitive: "repeatedcv".
# search = "random" is ignored whenever an explicit tuneGrid is supplied,
# so it is omitted here.
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 5)

# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
xgbGrid <- expand.grid(
  eta = 0.3,                  # typical 0.01-0.2, default 0.3
  max_depth = c(2, 3, 4),     # typical 3-10, default 6. IMPORTANT2
  nrounds = seq(100, 250, 25),
  gamma = 0,                  # default = 0
  colsample_bytree = 0.8,     # typical values 0.5-1, default = 1, start 0.8
  min_child_weight = 2        # default = 1; too high -> underfitting.
                              # use scale_pos_weight = 1 if class imbalance. IMPORTANT3
)

model <- train(factor(Species) ~ .,
               data = iris,
               method = "xgbTree",
               preProc = c("center", "scale"),
               trControl = fitControl,
               metric = "Kappa",
               stratified = TRUE,
               tuneGrid = xgbGrid,
               allowParallel = TRUE)
print(model)

# predict.train takes 'newdata' (a data frame of predictors), not 'data'.
prediction <- predict(model, newdata = iris)
table(prediction, iris$Species)
plot(prediction, iris$Species)
plot(model)
plot(varImp(model, scale = FALSE))
https://www.r-bloggers.com/r-setup-a-grid-search-for-xgboost/ https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd https://cran.r-project.org/web/packages/xgboost/vignettes/xgboostPresentation.html http://www.analyticskhoj.com/data-mining/xgboost-algorithm/ https://github.com/amunategui/BetterCrossValidation/blob/master/CrossValidation2.R https://www.analyticsvidhya.com/blog/2016/01/xgboost-algorithm-easy-steps/ https://github.com/rachar1/DataAnalysis/blob/master/xgboost_Classification.R http://stats.stackexchange.com/questions/171043/how-to-tune-hyperparameters-of-xgboost-trees http://stats.stackexchange.com/questions/171043/how-to-tune-hyperparameters-of-xgboost-trees http://stats.stackexchange.com/questions/171043/how-to-tune-hyperparameters-of-xgboost-trees http://stackoverflow.com/questions/33949735/tuning-xgboost-parameters-in-r
https://www.kaggle.com/steves/springleaf-marketing-response/xgboost-with-caret/run/78586/code https://www.kaggle.com/steves/springleaf-marketing-response/xgboost-example-0-76178/output https://github.com/topepo/caret/issues/147
model tuning:https://rstudio-pubs-static.s3.amazonaws.com/123946_a0509ae50ca74fc8ad4392fc29326357.html
Example preparation of data: Easily convert categorical data to sparse model matrix sparse_matrix <- sparse.model.matrix(response ~ .-1, data = campaign)
Convert response to numerical as well: output_vector = df[,response] == "Responder"