MachineLearning

preparing data

factors to dummy variables

https://www.youtube.com/watch?v=7rgzCjrIA-o

# Convert the factor columns of a small data frame into dummy (one-hot)
# variables with caret::dummyVars().
library(caret)
customers <- data.frame(id = c(10, 20, 30, 40, 50),
                        gender = c('male', 'female', 'female', 'male', 'female'),
                        mood = c('happy', 'sad', 'happy', 'sad', 'happy'),
                        outcome = c(1, 1, 0, 0, 0))
# fullRank = TRUE drops one dummy column per factor so the remaining
# columns are not perfectly collinear (full-rank design matrix).
dmy <- dummyVars(formula = ~ ., data = customers, fullRank = TRUE)
trsf <- data.frame(predict(dmy, newdata = customers))
trsf

sparse model matrix

https://github.com/amunategui/Sparse-Matrices-And-GLMNET-Demo/blob/master/Sparse-Matrices-And-GLMNET-Demo.R

# Build a small, mostly-zero numeric data set and show how compactly the
# Matrix package can represent it as a sparse matrix.
some_dataframe <- read.table(text="c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
    2 7 0 0 0 0 0 0 0 0 0
    0 0 3 0 0 0 0 0 0 0 0
    0 0 0 6 1 0 0 0 0 0 0
    0 0 0 2 0 0 0 0 0 0 0
    0 0 0 0 0 0 0 0 12 0 1
    0 0 0 0 0 25 0 0 0 0 1
    1 0 0 0 2 0 0 0 0 0 0
    0 0 0 2 0 0 0 0 0 0 0
    0 0 0 0 0 0 0 0 14 0 1
    0 0 0 0 0 21 0 0 0 0 1
    0 0 0 0 0 0 28 0 0 0 1
    0 0 0 0 0 0 0 35 0 0 1
    0 0 0 0 0 0 0 0 42 0 1
    0 0 0 0 0 0 0 0 0 49 1",
    header = TRUE, sep = "")

library(Matrix)
# All predictor columns (everything except the outcome) as a numeric matrix.
some_matrix <- data.matrix(some_dataframe[1:10])

# show the sparse-matrix representation of the data set
Matrix(some_matrix, sparse = TRUE)


# split the data set into a train and test portion
set.seed(2)
split <- sample(nrow(some_dataframe), floor(0.7 * nrow(some_dataframe)))
train <- some_dataframe[split, ]
test <- some_dataframe[-split, ]

# transform both sets into sparse matrices using sparse.model.matrix
train_sparse <- sparse.model.matrix(~ ., train[1:10])
test_sparse <- sparse.model.matrix(~ ., test[1:10])

# model the sparse sets using glmnet
library(glmnet)
fit <- glmnet(train_sparse, train[, 11])

# use cv.glmnet to find the best lambda/penalty;
# s is the penalty parameter used at prediction time
cv <- cv.glmnet(train_sparse, train[, 11], nfolds = 3)
pred <- predict(fit, test_sparse, type = "response", s = cv$lambda.min)

# receiver operating characteristic (ROC curve)
library(pROC)
# predict() returns a one-column matrix; flatten it before scoring.
# Don't call the result `auc` -- that name masks pROC::auc().
roc_obj <- roc(test[, 11], as.vector(pred))
print(roc_obj$auc)

# Mixed numeric/categorical data: sparse.model.matrix binarizes the
# two-level `mood` column into a single dummy column (moodsad).
cat_dataframe <- read.table(text="c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 mood outcome
  2 7 0 0 0 0 0 0 0 0 happy 0
  0 0 3 0 0 0 0 0 0 0 happy 0
  0 0 0 6 1 0 0 0 0 0 happy 0
  0 0 0 2 0 0 0 0 0 0 happy 0
  0 0 0 0 0 0 0 0 12 0 sad 1
  0 0 0 0 0 25 0 0 0 0 sad 1
  1 0 0 0 2 0 0 0 0 0 happy 0
  0 0 0 2 0 0 0 0 0 0 happy 0
  0 0 0 0 0 0 0 14 0 0 sad 1
  0 0 0 0 0 21 0 0 0 0 sad 1
  0 0 0 0 0 0 28 0 0 0 sad 1
  0 0 0 0 0 0 0 35 0 0 sad 1
  0 0 0 0 0 0 0 0 42 0 sad 1
  0 0 0 0 0 0 0 0 0 49 sad 1",
  header = TRUE, sep = "")
print(sparse.model.matrix(~ ., cat_dataframe))

# Increasing the number of levels in the mood variable.
cat_dataframe <- read.table(text="c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 mood outcome
        2 7 0 0 0 0 0 0 0 0 angry 0
        0 0 3 0 0 0 0 0 0 0 neutral 0
        0 0 0 6 1 0 0 0 0 0 happy 0
        0 0 0 2 0 0 0 0 0 0 happy 0
        0 0 0 0 0 0 0 0 12 0 sad 1
        0 0 0 0 0 25 0 0 0 0 sad 1
        1 0 0 0 2 0 0 0 0 0 happy 0
        0 0 0 2 0 0 0 0 0 0 happy 0
        0 0 0 0 0 0 0 0 14 0 sad 1
        0 0 0 0 0 21 0 0 0 0 neutral 1
        0 0 0 0 0 0 28 0 0 0 sad 1
        0 0 0 0 0 0 0 35 0 0 sad 1
        0 0 0 0 0 0 0 0 42 0 sad 1
        0 0 0 0 0 0 0 0 0 49 sad 1", header = TRUE, sep = "")
# Since R 4.0, read.table() no longer converts strings to factors, so
# levels(cat_dataframe$mood) would be NULL -- factor() it explicitly.
print(levels(factor(cat_dataframe$mood)))
dim(cat_dataframe)

# sparse.model.matrix adds one extra column per additional factor level
# when it binarizes mood
dim(sparse.model.matrix(~ ., cat_dataframe))
colnames(sparse.model.matrix(~ . - 1, cat_dataframe))

partition with maxDissim

https://r-forge.r-project.org/scm/viewvc.php/*checkout*/www/splitting.html?revision=828&root=caret


library(caret)
library(mlbench)
data(BostonHousing)

# Scale the two predictors so their distances are comparable.
testing <- scale(BostonHousing[, c("age", "nox")])
set.seed(11)
## A random sample of 5 data points to seed the dissimilarity search
startSet <- sample(seq_len(nrow(testing)), 5)
samplePool <- testing[-startSet, ]
start <- testing[startSet, ]
# Pick 20 additional points that are maximally dissimilar from the seed set.
# To select more samples towards the interior of the data set,
# set randomFrac to be small (range is (0, 1]).
newSamp <- maxDissim(start, samplePool, n = 20, randomFrac = 0.5)

head(newSamp)

Feature hashing example.

# Feature Hashing
# http://amunategui.github.io/feature-hashing/#sourcecode

# Download the diabetes data set with base R's download.file() instead of
# RCurl: mode = "wb" keeps the zip binary-safe, the connection is managed
# for us, and we no longer disable SSL certificate verification.
download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip",
              destfile = "dataset_diabetes.zip", mode = "wb")

# open diabetes file
files <- unzip("dataset_diabetes.zip")
diabetes <- read.csv(files[1], stringsAsFactors = FALSE)

# quick look at the data
str(diabetes)

# drop identifier variables that carry no predictive signal
diabetes <- subset(diabetes, select = -c(encounter_id, patient_nbr))

# recode the "?" placeholder used by this data set as proper NA
# (the original comment said "to 0s", but NA is what the code assigns)
diabetes[diabetes == "?"] <- NA

# remove zero-variance columns (only one level once NAs are included)
# ty James http://stackoverflow.com/questions/8805298/quickly-remove-zero-variance-variables-from-a-data-frame
# vapply() instead of sapply() guarantees a logical vector back.
diabetes <- diabetes[vapply(diabetes,
                            function(x) length(levels(factor(x, exclude = NULL))) > 1,
                            logical(1))]

# prep outcome variable: 1 = readmitted under 30 days, 0 otherwise
diabetes$readmitted <- ifelse(diabetes$readmitted == "<30", 1, 0)

# generalize outcome name
outcomeName <- 'readmitted'

# large factors to deal with
length(unique(diabetes$diag_1))
length(unique(diabetes$diag_2))
length(unique(diabetes$diag_3))

diabetes_hash <- diabetes
predictorNames <- setdiff(names(diabetes_hash), outcomeName)

# change all NAs to 0
diabetes_hash[is.na(diabetes_hash)] <- 0

# 50/50 train/test split
set.seed(1234)
split <- sample(nrow(diabetes_hash), floor(0.5 * nrow(diabetes_hash)))
objTrain <- diabetes_hash[split, ]
objTest <- diabetes_hash[-split, ]

# install.packages("FeatureHashing")  # one-time install -- do NOT run on
# every execution, and never after library() has already been called
library(FeatureHashing)
# Hash all predictors into 2^12 feature columns, then coerce to the
# dgCMatrix class that glmnet expects.
objTrain_hashed <- hashed.model.matrix(~., data = objTrain[, predictorNames],
                                       hash.size = 2^12, transpose = FALSE)
objTrain_hashed <- as(objTrain_hashed, "dgCMatrix")
objTest_hashed <- hashed.model.matrix(~., data = objTest[, predictorNames],
                                      hash.size = 2^12, transpose = FALSE)
objTest_hashed <- as(objTest_hashed, "dgCMatrix")

library(glmnet)
glmnetModel <- cv.glmnet(objTrain_hashed, objTrain[, outcomeName],
                         family = "binomial", type.measure = "auc")

# Let's see how this version scored
# (auc() comes from pROC, loaded earlier in this file):
glmnetPredict <- predict(glmnetModel, objTest_hashed, s = "lambda.min")
auc(objTest[, outcomeName], glmnetPredict)

https://youtu.be/vG3-yCyPNDQ, https://youtu.be/oHSMZk3Ynzg https://en.wikipedia.org/wiki/Feature_hashing http://amunategui.github.io/feature-hashing/

xgboost and caret


data(iris)
# Adaptive resampling discards poorly-performing tuning candidates early.
# NOTE: caret's resampling method names are lower-case -- "adaptive_cv",
# not "adaptive_CV".
fitControl <- trainControl(method = "adaptive_cv",
                           number = 10,
                           repeats = 10,
                           search = "random",
                           adaptive = list(min = 5,      # must be less than `number`
                                           alpha = 0.05,
                                           method = "gls",   # "gls" or "BT"
                                           complete = TRUE))

model <- train(factor(Species) ~ .,
               data = iris,
               method = "xgbTree",
               preProc = c("center", "scale"),
               trControl = fitControl,
               metric = "Kappa",
               stratified = TRUE,
               tuneLength = 50,
               allowParallel = TRUE)
print(model)
# predict.train takes `newdata` (not `data`); score the full iris set so the
# confusion table below lines up with iris$Species.
prediction <- predict(model, newdata = iris)
table(prediction, iris$Species)
plot(prediction, iris$Species)
plot(model)

# plot the relationship between tuning parameters and performance
library(ggplot2)

# geom_smooth() expects the smoothing method as a string ("loess"),
# not the bare function object.
ggplot(model) +
  geom_smooth(se = TRUE, span = .8, method = "loess") +
  theme(legend.position = "top")

plot(varImp(model))

# caret's summary functions want an obs/pred data frame
modelvalues <- data.frame(obs = iris$Species, pred = prediction)

defaultSummary(modelvalues)


#if results were numerical

residuals<-resid(model)

predictedValues<-predict(model)

plot(dev$mpg,residuals)

abline(0,0)

plot(dev$mpg,predictedValues)

using grid

explore how to set up ensembles http://www.r-bloggers.com/caretensemble-classification-example/ explore adaptive resampling http://topepo.github.io/caret/adaptive.html

# BradleyTerry2 backs caret's "BT" adaptive-resampling option
# install.packages("BradleyTerry2")
library(BradleyTerry2)
library(caret)
library(xgboost)

data(iris)
# NOTE: caret method names are lower-case -- "repeatedcv", not "repeatedCV".
# `search` is ignored when an explicit tuneGrid is supplied below.
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 5,
                           search = "random")

# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
# caret's xgbTree requires ALL of these tuning columns, including
# `subsample` -- leaving one out makes train() stop with an error.
xgbGrid <- expand.grid(
  eta = 0.3,                # typical 0.01-0.2, default 0.3
  max_depth = c(2, 3, 4),   # typical 3-10, default 6   IMPORTANT2
  nrounds = seq(100, 250, 25),
  gamma = 0,                # default 0
  colsample_bytree = 0.8,   # typical 0.5-1, default 1, start at 0.8
  min_child_weight = 2,     # default 1; too high -> underfitting; use
                            # scale_pos_weight = 1 if class imbalance. IMPORTANT3
  subsample = 0.8           # typical 0.5-1, default 1
)

model <- train(factor(Species) ~ .,
               data = iris,
               method = "xgbTree",
               preProc = c("center", "scale"),
               trControl = fitControl,
               metric = "Kappa",
               stratified = TRUE,
               tuneGrid = xgbGrid,
               allowParallel = TRUE)


print(model)
# predict.train takes `newdata` (not `data`); score the full iris set so the
# confusion table lines up with iris$Species.
prediction <- predict(model, newdata = iris)
table(prediction, iris$Species)
plot(prediction, iris$Species)
plot(model)
plot(varImp(model, scale = FALSE))

https://www.r-bloggers.com/r-setup-a-grid-search-for-xgboost/ https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd https://cran.r-project.org/web/packages/xgboost/vignettes/xgboostPresentation.html http://www.analyticskhoj.com/data-mining/xgboost-algorithm/ https://github.com/amunategui/BetterCrossValidation/blob/master/CrossValidation2.R https://www.analyticsvidhya.com/blog/2016/01/xgboost-algorithm-easy-steps/ https://github.com/rachar1/DataAnalysis/blob/master/xgboost_Classification.R http://stats.stackexchange.com/questions/171043/how-to-tune-hyperparameters-of-xgboost-trees http://stats.stackexchange.com/questions/171043/how-to-tune-hyperparameters-of-xgboost-trees http://stats.stackexchange.com/questions/171043/how-to-tune-hyperparameters-of-xgboost-trees http://stackoverflow.com/questions/33949735/tuning-xgboost-parameters-in-r

https://www.kaggle.com/steves/springleaf-marketing-response/xgboost-with-caret/run/78586/code https://www.kaggle.com/steves/springleaf-marketing-response/xgboost-example-0-76178/output https://github.com/topepo/caret/issues/147

model tuning:https://rstudio-pubs-static.s3.amazonaws.com/123946_a0509ae50ca74fc8ad4392fc29326357.html

logloss: https://www.kaggle.com/c/march-machine-learning-mania-2015/forums/t/12449/r-help-using-the-caret-library-to-optimize-models-for-log-loss

Example preparation of data: Easily convert categorical data to sparse model matrix sparse_matrix <- sparse.model.matrix(response ~ .-1, data = campaign)

Convert response to numerical as well: output_vector = df[,response] == "Responder"

results matching ""

    No results matching ""