3. Deal with missing data.

"mice" package

https://gist.github.com/mick001/df77b69b30ef6ff9fc0b



#https://www.r-bloggers.com/imputing-missing-data-with-r-mice-package/
# Load the example data set.
data("iris")

# prodNA() is provided by the missForest package, so the package has to be
# attached before the call below; without it the call fails in a fresh session.
library(missForest)

# Seed 10% missing values into a copy of iris.
iris.mis <- prodNA(iris, noNA = 0.1)
summary(iris.mis)


library(mice)
library(lattice)

# Tabulate the missing-data patterns found in the data frame.
md.pattern(iris.mis)

# Visualise missing values.
library(VIM) # install.packages(c("VIM", "DEoptimR")) if not yet installed
aggr_plot <- aggr(
  iris.mis,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  # Label the bars with the column names of the data being plotted.
  # names(data) would refer to the base function `data` and yield NULL.
  labels = names(iris.mis),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)

# Margin plot of the first two variables only; the red boxplots summarise one
# variable for the rows where the other one is missing.
marginplot(iris.mis[c(1, 2)])

# Impute with mice: one imputed data set (m = 1), 100 iterations, fixed seed.
# The imputation method is left at its default here; pass method = "pmm" to
# request predictive mean matching explicitly.
tempData <- mice(
  iris.mis,
  m = 1,
  maxit = 100,
  seed = 200
)
summary(tempData)

# Density plot of the imputation result.
densityplot(tempData)

# Extract the completed data set.
completedData <- complete(tempData)

# Scatter plots of Species against each measurement for the imputation result.
xyplot(
  tempData,
  Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  pch = 18,
  cex = 1
)

https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/ http://adventuresindm.blogspot.com/2013/02/imputation-with-random-forest-miss.html

"missForest" package

# Install missForest only when it is not available yet, so that re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("missForest", quietly = TRUE)) {
  install.packages("missForest")
}
library(missForest)

# Load the example data set.
data("iris")

# Seed 10% missing values.
iris.mis <- prodNA(iris, noNA = 0.1)
summary(iris.mis)

# Impute missing values, using all parameters at their default values
# (simple use).
iris.imp <- missForest(iris.mis)

# Impute again with explicit settings; this overwrites the result above.
# xtrue passes the complete data and variablewise = TRUE asks for the error
# per variable. TRUE is spelled out because T can be reassigned.
iris.imp <- missForest(
  iris.mis,
  maxiter = 10,
  ntree = 300,
  xtrue = iris,
  variablewise = TRUE,
  verbose = TRUE
)

# Check imputed values.
iris.imp$ximp

# Check imputation error.
iris.imp$OOBerror

See also the other packages described at https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/. Note that the fact that a value is missing can itself carry information. It may be wise to create a dummy (indicator) variable that records whether the value was missing and has been imputed.

"Hmisc" package

# Install Hmisc only when it is not available yet, so that re-running the
# script does not download and reinstall the package every time, then load it.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library(Hmisc)

# Load the example data set.
data("iris")

# Seed 10% missing values. prodNA() comes from the missForest package, so
# library(missForest) must already have been called.
iris.mis <- prodNA(x = iris, noNA = 0.1)
summary(iris.mis)

# Impute Sepal.Length with its mean and store the result as a new column.
iris.mis[["imputed_age"]] <- impute(iris.mis[["Sepal.Length"]], fun = mean)

# Impute Sepal.Length with random values and store the result as a new column.
iris.mis[["imputed_age2"]] <- impute(iris.mis[["Sepal.Length"]], fun = "random")

# Similarly, min, max or median can be used to impute the missing values.

# Multiple imputation with aregImpute(): all five iris columns enter the
# formula and n.impute = 5 requests five imputations.
 impute_arg <- aregImpute(~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width +
                             Species, data = iris.mis, n.impute = 5)

# aregImpute() automatically identifies the variable type and treats them accordingly.

# Print the fitted imputation object.
 impute_arg

miss

The output shows R² values for the predicted missing values: the higher the value, the better the predictions. You can also check the imputed values using the following command:

# Inspect the values imputed for Sepal.Length.
 impute_arg[["imputed"]][["Sepal.Length"]]

See also mtsdi for multivariate time series: http://stackoverflow.com/questions/29472532/arima-method-in-mtsdi

results matching ""

    No results matching ""