2. Clean data.
Replace data in a list.
Use %in% operator
data(iris)
# Species ships as a factor; switch to plain character so a new label can be
# written without first adding it as a factor level.
iris$Species <- as.character(iris$Species)
# Collapse two of the species into one combined label.
iris$Species <- replace(
  iris$Species,
  is.element(iris$Species, c("setosa", "virginica")),
  "SetosaOrVirginica"
)
Use grepl function
d <- c("SDS0G2 Blue", "Blue SSC2CWA3", "Blue SA2M1GC", "SA5 Blue CSQ5", "ABCDE")
# Every element containing "Blue" (case-sensitive match) is overwritten whole.
d <- replace(d, grepl("Blue", d, ignore.case = FALSE), "Red")
d
Bibliography: https://cran.r-project.org/doc/contrib/de_Jonge+van_der_Loo-Introduction_to_data_cleaning_with_R.pdf
Cut data into categories
# Bin sepal lengths into three labelled size classes using hand-picked breaks.
cut(iris$Sepal.Length, labels = c("small", "mid", "large"), breaks = c(0.0, 5.0, 6.0, 10.0))
# Quantiles can stand in for the hand-picked breaks: here the 30% and 60% points.
cutpoints <- quantile(iris$Sepal.Length, probs = c(.3, .6), names = FALSE)
iris$Species2 <- cut(
  iris$Sepal.Length,
  breaks = c(0, cutpoints, 10),
  labels = c("small", "mid", "large")
)
Table of replacements
Using if/else (see http://stackoverflow.com/questions/4622060/case-statement-equivalent-in-r):
# Classify a single number into a size label.
#   x: one numeric value (length 1; if() needs a scalar condition).
# Returns "less" when x < 5, "more" when 5 <= x < 8, and "xxmore" otherwise.
w <- function(x) {
  if (x < 5) {
    "less"
  } else if (x >= 5 && x < 8) {
    # && is the scalar, short-circuiting operator intended for if() conditions;
    # & is the elementwise operator meant for building vector masks.
    "more"
  } else {
    "xxmore"
  }
}
# Apply w() to every element of x. vapply() pins the result type to a
# character vector, so an empty x yields character(0) rather than list().
# NOTE(review): x is not defined before this line in these notes; define it
# first (for example the x vector used in the two-argument example below).
vapply(x, FUN = w, FUN.VALUE = character(1))
Taking more than one argument, with the values coming from two columns:
x <- c(2, 3, 5, 7, 0.24, 0.12)
y <- c(3, 3, 5.5, 7.3, 9.13, 6.72)

# Classify one (x, y) pair into a size label.
#   x, y: one numeric value each (length 1; if() needs a scalar condition).
# Returns "less" when both values are below 5, "more" when both lie in [5, 8),
# and "xxmore" for every other combination.
w <- function(x, y) {
  if (x < 5 && y < 5) {
    "less"
  } else if (x >= 5 && x < 8 && y >= 5 && y < 8) {
    # && is the scalar, short-circuiting operator intended for if() conditions;
    # & is the elementwise operator meant for building vector masks.
    "more"
  } else {
    "xxmore"
  }
}

# mapply() walks x and y in parallel and calls w() once per pair.
mapply(FUN = w, x, y)
Using the levels() function on a factor
df <- data.frame(
  name = c("cow", "pig", "eagle", "pigeon"),
  stringsAsFactors = FALSE
)
# Step 1: keep the original names and add a factor copy to recode.
df[["type"]] <- factor(df[["name"]])
# Step 2: assigning a named list to levels() merges the old levels listed in
# each element under that element's name.
levels(df[["type"]]) <- list(animal = c("cow", "pig"), bird = c("eagle", "pigeon"))
# The same recoding wrapped in a reusable function.
#   f:   a vector or factor whose values are to be regrouped.
#   ...: named vectors, each name being the new level and each vector the old
#        levels it absorbs, e.g. animal = c("cow", "pig").
# Returns f as a factor carrying the regrouped levels.
changelevels <- function(f, ...) {
  regrouped <- as.factor(f)
  new_levels <- list(...)
  levels(regrouped) <- new_levels
  regrouped
}
# Example call: build the frame with name as a factor, then derive the grouped
# type column with changelevels().
df <- data.frame(
  name = c("cow", "pig", "eagle", "pigeon"),
  stringsAsFactors = TRUE
)
# The closing parenthesis of this call was missing, which made it unparseable.
df$type <- changelevels(df$name, animal = c("cow", "pig"), bird = c("eagle", "pigeon"))
http://stackoverflow.com/questions/4622060/case-statement-equivalent-in-r
# plyr: mapvalues() swaps each value listed in `from` for the value at the same
# position in `to`. The call is namespace-qualified so it does not rely on
# plyr having been attached with library(plyr) beforehand.
# NOTE(review): foo is not defined anywhere in these notes; presumably a
# vector of codes such as "AA"/"AC"/"AG" -- confirm before running.
foo <- plyr::mapvalues(foo, from = c("AA", "AC", "AG"), to = c("0101", "0102", "0103"))
# for loop over a named list used as a lookup table (name -> replacement).
# NOTE(review): foo is not defined anywhere in these notes -- define it first.
dict <- list(AA = "0101", AC = "0102", AG = "0103")
foo2 <- foo
for (i in seq_along(dict)) {
  # dict[[i]] extracts the replacement string itself; dict[i] is a one-element
  # list, and handing that to replace() silently turns foo2 into a list.
  foo2 <- replace(foo2, foo2 == names(dict)[i], dict[[i]])
}
# Functional alternative: fold the key -> val substitutions over foo with
# Reduce(). This replaces an lapply() whose function wrote to the global foo
# through <<-; here each step returns a new vector and nothing is mutated from
# inside the function.
# NOTE(review): foo is not defined anywhere in these notes -- define it first.
key <- c("AA", "AC", "AG")
val <- c("0101", "0102", "0103")
foo <- Reduce(
  function(acc, i) replace(acc, acc == key[i], val[i]),
  seq_along(key),  # follows the length of key instead of a hard-coded 1:3
  foo
)
foo
# Another for loop: apply every from -> to substitution listed in a
# tab-separated file to the text held in `document`.
# NOTE(review): document is not defined anywhere in these notes -- it must
# exist before the loop runs.
hrw <- read.csv("hgWords.txt", header = TRUE, stringsAsFactors = FALSE,
                encoding = "UTF-8", sep = "\t")
# seq_len() visits every row (and none when the table is empty);
# `for (i in nrow(hrw))` iterated exactly once, using only the last row.
for (i in seq_len(nrow(hrw))) {
  document <- gsub(hrw$from[i], hrw$to[i], document, ignore.case = TRUE)
}
hgWords.txt contains the following tab-separated columns: "from" and "to", with the rows "AA" "0101", "AC" "0102" and "AG" "0103".
# Codes to translate: the digits after the last "c" are the lookup key.
V1 <- read.table(stringsAsFactors = FALSE, text = "Q1r2c5
Q1r5c11
Q1r5_1c130
testc130")
# Lookup table: search key -> replacement label.
V2 <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "search replace
5 brand1
11 brand2
130 brand3")
# Strip everything up to the last "c", then use match() to find each key's row
# in the lookup table and pull the replacement from that row.
V2[["replace"]][match(sub(".*c", "", V1[["V1"]]), V2[["search"]])]
[1] "brand1" "brand2" "brand3" "brand3"
To study: https://en.wikibooks.org/wiki/R_Programming/Text_Processing http://stackoverflow.com/questions/17144128/string-cleaning-in-r-down-to-just-letters http://stackoverflow.com/questions/2261079/how-to-trim-leading-and-trailing-whitespace-in-r