2. Clean data.

Replace data in a list.

Use %in% operator

data(iris)
# Species is a factor; turn it into plain character strings first so that a
# label outside the existing levels can be assigned.
iris[["Species"]] <- as.character(iris[["Species"]])
# Give every row whose species is in the set the combined label.
iris[["Species"]] <- replace(
  iris[["Species"]],
  iris[["Species"]] %in% c("setosa", "virginica"),
  "SetosaOrVirginica"
)

Use grepl function

d <- c(
  "SDS0G2 Blue", "Blue SSC2CWA3", "Blue SA2M1GC", "SA5 Blue CSQ5", "ABCDE"
)
# Overwrite every element that contains "Blue" (case-sensitive match).
d <- replace(d, grepl("Blue", d, ignore.case = FALSE), "Red")
d

Bibliography: https://cran.r-project.org/doc/contrib/de_Jonge+van_der_Loo-Introduction_to_data_cleaning_with_R.pdf

Cut data into categories

# Bin sepal length into three labelled categories using fixed breakpoints.
# Four breakpoints define three intervals, one per label.
cut(
  x = iris$Sepal.Length,
  breaks = c(0.0, 5.0, 6.0, 10.0),
  labels = c("small", "mid", "large")
)

# In the place of breaks you can put quantiles.
# Here the 30% and 60% quantiles are the two inner breakpoints;
# names = FALSE returns them as a plain unnamed numeric vector.
cutpoints <- quantile(iris$Sepal.Length, c(0.3, 0.6), names = FALSE)
iris$Species2 <- cut(
  x = iris$Sepal.Length,
  breaks = c(0, cutpoints, 10),
  labels = c("small", "mid", "large")
)

Table of replacements

Using if (source: http://stackoverflow.com/questions/4622060/case-statement-equivalent-in-r)

# Classify a single number into one of three labels.
#
# x: a length-one numeric value.
# Returns "less" when x < 5, "more" when 5 <= x < 8, and "xxmore" otherwise.
w <- function(x) {
  # `if` needs a single TRUE/FALSE, so the scalar operator `&&` is used
  # instead of the elementwise `&`.
  if (x < 5) {
    "less"
  } else if (x >= 5 && x < 8) {
    "more"
  } else {
    "xxmore"
  }
}

# Sample input so that this snippet runs on its own.
x <- c(2, 3, 5, 7, 0.24, 0.12)
# Apply w() to each element; vapply() checks that every result is a single
# string, so the output is always a character vector.
vapply(x, FUN = w, FUN.VALUE = character(1))

Reading more arguments, from two columns:

# Two parallel input vectors; element i of x is paired with element i of y.
x <- c(2, 3, 5, 7, 0.24, 0.12)
y <- c(3, 3, 5.5, 7.3, 9.13, 6.72)

# Classify one pair of numbers into one of three labels.
#
# x, y: length-one numeric values.
# Returns "less" when both values are below 5, "more" when both lie in
# [5, 8), and "xxmore" for every other combination.
w <- function(x, y) {
  # `if` needs a single TRUE/FALSE, so the scalar operator `&&` is used
  # instead of the elementwise `&`.
  if (x < 5 && y < 5) {
    "less"
  } else if (x >= 5 && x < 8 && y >= 5 && y < 8) {
    "more"
  } else {
    "xxmore"
  }
}
# mapply() walks x and y in parallel and calls w() once per pair.
mapply(FUN = w, x, y)

Using the levels() function on a factor

df <- data.frame(
  name = c("cow", "pig", "eagle", "pigeon"),
  stringsAsFactors = FALSE
)
# Step 1: copy the name column and store the copy as a factor.
df[["type"]] <- factor(df[["name"]])
# Step 2: regroup the levels. Each list name becomes a new level that
# replaces the old levels listed under it.
levels(df[["type"]]) <- list(
  animal = c("cow", "pig"),
  bird = c("eagle", "pigeon")
)
# making it a function
# Regroup the levels of a factor.
#
# f:   a factor, or anything as.factor() accepts.
# ...: named vectors; each name is a new level and its vector lists the
#      existing levels to merge into it.
# Returns the factor with its levels replaced by the new grouping.
changelevels <- function(f, ...) {
  regrouped <- as.factor(f)
  grouping <- list(...)
  levels(regrouped) <- grouping
  regrouped
}

# Example data with the names stored as a factor.
df <- data.frame(
  name = c("cow", "pig", "eagle", "pigeon"),
  stringsAsFactors = TRUE
)

# Merge the four names into two groups with changelevels().
# (The original call was missing its closing parenthesis.)
df$type <- changelevels(
  df$name,
  animal = c("cow", "pig"),
  bird = c("eagle", "pigeon")
)

http://stackoverflow.com/questions/4622060/case-statement-equivalent-in-r

# plyr: recode values with mapvalues().
# The call is namespace-qualified so it works without library(plyr).
# NOTE(review): `foo` must already exist; presumably a vector of codes such
# as "AA"/"AC"/"AG" - confirm against the data being cleaned.
foo <- plyr::mapvalues(
  foo,
  from = c("AA", "AC", "AG"),
  to = c("0101", "0102", "0103")
)
# for loop and list
# Lookup table: names are the codes to find, values are their replacements.
# NOTE(review): `foo` must already exist; presumably a character vector of
# codes - confirm against the data being cleaned.
dict <- list(AA = "0101", AC = "0102", AG = "0103")
foo2 <- foo
for (i in seq_along(dict)) {
  # dict[[i]] extracts the replacement string itself. dict[i] would be a
  # one-element list, and assigning a list would turn foo2 into a list.
  foo2 <- replace(foo2, foo2 == names(dict)[i], dict[[i]])
}
# functional approach (the original used lapply with a `<<-` side effect)
key <- c("AA", "AC", "AG")
val <- c("0101", "0102", "0103")

# Fold over the key/value pairs, handing the partially recoded vector to each
# step explicitly instead of modifying the global `foo` inside the function.
# NOTE(review): `foo` must already exist; presumably a character vector of
# codes - confirm against the data being cleaned.
foo <- Reduce(
  function(recoded, i) replace(recoded, recoded == key[i], val[i]),
  seq_along(key),
  foo
)
foo

# another for loop: apply every from -> to replacement listed in a file
# Read the tab-separated replacement table (columns `from` and `to`).
# colClasses = "character" keeps values such as "0101" as text; otherwise
# they would be converted to numbers and lose their leading zeros.
hrw <- read.csv(
  "hgWords.txt",
  header = TRUE,
  stringsAsFactors = FALSE,
  colClasses = "character",
  encoding = "UTF-8",
  sep = "\t"
)

# NOTE(review): `document` must already exist; presumably a character vector
# holding the text to clean - confirm against the calling code.
# seq_len() visits every row. A bare `nrow(hrw)` would run the loop once,
# using only the last row.
for (i in seq_len(nrow(hrw))) {
  document <- gsub(hrw$from[i], hrw$to[i], document, ignore.case = TRUE)
}

hgWords.txt contains the following tab-separated values: a header row ("from", "to") followed by one replacement per row: "AA" -> "0101", "AC" -> "0102", "AG" -> "0103".

http://stackoverflow.com/questions/37585537/r-replace-part-of-a-string-conditionally-by-a-vector-of-possible-replacements

# Strings whose trailing "c<number>" code selects the replacement.
V1 <- read.table(text="Q1r2c5
                 Q1r5c11
                 Q1r5_1c130
                 testc130", stringsAsFactors = FALSE)
# Lookup table: `search` is the code, `replace` is the value to return.
V2 <- read.table(text = "search    replace
                 5         brand1
                 11        brand2
                 130       brand3", header = TRUE, stringsAsFactors = FALSE)

# sub() strips everything up to and including the last "c", leaving the code.
# match() finds each code's row in V2, which then indexes the replacements.
V2$replace[match(sub(".*c", "", V1$V1), V2$search)]
#> [1] "brand1" "brand2" "brand3" "brand3"

To study: https://en.wikibooks.org/wiki/R_Programming/Text_Processing http://stackoverflow.com/questions/17144128/string-cleaning-in-r-down-to-just-letters http://stackoverflow.com/questions/2261079/how-to-trim-leading-and-trailing-whitespace-in-r

results matching ""

    No results matching ""