Machine learning - randomForest in R: "object not found" error
# Multi-class text classification: build one term-document matrix per label
# directory under ./data, stack them into a single data frame whose last
# column ("targetlabel") is the class, and fit a random forest on a 70% sample.

# init ----
libs <- c("tm", "plyr", "class", "RTextTools", "randomForest")
# library() stops on a missing package; require() would only return FALSE
# and let the script fail later with a less obvious error.
invisible(lapply(libs, library, character.only = TRUE))

# set options ----
options(stringsAsFactors = FALSE)

# set parameters ----
labels <- read.table("labels.txt")
path <- file.path(getwd(), "data")

# clean text ----

# Apply the text-normalisation steps to a tm corpus and return the result.
#
# corpus: a tm corpus.
# Returns the transformed corpus.
cleancorpus <- function(corpus) {
  # Drop non-ASCII characters first. Non-ASCII characters in the corpus were
  # reported to surface later as "object '...' not found" when the terms are
  # used as column names in the randomForest formula.
  corpus.tmp <- tm_map(
    corpus,
    content_transformer(function(x) iconv(x, "latin1", "ASCII", sub = ""))
  )
  corpus.tmp <- tm_map(corpus.tmp, removePunctuation)
  corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
  corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
  corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
  corpus.tmp <- tm_map(corpus.tmp, stemDocument, language = "english")
  corpus.tmp <- tm_map(corpus.tmp, removeWords, stopwords("english"))
  corpus.tmp
}

# build tdm ----

# Read every document under <path>/<label>, clean it and build a
# term-document matrix with sparse terms removed (sparsity threshold 0.7).
#
# label: name of the sub-directory holding the documents of one class.
# path:  directory that contains the label sub-directories.
# Returns list(name = label, tdm = <term-document matrix>).
generatetdm <- function(label, path) {
  s.dir <- file.path(path, label)
  s.cor <- Corpus(
    DirSource(directory = s.dir),
    readerControl = list(language = "en")
  )
  s.cor.cl <- cleancorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl)
  s.tdm <- removeSparseTerms(s.tdm, 0.7)
  list(name = label, tdm = s.tdm)
}

# NOTE(review): lapply() over a data frame visits its columns, so this assumes
# labels.txt holds one label per column -- confirm against the actual file.
tdm <- lapply(labels, generatetdm, path = path)

# attach name ----

# Turn one generatetdm() result into a data frame with one row per document,
# one column per term and a trailing "targetlabel" column holding the label.
#
# tdm: a list with elements "name" and "tdm", as returned by generatetdm().
# Returns a data.frame.
bindlabeltotdm <- function(tdm) {
  # Transpose so that documents are rows and terms are columns.
  s.mat <- t(data.matrix(tdm[["tdm"]]))
  s.df <- as.data.frame(s.mat, stringsAsFactors = FALSE)
  s.df <- cbind(s.df, rep(tdm[["name"]], nrow(s.df)), row.names = NULL)
  colnames(s.df)[ncol(s.df)] <- "targetlabel"
  s.df
}

labeltdm <- lapply(tdm, bindlabeltotdm)

# stack ----
# rbind.fill() pads terms missing from a label with NA; a missing term means
# a frequency of zero.
tdm.stack <- do.call(rbind.fill, labeltdm)
tdm.stack[is.na(tdm.stack)] <- 0

# hold-out ----
train.idx <- sample(nrow(tdm.stack), ceiling(nrow(tdm.stack) * 0.7))
# setdiff() stays correct when train.idx is empty, unlike x[-integer(0)].
test.idx <- setdiff(seq_len(nrow(tdm.stack)), train.idx)

tdm.lab <- tdm.stack[, "targetlabel"]
tdm.stack.nl <- tdm.stack[, !colnames(tdm.stack) %in% "targetlabel"]

train <- tdm.stack[train.idx, ]
test <- tdm.stack[test.idx, ]

# randomForest needs a factor response to run classification.
train$targetlabel <- as.factor(train$targetlabel)
label.rf <- randomForest(
  targetlabel ~ .,
  data = train,
  ntree = 5000,
  mtry = 15,
  importance = TRUE
)
I am trying to do multi-class classification of text files using the random forest algorithm. The error is raised by the last or second-to-last line.
error in eval(expr, envir, enclos) : object '∗' not found
tdm.stack contains columns whose names are the words found in the documents, and the cell values are their frequencies. The last column contains the class value.
I have tried but can't figure out the problem. Please help.
The error was being caused by the presence of non-ASCII characters in the corpora. I added a line to the cleancorpus function to remove non-ASCII characters:
# Strip non-ASCII characters from every document. The function is wrapped in
# content_transformer(), as the script already does for tolower, so that
# tm_map() keeps the documents as tm text documents.
corpus.tmp <- tm_map(
  corpus.tmp,
  content_transformer(function(x) iconv(x, "latin1", "ASCII", sub = ""))
)
this solved problem.
Comments
Post a Comment