I have read this website and am new to R.
The following code works well for CSV files with 100 rows (tested), but produces an error for CSV files with 500,000 rows (over 1 GB in size):
# Load text-mining (tm) and Weka tokenizer (RWeka) packages.
library(tm)
library(RWeka)

# BUG FIX: the original had library(RWeka) and setwd() jammed onto one
# line (a syntax error), and the path "c:textanalysis/" is missing the
# separator after the drive letter. Use forward slashes on Windows.
# NOTE(review): setwd() in scripts is fragile; prefer full paths or an
# RStudio project-relative layout.
setwd("c:/textanalysis/")

# Read the posts; keep raw character strings (no factor conversion).
data <- read.csv("postsdataset.csv", header = FALSE, stringsAsFactors = FALSE)
data <- data[, 2]  # keep only the second column (the post text)

# NOTE(review): sourcing GenerateTDM.R is redundant if tdm.generate is
# also defined inline below — keep exactly one definition of it.
source("GenerateTDM.R") # generatetdm function in appendix
tdm.generate <- function(string, ng) {
  # Build a TermDocumentMatrix of n-grams from a character vector.
  #
  # Args:
  #   string: character vector of documents (one element per post).
  #   ng:     n-gram size (e.g. 2 for bigrams).
  #
  # Returns:
  #   A tm TermDocumentMatrix whose terms are ng-grams.
  #
  # Tutorial on RWeka: http://tm.r-forge.r-project.org/faq.html
  corpus <- Corpus(VectorSource(string)) # create corpus for TM processing
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  # corpus <- tm_map(corpus, removeWords, stopwords("english"))

  # RWeka tokenizers break under parallel tm, so force single-core.
  # BUG FIX: restore the caller's option on exit instead of permanently
  # clobbering the global mc.cores setting.
  # http://stackoverflow.com/questions/17703553/bigrams-instead-of-single-words-in-termdocument-matrix-using-r-and-rweka/20251039#20251039
  old_opts <- options(mc.cores = 1)
  on.exit(options(old_opts), add = TRUE)

  # Tokenizer producing ng-grams (despite the name, size follows `ng`).
  BigramTokenizer <- function(x) {
    NGramTokenizer(x, Weka_control(min = ng, max = ng)) # create n-grams
  }

  # Build and return the term-document matrix from the n-grams.
  TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))
}
tdm <- tdm.generate(data, 2)
I want to clean text data (online posts collected in a csv file) and get rid of URL, blank rows, and usernames; and explore my data and do clustering analysis for ngrams with tf/idf.
How do I use source("GenerateTDM.R")?