I was wondering if it was possible to use subset on a geom_polyfreq()?
I am running a topic model and in order to report the facets properly i want to remove 4 out of 10 facets.
My code is as follows:
  ggplot(data = dat,
                 aes(x = date,
                     fill = Topics)) +
            geom_freqpoly(binwidth = 3) +
        labs(x = "", 
               y = "Topic Count",
               title = "Mentions of Topic On a Monthly Basis")+
        scale_x_date(date_breaks = "months", date_labels="%b")+
        theme(text = element_text(size=8)) +
        theme(axis.text.x = element_text(angle = 45))+
        facet_wrap(Topics ~ ., scales = "free")
> ggplot(subset(dat, Topics %in% c(3, 4, 5, 7, 8, 9)),
          aes(x = date,
              fill = topic)) +
     geom_freqpoly(binwidth = 3) +
 labs(x = "", 
        y = "Topic Count",
        title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=9)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
However, when I try to subset the data, I get an error that says: Fejl: Faceting variables must have at least one value
Does anybody know what the issue is? I hope this makes sense.
The full code is down below.
article.data <- article.data[!is.na(article.data$fulltext), ]
## Get date
article.data$date <- as.Date(article.data$date, "%Y-%m-%d")
#all of 2018
dat <- article.data[article.data$date > as.Date("2018-01-01", "%Y-%m-%d") & 
                       article.data$date < as.Date("2018-12-01", "%Y-%m-%d"), ]
## 'tokenize' fulltext
quanteda_options("language_stemmer" = "danish")
texts <- gsub(":", " ", dat$fulltext, fixed = T)
texts <- tokens(texts, what = "word",
                remove_numbers = T,
                remove_punct = T,
                remove_symbols = T,
                remove_separators = T,
                remove_hyphens = T,
                remove_url = T,
                verbose = T)
texts <- tokens_tolower(texts)
texts <- tokens_remove(texts, stopwords("danish"))
texts <- tokens_wordstem(texts)
texts <- tokens_remove(texts, stopwords("danish"))
# get actual dfm from tokens
txt.mat <- dfm(texts)
#remove frequent words with no substance
txt.mat <- txt.mat %>% dfm_remove(c("ad",
                                    "af","aldrig","alene","alle",
                                    "allerede","alligevel","alt",
                                    "altid","anden","andet","andre",
                                    "at","bag","bare", "bedre", "begge","bl.a.",
                                    "blandt", "blev", "blevet", "blive","bliver",
                                    "burde", "bør","ca.", "com", "da", 
                                    "dag", "dansk", "danske", "de",
                                    "dem", "den", "denne","dens",
                                    "der","derefter","deres","derfor",
                                    "derfra","deri","dermed","derpå",
                                    "derved","det","dette","dig",
                                    "din","dine","disse","dit",
                                    "dog","du","efter","egen",
                                    "ej","eller","ellers","en",
                                    "end","endnu","ene","eneste","enhver","ens",
                                    "enten","er","et","f.eks.","far","fem",
                                    "fik","fire","flere","flest",
                                    "fleste","for", "foran",
                                    "fordi","forrige","fra", "fx",
                                    "få","får","før","først",
                                    "gennem","gjorde","gjort","god",
                                    "godt","gør","gøre","gørende",
                                    "ham","han","hans","har",
                                    "havde","have","hej","hel",
                                    "heller","helt","hen","hende",
                                    "hendes","henover","her",
                                    "herefter","heri","hermed",
                                    "herpå","hos","hun","hvad",
                                    "hvem","hver","hvilke","hvilken",
                                    "hvilkes","hvis",
                                    "hvor", "hvordan","hvorefter","hvorfor",
                                    "hvorfra","hvorhen","hvori","hvorimod",
                                    "hvornår","hvorved","i", "ifølge", "igen",
                                    "igennem","ikke","imellem","imens",
                                    "imod","ind","indtil","ingen",
                                    "intet","ja","jeg","jer","jeres",
                                    "jo","kan","kom","komme",
                                    "kommer", "kroner", "kun","kunne","lad",
                                    "langs", "lang", "langt", "lav","lave","lavet",
                                    "lidt","lige","ligesom","lille",
                                    "længere","man","mand","mange",
                                    "med","meget","mellem","men", "mener",
                                    "mens","mere","mest","mig",
                                    "min","mindre","mindst","mine",
                                    "mit","mod","må","måske",
                                    "ned","nej","nemlig","ni",
                                    "nogen","nogensinde","noget",
                                    "nogle","nok","nu","ny", "nye",
                                    "nyt","når","nær","næste",
                                    "næsten","og","også","okay",
                                    "om","omkring","op","os",
                                    "otte","over","overalt","pga.", "partier",
                                    "partiets", "partiers", "politiske",
                                    "procent", "på", "ritzau", "samme", 
                                    "sammen","se","seks","selv","selvom",
                                    "senere","ser","ses","siden","sig",
                                    "sige", "siger", "sin","sine","sit",
                                    "skal","skulle","som","stadig",
                                    "stor","store","synes","syntes",
                                    "syv","så","sådan","således",
                                    "tag","tage","temmelig","thi",
                                    "ti","tidligere","til","tilbage",
                                    "tit","to","tre","ud","uden",
                                    "udover","under","undtagen","var",
                                    "ved","vi","via","vil","ville", "viser", 
                                    "vor","vore","vores","vær","være",
                                    "været","øvrigt","facebook","http", "https",
                                    "www","millioner", "frem", "lars", "lars_løkke", 
                                    "rasmussen", "løkke_rasmussen", "statsminister", "politik",
                                    "formand", "partiet", "år", "tid", "and", "fler",
                                    "sid", "regeringen", "giv", "politisk", "folketing", "mer",
                                    "ifølg"))
############################################################
## FEATURE SELECTION
############################################################
# check out top-appearing features in dfm
topfeatures(txt.mat)
# keep features (words) appearing in >2 documents
txt.mat <- dfm_trim(txt.mat, min_termfreq = 4)
# filter out one-character words
txt.mat <- txt.mat[, str_length(colnames(txt.mat)) > 2]
# filter out some html trash features
#txt.mat <- txt.mat[, !grepl("[[:digit:]]+px", colnames(txt.mat))]
#txt.mat <- txt.mat[, !grepl(".", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("_", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("@", colnames(txt.mat), fixed = T)]
#txt.mat <- txt.mat[, !grepl("#", colnames(txt.mat), fixed = T)]
############################################################
## SELECT FEATURES BY TF-IDF
############################################################
# Create tf_idf-weighted dfm
ti <- dfm_tfidf(txt.mat)
# Select from main dfm using its top features
txt.mat <- dfm_keep(txt.mat, names(topfeatures(ti, n = 1000)))
############################################################
## RUN TOPIC MODEL
############################################################
# convert quanteda dfm to tm 'dtm'
dtm <- convert(txt.mat, to = "topicmodels")
# run lda with 8 topics
lda <- LDA(dtm, k = 8)
# review terms by topic
terms(lda, 10)
############################################################
## LOOK FOR 'OPTIMAL' k
############################################################
# randomly sample test data
set.seed(61218)
select <- sample(1:nrow(dtm), size = 100)
test <- dtm[select, ]
train <- dtm[!(1:nrow(dtm) %in% select), ]
n.tops <- 3:14
metrics <- data.frame(topics = n.tops,
                      perplexity = NA)
for(i in n.tops) { # NB: takes awhile to run
  print(i)
  est <- LDA(train, k = i)
  metrics[(i - 1), "perplexity"] <- perplexity(est, newdata = test)
}
save(metrics, file = "lda_perplexity2018.RData")
qplot(data = metrics, x = topics, y = perplexity, geom = "line",
      xlab = "Number of topics",
      ylab = "Perplexity on test data") + theme_bw()
#We found that 8 topics was one of those of lowest perplexity but 
#also the ones which made the most sense
############################################################
## RERUN WITH BETTER CHOICE OF k
############################################################
# run lda with 10 topics
lda <- LDA(dtm, k = 10)
save(lda, file = "dr_ft_keep2018.RData")
# examine output
terms(lda, 20)
# put topics into original data
dat$topic <- topics(lda)
# add labels
#dat$date <- factor(dat$date,
#levels = 1:12,
#labels = c("januar","februar", "marts","april", "maj", "juni", "juli", "august", "september", "oktober", "november", "decemeber"))
dat$Topics <- factor(dat$topic,
                     levels = 1:10,
                     labels = c("Topc 1", "Topic 2", "Integration", "Taxation", "Burka Prohibition", 
                                "Topic 6", "Justice", "Foreign Affairs", "Housing", "Topic 10"))
# frequency
qplot(data = dat, x = Topics, 
      geom = "bar", xlab = "", 
      ylab = "Topic Frequency", fill=Topics, main = "Figure 1: Main Topics in 2018 - DR") + 
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))
#Make visualization showing topics over time
ggplot(data = dat,
       aes(x = date,
           fill = Topics[1])) +
  geom_freqpoly(binwidth = 30) +
  facet_wrap(Topics ~ ., scales = "free")+
theme_classic() +
scale_x_date(breaks = as.Date(c( "2018-02-01", "2018-04-01",  "2018-06-01",  "2018-08-01",  "2018-10-01", "2018-12-01",  date_labels="%B"))) + 
theme(axis.text.x = element_text(angle = 90)) 
ggplot(data = dat,
         aes(x = date,
             fill = Topics)) +
    geom_freqpoly(binwidth = 3) +
labs(x = "", 
       y = "Topic Count",
       title = "Mentions of Topic On a Monthly Basis")+
scale_x_date(date_breaks = "months", date_labels="%b")+
theme(text = element_text(size=8)) +
theme(axis.text.x = element_text(angle = 45))+
facet_wrap(Topics ~ ., scales = "free")
 
    