I have written a function that iterates through URLs and scrapes the data I need from each page.
library(xml2)
library(rvest)
The code below creates a vector of the relevant URLs:
# Build the nine paginated review URLs (offsets 0, 10, ..., 80) in one
# vectorized call. paste0() recycles the scalar prefix/suffix across the
# offset vector, which replaces the original grow-with-c()-in-a-loop
# pattern (that pattern copies the whole vector on every append).
tripadvisor_urls <- paste0(
  "https://www.tripadvisor.co.uk/Attraction_Review-g186306-d9756771-Reviews-or",
  seq(0, 80, by = 10),
  "-Suckerpunch_St_Albans-St_Albans_Hertfordshire_England.html"
)
And this is the function I wrote:
all_pages <- function(x) {
  # Scrape review id, headline, bubble rating, date, and review text from
  # each TripAdvisor review page in `x` (a character vector of URLs) and
  # combine everything into one data frame.
  #
  # Side effect (kept for backward compatibility with the calling script):
  # assigns the result to the global `tripadvisor` via `<<-`. The data
  # frame is also returned invisibly, so `df <- all_pages(urls)` works and
  # the `<<-` can eventually be retired.
  #
  # Fixes over the original:
  #   * each page is downloaded and parsed ONCE (the original called
  #     read_html() on every URL twice — two network round trips per page);
  #   * per-page rows are collected in a preallocated list and bound at the
  #     end, instead of growing five vectors with c() on every iteration
  #     (that repeated copying is O(n^2) and was the asker's own suspicion);
  #   * the rating is read from the node's "class" attribute (e.g.
  #     "ui_bubble_rating bubble_40" -> 40 -> 4) instead of a brittle
  #     substr(38, 39) on serialized HTML, which breaks the moment the
  #     markup shifts by one character.
  #
  # NOTE(review): duplicates in the result typically mean two URLs served
  # the same page (e.g. an out-of-range "-orN-" offset redirecting to the
  # last page) — inspect with duplicated() on the returned frame.
  pages <- vector("list", length(x))

  for (i in seq_along(x)) {
    page <- read_html(x[[i]])  # parse once, reuse for every selector below
    reviews <- html_nodes(page, "#REVIEWS .innerBubble")

    id <- reviews %>%
      html_node(".quote a") %>%
      html_attr("id")

    headline_quote <- reviews %>%
      html_node(".quote span") %>%
      html_text()

    # The bubble-rating class encodes the score as tens, e.g. "bubble_40"
    # is 4.0 stars; strip non-digits and divide by 10.
    rating <- page %>%
      html_nodes("#REVIEWS .ui_bubble_rating") %>%
      html_attr("class") %>%
      gsub("\\D", "", .) %>%
      as.numeric()
    rating <- rating / 10

    date <- reviews %>%
      html_node(".rating .ratingDate") %>%
      html_attr("title") %>%
      as.Date("%d %B %Y")

    review <- reviews %>%
      html_node(".entry .partial_entry") %>%
      html_text()

    # One data frame per page; column names match the original output so
    # downstream code (e.g. duplicated(tripadvisor)) is unaffected.
    pages[[i]] <- data.frame(
      id_v = id,
      headline_quote_v = headline_quote,
      rating_v = rating,
      date_v = date,
      review_v = review,
      stringsAsFactors = FALSE
    )
  }

  tripadvisor <<- do.call(rbind, pages)
  invisible(tripadvisor)
}
# Scrape every page; the result data frame is assigned to the global
# `tripadvisor` as a side effect of all_pages() (via `<<-`).
all_pages(tripadvisor_urls)
When I look at the generated data frame, I see that there are duplicates:
# Logical vector, one element per row: TRUE marks a row identical to an
# earlier row. Use tripadvisor[duplicated(tripadvisor), ] to see the
# duplicate rows themselves.
duplicated(tripadvisor)
What have I done wrong? I would imagine it has something to do with constantly appending new elements to my vectors. What's the best way around this?
NOTE: I have requested permission from TripAdvisor so I am not violating their terms of service.
 
    