I've built a spell check function for a sample of 1000 rows to ensure its efficiency, using the 'hunspell' package and the Australian English dictionary. The spell-checker ignores abbreviations. My actual data has close to 2 million lines, I therefore need to convert the 'for' loops into the 'apply' family functions.
I'm almost there I, but the last part isn't working. Below are the original for loop functions:
for(i in 1:nrow(data_words))
{
  print(i)
  
  string1 <- data_words$title[i]
  string2 <- ""
  
  for(j in 1:sapply(strsplit(string1, " "), length))
  {
    w <- word(string1, j)
    
    # if word is not an abbreviation
    if (!isAbbreviation(w))
    {
      # correct word
      w <- correctText(w)
    }
    
    string2 <- paste0(string2, w, sep = " ")
    
    # add word in new column 'spell_check'
    data_words$spell_check[i] <- string2
    
  }
}
isAbbreviation <- function(x)
{
  abb = FALSE
  
  # all capitalised letters
  if(str_detect(x, "^[:upper:]+$"))
  {
    abb = TRUE
  }
  
  # dealing with abbs that end in an 's'
  b = str_extract_all(x, "(\\b[A-Z]+\\b)|\\b[A-Z]+s+\\b")
  list_empty = rlang::is_empty(unlist(b))
  
  if(!list_empty)
  {
    abb = TRUE
  }
  return(abb)
}
correctText = function(x)
{
  sapply(1:length(x), function(y)
  {
    # get misspelled words
    bad_words = hunspell(x[y], dict = "en_AU")[[1]]
    
    # if list of misspelled words is not empty
    if(length(bad_words))
    {
      for (i in 1:length(bad_words))
      {
        list_empty = rlang::is_empty(unlist(hunspell_suggest(bad_words[i], 
                                                             dict = "en_AU")))
        # if suggestion list is not empty
        if(!list_empty)
        {
          # correct word
          good = unlist(lapply(hunspell_suggest(bad_words[i], dict = "en_AU"), `[[`, 1))
        }
        else
        {
          # else leave word is it is
          good = bad_words[i]
        }
        # replace mispelled words with corrected ones
        x[y] <<- gsub(bad_words[i], good, x[y])
      }
    }
  })
  x
}
Reproducible sample of phrases to be corrected:
library(dplyr)
library(stringr)
library(hunspell)
library(textclean)
sample <- 
  c("Paaediatrics AsseSssing Febrile Infant Child", "Manuual Handling Traain Trainer", "Catheterise CTHs", "Labelinsfbsbinsajectables", "Mentouring", "techhnical", "Basic Life Support BSL", "BloodSafe cliniiical transfusion practice", "Astthma", "Zika virus preegnancy update")
data_words <- data.frame(matrix(nrow = length(sample), ncol = 1))
names(data_words) <- "title"
data_words$title <- sample
data_words <- as_tibble(data_words)
I had a go at it, please refer to the below functions:
# the abbreviation function remains the same
# function to correct a misspelled word
correctTheWord <- function(bad_word)
{
  # print(bad_word)
  
  if (!isAbbreviation(bad_word))
  {
    list_empty = rlang::is_empty(unlist(hunspell_suggest(bad_word,
                                                         dict = "en_AU")))
    
    if (!list_empty)
    {
      good = unlist(
        lapply(hunspell_suggest(bad_word, dict = "en_AU"),
               `[[`,
               1
        ))
    }
    else
    {
      good = bad_word
    }
  }
  
  else
  {
    good = bad_word
  }
}
# correct whole row function
correctText = function(x)
{
  sapply(1:length(x), function(y)
  {
    bad = hunspell(x[y], dict = "en_AU")[[1]]
    
    if (length(bad))
    {
      return(mgsub(x, bad, lapply(bad, correctTheWord)))
    }
    else
    {
      return(x)
    }
  })
}
# testing the first 2 titles
correctText("Paaediatrics AsseSssing Febrile Infant Child")
correctText("Manuual Handling Traain Trainer")
# this is not working 
data_words$spell_check <- 
  apply(data_words[, 1], 2,  correctText)
Also, can my functions can be simplified further?