Chapter 75 TF-IDF Bigrams

# Tokenise the Violations text into two-word sequences (bigrams); each bigram
# is stored in a new `bigram` column.
FoodInspectionWordsBiGram <- unnest_tokens(
  FoodInspectionsReduced,
  output = bigram,
  input = Violations,
  token = "ngrams",
  n = 2
)

# Split every bigram on the single space between its words, giving one
# column per word (`word1`, `word2`).
bigrams_separated <- separate(
  FoodInspectionWordsBiGram,
  col = bigram,
  into = c("word1", "word2"),
  sep = " "
)

# Keep only bigrams in which neither word is a stop word.
#
# Rows with a missing word are dropped first. `NA %in% x` evaluates to FALSE,
# so the stop-word filters on their own let NA rows through, and a later
# unite() of word1/word2 would turn them into a spurious "NA NA" bigram.
# NOTE(review): NA bigrams are expected when the Violations text is empty or
# shorter than two words (behaviour depends on the tidytext version) -- confirm
# against the data.
bigrams_filtered <- bigrams_separated %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# Frequency of each (word1, word2) pair after stop-word removal,
# most frequent pairs first.
bigram_counts <- dplyr::count(bigrams_filtered, word1, word2, sort = TRUE)

# Glue the two word columns back together into a single space-separated
# `bigram` column.
bigrams_united <- unite(
  bigrams_filtered,
  col = bigram,
  word1, word2,
  sep = " "
)

# Count every bigram within each Results value, then compute tf-idf with the
# bigram as the term and the Results value as the document.
bigram_tf_idf <- bind_tf_idf(
  dplyr::count(bigrams_united, Results, bigram),
  term = bigram,
  document = Results,
  n = n
)

# Order rows by descending tf-idf and convert `bigram` to a factor whose
# levels are the reverse of that order (presumably so the highest-scoring
# bigrams end up at the top of a flipped bar chart).
plot_FoodInspectionWords_TF_IDF <- mutate(
  arrange(bigram_tf_idf, desc(tf_idf)),
  bigram = factor(bigram, levels = rev(unique(bigram)))
)

# Bar chart of the highest tf-idf bigrams, filled by inspection result.
#
# `wt = tf_idf` is passed explicitly: without it top_n() ranks by whichever
# column happens to be last in the data frame (and prints a "Selecting by"
# message), so adding or reordering columns would silently change the ranking.
# top_n() keeps ties, so more than 15 rows can be plotted.
plot_FoodInspectionWords_TF_IDF %>%
  top_n(15, wt = tf_idf) %>%
  ggplot(aes(bigram, tf_idf, fill = Results)) +
  geom_col() +
  labs(x = NULL, y = "tf-idf") +
  coord_flip() + # horizontal bars: bigrams run along the vertical axis
  theme_bw()

Therefore, for Out of Business, the most important bigrams (those with the highest tf-idf) are as follows:

  • “comments reflected”,
  • “license 34”,
  • “licensee inspection”,
  • “license 38”