Chapter 75 TF-IDF Bigrams
# Tokenize the Violations text of every inspection into two-word sequences
# (bigrams), producing one row per bigram in a new `bigram` column.
FoodInspectionWordsBiGram <- unnest_tokens(
  FoodInspectionsReduced,
  output = bigram,
  input = Violations,
  token = "ngrams",
  n = 2
)
# Break each bigram into its two component words (`word1`, `word2`) so that
# each word can be examined on its own.
bigrams_separated <- separate(
  FoodInspectionWordsBiGram,
  col = bigram,
  into = c("word1", "word2"),
  sep = " "
)
# Keep only bigrams in which neither word is a stop word.
#
# Rows for which the tokenizer produced no bigram (e.g. missing or one-word
# Violations text) reach this point with NA in both word columns. NA is not
# in `stop_words$word`, so without the explicit `is.na()` guards those rows
# would pass the stop-word test and later be united into a spurious "NA NA"
# bigram that is counted and scored like a real term.
bigrams_filtered <- bigrams_separated %>%
  filter(
    !is.na(word1),
    !is.na(word2),
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  )
# Frequency of every remaining (word1, word2) pair, most frequent first.
bigram_counts <- dplyr::count(bigrams_filtered, word1, word2, sort = TRUE)
# Join the two word columns back into a single space-separated `bigram`
# column now that stop-word filtering is done.
bigrams_united <- unite(
  bigrams_filtered,
  col = bigram,
  word1, word2,
  sep = " "
)
# Count each bigram within each inspection result, then attach the tf, idf
# and tf-idf columns. Each distinct `Results` value plays the role of a
# "document" and each bigram the role of a "term".
bigram_tf_idf <- bind_tf_idf(
  dplyr::count(bigrams_united, Results, bigram),
  term = bigram,
  document = Results,
  n = n
)
# Order rows from highest to lowest tf-idf, then convert `bigram` to a factor
# whose levels run in the reverse of that order, so the highest-scoring
# bigram is the last level (and ends up at the top of the flipped bar chart).
plot_FoodInspectionWords_TF_IDF <- mutate(
  arrange(bigram_tf_idf, desc(tf_idf)),
  bigram = factor(bigram, levels = rev(unique(bigram)))
)
# Horizontal bar chart of the 15 highest tf-idf bigrams (more if there are
# ties), coloured by inspection result.
#
# `wt = tf_idf` is passed explicitly: without it top_n() ranks by whichever
# column happens to be last in the data frame and prints a "Selecting by ..."
# message, so the chart would silently change if a column were ever added.
plot_FoodInspectionWords_TF_IDF %>%
  top_n(15, wt = tf_idf) %>%
  ggplot(aes(x = bigram, y = tf_idf, fill = Results)) +
  geom_col() +
  labs(x = NULL, y = "tf-idf") +
  coord_flip() +
  theme_bw()
Therefore, for Out of Business, the most important bigrams are as follows:
- “comments reflected”,
- “license 34”,
- “licensee inspection”,
- “license 38”