Chapter 46 Relationship among words

Until now, we have explored the most important words for each character. Next, we will explore the relationships between words.

# Count adjacent word pairs (bigrams) in the `normalized_text` column.
#
# Args:
#   dataset: a data frame with a `normalized_text` column.
#
# Returns:
#   One row per (word1, word2) pair with its count `n`, sorted by descending
#   count. Pairs in which either word appears in `stop_words$word` are removed.
count_bigrams <- function(dataset) {
  dataset %>%
    unnest_tokens(bigram, normalized_text, token = "ngrams", n = 2) %>%
    # Texts with fewer than two tokens produce an NA bigram; drop those rows
    # so they are not counted as a spurious NA/NA pair.
    filter(!is.na(bigram)) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(
      !word1 %in% stop_words$word,
      !word2 %in% stop_words$word
    ) %>%
    dplyr::count(word1, word2, sort = TRUE)
}


# Draw a directed network plot from a bigram count table.
#
# Args:
#   bigrams: a data frame of edges; its first two columns are used as the
#     "from" and "to" nodes and its `n` column drives edge transparency.
#
# Returns:
#   The ggraph plot object.
visualize_bigrams <- function(bigrams) {
  # Seed the RNG before the layout is computed so repeated calls are
  # intended to produce the same picture.
  set.seed(2016)

  arrow_head <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  word_graph <- graph_from_data_frame(bigrams)

  ggraph(word_graph, layout = "fr") +
    geom_edge_link(
      aes(edge_alpha = n),
      show.legend = FALSE,
      arrow = arrow_head
    ) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}

# Draw a directed network plot for the bigrams around a single word.
#
# Same drawing as visualize_bigrams(), except that each edge is given an
# `end_cap` of circle(.07, 'inches') -- presumably so arrow heads stop short
# of the target node instead of being hidden under it.
#
# Args:
#   bigrams: a data frame of edges; its first two columns are used as the
#     "from" and "to" nodes and its `n` column drives edge transparency.
#
# Returns:
#   The ggraph plot object.
visualize_bigrams_individual <- function(bigrams) {
  # Seed the RNG before the layout is computed so repeated calls are
  # intended to produce the same picture.
  set.seed(2016)

  arrow_head <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  word_graph <- graph_from_data_frame(bigrams)

  ggraph(word_graph, layout = "fr") +
    geom_edge_link(
      aes(edge_alpha = n),
      show.legend = FALSE,
      arrow = arrow_head,
      end_cap = circle(.07, 'inches')
    ) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}

# Bigram counts for the whole script data set.
SCWords <- count_bigrams(SC)

# Plot only the pairs that occur more than 50 times.
visualize_bigrams(filter(SCWords, n > 50))

The graph above shows which words follow which other words. For example, the word "dont" is followed by many words, such as:

care
forget
understand
worry
mind
wanna

Certain words follow themselves, such as "bye" and "hee", because some script lines contain "bye bye" and "hee hee" respectively.

46.1 Dont word network graph

# Collect the bigram counts that involve a chosen word.
#
# Args:
#   SC: data frame passed to count_bigrams().
#   word1Value: keep bigrams whose first word equals this value.
#   word2Value: keep bigrams whose second word equals this value.
#
# Returns:
#   The rows of count_bigrams(SC) that satisfy either condition: first-word
#   matches first, then second-word matches. A bigram that satisfies both
#   (e.g. "dont dont" when both values are "dont") appears only once.
individual_words_bigrams <- function(SC, word1Value, word2Value) {
  # Count once and reuse the table for both filters.
  bigram_counts <- count_bigrams(SC)

  first_word_matches <- filter(bigram_counts, word1 == word1Value)
  second_word_matches <- filter(bigram_counts, word2 == word2Value)

  # distinct() keeps the first occurrence, so row order is unchanged and a
  # bigram matched by both filters does not become a duplicate edge.
  dplyr::bind_rows(first_word_matches, second_word_matches) %>%
    distinct()
}


# Network of the bigrams that start or end with "dont", keeping only the
# pairs that occur more than 20 times.
visualize_bigrams_individual(
  filter(individual_words_bigrams(SC, "dont", "dont"), n > 20)
)