Chapter 61 Modelling using the text2vec package

We create a vocabulary-based document-term matrix (DTM). First we collect the unique terms from all documents and assign each of them a unique ID using the create_vocabulary() function, which consumes the documents through an iterator. We then prune the vocabulary to reduce the number of terms (columns) in the matrix.

# Text preprocessor: lower-cases the input and replaces every character that
# is not alphabetic with a single space (one space per replaced character).
# `x` is a character vector; the result has the same length as `x`.
prep_fun <- function(x) {
  lowered <- tolower(x)
  stringr::str_replace_all(lowered, "[^[:alpha:]]", " ")
}

# Tokenizer used for both the training and the test iterators.
tok_fun <- word_tokenizer

# Token iterator over the training texts. Preprocessing and tokenization are
# handed to itoken() as functions, and each document is keyed by train$id.
it_train <- itoken(
  train$text,
  ids = train$id,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  progressbar = FALSE
)



# Token iterator over the test texts. Here the texts are preprocessed and
# tokenized up front, and the resulting tokens are passed to itoken().
it_test <- itoken(
  tok_fun(prep_fun(test$text)),
  ids = test$id,
  progressbar = FALSE
)


# Number of cross-validation folds (used when fitting the classifier).
NFOLDS <- 4

# Vocabulary of 1- to 3-grams collected from the training iterator.
vocab <- create_vocabulary(it_train, ngram = c(1L, 3L))

# Prune the vocabulary: drop rare terms, terms that appear in too many or too
# few documents, and cap the total number of terms kept.
vocab <- prune_vocabulary(
  vocab,
  term_count_min = 10,
  doc_proportion_max = 0.5,
  doc_proportion_min = 0.01,
  vocab_term_max = 5000
)

# Vectorizer that maps tokens onto the pruned vocabulary.
trigram_vectorizer <- vocab_vectorizer(vocab)

# Document-term matrix for the training documents.
dtm_train <- create_dtm(it_train, trigram_vectorizer)

61.1 Inspect the vocabulary

# Auto-print the vocabulary (already pruned by prune_vocabulary() above).
vocab

61.2 Inspect the Document Term Matrix

# Dimensions of the training DTM returned by create_dtm().
dim(dtm_train)

61.3 TF-IDF

# TF-IDF weighting model.
tfidf <- TfIdf$new()

# Fit the model on the training DTM and get the re-weighted training matrix
# in one step. Note: fit_transform() modifies `tfidf` itself, so the fitted
# model is available for the test data below.
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)

# Build the test DTM with the training vectorizer, then apply the already
# fitted TF-IDF transformation to it (the model is not re-fitted on test data).
dtm_test_tfidf <- transform(
  create_dtm(it_test, trigram_vectorizer),
  tfidf
)

61.4 Build the Multinomial Logistic Regression Model

# Cross-validated multinomial logistic regression on the TF-IDF features.
#   alpha = 1             -> lasso (L1) penalty
#   type.measure = "class" -> folds are scored by misclassification error
#   thresh / maxit        -> convergence threshold and iteration cap passed
#                            through to the underlying glmnet fit
glmnet_classifier <- cv.glmnet(
  x = dtm_train_tfidf,
  y = train[["Sentiment"]],
  family = "multinomial",
  type.measure = "class",
  alpha = 1,
  nfolds = NFOLDS,
  thresh = 1e-3,
  maxit = 1e3
)

61.5 Predict using the Multinomial Logistic Regression Model

# Predict a sentiment class for every test phrase and build the submission
# table with numeric `PhraseId` and `Sentiment` columns.
#
# predict(..., type = "class") returns a one-column matrix of class labels.
# Passing that matrix straight to data.frame() makes the column take the
# matrix's auto-generated name (e.g. `X1`), which differs between glmnet
# versions and previously had to be renamed. Flattening it with as.vector()
# keeps the `Sentiment` name, and converting the labels from character avoids
# turning factor level codes into numbers when strings are read as factors.
predictions <- data.frame(
  PhraseId = as.numeric(test$PhraseId),
  Sentiment = as.numeric(as.vector(
    predict(glmnet_classifier, dtm_test_tfidf, type = "class")
  ))
)

# Presumably set so that large ids are not written in scientific notation
# (e.g. 1e+05) in the CSV below.
options(scipen = 999)

write.csv(predictions, "glmnet.csv", row.names = FALSE)