Chapter 61 Modelling using the text2vec package
We create a vocabulary-based document-term matrix (DTM). First we collect the unique terms from all documents and assign each of them a unique ID using the create_vocabulary() function, which reads the documents through an iterator. We then prune the vocabulary to reduce the number of terms in the matrix.
# Clean raw text before tokenisation: lower-case the input, then replace every
# non-alphabetic character with a single space (runs are not collapsed).
prep_fun <- function(x) {
  lowered <- tolower(x)
  stringr::str_replace_all(lowered, "[^[:alpha:]]", " ")
}
# Tokenizer applied to each document after prep_fun has cleaned it.
tok_fun <- word_tokenizer

# Token iterators over the training and test texts. Both are built the same
# way, so cleaning and tokenisation are applied lazily and identically to
# each split.
# NOTE(review): the ids come from an `id` column, while the prediction step
# reads `test$PhraseId`. Confirm `train`/`test` really have an `id` column;
# `$` on a missing data.frame column yields NULL, and itoken() would then
# fall back to default ids.
it_train <- itoken(
  train$text,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = train$id,
  progressbar = FALSE
)
it_test <- itoken(
  test$text,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = test$id,
  progressbar = FALSE
)

# Number of cross-validation folds used when the classifier is fitted.
NFOLDS <- 4

# Vocabulary of 1- to 3-grams observed in the training texts.
vocab <- create_vocabulary(it_train, ngram = c(1L, 3L))

# Drop rare and overly common terms, then cap the vocabulary size:
#   term_count_min     - a term must occur at least 10 times overall
#   doc_proportion_max - and appear in at most 50% of documents
#   doc_proportion_min - and in at least 1% of documents
#   vocab_term_max     - keep at most 5000 terms
vocab <- prune_vocabulary(
  vocab,
  term_count_min = 10,
  doc_proportion_max = 0.5,
  doc_proportion_min = 0.01,
  vocab_term_max = 5000
)

# Map the pruned vocabulary onto DTM columns and build the training DTM.
trigram_vectorizer <- vocab_vectorizer(vocab)
dtm_train <- create_dtm(it_train, trigram_vectorizer)

61.1 Inspect the vocabulary
vocab

61.2 Inspect the Document Term Matrix
dim(dtm_train)

61.3 TF-IDF
# Define the TF-IDF model.
tfidf <- TfIdf$new()

# Fit the model on the training DTM and transform that DTM in one step.
# fit_transform() modifies `tfidf` itself, so the object now carries the
# weights learned from the training data.
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)

# Build the test DTM with the training vectorizer, then apply the already
# fitted TF-IDF transformation to it (no re-fitting on test data).
dtm_test_tfidf <- transform(create_dtm(it_test, trigram_vectorizer), tfidf)

61.4 Build the Multinomial Logistic Regression Model
# Cross-validated multinomial logistic regression on the TF-IDF weighted
# training DTM, with the `Sentiment` column of `train` as the response.
glmnet_classifier <- cv.glmnet(
  x = dtm_train_tfidf,
  y = train[["Sentiment"]],
  family = "multinomial",
  alpha = 1,               # L1 (lasso) penalty
  type.measure = "class",  # cross-validate on misclassification error
  nfolds = NFOLDS,
  thresh = 1e-3,           # looser convergence threshold than glmnet's default: faster, less precise
  maxit = 1e3              # fewer iterations than glmnet's default, again for speed
)

61.5 Predict using the Multinomial Logistic Regression Model
# Keep large PhraseId values from being written in scientific notation
# (e.g. 1e+05) in the CSV.
options(scipen = 999)

# predict(type = "class") returns a one-column character matrix. Flatten it
# to a plain vector up front so the result does not depend on the column
# label glmnet gives that matrix (the original relied on data.frame()
# turning it into a column called `X1`), and so the labels are never turned
# into a factor whose level codes would be mistaken for the classes.
predicted_class <- as.vector(
  predict(glmnet_classifier, dtm_test_tfidf, type = "class")
)

# One row per test phrase: its id and the predicted sentiment, both numeric.
predictions <- data.frame(
  PhraseId = as.numeric(test$PhraseId),
  Sentiment = as.numeric(predicted_class)
)

# Write the predictions to glmnet.csv without a row-name column.
write.csv(predictions, "glmnet.csv", row.names = FALSE)