Chapter 23 Modelling using the text2vec package

We create a vocabulary-based document-term matrix (DTM). First we collect the unique terms from all documents and assign each of them a unique ID using the create_vocabulary() function, which builds the vocabulary from an iterator over the tokenized documents. We then prune the vocabulary to reduce the number of terms (columns) in the matrix.

# Normalise raw text before tokenization: lower-case the input, then turn
# every non-alphabetic character into a single space (one space per replaced
# character, so runs of punctuation/digits become runs of spaces).
# Vectorised over `x`.
prep_fun <- function(x) {
  lowered <- tolower(x)
  stringr::str_replace_all(
    string = lowered,
    pattern = "[^[:alpha:]]",
    replacement = " "
  )
}

# Tokenizer shared by the train and test iterators.
tok_fun <- word_tokenizer

# Training iterator: itoken() applies the preprocessor and tokenizer itself.
it_train <- itoken(
  train$text,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = train$id,
  progressbar = FALSE
)

# Test iterator: the text is preprocessed and tokenized up front with the
# same two functions, and the resulting tokens are handed to itoken().
it_test <- itoken(
  tok_fun(prep_fun(test$text)),
  ids = test$id,
  progressbar = FALSE
)


# Number of cross-validation folds used when fitting the model below.
NFOLDS <- 4

# Build a vocabulary of 1- to 3-grams from the training iterator, then prune
# it: keep terms occurring at least 10 times overall, appearing in at least
# 1% of documents and in at most 50% of documents.
vocab <- it_train %>%
  create_vocabulary(ngram = c(1L, 3L)) %>%
  prune_vocabulary(
    term_count_min = 10,
    doc_proportion_max = 0.5,
    doc_proportion_min = 0.01
  )

# Vectorizer that maps tokens onto the pruned vocabulary.
trigram_vectorizer <- vocab_vectorizer(vocab)

# Document-term matrices for train and test, built with the same vectorizer
# so that both share the same columns.
dtm_train <- it_train %>% create_dtm(trigram_vectorizer)
dtm_test <- it_test %>% create_dtm(trigram_vectorizer)

23.1 Inspect the vocabulary

# Print the pruned vocabulary: each term with its term count and document count.
vocab
## Number of docs: 19579 
## 0 stopwords:  ... 
## ngram_min = 1; ngram_max = 3 
## Vocabulary: 
##          term term_count doc_count
##   1:    black        198       196
##   2:    until        200       200
##   3: over_the        201       198
##   4:   spirit        202       198
##   5:   itself        203       202
##  ---                              
## 325:      was       6647      5493
## 326:       in       9458      7101
## 327:        a      10750      7507
## 328:        i      10811      7075
## 329:       to      12843      8665

23.2 Inspect the Document Term Matrix

# Dimensions of the training DTM: documents (rows) x vocabulary terms (columns).
dim(dtm_train)
## [1] 19579   329

23.3 Build the Multinomial Logistic Regression Model

# Prepend the `len` column of each data set to its document-term matrix as an
# extra feature (presumably a text-length feature computed earlier -- confirm
# where `len` is defined).
# NOTE: Matrix::cBind() was deprecated and later made defunct; base cbind()
# dispatches to the Matrix methods (R >= 3.2.0).
dtm_train <- cbind(train$len, dtm_train)
dtm_test <- cbind(test$len, dtm_test)

# Cross-validated multinomial logistic regression with an L1 (lasso) penalty
# (alpha = 1). Folds are scored by misclassification error
# (type.measure = "class"). The loose convergence threshold and the iteration
# cap are the values chosen in the original code.
glmnet_classifier <- cv.glmnet(
  x = dtm_train,
  y = train[["author"]],
  family = "multinomial",
  alpha = 1,
  type.measure = "class",
  nfolds = NFOLDS,
  thresh = 1e-3,
  maxit = 1e3
)

23.4 Predict using the Multinomial Logistic Regression Model

# Class probabilities for the test set, one row per test id.
preds <- data.frame(
  id = test$id,
  predict(glmnet_classifier, dtm_test, type = "response")
)

# Label the three probability columns with the author codes.
# NOTE(review): assumes predict() returns the classes in the order
# EAP, HPL, MWS -- confirm against the levels of train$author.
names(preds)[2:4] <- c("EAP", "HPL", "MWS")

# Write the predictions to a CSV file.
write_csv(preds, "glmnet_benchmark_vocab_3N-grams.csv")