Chapter 23 Modelling using the text2vec package
We create a vocabulary-based DTM. Using an iterator over the training documents, the create_vocabulary() function collects the unique terms from all documents and assigns each term a unique ID. We then prune the vocabulary to reduce the number of terms in the matrix.
# Preprocess raw text for tokenization: lowercase everything and replace
# every non-alphabetic character with a space. Vectorized over x.
#
# Base gsub() is equivalent to stringr::str_replace_all() for this POSIX
# class pattern (ASCII input), so the stringr dependency is unnecessary.
prep_fun <- function(x) {
  gsub("[^[:alpha:]]", " ", tolower(x))
}
# Tokenizer: text2vec::word_tokenizer splits each document into words.
tok_fun = word_tokenizer
# Training iterator: preprocessing and tokenization are passed as functions
# and applied lazily as the iterator is consumed (by create_vocabulary()
# and create_dtm() below).
it_train = itoken(train$text,
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = train$id,
progressbar = FALSE)
# Test iterator: here preprocessing and tokenization are applied eagerly
# through the pipe before building the iterator — same result as the
# lazy form above, just a different construction style.
it_test = test$text %>%
prep_fun %>%
tok_fun %>%
itoken(ids = test$id, progressbar = FALSE)
# Number of cross-validation folds used later by cv.glmnet().
NFOLDS <- 4

# Collect the unique 1- to 3-grams from the training documents; each term
# gets a unique ID, plus corpus-wide term and document counts.
vocab <- create_vocabulary(it_train, ngram = c(1L, 3L))

# Prune the vocabulary to shrink the DTM:
#  - term_count_min:      drop terms seen fewer than 10 times overall
#  - doc_proportion_max:  drop terms appearing in more than 50% of docs
#  - doc_proportion_min:  drop terms appearing in fewer than 1% of docs
vocab <- prune_vocabulary(vocab,
                          term_count_min = 10,
                          doc_proportion_max = 0.5,
                          doc_proportion_min = 0.01)

# Vectorizer maps tokens to vocabulary IDs; reused for train AND test so
# both DTMs share the same column space.
trigram_vectorizer <- vocab_vectorizer(vocab)
dtm_train <- create_dtm(it_train, trigram_vectorizer)
dtm_test <- create_dtm(it_test, trigram_vectorizer)
23.1 Inspect the vocabulary
# Print the pruned vocabulary: 329 surviving n-gram terms (mix of unigrams
# and underscore-joined n-grams such as "over_the"), sorted by term_count.
vocab
## Number of docs: 19579
## 0 stopwords: ...
## ngram_min = 1; ngram_max = 3
## Vocabulary:
## term term_count doc_count
## 1: black 198 196
## 2: until 200 200
## 3: over_the 201 198
## 4: spirit 202 198
## 5: itself 203 202
## ---
## 325: was 6647 5493
## 326: in 9458 7101
## 327: a 10750 7507
## 328: i 10811 7075
## 329: to 12843 8665
23.2 Inspect the Document Term Matrix
# Sanity check: one row per training document (19579), one column per
# vocabulary term (329).
dim(dtm_train)
## [1] 19579 329
23.3 Build the Multinomial Logistic Regression Model
# Prepend the document length as an extra numeric feature column.
# NOTE: Matrix::cBind() is defunct (removed from the Matrix package);
# base cbind() has handled S4 sparse matrices since R 3.2.0.
dtm_train <- cbind(train$len, dtm_train)
dtm_test <- cbind(test$len, dtm_test)

# L1-penalized (alpha = 1, lasso) multinomial logistic regression with
# NFOLDS-fold cross-validation, selecting lambda by misclassification
# rate ("class"). The loose convergence threshold and iteration cap
# trade a little accuracy for a much faster fit.
glmnet_classifier <- cv.glmnet(x = dtm_train, y = train[['author']],
                               family = 'multinomial',
                               alpha = 1,
                               type.measure = "class",
                               nfolds = NFOLDS,
                               thresh = 1e-3,
                               maxit = 1e3)
23.4 Predict using the Multinomial Logistic Regression Model
# Predict per-class probabilities for the test documents and assemble the
# submission data frame: one id column plus one probability column per
# author, in the class order returned by the multinomial model.
author_probs <- predict(glmnet_classifier, dtm_test, type = 'response')
preds <- data.frame(id = test$id, author_probs)
# Rename the three probability columns to the expected author codes.
names(preds)[2:4] <- c("EAP", "HPL", "MWS")
# Write the benchmark submission file.
write_csv(preds, "glmnet_benchmark_vocab_3N-grams.csv")