Chapter 61 Modelling using the text2vec package
We create a vocabulary-based document-term matrix (DTM). First we collect the unique terms from all training documents and assign each a unique ID with the create_vocabulary() function, feeding it an iterator built with itoken(). We then prune the vocabulary to reduce the number of terms kept in the matrix.
library(text2vec)
library(magrittr)   # provides the pipe operator %>%

# lower-case the text and replace every non-letter character with a space
prep_fun = function(x) {
  stringr::str_replace_all(tolower(x), "[^[:alpha:]]", " ")
}
tok_fun = word_tokenizer

# iterator over the training documents
it_train = itoken(train$text,
                  preprocessor = prep_fun,
                  tokenizer = tok_fun,
                  ids = train$id,
                  progressbar = FALSE)

# iterator over the test documents, applying the same preprocessing and tokenisation
it_test = test$text %>%
  prep_fun %>%
  tok_fun %>%
  itoken(ids = test$id, progressbar = FALSE)
# number of cross-validation folds used later by cv.glmnet()
NFOLDS = 4

# collect unigrams, bigrams and trigrams from the training iterator
vocab = create_vocabulary(it_train, ngram = c(1L, 3L))
# drop rare and overly common terms and cap the vocabulary size
vocab = vocab %>%
  prune_vocabulary(term_count_min = 10,
                   doc_proportion_max = 0.5,
                   doc_proportion_min = 0.01,
                   vocab_term_max = 5000)

trigram_vectorizer = vocab_vectorizer(vocab)
dtm_train = create_dtm(it_train, trigram_vectorizer)
61.1 Inspect the vocabulary
vocab
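The vocabulary is a data frame with term, term_count and doc_count columns, so the most frequent n-grams can be listed directly. A quick sketch, assuming the pruned vocab object created above:
# ten most frequent terms remaining after pruning
head(vocab[order(vocab$term_count, decreasing = TRUE), ], 10)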
61.2 Inspect the Document Term Matrix
dim(dtm_train)
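create_dtm() returns a sparse matrix from the Matrix package, so most cells are zero. A quick way to check its density (fraction of non-zero cells), offered as a sketch rather than part of the original pipeline:
library(Matrix)
# fraction of non-zero entries in the document-term matrix
nnzero(dtm_train) / prod(dim(dtm_train))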
61.3 TF-IDF
TF-IDF re-weights the raw counts so that terms appearing in many documents are down-weighted and terms distinctive to a document are up-weighted.
# define the tf-idf model
tfidf = TfIdf$new()
# fit the model to the training DTM and transform it in one step
dtm_train_tfidf = fit_transform(dtm_train, tfidf)
# note: tfidf itself is modified (fitted) by the fit_transform() call
# apply the fitted tf-idf transformation to the test data
dtm_test_tfidf = create_dtm(it_test, trigram_vectorizer)
dtm_test_tfidf = transform(dtm_test_tfidf, tfidf)
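Because the same trigram_vectorizer built the train and test DTMs, both matrices share one column space; a brief sanity check (a sketch, not part of the original code):
dim(dtm_test_tfidf)
# the classifier can only be applied if the columns line up
identical(colnames(dtm_train_tfidf), colnames(dtm_test_tfidf))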
61.4 Build the Multinomial Logistic Regression Model
With family = 'multinomial' and alpha = 1, cv.glmnet() fits a lasso-penalised multinomial logistic regression and selects the regularisation strength by cross-validation.
library(glmnet)
glmnet_classifier = cv.glmnet(x = dtm_train_tfidf,
                              y = train[['Sentiment']],
                              family = 'multinomial',
                              alpha = 1,                # lasso penalty
                              type.measure = "class",   # misclassification error
                              nfolds = NFOLDS,
                              thresh = 1e-3,            # looser convergence tolerance for speed
                              maxit = 1e3)              # iteration cap for speed
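The fitted cv.glmnet object stores the cross-validation curve, so the chosen penalty and the best misclassification error can be inspected; a minimal sketch using standard cv.glmnet fields:
plot(glmnet_classifier)
# lowest cross-validated misclassification error and the lambda that achieves it
min(glmnet_classifier$cvm)
glmnet_classifier$lambda.min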
61.5 Predict using the Multinomial Logistic Regression Model
# predict() returns a one-column character matrix, so take its first column as a vector
predictions = data.frame(PhraseId  = test$PhraseId,
                         Sentiment = predict(glmnet_classifier, dtm_test_tfidf, type = "class")[, 1])
options(scipen = 999)
predictions$PhraseId  = as.numeric(predictions$PhraseId)
predictions$Sentiment = as.numeric(predictions$Sentiment)
write.csv(predictions, 'glmnet.csv', row.names = FALSE)
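Before submitting the file, it is worth glancing at its first rows and at the distribution of predicted classes; a quick check, assuming the predictions data frame built above:
head(predictions)
# how the predicted sentiment labels are distributed
table(predictions$Sentiment)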