Chapter 20 Modelling with XGBoost

We try to predict which author wrote each line of text.

We perform cross validation using the caret package. Lastly, we examine the feature importance of the variables; this is shown in the flipped bar chart.

We then use the model to predict the authors.
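The code in this chapter assumes that the following packages have already been loaded in earlier chapters; the library calls are repeated here only as a reminder of where the functions used below come from.

library(caret)     # train(), trainControl(), varImp()
library(xgboost)   # backend for method = "xgbTree"
library(dplyr)     # mutate(), dense_rank(), %>%
library(ggplot2)   # importance plot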

makeFeatures <- function(train) {
  
  # Build the document-term matrix for the supplied data
  labeledTerms = makeDTM(train)
  
  ## Preparing the features for the XGBoost Model
  # XGBoost needs numeric inputs, so convert any factor or character
  # columns to integer codes
  features <- colnames(labeledTerms)
  
  for (f in features) {
    if ((class(labeledTerms[[f]]) == "factor") || (class(labeledTerms[[f]]) == "character")) {
      levels <- unique(labeledTerms[[f]])
      labeledTerms[[f]] <- as.numeric(factor(labeledTerms[[f]], levels = levels))
    }
  }
  
  return(labeledTerms)
}
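makeFeatures relies on the makeDTM helper defined in an earlier chapter, which turns the raw lines into a document-term matrix. A rough sketch of what such a helper could look like, assuming the tm package and a text column holding the raw lines, is given below; the exact cleaning steps and the sparsity threshold are assumptions.

# Hypothetical sketch of the makeDTM helper (assumes the tm package
# and a 'text' column with the raw lines)
library(tm)

makeDTM <- function(dat) {
  corpus <- VCorpus(VectorSource(dat$text))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stemDocument)
  
  dtm <- DocumentTermMatrix(corpus)
  dtm <- removeSparseTerms(dtm, 0.997)   # drop very rare terms
  
  as.data.frame(as.matrix(dtm))
}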

labeledTerms = makeFeatures(train)

labeledTermsTest = makeFeatures(test)

# Keep only the terms that appear in both the train and the test
# document-term matrices, so both sets share the same feature space
colnamesSame = intersect(colnames(labeledTerms), colnames(labeledTermsTest))

labeledTerms = labeledTerms[ , (colnames(labeledTerms) %in% colnamesSame)]
labeledTermsTest = labeledTermsTest[ , (colnames(labeledTermsTest) %in% colnamesSame)]

20.1 Add features

We add the following two features to the model:

  • Number of words in the line

  • Sentiment Score per line

# Number of words in each line
labeledTerms$len = train$len
labeledTermsTest$len = test$len

# Sentiment score for each line
labeledTerms$sentiScore = getSentimentScore(train)
labeledTermsTest$sentiScore = getSentimentScore(test)
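The len column and the getSentimentScore function come from earlier chapters. A rough sketch of how they could be implemented, assuming the stringr and syuzhet packages and a text column with the raw lines, is shown below; the helper name countWords is hypothetical.

# Hypothetical sketches (assume stringr, syuzhet and a 'text' column)
library(stringr)
library(syuzhet)

# Number of words in each line: count runs of non-whitespace characters
countWords <- function(dat) {
  str_count(dat$text, "\\S+")
}

# One sentiment score per line, using syuzhet's default lexicon
getSentimentScore <- function(dat) {
  get_sentiment(dat$text, method = "syuzhet")
}

# train$len <- countWords(train)
# test$len  <- countWords(test)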

20.2 Creating the XGBoost Model

# Convert the author column to a factor with syntactically valid level
# names (required by caret when classProbs = TRUE)
labeledTerms$author = as.factor(train$author)
levels(labeledTerms$author) = make.names(levels(labeledTerms$author))

formula = author ~ .

# Please uncomment the block below to do 5-fold cross validation
# fitControl <- trainControl(method="cv",number = 5,classProbs=TRUE, summaryFunction=mnLogLoss)
# 
# xgbGrid <- expand.grid(nrounds = 500,
#                        max_depth = 3,
#                        eta = .05,
#                        gamma = 0,
#                        colsample_bytree = .8,
#                        min_child_weight = 1,
#                        subsample = 1)

fitControl <- trainControl(method="none",classProbs=TRUE, summaryFunction=mnLogLoss)

xgbGrid <- expand.grid(nrounds = 500,
                       max_depth = 3,
                       eta = .05,
                       gamma = 0,
                       colsample_bytree = .8,
                       min_child_weight = 1,
                       subsample = 1)


set.seed(13)

AuthorXGB = train(formula, data = labeledTerms,
                  method = "xgbTree", trControl = fitControl,
                  tuneGrid = xgbGrid, na.action = na.pass,
                  metric = "logLoss", maximize = FALSE)

importance = varImp(AuthorXGB)

varImportance <- data.frame(Variables = row.names(importance$importance), 
                            Importance = round(importance$importance$Overall, 2))

# Create a rank variable based on importance and keep the top 20 variables
rankImportance <- varImportance %>%
  mutate(Rank = paste0('#', dense_rank(desc(Importance)))) %>%
  head(20)

rankImportancefull = rankImportance

# Flipped bar chart of the top variables (fillColor is assumed to be
# defined in an earlier chapter)
ggplot(rankImportance, aes(x = reorder(Variables, Importance), 
                           y = Importance)) +
  geom_bar(stat = 'identity', colour = "white", fill = fillColor) +
  geom_text(aes(x = Variables, y = 1, label = Rank),
            hjust = 0, vjust = .5, size = 4, colour = 'black',
            fontface = 'bold') +
  labs(x = 'Variables', title = 'Relative Variable Importance') +
  coord_flip() + 
  theme_bw()

AuthorXGB
## eXtreme Gradient Boosting 
## 
## 19579 samples
##   850 predictor
##     3 classes: 'EAP', 'HPL', 'MWS' 
## 
## No pre-processing
## Resampling: None
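Finally, as stated at the start of the chapter, we use the fitted model to predict the authors of the test lines. A minimal sketch using caret's predict is given below; the id column of the test set and the submission file name are assumptions.

# Class probabilities for each test line
predictions <- predict(AuthorXGB, newdata = labeledTermsTest,
                       type = "prob", na.action = na.pass)

# Hypothetical submission layout: one probability column per author
submission <- data.frame(id = test$id, predictions)
write.csv(submission, "xgboost_submission.csv", row.names = FALSE)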