Chapter 84 Modelling with XGBoost

We try to predict whether the inspection result is "Out of Business" or not. For this we use the following features:
* Curated list of words in the Violations Text. Each word is a feature for the Model.
* Latitude
* Longitude
* Year of Inspection
* Month of Inspection

For ease of execution, we have taken only 5000 samples for the modelling exercise.

We do cross-validation using the caret package.

You can tune the parameters on your own machine for better results. The accuracy obtained with these parameters is quite good: 0.9166024.

Lastly we wish to examine the feature importance of the variables. This is shown in the flipped bar chart.

# Take a 5,000-row sample so the modelling exercise runs quickly.
FoodInspectionsSample <- FoodInspections %>%
  sample_n(5e3)

# Build a text corpus, one document per inspection's violation text.
corpus <- Corpus(VectorSource(FoodInspectionsSample$Violations))

# Pre-process data.
# NOTE: with tm >= 0.6, functions that are not native tm transformations
# (such as base::tolower) must be wrapped in content_transformer();
# otherwise the corpus content is silently replaced by plain character
# vectors and DocumentTermMatrix() fails downstream.
corpus <- tm_map(corpus, content_transformer(tolower))

corpus <- tm_map(corpus, removePunctuation)

# Drop English stop words plus the boilerplate token "comments".
corpus <- tm_map(corpus, removeWords, c(stopwords("english"), "comments"))

# Drop the 50 lowest-IDF (most ubiquitous, least informative) terms
# identified earlier in the analysis.
corpus <- tm_map(corpus, removeWords, UniqueLowIDF[1:50])

# Stem words so inflected forms share a single feature column.
corpus <- tm_map(corpus, stemDocument)

# One row per document, one column per term.
dtm <- DocumentTermMatrix(corpus)


# Remove sparse terms
# Remove sparse terms: keep only terms appearing in at least ~0.3% of
# documents, which bounds the number of word features.
dtm <- removeSparseTerms(dtm, 0.997)

# Create a data frame of per-document term counts.
labeledTerms <- as.data.frame(as.matrix(dtm))

# Derive the target and the date features in one pipeline. The inspection
# date is parsed once and reused (the original parsed it twice with mdy()).
# isOOB is 1 when the inspection result is "Out of Business", else 0.
FoodInspectionsSample <- FoodInspectionsSample %>%
  dplyr::rename(InspectionDate = `Inspection Date`) %>%
  mutate(ParsedDate = mdy(InspectionDate),
         yr = year(ParsedDate),
         month_of_year = month(ParsedDate),
         isOOB = 0) %>%
  mutate(isOOB = replace(isOOB, Results == "Out of Business", 1)) %>%
  as.data.frame()

# Attach the target and the non-text features to the term matrix.
labeledTerms$isOOB <- as.factor(FoodInspectionsSample$isOOB)
labeledTerms$Latitude <- as.numeric(FoodInspectionsSample$Latitude)
labeledTerms$Longitude <- as.numeric(FoodInspectionsSample$Longitude)
labeledTerms$Year <- as.numeric(FoodInspectionsSample$yr)
labeledTerms$Month <- as.numeric(FoodInspectionsSample$month_of_year)



## Preparing the features for the XGBoost Model
## xgboost needs purely numeric input, so encode every factor or character
## column as the integer code of its levels (in order of first appearance).

features <- colnames(labeledTerms)

for (f in features) {
  col <- labeledTerms[[f]]
  # is.factor()/is.character() instead of class(x) == "factor": the latter
  # yields a length-2 logical for ordered factors (class c("ordered",
  # "factor")), which is an error as an if() condition in R >= 4.2.
  if (is.factor(col) || is.character(col)) {
    lvls <- unique(col)  # renamed: "levels" shadowed base::levels
    labeledTerms[[f]] <- as.numeric(factor(col, levels = lvls))
  }
}


## Creating the XGBoost Model

# The encoding loop above turned the target numeric as well; restore it to
# a factor so caret treats this as a classification problem.
labeledTerms$isOOB <- as.factor(labeledTerms$isOOB)

# Predict isOOB from every other column. (Renamed from "formula", which
# shadowed stats::formula.)
modelFormula <- isOOB ~ .

# Three-fold cross-validation.
fitControl <- trainControl(method = "cv", number = 3)

# A single fixed hyper-parameter combination; tune on your own machine
# for better results.
xgbGrid <- expand.grid(
  nrounds = 10,
  max_depth = 3,
  eta = 0.05,
  gamma = 0,
  colsample_bytree = 0.8,
  min_child_weight = 1,
  subsample = 1
)

# Fix the RNG so the CV folds are reproducible.
set.seed(13)

OOBXGB <- train(
  modelFormula,
  data = labeledTerms,
  method = "xgbTree",
  trControl = fitControl,
  tuneGrid = xgbGrid,
  na.action = na.pass
)

# Extract per-variable importance from the fitted model. For a caret
# train object the first element of the varImp() result is a data frame
# with an Overall score per variable.
importance <- varImp(OOBXGB)

varImportance <- data.frame(
  Variables = row.names(importance[[1]]),
  Importance = round(importance[[1]]$Overall, 2)
)

# Create a rank variable based on importance and keep the top 20 rows
# for plotting.
rankImportance <- varImportance %>%
  mutate(Rank = paste0("#", dense_rank(desc(Importance)))) %>%
  head(20)

rankImportancefull <- rankImportance

# Flipped bar chart of relative variable importance, with each variable's
# rank printed inside its bar.
ggplot(rankImportance,
       aes(x = reorder(Variables, Importance), y = Importance)) +
  geom_bar(stat = "identity", colour = "white", fill = fillColor) +
  geom_text(aes(x = Variables, y = 1, label = Rank),
            hjust = 0, vjust = 0.5, size = 4, colour = "black",
            fontface = "bold") +
  labs(x = "Variables", title = "Relative Variable Importance") +
  coord_flip() +
  theme_bw()

All the factors affecting the inspection result, along with their ranks, are provided below.