Chapter 11 Versions of the Hidden Gems

We investigate how many versions each Hidden Gem has. The plots below show the distribution of the maximum version number per Hidden Gem.

  • The smallest Maximum Version Number is 1

  • The largest Maximum Version Number is 362

  • The median Maximum Version Number is 11

The 95% confidence interval for the mean Maximum Version Number of a Hidden Gem is approximately 16 to 24.

# Attach every version row to its Hidden Gem kernel
# (kvcs$KernelId is matched against kernel_versions$ScriptId).
kvcs_versions <- left_join(
  kvcs,
  kernel_versions,
  by = c("KernelId" = "ScriptId")
)

# Highest version number per kernel. If a kernel has no non-missing
# VersionNumber, max(..., na.rm = TRUE) returns -Inf with a warning; those
# rows are removed by the `> 0` filter at the end of this chunk.
kvcs_versions_max_df <- kvcs_versions %>%
  group_by(KernelId) %>%
  summarise(MaxVersionNumber = max(VersionNumber, na.rm = TRUE))

# Join on the key explicitly instead of on whichever columns the two tables
# happen to share (which also silences the "Joining with `by = ...`" message).
kvcs_versions_info <- left_join(kvcs, kvcs_versions_max_df, by = "KernelId")

# Keep only kernels with a positive maximum version number
# (filter() also drops rows where the comparison is NA).
kvcs_versions_info <- kvcs_versions_info %>%
  filter(MaxVersionNumber > 0)

11.1 Box Plot [ Removing Outliers ]

# Box plot of the maximum version number per Hidden Gem. Only values below 40
# are plotted, which is the "removing outliers" cut-off for this figure.
kvcs_versions_info %>%
  filter(!is.na(MaxVersionNumber), MaxVersionNumber < 40) %>%
  ggplot(aes(x = MaxVersionNumber)) +
  # The fill is a fixed colour, so it is set on the geom. Inside aes() the
  # constant would be treated as a one-level data value and the default
  # palette would be used instead of the colour itself.
  # NOTE(review): assumes fillColor2 (defined outside this section) holds a
  # colour string -- confirm.
  geom_boxplot(fill = fillColor2) +
  labs(
    x = ' [MaxVersionNumber]',
    y = ' [Count]',
    title = paste("Distribution of", ' MaxVersionNumber ')
  ) +
  # No aesthetic is mapped any more, so there is no legend to suppress.
  theme_fivethirtyeight()

11.2 Density Plot

# Kernel density estimate of the maximum version number per Hidden Gem.
kvcs_versions_info %>%
  filter(!is.na(MaxVersionNumber)) %>%
  ggplot(aes(x = MaxVersionNumber)) +
  # NOTE(review): bw = 0.01 is a very narrow bandwidth for whole-number
  # version counts -- confirm this is intended.
  geom_density(fill = "orange", bw = 0.01) +
  labs(
    x = ' [MaxVersionNumber]',
    y = ' [Count]',
    title = paste("Distribution of", ' MaxVersionNumber ')
  ) +
  # The former guides(fill = guide_legend(...)) call was removed: fill is a
  # fixed colour here, not a mapped aesthetic, so there is no fill legend for
  # it to configure.
  theme_fivethirtyeight()

11.3 Histogram Plot

# Histogram of the maximum version number per Hidden Gem.
kvcs_versions_info %>%
  filter(!is.na(MaxVersionNumber)) %>%
  ggplot(aes(x = MaxVersionNumber)) +
  # The fill is a fixed colour, so it is set on the geom. Mapping the constant
  # inside aes() ignores the colour itself and adds a meaningless legend.
  # NOTE(review): assumes fillColor2 (defined outside this section) holds a
  # colour string -- confirm.
  # bins = 30 is ggplot2's default, stated explicitly so the bin count is a
  # visible choice and the "Pick better value" message is not emitted.
  geom_histogram(fill = fillColor2, bins = 30) +
  labs(
    x = 'Maximum Version Number',
    y = 'Count',
    title = paste("Distribution of", ' Maximum Version Number ')
  ) +
  theme_fivethirtyeight()

11.4 Summary Statistics for Maximum Version Number

# Restrict to kernels with a positive maximum version number; rows where the
# comparison evaluates to NA are dropped by filter() as well.
kvcs_versions_info <- filter(kvcs_versions_info, MaxVersionNumber > 0)

# Minimum, quartiles, mean and maximum of the per-kernel maximum version
summary(kvcs_versions_info[["MaxVersionNumber"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    5.00   11.00   19.76   22.00  362.00

11.5 95% Confidence Interval for the Mean Maximum Version Number of Hidden Gems

# Fit an intercept-only linear model: the single coefficient is the sample
# mean of MaxVersionNumber, and the fit carries its standard error.
l.model <- lm(
  formula = MaxVersionNumber ~ 1,
  data = kvcs_versions_info
)

# 95% confidence interval for that intercept (i.e. for the mean)
confint(object = l.model, level = 0.95)
##                2.5 %   97.5 %
## (Intercept) 15.89662 23.63077