The tidymodels team fielded a short survey to gather community feedback on development priorities and possible next steps in 2022. This report summarizes the survey results.

tl;dr

  • Over 600 people responded to our survey (a significant increase from last year).
  • Nearly equal proportions of respondents say they have used tidymodels packages a few times versus many times.
  • About 60% of respondents say they work in industry.
  • The priorities given the most weight by our respondents (across most groups) include supervised feature selection, model fairness metrics, and probability calibration.
  • Priorities involving H2O and spatial analysis were among the most likely to be given zero weight.

Exploring the data

Let’s start by exploring the characteristics of the survey respondents.

library(tidyverse)
library(qualtRics)
library(glue)

# ---- Fetch the survey and reshape it into one row per respondent x priority ----

# Qualtrics identifier of the survey analysed in this report.
survey_id <- "SV_3gtKaK8G1Z1JC50"

# Download the responses, then drop preview runs and unfinished responses.
# NOTE(review): force_request = TRUE presumably bypasses any locally cached
# copy of the survey -- confirm against the qualtRics documentation.
survey_raw <- fetch_survey(survey_id, verbose = FALSE, force_request = TRUE) %>%
  filter(Status != "Survey Preview", Finished)

# Keep the twelve dollar-allocation columns plus the two respondent-profile
# questions (Q1002 and Q12) that the charts below group by.
survey_select <- survey_raw %>%
  select(Q5_1:Q5_12, Q1002, Q12)

metadata_raw <- metadata(survey_id)

# Display text of every choice of question QID2001, as a character vector.
choice_text <- metadata_raw$questions$QID2001$choices %>%
  map_chr("choiceText")

# Question text for the two profile questions.
question_text <- survey_questions(survey_id) %>%
  filter(qname %in% c("Q1002", "Q12"))

# Lookup table from column name (qname) to a readable label (question).
# The choice text is parsed as HTML and the first child of the body's
# <strong> element is used as the label; the profile questions are appended
# underneath.
labels_df <-
  enframe(choice_text) %>%
  transmute(qname = glue("Q5_{name}"),
            question = map(value, xml2::read_html)) %>%
  mutate(question = map(question, xml2::as_list),
         question = map_chr(question, ~.$html$body$strong[[1]])) %>%
  bind_rows(question_text)

# Long format: one row per response and priority, with the dollars allocated.
# The join key is spelled out so the join cannot silently pick up any other
# column the two tables might come to share, and so no "Joining, by = ..."
# message is printed. The free-text "Other" option is excluded here.
tidy_survey <- survey_select %>%
  pivot_longer(Q5_1:Q5_12, names_to = "qname", values_to = "dollars") %>%
  inner_join(labels_df, by = "qname") %>%
  filter(question != "Other")

# Bar chart of the number of responses started on each calendar day.
survey_raw %>%
  count(StartDate = as.Date(StartDate)) %>%
  ggplot(aes(StartDate, n)) +
  geom_col(alpha = 0.8) +
  labs(x = NULL,
       y = "Number of survey responses",
       title = "Survey responses over time",
       # Interpolate inside the template: braces placed outside the string
       # are plain R blocks, not glue placeholders.
       subtitle = glue("There are {nrow(survey_raw)} total responses"))

# Horizontal bar chart of responses per answer to Q1002 (familiarity with
# tidymodels). The subtitle reports the share whose answer contains
# "a few times".
survey_raw %>%
  mutate(Q1002 = fct_relabel(Q1002, str_wrap, width = 20)) %>%
  count(Q1002) %>%
  ggplot(aes(x = n, y = Q1002)) +
  geom_col(alpha = 0.8) +
  scale_x_continuous(expand = c(0, 0)) +
  labs(x = "Number of survey responses",
       y = NULL,
       title = "Familiarity with tidymodels",
       subtitle = glue(
         "Of the respondents, ",
         # scales:: is spelled out because no visible library() call attaches
         # scales. na.rm = TRUE keeps one unanswered question from turning
         # the whole share into NA.
         scales::percent(
           mean(str_detect(survey_raw$Q1002, "a few times"), na.rm = TRUE)
         ),
         " say they have used tidymodels a few times"
       ))

# Box plots (log scale) of the time taken to complete the survey, split by
# the answer to Q1002. Responses lasting 5e4 seconds or more are left out of
# the plot; the median quoted in the subtitle is computed on all responses
# in survey_raw.
survey_raw %>%
  mutate(Q1002 = fct_relabel(Q1002, str_wrap, width = 20)) %>%
  filter(`Duration (in seconds)` < 5e4) %>%
  ggplot(aes(x = Q1002, y = `Duration (in seconds)`, fill = Q1002)) +
  geom_boxplot(alpha = 0.7, show.legend = FALSE) +
  scale_y_log10() +
  labs(
    title = "Survey length in seconds",
    subtitle = glue(
      "The median time to take the survey was ",
      round(median(survey_raw$`Duration (in seconds)`) / 60, 2),
      " minutes"
    ),
    x = NULL,
    y = "Time to take the survey (seconds)"
  )

# Horizontal bar chart of responses per answer to Q12 (current role). The
# subtitle reports the share whose answer contains "in industry".
survey_raw %>%
  mutate(Q12 = fct_relabel(Q12, str_wrap, width = 20)) %>%
  count(Q12) %>%
  ggplot(aes(x = n, y = Q12)) +
  geom_col(alpha = 0.8) +
  scale_x_continuous(expand = c(0, 0)) +
  labs(x = "Number of survey responses",
       y = NULL,
       title = "Current role",
       subtitle = glue(
         "Of the respondents, ",
         # scales:: is spelled out because no visible library() call attaches
         # scales. na.rm = TRUE keeps one unanswered question from turning
         # the whole share into NA.
         scales::percent(
           mean(str_detect(survey_raw$Q12, "in industry"), na.rm = TRUE)
         ),
         " say they work in industry"
       ))

Perspectives on priorities

The main question on the survey asked:

If you had a hypothetical $100 to spend on tidymodels development, how would you allocate those resources right now?

The possible priorities were presented in a randomized order to respondents, except for the “Other” option at the bottom.

Mean dollars allocated

Overall

# Mean dollars allocated to each priority across all respondents, with the
# bars ordered by that mean.
tidy_survey %>%
  mutate(question = str_wrap(question, width = 25)) %>%
  group_by(question) %>%
  summarise(dollars_mean = mean(dollars)) %>%
  mutate(question = fct_reorder(question, dollars_mean)) %>%
  ggplot(aes(dollars_mean, question)) +
  geom_col(alpha = 0.8) +
  # Namespace-qualified: no visible library() call attaches scales.
  scale_x_continuous(labels = scales::dollar_format(),
                     expand = c(0, 0)) +
  labs(x = "Mean hypothetical dollars allocated",
       y = NULL,
       title = "What are the average dollars allocated to each priority?",
       subtitle = "Supervised feature selection and model fairness metrics had the highest mean scores")

By experience

library(tidytext)

# Mean dollars allocated to each priority, one facet per answer to Q1002
# (familiarity with tidymodels).
tidy_survey %>%
  mutate(question = str_wrap(question, width = 25),
         Q1002 = fct_relabel(Q1002, str_wrap, width = 50)) %>%
  group_by(Q1002, question) %>%
  summarise(dollars_mean = mean(dollars)) %>%
  ungroup() %>%
  # reorder_within() + scale_y_reordered() order the bars separately inside
  # each facet.
  mutate(question = reorder_within(question, dollars_mean, as.character(Q1002))) %>%
  ggplot(aes(dollars_mean, question, fill = Q1002)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~Q1002, scales = "free_y") +
  # Namespace-qualified: no visible library() call attaches scales.
  scale_x_continuous(labels = scales::dollar_format(),
                     expand = c(0, 0)) +
  scale_y_reordered() +
  labs(x = "Mean hypothetical dollars allocated",
       y = NULL,
       title = "What are the average dollars allocated to each priority?",
       subtitle = "There are differences for folks who have never used tidymodels")

By role

# Mean dollars allocated to each priority, one facet per answer to Q12
# (current role).
tidy_survey %>%
  mutate(question = str_wrap(question, width = 25),
         Q12 = fct_relabel(Q12, str_wrap, width = 40)) %>%
  group_by(Q12, question) %>%
  summarise(dollars_mean = mean(dollars)) %>%
  ungroup() %>%
  # reorder_within() + scale_y_reordered() order the bars separately inside
  # each facet.
  mutate(question = reorder_within(question, dollars_mean, as.character(Q12))) %>%
  ggplot(aes(dollars_mean, question, fill = Q12)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~Q12, scales = "free_y") +
  # Namespace-qualified: no visible library() call attaches scales.
  scale_x_continuous(labels = scales::dollar_format(),
                     expand = c(0, 0)) +
  scale_y_reordered() +
  labs(x = "Mean hypothetical dollars allocated",
       y = NULL,
       title = "What are the average dollars allocated to each priority?",
       subtitle = "Supervised feature selection had the highest mean score for all groups")

Don’t spend it all in one place 💵

How many people gave their entire $100 to one priority? Very few:

# Count, per priority, the allocations of more than $99, i.e. respondents who
# put (essentially) their whole budget on a single priority.
# NOTE(review): assumes each respondent's allocations total $100, as the
# survey question describes -- confirm the survey enforced this.
tidy_survey %>%
  filter(dollars > 99) %>%
  count(question, sort = TRUE) %>%
  # Namespace-qualified: no visible library() call attaches knitr.
  knitr::kable(col.names = c("Priority", "Number of respondents allocating *all*"))
| Priority | Number of respondents allocating *all* |
|---|---|
| Spatial analysis models and methods | 8 |
| Supervised feature selection | 5 |
| H2O.ai support | 4 |
| Probability calibration (post modeling) | 4 |
| Model fairness analysis and metrics | 3 |
| Better serialization tools | 2 |

Priorities least likely to be chosen

Which priorities were people most likely to allocate $0 to?

Overall

# For each priority, count the respondents who allocated less than $1 to it,
# and draw the priorities ordered by that count.
tidy_survey %>%
  mutate(question = str_wrap(question, width = 25)) %>%
  group_by(question) %>%
  summarise(none = sum(dollars < 1)) %>%
  mutate(question = fct_reorder(question, none)) %>%
  ggplot(aes(x = none, y = question)) +
  geom_col(alpha = 0.8) +
  scale_x_continuous(expand = c(0, 0)) +
  labs(
    title = "Which priorities were chosen least often?",
    subtitle = "H2O support and spatial analysis methods were chosen less often",
    x = "Number of people who allocated nothing",
    y = NULL
  )

By experience

# Number of respondents allocating less than $1 to each priority, one facet
# per answer to Q1002 (familiarity with tidymodels). Both axes are free, so
# every facet has its own count scale and its own bar ordering.
tidy_survey %>%
  mutate(question = str_wrap(question, width = 25)) %>%
  mutate(Q1002 = fct_relabel(Q1002, str_wrap, width = 50)) %>%
  group_by(Q1002, question) %>%
  summarise(none = sum(dollars < 1)) %>%
  ungroup() %>%
  mutate(question = reorder_within(question, none, as.character(Q1002))) %>%
  ggplot(aes(x = none, y = question, fill = Q1002)) +
  geom_col(show.legend = FALSE, alpha = 0.8) +
  facet_wrap(~Q1002, scales = "free") +
  scale_y_reordered() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(
    title = "Which priorities were chosen least often?",
    subtitle = "The group that has never used tidymodels is the most different",
    x = "Number of people who allocated nothing",
    y = NULL
  )

By role

# Number of respondents allocating less than $1 to each priority, one facet
# per answer to Q12 (current role). Both axes are free, so every facet has
# its own count scale and its own bar ordering.
tidy_survey %>%
  mutate(question = str_wrap(question, width = 25)) %>%
  mutate(Q12 = fct_relabel(Q12, str_wrap, width = 40)) %>%
  group_by(Q12, question) %>%
  summarise(none = sum(dollars < 1)) %>%
  ungroup() %>%
  mutate(question = reorder_within(question, none, as.character(Q12))) %>%
  ggplot(aes(x = none, y = question, fill = Q12)) +
  geom_col(show.legend = FALSE, alpha = 0.8) +
  facet_wrap(~Q12, scales = "free") +
  scale_y_reordered() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(
    title = "Which priorities were chosen least often?",
    subtitle = "Folks in academia and industry are less different than I thought",
    x = "Number of people who allocated nothing",
    y = NULL
  )

Other answers

We offered respondents the opportunity to give us their own ideas for priorities as well. What kinds of options did respondents suggest?

library(DT)
# Interactive table of the free-text "Other" suggestions (Q5_12_TEXT), sorted
# by the respondent's answer to Q1002 and shown 25 rows per page.
survey_raw %>%
  select(Q1002, Q5_12_TEXT) %>%
  filter(!is.na(Q5_12_TEXT)) %>%
  arrange(Q1002) %>%
  datatable(
    colnames = c("Familiarity with tidymodels", "Suggested priority"),
    options = list(pageLength = 25)
  )

Some of these suggestions cover work that is already planned or in progress (survival analysis, deployment, case weights), while others focus on areas we have already invested in, at least to some extent (model explainability, butcher, torch). These highlight areas where we can develop impactful documentation and/or future work.