Data Loading and recoding

  • Total (willing) Responses: 48

Stratification variables

We stratify based on four variables of interest (a sketch of how these map onto the raw survey questions follows the list):
1. toolType = What type of scientific software/tool did you work on? Please answer for the single most developed/mature tool.
2. dataType = How would you classify your scientific software/tool?
3. experience = How many projects related to developing scientific software tools have you been involved in?
4. roles = What is your current role on the tool development/maintenance projects (choose all that apply)?
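
These short names correspond to the full question wording in the raw survey export; a minimal sketch of that renaming step is below (the exported column names are an assumption based on the question text above).
## Sketch only: the exported column names are assumed to match the question wording above
# wdat <- wdat %>%
#   rename(toolType   = `What type of scientific software/tool did you work on? Please answer for the single most developed/mature tool.`,
#          dataType   = `How would you classify your scientific software/tool?`,
#          experience = `How many projects related to developing scientific software tools have you been involved in?`,
#          roles      = `What is your current role on the tool development/maintenance projects (choose all that apply)?`)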

Creating simplified Data Type variable

#count(wdat, dataType)
wdat <- wdat %>% 
  mutate(dataType1 = case_when(
    dataType == "Omics - proteomics, genomics, metabolomics" ~ "Omics",
    dataType == "Supports multiple types of data" ~ "Multiple Data Types",
    TRUE ~ dataType)) %>%
  ## Those with multiple to be "Multiple Data Types"
  mutate(dataType1 = case_when(
    str_detect(dataType1, "," ) ~ "Multiple Data Types",
    str_detect(dataType1, "Physics|algorithm|Visualization") ~ "Other",
    TRUE ~ dataType1))

wdat %>% count(dataType1, sort = T) %>% DT::datatable()

Creating simplified Tool Type variable

wdat %<>% 
  mutate(multi_type = case_when(str_detect(toolType, ",") ~ "Yes", TRUE ~ "No"),
         Suite = case_when(str_detect(toolType, "Suite") ~ "Yes", TRUE ~ "No"))

## Checking
complex <- wdat %>% 
  filter(Suite != "Yes") %>% 
  filter(multi_type == "Yes") %>%
  select(toolType, Link) %>% count(toolType)

wdat %<>% 
  mutate(toolType_un = case_when(
    str_detect(toolType, "Suite") ~ "Suite",
    str_detect(toolType, "Platform") ~ "Platform",
    str_detect(toolType, "Command|Stand alone") & str_detect(toolType, "GUI")~ "Desktop app and Command line",
    str_detect(toolType, "Web") ~ "Web-based tool",
    str_detect(toolType, "Jupyter|Python") ~ "Jupyter/Python",
    str_detect(toolType, "R") ~ "R",
    TRUE ~ toolType)) %>%
  separate(toolType_un, into = c("toolType_un", "description"), sep =" - ", fill = "right") %>% 
  mutate(toolType_un = str_trim(toolType_un, "both")) 

#wdat %>% count(toolType_un)

wdat %<>% 
  mutate(toolType1 = case_when(
    str_detect(toolType_un, "Command|Desktop") ~ "Desktop App/Command line/scripts",
    TRUE ~ toolType_un
  ))

## creating factors
wdat %<>% 
  mutate(experience = factor(experience, levels = c("1", "2-4", "5-9", "10 or more")))

wdat %>% count(toolType1, sort = T) %>% DT::datatable()

Graphical Analyses

## Addition to our wrangling arsenal: 
## Split a delimited string column into multiple string columns without specifying number of columns in advance
split_string_multiple <- function(column, pattern = ",", prefix = "ans") {
  strings <- str_split_fixed(column, pattern, n = Inf)
  strings[which(strings == "")] <- NA              # Fill the empty spaces "" returned by filling the matrix to the right, with NAs
  strings <- as_tibble(strings)                    # convert into a tibble
  m <- ncol(strings)                               # where m = number of columns of 'strings'
  names(strings) <- paste(prefix, 1:m, sep = "_")  # rename the columns
  return(strings)
}
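
As a quick illustration with made-up answers (not survey data), the helper widens a comma-separated column into ans_1, ans_2, ...:
## Illustration only (toy responses, not survey data)
split_string_multiple(c("Clinical, Imaging", "Omics"), pattern = ",", prefix = "ans")
## Returns a 2 x 2 tibble: ans_1 = c("Clinical", "Omics"), ans_2 = c(" Imaging", NA);
## leading/trailing spaces are removed later with str_trim()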

Q1: Distribution of most mature Tool types (marginal and stratified)

  • Recoded: all Python responses to Python/Jupyter Notebooks.
wdat %>% count(toolType1) %>% 
  filter(toolType1 != "Not Sure") %>% 
  ggplot(aes(y = fct_reorder(toolType1, n), x = n, fill = toolType1)) +
  geom_col(position = "dodge", width = .6) + 
  geom_text(aes(label = n), hjust = 1.5, color = "white") +
  theme_light() +
  theme(legend.position = "none",
        panel.grid = element_blank()) + 
  labs(x = "Frequency", y = "Software type for most mature tool of the respondant")

wdat %>% filter(toolType1 != "Not Sure") %>% 
  ggplot(aes(y = fct_infreq(toolType1), fill = toolType1)) +
  geom_bar( width = .6) + 
  facet_wrap(~experience) +
  theme_light() +
  theme(strip.background = element_rect(fill="steelblue4"),
        strip.text = element_text(colour = 'white', size = 12), 
        legend.position = "none",
        panel.grid = element_blank()) +
  labs(x = "Frequency", y = "Software type for most mature tool of the respondant", subtitle = "Facet by experience (# of tools)")

  • Since the respondents were allowed to choose multiple options, we also investigated how many people chose single and multiple answers and their frequencies.
## Since this is a multiple choice question, we need to separate the (unequal) number of answers given by each respondent
## Note: some people selected Other and put in their version, we need to identify those as well
## Defined strings = strings up to the first comma
## First, we create a variable containing all strings to match with response data. We take names up to the first comma, which will be our separator.
## Note the double spaces in the names of certain strings (e.g. "Computing ...", "Command-line ..."); these must be matched exactly
SciToolsType.defined <- c("Computing Web-based Platform - A website providing computing resources and  possibly data",
"Web-based tool - A tool that runs in your web browser but doesn't necessarily provide access to data",
"Bioconductor R packages",
"Other R packages (not Bioconductor)",
"Jupyter Notebooks",
"Desktop Application - A tool that runs on your desktop environment with a GUI",
"Database/Ontology",
"Plug-in - A software component encapsulating a set of related functions, which are not standalone, i.e. depend upon other software for its use, e.g. a Javascript widget, or a plug-in, extension add-on etc. that extends the function of some existing to",
"Command-line tool/Other scripts - A tool  that works with a command-line interface or environment",
"Suite - multiple tools that work together")

## Residual strings = all comma separated strings after the first comma 
SciToolsType.residual_string <- c("which are not standalone", "i.e. depend upon other software for its use", "e.g. a Javascript widget", 
                                  "or a plug-in", "extension add-on etc. that extends the function of some existing to",
                                  "where Jupyter notebooks are a component")

## Find all relevant unique strings in a vector
SciToolsType.vec <- wdat %>% select(Timestamp, toolType)  %>%
  bind_cols(split_string_multiple(.$toolType, ",", "ans")) %>%     # split string columns into multiple columns
  select(-toolType) %>%                                            # remove original variable
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%  # take data into long form to aid filtering operations
  filter(!is.na(vals)) %>%                                         # filter NA values
  mutate(vals = str_trim(vals, side = "both")) %>%                 # remove leading and trailing white spaces in the strings
  count(vals) %>% filter(!vals %in% c(SciToolsType.residual_string)) %>% pull(vals)  # remove residual strings from the vectors
## Drawback: if a respondent's free-text "Other" answer itself contains commas, it must be added manually to the residual strings

############ RECODE 01: All Python responses to Python/Jupyter Notebooks ###################### 
## 1. Remove "Not Sure"
## 2. Recode all Python responses to Python/Jupyter Notebooks

## Consider each timestamp (all unique) as an identifier
tab.toolType <- wdat %>% select(Timestamp, toolType)  %>%
  bind_cols(split_string_multiple(.$toolType, ",", "ans")) %>%     # split string columns into multiple columns
  select(-toolType) %>%                                            # remove original variable
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%  # take data into long form to aid filtering operations
  filter(!is.na(vals)) %>%                                         # filter NA values
  mutate(vals = str_trim(vals, side = "both")) %>%                 # remove leading and trailing white spaces in the strings
  filter(vals %in% SciToolsType.vec) %>% arrange(Timestamp, Answer) %>%   # keep only the actual tool names (defined up to first comma)
  group_by(Timestamp) %>%  mutate(`answer_count` = 1:n()) %>% ungroup() %>%  # create a variable to count the number of answers by each respondent
  mutate(Answer = paste0("ans_", answer_count)) %>% 
  ## RECODE 01: Python issue and Not Sure removal
  filter(!vals %in% c("Not Sure")) %>% 
  mutate(vals = case_when(str_detect(vals, 'Python|Jupyter') ~ "Python/Jupyter Notebooks",
                           TRUE ~ vals))
ggpubr::ggarrange(
## Frequency distribution of the categories
tab.toolType %>% separate(col = vals, into = c("vals", "fat"), sep = " - ") %>% select(-fat) %>% 
  mutate(vals2 = case_when(
    vals %in% c("Bioconductor R packages", "Other R packages (not Bioconductor)") ~ "R packages",
    vals %in% c("Command-line tool/Other scripts") ~ "Command-line tools/Scripts",
    vals %in% c("Computing Web-based Platform", "Web API") ~ "Computing platforms",
    vals %in% c("Desktop Application") ~ "Desktop Application",
    vals %in% "Database/Ontology" ~ "Databases/Ontology",
    vals %in% "Plug-in" ~ "Plug-in",
    vals %in% "Python/Jupyter Notebooks" ~ "Python/Jupyter",
    vals %in% "Stand alone C++ code" ~ "C++ code",
    vals %in% "Suite" ~ "Suite",
    TRUE ~ vals
  )) %>% 
  ggplot(aes(y = forcats::fct_infreq(vals2), fill = vals2)) + 
  geom_bar( width = 0.6, show.legend = F) + 
  geom_text(stat = 'count', aes(label = after_stat(count)), hjust = 1.2, color = "white") +
  labs(x = "Count", y = "Tool types") +
  scale_fill_manual(values = c("#7CAE00", "#00BFC4", "#C77CFF", "#B2D235", "#A65628", "#FF7F00", "#1F77B4", "#D62728", "#9467BD", "#f39de1")) +
  theme_light()+
  theme(panel.grid = element_blank()),

## How many single and multiple responses
tab.toolType %>% group_by(Timestamp) %>%
  summarise(answers_freq = max(answer_count)) %>% ungroup() %>% 
  ggplot(aes(answers_freq, fill = factor(answers_freq))) +
  geom_bar( width = 0.6, show.legend = F) + 
  geom_text(stat = 'count', aes(label = after_stat(count)), vjust = 1.2, color = "white") +
  scale_x_continuous(breaks = 1:6) +
  labs(y = "count", x = "Frequency of number of categories selected by a respondent") +
  scale_fill_manual(values = c("#7CAE00",  "#A65628", "#FF7F00", "#1F77B4", "#94a7BD", "#f39de1")) +
  theme_light() + 
  theme(panel.grid = element_blank()),
  ncol = 1, heights = c(1.5, 1)
)

Q2: Distribution of Data types for the most mature tools of respondents.

wdat %>% count(dataType1) %>%
  ggplot(aes(y = fct_reorder(dataType1, n), x = n, fill = dataType1)) +
  geom_col(position = "dodge", width = .6) +
  geom_text(aes(label = n), hjust = 1.5, color = "white") +
  theme_light() +
  theme(legend.position = "none") + 
  labs(x = "Frequency", y = "Data type for the most mature software of the respondant") +
  theme(panel.grid = element_blank())

wdat %>%
  ggplot(aes(y = fct_infreq(dataType1), fill = dataType1)) +
  geom_bar( width = .6) + 
  facet_wrap(~experience) +
  theme_light() +
  theme(strip.background = element_rect(fill="steelblue4"),
        strip.text = element_text(colour = 'white', size = 12), 
        legend.position = "none") +
  labs(x = "Frequency", y = "Data type for the most mature software of the respondant", subtitle = "Facet by experience") +
  theme(panel.grid = element_blank())

  • Since the respondents were allowed to choose multiple options, we also investigated how many people chose single and multiple answers.

  • Recoded: all one-off “Other” responses to “Multiple data types”.

  • Merged the remaining other response types into the “Multiple data types” category.

## Since this is a multiple choice question, we need to separate the (unequal) number of answers given by each respondent
## Note: some people selected Other and put in their version, we need to identify those as well
## Defined strings = strings up to the first comma
DataTypes.defined <- c("Omics - proteomics", "Clinical", "Imaging", "Supports multiple types of data")
## Residual strings = all comma separated strings after the first comma 
DataTypes.residual_strings <- c("genomics", "metabolomics")

## Find all relevant unique strings in a vector
DataTypes.vec <- wdat %>% select(Timestamp, dataType)  %>%
  bind_cols(split_string_multiple(.$dataType, ",", "ans")) %>% select(-dataType) %>%                                            
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%  
  filter(!is.na(vals)) %>% mutate(vals = str_trim(vals, side = "both")) %>%
  count(vals) %>% filter(!vals %in% c(DataTypes.residual_strings)) %>% pull(vals)

## Consider each timestamp (all unique) as an identifier
tab.dataType <- wdat %>% select(Timestamp, dataType)  %>%
  bind_cols(split_string_multiple(.$dataType, ",", "ans")) %>% select(-dataType) %>%                                            
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%  
  filter(!is.na(vals)) %>%                                         
  mutate(vals = str_trim(vals, side = "both")) %>%                
  filter(vals %in% DataTypes.vec) %>% arrange(Timestamp, Answer) %>%   
  group_by(Timestamp) %>%  mutate(`answer_count` = 1:n()) %>% ungroup() %>%  
  mutate(Answer = paste0("ans_", answer_count)) %>% 
  ##################### Recode the one-off other categories ##################
  mutate(vals = case_when(str_detect(vals, 'Omics|Imaging|Clinical') ~ vals,
                          str_detect(vals, 'Visualization|Physics and Biology') ~ "Others",
                          TRUE ~ "Multiple data types")) %>% 
  ## Add the full category names again
  mutate(vals = if_else(vals == "Omics - proteomics", true = "Omics - proteomics, genomics, metabolomics", false = vals))

#wdat$dataType %>% table()
ggpubr::ggarrange(
## Frequency distribution of the categories
tab.dataType %>% separate(col = vals, into = c("vals", "fat"), sep = " - ") %>% select(-fat) %>% 
ggplot(aes(y = forcats::fct_infreq(vals), fill = vals)) + 
  geom_bar(width = 0.6) + 
  geom_text(stat = 'count', aes(label = after_stat(count)), hjust = 1.2, color = "white") +
  scale_x_continuous(breaks = seq(3, 30, by = 3)) +
  scale_fill_manual(values = c("#7CAE00", "#1F77B4", "#D62728", "#9467BD", "#f39de1")) +
  labs(x = "count", y = "Data types") +
  theme_light() + theme(panel.grid.major = element_blank()),

## How many single and multiple responses
tab.dataType  %>% group_by(Timestamp) %>%
  summarise(answers_freq = max(answer_count)) %>% ungroup() %>% 
  ggplot(aes(answers_freq, fill = factor(answers_freq))) +
  geom_bar( width = 0.6, show.legend = F) + 
  geom_text(stat = 'count', aes(label = after_stat(count)), vjust = 1.1, color = "white") +
  scale_x_continuous(breaks = 1:6) +
  labs(y = "count", x = "Frequency of number of categories selected by a respondent") +
  scale_fill_manual(values = c("#7CAE00",  "#A65628", "#FF7F00", "#1F77B4", "#94a7BD", "#f39de1")) +
  theme_light() + theme(panel.grid = element_blank()),
  ncol = 1, heights = c(1.5, 1)
)

df.dataType2 <- wdat %>% select(Timestamp, Link,
                roles, experience, dataType) %>% 
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9")) %>% 
  bind_cols(split_string_multiple(.$dataType, ",", "ans")) %>% 
  select(-dataType) %>% 
  pivot_longer(cols = -c(Timestamp, Link, experience, roles), names_to = "Answer", values_to = "vals") %>% 
  filter(!is.na(vals)) %>% mutate(vals = str_trim(vals, side = "both")) %>% 
  filter(vals %in% DataTypes.vec) %>% arrange(Timestamp, Answer) %>% 
  separate(col = vals, into = c("vals", "fat"), sep = " - ") %>% select(-fat) %>% 
  #### Recoding other one-off categories
  mutate(vals = case_when(str_detect(vals, 'Omics|Imaging|Clinical') ~ vals,
                          TRUE ~ "Multiple data types")) 

## Plot: data types selections together
df.dataType2 %>% rename(dataType = vals) %>% 
  group_by(Timestamp, Link, experience, roles) %>%
  summarise(selections = str_c(dataType, collapse =", ")) %>% ungroup() %>% 
  mutate(selections = if_else(selections == "Multiple data types, Multiple data types", "Multiple data types", selections)) %>% 
  ggplot(aes(y = fct_infreq(selections), fill = selections)) + 
  geom_bar( width = 0.6, show.legend = F) + 
  geom_text(stat = 'count', aes(label = after_stat(count)), hjust = 1.2, size = 3) +
  labs(x = "Frequency", y = "Data type for the most mature software of the respondant", 
       subtitle = "Facet by experience") +
  facet_wrap(~experience) +
  theme_light() +
  theme(strip.background = element_rect(fill="steelblue4"),
        strip.text = element_text(colour = 'white', size = 12), 
        legend.position = "none",
        panel.grid.major = element_blank()) 

Q3: Distribution of project involvement/experience.

tab.Experience <- wdat %>% select(Timestamp, experience) %>% 
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9"))  

tab.Experience %>% 
  ggplot(aes(x = experience, fill = experience)) + 
  geom_bar(width = 0.5, show.legend = F) + 
  geom_text(stat = 'count', aes(label = after_stat(count)), vjust = 1.5, col = "white") +
  labs(y = "count", x = "Experience by number of software projects of respondent",) +
  scale_y_continuous(breaks = seq(0,15, by = 3)) +
  scale_fill_manual(values = c("#229ae1", "#5F77DF","#9467BD", "#4a2a28")) +
  theme_light() + 
  theme(panel.grid.major = element_blank())

Q4: Cross-tabulation of Tool type x Experience x Data type

## Data Type x Experience
wdat %>% 
  ggplot(aes(y = experience, fill = dataType1)) +
  geom_bar(position = "dodge", width = .6) +
  facet_wrap(~dataType1) + 
  theme_light() +
  theme(strip.background = element_rect(fill="steelblue4"),
        strip.text = element_text(colour = 'white', size = 12), 
        legend.position = "none",
        panel.grid = element_blank()) + 
  labs(x = "Frequency", y = "Experience by number of software projects of respondent",
       subtitle = "Facet by data type for the most mature software of the respondant")

## Tool type x Data type
wdat %>% 
  ggplot(aes(y = toolType1, fill = dataType1)) +
  geom_bar(position = "dodge", width = .6) +
  facet_wrap(~dataType1) + 
  theme_light() +
  theme(strip.background = element_rect(fill="steelblue4"),
        strip.text = element_text(colour = 'white', size = 12), 
        legend.position = "none",
        panel.grid = element_blank()) + 
  labs(x = "Frequency", y = "Experience by number of software projects of respondent",
       subtitle = "Facet by Tool type for the most mature software of the respondant")

## Tool type x Experience x Data type
wdat %>% 
  ggplot(aes(y = toolType1, x = experience, col = dataType1)) + 
  geom_jitter(width = .2, height = .3, size = 3, alpha = .7) +
  labs(x = "Experience by number of software projects of respondent", 
       y = "Tool type for the most mature software of the respondant", col = "Data types") +
  scale_color_brewer(palette = "Dark2") +
  theme_light() + 
  labs(subtitle = "Relationship between Experience, Tool Types, and Data Types \nfor the most mature software of the respondant") 

Q5: Distribution of current role on the tool development/maintenance projects.

We also investigated how many people chose single versus multiple answers.

Recoded: “Principal investigator” to “Manager/advisor”.

## Since this is a multiple choice question, we need to separate the (unequal) number of answers given by each respondent
## Note: some people selected Other and put in their version, we need to identify those as well
## Defined strings = strings up to the first comma
Roles.defined <- c("Manager/advisor", "Software Developer/Maintainer", "Outreach Specialist", 
                   "Trainee - postdoc", "Trainee - graduate student")
Roles.residual_strings <- NULL

## Find all relevant unique strings in a vector
Roles.vec <- wdat %>% select(Timestamp, roles)  %>%
  bind_cols(split_string_multiple(.$roles, ",", "ans")) %>% select(-roles) %>%                                            
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%  
  filter(!is.na(vals)) %>%                                         
  mutate(vals = str_trim(vals, side = "both")) %>% 
  count(vals) %>% filter(!vals %in% c(Roles.residual_strings)) %>% 
  pull(vals)
  

## Consider each timestamp (all unique) as an identifier
tab.Roles <- wdat %>% select(Timestamp, roles)  %>%
  bind_cols(split_string_multiple(.$roles, ",", "ans")) %>% select(-roles) %>%                                            
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%  
  filter(!is.na(vals)) %>%                                         
  mutate(vals = str_trim(vals, side = "both")) %>%                
  filter(vals %in% Roles.vec) %>% arrange(Timestamp, Answer) %>%   
  group_by(Timestamp) %>%  mutate(`answer_count` = 1:n()) %>% ungroup() %>%  
  mutate(Answer = paste0("ans_", answer_count)) %>% 
################# Recode 03 ######################
  mutate(vals = if_else(vals == "Principal investigator", true = "Manager/advisor", vals))

#wdat$roles %>% table()
ggpubr::ggarrange(
## Frequency distribution of the categories
tab.Roles %>% separate(col = vals, into = c("vals", "fat"), sep = " - ") %>% select(-fat) %>% 
ggplot(aes(y = forcats::fct_infreq(vals), fill = vals)) + 
  geom_bar(width = 0.6, show.legend = F) + 
  geom_text(stat = 'count', aes(label = after_stat(count)), hjust = 1.3, col = "white") +
  scale_fill_manual(values = c("#7CAE00", "#1F77B4", "#9467BD")) +
  scale_x_continuous(breaks = seq(0, 45, by = 5)) +
  labs(x = "count", y = "Data types", fill = "Role of the respondent") +
  theme_light() + theme(panel.grid = element_blank()),

## How many single and multiple responses
tab.Roles %>% group_by(Timestamp) %>%
  summarise(answers_freq = max(answer_count)) %>% ungroup() %>% 
  ggplot(aes(answers_freq, fill = as.factor(answers_freq))) +
  geom_bar( width = 0.3, show.legend = F) + 
  geom_text(stat = 'count', aes(label = after_stat(count)), vjust = 1.2) +
  scale_x_continuous(breaks = 1:6) +
  scale_fill_manual(values = c("#A65628", "#FF7F00", "#f399e1")) +
  labs(y = "Count", x = "Frequency of number of categories selected by a respondent") +
  theme_light() + theme(panel.grid = element_blank()),
  ncol = 1, heights = c(1.5, 1)
)

\(H_{01}\): Possible association between experience and contact.

wdat %<>% rename(contact  = `Which of the following do you provide for contact information to help users use your scientific software/tool?`) %>% mutate(contact = 
                      case_when(str_detect(contact, "More")~ "More Extensive",
                      str_detect(contact, "Simple")~ "Simple",
                      TRUE ~ contact))

#Generate plot using data
wdat %>% 
  ggplot(aes(y = fct_infreq(contact), fill = experience)) + 
  geom_bar(position = "dodge", width = 0.7) + 
  labs(x = "Count", y = "Contact Type", fill = "Experience") +
  scale_x_continuous(n.breaks = 6) +
  scale_fill_manual(values = c("#64C204", "#56B4E9", "#F5A800", "#dd7ba7")) +
  theme_light() + theme(panel.grid.major  = element_blank())
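
The bar chart above is descriptive only; a formal check of \(H_{01}\) could be run on the experience-by-contact cross-tabulation. The following is a sketch of one possible testing approach, not part of the original analysis: with n = 48 the expected cell counts are small, so Fisher's exact test is safer than a chi-squared test.
## Sketch: test of association between experience and contact type
## (small expected cell counts, so Fisher's exact test; add simulate.p.value = TRUE if the exact computation is slow)
fisher.test(table(wdat$experience, wdat$contact))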

\(H_{02}\): Possible association between experience and documentation/training.

wdat %<>% 
  rename(documentation  = `What type of documentation/training for users to learn how to use a tool did/do you provide?`) %>%
  mutate(doc_types = as.character(1 + str_count(documentation, ","))) %>% 
  mutate(doc_types = case_when(
    str_detect(documentation, "None") ~ "0",
    documentation == "README file" ~ "0", #readme only also counted as no additional documentation
    TRUE ~ doc_types)) %>% 
  mutate(doc_types = as.numeric(doc_types)) 
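
As a quick check of the counting logic (toy strings, not survey data): 1 + str_count(..., ",") counts the comma-separated selections, and "None" or a lone "README file" is then recoded to 0 by the case_when above.
## Illustration only (toy strings)
1 + str_count(c("README file, Vignettes", "None"), ",")   # returns 2 and 1; the "None" row is recoded to 0 above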

wdat %>% 
  ggplot(aes(y = doc_types, x = experience, fill = experience)) + 
  geom_boxplot(width = .4, show.legend = F) + 
  labs(y = "Number of documentation strategies employed", 
       x = "Experience by number of software projects of respondent") + 
  theme_light() + coord_flip() +
  theme(panel.grid = element_blank())
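
The stacked bar chart below uses tab.docsTrain_exp, which is not constructed above. Here is a sketch of its construction, following the same split/trim/long-format pattern used for the other multiple-choice questions (handling of free-text "Other" answers that themselves contain commas is left as an assumption):
## Sketch: build tab.docsTrain_exp the same way tab.metric_exp is built later
## (free-text "Other" answers containing commas may need residual-string handling, as above)
tab.docsTrain_exp <- wdat %>% select(Timestamp, experience, documentation) %>%
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9")) %>% 
  bind_cols(split_string_multiple(.$documentation, ",", "ans")) %>% select(-documentation) %>%  
  pivot_longer(cols = -c(Timestamp, experience), names_to = "Answer", values_to = "vals") %>%  
  filter(!is.na(vals)) %>% mutate(vals = str_trim(vals, side = "both")) %>% 
  arrange(Timestamp, Answer) %>% 
  group_by(Timestamp) %>% mutate(answer_count = 1:n()) %>% ungroup() %>%  
  mutate(Answer = paste0("ans_", answer_count))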

 ggplot(tab.docsTrain_exp, aes(x = experience, fill = fct_infreq(vals))) +
  geom_bar(position = "fill", width = .5) +
  geom_text(
    aes(label = after_stat(count)),
    stat="count",
    position = position_fill(vjust = .5),
    size = 3) +
  labs(y = "Proportion", x = "Experience by number of software projects of respondent", 
       fill = "Documentation") +
  scale_fill_brewer(palette = "Set3") + 
  coord_flip() +
  theme_light() +
  theme(legend.position = "right") +
  guides(fill=guide_legend(nrow = 8)) +
  scale_x_discrete(expand = c(0, 0.05)) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(title = "Documentation/Training by Experience") +
  theme(plot.title = element_text(size = 14, face = "bold", margin = margin(b = 10)),
        axis.title.x = element_blank(),
        axis.text.y = element_text(size = 11),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 6),
        panel.grid = element_blank())

\(H_{03}\): Possible association between experience and type of metric (to evaluate user engagement).

# wdat$`What types of metrics have you used to evaluate user engagement with your scientific software/tool?` %>% table
metricType.residual_strings <- c("clicks etc.)", "forks", "etc.)", "video views etc.)", "survey results",
                                 "tweets etc.)", "stars", "forks", "issues", "pull requests)", 
                                 "registered users", "job submissions", "error reports etc.)")
# Run this code to find the vector of all researcher-defined, and user-defined "other" strings.
metricType.vec <- wdat %>% select(Timestamp, 
                metricType = `What types of metrics have you used to evaluate user engagement with your scientific software/tool?`) %>%
  bind_cols(split_string_multiple(.$metricType, ",", "ans")) %>% select(-metricType) %>%  
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%  
  filter(!is.na(vals)) %>% mutate(vals = str_trim(vals, side = "both")) %>% 
  count(vals) %>% filter(!vals %in% c(metricType.residual_strings)) %>% pull(vals)

## Create the data set for experience (stratifying variable) and the variable required for analysis
tab.metric_exp <- wdat %>% select(Timestamp, experience,
                metricType = `What types of metrics have you used to evaluate user engagement with your scientific software/tool?`) %>%
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9")) %>% 
  bind_cols(split_string_multiple(.$metricType, ",", "ans")) %>% select(-metricType) %>%  
  pivot_longer(cols = -c(Timestamp, experience), names_to = "Answer", values_to = "vals") %>%  
  filter(!is.na(vals)) %>% mutate(vals = str_trim(vals, side = "both")) %>% 
  filter(vals %in% metricType.vec) %>% arrange(Timestamp, Answer) %>% 
  group_by(Timestamp) %>%  mutate(`answer_count` = 1:n()) %>% ungroup() %>%  
  mutate(Answer = paste0("ans_", answer_count))
tab.metric_exp %>% 
  mutate(vals = if_else(str_detect(vals, "interaction"), "Website interaction (unique visitors)", vals)) %>% 
ggplot(aes(x = experience, fill = fct_rev(vals))) +
  geom_bar(position = "fill", width = .6) +
  geom_text(
    aes(label = after_stat(count)),
    stat="count",
    position = position_fill(vjust = .5),
    size = 3) +
  labs(y = "Proportion", x = "Experience by # of software projects", fill = "Type \nof metric") + 
  coord_flip() +
  scale_fill_brewer(palette = "Set3", labels = scales::wrap_format(16)) +
  theme_light() +
  theme(legend.position = "bottom",
        panel.grid = element_blank()) +
  guides(fill = guide_legend(nrow = 4, reverse = T, keywidth = .7)) 

\(H_{04}\): Possible association between experience and software health infrastructure.

#wdat$`Which of the following software health infrastructure have you implemented for your scientific software/tool?` %>% table

# Run this code to find the vector of all researcher-defined, and user-defined "other" strings.
infrastructure.vec <- wdat %>% select(Timestamp, 
                infrastructure = `Which of the following software health infrastructure have you implemented for your scientific software/tool?`) %>%
  bind_cols(split_string_multiple(.$infrastructure, ",", "ans")) %>% select(-infrastructure) %>%  
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%  
  filter(!is.na(vals)) %>% mutate(vals = str_trim(vals, side = "both")) %>% 
  count(vals) %>% pull(vals)

## Create the data set for experience (stratifying variable) and the variable required for analysis
tab.infrastr_exp <- wdat %>% select(Timestamp, experience,
                infrastructure = `Which of the following software health infrastructure have you implemented for your scientific software/tool?`) %>%
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9")) %>% 
  bind_cols(split_string_multiple(.$infrastructure, ",", "ans")) %>% select(-infrastructure) %>%  
  pivot_longer(cols = -c(Timestamp, experience), names_to = "Answer", values_to = "vals") %>%  
  filter(!is.na(vals)) %>% mutate(vals = str_trim(vals, side = "both")) %>% 
  #filter(vals %in% infrastructure.vec) %>% 
  arrange(Timestamp, Answer) %>% 
  group_by(Timestamp) %>%  mutate(`answer_count` = 1:n() ) %>% ungroup() %>%  
  mutate(Answer = paste0("ans_", answer_count))
tab.infrastr_exp %>% 
  mutate(vals = if_else(str_detect(vals, "linux|windows"), "Other automations", vals)) %>% 
  ggplot(aes(x = experience, fill = fct_infreq(vals))) +
  geom_bar(position = "fill", width = .6, show.legend = T) +
  geom_text(
    aes(label = after_stat(count)),
    stat="count",
    position = position_fill(vjust = .5),
    size = 3) +
  labs(x = "Experience by # of software projects", 
       fill = "software \nhealth \ninfrastructure", y = "Proportion") + 
  scale_fill_discrete(labels = scales::wrap_format(20), type = c("#8DD3C7", "#FFF0B3", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#f1453a")) +  
  coord_flip() +
  guides(fill = guide_legend(nrow = 2, keyheight = .5, keywidth = .7)) + 
  theme_light() + theme(legend.position = "bottom", panel.grid = element_blank()) 

Q6: Major Barriers vs Experience: What are less experienced people struggling with?

#wdat$`What major barriers are hindering your ability to evaluate the engagement of your tool(s)?` %>% table
# Run this code to find the vector of all researcher-defined, and user-defined "other" strings.
barriers.vec <- wdat %>% select(Timestamp, 
                barriers = `What major barriers are hindering your ability to evaluate the engagement of your tool(s)?`) %>%
  bind_cols(split_string_multiple(.$barriers, ",", "ans")) %>% select(-barriers) %>%  
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%  
  filter(!is.na(vals)) %>% mutate(vals = str_trim(vals, side = "both")) %>% 
  count(vals) %>% pull(vals)

## Create the data set for experience (stratifying variable) and the variable required for analysis
tab.barriers_exp <- wdat %>% select(Timestamp, experience,
                barriers = `What major barriers are hindering your ability to evaluate the engagement of your tool(s)?`) %>%
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9")) %>% 
  bind_cols(split_string_multiple(.$barriers, ",", "ans")) %>% select(-barriers) %>%  
  pivot_longer(cols = -c(Timestamp, experience), names_to = "Answer", values_to = "vals") %>%  
  filter(!is.na(vals)) %>% mutate(vals = str_trim(vals, side = "both")) %>% 
  filter(vals %in% barriers.vec) %>% arrange(Timestamp, Answer) %>% 
  group_by(Timestamp) %>%  mutate(`answer_count` = 1:n()) %>% ungroup() %>%  
  mutate(Answer = paste0("ans_", answer_count))
tab.barriers_exp %>% 
  mutate(vals = if_else(str_detect(vals, "invade"), "Privacy concerns", vals)) %>% 
ggplot(aes(x = experience, fill = fct_infreq(vals))) +
  geom_bar(position = "fill", width = .6, show.legend = T) +
  geom_text(
    aes(label = after_stat(count)),
    stat="count",
    position = position_fill(vjust = .5),
    size = 3) +
  labs(x = "Experience by # of software projects", fill = "Major Barriers", y = "Proportion") + 
  coord_flip() +
  theme_light() +
  theme(legend.position = "bottom", panel.grid = element_blank()) +
  scale_fill_brewer(palette = "Set3", direction = 1) +
  scale_fill_discrete(labels = scales::wrap_format(18), 
                      type = c("#8DD3C7", "#FFF0B3", "#BEBADA", "#FB8072", 
                               "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#F1453a")) + 
  guides(fill=guide_legend(nrow = 3, byrow = T))

Q7: Fairness vs experience

wdat %>% 
  select(Timestamp, experience,
         fairness = `Have you been able to assess your tool's fairness (Not to be confused with FAIRness as defined as Findable, Accessible, Interoperable, and Reusable). Here we define software fairness in terms of the design being mindful of inclusivity and bias. See this link for more information.`) %>% 
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9")) %>% 
  filter(!is.na(fairness)) %>% 
  bind_cols(split_string_multiple(.$fairness, ",", "ans")) %>% select(-fairness) %>%  
  pivot_longer(cols = -c(Timestamp, experience), names_to = "Answer", values_to = "vals") %>%  
  filter(!is.na(vals)) %>% mutate(vals = str_trim(vals, side = "both")) %>% 
  arrange(Timestamp, Answer) %>% 
  group_by(Timestamp) %>%  mutate(`answer_count` = 1:n()) %>% ungroup() %>%  
  mutate(vals = if_else(str_detect(vals, "Attempted"), "Attempted, but encountered challenges", vals)) %>%
  filter(vals != "but encountered challenges") %>% 
  ggplot(aes(x = experience, fill = fct_infreq(vals))) +
  geom_bar(position = "fill", width = .6, show.legend = T) +
  geom_text(
    aes(label = after_stat(count)),
    stat="count",
    position = position_fill(vjust = .5),
    size = 3) +
  labs(x = "Experience by # of software projects", fill = "Fairness", y = "Proportion",
       subtitle = "fairness = Have you been able to assess your tool's fairness (Not to be confused with \nFAIRness as defined as Findable, Accessible, Interoperable, and Reusable). \nHere we define software fairness in terms of the design being mindful of inclusivity and bias.") + 
  coord_flip() + 
  theme_light() +
  theme(legend.position = "bottom", panel.grid = element_blank()) +
  scale_fill_brewer(palette = "Set3", direction = 1) +
  scale_fill_discrete(labels = scales::wrap_format(18), 
                      type = c("#8DD3C7", "#FFF0B3", "#BEBADA", "#FB8072", 
                               "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#F1453a")) + 
  guides(fill = guide_legend(nrow = 1))

Q8: Usability optimization vs Experience

wdat %>% 
  select(Timestamp, experience,
         usability = `What aspects of usability optimization have you or would you want to learn about?`) %>% 
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9")) %>% 
  filter(!is.na(usability)) %>% 
  bind_cols(split_string_multiple(.$usability, ",", "ans")) %>% select(-usability) %>%  
  pivot_longer(cols = -c(Timestamp, experience), names_to = "Answer", values_to = "vals") %>%  
  filter(!is.na(vals)) %>% mutate(vals = str_trim(vals, side = "both")) %>% 
  arrange(Timestamp, Answer) %>% 
  group_by(Timestamp) %>%  mutate(`answer_count` = 1:n()) %>% ungroup() %>%  
  mutate(Answer = paste0("ans_", answer_count)) %>% 
  ggplot(aes(x = experience, fill = fct_infreq(vals))) +
  geom_bar(position = "fill", width = .6, show.legend = T) +
  geom_text(
    aes(label = after_stat(count)),
    stat="count",
    position = position_fill(vjust = .5),
    size = 3) +
  labs(x = "Experience by # of software projects", fill = "Usability optimization", y = "Proportion",
       subtitle = "Usability: What aspects of usability optimization have you or would you want to learn about?") + 
  coord_flip() + 
  theme_light() +
  theme(legend.position = "bottom", panel.grid = element_blank()) +
  scale_fill_discrete(labels = scales::wrap_format(20), 
                      type = c("#8DD3D7", "#80B1D3", "#B3DE69")) + 
  guides(fill=guide_legend(nrow = 1))