We stratify based on 4 variables of interest:
1. toolType = What type of scientific software/tool did
you work on? Please answer for the single most developed/mature
tool.
2. dataType = How would you classify your scientific
software/tool?
3. experience = How many projects related to developing
scientific software tools have you been involved in?
4. roles = What is your current role on the tool
development/maintenance projects (choose all that apply)?
## Collapse the raw `dataType` answers into the coarser `dataType1` categories.
# count(wdat, dataType)
wdat <- wdat %>%
  mutate(dataType1 = case_when(
    dataType == "Omics - proteomics, genomics, metabolomics" ~ "Omics",
    dataType == "Supports multiple types of data" ~ "Multiple Data Types",
    TRUE ~ dataType
  )) %>%
  ## Those with multiple to be "Multiple Data Types"
  mutate(dataType1 = case_when(
    str_detect(dataType1, ",") ~ "Multiple Data Types",
    str_detect(dataType1, "Physics|algorithm|Visualization") ~ "Other",
    TRUE ~ dataType1
  ))

## Frequency table of the recoded data types
wdat %>%
  count(dataType1, sort = TRUE) %>%
  DT::datatable()
## Flag answers that contain a comma (`multi_type`) and answers that mention "Suite".
wdat %<>%
  mutate(
    multi_type = case_when(str_detect(toolType, ",") ~ "Yes", TRUE ~ "No"),
    Suite = case_when(str_detect(toolType, "Suite") ~ "Yes", TRUE ~ "No")
  )

## Checking: comma-containing answers that are not suites
complex <- wdat %>%
  filter(Suite != "Yes") %>%
  filter(multi_type == "Yes") %>%
  select(toolType, Link) %>%
  count(toolType)
## Reduce each `toolType` answer to a single category; the first matching rule wins.
wdat %<>%
  mutate(toolType_un = case_when(
    str_detect(toolType, "Suite") ~ "Suite",
    str_detect(toolType, "Platform") ~ "Platform",
    str_detect(toolType, "Command|Stand alone") & str_detect(toolType, "GUI") ~ "Desktop app and Command line",
    str_detect(toolType, "Web") ~ "Web-based tool",
    str_detect(toolType, "Jupyter|Python") ~ "Jupyter/Python",
    str_detect(toolType, "R") ~ "R",
    TRUE ~ toolType
  )) %>%
  ## Drop the " - <description>" tail that some answer options carry
  separate(toolType_un, into = c("toolType_un", "description"), sep = " - ", fill = "right") %>%
  mutate(toolType_un = str_trim(toolType_un, "both"))
# wdat %>% count(toolType_un)

## Merge the command-line and desktop categories
wdat %<>%
  mutate(toolType1 = case_when(
    str_detect(toolType_un, "Command|Desktop") ~ "Desktop App/Command line/scripts",
    TRUE ~ toolType_un
  ))

## creating factors
wdat %<>%
  mutate(experience = factor(experience, levels = c("1", "2-4", "5-9", "10 or more")))

wdat %>%
  count(toolType1, sort = TRUE) %>%
  DT::datatable()
## Addition to our wrangling arsenal:
## Split a delimited string column into multiple string columns without
## specifying the number of columns in advance.
##
## column  : character vector to split.
## pattern : delimiter passed to stringr::str_split_fixed() (default ",").
## prefix  : prefix for the output column names (default "ans").
##
## Returns a tibble with one row per element of `column` and columns named
## <prefix>_1, <prefix>_2, ...; cells that str_split_fixed() padded with ""
## are returned as NA.
split_string_multiple <- function(column, pattern = ",", prefix = "ans") {
  strings <- str_split_fixed(column, pattern, n = Inf)
  # Fill the empty spaces "" returned by filling the matrix to the right, with NAs
  strings[!is.na(strings) & strings == ""] <- NA_character_
  # Name the columns before converting, so as_tibble() does not have to
  # repair the names of an unnamed matrix.
  colnames(strings) <- paste(prefix, seq_len(ncol(strings)), sep = "_")
  as_tibble(strings)
}
## Bar chart of respondents per tool type ("Not Sure" excluded), ordered by count
wdat %>%
  count(toolType1) %>%
  filter(toolType1 != "Not Sure") %>%
  ggplot(aes(y = fct_reorder(toolType1, n), x = n, fill = toolType1)) +
  geom_col(position = "dodge", width = .6) +
  geom_text(aes(label = n), hjust = 1.5, color = "white") +
  theme_light() +
  theme(
    legend.position = "none",
    panel.grid = element_blank()
  ) +
  labs(x = "Frequency", y = "Software type for most mature tool of the respondant")
## Tool-type counts ("Not Sure" excluded), one panel per experience level
wdat %>%
  filter(toolType1 != "Not Sure") %>%
  ggplot(aes(y = fct_infreq(toolType1), fill = toolType1)) +
  geom_bar(width = .6) +
  facet_wrap(~experience) +
  theme_light() +
  theme(
    strip.background = element_rect(fill = "steelblue4"),
    strip.text = element_text(colour = "white", size = 12),
    legend.position = "none",
    panel.grid = element_blank()
  ) +
  labs(
    x = "Frequency",
    y = "Software type for most mature tool of the respondant",
    subtitle = "Facet by experience (# of tools)"
  )
## Since this is a multiple choice question, we need to separate the (unequal) number of answers given by each respondent
## Note: some people selected Other and put in their version, we need to identify those as well
## Defined strings = strings up to the first comma
## First, we create a variable containing all strings to match with response data. We take names up to the first comma, which will be our separator.
## **Notice double spaces in names of certain strings: Computing, Command-line,
SciToolsType.defined <- c(
  "Computing Web-based Platform - A website providing computing resources and possibly data",
  "Web-based tool - A tool that runs in your web browser but doesn't necessarily provide access to data",
  "Bioconductor R packages",
  "Other R packages (not Bioconductor)",
  "Jupyter Notebooks",
  "Desktop Application - A tool that runs on your desktop environment with a GUI",
  "Database/Ontology",
  "Plug-in - A software component encapsulating a set of related functions, which are not standalone, i.e. depend upon other software for its use, e.g. a Javascript widget, or a plug-in, extension add-on etc. that extends the function of some existing to",
  "Command-line tool/Other scripts - A tool that works with a command-line interface or environment",
  "Suite - multiple tools that work together"
)

## Residual strings = all comma separated strings after the first comma
SciToolsType.residual_string <- c(
  "which are not standalone", "i.e. depend upon other software for its use", "e.g. a Javascript widget",
  "or a plug-in", "extension add-on etc. that extends the function of some existing to",
  "where Jupyter notebooks are a component"
)

## Find all relevant unique strings in a vector
SciToolsType.vec <- wdat %>%
  select(Timestamp, toolType) %>%
  bind_cols(split_string_multiple(.$toolType, ",", "ans")) %>% # split string columns into multiple columns
  select(-toolType) %>% # remove original variable
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>% # take data into long form to aid filtering operations
  filter(!is.na(vals)) %>% # filter NA values
  mutate(vals = str_trim(vals, side = "both")) %>% # remove leading and trailing white spaces in the strings
  count(vals) %>%
  filter(!vals %in% SciToolsType.residual_string) %>% # remove residual strings from the vectors
  pull(vals)
## The drawback is when the user provides comma-separated response. Then we need to manually include that in residual strings
############ RECODE 01: All Python responses to Python/Jupiter Notebooks ######################
## 1. Remove Not Sure
## 2. All Python responses to Python/Jupiter Notebooks
## Consider each timestamp (all unique) as an identifier
tab.toolType <- wdat %>%
  select(Timestamp, toolType) %>%
  bind_cols(split_string_multiple(.$toolType, ",", "ans")) %>% # split string columns into multiple columns
  select(-toolType) %>% # remove original variable
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>% # take data into long form to aid filtering operations
  filter(!is.na(vals)) %>% # filter NA values
  mutate(vals = str_trim(vals, side = "both")) %>% # remove leading and trailing white spaces in the strings
  filter(vals %in% SciToolsType.vec) %>% # keep only the actual tool names (defined up to first comma)
  arrange(Timestamp, Answer) %>%
  group_by(Timestamp) %>%
  mutate(answer_count = row_number()) %>% # running count of the answers given by each respondent
  ungroup() %>%
  mutate(Answer = paste0("ans_", answer_count)) %>%
  ## RECODE 01: Python issue and Not Sure removal
  filter(!vals %in% c("Not Sure")) %>%
  mutate(vals = case_when(
    str_detect(vals, "Python|Jupyter") ~ "Python/Jupyter Notebooks",
    TRUE ~ vals
  ))
## Two stacked panels: (1) frequency of each tool-type category,
## (2) number of categories selected per respondent.
ggpubr::ggarrange(
  ## Frequency distribution of the categories
  tab.toolType %>%
    separate(col = vals, into = c("vals", "fat"), sep = " - ") %>%
    select(-fat) %>%
    mutate(vals2 = case_when(
      vals %in% c("Bioconductor R packages", "Other R packages (not Bioconductor)") ~ "R packages",
      vals %in% c("Command-line tool/Other scripts") ~ "Command-line tools/Scripts",
      vals %in% c("Computing Web-based Platform", "Web API") ~ "Computing platforms",
      vals %in% c("Desktop Application") ~ "Desktop Application",
      vals %in% "Database/Ontology" ~ "Databases/Ontology",
      vals %in% "Plug-in" ~ "Plug-in",
      vals %in% "Python/Jupyter Notebooks" ~ "Python/Jupyter",
      vals %in% "Stand alone C++ code" ~ "C++ code",
      vals %in% "Suite" ~ "Suite",
      TRUE ~ vals
    )) %>%
    ggplot(aes(y = forcats::fct_infreq(vals2), fill = vals2)) +
    geom_bar(width = 0.6, show.legend = FALSE) +
    geom_text(stat = "count", aes(label = after_stat(count)), hjust = 1.2, color = "white") +
    # NOTE(review): `vals2` is created on tab.toolType above, not on `wdat`
    # in this script, so these limits are probably NULL -- confirm the intent.
    scale_x_discrete(limits = rev(levels(wdat$vals2))) +
    labs(x = "Count", y = "Tool types") +
    scale_fill_manual(values = c("#7CAE00", "#00BFC4", "#C77CFF", "#B2D235", "#A65628", "#FF7F00", "#1F77B4", "#D62728", "#9467BD", "#f39de1")) +
    theme_light() +
    theme(panel.grid = element_blank()),
  ## How many single and multiple responses
  tab.toolType %>%
    group_by(Timestamp) %>%
    summarise(answers_freq = max(answer_count)) %>%
    ungroup() %>%
    ggplot(aes(answers_freq, fill = factor(answers_freq))) +
    geom_bar(width = 0.6, show.legend = FALSE) +
    geom_text(stat = "count", aes(label = after_stat(count)), vjust = 1.2, color = "white") +
    scale_x_continuous(breaks = 1:6) +
    labs(y = "count", x = "Frequency of number of categories selected by a respondent") +
    scale_fill_manual(values = c("#7CAE00", "#A65628", "#FF7F00", "#1F77B4", "#94a7BD", "#f39de1")) +
    theme_light() +
    theme(panel.grid = element_blank()),
  ncol = 1, heights = c(1.5, 1)
)
## Bar chart of respondents per data type, ordered by count
wdat %>%
  count(dataType1) %>%
  ggplot(aes(y = fct_reorder(dataType1, n), x = n, fill = dataType1)) +
  geom_col(position = "dodge", width = .6) +
  geom_text(aes(label = n), hjust = 1.5, color = "white") +
  theme_light() +
  theme(legend.position = "none") +
  labs(x = "Frequency", y = "Data type for the most mature software of the respondant") +
  theme(panel.grid = element_blank())

## Data-type counts, one panel per experience level
wdat %>%
  ggplot(aes(y = fct_infreq(dataType1), fill = dataType1)) +
  geom_bar(width = .6) +
  facet_wrap(~experience) +
  theme_light() +
  theme(
    strip.background = element_rect(fill = "steelblue4"),
    strip.text = element_text(colour = "white", size = 12),
    legend.position = "none"
  ) +
  labs(x = "Frequency", y = "Data type for the most mature software of the respondant", subtitle = "Facet by experience") +
  theme(panel.grid = element_blank())
Since the respondents were allowed to choose multiple options, we also investigated how many people chose single and multiple answers.
Recoded all one-off “Other” responses into the “Multiple data types” category.
Merged the other response types into the “Multiple data types” category.
## Since this is a multiple choice question, we need to separate the (unequal) number of answers given by each respondent
## Note: some people selected Other and put in their version, we need to identify those as well
## Defined strings = strings up to the first comma
DataTypes.defined <- c("Omics - proteomics", "Clinical", "Imaging", "Supports multiple types of data")
## Residual strings = all comma separated strings after the first comma
DataTypes.residual_strings <- c("genomics", "metabolomics")

## Find all relevant unique strings in a vector
DataTypes.vec <- wdat %>%
  select(Timestamp, dataType) %>%
  bind_cols(split_string_multiple(.$dataType, ",", "ans")) %>%
  select(-dataType) %>%
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%
  filter(!is.na(vals)) %>%
  mutate(vals = str_trim(vals, side = "both")) %>%
  count(vals) %>%
  filter(!vals %in% DataTypes.residual_strings) %>%
  pull(vals)

## Consider each timestamp (all unique) as an identifier
tab.dataType <- wdat %>%
  select(Timestamp, dataType) %>%
  bind_cols(split_string_multiple(.$dataType, ",", "ans")) %>%
  select(-dataType) %>%
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%
  filter(!is.na(vals)) %>%
  mutate(vals = str_trim(vals, side = "both")) %>%
  filter(vals %in% DataTypes.vec) %>%
  arrange(Timestamp, Answer) %>%
  group_by(Timestamp) %>%
  mutate(answer_count = row_number()) %>% # running count of answers per respondent
  ungroup() %>%
  mutate(Answer = paste0("ans_", answer_count)) %>%
  ##################### Recode the one-off other categories ##################
  mutate(vals = case_when(
    str_detect(vals, "Omics|Imaging|Clinical") ~ vals,
    str_detect(vals, "Visualization|Physics and Biology") ~ "Others",
    TRUE ~ "Multiple data types"
  )) %>%
  ## Add the full category names again
  mutate(vals = if_else(vals == "Omics - proteomics", true = "Omics - proteomics, genomics, metabolomics", false = vals))
# wdat$dataType %>% table()
## Two stacked panels: (1) frequency of each data-type category,
## (2) number of categories selected per respondent.
ggpubr::ggarrange(
  ## Frequency distribution of the categories
  tab.dataType %>%
    separate(col = vals, into = c("vals", "fat"), sep = " - ") %>%
    select(-fat) %>%
    ggplot(aes(y = forcats::fct_infreq(vals), fill = vals)) +
    geom_bar(width = 0.6) +
    geom_text(stat = "count", aes(label = after_stat(count)), hjust = 1.2, color = "white") +
    scale_x_continuous(breaks = seq(3, 30, by = 3)) +
    scale_fill_manual(values = c("#7CAE00", "#1F77B4", "#D62728", "#9467BD", "#f39de1")) +
    labs(x = "count", y = "Data types") +
    theme_light() +
    theme(panel.grid.major = element_blank()),
  ## How many single and multiple responses
  tab.dataType %>%
    group_by(Timestamp) %>%
    summarise(answers_freq = max(answer_count)) %>%
    ungroup() %>%
    ggplot(aes(answers_freq, fill = factor(answers_freq))) +
    geom_bar(width = 0.6, show.legend = FALSE) +
    geom_text(stat = "count", aes(label = after_stat(count)), vjust = 1.1, color = "white") +
    scale_x_continuous(breaks = 1:6) +
    labs(y = "count", x = "Frequency of number of categories selected by a respondent") +
    scale_fill_manual(values = c("#7CAE00", "#A65628", "#FF7F00", "#1F77B4", "#94a7BD", "#f39de1")) +
    theme_light() +
    theme(panel.grid = element_blank()),
  ncol = 1, heights = c(1.5, 1)
)
## Long-format data-type answers (one row per respondent x selected category),
## keeping Link, roles and experience alongside.
df.dataType2 <- wdat %>%
  select(Timestamp, Link, roles, experience, dataType) %>%
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9")) %>%
  bind_cols(split_string_multiple(.$dataType, ",", "ans")) %>%
  select(-dataType) %>%
  pivot_longer(cols = -c(Timestamp, Link, experience, roles), names_to = "Answer", values_to = "vals") %>%
  filter(!is.na(vals)) %>%
  mutate(vals = str_trim(vals, side = "both")) %>%
  filter(vals %in% DataTypes.vec) %>%
  arrange(Timestamp, Answer) %>%
  separate(col = vals, into = c("vals", "fat"), sep = " - ") %>%
  select(-fat) %>%
  #### Recoding other one-off categories
  mutate(vals = case_when(
    str_detect(vals, "Omics|Imaging|Clinical") ~ vals,
    TRUE ~ "Multiple data types"
  ))
## Plot: data types selections together
## Each respondent's selected categories are pasted into one label before counting.
df.dataType2 %>%
  rename(dataType = vals) %>%
  group_by(Timestamp, Link, experience, roles) %>%
  summarise(selections = str_c(dataType, collapse = ", ")) %>%
  ungroup() %>%
  mutate(selections = if_else(selections == "Multiple data types, Multiple data types", "Multiple data types", selections)) %>%
  ggplot(aes(y = fct_infreq(selections), fill = selections)) +
  geom_bar(width = 0.6, show.legend = FALSE) +
  geom_text(stat = "count", aes(label = after_stat(count)), hjust = 1.2, size = 3) +
  labs(
    x = "Frequency", y = "Data type for the most mature software of the respondant",
    subtitle = "Facet by experience"
  ) +
  facet_wrap(~experience) +
  theme_light() +
  theme(
    strip.background = element_rect(fill = "steelblue4"),
    strip.text = element_text(colour = "white", size = 12),
    legend.position = "none",
    panel.grid.major = element_blank()
  )
## Experience level per respondent, with the levels in ascending order
tab.Experience <- wdat %>%
  select(Timestamp, experience) %>%
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9"))

## Number of respondents per experience level
tab.Experience %>%
  ggplot(aes(x = experience, fill = experience)) +
  geom_bar(width = 0.5, show.legend = FALSE) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = 1.5, col = "white") +
  labs(y = "count", x = "Experience by number of software projects of respondent") +
  scale_y_continuous(breaks = seq(0, 15, by = 3)) +
  scale_fill_manual(values = c("#229ae1", "#5F77DF", "#9467BD", "#4a2a28")) +
  theme_light() +
  theme(panel.grid.major = element_blank())
## Data Type x Experience
wdat %>%
  ggplot(aes(y = experience, fill = dataType1)) +
  geom_bar(position = "dodge", width = .6) +
  facet_wrap(~dataType1) +
  theme_light() +
  theme(
    strip.background = element_rect(fill = "steelblue4"),
    strip.text = element_text(colour = "white", size = 12),
    legend.position = "none",
    panel.grid = element_blank()
  ) +
  labs(
    x = "Frequency", y = "Experience by number of software projects of respondent",
    subtitle = "Facet by data type for the most mature software of the respondant"
  )
## Tool type x Data type
wdat %>%
  ggplot(aes(y = toolType1, fill = dataType1)) +
  geom_bar(position = "dodge", width = .6) +
  facet_wrap(~dataType1) +
  theme_light() +
  theme(
    strip.background = element_rect(fill = "steelblue4"),
    strip.text = element_text(colour = "white", size = 12),
    legend.position = "none",
    panel.grid = element_blank()
  ) +
  # The y axis shows toolType1 and the panels are dataType1; the labels
  # previously described experience / tool-type facets.
  labs(
    x = "Frequency", y = "Tool type for the most mature software of the respondant",
    subtitle = "Facet by data type for the most mature software of the respondant"
  )
## Tool type x Experience x Data type
## Jittered points: one point per respondent, coloured by data type.
wdat %>%
  ggplot(aes(y = toolType1, x = experience, col = dataType1)) +
  geom_jitter(width = .2, height = .3, size = 3, alpha = .7) +
  labs(
    x = "Experience by number of software projects of respondent",
    y = "Tool type for the most mature software of the respondant",
    col = "Data types",
    subtitle = "Relationship between Experience, Tool Types, and Data Types \nfor the most mature software of the respondant"
  ) +
  scale_color_brewer(palette = "Dark2") +
  theme_light()
We also investigated how many people chose single and multiple answers.
Recoded “Principal investigator” as “Manager/advisor”.
## Since this is a multiple choice question, we need to separate the (unequal) number of answers given by each respondent
## Note: some people selected Other and put in their version, we need to identify those as well
## Defined strings = strings up to the first comma
Roles.defined <- c(
  "Manager/advisor", "Software Developer/Maintainer", "Outreach Specialist",
  "Trainee - postdoc", "Trainee - graduate student"
)
## No residual strings are listed for this question
Roles.residual_strings <- NULL

## Find all relevant unique strings in a vector
Roles.vec <- wdat %>%
  select(Timestamp, roles) %>%
  bind_cols(split_string_multiple(.$roles, ",", "ans")) %>%
  select(-roles) %>%
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%
  filter(!is.na(vals)) %>%
  mutate(vals = str_trim(vals, side = "both")) %>%
  count(vals) %>%
  filter(!vals %in% Roles.residual_strings) %>%
  pull(vals)

## Consider each timestamp (all unique) as an identifier
tab.Roles <- wdat %>%
  select(Timestamp, roles) %>%
  bind_cols(split_string_multiple(.$roles, ",", "ans")) %>%
  select(-roles) %>%
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%
  filter(!is.na(vals)) %>%
  mutate(vals = str_trim(vals, side = "both")) %>%
  filter(vals %in% Roles.vec) %>%
  arrange(Timestamp, Answer) %>%
  group_by(Timestamp) %>%
  mutate(answer_count = row_number()) %>% # running count of answers per respondent
  ungroup() %>%
  mutate(Answer = paste0("ans_", answer_count)) %>%
  ################# Recode 03 ######################
  mutate(vals = if_else(vals == "Principal investigator", true = "Manager/advisor", vals))
# wdat$roles %>% table()
## Two stacked panels: (1) frequency of each role,
## (2) number of roles selected per respondent.
ggpubr::ggarrange(
  ## Frequency distribution of the categories
  tab.Roles %>%
    separate(col = vals, into = c("vals", "fat"), sep = " - ") %>%
    select(-fat) %>%
    ggplot(aes(y = forcats::fct_infreq(vals), fill = vals)) +
    geom_bar(width = 0.6, show.legend = FALSE) +
    geom_text(stat = "count", aes(label = after_stat(count)), hjust = 1.3, col = "white") +
    scale_fill_manual(values = c("#7CAE00", "#1F77B4", "#9467BD")) +
    scale_x_continuous(breaks = seq(0, 45, by = 5)) +
    # The y axis lists roles; the label previously read "Data types".
    labs(x = "count", y = "Roles", fill = "Role of the respondent") +
    theme_light() +
    theme(panel.grid = element_blank()),
  ## How many single and multiple responses
  tab.Roles %>%
    group_by(Timestamp) %>%
    summarise(answers_freq = max(answer_count)) %>%
    ungroup() %>%
    ggplot(aes(answers_freq, fill = as.factor(answers_freq))) +
    geom_bar(width = 0.3, show.legend = FALSE) +
    geom_text(stat = "count", aes(label = after_stat(count)), vjust = 1.2) +
    scale_x_continuous(breaks = 1:6) +
    scale_fill_manual(values = c("#A65628", "#FF7F00", "#f399e1")) +
    labs(y = "Count", x = "Frequency of number of categories selected by a respondent") +
    theme_light() +
    theme(panel.grid = element_blank()),
  ncol = 1, heights = c(1.5, 1)
)
## Shorten the contact-information question to `contact` and collapse its answers
wdat %<>%
  rename(contact = `Which of the following do you provide for contact information to help users use your scientific software/tool?`) %>%
  mutate(contact = case_when(
    str_detect(contact, "More") ~ "More Extensive",
    str_detect(contact, "Simple") ~ "Simple",
    TRUE ~ contact
  ))

# Generate plot using data
wdat %>%
  ggplot(aes(y = fct_infreq(contact), fill = experience)) +
  geom_bar(position = "dodge", width = 0.7) +
  labs(x = "Count", y = "Contact Type", fill = "Experience") +
  scale_x_continuous(n.breaks = 6) +
  scale_fill_manual(values = c("#64C204", "#56B4E9", "#F5A800", "#dd7ba7")) +
  theme_light() +
  theme(panel.grid.major = element_blank())
## `doc_types` = number of comma-separated documentation strategies per respondent
wdat %<>%
  rename(documentation = `What type of documentation/training for users to learn how to use a tool did/do you provide?`) %>%
  mutate(doc_types = as.character(1 + str_count(documentation, ","))) %>%
  mutate(doc_types = case_when(
    str_detect(documentation, "None") ~ "0",
    documentation == "README file" ~ "0", # readme only also counted as no additional documentation
    TRUE ~ doc_types
  )) %>%
  mutate(doc_types = as.numeric(doc_types))

## Distribution of the number of documentation strategies by experience level
wdat %>%
  ggplot(aes(y = doc_types, x = experience, fill = experience)) +
  geom_boxplot(width = .4, show.legend = FALSE) +
  labs(
    y = "Number of documentation strategies employed",
    x = "Experience by number of software projects of respondent"
  ) +
  theme_light() +
  coord_flip() +
  theme(panel.grid = element_blank())
## Share of each documentation/training type within each experience level;
## bar segments are labelled with their raw counts.
ggplot(tab.docsTrain_exp, aes(x = experience, fill = fct_infreq(vals))) +
  geom_bar(position = "fill", width = .5) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_fill(vjust = .5),
    size = 3
  ) +
  scale_x_discrete(expand = c(0, 0.05)) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_brewer(palette = "Set3") +
  coord_flip() +
  guides(fill = guide_legend(nrow = 8)) +
  labs(
    title = "Documentation/Training by Experience",
    y = "Proportion",
    x = "Experience by number of software projects of respondent",
    fill = "Documentation"
  ) +
  theme_light() +
  theme(
    legend.position = "right",
    plot.title = element_text(size = 14, face = "bold", margin = margin(b = 10)),
    axis.title.x = element_blank(),
    axis.text.y = element_text(size = 11),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 6),
    panel.grid = element_blank()
  )
# wdat$`What types of metrics have you used to evaluate user engagement with your scientific software/tool?` %>% table
## Fragments produced by splitting answer options that themselves contain commas
metricType.residual_strings <- c(
  "clicks etc.)", "forks", "etc.)", "video views etc.)", "survey results",
  "tweets etc.)", "stars", "forks", "issues", "pull requests)",
  "registered users", "job submissions", "error reports etc.)"
)

# Run this code to find the vector of all researcher-defined, and user-defined "other" strings.
metricType.vec <- wdat %>%
  select(Timestamp,
    metricType = `What types of metrics have you used to evaluate user engagement with your scientific software/tool?`
  ) %>%
  bind_cols(split_string_multiple(.$metricType, ",", "ans")) %>%
  select(-metricType) %>%
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%
  filter(!is.na(vals)) %>%
  mutate(vals = str_trim(vals, side = "both")) %>%
  count(vals) %>%
  filter(!vals %in% metricType.residual_strings) %>%
  pull(vals)

## Create the data set for contact(stratifying variable) and the required variable for analysis
tab.metric_exp <- wdat %>%
  select(Timestamp, experience,
    metricType = `What types of metrics have you used to evaluate user engagement with your scientific software/tool?`
  ) %>%
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9")) %>%
  bind_cols(split_string_multiple(.$metricType, ",", "ans")) %>%
  select(-metricType) %>%
  pivot_longer(cols = -c(Timestamp, experience), names_to = "Answer", values_to = "vals") %>%
  filter(!is.na(vals)) %>%
  mutate(vals = str_trim(vals, side = "both")) %>%
  filter(vals %in% metricType.vec) %>%
  arrange(Timestamp, Answer) %>%
  group_by(Timestamp) %>%
  mutate(answer_count = row_number()) %>% # running count of answers per respondent
  ungroup() %>%
  mutate(Answer = paste0("ans_", answer_count))
## Share of each engagement-metric type within each experience level
tab.metric_exp %>%
  mutate(vals = if_else(str_detect(vals, "interaction"), "Website interaction (unique visitors)", vals)) %>%
  ggplot(aes(x = experience, fill = fct_rev(vals))) +
  geom_bar(position = "fill", width = .6) +
  geom_text(
    aes(label = after_stat(count)),
    stat = "count",
    position = position_fill(vjust = .5),
    size = 3
  ) +
  labs(y = "Proportion", x = "Experience by # of software projects", fill = "Type \nof metric") +
  coord_flip() +
  # A plot keeps only one fill scale: a separate scale_fill_discrete(labels = ...)
  # was replaced by the brewer scale added after it, dropping the label
  # wrapping. Both settings now live on the single brewer scale.
  scale_fill_brewer(palette = "Set3", labels = scales::wrap_format(16)) +
  theme_light() +
  theme(
    legend.position = "bottom",
    panel.grid = element_blank()
  ) +
  guides(fill = guide_legend(nrow = 4, reverse = TRUE, keywidth = .7))
# wdat$`Which of the following software health infrastructure have you implemented for your scientific software/tool?` %>% table
# Run this code to find the vector of all researcher-defined, and user-defined "other" strings.
infrastructure.vec <- wdat %>%
  select(Timestamp,
    infractructure = `Which of the following software health infrastructure have you implemented for your scientific software/tool?`
  ) %>%
  bind_cols(split_string_multiple(.$infractructure, ",", "ans")) %>%
  select(-infractructure) %>%
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%
  filter(!is.na(vals)) %>%
  mutate(vals = str_trim(vals, side = "both")) %>%
  count(vals) %>%
  pull(vals)

## Create the data set for contact(stratifying variable) and the required variable for analysis
tab.infrastr_exp <- wdat %>%
  select(Timestamp, experience,
    infractructure = `Which of the following software health infrastructure have you implemented for your scientific software/tool?`
  ) %>%
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9")) %>%
  bind_cols(split_string_multiple(.$infractructure, ",", "ans")) %>%
  select(-infractructure) %>%
  pivot_longer(cols = -c(Timestamp, experience), names_to = "Answer", values_to = "vals") %>%
  filter(!is.na(vals)) %>%
  mutate(vals = str_trim(vals, side = "both")) %>%
  # filter(vals %in% metricType.vec) %>%
  arrange(Timestamp, Answer) %>%
  group_by(Timestamp) %>%
  mutate(answer_count = row_number()) %>% # running count of answers per respondent
  ungroup() %>%
  mutate(Answer = paste0("ans_", answer_count))
## Share of each software-health infrastructure type within each experience level
tab.infrastr_exp %>%
  mutate(vals = if_else(str_detect(vals, "linux|windows"), "Other automations", vals)) %>%
  ggplot(aes(x = experience, fill = fct_infreq(vals))) +
  geom_bar(position = "fill", width = .6, show.legend = TRUE) +
  geom_text(
    aes(label = after_stat(count)),
    stat = "count",
    position = position_fill(vjust = .5),
    size = 3
  ) +
  labs(
    x = "Experience by # of software projects",
    fill = "software \nhealth \ninfrastructure", y = "Proportion"
  ) +
  scale_fill_discrete(
    labels = scales::wrap_format(20),
    type = c("#8DD3C7", "#FFF0B3", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#f1453a")
  ) +
  coord_flip() +
  guides(fill = guide_legend(nrow = 2, keyheight = .5, keywidth = .7)) +
  theme_light() +
  theme(legend.position = "bottom", panel.grid = element_blank())
# wdat$`What major barriers are hindering your ability to evaluate the engagement of your tool(s)?` %>% table
# Run this code to find the vector of all researcher-defined, and user-defined "other" strings.
barriers.vec <- wdat %>%
  select(Timestamp,
    barriers = `What major barriers are hindering your ability to evaluate the engagement of your tool(s)?`
  ) %>%
  bind_cols(split_string_multiple(.$barriers, ",", "ans")) %>%
  select(-barriers) %>%
  pivot_longer(cols = -Timestamp, names_to = "Answer", values_to = "vals") %>%
  filter(!is.na(vals)) %>%
  mutate(vals = str_trim(vals, side = "both")) %>%
  count(vals) %>%
  pull(vals)

## Create the data set for contact(stratifying variable) and the required variable for analysis
tab.barriers_exp <- wdat %>%
  select(Timestamp, experience,
    barriers = `What major barriers are hindering your ability to evaluate the engagement of your tool(s)?`
  ) %>%
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9")) %>%
  bind_cols(split_string_multiple(.$barriers, ",", "ans")) %>%
  select(-barriers) %>%
  pivot_longer(cols = -c(Timestamp, experience), names_to = "Answer", values_to = "vals") %>%
  filter(!is.na(vals)) %>%
  mutate(vals = str_trim(vals, side = "both")) %>%
  filter(vals %in% barriers.vec) %>%
  arrange(Timestamp, Answer) %>%
  group_by(Timestamp) %>%
  mutate(answer_count = row_number()) %>% # running count of answers per respondent
  ungroup() %>%
  mutate(Answer = paste0("ans_", answer_count))
## Share of each reported barrier within each experience level
tab.barriers_exp %>%
  mutate(vals = if_else(str_detect(vals, "invade"), "Privacy concerns", vals)) %>%
  ggplot(aes(x = experience, fill = fct_infreq(vals))) +
  geom_bar(position = "fill", width = .6, show.legend = TRUE) +
  geom_text(
    aes(label = after_stat(count)),
    stat = "count",
    position = position_fill(vjust = .5),
    size = 3
  ) +
  labs(x = "Experience by # of software projects", fill = "Major Barriers", y = "Proportion") +
  coord_flip() +
  theme_light() +
  theme(legend.position = "bottom", panel.grid = element_blank()) +
  # Only one fill scale can be active; the scale_fill_brewer() call that used
  # to precede this one was always replaced by it, so it has been dropped.
  scale_fill_discrete(
    labels = scales::wrap_format(18),
    type = c(
      "#8DD3C7", "#FFF0B3", "#BEBADA", "#FB8072",
      "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#F1453a"
    )
  ) +
  guides(fill = guide_legend(nrow = 3, byrow = TRUE))
## Share of each fairness-assessment answer within each experience level
wdat %>%
  select(Timestamp, experience,
    fairness = `Have you been able to assess your tool's fairness (Not to be confused with FAIRness as defined as Findable, Accessible, Interoperable, and Reusable). Here we define software fairness in terms of the design being mindful of inclusivity and bias. See this link for more information.`
  ) %>%
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9")) %>%
  filter(!is.na(fairness)) %>%
  bind_cols(split_string_multiple(.$fairness, ",", "ans")) %>%
  select(-fairness) %>%
  pivot_longer(cols = -c(Timestamp, experience), names_to = "Answer", values_to = "vals") %>%
  filter(!is.na(vals)) %>%
  mutate(vals = str_trim(vals, side = "both")) %>%
  arrange(Timestamp, Answer) %>%
  group_by(Timestamp) %>%
  mutate(answer_count = row_number()) %>%
  ungroup() %>%
  # Splitting on "," breaks this answer in two: restore the full text on the
  # first fragment and drop the second.
  mutate(vals = if_else(str_detect(vals, "Attempted"), "Attempted, but encountered challenges", vals)) %>%
  filter(vals != "but encountered challenges") %>%
  ggplot(aes(x = experience, fill = fct_infreq(vals))) +
  geom_bar(position = "fill", width = .6, show.legend = TRUE) +
  geom_text(
    aes(label = after_stat(count)),
    stat = "count",
    position = position_fill(vjust = .5),
    size = 3
  ) +
  labs(
    x = "Experience by # of software projects", fill = "Fairness", y = "Proportion",
    subtitle = "fairness = Have you been able to assess your tool's fairness (Not to be confused with \nFAIRness as defined as Findable, Accessible, Interoperable, and Reusable). \nHere we define software fairness in terms of the design being mindful of inclusivity and bias."
  ) +
  coord_flip() +
  theme_light() +
  theme(legend.position = "bottom", panel.grid = element_blank()) +
  # Only one fill scale can be active; the scale_fill_brewer() call that used
  # to precede this one was always replaced by it, so it has been dropped.
  scale_fill_discrete(
    labels = scales::wrap_format(18),
    type = c(
      "#8DD3C7", "#FFF0B3", "#BEBADA", "#FB8072",
      "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#F1453a"
    )
  ) +
  guides(fill = guide_legend(nrow = 1))
## Share of each usability-optimization topic within each experience level
wdat %>%
  select(Timestamp, experience,
    usability = `What aspects of usability optimization have you or would you want to learn about?`
  ) %>%
  mutate(experience = fct_relevel(factor(experience), "1", "2-4", "5-9")) %>%
  filter(!is.na(usability)) %>%
  bind_cols(split_string_multiple(.$usability, ",", "ans")) %>%
  select(-usability) %>%
  pivot_longer(cols = -c(Timestamp, experience), names_to = "Answer", values_to = "vals") %>%
  filter(!is.na(vals)) %>%
  mutate(vals = str_trim(vals, side = "both")) %>%
  arrange(Timestamp, Answer) %>%
  group_by(Timestamp) %>%
  mutate(answer_count = row_number()) %>%
  ungroup() %>%
  mutate(Answer = paste0("ans_", answer_count)) %>%
  ggplot(aes(x = experience, fill = fct_infreq(vals))) +
  geom_bar(position = "fill", width = .6, show.legend = TRUE) +
  geom_text(
    aes(label = after_stat(count)),
    stat = "count",
    position = position_fill(vjust = .5),
    size = 3
  ) +
  labs(
    x = "Experience by # of software projects", fill = "Usability optimization", y = "Proportion",
    subtitle = "Usability: What aspects of usability optimization have you or would you want to learn about?"
  ) +
  coord_flip() +
  theme_light() +
  theme(legend.position = "bottom", panel.grid = element_blank()) +
  scale_fill_discrete(
    labels = scales::wrap_format(20),
    type = c("#8DD3D7", "#80B1D3", "#B3DE69")
  ) +
  guides(fill = guide_legend(nrow = 1))