--db resfinder --mincov 60 --minid 90 --csv *.fasta > all_paired_blood_faecal_hybracter_vfdb_fasta_L60_T90_plus_BSI_indicators.csv abricate
5: Analysing virulence factor determinants
Introduction
This tutorial screens genomes against the virulence factor database (VFDB) to search for virulence genes and visualises them as a heatmap.
It also shows how the BSI and DASSIM databases were compared.
Part 1 - Genome download and virulence factor screening with abricate
1.1: Download reads and run assemblies
1.2: Run abricate on genomes
Download abricate if you haven’t already, then you can run it simply with the following command.
Part 2 - Plotting virulence factor heatmaps in R
2.1: Import the data
Import the data from abricate
library(readr)
library(dplyr)
library(tidyr)
library(pheatmap)
library(RColorBrewer)
library(readr)
library(reshape2)
library(pheatmap)
library(RColorBrewer)
# Reading in data
= readr::read_csv("../data/all_paired_blood_faecal_hybracter_vfdb_fasta_L60_T90_plus_BSI_indicators.csv")
abricate_output
= abricate_output ab_tab
2.2: Wrangle abricate dataset
= function(ab_tab)
rm_duplicates
{
# First Remove duplicates
# These cause issues with creating a lit rather than a dataframe
# When compared against the database there can sometimes be duplciates
# arrange and group by ab_tab
<- ab_tab %>%
ab_tab_unique arrange(GENE, -`%COVERAGE`) %>%
group_by(GENE)
# Check the colnames
colnames(ab_tab_unique)
# Check the duplicates
duplicated(ab_tab_unique[c("#FILE","GENE")])
print("Duplicates found:")
print(c(which(duplicated(ab_tab_unique[c("#FILE","GENE")]) == TRUE))) # see which contain duplicates
# Remove columns that contain duplicates in FILE, GENE and COVERAGE
<- ab_tab[!duplicated(ab_tab_unique[c("#FILE","GENE")]),] #using colnames
ab_tab_unique #ab_tab_unique <- ab_tab_unique[!duplicated(ab_tab_unique[c(1,2)]),] #using col numbers
# Check whether duplicates have been delted
print("Duplicates remaining:")
print(which(duplicated(ab_tab_unique[c("#FILE","GENE")]) == TRUE)) # see which contain duplicates
{return(ab_tab_unique)
}
}
# Run function
= rm_duplicates(ab_tab) ab_tab_unique
[1] "Duplicates found:"
[1] 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421
[1] "Duplicates remaining:"
[1] 724 728 729 733 734 736 740 741 745 748 749 751 752 754 757 758 760 764
= function(ab_tab)
mk_count_table
{# Funtion to make the Abricate table wide
# Create a new value from original ab_tab to wide count table
= ab_tab %>%
wide # select strain, gene and resistance and rename
select("#FILE", GENE, RESISTANCE) %>%
# rename
::rename(gene = "GENE",
dplyrfile = "#FILE",
resistance = "RESISTANCE") %>%
# count genes vs strains
count(file, gene) %>%
# Convert to wide table with genes as column headers and n as values
pivot_wider(names_from = gene,
values_from = n)
# Shorten strain names (remove the unnecessary string)
$file = gsub(".fa", "", wide$file)
wide$file = gsub("_assembled.fasta", "", wide$file)
wide
{return(wide)
}
}
# Run function
= mk_count_table(ab_tab_unique)
wide
$file wide
[1] "FSBL0558_hybractersta" "FSBL1386_hybractersta" "FSBL1448_flyesta"
[4] "FSBL1654_hybractersta" "FSBL1925_hybractersta" "FSBL2071_hybractersta"
[7] "FSBL2111_hybractersta" "FSBL2112_hybractersta" "FSBL2130_hybractersta"
[10] "FSBL2155_hybractersta" "FSBL2240_hybractersta" "FSBL2258_hybractersta"
[13] "FSBL2265_hybractersta" "FSFC0558_hybractersta" "FSFC1386_hybractersta"
[16] "FSFC1448_flyesta" "FSFC1654_unicyclersta" "FSFC1925_hybractersta"
[19] "FSFC2071_hybractersta" "FSFC2111K_hybractersta" "FSFC2112_hybractersta"
[22] "FSFC2130_hybractersta" "FSFC2155_hybractersta" "FSFC2240_unicyclersta"
[25] "FSFC2258_hybractersta" "FSFC2265_hybractersta"
$file <- gsub("_hybractersta", "", wide$file)
wide$file <- gsub("_flyesta", "", wide$file)
wide$file <- gsub("_unicyclersta", "", wide$file)
wide
# check
$file wide
[1] "FSBL0558" "FSBL1386" "FSBL1448" "FSBL1654" "FSBL1925" "FSBL2071"
[7] "FSBL2111" "FSBL2112" "FSBL2130" "FSBL2155" "FSBL2240" "FSBL2258"
[13] "FSBL2265" "FSFC0558" "FSFC1386" "FSFC1448" "FSFC1654" "FSFC1925"
[19] "FSFC2071" "FSFC2111K" "FSFC2112" "FSFC2130" "FSFC2155" "FSFC2240"
[25] "FSFC2258" "FSFC2265"
# Converts to dataframe
= function(wide)
count_to_dataframe
{= wide
wide
# make the genenames a variable
= wide$file
rows
# Remove the file column
= wide %>% select(-file)
wide3
# name the rows as strains
rownames(wide3) = rows
# convert to dataframe
= as.data.frame(wide3)
d
# convert na to 0
is.na(d)] = 0
d[
#name rows
rownames(d) = rows
# convert to matrix
= d
df
{return(df)
}
}
= count_to_dataframe(wide)
df
# Converts to matrix
= function(wide)
count_to_mat
{= wide
wide
# make the genenames a variable
= wide$file
rows
# Remove the file column
= wide %>% select(-file)
wide3
# name the rows as strains
rownames(wide3) = rows
# convert to dataframe
= as.data.frame(wide3)
d
# convert na to 0
is.na(d)] <- 0
d[
# name rows
rownames(d) = rows
# convert to matrix
= as.matrix(d)
mat
{return(mat)
}
}
# Run Function
= count_to_mat(wide)
mat
# Check output
nrow(mat)
[1] 26
class(mat)
[1] "matrix" "array"
colnames(mat)
[1] "hha" "yagZ/ecpA" "fyuA" "irp1" "irp2" "ybtA"
[7] "ybtE" "ybtP" "ybtQ" "ybtS" "ybtT" "ybtU"
[13] "ybtX" "aslA" "chuA" "chuS" "chuT" "chuU"
[19] "chuV" "chuW" "chuX" "chuY" "entA" "entB"
[25] "entC" "entD" "entE" "entF" "entS" "fdeC"
[31] "fepA" "fepB" "fepC" "fepD" "fepG" "fes"
[37] "fimA" "fimB" "fimC" "fimD" "fimE" "fimF"
[43] "fimG" "fimH" "fimI" "gspC" "gspD" "gspE"
[49] "gspF" "gspG" "gspH" "gspI" "gspJ" "gspK"
[55] "gspL" "gspM" "iucA" "iucB" "iucC" "iucD"
[61] "kpsD" "kpsM" "kpsT" "ompA" "papB" "papI"
[67] "papX" "sat" "senB" "vat" "yagV/ecpE" "yagW/ecpD"
[73] "yagX/ecpC" "yagY/ecpB" "ykgK/ecpR" "afaB-I" "afaC-I" "daaF"
[79] "draA" "draD" "draE2" "draP"
rownames(mat)
[1] "FSBL0558" "FSBL1386" "FSBL1448" "FSBL1654" "FSBL1925" "FSBL2071"
[7] "FSBL2111" "FSBL2112" "FSBL2130" "FSBL2155" "FSBL2240" "FSBL2258"
[13] "FSBL2265" "FSFC0558" "FSFC1386" "FSFC1448" "FSFC1654" "FSFC1925"
[19] "FSFC2071" "FSFC2111K" "FSFC2112" "FSFC2130" "FSFC2155" "FSFC2240"
[25] "FSFC2258" "FSFC2265"
2.3: Plot a basic heatmap
Next we will select the mutations based
# convert to binary
= count_to_dataframe(wide)
df <- ifelse(df > 0, 1, 0)
df
# Generate basic heatmap
pheatmap(df)
2.3: Plot an annotated heatmap
# Generate the heatmap with annotations
= read_csv("../data/Isolate_species.csv")
isolates
= as.data.frame(isolates)
isolates_df
= isolates_df[-1,]
isolates_df
<- c("K. pneumoniae" = "#DDAD3B",
species_colors "E. coli" = "#9F7831",
"K. quasipneumoniae similipneumoniae" = "#666666")
# Annotation for isolates (y-axis)
<- data.frame(Species = isolates_df$Species)
annotation_row rownames(annotation_row) <- isolates_df$Isolate
= read_csv("../data/VFDB_classes.csv")
vfdb_class
= as.data.frame(vfdb_class)
vfdb_class_df
<- c("Protectins" = "#4B9B7A",
class_colors "Toxins" = "#CA6728",
"Iron-uptake" = "#D43F88",
"Adhesins" = "#7470AF")
# Annotation for isolates (x-axis)
<- data.frame(class = vfdb_class_df$class)
annotation_col rownames(annotation_col) <- vfdb_class_df$GENE
# Create a list of annotation colors
<- list(class = class_colors, Species = species_colors)
annotation_colors
<- colorRampPalette(c("#FFFFFF","#FBF5BB", "#FADF9F", "#F8B877", "#D37A5A", "#A84451"))(6)
custom_colors
pheatmap(df,
annotation_col = annotation_col,
annotation_row = annotation_row,
annotation_colors = annotation_colors,
color = custom_colors,
show_rownames = TRUE,
show_colnames = TRUE,
border_color = "black")
Part 3 - Comparison of virulence factors across datasets in R
3.1: Import the data
Import the data from abricate for 773 isolates from the BSI and DASSIM datasets.
# Load necessary libraries
library(readxl)
library(readr)
library(pheatmap)
library(dplyr)
library(tidyr)
library(tibble)
library(RColorBrewer)
# 772 Malawi Kleb Strains
= readr::read_csv("../data/all_abricate_773_Ec_Kp_CHL_VFDB_fasta_L60_T90.csv")
abricate_output
= abricate_output ab_tab
#DASSIM <- DASSIM %>% select(matches("^ybt|^irp|^fyu"))
<- ab_tab %>% filter(grepl("ybt|fyu|irp", GENE)) iron_VF
3.2: Wrangle dataset
# Remove duplicates
= rm_duplicates(ab_tab) ab_tab_unique
[1] "Duplicates found:"
[1] 67 630 750 754 757 761 770 788 793 3914 4276 4279
[13] 4288 4291 4304 4319 4321 4482 4487 4497 4501 4515 4537 5093
[25] 5130 5174 5221 5321 5880 5914 5915 5922 5975 6017 8089 8780
[37] 9523 9759 9992 10037 10120 10123 10176 10195 10196 10197 10584 10598
[49] 10600 10602 10603 10703 10711 10948 10949 12707 12949 13359 13503 13537
[61] 13599 13667 14220 15243 15245 15320 15325 15389 15411 15456 15554 15606
[73] 16078 17304 17306 17418 17425 17512 17593 17892 17919 18066 18068 18258
[85] 18992 19318 20957 20958 21293 21648 22228 22317 22449 22560 22667 22673
[97] 22674 23783 23785 23796 23798 23803 23822 23824 23855 23858 23882 23910
[109] 23912 23922 23935 23944 23995 24008 24012 24017 24020 24073 24075 24086
[121] 24088 24093 24112 24114 24145 24148 24172 24200 24202 24212 24225 24235
[133] 24300 25000 25095 25100 25103 25110 25376 26040 26104 26150 26213 26273
[145] 26325 26356 26359 26368 26371 26404 26407 26516 26518 26555 26559 26644
[157] 26647 26655 26666 26671 26673 26675 26677 26682 26686 26690 26692 26694
[169] 26701 26704 26706 26710 26720 26722 26726 26746 26749 26758 26762 26782
[181] 26866 26869 26899 26907 26909 26911 26913 26929 26933 26937 26948 26952
[193] 26974 27016 27267 27271 27272 28096 28133 28198 28364 28366 28876 28941
[205] 29050 29111 29641 29672 29673 29732 29797 30328 30365 30367 30375 30431
[217] 30538 31116 31151 31159 31213 31257 31283 34416 34417 34942 34977 34985
[229] 35040 35108 35171
[1] "Duplicates remaining:"
[1] 1075 1083 1411 1414 3115 3116 3198 3199 3284 3967 3977 3978
[13] 4220 4221 4406 4407 4885 4886 5738 6502 6504 7153 7154 7235
[25] 7236 7692 7776 7779 7853 7859 7938 7941 8019 8022 8385 8422
[37] 9591 9592 9593 9594 9597 9599 9609 9610 9998 10000 10002 10012
[49] 10013 10014 10019 10508 10511 10624 10891 11065 11456 11457 11697 11698
[61] 11945 11951 12493 12650 12948 12957 12958 14295 14328 14417 14418 14537
[73] 14543 14999 15000 15143 15146 15152 15239 15240 15252 15253 15254 15291
[85] 15298 15299 15300 15306 15307 15401 15430 15646 15647 15665 15685 15704
[97] 15728 15792 15799 16387 16388 16389 16392 16393 16394 16398 17238 17239
[109] 18350 18351 18444 18445 18575 18606 18609 18930 19407 19413 19414 19627
[121] 19628 19934 19979 20757 20758 20759 20952 20953 21293 21720 21721 22104
[133] 22105 22672 22724 22725 23400 23427 23428 23431 23615 25109 25110 25118
[145] 25132 25566 25946 25964 25987 26450 26451 26494 26793 27176 27192 28250
[157] 28251 28274 28276 28277 28278 28279 28280 28801 28806 28818 28819 28820
[169] 28824 28883 28884 28891 28896 28901 28902 28903 28904 28905 28971 28977
[181] 28978 28979 28992 29884 29907 29908 29909 29917 29918 29919 29920 29921
[193] 29922 30323 30584 30594 30634 30635 30952 30971 30979 31031 32253 32262
[205] 32347 32358 32359 32485 32499 32500 32594 32652 32653 32787 32979 33063
[217] 33513 33514 33517 33533 33542 34034 34037 34038 34039 34040 34041 34089
[229] 34096
# Make count table
= mk_count_table(ab_tab_unique)
wide
# Rename necessary
$file = gsub("all_spades_assemblies/", "", wide$file)
wide$file = gsub(".fa", "", wide$file)
wide$file = gsub(".fasta", "", wide$file)
wide$file = gsub("_assembledsta", "", wide$file)
wide$file = gsub("_1_SPAdes_assemblysta", "", wide$file)
wide$file = gsub("_assembled.fasta", "", wide$file)
wide$file = gsub("_contigssta", "", wide$file)
wide$file = gsub("sta", "", wide$file)
wide
# Convert count table to dataframe
= count_to_dataframe(wide)
d
# Convert count table to matrix
= count_to_mat(wide) mat
3.3: Filter for ybt + fyu + irp
# Filter the dataframe for the column "ybt"
#filtered_df <- d[, grepl("ybt", colnames(d)), drop = FALSE]
#filtered_df$seq = rownames(filtered_df)
# Filter the dataframe for columns containing "ybt", "fyu", or "irp"
<- d[, grepl("ybt|fyu|irp", colnames(d)), drop = FALSE]
filtered_df
# Add row names as a new column
$seq <- rownames(filtered_df)
filtered_df
# View the filtered dataframe
head(filtered_df)
fyuA irp1 irp2 ybtA ybtE ybtP ybtQ ybtS ybtT ybtU ybtX seq
26141-1-134 1 1 1 1 1 1 1 1 1 1 1 26141-1-134
26141-1-135 1 1 1 1 1 1 1 1 1 1 1 26141-1-135
26141-1-136 1 1 1 1 1 1 1 1 1 1 1 26141-1-136
26141-1-137 1 1 1 1 1 1 1 1 1 1 1 26141-1-137
26141-1-138 0 0 0 0 0 0 0 0 0 0 0 26141-1-138
26141-1-139 1 1 1 1 1 1 1 1 1 1 1 26141-1-139
# Sum values by category
library(dplyr)
3.4: Join to metadata
= read_csv("../data/all_773_DASSIM_BSI_metadata.csv")
metadata = read_csv("../data/BSI_metadata_Ec.csv")
BSI_metadata_Ec = read_csv("../data/BSI_metadata_KpSc_2.csv")
BSI_metadata_KpSc
library(dplyr)
library(stringr)
<- d[, grepl("ybt|fyu|irp", colnames(d)), drop = FALSE]
filtered_df $seq <- rownames(filtered_df)
filtered_df
<- filtered_df %>%
filtered_df mutate(seq = gsub("-", "_", seq))
rownames(filtered_df) = gsub("-", "_", rownames(filtered_df))
# DASSIM ERR3865256 28099_1_10 ERR3865256
# Create a named vector for replacement
<- setNames(metadata$ENA_accession, metadata$lane)
replacement_map
# Perform the replacement
<- filtered_df %>%
filtered_df mutate(accession = str_replace_all(seq, replacement_map) # Replace lanes with ENA accessions
)
= data.frame(accession = metadata$ENA_accession,
comparison_df lane = metadata$lane)
# Add the accession column with conditional logic
"accession"] <- ifelse(
filtered_df[,grepl("^E", filtered_df[,"seq"]), # Check if seq starts with "E"
"seq"], # If true, copy seq to accession
filtered_df[,$accession[match(unlist(filtered_df[,"seq"]), comparison_df$lane)] # Otherwise, match and replace
comparison_df
)
# View the results
print(filtered_df["28099_1_10",]) # Should be ERR3865256
fyuA irp1 irp2 ybtA ybtE ybtP ybtQ ybtS ybtT ybtU ybtX seq
28099_1_10 0 0 0 0 0 0 0 0 0 0 0 28099_1_10
accession
28099_1_10 ERR3865256
# Reshape and summarize data for wide
$accession = metadata$ENA_accession
metadata
= metadata %>%
metadata_Ec_update left_join(BSI_metadata_Ec, by = c('Strain_ID'))
= metadata_Ec_update %>%
metadata_KpSc_update left_join(BSI_metadata_KpSc, by = c('Strain_ID'))
= filtered_df %>%
meta_join left_join(metadata_KpSc_update, by = c('accession'))
# For KpSC
unique(meta_join$Species)
[1] "E. coli" NA "KpSC"
= meta_join %>% filter(Species == "KpSC")
meta_join_KpSc
unique(meta_join$study )
[1] "DASSIM" NA "bacteraemia"
= meta_join_KpSc %>% filter(study == "DASSIM")
DASSIM = meta_join_KpSc %>% filter(study == "bacteraemia")
BSI
= BSI %>% filter(Source.y == "Blood")
BSI
$Source.y BSI
[1] "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood"
[10] "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood"
[19] "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood"
[28] "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood"
[37] "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood"
[46] "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood" "Blood"
[55] "Blood" "Blood" "Blood"
#DASSIM = DASSIM %>% select(starts_with("ybt"))
#BSI= BSI %>% select(starts_with("ybt"))
<- DASSIM %>% select(matches("^ybt|^irp|^fyu"))
DASSIM <- BSI %>% select(matches("^ybt|^irp|^fyu"))
BSI
<- DASSIM %>% summarize(across(where(is.numeric), mean) * 100)
summarized_DASSIM <- BSI %>% summarize(across(where(is.numeric), mean) * 100)
summarized_BSI
$study = "DASSIM"
summarized_DASSIM$study = "BSI"
summarized_BSI
# Reshape and summarize data
= rbind(summarized_DASSIM, summarized_BSI)
summarized_pc
# Bar plot
library(ggplot2)
ggplot(summarized_pc, aes(x = study, y = ybtA, fill = study)) +
geom_bar(stat = "identity") +
labs(x = "Category", y = "Total Value", title = "Bar Plot by Protein Category") +
theme_minimal()
3.5: Join to DASSIM metadata
= read_csv("../data/DASSIM_metadata.csv")
DASSIM_meta
library(dplyr)
library(stringr)
= meta_join %>%
meta_join_DASSIM left_join(DASSIM_meta, by = c('accession'))
3.6: Klebsiella pneumoniae species complex (KpSc)
# For KpSC
unique(meta_join_DASSIM$Species)
[1] "E. coli" NA "KpSC"
= meta_join_DASSIM %>% filter(Species == "KpSC")
meta_join_KpSc_DASSIM
unique(meta_join$study)
[1] "DASSIM" NA "bacteraemia"
= meta_join_KpSc_DASSIM %>% filter(study == "DASSIM")
DASSIM = DASSIM %>% filter(condition == "sepsis")
DASSIM_sepsis = DASSIM %>% filter(condition == "inpatient")
DASSIM_inpatient = DASSIM %>% filter(condition == "community")
DASSIM_community = meta_join_KpSc_DASSIM %>% filter(study == "bacteraemia")
BSI = BSI %>% filter(Source.y == "Blood")
BSI
#DASSIM_sepsis = DASSIM_sepsis %>% select(starts_with("ybt"))
#DASSIM_inpatient = DASSIM_inpatient %>% select(starts_with("ybt"))
#DASSIM_community = DASSIM_community %>% select(starts_with("ybt"))
#BSI = BSI %>% select(starts_with("ybt"))
# Select columns that start with "ybt", "adr", or "ppy"
= DASSIM_sepsis %>% select(matches("^ybt|^irp|^fyu"))
DASSIM_sepsis = DASSIM_inpatient %>% select(matches("^ybt|^irp|^fyu"))
DASSIM_inpatient = DASSIM_community %>% select(matches("^ybt|^irp|^fyu"))
DASSIM_community = BSI %>% select(matches("^ybt|^irp|^fyu"))
BSI
nrow(DASSIM_sepsis) # 138
[1] 138
nrow(DASSIM_inpatient) # 33
[1] 33
nrow(DASSIM_community) # 16
[1] 16
nrow(BSI) # 57
[1] 57
# summarized_DASSIM_sepsis = DASSIM_sepsis %>% summarize(across(where(is.numeric), mean) * 100)
= DASSIM_sepsis %>% summarize(across(where(is.numeric), ~ sum(. == 1) / n() * 100))
summarized_DASSIM_sepsis = DASSIM_inpatient %>% summarize(across(where(is.numeric), ~ sum(. == 1) / n() * 100))
summarized_DASSIM_inpatient = DASSIM_community %>% summarize(across(where(is.numeric), ~ sum(. == 1) / n() * 100))
summarized_DASSIM_community <- BSI %>% summarize(across(where(is.numeric), ~ sum(. == 1) / n() * 100))
summarized_BSI
$study = "DASSIM_sepsis"
summarized_DASSIM_sepsis$study = "DASSIM_inpatient"
summarized_DASSIM_inpatient$study = "DASSIM_community"
summarized_DASSIM_community$study = "BSI"
summarized_BSI
# Reshape and summarize data
= rbind(summarized_DASSIM_sepsis, summarized_DASSIM_inpatient, summarized_DASSIM_community, summarized_BSI)
summarized_pc
# Bar plot
library(ggplot2)
ggplot(summarized_pc, aes(x = study, y = ybtA, fill = study)) +
geom_bar(stat = "identity") +
labs(x = "Category", y = "Total Value", title = "Bar Plot by Protein Category") +
theme_minimal()
# Convert to long ##
= summarized_pc %>% pivot_longer(!study, names_to = "gene", values_to = "count")
summarized_long_pc
= summarized_long_pc %>% group_by(study) %>% summarize(meant_count = mean(count))
summarized_pc_for_paper_KpSc
$order <- factor(summarized_long_pc$study, levels = sort(unique(summarized_long_pc$study)))
summarized_long_pc
<- c('BSI', 'DASSIM_inpatient', 'DASSIM_sepsis', "DASSIM_community")
level_order
= ggplot(summarized_long_pc, aes(x = factor(study, level = level_order), y = count, fill = study)) +
KPSC_plot geom_bar(stat = "identity") +
ylim(0, 100) +
labs(x = "Study", y = "%", title = "K. pneumoniae") +
theme_minimal() + facet_wrap(~ gene)
KPSC_plot
3.7: E. coli
# For Ecoli
unique(meta_join_DASSIM$Species)
[1] "E. coli" NA "KpSC"
= meta_join_DASSIM %>% filter(Species == "E. coli")
meta_join_Ecoli_DASSIM
unique(meta_join$study)
[1] "DASSIM" NA "bacteraemia"
= meta_join_Ecoli_DASSIM %>% filter(study == "DASSIM")
DASSIM = DASSIM %>% filter(condition == "sepsis")
DASSIM_sepsis = DASSIM %>% filter(condition == "inpatient")
DASSIM_inpatient = DASSIM %>% filter(condition == "community")
DASSIM_community = meta_join_Ecoli_DASSIM %>% filter(study == "bacteraemia")
BSI = BSI %>% filter(Source.x == "Blood")
BSI
#DASSIM_sepsis = DASSIM_sepsis %>% select(starts_with("ybt"))
#DASSIM_inpatient = DASSIM_inpatient %>% select(starts_with("ybt"))
#DASSIM_community = DASSIM_community %>% select(starts_with("ybt"))
#BSI = BSI %>% select(starts_with("ybt"))
= DASSIM_sepsis %>% select(matches("^ybt|^irp|^fyu"))
DASSIM_sepsis = DASSIM_inpatient %>% select(matches("^ybt|^irp|^fyu"))
DASSIM_inpatient = DASSIM_community %>% select(matches("^ybt|^irp|^fyu"))
DASSIM_community = BSI %>% select(matches("^ybt|^irp|^fyu"))
BSI
nrow(DASSIM_sepsis) # 334
[1] 334
nrow(DASSIM_inpatient) # 84
[1] 84
nrow(DASSIM_community) # 31
[1] 31
nrow(BSI) # 23
[1] 23
# summarized_DASSIM_sepsis = DASSIM_sepsis %>% summarize(across(where(is.numeric), mean) * 100)
= DASSIM_sepsis %>% summarize(across(where(is.numeric), ~ sum(. == 1) / n() * 100))
summarized_DASSIM_sepsis = DASSIM_inpatient %>% summarize(across(where(is.numeric), ~ sum(. == 1) / n() * 100))
summarized_DASSIM_inpatient = DASSIM_community %>% summarize(across(where(is.numeric), ~ sum(. == 1) / n() * 100))
summarized_DASSIM_community <- BSI %>% summarize(across(where(is.numeric), ~ sum(. == 1) / n() * 100))
summarized_BSI
$study = "DASSIM_sepsis"
summarized_DASSIM_sepsis$study = "DASSIM_inpatient"
summarized_DASSIM_inpatient$study = "DASSIM_community"
summarized_DASSIM_community$study = "BSI"
summarized_BSI
# Reshape and summarize data
= rbind(summarized_DASSIM_sepsis, summarized_DASSIM_inpatient, summarized_DASSIM_community, summarized_BSI)
summarized_pc
# Bar plot
library(ggplot2)
ggplot(summarized_pc, aes(x = study, y = ybtA, fill = study)) +
geom_bar(stat = "identity") +
labs(x = "Category", y = "Total Value", title = "Bar Plot by Protein Category") +
theme_minimal()
# Convert to long ##
= summarized_pc %>% pivot_longer(!study, names_to = "gene", values_to = "count")
summarized_long_pc
= summarized_long_pc %>% group_by(study) %>% summarize(meant_count = mean(count))
summarized_pc_for_paper_Ec
$order <- factor(summarized_long_pc$study, levels = sort(unique(summarized_long_pc$study)))
summarized_long_pc
<- c("DASSIM_community", 'DASSIM_inpatient', 'DASSIM_sepsis', 'BSI')
level_order
= ggplot(summarized_long_pc, aes(x = factor(study, level = level_order), y = count, fill = study)) +
Ecoli_plot geom_bar(stat = "identity") +
ylim(0, 100) +
labs(x = "Study", y = "%", title = "E. coli") +
theme_minimal() + facet_wrap(~ gene)
Ecoli_plot
3.8: E. coli and KpSc plotted together
library(ggpubr)
ggarrange(KPSC_plot, Ecoli_plot, ncol = 2, nrow = 1)