FAS=FSFC1386_hybracter.fasta
NAME=FSFC1386
FOLDER=prokka_annotated
GENUS='Escherichia'
TAG='Gene'
prokka $FAS --outdir $FOLDER --prefix ${NAME}_anno --locustag $TAG --genus $GENUS --addgenes
3: Variant calling heatmaps with breseq
Introduction
This tutorial will show how to visualise the output from variant calling with breseq as heatmaps in R.
Part 1 - Genome download and variant calling with breseq in bash
1.1: Download reads and run assemblies
1.2: Annotate genome assemblies with prokka
In order to get an informative output from breseq it is best to annotate the genome, this can be done with prokka.
1.3: Check files
You should now have the following files which we can use for the tutorial, use ls
to check you have these files for this tutorial.
FSFC1386_hybracter.gbk
FSBL1386_long.fastq
1.4: Run Breseq on genbank files
Make sure you have breseq downloaded see the breseq Github page for more information.
mkdir breseq/FS2071
REF=prokka_annotated/FSFC1386.gbk
LR=long_reads/FSBL1386_long.fastq
OUT=breseq/FS1386
breseq -r $REF -x $LR -o $OUT
After all the breseq annotations are done we can compile them together into one table.
Part 2 - Plotting variant calling heatmaps in R
2.1: Import the data
Import the data from breseq - this is the compiled table of all breseq outputs from all our highly related pairs.
# Load necessary libraries
library(dplyr)
library(stringr)
library(readxl)
library(pheatmap)
library(dplyr)
library(tidyr)
library(tibble)
library(RColorBrewer)
library(ggplot2)
# Load the data (adjust the path to your file location)
= "../data/paired_blood_faecal_breseq_comparisons_for_R.xlsx"
file_path = read_excel(file_path) data
2.2: Wrangle dataset
# Extract relevant columns
= data %>% select(isolate = `isolate`,
class_df description = `description`,
annotation = `annotation`,
class = `class`,
COG_class = `COG_class`)
# Remove rows where isolate is "FSFC2155_v_FSBL2155"
= class_df %>% filter(isolate != "FSFC2155_v_FSBL2155")
class_df
# Drop rows with missing values in the isolate or description columns
= class_df %>% filter(!is.na(isolate) & !is.na(description))
class_df
# Check for "intergenic" in the annotation column and create a new column
= class_df %>%
class_df mutate(context = ifelse(grepl("intergenic", annotation, ignore.case = TRUE), "Intergenic", "Coding sequence"))
# Add a new column for mutation type
= class_df %>%
class_df mutate(mutation_type = case_when(
str_detect(annotation, "^intergenic") ~ "intergenic", # If it starts with "intergenic", label as intergenic
str_detect(annotation, "^coding") ~ "coding_indel", # If it starts with "coding", label as coding
str_detect(annotation, "^[A-Z](\\d+)[A-Z*]") ~ ifelse(
str_sub(annotation, 1, 1) == str_sub(annotation, nchar(str_extract(annotation, "^[A-Z](\\d+)[A-Z*]")), nchar(str_extract(annotation, "^[A-Z](\\d+)[A-Z*]"))),
"synonymous",
"non-synonymous"
),TRUE ~ NA_character_
))
# Print result
head(class_df)
# A tibble: 6 × 7
isolate description annotation class COG_class context mutation_type
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 FSFC1386_v_FSBL1… UPF0380 pr… intergeni… Phag… K. Trans… Interg… intergenic
2 FSFC2071_v_FSBL2… Alpha‑gluc… D122Y (GA… Suga… G. Carbo… Coding… non-synonymo…
3 FSFC2071_v_FSBL2… Glutamate … P326P (CC… Amin… C. Energ… Coding… synonymous
4 FSFC2071_v_FSBL2… Exported z… R238C (CG… Chap… O. Postt… Coding… non-synonymo…
5 FSFC2071_v_FSBL2… Phage holi… L10L (TTA… Phag… M. Cell … Coding… synonymous
6 FSFC2071_v_FSBL2… Phage holi… W56* (TGG… Phag… M. Cell … Coding… non-synonymo…
head(class_df$mutation_type)
[1] "intergenic" "non-synonymous" "synonymous" "non-synonymous"
[5] "synonymous" "non-synonymous"
# Create the pivot table
= table(class_df$isolate, class_df$description)
class_pivot = as.data.frame.matrix(class_pivot)
class_pivot
# Create annotation data frame for intergenic and class information
= class_df %>%
annotation_df select(description, context, class, COG_class, mutation_type) %>%
distinct()
Next we will select the mutations based on synonymous/non-synonymous
# synonymous/non-synonymous mutation selection
unique(annotation_df$mutation_type)
[1] "intergenic" "non-synonymous" "synonymous" "coding_indel"
= annotation_df %>% filter(mutation_type == "synonymous")
annotation_df_synon = annotation_df %>% filter(mutation_type == "non-synonymous")
annotation_df_non_synon = annotation_df %>% filter(mutation_type == "intergenic")
annotation_df_intergenic = annotation_df %>% filter(mutation_type == "coding_indel") annotation_df_coding_indel
2.3: Plotting variants as heatmaps with pheatmap
2.3.1 Plotting synonymous mutations
= intersect(colnames(class_pivot), annotation_df_synon$description)
genes_synon = class_pivot %>% select(all_of(genes_synon))
class_pivot_synon
= annotation_df_synon %>%
annotation_df_synon column_to_rownames("description")
# Ensure the annotation rows are in the same order as the matrix rows
= annotation_df_synon[colnames(class_pivot), , drop = FALSE]
annotation_df_synon
# remove context as redudnant
= annotation_df_synon %>% select(-context, -mutation_type)
annotation_df_synon
# Define annotation colors
= list(
annotation_colors context = c("Intergenic" = "red", "Coding sequence" = "blue"),
class = setNames(colorRampPalette(brewer.pal(9, "Set1"))(length(unique(annotation_df_synon$class))), unique(annotation_df_synon$class))
)
# Yellow gradient
= colorRampPalette(c("#FFFACD", "#FFE680", "#FFD700", "#DAA520", "#8B7500"))(100)
yellow_colors
# Transpose the pivot table
= t(class_pivot_synon)
class_pivot_synon_t
= pheatmap(class_pivot_synon_t,
synon_p cluster_rows = FALSE,
cluster_cols = FALSE,
display_numbers = FALSE,
main = "synonymous mutations",
fontsize_number = 8,
color = yellow_colors,
border_color = "black",
annotation_row = annotation_df_synon,
annotation_colors = annotation_colors)
synon_p
2.3.2 Plotting non synonymous mutations
= intersect(colnames(class_pivot), annotation_df_non_synon$description)
genes_non_synon = class_pivot %>% select(all_of(genes_non_synon))
class_pivot_non_synon
= annotation_df_non_synon %>%
annotation_df_non_synon column_to_rownames("description")
# Ensure the annotation rows are in the same order as the matrix rows
= annotation_df_non_synon[colnames(class_pivot), , drop = FALSE]
annotation_df_non_synon
# remove context as redudnant
= annotation_df_non_synon %>% select(-context, -mutation_type)
annotation_df_non_synon
# Define annotation colors
= list(
annotation_colors context = c("Intergenic" = "red", "Coding sequence" = "blue"),
class = setNames(colorRampPalette(brewer.pal(9, "Set1"))(length(unique(annotation_df_non_synon$class))), unique(annotation_df_non_synon$class))
)
# Red gradient
= colorRampPalette(c("#FFEBEB", "#FFB3B3", "#FF8080", "#E63946", "#990000"))(100)
red_colors = brewer.pal(9, "YlOrRd")
non_synon_cols = brewer.pal(9, "YlOrRd")
cols
# Transpose the pivot table
= t(class_pivot_non_synon)
class_pivot_non_synon_t
= pheatmap(class_pivot_non_synon_t,
non_synon_p cluster_rows = FALSE,
cluster_cols = FALSE,
display_numbers = FALSE,
main = "non synonymous mutations",
fontsize_number = 8,
color = red_colors,
border_color = "black",
annotation_row = annotation_df_non_synon,
annotation_colors = annotation_colors)
non_synon_p
2.3.3 Plotting coding INDELs
= intersect(colnames(class_pivot), annotation_df_coding_indel$description)
genes_coding_indel = class_pivot %>% select(all_of(genes_coding_indel))
class_pivot_coding_indel
= annotation_df_coding_indel %>%
annotation_df_coding_indel column_to_rownames("description")
# Ensure the annotation rows are in the same order as the matrix rows
= annotation_df_coding_indel[colnames(class_pivot), , drop = FALSE]
annotation_df_coding_indel
# remove context as redudnant
= annotation_df_coding_indel %>% select(-context, -mutation_type)
annotation_df_coding_indel
# Define annotation colors
= list(
annotation_colors context = c("coding_indel" = "red", "Coding sequence" = "blue"),
class = setNames(colorRampPalette(brewer.pal(9, "Set1"))(length(unique(annotation_df_coding_indel$class))), unique(annotation_df_coding_indel$class))
)
= brewer.pal(9, "YlOrRd")
coding_indel_cols
# Green gradient
= colorRampPalette(c("#E6F9E6", "#B3E6B3", "#80CC80", "#3CB371", "#006400"))(100)
green_colors
= brewer.pal(9, "YlOrRd")
coding_indel_cols = brewer.pal(9, "YlOrRd")
cols
# Transpose the pivot table
= t(class_pivot_coding_indel)
class_pivot_coding_indel_t
= pheatmap(class_pivot_coding_indel_t,
coding_indel_p cluster_rows = FALSE,
cluster_cols = FALSE,
display_numbers = FALSE,
main = "coding_indel mutations",
fontsize_number = 8,
color = green_colors,
border_color = "black",
annotation_row = annotation_df_coding_indel,
annotation_colors = annotation_colors)
coding_indel_p
2.3.3 Plotting intergenic mutations
= intersect(colnames(class_pivot), annotation_df_intergenic$description)
genes_intergenic = class_pivot %>% select(all_of(genes_intergenic))
class_pivot_intergenic
= annotation_df_intergenic %>%
annotation_df_intergenic column_to_rownames("description")
# Ensure the annotation rows are in the same order as the matrix rows
= annotation_df_intergenic[colnames(class_pivot), , drop = FALSE]
annotation_df_intergenic
# remove context as redudnant
= annotation_df_intergenic %>% select(-context, -mutation_type)
annotation_df_intergenic
# Define annotation colors
= list(
annotation_colors context = c("Intergenic" = "red", "Coding sequence" = "blue"),
class = setNames(colorRampPalette(brewer.pal(9, "Set1"))(length(unique(annotation_df_intergenic$class))), unique(annotation_df_intergenic$class))
)
= brewer.pal(9, "YlOrRd")
intergenic_cols
# Blue gradient
= colorRampPalette(c("#E6F7FF", "#B3DAF1", "#80BFFF", "#4682B4", "#003366"))(100)
blue_colors
= brewer.pal(9, "YlOrRd")
intergenic_cols = brewer.pal(9, "YlOrRd")
cols
# Transpose the pivot table
= t(class_pivot_intergenic)
class_pivot_intergenic_t
= pheatmap(class_pivot_intergenic_t,
intergenic_p cluster_rows = FALSE,
cluster_cols = FALSE,
display_numbers = FALSE,
main = "intergenic mutations",
fontsize_number = 8,
color = blue_colors,
border_color = "black",
annotation_row = annotation_df_intergenic,
annotation_colors = annotation_colors)
intergenic_p