Dot plot

Category: other

thumbnail for this post

More context (and code) for this plot can be found in my scRNA-seq workflow in the chapter “Expression of individual genes”.

library(tidyverse)
library(Seurat)

# load a single cell expression data set (generated in the lab I work at)
seurat <- readRDS('seurat.rds')

# cells will be grouped by clusters that they have been assigned to
cluster_ids <- levels(seurat@meta.data$seurat_clusters)

# select a set of genes for which we want to show expression
genes_to_show <- seurat@misc$marker_genes$by_cluster %>%
  group_by(cluster) %>%
  arrange(p_val_adj) %>%
  slice(1) %>%
  pull(gene)

# for every cluster-gene combination, calculate the average expression across
# all cells and then transform the data into a data frame
expression_levels_per_cluster <- vapply(
    cluster_ids, FUN.VALUE = numeric(length(cluster_ids)), function(x) {
      cells_in_current_cluster <- which(seurat@meta.data$seurat_cluster == x)
      Matrix::rowMeans(seurat@assays$SCT@data[genes_to_show,cells_in_current_cluster])
    }
  ) %>%
  t() %>%
  as.data.frame() %>%
  mutate(cluster = rownames(.)) %>%
  select(cluster, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  rename(expression = value) %>%
  mutate(id_to_merge = paste0(cluster, '_', gene))

# for every cluster-gene combination, calculate the percentage of cells in the
# respective group that has at least 1 transcript (this means we consider it
# as expressing the gene) and then transform the data into a data frame
percentage_of_cells_expressing_gene <- vapply(
    cluster_ids, FUN.VALUE = numeric(length(cluster_ids)), function(x) {
      cells_in_current_cluster <- which(seurat@meta.data$seurat_cluster == x)
      Matrix::rowSums(seurat@assays$SCT@data[genes_to_show,cells_in_current_cluster] != 0)
    }
  ) %>%
  t() %>%
  as.data.frame() %>%
  mutate(cluster = rownames(.)) %>%
  select(cluster, everything()) %>%
  pivot_longer(
    cols = c(2:ncol(.)),
    names_to = 'gene'
  ) %>%
  rename(cell_count = value) %>%
  left_join(
    .,
    seurat@meta.data %>%
      group_by(seurat_clusters) %>%
      tally() %>%
      rename(cluster = seurat_clusters),
    by = 'cluster') %>%
  mutate(
    id_to_merge = paste0(cluster, '_', gene),
    percent_cells = cell_count / n
  )

# merge the two data frames created before and plot the data
p <- left_join(
    expression_levels_per_cluster,
    percentage_of_cells_expressing_gene %>% select(id_to_merge, percent_cells),
    by = 'id_to_merge'
  ) %>%
  mutate(cluster = factor(cluster, levels = rev(cluster_ids))) %>%
  ggplot(aes(gene, cluster)) +
  geom_point(aes(color = expression, size = percent_cells)) +
  scale_color_distiller(
    palette = 'Reds',
    direction = 1,
    name = 'Log-normalised\nexpression',
    guide = guide_colorbar(frame.colour = "black", ticks.colour = "black")
  ) +
  scale_size(name = 'Percent\nof cells', labels = scales::percent) +
  labs(y = 'Cluster', color = 'Expression') +
  coord_fixed() +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
ggsave('4.png', p, height = 7, width = 8)