# LOAD DATA
library(SpatialExperiment)
library(STexampleData)
<- Visium_humanDLPFC()
spe
# QUALITY CONTROL (QC)
library(scater)
# subset to keep only spots over tissue
<- spe[, colData(spe)$in_tissue == 1]
spe # identify mitochondrial genes
<- grepl("(^MT-)|(^mt-)", rowData(spe)$gene_name)
is_mito # calculate per-spot QC metrics
<- addPerCellQC(spe, subsets = list(mito = is_mito))
spe # select QC thresholds
<- colData(spe)$sum < 600
qc_lib_size <- colData(spe)$detected < 400
qc_detected <- colData(spe)$subsets_mito_percent > 28
qc_mito <- colData(spe)$cell_count > 10
qc_cell_count # combined set of discarded spots
<- qc_lib_size | qc_detected | qc_mito | qc_cell_count
discard colData(spe)$discard <- discard
# filter low-quality spots
<- spe[, !colData(spe)$discard]
spe
# NORMALIZATION
library(scran)
# calculate logcounts using library size factors
<- logNormCounts(spe)
spe
# FEATURE SELECTION
# remove mitochondrial genes
<- spe[!is_mito, ]
spe # fit mean-variance relationship
<- modelGeneVar(spe)
dec # select top HVGs
<- getTopHVGs(dec, prop = 0.1)
top_hvgs
# DIMENSIONALITY REDUCTION
# compute PCA
set.seed(123)
<- runPCA(spe, subset_row = top_hvgs)
spe # compute UMAP on top 50 PCs
set.seed(123)
<- runUMAP(spe, dimred = "PCA")
spe # update column names
colnames(reducedDim(spe, "UMAP")) <- paste0("UMAP", 1:2)
# CLUSTERING
# graph-based clustering
set.seed(123)
<- 10
k <- buildSNNGraph(spe, k = k, use.dimred = "PCA")
g <- igraph::cluster_walktrap(g)
g_walk <- g_walk$membership
clus colLabels(spe) <- factor(clus)
8 Marker genes
8.1 Background
In this chapter, we identify marker genes for each cluster by testing for differential expression between clusters.
8.2 Previous steps
Code to run steps from the previous chapters to generate the SpatialExperiment
object required for this chapter.
8.3 Marker genes
Identify marker genes by testing for differential gene expression between clusters.
We use the findMarkers
implementation in scran
(Lun, McCarthy, and Marioni 2016), using a binomial test, which tests for genes that differ in the proportion expressed vs. not expressed between clusters. This is a more stringent test than the default t-tests, and tends to select genes that are easier to interpret and validate experimentally.
library(scran)
library(scater)
library(pheatmap)
# set gene names as row names for easier plotting
rownames(spe) <- rowData(spe)$gene_name
# test for marker genes
<- findMarkers(spe, test = "binom", direction = "up")
markers
# returns a list with one DataFrame per cluster
markers
List of length 7
names(7): 1 2 3 4 5 6 7
# plot log-fold changes for one cluster over all other clusters
# selecting cluster 1
<- markers[[1]]
interesting <- interesting[interesting$Top <= 5, ]
best_set <- getMarkerEffects(best_set)
logFCs
pheatmap(logFCs, breaks = seq(-5, 5, length.out = 101))
# plot log-transformed normalized expression of top genes for one cluster
<- head(rownames(interesting))
top_genes
plotExpression(spe, x = "label", features = top_genes)