18 Mouse coronal workflow

18.1 Overview

This workflow analyzes a mouse coronal brain section dataset from the 10x Genomics Visium platform. This dataset was generated by 10x Genomics, and the raw data files are publicly available from the 10x Genomics website.

18.2 Description of dataset

This dataset measures transcriptome-wide gene expression on a Visium slide spanning one hemisphere of a mouse coronal brain section. For experimental details, see the 10x Genomics website.

Due to the small size of the mouse brain and the dimensions of the Visium slide (6.5mm x 6.5mm), the measurements span an entire brain hemisphere. Therefore, we can use this dataset to compare gene expression profiles between major anatomical regions of the mouse brain. Due to the small size of cells in the mouse brain, each spot can contain up to 50 cells. In this dataset, we do not know the exact number of cells per spot.

18.3 Load data

The dataset is available in SpatialExperiment format from the STexampleData package.

library(SpatialExperiment)
library(STexampleData)

# load object
spe <- Visium_mouseCoronal()
spe

class: SpatialExperiment 
dim: 32285 4992 
metadata(0):
assays(1): counts
rownames(32285): ENSMUSG00000051951 ENSMUSG00000089699 ...
  ENSMUSG00000095019 ENSMUSG00000095041
rowData names(3): gene_id gene_name feature_type
colnames(4992): AAACAACGAATAGTTC-1 AAACAAGTATCTCCCA-1 ...
  TTGTTTGTATTACACG-1 TTGTTTGTGTAAATTC-1
colData names(5): barcode_id sample_id in_tissue array_row array_col
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
spatialCoords names(2) : pxl_col_in_fullres pxl_row_in_fullres
imgData names(4): sample_id image_id data scaleFactor

18.4 Plot data

As an initial check, plot the spatial coordinates (spots) in x-y dimensions on the tissue slide. This confirms that the object has loaded correctly, and the orientation matches the 10x Genomics website.

We use visualization functions from the ggspavis package to generate plots.

library(ggspavis)

# plot spatial coordinates (spots)
plotSpots(spe)

18.5 Quality control (QC)

Subset object to keep only spots over tissue.

# subset to keep only spots over tissue
spe <- spe[, colData(spe)$in_tissue == 1]
dim(spe)

[1] 32285  2702

Calculate spot-level QC metrics using the scater package (McCarthy et al. 2017), and store the QC metrics in colData.

library(scater)

# identify mitochondrial genes
is_mito <- grepl("(^MT-)|(^mt-)", rowData(spe)$gene_name)
table(is_mito)

is_mito
FALSE  TRUE 
32272    13

rowData(spe)$gene_name[is_mito]

 [1] "mt-Nd1"  "mt-Nd2"  "mt-Co1"  "mt-Co2"  "mt-Atp8" "mt-Atp6" "mt-Co3" 
 [8] "mt-Nd3"  "mt-Nd4l" "mt-Nd4"  "mt-Nd5"  "mt-Nd6"  "mt-Cytb"

# calculate per-spot QC metrics and store in colData
spe <- addPerCellQC(spe, subsets = list(mito = is_mito))
head(colData(spe), 3)

DataFrame with 3 rows and 11 columns
                           barcode_id   sample_id in_tissue array_row array_col
                          <character> <character> <integer> <integer> <integer>
AAACAAGTATCTCCCA-1 AAACAAGTATCTCCCA-1    sample01         1        50       102
AAACAATCTACTAGCA-1 AAACAATCTACTAGCA-1    sample01         1         3        43
AAACACCAATAACTGC-1 AAACACCAATAACTGC-1    sample01         1        59        19
                         sum  detected subsets_mito_sum subsets_mito_detected
                   <numeric> <numeric>        <numeric>             <numeric>
AAACAAGTATCTCCCA-1     20935      5230             4036                    13
AAACAATCTACTAGCA-1     14789      3646             3419                    13
AAACACCAATAACTGC-1     34646      6272             5068                    13
                   subsets_mito_percent     total
                              <numeric> <numeric>
AAACAAGTATCTCCCA-1              19.2787     20935
AAACAATCTACTAGCA-1              23.1185     14789
AAACACCAATAACTGC-1              14.6280     34646

Select filtering thresholds for the QC metrics by examining distributions using histograms.

# histograms of QC metrics
par(mfrow = c(1, 3))
hist(colData(spe)$sum, xlab = "sum", main = "UMIs per spot")
hist(colData(spe)$detected, xlab = "detected", main = "Genes per spot")
hist(colData(spe)$subsets_mito_percent, xlab = "percent mitochondrial", main = "Percent mito UMIs")

par(mfrow = c(1, 1))

# select QC thresholds
qc_lib_size <- colData(spe)$sum < 5000
qc_detected <- colData(spe)$detected < 1000
qc_mito <- colData(spe)$subsets_mito_percent > 30

# number of discarded spots for each QC metric
apply(cbind(qc_lib_size, qc_detected, qc_mito), 2, sum)

qc_lib_size qc_detected     qc_mito 
          9           4          11

# combined set of discarded spots
discard <- qc_lib_size | qc_detected | qc_mito
table(discard)

discard
FALSE  TRUE 
 2683    19

# store in object
colData(spe)$discard <- discard

Plot discarded spots in x-y coordinates on the tissue slide to check if there is any biologically meaningful spatial pattern. This would be problematic, since it would mean we are removing biologically informative spots.

# check spatial pattern of discarded spots
plotSpotQC(spe, plot_type = "spot", annotate = "discard")

There is one small region with some concentrated discarded spots at the top-left. However, this does not appear to correspond to any specific known anatomical region of interest. We assume that these are low-quality spots, and filtering them out will not cause problems in the biological interpretation.

We filter out the low-quality spots from the object.

# filter low-quality spots
spe <- spe[, !colData(spe)$discard]
dim(spe)

[1] 32285  2683

18.6 Normalization

Next, we calculate log-transformed normalized counts (logcounts) with the library size factors methodology, using methods from scater (McCarthy et al. 2017) and scran (Lun, McCarthy, and Marioni 2016).

library(scran)

# calculate library size factors
spe <- computeLibraryFactors(spe)

summary(sizeFactors(spe))

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.1615  0.6597  0.9083  1.0000  1.2342  4.8973

hist(log10(sizeFactors(spe)), xlab = "log10 (size factors)", main = "Size factors")

# calculate logcounts and store in object
spe <- logNormCounts(spe)

assayNames(spe)

[1] "counts"    "logcounts"

18.7 Feature selection

Identify a set of top highly variable genes (HVGs), which will be used to define cell types. We use methods from scran (Lun, McCarthy, and Marioni 2016), and first filter out mitochondrial genes (since these are very highly expressed and not of biological interest here).

# remove mitochondrial genes
spe <- spe[!is_mito, ]
dim(spe)

[1] 32272  2683

# fit mean-variance relationship
dec <- modelGeneVar(spe)

# visualize mean-variance relationship
fit <- metadata(dec)
plot(fit$mean, fit$var, 
     xlab = "mean of log-expression", ylab = "variance of log-expression")
curve(fit$trend(x), col = "dodgerblue", add = TRUE, lwd = 2)

# select top HVGs
top_hvgs <- getTopHVGs(dec, prop = 0.1)
length(top_hvgs)

[1] 1274

Note there are a few extremely highly expressed genes, which influence the fitted mean-variance relationship. We check the names of these genes to decide whether they should be removed as outliers.

# identify outlier genes
rev(sort(fit$mean))[1:3]

ENSMUSG00000115783 ENSMUSG00000098178 ENSMUSG00000024661 
         10.934946           9.847532           7.313407

outlier_ids <- names(rev(sort(fit$mean))[1:3])

rowData(spe)[outlier_ids, ]

DataFrame with 3 rows and 3 columns
                              gene_id   gene_name    feature_type
                          <character> <character>     <character>
ENSMUSG00000115783 ENSMUSG00000115783         Bc1 Gene Expression
ENSMUSG00000098178 ENSMUSG00000098178     Gm42418 Gene Expression
ENSMUSG00000024661 ENSMUSG00000024661        Fth1 Gene Expression

These appear to be biologically meaningful genes, so we leave them in.

18.8 Spatially-aware feature selection

Alternatively, run nnSVG (Weber et al. 2023) to identify a set of top spatially variable genes (SVGs) instead of HVGs.

Here, we run nnSVG using a small subset of the dataset for faster runtime. We select a subset of the data by subsampling on the set of spots and including stringent filtering for low-expressed genes. For a full analysis, we recommend running nnSVG on all spots and using alternative filtering parameters (for Visium data from mouse brain tissue), which takes around 45 minutes for one Visium slide on a standard laptop using multiple cores.

library(nnSVG)

# subsample spots
n <- 100
set.seed(123)
ix <- sample(seq_len(n), n)

spe_nnSVG <- spe[, ix]

# filter low-expressed and mitochondrial genes
# using very stringent filtering parameters for faster runtime in this example
# note: for a full analysis, use alternative filtering parameters (e.g. defaults)
spe_nnSVG <- filter_genes(
  spe_nnSVG, filter_genes_ncounts = 50, filter_genes_pcspots = 5
)

Gene filtering: removing mitochondrial genes

removed 0 mitochondrial genes

Gene filtering: retaining genes with at least 50 counts in at least 5% (n = 5) of spatial locations

removed 32074 out of 32272 genes due to low expression

# re-calculate logcounts after filtering
# using library size factors
spe_nnSVG <- logNormCounts(spe_nnSVG)

# run nnSVG
# using a single core for compatibility on build system
# note: for a full analysis, use multiple cores
set.seed(123)
spe_nnSVG <- nnSVG(spe_nnSVG, n_threads = 1)

# investigate results

# show results
head(rowData(spe_nnSVG), 3)

DataFrame with 3 rows and 17 columns
                              gene_id   gene_name    feature_type  sigma.sq
                          <character> <character>     <character> <numeric>
ENSMUSG00000061518 ENSMUSG00000061518       Cox5b Gene Expression 0.0210665
ENSMUSG00000073702 ENSMUSG00000073702       Rpl31 Gene Expression 0.0688871
ENSMUSG00000046330 ENSMUSG00000046330      Rpl37a Gene Expression 0.0549986
                      tau.sq       phi    loglik   runtime      mean       var
                   <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000061518 0.1202984  13.53194  -43.6472     0.024   4.98425  0.142977
ENSMUSG00000073702 0.0860842   5.84110  -40.3003     0.015   4.83432  0.143616
ENSMUSG00000046330 0.1133114   6.47579  -48.2811     0.018   5.35620  0.166734
                       spcov   prop_sv loglik_lm   LR_stat      rank      pval
                   <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000061518 0.0291204  0.149022  -44.1377  0.981135       184 0.6122789
ENSMUSG00000073702 0.0542917  0.444515  -44.3607  8.120691       147 0.0172431
ENSMUSG00000046330 0.0437844  0.326770  -51.8235  7.084636       151 0.0289462
                        padj
                   <numeric>
ENSMUSG00000061518 0.6588653
ENSMUSG00000073702 0.0232254
ENSMUSG00000046330 0.0379559

# number of significant SVGs
table(rowData(spe_nnSVG)$padj <= 0.05)


FALSE  TRUE 
   46   152

# show results for top n SVGs
rowData(spe_nnSVG)[order(rowData(spe_nnSVG)$rank)[1:6], ]

DataFrame with 6 rows and 17 columns
                              gene_id   gene_name    feature_type  sigma.sq
                          <character> <character>     <character> <numeric>
ENSMUSG00000046447 ENSMUSG00000046447     Camk2n1 Gene Expression   1.88342
ENSMUSG00000053310 ENSMUSG00000053310        Nrgn Gene Expression   3.17506
ENSMUSG00000018593 ENSMUSG00000018593       Sparc Gene Expression   2.04163
ENSMUSG00000032532 ENSMUSG00000032532         Cck Gene Expression   2.98145
ENSMUSG00000070570 ENSMUSG00000070570     Slc17a7 Gene Expression   2.64081
ENSMUSG00000024617 ENSMUSG00000024617      Camk2a Gene Expression   1.41249
                        tau.sq       phi    loglik   runtime      mean
                     <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000046447 6.28372e-02   2.05206   -99.021     0.021   5.35338
ENSMUSG00000053310 2.47772e-01   3.26420  -151.225     0.021   4.85871
ENSMUSG00000018593 7.63186e-02   3.68722  -124.793     0.022   3.61331
ENSMUSG00000032532 2.98145e-08   5.07869  -147.464     0.018   4.19282
ENSMUSG00000070570 3.20107e-01   4.15601  -155.620     0.017   3.62933
ENSMUSG00000024617 9.79965e-02   3.74573  -113.269     0.025   4.11652
                         var     spcov   prop_sv loglik_lm   LR_stat      rank
                   <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000046447   1.50820  0.256357  0.967714  -161.937  125.8321         1
ENSMUSG00000053310   3.71912  0.366737  0.927612  -207.066  111.6804         2
ENSMUSG00000018593   1.87664  0.395442  0.963966  -172.866   96.1453         3
ENSMUSG00000032532   2.72400  0.411820  1.000000  -191.496   88.0644         4
ENSMUSG00000070570   3.00604  0.447757  0.891889  -196.423   81.6050         5
ENSMUSG00000024617   1.17870  0.288711  0.935123  -149.612   72.6864         6
                          pval        padj
                     <numeric>   <numeric>
ENSMUSG00000046447 0.00000e+00 0.00000e+00
ENSMUSG00000053310 0.00000e+00 0.00000e+00
ENSMUSG00000018593 0.00000e+00 0.00000e+00
ENSMUSG00000032532 0.00000e+00 0.00000e+00
ENSMUSG00000070570 0.00000e+00 0.00000e+00
ENSMUSG00000024617 1.11022e-16 3.66374e-15

# identify top-ranked SVG
rowData(spe_nnSVG)$gene_name[which(rowData(spe_nnSVG)$rank == 1)]

[1] "Camk2n1"

18.9 Dimensionality reduction

Run principal component analysis (PCA) on the set of top HVGs using scater (McCarthy et al. 2017), and retain the top 50 principal components (PCs) for downstream analyses. Also run UMAP on the top 50 PCs, and retain the top 2 UMAP components for visualization purposes.

# compute PCA
set.seed(123)
spe <- runPCA(spe, subset_row = top_hvgs)

reducedDimNames(spe)

[1] "PCA"

dim(reducedDim(spe, "PCA"))

[1] 2683   50

# compute UMAP on top 50 PCs
set.seed(123)
spe <- runUMAP(spe, dimred = "PCA")

reducedDimNames(spe)

[1] "PCA"  "UMAP"

dim(reducedDim(spe, "UMAP"))

[1] 2683    2

# update column names for easier plotting
colnames(reducedDim(spe, "UMAP")) <- paste0("UMAP", 1:2)

18.10 Clustering

Perform clustering to define cell types. We apply graph-based clustering using the Walktrap method implemented in scran (Lun, McCarthy, and Marioni 2016), applied to the top 50 PCs calculated on the set of top HVGs.

# graph-based clustering
set.seed(123)
k <- 10
g <- buildSNNGraph(spe, k = k, use.dimred = "PCA")
g_walk <- igraph::cluster_walktrap(g)
clus <- g_walk$membership
table(clus)

clus
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
218 255 176 179  60 127 240 217  83  77 271 100 241 125  57 160  23  22  21  31

# store cluster labels in column 'label' in colData
colLabels(spe) <- factor(clus)

Visualize the clusters by plotting in (i) spatial (x-y) coordinates on the tissue slide, and (ii) UMAP dimensions.

# define custom color palette
colors <- unname(palette.colors(palette = "Polychrome 36"))

# plot clusters in spatial x-y coordinates
plotSpots(spe, annotate = "label", 
          pal = colors)

# plot clusters in UMAP dimensions
plotDimRed(spe, plot_type = "UMAP", 
           annotate = "label", pal = colors)

18.11 Differential expression

Identify marker genes by testing for differential gene expression between clusters, using the binomial test implemented in findMarkers in scran (Lun, McCarthy, and Marioni 2016).

# set gene names as row names for easier plotting
rownames(spe) <- rowData(spe)$gene_name

# test for marker genes
markers <- findMarkers(spe, test = "binom", direction = "up")

# returns a list with one DataFrame per cluster
markers

List of length 20
names(20): 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20

library(pheatmap)

# plot log-fold changes for one cluster over all other clusters
# selecting cluster 5
interesting <- markers[[5]]
best_set <- interesting[interesting$Top <= 5, ]
logFCs <- getMarkerEffects(best_set)

pheatmap(logFCs, breaks = seq(-5, 5, length.out = 101))

# plot log-transformed normalized expression of top genes for one cluster
top_genes <- head(rownames(interesting))

plotExpression(spe, x = "label", features = top_genes)