# LOAD DATA
library(SpatialExperiment)
library(STexampleData)
<- Visium_humanDLPFC()
spe
# QUALITY CONTROL (QC)
library(scater)
# subset to keep only spots over tissue
<- spe[, colData(spe)$in_tissue == 1]
spe # identify mitochondrial genes
<- grepl("(^MT-)|(^mt-)", rowData(spe)$gene_name)
is_mito # calculate per-spot QC metrics
<- addPerCellQC(spe, subsets = list(mito = is_mito))
spe # select QC thresholds
<- colData(spe)$sum < 600
qc_lib_size <- colData(spe)$detected < 400
qc_detected <- colData(spe)$subsets_mito_percent > 28
qc_mito <- colData(spe)$cell_count > 10
qc_cell_count # combined set of discarded spots
<- qc_lib_size | qc_detected | qc_mito | qc_cell_count
discard colData(spe)$discard <- discard
# filter low-quality spots
<- spe[, !colData(spe)$discard]
spe
# NORMALIZATION
library(scran)
# calculate logcounts using library size factors
<- logNormCounts(spe)
spe
# FEATURE SELECTION
# remove mitochondrial genes
<- spe[!is_mito, ]
spe # fit mean-variance relationship
<- modelGeneVar(spe)
dec # select top HVGs
<- getTopHVGs(dec, prop = 0.1) top_hvgs
6 Dimensionality reduction
6.1 Background
In this chapter, we apply dimensionality reduction methods to visualize the data and to generate inputs for further downstream analyses.
6.2 Previous steps
Code to run steps from the previous chapters to generate the SpatialExperiment
object required for this chapter.
6.3 Principal component analysis (PCA)
Apply principal component analysis (PCA) to the set of top highly variable genes (HVGs) to reduce the dimensionality of the dataset, and retain the top 50 principal components (PCs) for further downstream analyses.
This is done for two reasons: (i) to reduce noise due to random variation in expression of biologically uninteresting genes, which are assumed to have expression patterns that are independent of each other, and (ii) to improve computational efficiency during downstream analyses.
We use the computationally efficient implementation of PCA provided in the scater
package (McCarthy et al. 2017). This implementation uses randomization, and therefore requires setting a random seed for reproducibility.
# compute PCA
set.seed(123)
<- runPCA(spe, subset_row = top_hvgs)
spe
reducedDimNames(spe)
[1] "PCA"
dim(reducedDim(spe, "PCA"))
[1] 3524 50
6.4 Uniform Manifold Approximation and Projection (UMAP)
We also run UMAP (McInnes, Healy, and Melville 2018) on the set of top 50 PCs and retain the top 2 UMAP components, which will be used for visualization purposes.
# compute UMAP on top 50 PCs
set.seed(123)
<- runUMAP(spe, dimred = "PCA")
spe
reducedDimNames(spe)
[1] "PCA" "UMAP"
dim(reducedDim(spe, "UMAP"))
[1] 3524 2
# update column names for easier plotting
colnames(reducedDim(spe, "UMAP")) <- paste0("UMAP", 1:2)
6.5 Visualizations
Generate plots using plotting functions from the ggspavis package. In the next chapter on clustering, we will add cluster labels to these reduced dimension plots.
library(ggspavis)
# plot top 2 PCA dimensions
plotDimRed(spe, type = "PCA")
# plot top 2 UMAP dimensions
plotDimRed(spe, type = "UMAP")