Day 02B - PBMC Integration

Author

Saket Choudhary for QSCB 2025

Published

December 18, 2025

1 Setup

Code
setRepositories(ind = 1:3)
required.packages <- c(
  "Seurat", "BiocManager",
  "harmony", "glmGamPoi", "devtools"
)
InstallMissing <- function(packages) {
  for (pkg in packages) {
    if (!require(pkg, character.only = TRUE)) {
      install.packages(pkg, dependencies = TRUE)
      library(pkg, character.only = TRUE)
    }
  }
}

BiocManager::install("batchelor")
'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cloud.r-project.org
    BioCsoft: https://bioconductor.org/packages/3.22/bioc
    BioCann: https://bioconductor.org/packages/3.22/data/annotation
Bioconductor version 3.22 (BiocManager 1.30.27), R 4.5.2 (2025-10-31)
Warning: package(s) not installed when version(s) same as or greater than current; use
  `force = TRUE` to re-install: 'batchelor'
Installation paths not writeable, unable to update packages
  path: /usr/lib/R/site-library
  packages:
    cvar, DESeq2, edgeR, emmeans, GenomeInfoDb, GenomicRanges, ggeffects,
    ggsci, graph, HSMMSingleCell, leidenbase, mclogit, R.devices, reformulas,
    SparseArray, statnet.common, TMB, zoo
  path: /usr/local/lib/R/site-library
  packages:
    clusterProfiler, UCSC.utils, wk, xfun
Old packages: 'BiocFileCache', 'biomaRt', 'epiR', 'GenomicDataCommons',
  'RcppArmadillo', 'selectr', 'TCGAbiolinks', 'TCGAbiolinksGUI.data', 'yaml',
  'yulab.utils'
Code
library(Seurat)
Loading required package: SeuratObject
Loading required package: sp

Attaching package: 'SeuratObject'
The following objects are masked from 'package:base':

    intersect, t
Code
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
Code
devtools::install_github("satijalab/seurat-wrappers")
Downloading GitHub repo satijalab/seurat-wrappers@HEAD
RcppArmad... (15.2.2-1 -> 15.2.3-1) [CRAN]
xfun         (0.54     -> 0.55    ) [CRAN]
yaml         (2.3.11   -> 2.3.12  ) [CRAN]
zoo          (1.8-14   -> 1.8-15  ) [CRAN]
Installing 4 packages: RcppArmadillo, xfun, yaml, zoo
Installing packages into '/home/saket/R/x86_64-pc-linux-gnu-library/4.5'
(as 'lib' is unspecified)
── R CMD build ─────────────────────────────────────────────────────────────────
* checking for file ‘/tmp/Rtmp7Jc9W7/remotes3b227548d0b204/satijalab-seurat-wrappers-a1eb0d8/DESCRIPTION’ ... OK
* preparing ‘SeuratWrappers’:
* checking DESCRIPTION meta-information ... OK
* checking for LF line-endings in source and make files and shell scripts
* checking for empty or unneeded directories
Omitted ‘LazyData’ from DESCRIPTION
* building ‘SeuratWrappers_0.4.0.tar.gz’
Installing package into '/home/saket/R/x86_64-pc-linux-gnu-library/4.5'
(as 'lib' is unspecified)
Code
library(SeuratWrappers)

2 Load data

Code
pbmc3k <- readRDS("/NAS/qscb2025/Guilliams_2022_livercellatlas/data/pbmc_integration/pbmc3k_annotated.rds")
pbmc3k$dataset <- "pbmc3k"
pbmc4k <- readRDS("/NAS/qscb2025/Guilliams_2022_livercellatlas/data/pbmc_integration/pbmc4k.rds")
pbmc4k$dataset <- "pbmc4k"

pbmc4k <- NormalizeData(pbmc4k) %>%
  FindVariableFeatures() %>%
  ScaleData() %>%
  RunPCA(npcs = 30) %>%
  FindNeighbors(dims = 1:20) %>%
  RunUMAP(dims = 1:20)
Normalizing layer: counts
Finding variable features for layer counts
Centering and scaling data matrix
PC_ 1 
Positive:  TRAC, LTB, RPS18, IL32, EEF1A1, TRBC2, CD69, CD7, CD3G, IL7R 
       CD27, RPS2, IFITM1, CD2, TRBC1, LEF1, CCR7, CD247, FLT3LG, NOSIP 
       TRAT1, GZMM, CTSW, MAL, BCL11B, PIM2, CD8B, RORA, CAMK4, SYNE2 
Negative:  CST3, LYZ, CSTA, LST1, MNDA, TYROBP, FCN1, FTL, AIF1, CTSS 
       TYMP, S100A9, FCER1G, LGALS1, RP11-1143G9.4, S100A8, FTH1, LGALS2, SERPINA1, FGL2 
       PSAP, SPI1, S100A11, GPX1, CFD, VCAN, CD68, S100A12, AP1S2, MS4A6A 
PC_ 2 
Positive:  CD79A, MS4A1, IGHM, CD79B, IGHD, LINC00926, TCL1A, BANK1, IGKC, CD74 
       HLA-DQB1, CD22, HLA-DRA, HLA-DPA1, FCER2, TNFRSF13C, HLA-DPB1, VPREB3, FAM129C, HLA-DRB1 
       HLA-DMA, HLA-DQA1, MEF2C, RALGPS2, HVCN1, SPIB, FCRLA, TSPAN13, HLA-DMB, HLA-DOB 
Negative:  IL32, CTSW, TMSB4X, NKG7, GZMA, CST7, CCL5, CD7, PRF1, GZMM 
       S100A4, TRAC, CD247, IFITM1, ANXA1, KLRD1, HOPX, KLRB1, SRGN, ITGB2 
       PFN1, FGFBP2, CD2, LYAR, KLRG1, GNLY, MATK, CCL4, ID2, CMC1 
PC_ 3 
Positive:  GZMB, NKG7, CLIC3, CST7, PRF1, GZMA, KLRD1, FGFBP2, KLRF1, HOPX 
       GNLY, C12orf75, SPON2, CTSW, RHOC, CCL5, CCL4, CD160, GZMH, XCL2 
       MATK, IL2RB, CMC1, TTC38, FCGR3A, APOBEC3G, CLIC1, APMAP, TRDC, S1PR5 
Negative:  RPLP1, LEF1, RPS18, CCR7, MAL, RPS2, EEF1A1, RPS24, S100A12, VCAN 
       ACTN1, S100A8, TRABD2A, TRAC, RGCC, S100A9, NOSIP, JUNB, LTB, IL7R 
       RP11-1143G9.4, FHIT, SOCS3, FLT3LG, CD14, RGS10, CAMK4, LRRN3, MNDA, FCN1 
PC_ 4 
Positive:  MS4A1, FGFBP2, CD79B, IGHD, NKG7, CD79A, KLRD1, PRF1, CST7, KLRF1 
       HOPX, GNLY, GZMA, LINC00926, SPON2, CCL4, CD160, FCER2, FCGR3A, XCL2 
       CD22, BANK1, TNFRSF13C, TTC38, IGHM, VPREB3, IL2RB, MATK, S1PR5, MT-CO3 
Negative:  LILRA4, LRRC26, CLEC4C, SERPINF1, TPM2, SCT, PTCRA, DNASE1L3, TNFRSF21, PLD4 
       GAS6, SMPD3, IL3RA, LINC00996, PPP1R14B, RP11-73G16.2, DERL3, PTPRS, ITM2C, PACSIN1 
       SMIM5, IRF7, MYBL2, UGCG, LILRB4, LAMP5, MAP1A, CIB2, KCNK17, ASIP 
PC_ 5 
Positive:  S100A12, VCAN, S100A8, RP11-1143G9.4, MS4A6A, S100A9, CD14, CD36, GZMB, CXCL8 
       RBP7, RNASE6, QPCT, MGST1, NCF1, ITGAM, CLEC4C, LRRC26, CYP1B1, CLIC3 
       ALOX5AP, CCL3, CLEC4E, GPX1, LINC00996, LILRA4, DERL3, CD93, KCTD12, SCT 
Negative:  CDKN1C, HES4, FCGR3A, SIGLEC10, CSF1R, BATF3, CKB, TCF7L2, CTSL, RP11-1008C21.1 
       MS4A7, CTD-2006K23.1, MS4A4A, FAM110A, ICAM4, LRRC25, HMOX1, C1QA, IFITM3, ZNF703 
       HCK, RRAS, RHOC, OAS1, MTSS1, NEURL1, TUBA1B, VMO1, YBX1, TNFRSF8 
Computing nearest neighbor graph
Computing SNN
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
15:19:14 UMAP embedding parameters a = 0.9922 b = 1.112
15:19:14 Read 4340 rows and found 20 numeric columns
15:19:14 Using Annoy for neighbor search, n_neighbors = 30
15:19:14 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:19:14 Writing NN index file to temp file /tmp/Rtmp7Jc9W7/file3b2275177ae91b
15:19:14 Searching Annoy index using 1 thread, search_k = 3000
15:19:15 Annoy recall = 100%
15:19:15 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
15:19:16 Initializing from normalized Laplacian + noise (using RSpectra)
15:19:16 Commencing optimization for 500 epochs, with 180242 positive edges
15:19:16 Using rng type: pcg
15:19:21 Optimization finished

3 Merge the two datasets

Code
merged <- merge(pbmc3k,
  y = pbmc4k,
  add.cell.ids = c("pbmc3k", "pbmc4k")
)

4 Perform standard workflow for visualization and clustering

Code
merged <- JoinLayers(object = merged)

library(sparseMatrixStats)
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: 'matrixStats'
The following object is masked from 'package:dplyr':

    count

Attaching package: 'MatrixGenerics'
The following objects are masked from 'package:matrixStats':

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
    rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
    rowWeightedSds, rowWeightedVars
Code
df <- data.frame(
  mean = rowMeans(merged@assays$RNA@layers$counts),
  variance = rowVars(merged@assays$RNA@layers$counts)
)
merged <- NormalizeData(merged) %>%
  FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>%
  ScaleData() %>%
  RunPCA(npcs = 30) %>%
  FindNeighbors(dims = 1:20) %>%
  RunUMAP(dims = 1:20)
Normalizing layer: counts
Finding variable features for layer counts
Centering and scaling data matrix
Warning: Different features in new layer data than already exists for
scale.data
PC_ 1 
Positive:  LTB, RPS27, IL32, MALAT1, IL7R, TRAC, RPS29, CD2, EEF1A1, STK17A 
       TRBC2, CD247, CTSW, PTPRCAP, TRBC1, MAL, CCL5, CD8B, GIMAP7, CD8A 
       AQP3, RORA, CST7, ITM2A, GZMA, TRABD2A, FKBP11, DUSP2, OPTN, KLRG1 
Negative:  CST3, LYZ, LST1, TYROBP, AIF1, FCN1, FTL, S100A9, TYMP, FCER1G 
       CTSS, LGALS1, S100A8, MNDA, LGALS2, FTH1, SERPINA1, CFD, SPI1, CD68 
       PSAP, S100A11, CSTA, GRN, AP1S2, CFP, GPX1, SAT1, IFI30, FGL2 
PC_ 2 
Positive:  NKG7, TMSB4X, CTSW, GZMA, CST7, PRF1, S100A4, GIMAP7, IL32, CCL5 
       ANXA1, FGFBP2, CD247, GNLY, IFITM2, HOPX, SRGN, FCGR3A, GZMH, GIMAP4 
       CCL4, KLRD1, TMEM66, SPON2, GAPDH, ID2, GZMB, UQCR11.1, MT2A, XCL2 
Negative:  CD79A, MS4A1, IGHM, HLA-DQB1, IGHD, CD79B, TCL1A, IGKC, LINC00926, HLA-DQA1 
       HLA-DRA, BANK1, CD74, MEF2C, CD22, TNFRSF13C, VPREB3, HLA-DRB1, FCER2, FAM129C 
       HLA-DMB, HLA-DMA, HLA-DPB1, HLA-DPA1, RALGPS2, FCRLA, SPIB, TSPAN13, RP11-231C14.7, HVCN1 
PC_ 3 
Positive:  TMEM66, RPSAP58, UQCR11.1, PTPRCAP, HLA-DRB5, NHP2L1, NDUFB8.1, AMICA1, TMBIM4.1, IFI6 
       IFI44L, RP11-290F20.3, FAIM3, C1orf86, MRP63, FTH1, PSMA2.1, C19orf10, HLA-DQA2, LSMD1 
       ISG15, C1orf63, NAPRT1, TREX1, GIMAP5, MIR24-2, NDNL2, APOBEC3A, PPAPDC1B, FTL 
Negative:  RPS17, RPL41, RPL39, MT-ND3, HLA-B, H3F3A, RPS29, MALAT1, HMGN2, RPL34 
       MT-ATP6, TRAC, KLRD1, KLRB1, TRBC2, GZMA, NKG7, SUB1, MYDGF, TRDC 
       RPS24, CST7, MT-CO3, KLRF1, CMC1, MT-CO2, TRBC1, TRGC2, CTSW, JAML 
PC_ 4 
Positive:  GZMB, FCGR3A, NKG7, FGFBP2, HLA-DPA1, PRF1, CD74, HLA-DPB1, CLIC3, CST7 
       RHOC, CD79B, GNLY, SPON2, GZMA, HLA-DRB5, GZMH, PLAC8, CD79A, HOPX 
       MS4A1, KLRD1, APOBEC3G, CCL4, HLA-DQA1, KLRF1, HLA-DRB1, XCL2, TTC38, C12orf75 
Negative:  RPL34, RPL39, TRAC, S100A12, RPS27, RPS29, MT-ND3, EEF1A1, RPL41, RPS17 
       VCAN, S100A8, RPS24, CSTA, RP11-1143G9.4, MAL, CXCL8, IL7R, TRBC2, S100A9 
       MNDA, MALAT1, RBP7, TRABD2A, MGST1, JAML, KCTD12, JUND, NAMPT, MT-ATP6 
PC_ 5 
Positive:  PTCRA, LILRA4, CLEC4C, LRRC26, SERPINF1, DNASE1L3, TPM2, SCT, GAS6, PPP1R14B 
       IL3RA, SMPD3, TNFRSF21, SMIM5, PLD4, DERL3, PTPRS, LINC00996, ITM2C, JCHAIN 
       RP11-73G16.2, LAMP5, PACSIN1, MYBL2, PF4, APP, MAP1A, UGCG, SDPR, PPBP 
Negative:  NKG7, FGFBP2, KLRD1, GNLY, CST7, PRF1, GZMA, MS4A1, CD79B, CCL4 
       CD79A, IGHD, SPON2, KLRF1, LINC00926, MT-CO1, HOPX, XCL2, FCER2, CD160 
       TTC38, GZMH, TNFRSF13C, FCGR3A, MT-CO3, MATK, CD22, IL2RB, VPREB3, CTSW 
Computing nearest neighbor graph
Computing SNN
15:19:39 UMAP embedding parameters a = 0.9922 b = 1.112
15:19:39 Read 6978 rows and found 20 numeric columns
15:19:39 Using Annoy for neighbor search, n_neighbors = 30
15:19:39 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:19:40 Writing NN index file to temp file /tmp/Rtmp7Jc9W7/file3b22751c9272f5
15:19:40 Searching Annoy index using 1 thread, search_k = 3000
15:19:42 Annoy recall = 100%
15:19:42 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
15:19:43 Initializing from normalized Laplacian + noise (using RSpectra)
15:19:43 Commencing optimization for 500 epochs, with 291788 positive edges
15:19:43 Using rng type: pcg
15:19:50 Optimization finished
Code
df <- data.frame(
  mean = rowMeans(merged@assays$RNA@layers$data),
  variance = rowVars(merged@assays$RNA@layers$data)
)

5 Visualize the merged dataset

do the datasets merge into each other well?

Code
DimPlot(merged, reduction = "umap", group.by = "dataset")

Code
merged <- FindClusters(merged, resolution = 0.3)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 6978
Number of edges: 284691

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9529
Number of communities: 13
Elapsed time: 0 seconds
Code
merged$merged_cluster <- Idents(merged)
Code
DimPlot(merged, reduction = "umap", group.by = "merged_cluster")

6 visualise the clusters

Code
DimPlot(merged, reduction = "umap", group.by = "merged_cluster", label = TRUE)

Code
DimPlot(merged, reduction = "umap", group.by = c(
  "dataset",
  "merged_cluster"
), label = TRUE)

7 Let’s integrate

Code
merged <- JoinLayers(merged)
merged[["RNA"]] <- split(merged[["RNA"]], f = merged$dataset)
Splitting 'counts', 'data' layers. Not splitting 'scale.data'. If you would like to split other layers, set in `layers` argument.
Code
merged <- NormalizeData(merged) %>%
  FindVariableFeatures() %>%
  ScaleData() %>%
  RunPCA()
Normalizing layer: counts.pbmc3k
Normalizing layer: counts.pbmc4k
Finding variable features for layer counts.pbmc3k
Finding variable features for layer counts.pbmc4k
Centering and scaling data matrix
Warning: Different features in new layer data than already exists for
scale.data
PC_ 1 
Positive:  CST3, LYZ, LST1, TYROBP, AIF1, FCN1, FTL, S100A9, TYMP, FCER1G 
       CTSS, LGALS1, MNDA, S100A8, SERPINA1, LGALS2, CFD, FTH1, SPI1, CSTA 
       CD68, PSAP, S100A11, GRN, AP1S2, GPX1, CFP, IFI30, FGL2, SAT1 
Negative:  LTB, IL32, CD7, IL7R, ACAP1, TRAC, CD2, CD27, STK17A, TRBC2 
       CCR7, CD247, LEF1, CTSW, GZMM, TRAT1, LDLRAP1, TRBC1, MAL, CD8B 
       CCL5, AQP3, JUN, CD8A, ITM2A, RORA, CST7, GZMA, PIM2, TRABD2A 
PC_ 2 
Positive:  NKG7, GZMA, CTSW, CST7, IL32, TMSB4X, PRF1, CCL5, S100A4, GZMM 
       CD247, ANXA1, CD7, FGFBP2, KLRD1, GNLY, PFN1, HOPX, SRGN, CCL4 
       GZMH, ID2, GIMAP4, MATK, SPON2, LYAR, KLRG1, KLRB1, CD2, GAPDH 
Negative:  CD79A, MS4A1, CD79B, TCL1A, IGHM, LINC00926, IGHD, HLA-DQB1, BANK1, CD74 
       HLA-DRA, HLA-DQA1, IGKC, VPREB3, CD22, FCER2, HLA-DRB1, HLA-DPA1, HLA-DPB1, TNFRSF13C 
       MEF2C, HLA-DMA, FAM129C, HLA-DMB, FCRLA, HVCN1, TSPAN13, HLA-DOB, SPIB, RALGPS2 
PC_ 3 
Positive:  GZMB, NKG7, CST7, PRF1, GZMA, CLIC3, FGFBP2, GNLY, KLRD1, KLRF1 
       SPON2, HOPX, CTSW, CCL4, C12orf75, GZMH, XCL2, CCL5, CD160, RHOC 
       TTC38, MATK, FCGR3A, IL2RB, PLEK, S1PR5, APMAP, CMC1, APOBEC3G, PLAC8 
Negative:  LEF1, CCR7, MAL, IL7R, JUNB, S100A8, VIM, FOS, SOCS3, S100A12 
       TRABD2A, LDLRAP1, LTB, S100A9, FHIT, CD14, FCN1, FYB, VCAN, RBP7 
       LGALS2, LYZ, AIF1, TRAT1, FTL, CD27, NGFRAP1, NELL2, AQP3, TSHZ2 
PC_ 4 
Positive:  JAML, S100A12, CSTA, RP11-1143G9.4, CH17-373J23.1, HMGN2, TRAC, CTD-3252C9.4, VCAN, CXCL8 
       SUB1, MT-CO3, KCTD12, MYDGF, CD36, TRBC2, RP11-386I14.4, LILRA4, DPYSL2, MNDA 
       NAIP, MGST1, MT-CO2, JUND, PPP1R14B, SERPINF1, TXN, OGFRL1, LRRC26, CLEC4C 
Negative:  HLA-DRB5, FCGR3A, IFITM2, ISG15, IFITM3, IFI6, APOBEC3A, HES4, CDKN1C, OAS1 
       FAM26F, RHOC, NAPRT1, IFI44L, CTSL, CKB, FTH1, ABI3, SIGLEC10, CD79B 
       HLA-DQA2, HLA-DPA1, LILRA5, IFIT3, MT2A, EPSTI1, HMOX1, APOBEC3B, C1orf63, MS4A7 
PC_ 5 
Positive:  PTCRA, LILRA4, CLEC4C, SERPINF1, LRRC26, PLD4, DNASE1L3, TPM2, SCT, IL3RA 
       GAS6, PPP1R14B, SMPD3, TNFRSF21, SMIM5, DERL3, ITM2C, PTPRS, LINC00996, IRF7 
       RP11-73G16.2, JCHAIN, LILRB4, MYBL2, PACSIN1, PF4, LAMP5, SDPR, MAP1A, TUBB1 
Negative:  S100A12, MT-CO3, KLRD1, NKG7, FGFBP2, VCAN, GNLY, CST7, GZMA, CCL4 
       PRF1, MT-CO1, KLRF1, SPON2, CXCL8, S100A8, RP11-1143G9.4, CSTA, HOPX, XCL2 
       CD160, MT-CO2, MATK, IGHD, TTC38, CTSW, TRDC, S1PR5, CCL3, GZMH 
Warning: Number of dimensions changing from 30 to 50
Code
integrated <- IntegrateLayers(
  object = merged,
  method = FastMNNIntegration,
  orig.reduction = "pca",
  new.reduction = "integrated.mnn",
  verbose = FALSE
)
Warning: Layer counts isn't present in the assay object; returning NULL
Code
integrated <- RunUMAP(integrated,
  reduction = "integrated.mnn", dims = 1:20,
  reduction.name = "umap.mnn"
)
15:20:39 UMAP embedding parameters a = 0.9922 b = 1.112
15:20:39 Read 6978 rows and found 20 numeric columns
15:20:39 Using Annoy for neighbor search, n_neighbors = 30
15:20:39 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:20:39 Writing NN index file to temp file /tmp/Rtmp7Jc9W7/file3b22756f605dd7
15:20:39 Searching Annoy index using 1 thread, search_k = 3000
15:20:42 Annoy recall = 100%
15:20:43 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
15:20:44 Initializing from normalized Laplacian + noise (using RSpectra)
15:20:44 Commencing optimization for 500 epochs, with 306752 positive edges
15:20:44 Using rng type: pcg
15:20:51 Optimization finished
Code
integrated <- IntegrateLayers(
  object = integrated, method = HarmonyIntegration,
  orig.reduction = "pca", new.reduction = "integrated.harmony",
  verbose = FALSE
)
The `features` argument is ignored by `HarmonyIntegration`.
This message is displayed once per session.
Code
integrated <- IntegrateLayers(
  object = integrated, method = CCAIntegration,
  orig.reduction = "pca", new.reduction = "integrated.cca",
  verbose = FALSE
)

integrated <- FindNeighbors(integrated, reduction = "integrated.cca", dims = 1:20)
Computing nearest neighbor graph
Computing SNN
Code
integrated <- FindClusters(integrated,
  resolution = 0.1,
  cluster.name = "cca_clusters"
)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 6978
Number of edges: 294646

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9687
Number of communities: 8
Elapsed time: 0 seconds
Code
integrated <- RunUMAP(integrated,
  reduction = "integrated.cca", dims = 1:20,
  reduction.name = "umap.cca"
)
15:21:36 UMAP embedding parameters a = 0.9922 b = 1.112
15:21:36 Read 6978 rows and found 20 numeric columns
15:21:36 Using Annoy for neighbor search, n_neighbors = 30
15:21:36 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:21:37 Writing NN index file to temp file /tmp/Rtmp7Jc9W7/file3b227558d40940
15:21:37 Searching Annoy index using 1 thread, search_k = 3000
15:21:39 Annoy recall = 100%
15:21:39 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
15:21:41 Initializing from normalized Laplacian + noise (using RSpectra)
15:21:41 Commencing optimization for 500 epochs, with 300314 positive edges
15:21:41 Using rng type: pcg
15:21:48 Optimization finished
Code
DimPlot(integrated,
  reduction = "umap.cca", group.by = c(
    "dataset",
    "merged_cluster", "cca_clusters"
  ),
  label = TRUE
)

Code
integrated <- FindNeighbors(integrated, reduction = "integrated.harmony", dims = 1:20)
Computing nearest neighbor graph
Computing SNN
Code
integrated <- FindClusters(integrated,
  resolution = 0.3,
  cluster.name = "harmony_clusters"
)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 6978
Number of edges: 274452

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9302
Number of communities: 11
Elapsed time: 0 seconds
Code
integrated <- RunUMAP(integrated,
  reduction = "integrated.harmony", dims = 1:20,
  reduction.name = "umap.harmony"
)
15:21:53 UMAP embedding parameters a = 0.9922 b = 1.112
15:21:53 Read 6978 rows and found 20 numeric columns
15:21:53 Using Annoy for neighbor search, n_neighbors = 30
15:21:53 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:21:53 Writing NN index file to temp file /tmp/Rtmp7Jc9W7/file3b2275625f4a62
15:21:53 Searching Annoy index using 1 thread, search_k = 3000
15:21:55 Annoy recall = 100%
15:21:56 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
15:21:58 Initializing from normalized Laplacian + noise (using RSpectra)
15:21:58 Commencing optimization for 500 epochs, with 296174 positive edges
15:21:58 Using rng type: pcg
15:22:05 Optimization finished
Code
DimPlot(integrated)

Code
DimPlot(integrated,
  reduction = "umap.harmony", group.by = c(
    "dataset",
    "harmony_clusters", "cca_clusters"
  ),
  label = TRUE
)

8 We can use the same idea to ‘transfer’ annotation from pbmc3k to pbmc4k

Code
pbmc3k <- NormalizeData(pbmc3k) %>%
  FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>%
  ScaleData() %>%
  RunPCA(npcs = 30) %>%
  FindNeighbors(dims = 1:20) %>%
  RunUMAP(dims = 1:20)
Centering and scaling data matrix
PC_ 1 
Positive:  CST3, TYROBP, LST1, AIF1, FTL, FTH1, LYZ, FCN1, S100A9, TYMP 
       FCER1G, CFD, LGALS1, S100A8, CTSS, LGALS2, SERPINA1, IFITM3, SPI1, CFP 
       PSAP, IFI30, SAT1, COTL1, S100A11, NPC2, GRN, LGALS3, GSTP1, PYCARD 
Negative:  MALAT1, LTB, IL32, IL7R, CD2, B2M, ACAP1, CD27, STK17A, CTSW 
       CD247, GIMAP5, AQP3, CCL5, SELL, TRAF3IP3, GZMA, MAL, CST7, ITM2A 
       MYC, GIMAP7, HOPX, BEX2, LDLRAP1, GZMK, ETS1, ZAP70, TNFAIP8, RIC3 
PC_ 2 
Positive:  CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1, HLA-DRA, LINC00926, CD79B, HLA-DRB1, CD74 
       HLA-DMA, HLA-DPB1, HLA-DQA2, CD37, HLA-DRB5, HLA-DMB, HLA-DPA1, FCRLA, HVCN1, LTB 
       BLNK, P2RX5, IGLL5, IRF8, SWAP70, ARHGAP24, FCGR2B, SMIM14, PPP1R14A, C16orf74 
Negative:  NKG7, PRF1, CST7, GZMB, GZMA, FGFBP2, CTSW, GNLY, B2M, SPON2 
       CCL4, GZMH, FCGR3A, CCL5, CD247, XCL2, CLIC3, AKR1C3, SRGN, HOPX 
       TTC38, APMAP, CTSC, S100A4, IGFBP7, ANXA1, ID2, IL32, XCL1, RHOC 
PC_ 3 
Positive:  HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPB1, HLA-DPA1, CD74, MS4A1, HLA-DRB1, HLA-DRA 
       HLA-DRB5, HLA-DQA2, TCL1A, LINC00926, HLA-DMB, HLA-DMA, CD37, HVCN1, FCRLA, IRF8 
       PLAC8, BLNK, MALAT1, SMIM14, PLD4, P2RX5, IGLL5, LAT2, SWAP70, FCGR2B 
Negative:  PPBP, PF4, SDPR, SPARC, GNG11, NRGN, GP9, RGS18, TUBB1, CLU 
       HIST1H2AC, AP001189.4, ITGA2B, CD9, TMEM40, PTCRA, CA2, ACRBP, MMD, TREML1 
       NGFRAP1, F13A1, SEPT5, RUFY1, TSC22D1, MPP1, CMTM5, RP11-367G6.3, MYL9, GP1BA 
PC_ 4 
Positive:  VIM, IL7R, S100A6, IL32, S100A8, S100A4, GIMAP7, S100A10, S100A9, MAL 
       AQP3, CD2, CD14, FYB, LGALS2, GIMAP4, ANXA1, CD27, FCN1, RBP7 
       LYZ, S100A11, GIMAP5, MS4A6A, S100A12, FOLR3, TRABD2A, AIF1, IL8, IFI6 
Negative:  HLA-DQA1, CD79B, CD79A, MS4A1, HLA-DQB1, CD74, HIST1H2AC, HLA-DPB1, PF4, SDPR 
       TCL1A, HLA-DRB1, HLA-DPA1, HLA-DQA2, PPBP, HLA-DRA, LINC00926, GNG11, SPARC, HLA-DRB5 
       GP9, AP001189.4, CA2, PTCRA, CD9, NRGN, RGS18, CLU, TUBB1, GZMB 
PC_ 5 
Positive:  LTB, IL7R, CKB, VIM, MS4A7, AQP3, CYTIP, RP11-290F20.3, SIGLEC10, HMOX1 
       LILRB2, PTGES3, MAL, CD27, HN1, CD2, GDI2, CORO1B, ANXA5, TUBA1B 
       FAM110A, ATP1A1, TRADD, PPA1, CCDC109B, ABRACL, CTD-2006K23.1, WARS, VMO1, FYB 
Negative:  GZMB, NKG7, S100A8, FGFBP2, GNLY, CCL4, CST7, PRF1, GZMA, SPON2 
       GZMH, S100A9, LGALS2, CCL3, CTSW, XCL2, CD14, CLIC3, S100A12, RBP7 
       CCL5, MS4A6A, GSTP1, FOLR3, IGFBP7, TYROBP, TTC38, AKR1C3, XCL1, HOPX 
Warning: Number of dimensions changing from 50 to 30
Computing nearest neighbor graph
Computing SNN
15:22:12 UMAP embedding parameters a = 0.9922 b = 1.112
15:22:12 Read 2638 rows and found 20 numeric columns
15:22:12 Using Annoy for neighbor search, n_neighbors = 30
15:22:12 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:22:12 Writing NN index file to temp file /tmp/Rtmp7Jc9W7/file3b2275694fad9b
15:22:12 Searching Annoy index using 1 thread, search_k = 3000
15:22:13 Annoy recall = 100%
15:22:14 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
15:22:15 Initializing from normalized Laplacian + noise (using RSpectra)
15:22:15 Commencing optimization for 500 epochs, with 109042 positive edges
15:22:15 Using rng type: pcg
15:22:18 Optimization finished
Code
pbmc4k <- NormalizeData(pbmc4k) %>%
  FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>%
  ScaleData() %>%
  RunPCA(npcs = 30) %>%
  FindNeighbors(dims = 1:20) %>%
  RunUMAP(dims = 1:20)
Normalizing layer: counts
Finding variable features for layer counts
Centering and scaling data matrix
PC_ 1 
Positive:  TRAC, LTB, RPS18, IL32, EEF1A1, TRBC2, CD69, CD7, CD3G, IL7R 
       CD27, RPS2, IFITM1, CD2, TRBC1, LEF1, CCR7, CD247, FLT3LG, NOSIP 
       TRAT1, GZMM, CTSW, MAL, BCL11B, PIM2, CD8B, RORA, CAMK4, SYNE2 
Negative:  CST3, LYZ, CSTA, LST1, MNDA, TYROBP, FCN1, FTL, AIF1, CTSS 
       TYMP, S100A9, FCER1G, LGALS1, RP11-1143G9.4, S100A8, FTH1, LGALS2, SERPINA1, FGL2 
       PSAP, SPI1, S100A11, GPX1, CFD, VCAN, CD68, S100A12, AP1S2, MS4A6A 
PC_ 2 
Positive:  CD79A, MS4A1, IGHM, CD79B, IGHD, LINC00926, TCL1A, BANK1, IGKC, CD74 
       HLA-DQB1, CD22, HLA-DRA, HLA-DPA1, FCER2, TNFRSF13C, HLA-DPB1, VPREB3, FAM129C, HLA-DRB1 
       HLA-DMA, HLA-DQA1, MEF2C, RALGPS2, HVCN1, SPIB, FCRLA, TSPAN13, HLA-DMB, HLA-DOB 
Negative:  IL32, CTSW, TMSB4X, NKG7, GZMA, CST7, CCL5, CD7, PRF1, GZMM 
       S100A4, TRAC, CD247, IFITM1, ANXA1, KLRD1, HOPX, KLRB1, SRGN, ITGB2 
       PFN1, FGFBP2, CD2, LYAR, KLRG1, GNLY, MATK, CCL4, ID2, CMC1 
PC_ 3 
Positive:  GZMB, NKG7, CLIC3, CST7, PRF1, GZMA, KLRD1, FGFBP2, KLRF1, HOPX 
       GNLY, C12orf75, SPON2, CTSW, RHOC, CCL5, CCL4, CD160, GZMH, XCL2 
       MATK, IL2RB, CMC1, TTC38, FCGR3A, APOBEC3G, CLIC1, APMAP, TRDC, S1PR5 
Negative:  RPLP1, LEF1, RPS18, CCR7, MAL, RPS2, EEF1A1, RPS24, S100A12, VCAN 
       ACTN1, S100A8, TRABD2A, TRAC, RGCC, S100A9, NOSIP, JUNB, LTB, IL7R 
       RP11-1143G9.4, FHIT, SOCS3, FLT3LG, CD14, RGS10, CAMK4, LRRN3, MNDA, FCN1 
PC_ 4 
Positive:  MS4A1, FGFBP2, CD79B, IGHD, NKG7, CD79A, KLRD1, PRF1, CST7, KLRF1 
       HOPX, GNLY, GZMA, LINC00926, SPON2, CCL4, CD160, FCER2, FCGR3A, XCL2 
       CD22, BANK1, TNFRSF13C, TTC38, IGHM, VPREB3, IL2RB, MATK, S1PR5, MT-CO3 
Negative:  LILRA4, LRRC26, CLEC4C, SERPINF1, TPM2, SCT, PTCRA, DNASE1L3, TNFRSF21, PLD4 
       GAS6, SMPD3, IL3RA, LINC00996, PPP1R14B, RP11-73G16.2, DERL3, PTPRS, ITM2C, PACSIN1 
       SMIM5, IRF7, MYBL2, UGCG, LILRB4, LAMP5, MAP1A, CIB2, KCNK17, ASIP 
PC_ 5 
Positive:  S100A12, VCAN, S100A8, RP11-1143G9.4, MS4A6A, S100A9, CD14, CD36, GZMB, CXCL8 
       RBP7, RNASE6, QPCT, MGST1, NCF1, ITGAM, CLEC4C, LRRC26, CYP1B1, CLIC3 
       ALOX5AP, CCL3, CLEC4E, GPX1, LINC00996, LILRA4, DERL3, CD93, KCTD12, SCT 
Negative:  CDKN1C, HES4, FCGR3A, SIGLEC10, CSF1R, BATF3, CKB, TCF7L2, CTSL, RP11-1008C21.1 
       MS4A7, CTD-2006K23.1, MS4A4A, FAM110A, ICAM4, LRRC25, HMOX1, C1QA, IFITM3, ZNF703 
       HCK, RRAS, RHOC, OAS1, MTSS1, NEURL1, TUBA1B, VMO1, YBX1, TNFRSF8 
Computing nearest neighbor graph
Computing SNN
15:22:24 UMAP embedding parameters a = 0.9922 b = 1.112
15:22:24 Read 4340 rows and found 20 numeric columns
15:22:24 Using Annoy for neighbor search, n_neighbors = 30
15:22:24 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:22:25 Writing NN index file to temp file /tmp/Rtmp7Jc9W7/file3b22753dc33440
15:22:25 Searching Annoy index using 1 thread, search_k = 3000
15:22:26 Annoy recall = 100%
15:22:27 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
15:22:28 Initializing from normalized Laplacian + noise (using RSpectra)
15:22:28 Commencing optimization for 500 epochs, with 180242 positive edges
15:22:28 Using rng type: pcg
15:22:33 Optimization finished
Code
anchors <- FindTransferAnchors(
  reference = pbmc3k,
  query = pbmc4k,
  dims = 1:30,
  reference.reduction = "pca"
)
Projecting cell embeddings
Finding neighborhoods
Finding anchors
    Found 4763 anchors
Code
predictions <- TransferData(
  anchorset = anchors, refdata = pbmc3k$seurat_annotations,
  dims = 1:30
)
Finding integration vectors
Finding integration vector weights
Predicting cell labels
Code
pbmc4k <- AddMetaData(pbmc4k, metadata = predictions)
Code
DimPlot(pbmc4k, group.by = "predicted.id")

9 uamp is only for visualisatio

always check using markers if the results make sense

Code
Idents(pbmc4k) <- pbmc4k$predicted.id
markers.B <- FindMarkers(object = pbmc4k, ident.1 = "B", only.pos = T, group.by = "predicted.id")