Skip to content

Commit 543a46f

Browse files
WT: Updated batch inference
1 parent a696373 commit 543a46f

6 files changed

Lines changed: 121 additions & 51 deletions

File tree

DESCRIPTION

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: SPEEDI
22
Title: Single-cell Pipeline for End to End Data Integration (SPEEDI)
3-
Version: 1.0.0.0000
3+
Version: 1.1.0.0000
44
Authors@R: c(
55
person("Yuan", "Wang", email = "yuanwang@princeton.edu", role = c("aut", "cre")),
66
person("William", "Thistlethwaite", email = "wat2@princeton.edu", role = "aut"))
@@ -15,6 +15,7 @@ Imports:
1515
BiocGenerics,
1616
BiocManager,
1717
Biostrings,
18+
bluster,
1819
chromVAR,
1920
ComplexHeatmap,
2021
DESeq2,

R/map_cell_type_from_reference.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ MajorityVote_RNA <- function(sc_obj, current_resolution = 2, log_flag = FALSE) {
177177
} else {
178178
print_SPEEDI("Neighbors exist. Skipping constructing neighborhood graph...", log_flag)
179179
}
180-
sc_obj <- find_clusters_SPEEDI(sc_obj = sc_obj, resolution = current_resolution, log_flag)
180+
sc_obj <- find_clusters_SPEEDI(sc_obj = sc_obj, resolution = current_resolution, method = "Leiden", log_flag = log_flag)
181181
sc_obj$predicted.id <- as.character(sc_obj$predicted.id)
182182
votes <- c()
183183
vote_levels_fix <- as.character(levels(sc_obj$seurat_clusters))

R/process_batches.R

Lines changed: 61 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#' @examples
88
#' \dontrun{sc_obj <- InferBatches(sc_obj)}
99
#' @export
10+
#' @importFrom foreach %dopar%
1011
InferBatches <- function(sc_obj, exit_with_code = FALSE, log_flag = FALSE) {
1112
exit_code <- -1
1213
sc_obj <- tryCatch(
@@ -27,23 +28,49 @@ InferBatches <- function(sc_obj, exit_with_code = FALSE, log_flag = FALSE) {
2728
print_SPEEDI("Neighbors exist. Skipping construction of neighborhood graph...", log_flag)
2829
}
2930
}
30-
sc_obj <- find_clusters_SPEEDI(sc_obj, resolution = 0.3, log_flag = log_flag)
31-
if (length(unique(sc_obj$seurat_clusters)) > 30) {
32-
sc_obj <- find_clusters_SPEEDI(sc_obj, resolution = 0.2, log_flag = log_flag)
33-
if (length(unique(sc_obj$seurat_clusters)) > 30) {
34-
sc_obj <- find_clusters_SPEEDI(sc_obj, resolution = 0.1, log_flag = log_flag)
35-
}
31+
32+
# Set up processing of different resolutions so it's parallel (max = 7 cores)
33+
if (Sys.getenv("SLURM_NTASKS_PER_NODE") == "") {
34+
n.cores <- as.numeric(parallel::detectCores())
35+
} else {
36+
n.cores <- as.numeric(Sys.getenv("SLURM_NTASKS_PER_NODE"))
37+
}
38+
39+
if (n.cores > 7) {
40+
n.cores <- 7
41+
}
42+
43+
print_SPEEDI(paste0("Number of cores: ", n.cores), log_flag)
44+
doParallel::registerDoParallel(n.cores)
45+
metrics_list <- foreach::foreach(
46+
i = c(0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4),
47+
.combine = 'c',
48+
.packages = c("Seurat", "bluster")
49+
) %dopar% {
50+
tmp <- find_clusters_SPEEDI(sc_obj = sc_obj, resolution = i, method = "Louvain", log_flag = log_flag)
51+
dims <- 1:30
52+
clusters <- tmp$seurat_clusters
53+
sil.out <- bluster::approxSilhouette(Seurat::Embeddings(tmp@reductions$pca)[, dims], clusters)
54+
sil.score <- mean(sil.out$width)
55+
names(sil.score) <- i
56+
return(sil.score)
3657
}
58+
59+
max.res <- as.numeric(as.character(names(which.max(metrics_list))))
60+
print_SPEEDI(paste0("Resolution = ", max.res), log_flag = log_flag)
61+
62+
tmp <- find_clusters_SPEEDI(sc_obj = sc_obj, resolution = max.res, method = "Louvain", log_flag = log_flag)
63+
3764
# Use LISI metric to guess batch labels
38-
X <- sc_obj@reductions$umap@cell.embeddings
39-
meta_data <- data.frame(sc_obj$sample)
65+
X <- tmp@reductions$umap@cell.embeddings
66+
meta_data <- data.frame(tmp$sample)
4067
colnames(meta_data) <- "batch"
41-
meta_data$cluster <- sc_obj$seurat_clusters
68+
meta_data$cluster <- tmp$seurat_clusters
4269
lisi.res <- data.frame(matrix(ncol = 4, nrow = 0))
4370
colnames(lisi.res) <- c("batch", "score", "cluster", "freq")
44-
clusters.interest <- names(table(sc_obj$seurat_clusters))[prop.table(table(sc_obj$seurat_clusters)) > 0.01]
45-
for (cluster in clusters.interest) { #levels(sc_obj$seurat_clusters)) {
46-
cells <- names(sc_obj$seurat_clusters[sc_obj$seurat_clusters == cluster])
71+
clusters.interest <- names(table(tmp$seurat_clusters))[prop.table(table(tmp$seurat_clusters)) > 0.01]
72+
for (cluster in clusters.interest) {
73+
cells <- names(tmp$seurat_clusters[tmp$seurat_clusters == cluster])
4774
X.sub <- X[which(rownames(X) %in% cells),]
4875
meta_data.sub <- meta_data[which(rownames(meta_data) %in% cells),]
4976
res <- lisi::compute_lisi(X.sub, meta_data.sub, label_colnames = "batch")
@@ -56,13 +83,19 @@ InferBatches <- function(sc_obj, exit_with_code = FALSE, log_flag = FALSE) {
5683
lisi.res <- rbind(lisi.res, agg.res)
5784
}
5885

86+
lisi.res <- lisi.res[lisi.res$freq > 10,]
87+
5988
p.values <- list()
6089
used.sample.dump <- c()
6190
batch.assign <- list()
6291
for ( i in clusters.interest) {
6392
lisi.res.sub <- lisi.res[lisi.res$cluster == i,]
64-
lisi.res.sub$batch_count <- as.numeric(as.character(table(sc_obj$sample)[lisi.res.sub$batch]))
65-
lisi.res.sub$scaled.score <- (lisi.res.sub$score / max(lisi.res.sub$score)) * (lisi.res.sub$freq / lisi.res.sub$batch_count)
93+
lisi.res.sub$batch_count <- as.numeric(as.character(table(tmp$sample)[lisi.res.sub$batch]))
94+
95+
lisi.res.sub$scaled.score <- mapply(
96+
geometric.mean,
97+
(max(lisi.res.sub$score) / lisi.res.sub$score),
98+
(lisi.res.sub$freq / lisi.res.sub$batch_count))
6699

67100
if (max(lisi.res.sub$score) <= 1.01) {
68101
lisi.res.sub <- lisi.res.sub[order(lisi.res.sub$score, decreasing = TRUE),]
@@ -78,30 +111,28 @@ InferBatches <- function(sc_obj, exit_with_code = FALSE, log_flag = FALSE) {
78111
if (dim(lisi.res.sub)[1] > 30) {
79112
lisi.res.sub <- lisi.res.sub[1:30,]
80113
}
114+
81115
lisi.res.sub$diff.scaled.score <- abs(c(diff(lisi.res.sub$scaled.score), 0))
82116

83-
if (dim(lisi.res.sub)[1] >= 3) {
117+
if (dim(lisi.res.sub)[1] >= 3 & sum(lisi.res.sub$diff.scaled.score) != 0) {
84118
p.values[[i]] <- outliers::dixon.test(lisi.res.sub$diff.scaled.score)$p.value[[1]]
85119
} else {
86120
p.values[[i]] <- 1
87121
}
88122

89-
if (p.values[[i]] > 0.05 & dim(lisi.res.sub)[1] >= 3) {
90-
lisi.res.sub <- lisi.res.sub[-which.max(lisi.res.sub$diff.scaled.score),]
91-
p.values[[i]] <- outliers::dixon.test(lisi.res.sub$diff.scaled.score)$p.value[[1]]
92-
}
93-
94123
if (p.values[[i]] < 0.05 & dim(lisi.res.sub)[1] >= 3) {
95124
max.index <- which.max(lisi.res.sub$diff.scaled.score)
96125
samples.of.batch <- lisi.res.sub$batch[1:max.index]
97126

98127
if (any(samples.of.batch %in% used.sample.dump)) {
128+
99129
if (!all(samples.of.batch %in% used.sample.dump)) {
100130
used.index <- which(samples.of.batch %in% used.sample.dump)
101131
samples.of.batch <- samples.of.batch[-used.index]
102132
if (length(samples.of.batch) > 0) {
103133
batch.assign <- lappend(batch.assign, samples.of.batch)
104134
}
135+
105136
} else if (!list(samples.of.batch) %in% batch.assign) {
106137
if (length(samples.of.batch) == 1) {
107138
batch.assign <- lappend(batch.assign, samples.of.batch)
@@ -118,25 +149,27 @@ InferBatches <- function(sc_obj, exit_with_code = FALSE, log_flag = FALSE) {
118149
}
119150
used.sample.dump <- union(used.sample.dump, samples.of.batch)
120151
}
152+
121153
}
122154
}
123155

124-
batch <- as.factor(sc_obj$sample)
156+
batch <- as.factor(tmp$sample)
157+
125158
if (length(batch.assign) > 0) {
126159
levels.batch <- levels(batch)
127160
for (i in 1:length(batch.assign)) {
128161
levels.batch[which(levels(batch) %in% batch.assign[[i]])] <- i
129162
}
130163
levels.batch[!levels.batch %in% c(1:length(batch.assign))] <- length(batch.assign)+1
131164
levels(batch) <- levels.batch
132-
sc_obj$batch <- as.character(batch)
165+
tmp$batch <- as.character(batch)
133166
} else {
134167
print_SPEEDI("No batch effect detected!", log_flag)
135-
sc_obj$batch <- "No Batch"
168+
tmp$batch <- "No Batch"
136169
}
137-
print_SPEEDI(paste0("Total batches detected: ", length(unique(sc_obj$batch))), log_flag)
170+
print_SPEEDI(paste0("Total batches detected: ", length(unique(tmp$batch))), log_flag)
138171
print_SPEEDI("Step 5: Complete", log_flag)
139-
return(sc_obj)
172+
return(tmp)
140173
},
141174
error = function(cond) {
142175
if(exit_code == -1) {
@@ -228,8 +261,7 @@ IntegrateByBatch_RNA <- function(sc_obj, exit_with_code = FALSE, log_flag = FALS
228261

229262
print_SPEEDI("Beginning integration", log_flag)
230263
integrated_obj <- Seurat::IntegrateData(anchorset = anchors,
231-
normalization.method = "SCT",
232-
k.weight = 100)
264+
normalization.method = "SCT")
233265
Seurat::DefaultAssay(integrated_obj) <- "integrated"
234266

235267
rm(sc_obj_list)
@@ -347,7 +379,7 @@ IntegrateByBatch_ATAC <- function(proj, output_dir = getwd(), exit_with_code = F
347379
obj <- Seurat::CreateSeuratObject(tmp, project='scATAC', min.cells=0, min.features=0)
348380
obj[[reducedDims_param]] <- Seurat::CreateDimReducObject(embeddings=tile_reduc, key=paste0(reducedDims_param, "_"), assay='RNA')
349381
obj <- Seurat::FindNeighbors(obj, reduction = reducedDims_param, dims = 1:29)
350-
obj <- find_clusters_SPEEDI(obj, resolution = 2, log_flag = log_flag)
382+
obj <- find_clusters_SPEEDI(obj, resolution = 2, method = "Leiden", log_flag = log_flag)
351383
obj <- Seurat::RunUMAP(obj, reduction = reducedDims_param, dims = 1:29, seed.use = get_speedi_seed())
352384
proj <- ArchR::addCellColData(
353385
ArchRProj = proj,
@@ -422,7 +454,7 @@ VisualizeIntegration <- function(sc_obj, output_dir = getwd(), exit_with_code =
422454
} else {
423455
print_SPEEDI("Neighbors exist. Skipping constructing neighborhood graph...", log_flag)
424456
}
425-
sc_obj <- find_clusters_SPEEDI(sc_obj = sc_obj, resolution = 2, log_flag = log_flag)
457+
sc_obj <- find_clusters_SPEEDI(sc_obj = sc_obj, resolution = 2, method = "Leiden", log_flag = log_flag)
426458
sample_count <- length(unique(sc_obj$sample))
427459
cell_count <- length(sc_obj$cell_name)
428460
# Plot by cluster

R/utils.R

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,15 @@ scale_zero_one <- function(x) {(x - min(x))/(max(x) - min(x))}
1111
#' @return A list of lists
1212
lappend <- function (lst, ...){ c(lst, list(...))}
1313

14+
#' Compute geometric mean
15+
#'
16+
#' @param x First number
17+
#' @param y Second number
18+
#' @return Geometric mean of two numbers
19+
geometric.mean <- function(x, y) {
20+
exp(mean(log(c(x, y))))
21+
}
22+
1423
#' Parse namespace and function name for do.call
1524
#'
1625
#' @param x Function name (potentially with namespace attached)
@@ -238,27 +247,34 @@ print_heatmap_cell_type_proportions_RNA <- function(sc_obj, file_name, output_di
238247
return(TRUE)
239248
}
240249

241-
#' First, try Leiden algorithm for clustering. If that doesn't work, then try Louvain
250+
#' Use either Louvain or Leiden for clustering. If using Leiden, tries Leiden first and then if it fails, uses Louvain
242251
#' @param sc_obj Seurat object containing cells for all samples
243252
#' @param resolution Resolution for clustering
253+
#' @param method Clustering method
244254
#' @param log_flag boolean to indicate whether we're also printing to log file
245255
#' @return Seurat object with clustering complete
246256
#' @export
247-
find_clusters_SPEEDI <- function(sc_obj, resolution, log_flag = FALSE) {
248-
sc_obj <- tryCatch(
249-
{
250-
print_SPEEDI("Trying to use Leiden algorithm for clustering", log_flag)
251-
Seurat::FindClusters(object = sc_obj, resolution = resolution, algorithm = 4, method='igraph', random.seed = get_speedi_seed())
252-
},
253-
error=function(cond) {
254-
print_SPEEDI("Error using Leiden algorithm for clustering", log_flag)
255-
print_SPEEDI(cond, log_flag)
256-
print_SPEEDI("Trying Louvain instead", log_flag)
257-
return(Seurat::FindClusters(object = sc_obj, resolution = resolution, algorithm = 2, random.seed = get_speedi_seed()))
258-
},
259-
finally={
260-
print_SPEEDI("Clustering complete", log_flag)
261-
}
262-
)
257+
find_clusters_SPEEDI <- function(sc_obj, resolution, method = "Louvain", log_flag = FALSE) {
258+
if(method == "Louvain") {
259+
print_SPEEDI("Trying to use Louvain algorithm for clustering", log_flag)
260+
sc_obj <- Seurat::FindClusters(object = sc_obj, resolution = resolution, algorithm = 2, random.seed = get_speedi_seed())
261+
print_SPEEDI("Clustering complete", log_flag)
262+
} else if(method == "Leiden") {
263+
sc_obj <- tryCatch(
264+
{
265+
print_SPEEDI("Trying to use Leiden algorithm for clustering", log_flag)
266+
Seurat::FindClusters(object = sc_obj, resolution = resolution, algorithm = 4, method='igraph', random.seed = get_speedi_seed())
267+
},
268+
error=function(cond) {
269+
print_SPEEDI("Error using Leiden algorithm for clustering", log_flag)
270+
print_SPEEDI(cond, log_flag)
271+
print_SPEEDI("Trying Louvain instead", log_flag)
272+
return(Seurat::FindClusters(object = sc_obj, resolution = resolution, algorithm = 2, random.seed = get_speedi_seed()))
273+
},
274+
finally={
275+
print_SPEEDI("Clustering complete", log_flag)
276+
}
277+
)
278+
}
263279
return(sc_obj)
264280
}

man/find_clusters_SPEEDI.Rd

Lines changed: 5 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/geometric.mean.Rd

Lines changed: 19 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)