This document is compiled from an RMarkdown file which contains all code or description necessary to reproduce the analysis for the accompanying project. Each section below describes a different component of the analysis and most of numbers and figures are generated directly from the underlying data on compilation.
LICENSE
If you want to reuse the code in this report, please note the license below should be followed.
The code is made available for non commercial research purposes only under the MIT. However, notwithstanding any provision of the MIT License, the software currently may not be used for commercial purposes without explicit written permission after contacting Ziyu Tao taozy@shanghaitech.edu.cn or Xue-Song Liu liuxs@shanghaitech.edu.cn.
PART 0: Data preprocessing
In this part, raw data are collected from databases or papers and the core pre-processing steps are described in sections below.
The pre-processing work has been done by setting root path of the project repository as work directory. Therefore, keep in mind the work directory should be properly set if you are interested in reproducing the pre-processing procedure.
Prepare PCAWG datasets
We download PCAWG phenotype and copy number data from UCSC Xena and save them to local machine with R format.
dir.create("Xena")
# Phenotype (Specimen Centric) -----------------------------------------------
library(UCSCXenaTools)
<- XenaData %>%
pcawg_phenotype ::filter(XenaHostNames == "pcawgHub") %>%
dplyrXenaScan("phenotype") %>%
XenaScan("specimen centric") %>%
XenaGenerate() %>%
XenaQuery()
<- pcawg_phenotype %>%
pcawg_phenotype XenaDownload(destdir = "Xena", trans_slash = TRUE)
<- XenaPrepare(pcawg_phenotype)
phenotype_list <- phenotype_list[1:4]
pcawg_samp_info_sp
saveRDS(pcawg_samp_info_sp, file = "data/pcawg_samp_info_sp.rds")
# PCAWG (Specimen Centric) ------------------------------------------------
download.file(
"https://pcawg.xenahubs.net/download/20170119_final_consensus_copynumber_sp.gz",
"Xena/pcawg_copynumber_sp.gz"
)
<- data.table::fread("Xena/pcawg_copynumber_sp.gz")
pcawg_cn <- readRDS("data/pcawg_samp_info_sp.rds")
pcawg_samp_info_sp
<- pcawg_samp_info_sp$pcawg_donor_clinical_August2016_v9_sp %>%
sex_dt ::select(xena_sample, donor_sex) %>%
dplyr::set_names(c("sample", "sex")) %>%
purrr::as.data.table()
data.table
saveRDS(sex_dt, file = "data/pcawg_sex_sp.rds")
<- pcawg_cn[!is.na(total_cn)]
pcawg_cn $value <- NULL
pcawg_cn<- pcawg_cn[, c(1:5, 7)]
pcawg_cn colnames(pcawg_cn)[1:5] <- c("sample", "Chromosome", "Start.bp", "End.bp", "modal_cn")
saveRDS(pcawg_cn, file = "data/pcawg_copynumber_sp.rds")
# LOH ---------------------------------------------------------------------
<- readRDS("data/pcawg_copynumber_sp.rds")
pcawg_cn
<- pcawg_cn %>%
pcawg_loh ::filter(Chromosome %in% as.character(1:22)) %>%
dplyr::mutate(len = End.bp - Start.bp + 1) %>%
dplyr::group_by(sample) %>%
dplyr::summarise(
dplyrn_LOH = sum(minor_cn == 0 & modal_cn > 0 & len >= 1e4)
%>%
) setNames(c("sample", "n_LOH")) %>%
::as.data.table()
data.table
saveRDS(pcawg_loh, file = "data/pcawg_loh.rds")
Generate PCAWG sample-by-component matrix
We then read the copy number data and transform it into a CopyNumber
object with R package sigminer. Then sig_tally()
is used to generate the matrix for NMF decomposition.
library(sigminer)
# Only focus autosomes, suggested by Prof. Liu
<- readRDS("data/pcawg_copynumber_sp.rds")
pcawg_cn table(pcawg_cn$Chromosome)
<- pcawg_cn[!Chromosome %in% c("X", "Y")]
pcawg_cn <- read_copynumber(pcawg_cn,
cn_obj add_loh = TRUE,
loh_min_len = 1e4,
loh_min_frac = 0.05,
max_copynumber = 1000L,
genome_build = "hg19",
complement = FALSE,
genome_measure = "called"
)saveRDS(cn_obj, file = "data/pcawg_cn_obj.rds")
# Tally -------------------------------------------------------------------
library(sigminer)
<- readRDS("data/pcawg_cn_obj.rds")
cn_obj table(cn_obj@data$chromosome)
<- sig_tally(cn_obj,
tally_X method = "X",
add_loh = TRUE,
cores = 10
)saveRDS(tally_X, file = "data/pcawg_cn_tally_X.rds")
str(tally_X$all_matrices, max.level = 1)
<- show_catalogue(tally_X,
p mode = "copynumber", method = "X",
style = "cosmic", y_tr = function(x) log10(x + 1),
y_lab = "log10(count +1)"
)
p::ggsave("output/pcawg_catalogs_tally_X.pdf", plot = p, width = 16, height = 2.5)
ggplot2
head(sort(colSums(tally_X$nmf_matrix)), n = 50)
## classes without LOH labels
<- sig_tally(cn_obj,
tally_X_noLOH method = "X",
add_loh = FALSE,
cores = 10
)saveRDS(tally_X_noLOH$all_matrices, file = "data/pcawg_cn_tally_X_noLOH.rds")
str(tally_X_noLOH$all_matrices, max.level = 1)
<- show_catalogue(tally_X_noLOH,
p mode = "copynumber", method = "X",
style = "cosmic", y_tr = function(x) log10(x + 1),
y_lab = "log10(count +1)"
)
p::ggsave("output/pcawg_catalogs_tally_X_noLOH.pdf", plot = p, width = 16, height = 2.5) ggplot2
The generated catalog profiles for LOH version and non-LOH version component classification can be viewed by the following link:
- pcawg_catalogs_tally_X.pdf - 176 components
- pcawg_catalogs_tally_X_noLOH.pdf - 136 components
Prepare TCGA datasets
TCGA allele-specific copy number data are downloaded from GDC portal and transformed into R format by Huimin Li.
The data is go further checked and cleaned by Shixiang.
# Huimin collected TCGA data from GDC portal
# and generate file with format needed by sigminer
# Here I will firstly further clean up the data
library(data.table)
<- readRDS("data/TCGA/datamicopy.rds")
x setDT(x)
colnames(x)[6] <- "minor_cn"
head(x)
:= substr(sample, 1, 15)]
x[, sample saveRDS(x, file = "data/TCGA/tcga_cn.rds")
rm(list = ls())
TCGA phenotype data is downloaded from UCSC Xena.
download.file(
"https://pancanatlas.xenahubs.net/download/Survival_SupplementalTable_S1_20171025_xena_sp.gz",
"Xena/Survival_SupplementalTable_S1_20171025_xena_sp.gz"
)
<- data.table::fread("Xena/Survival_SupplementalTable_S1_20171025_xena_sp.gz")
tcga_cli table(tcga_cli$`cancer type abbreviation`)
saveRDS(tcga_cli, file = "data/TCGA/tcga_cli.rds")
Generate TCGA sample-by-component matrix
Similar to PCAWG, TCGA matrices are also generated.
library(sigminer)
# Only focus autosomes
<- readRDS("data/TCGA/tcga_cn.rds")
tcga_cn table(tcga_cn$chromosome)
<- tcga_cn[!chromosome %in% c("chrX", "chrY")]
tcga_cn <- read_copynumber(
cn_obj
tcga_cn,seg_cols = c("chromosome", "start", "end", "segVal"),
add_loh = TRUE,
loh_min_len = 1e4,
loh_min_frac = 0.05,
max_copynumber = 1000L,
genome_build = "hg38",
complement = FALSE,
genome_measure = "called"
)saveRDS(cn_obj, file = "data/TCGA/tcga_cn_obj.rds")
# Tally step
# Generate the counting matrices
# Same as what have done for PCAWG dataset
library(sigminer)
<- readRDS("data/tcga_cn_obj.rds")
cn_obj table(cn_obj@data$chromosome)
<- sig_tally(cn_obj,
tally_X method = "X",
add_loh = TRUE,
cores = 10
)saveRDS(tally_X, file = "data/TCGA/tcga_cn_tally_X.rds")
<- show_catalogue(tally_X,
p mode = "copynumber", method = "X",
style = "cosmic", y_tr = function(x) log10(x + 1),
y_lab = "log10(count +1)"
)
p::ggsave("output/tcga_catalogs_tally_X.pdf", plot = p, width = 16, height = 2.5)
ggplot2
head(sort(colSums(tally_X$nmf_matrix)), n = 50)
## classes without LOH labels
<- sig_tally(cn_obj,
tally_X_noLOH method = "X",
add_loh = FALSE,
cores = 10
)saveRDS(tally_X_noLOH$all_matrices, file = "data/TCGA/tcga_cn_tally_X_noLOH.rds")
str(tally_X_noLOH$all_matrices, max.level = 1)
<- show_catalogue(tally_X_noLOH,
p mode = "copynumber", method = "X",
style = "cosmic", y_tr = function(x) log10(x + 1),
y_lab = "log10(count +1)"
)
p::ggsave("output/tcga_catalogs_tally_X_noLOH.pdf", plot = p, width = 16, height = 2.5) ggplot2
The generated catalog profiles for LOH version and non-LOH version component classification can be viewed by the following link:
- tcga_catalogs_tally_X.pdf - 176 components
- tcga_catalogs_tally_X_noLOH.pdf - 136 components
Distribution of CNA segment features in PCAWG cancer types
<- readRDS("../data/pcawg_cn_obj.rds") %>%
pcawg_cn2 @data
.<- readRDS("../data/pcawg_type_info.rds")
pcawg_types <- dplyr::left_join(
pcawg_cn2
pcawg_cn2,
pcawg_types,by = "sample"
)$length <- pcawg_cn2$end - pcawg_cn2$start
pcawg_cn2$length2 <- log10((pcawg_cn2$length) / 5)
pcawg_cn2
library(ggpubr)
library(ggsci)
# distribution of segment size
<- ggpubr::ggdensity(pcawg_cn2,
p_seg_size x = "length2",
facet.by = "cancer_type",
fill = "cancer_type",
alpha = 0.8
+
) scale_fill_igv() +
scale_color_igv() +
theme(
legend.position = "none",
axis.title.x = element_blank()
+
) scale_x_continuous(
name = "length2",
limits = c(3, 7),
breaks = c(4, 5, 6),
labels = c("50Kb", "500Kb", "5Mb")
+
) geom_vline(xintercept = c(4, 5, 6), linetype = "dashed")
p_seg_size
# distribution of copy number
<- c(
high_cancer "Breast", "Liver-HCC", "Ovary-AdenoCA",
"Panc-AdenoCA", "Prost-AdenoCA", "Eso-AdenoCA"
)
$cancer_type <- factor(
pcawg_cn2$cancer_type,
pcawg_cn2levels = c(
high_cancer,setdiff(names(table(pcawg_cn2$cancer_type)), high_cancer)
)
)<- ggpubr::ggdensity(pcawg_cn2,
p_cn_number x = "segVal",
y = "..count..",
facet.by = "cancer_type",
scales = "free_y",
fill = "cancer_type",
alpha = 0.8,
+
) scale_fill_igv() +
scale_color_igv() +
theme(
legend.position = "none",
axis.title.x = element_blank()
+
) scale_x_continuous(
name = "segVal",
limits = c(0, 32),
breaks = c(0, 2, 16, 32),
labels = c("0", "2", "16", "32")
+
) geom_vline(xintercept = c(2, 16, 32), linetype = "dashed")
p_cn_number
PCAWG genotype/phenotype data type and source
If no references described, PCAWG genotype/phenotype data used in this study are obtained from UCSC Xena database, the original data source is PCAWG paper collection published in early 2020.
- Consensus copy number (page in UCSC Xena).
- General phenotype information (page in UCSC Xena).
- Tumor purity & ploidy & WGD status (page in UCSC Xena).
- Gene level fusion events (page in UCSC Xena).
- LOH number, calculated from copy number data, defined as number of copy number segments passing the following criterion:
- Minor copy number equals to
0
. - Total copy number is greater than
0
. - Segment length is greater than
10Kb
.
- Minor copy number equals to
- HRD status by CHORD framework, the raw data is available at Nguyen et al. (2020), cleaned by Huimin Li.
- Chromothripsis detected by ShatterSeek, the data is obtained from http://compbio.med.harvard.edu/chromothripsis/.
- Amplicons (including ecDNA) detected by AmpliconArchitect, obtained from Kim et al. (2020).
- Telomere content detected by TelomereHunter, obtained from PCAWG-Structural Variation Working Group et al. (2020).
- APOBEC mutations (page in UCSC Xena).
PART 1: Copy number signature identification and activity attribution
In this part, how the copy number signatures are identified and how the corresponding signature activities (a.k.a. exposures) are attributed are described in sections below. The 176 component classification for PCAWG data is our main focus.
The work of this part has been done by either setting root path of the project repository as work directory or moving data and code to the same path (see below). Therefore, keep in mind the work directory/data path should be properly set if you are interested in reproducing the analysis procedure.
The signature identification work cannot be done in local machine because high intensity calculations are required, we used HPC of ShanghaiTech University to submit computation jobs to HPC clusters. Input data files, R scripts and PBS job scripts are stored in a same path, the structure can be viewed as:
.
├── call_pcawg_sp.R
├── pcawg_cn_tally_X.rds
├── pcawg.pbs
Copy number signature identification
We use the gold-standard tool SigProfiler v1.0.17 to identify Pan-Cancer copy number signatures and their activities in each sample. The SigProfiler has been successfully applied to many studies, especially in PCAWG Mutational Signatures Working Group et al. (2020) for multiple types of signatures.
Apply to PCAWG catalog matrix
PBS file pcawg.pbs
content:
#PBS -l walltime=1000:00:00
#PBS -l nodes=1:ppn=36
#PBS -S /bin/bash
#PBS -l mem=20gb
#PBS -j oe
#PBS -M w_shixiang@163.com
#PBS -q pub_fast
# Please set PBS arguments above
cd /public/home/wangshx/wangshx/PCAWG-TCGA
module load apps/R/3.6.1
# Following are commands/scripts you want to run
# If you do not know how to set this, please check README.md file
Rscript call_pcawg_sp.R
We developed a SigProfiler caller in our R package [sigminer] and run SigProfiler with default settings for most of parameters.
R script call_pcawg_sp.R
content:
library(sigminer)
<- readRDS("pcawg_cn_tally_X.rds")
tally_X
sigprofiler_extract(tally_X$nmf_matrix,
output = "PCAWG_CN176X",
range = 2:30,
nrun = 100,
init_method = "random",
is_exome = FALSE,
use_conda = TRUE
)
All metadata info and detail settings reported by SigProfiler are copied below:
THIS FILE CONTAINS THE METADATA ABOUT SYSTEM AND RUNTIME
-------System Info-------
Operating System Name: Linux
Nodename: node34
Release: 3.10.0-693.el7.x86_64
Version: #1 SMP Tue Aug 22 21:09:27 UTC 2017
-------Python and Package Versions-------
Python Version: 3.8.5
Sigproextractor Version: 1.0.17
SigprofilerPlotting Version: 1.1.8
SigprofilerMatrixGenerator Version: 1.1.20
Pandas version: 1.1.2
Numpy version: 1.19.2
Scipy version: 1.5.2
Scikit-learn version: 0.23.2
--------------EXECUTION PARAMETERS--------------
INPUT DATA
input_type: matrix
output: TCGA_CN176X
input_data: /tmp/RtmphYcBmk/dir852158ab0364/sigprofiler_input.txt
reference_genome: GRCh37
context_types: SBS176
exome: False
NMF REPLICATES
minimum_signatures: 2
maximum_signatures: 30
NMF_replicates: 100
NMF ENGINE
NMF_init: random
precision: single
matrix_normalization: gmm
resample: True
seeds: random
min_NMF_iterations: 10,000
max_NMF_iterations: 1,000,000
NMF_test_conv: 10,000
NMF_tolerance: 1e-15
CLUSTERING
clustering_distance: cosine
EXECUTION
cpu: 36; Maximum number of CPU is 36
gpu: False
Solution Estimation
stability: 0.8
min_stability: 0.2
combined_stability: 1.0
COSMIC MATCH
opportunity_genome: GRCh37
nnls_add_penalty: 0.05
nnls_remove_penalty: 0.01
initial_remove_penalty: 0.05
de_novo_fit_penalty: 0.02
refit_denovo_signatures: False
-------Analysis Progress-------
[2020-11-29 17:39:12] Analysis started:
##################################
[2020-11-29 17:39:18] Analysis started for SBS176. Matrix size [176 rows x 10851 columns]
[2020-11-29 17:39:18] Normalization GMM with cutoff value set at 17600
[2020-11-29 19:12:03] SBS176 de novo extraction completed for a total of 2 signatures!
Execution time:1:32:45
[2020-11-29 23:58:10] SBS176 de novo extraction completed for a total of 3 signatures!
Execution time:4:46:06
[2020-11-30 08:32:21] SBS176 de novo extraction completed for a total of 4 signatures!
Execution time:8:34:11
[2020-11-30 19:51:18] SBS176 de novo extraction completed for a total of 5 signatures!
Execution time:11:18:56
[2020-12-01 07:10:18] SBS176 de novo extraction completed for a total of 6 signatures!
Execution time:11:18:59
[2020-12-01 21:14:24] SBS176 de novo extraction completed for a total of 7 signatures!
Execution time:14:04:06
[2020-12-02 15:48:43] SBS176 de novo extraction completed for a total of 8 signatures!
Execution time:18:34:19
[2020-12-03 12:53:25] SBS176 de novo extraction completed for a total of 9 signatures!
Execution time:21:04:41
[2020-12-04 11:25:25] SBS176 de novo extraction completed for a total of 10 signatures!
Execution time:22:31:59
[2020-12-05 10:20:15] SBS176 de novo extraction completed for a total of 11 signatures!
Execution time:22:54:50
[2020-12-06 11:36:03] SBS176 de novo extraction completed for a total of 12 signatures!
Execution time:1 day, 1:15:48
[2020-12-07 14:43:31] SBS176 de novo extraction completed for a total of 13 signatures!
Execution time:1 day, 3:07:28
[2020-12-09 01:15:22] SBS176 de novo extraction completed for a total of 14 signatures!
Execution time:1 day, 10:31:50
[2020-12-10 08:02:39] SBS176 de novo extraction completed for a total of 15 signatures!
Execution time:1 day, 6:47:17
[2020-12-11 16:05:31] SBS176 de novo extraction completed for a total of 16 signatures!
Execution time:1 day, 8:02:51
[2020-12-13 02:19:40] SBS176 de novo extraction completed for a total of 17 signatures!
Execution time:1 day, 10:14:09
[2020-12-14 14:56:13] SBS176 de novo extraction completed for a total of 18 signatures!
Execution time:1 day, 12:36:32
[2020-12-16 02:39:41] SBS176 de novo extraction completed for a total of 19 signatures!
Execution time:1 day, 11:43:27
[2020-12-18 02:54:09] SBS176 de novo extraction completed for a total of 20 signatures!
Execution time:2 days, 0:14:28
[2020-12-19 20:20:57] SBS176 de novo extraction completed for a total of 21 signatures!
Execution time:1 day, 17:26:47
[2020-12-21 19:30:07] SBS176 de novo extraction completed for a total of 22 signatures!
Execution time:1 day, 23:09:10
[2020-12-24 00:44:41] SBS176 de novo extraction completed for a total of 23 signatures!
Execution time:2 days, 5:14:34
[2020-12-25 19:03:07] SBS176 de novo extraction completed for a total of 24 signatures!
Execution time:1 day, 18:18:25
[2020-12-27 07:50:06] SBS176 de novo extraction completed for a total of 25 signatures!
Execution time:1 day, 12:46:58
[2020-12-29 04:40:22] SBS176 de novo extraction completed for a total of 26 signatures!
Execution time:1 day, 20:50:16
[2020-12-30 12:35:04] SBS176 de novo extraction completed for a total of 27 signatures!
Execution time:1 day, 7:54:41
[2020-12-31 19:04:04] SBS176 de novo extraction completed for a total of 28 signatures!
Execution time:1 day, 6:29:00
[2021-01-02 04:50:07] SBS176 de novo extraction completed for a total of 29 signatures!
Execution time:1 day, 9:46:02
[2021-01-03 14:15:32] SBS176 de novo extraction completed for a total of 30 signatures!
Execution time:1 day, 9:25:25
[2021-01-03 14:48:21] Analysis ended:
-------Job Status-------
Analysis of mutational signatures completed successfully!
Total execution time: 34 days, 21:09:09
Results can be found in: TCGA_CN176X folder
Signature number determination
First, load the results.
library(sigminer)
<- sigprofiler_import("../SP/PCAWG_CN176X/", order_by_expo = TRUE, type = "all")
SP_PCAWG saveRDS(SP_PCAWG, file = "../data/pcawg_cn_solutions_sp.rds")
Then we visualize the signature number survey.
<- readRDS("../data/pcawg_cn_solutions_sp.rds") SP_PCAWG
show_sig_number_survey(
$all_stats %>%
SP_PCAWG::rename(
dplyrs = `Stability (Avg Silhouette)`,
e = `Mean Cosine Distance`
%>%
) ::mutate(
dplyrSignatureNumber = as.integer(gsub("[^0-9]", "", SignatureNumber))
),x = "SignatureNumber",
left_y = "s", right_y = "e",
left_name = "Stability (Avg Silhouette)",
right_name = "Mean Cosine Distance",
highlight = 14
)
We pick up 14
signatures for PCAWG data due to its relatively high stability and low distance.
The Signature
object with 14 signature profiles are stored for further analysis.
To better describe the signatures, we will rename them. The names are ordered by mean activity in samples.
<- SP_PCAWG$solution_list$S14
pcawg_sigs apply(pcawg_sigs$Exposure, 1, mean)
Sig1 Sig2 Sig3 Sig4 Sig5 Sig6 Sig7 Sig8
15.830094 13.944564 12.504680 11.570554 10.709143 10.506479 10.429086 9.727142
Sig9 Sig10 Sig11 Sig12 Sig13 Sig14
9.621310 9.116271 7.989201 7.122750 6.835853 6.232541
sig_names(pcawg_sigs)
[1] "Sig1" "Sig2" "Sig3" "Sig4" "Sig5" "Sig6" "Sig7" "Sig8" "Sig9"
[10] "Sig10" "Sig11" "Sig12" "Sig13" "Sig14"
colnames(pcawg_sigs$Signature) <- colnames(pcawg_sigs$Signature.norm) <- rownames(pcawg_sigs$Exposure) <- rownames(pcawg_sigs$Exposure.norm) <- pcawg_sigs$Stats$signatures$Signatures <- paste0("CNS", 1:14)
sig_names(pcawg_sigs)
[1] "CNS1" "CNS2" "CNS3" "CNS4" "CNS5" "CNS6" "CNS7" "CNS8" "CNS9"
[10] "CNS10" "CNS11" "CNS12" "CNS13" "CNS14"
saveRDS(pcawg_sigs, file = "../data/pcawg_cn_sigs_CN176_signature.rds")
Sample copy number catalog reconstruction
Here we try know how well each sample catalog profile can be constructed from the signature combination.
Firstly we get the activity data.
<- readRDS("../data/pcawg_cn_sigs_CN176_signature.rds")
pcawg_sigs <- list(
pcawg_act absolute = get_sig_exposure(SP_PCAWG$solution_list$S14, type = "absolute"),
relative = get_sig_exposure(SP_PCAWG$solution_list$S14, type = "relative", rel_threshold = 0),
similarity = pcawg_sigs$Stats$samples[, .(Samples, `Cosine Similarity`)]
)
colnames(pcawg_act$absolute)[-1] <- colnames(pcawg_act$relative)[-1] <- paste0("CNS", 1:14)
colnames(pcawg_act$similarity) <- c("sample", "similarity")
saveRDS(pcawg_act, file = "../data/pcawg_cn_sigs_CN176_activity.rds")
Check the stat summary of reconstructed Cosine similarity.
summary(pcawg_act$similarity$similarity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.5130 0.8980 0.9490 0.9326 0.9850 1.0000
Visualize with plot
hist(pcawg_act$similarity$similarity, breaks = 100, xlab = "Reconstructed similarity", main = NA)
Most of samples are well constructed. Next we will use a similarity threshold 0.75
to filter out some samples for removing their effects on the following analysis.
PART 2: Signature profile visualization and analysis
In this part, we would like to visualize and analyze all tumor samples as a whole.
Signature profile visualization
library(tidyverse)
library(sigminer)
<- readRDS(file = "../data/pcawg_cn_sigs_CN176_signature.rds") pcawg_sigs
Show signature profile with relative contribution value for each component.
show_sig_profile(
pcawg_sigs,mode = "copynumber",
method = "X",
style = "cosmic"
)
Show signature profile with absolute contribution value (estimated segment count) for each component.
show_sig_profile(
pcawg_sigs,mode = "copynumber",
method = "X",
style = "cosmic",
normalize = "raw"
)
Show signature activity profile.
show_sig_exposure(
pcawg_sigs,style = "cosmic",
rm_space = TRUE,
palette = sigminer:::letter_colors
)
Signature analysis
Signature similarity
Here we explore the similarity between copy number signatures identified in PCAWG data. High cosine similarity will cause some issues in signature assignment and activity attribution (Maura et al. 2019).
<- get_sig_similarity(pcawg_sigs, pcawg_sigs) sim
::pheatmap(sim$similarity, cluster_cols = F, cluster_rows = F, display_numbers = TRUE) pheatmap
Most of signature-pairs have similarity around 0.1
.
Signature activity distribution
Transform data to long format.
<- get_sig_exposure(pcawg_sigs) %>%
df_abs ::pivot_longer(cols = starts_with("CNS"), names_to = "sig", values_to = "activity") %>%
tidyr::mutate(
dplyractivity = log10(activity + 1),
sig = factor(sig, levels = paste0("CNS", 1:14))
)
<- get_sig_exposure(pcawg_sigs, type = "relative", rel_threshold = 0) %>%
df_rel ::pivot_longer(cols = starts_with("CNS"), names_to = "sig", values_to = "activity") %>%
tidyr::mutate(
dplyrsig = factor(sig, levels = paste0("CNS", 1:14))
)
Plot with function in sigminer
.
show_group_distribution(
df_abs,gvar = "sig",
dvar = "activity",
order_by_fun = FALSE,
g_angle = 90,
ylab = "log10(activity+1)"
)
show_group_distribution(
df_rel,gvar = "sig",
dvar = "activity",
order_by_fun = FALSE,
g_angle = 90,
ylab = "relative activity"
)
Copy number profile and signature activity
Copy number profile for representative samples
Here we will draw copy number profile of 2 representative samples for each signature.
<- function(dt, sig) {
get_sample_from_sig <- head(dt[order(dt[[sig]], dt[[paste0("ABS_", sig)]], decreasing = TRUE)], 2L)
res
res }
Read CopyNumber
object for PCAWG data.
<- readRDS("../data/pcawg_cn_obj.rds")
pcawg_cn_obj <- pcawg_cn_obj@summary.per.sample
samp_summary <- get_sig_exposure(pcawg_sigs, type = "relative", rel_threshold = 0)
rel_activity <- get_sig_exposure(pcawg_sigs, type = "absolute")
abs_activity
<- rel_activity[, lapply(.SD, function(x) {
rel_activity if (is.numeric(x)) round(x, 2) else x
})]# all(rownames(abs_activity) == rownames(rel_activity))
colnames(abs_activity) <- c("sample", paste0("ABS_CNS", 1:14))
<- merge(
act
rel_activity, abs_activity,by = "sample"
)
dir.create("../output/enrich_samples/", showWarnings = FALSE)
for (i in paste0("CNS", 1:14)) {
cat(paste0("Most enriched in ", i, "\n"))
<- get_sample_from_sig(act, i)
s print(s)
<- show_cn_profile(pcawg_cn_obj,
plist samples = s$sample,
show_title = TRUE,
return_plotlist = TRUE
)<- purrr::map2(plist, s$sample, function(p, x) {
plist <- samp_summary[sample == x]
s <- paste0(
text "n_of_seg:", s$n_of_seg, "\n",
"n_of_amp:", s$n_of_amp, "\n",
"n_of_del:", s$n_of_del, "\n",
"rel:", act[sample == x][[i]], "\n",
"abs:", act[sample == x][[paste0("ABS_", i)]], "\n"
)<- p + annotate("text",
p x = Inf, y = Inf, hjust = 1, vjust = 1,
label = text, color = "gray50"
)
p
})<- cowplot::plot_grid(plotlist = plist, nrow = 2)
p print(p)
::ggsave(file.path("../output/enrich_samples/", paste0(i, ".pdf")),
ggplot2plot = p, width = 12, height = 6
) }
Most enriched in CNS1
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12
1: SP117655 0.86 0 0.01 0 0.00 0.02 0.04 0 0.02 0.01 0.01 0.02
2: SP112248 0.79 0 0.00 0 0.06 0.05 0.01 0 0.07 0.00 0.00 0.01
CNS13 CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0 0 403 0 7 0 0 9 17
2: 0 0 183 0 1 0 15 11 2
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 0 10 6 6 11 0 0
2: 0 16 0 0 3 0 0
Most enriched in CNS2
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12
1: SP8564 0.03 0.70 0.01 0.09 0.14 0.00 0.02 0 0.00 0 0 0
2: SP123894 0.00 0.55 0.00 0.00 0.00 0.11 0.21 0 0.09 0 0 0
CNS13 CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0 0.00 16 338 5 43 67 0 11
2: 0 0.04 0 163 0 0 0 34 62
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 0 0 0 0 0 0 0
2: 0 26 0 0 0 0 13
Most enriched in CNS3
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12
1: SP105708 0 0 1 0 0 0 0 0 0 0 0 0
2: SP107381 0 0 1 0 0 0 0 0 0 0 0 0
CNS13 CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0 0 0 0 22 0 0 0 0
2: 0 0 0 0 22 0 0 0 0
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 0 0 0 0 0 0 0
2: 0 0 0 0 0 0 0
Most enriched in CNS4
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12
1: SP116525 0.07 0.01 0.00 0.79 0 0.08 0.00 0 0.04 0 0 0
2: SP121824 0.00 0.00 0.02 0.79 0 0.00 0.01 0 0.00 0 0 0
CNS13 CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0.00 0.00 63 13 0 740 0 75 0
2: 0.18 0.01 0 0 14 446 0 0 3
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 0 35 0 0 0 4 2
2: 0 0 0 0 0 100 5
Most enriched in CNS5
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12
1: SP56303 0.00 0 0 0.07 0.90 0.02 0 0 0.00 0 0 0
2: SP117309 0.03 0 0 0.00 0.86 0.00 0 0 0.11 0 0 0
CNS13 CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0.01 0 0 0 0 21 278 6 0
2: 0.00 0 2 0 0 0 62 0 0
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 0 0 0 1 0 2 0
2: 0 8 0 0 0 0 0
Most enriched in CNS6
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12
1: SP77443 0.00 0.00 0.02 0 0.00 0.58 0.02 0 0.27 0 0.02 0.10
2: SP124197 0.15 0.02 0.00 0 0.06 0.57 0.00 0 0.11 0 0.02 0.05
CNS13 CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0 0.00 0 0 1 0 0 30 1
2: 0 0.02 32 5 0 0 12 123 0
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 0 14 0 1 5 0 0
2: 0 23 0 5 10 0 4
Most enriched in CNS7
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12
1: SP49114 0.04 0.00 0.23 0 0 0.01 0.63 0 0.01 0.05 0.03 0.00
2: SP114903 0.04 0.15 0.17 0 0 0.00 0.60 0 0.00 0.01 0.02 0.01
CNS13 CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0 0 15 0 78 0 0 4 212
2: 0 0 28 100 118 0 0 0 413
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 0 2 18 9 0 0 0
2: 0 0 7 17 5 0 0
Most enriched in CNS8
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12
1: SP116539 0 0 0.08 0 0.00 0.00 0 0.86 0.00 0.00 0.00 0
2: SP116706 0 0 0.06 0 0.01 0.01 0 0.78 0.01 0.11 0.04 0
CNS13 CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0 0.06 0 0 16 0 0 0 0
2: 0 0.00 0 0 10 0 1 1 0
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 164 0 0 0 0 0 11
2: 135 1 19 7 0 0 0
Most enriched in CNS9
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12
1: SP118048 0 0 0 0 0 0 0 0 1 0 0 0
2: SP123964 0 0 0 0 0 0 0 0 1 0 0 0
CNS13 CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0 0 0 0 0 0 0 0 0
2: 0 0 0 0 0 0 0 0 0
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 0 47 0 0 0 0 0
2: 0 34 0 0 0 0 0
Most enriched in CNS10
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12
1: SP112887 0 0 0.10 0 0 0 0 0.00 0 0.89 0.02 0
2: SP112917 0 0 0.06 0 0 0 0 0.04 0 0.82 0.08 0
CNS13 CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0 0 0 0 6 0 0 0 0
2: 0 0 0 0 5 0 0 0 0
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 0 0 56 1 0 0 0
2: 3 0 63 6 0 0 0
Most enriched in CNS11
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12
1: SP106797 0 0 0.12 0 0 0 0.03 0.03 0 0 0.81 0
2: SP135354 0 0 0.22 0 0 0 0.00 0.00 0 0 0.78 0
CNS13 CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0 0 0 0 4 0 0 0 1
2: 0 0 0 0 6 0 0 0 0
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 1 0 0 26 0 0 0
2: 0 0 0 21 0 0 0
Most enriched in CNS12
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12
1: SP106734 0 0.00 0.09 0 0 0 0 0 0.18 0 0 0.73
2: SP37636 0 0.06 0.03 0 0 0 0 0 0.19 0 0 0.72
CNS13 CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0 0 0 0 2 0 0 0 0
2: 0 0 0 2 1 0 0 0 0
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 0 4 0 0 16 0 0
2: 0 6 0 0 23 0 0
Most enriched in CNS13
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12 CNS13
1: SP90269 0.00 0.17 0.07 0.01 0.04 0 0.16 0.02 0 0.03 0.08 0 0.42
2: SP43664 0.11 0.00 0.11 0.11 0.18 0 0.00 0.00 0 0.01 0.08 0 0.41
CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0 0 51 20 2 13 0 47
2: 0 13 0 13 14 22 0 0
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 6 0 10 25 0 124 0
2: 0 0 1 10 0 50 0
Most enriched in CNS14
sample CNS1 CNS2 CNS3 CNS4 CNS5 CNS6 CNS7 CNS8 CNS9 CNS10 CNS11 CNS12
1: SP96163 0 0.08 0.09 0 0 0 0.09 0.03 0.01 0.02 0.12 0.01
2: SP116365 0 0.01 0.10 0 0 0 0.11 0.01 0.00 0.00 0.21 0.01
CNS13 CNS14 ABS_CNS1 ABS_CNS2 ABS_CNS3 ABS_CNS4 ABS_CNS5 ABS_CNS6 ABS_CNS7
1: 0 0.56 0 44 48 0 0 0 50
2: 0 0.55 0 5 35 0 0 0 41
ABS_CNS8 ABS_CNS9 ABS_CNS10 ABS_CNS11 ABS_CNS12 ABS_CNS13 ABS_CNS14
1: 18 4 12 64 5 0 309
2: 4 0 0 78 2 0 203
Plot sample profile for presentation in manuscript.
<- c(
sel_samps "SP117655", "SP123894", "SP105708", "SP116525", "SP117309",
"SP77443", "SP114903", "SP116539", "SP118048", "SP112887",
"SP135354", "SP37636", "SP43664", "SP96163"
)# table(pcawg_cn_obj@data[sample == "SP116539"]$chromosome)
# table(pcawg_cn_obj@data[sample == "SP112887"]$chromosome)
<- list()
plist for (s in sel_samps) {
if (s == sel_samps[4]) {
<- show_cn_profile(pcawg_cn_obj,
p samples = s, chrs = c("chr12"),
show_title = TRUE,
return_plotlist = TRUE
)else if (s == sel_samps[8]) {
} <- show_cn_profile(pcawg_cn_obj,
p samples = s, chrs = c("chr1", "chr2", "chr4", "chr5", "chr6"),
show_title = TRUE,
return_plotlist = TRUE
)else if (s == sel_samps[10]) {
} <- show_cn_profile(pcawg_cn_obj,
p samples = s, chrs = paste0("chr", c(1:10, 14, 16, 18, 19, 21)),
show_title = TRUE,
return_plotlist = TRUE
)else if (s == sel_samps[13]) {
} <- show_cn_profile(pcawg_cn_obj,
p samples = s, chrs = c("chr7"),
show_title = TRUE,
return_plotlist = TRUE
)else {
} <- show_cn_profile(pcawg_cn_obj,
p samples = s,
show_title = TRUE,
return_plotlist = TRUE
)
}<- p[[1]] + ggplot2::labs(x = NULL)
plist[[s]]
}
<- purrr::pmap(list(plist, sel_samps, paste0("CNS", 1:14)), function(p, x, y) {
plist <- samp_summary[sample == x]
s <- paste0(
text "n_of_seg:", s$n_of_seg, "\n",
"n_of_amp:", s$n_of_amp, "\n",
"n_of_del:", s$n_of_del, "\n",
"rel:", act[sample == x][[y]], "\n",
"abs:", act[sample == x][[paste0("ABS_", y)]], "\n"
)<- p + annotate("text",
p x = Inf, y = Inf, hjust = 1, vjust = 1,
label = text, color = "gray50"
)
p
})<- cowplot::plot_grid(plotlist = plist, ncol = 1)
p ::ggsave(file.path("../output/enrich_samples/", "selected_samples.pdf"),
ggplot2plot = p, width = 12, height = 42
)
Ideograph for copy number profile reconstruction
Select one/two samples and draw copy number profile -> catalog profile -> signature relative activity.
<- c("SP102561", "SP123964") ss
<- show_cn_profile(pcawg_cn_obj,
p samples = ss,
show_title = TRUE, ncol = 1
)::ggsave("../output/example_cn_profile.pdf",
ggplot2plot = p, width = 12, height = 6
)
<- readRDS("../data/pcawg_cn_tally_X.rds") tally_X
<- show_catalogue(tally_X, mode = "copynumber", method = "X", style = "cosmic", samples = ss, by_context = TRUE, font_scale = 0.7)
p ::ggsave("../output/example_sig_profile.pdf",
ggplot2plot = p, width = 15, height = 4
)
<- show_sig_exposure(sig_exposure(pcawg_sigs)[, ss], hide_samps = FALSE) + ggplot2::ylab("Activity") + ggpubr::rotate_x_text(0, hjust = 0.5)
p ::ggsave("../output/example_sig_activity.pdf",
ggplot2plot = p, width = 4, height = 5
)
PART 3: Pan-Cancer signature analysis
In this part, we will analyze copy number signatures across cancer types and show the landscape.
Signature number and contribution in each cancer type
Load tidy cancer type annotation data.
library(sigminer)
library(tidyverse)
<- readRDS("../data/pcawg_type_info.rds") pcawg_types
Load signature activity data.
<- readRDS("../data/pcawg_cn_sigs_CN176_activity.rds") pcawg_activity
Combine the cancer type annotation and activity data and only keep samples with good reconstruction (>0.75
cosine similarity).
<- pcawg_activity$similarity[similarity > 0.75]$sample
keep_samps
<- merge(pcawg_activity$absolute[sample %in% keep_samps], pcawg_types, by = "sample")
df_abs <- merge(pcawg_activity$relative[sample %in% keep_samps], pcawg_types, by = "sample") df_rel
Signature activity in each cancer type
Here we draw distribution of a signature across cancer types.
show_group_distribution(
df_abs,gvar = "cancer_type",
dvar = "CNS1",
order_by_fun = FALSE,
g_angle = 90,
point_size = 0.3
)
We have many signatures here, so we output them to PDF files.
dir.create("../output/cancer-type-dist", showWarnings = F)
<- paste0("CNS", 1:14)
signames for (i in signames) {
<- show_group_distribution(df_abs,
pxx gvar = "cancer_type",
dvar = i, order_by_fun = FALSE,
ylab = i,
g_angle = 90, point_size = 0.3
)::ggsave(file.path("../output/cancer-type-dist/", paste0("Absolute_activity_", i, ".pdf")),
ggplot2plot = pxx, width = 12, height = 6
)<- show_group_distribution(df_rel,
pxx gvar = "cancer_type",
dvar = i, order_by_fun = FALSE,
ylab = i,
g_angle = 90, point_size = 0.3
)::ggsave(file.path("../output/cancer-type-dist/", paste0("Relative_activity_", i, ".pdf")),
ggplot2plot = pxx, width = 12, height = 6
)
}rm(pxx)
Signature landscape
Define a signature which is detectable if this signature contribute >5%
exposures and contribute >15 segments
in a sample.
<- df_rel %>%
df ::mutate_at(dplyr::vars(dplyr::starts_with("CNS")), ~ ifelse(. > 0.05, 1L, 0L)) %>%
dplyr::pivot_longer(
tidyrcols = dplyr::starts_with("CNS"),
names_to = "sig", values_to = "detectable"
)
<- df_rel %>%
df2 ::pivot_longer(
tidyrcols = dplyr::starts_with("CNS"),
names_to = "sig", values_to = "expo"
)
<- df_abs %>%
df3 ::mutate_at(dplyr::vars(dplyr::starts_with("CNS")), ~ ifelse(. > 15, 1L, 0L)) %>%
dplyr::pivot_longer(
tidyrcols = dplyr::starts_with("CNS"),
names_to = "sig", values_to = "segment_detect"
)
<- dplyr::left_join(df, df2,
df by = c("sample", "cancer_type", "sig")
%>% dplyr::left_join(., df3, by = c("sample", "cancer_type", "sig"))
)
<- df %>%
df_type ::group_by(cancer_type, sig) %>%
dplyr::summarise(
dplyrfreq = sum(segment_detect), # directly use count
expo = median(expo[detectable == 1]),
n = n(),
label = paste0(unique(cancer_type), " (n=", n, ")"),
.groups = "drop"
%>%
) ::group_by(cancer_type) %>%
dplyr::mutate(pro = freq / sum(freq))
dplyr
$expo <- ifelse(df_type$freq == 0, 0, df_type$expo)
df_type
<- unique(df_type[, c("cancer_type", "label")])
mps <- mps$label
mpss names(mpss) <- mps$cancer_type
summary(df_type$freq)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 1.00 7.00 15.27 20.00 175.00
Show copy number signature landscape.
library(cowplot)
<- ggplot(
p
df_type,aes(x = cancer_type, y = factor(sig, levels = paste0("CNS", 1:14)))
+
) geom_point(aes(size = pro, color = expo)) +
theme_cowplot() +
::rotate_x_text(60) +
ggpubrscale_x_discrete(breaks = mps$cancer_type, labels = mps$label) +
scale_size_continuous(
limits = c(0.1, 1),
breaks = c(0, 0.25, 0.5, 0.75, 1)
+
) scale_color_stepsn(
colors = viridis::viridis(5, direction = -1),
breaks = c(0, 0.25, 0.5, 0.75, 1)
+
) labs(
x = NULL, y = "Copy number signatures",
color = "Median activity\ndue to signature",
size = "Proportion of tumors \nwith the signatures"
) p
### Signature number distribution
For most of cancer types, they have similar signature constitution (most of copy number signatures available in them). However, we need to further check that if many tumors have so many signatures activated.
<- readRDS("../data/pcawg_cn_sigs_CN176_signature.rds") %>%
pcawg_cns "Exposure.norm"]] %>%
.[[t() %>%
as.data.frame() %>%
::rownames_to_column(., var = "sample")
tibble<- read_csv("../data/PCAWG/PCAWG_sigProfiler_SBS_signatures_in_samples.csv")
pcawg_sbs <- read_csv("../data/PCAWG/PCAWG_sigProfiler_DBS_signatures_in_samples.csv")
pcawg_dbs <- read_csv("../data/PCAWG/PCAWG_SigProfiler_ID_signatures_in_samples.csv")
pcawg_id
# Use signature relative contribution for analysis
<- pcawg_sbs[, -c(1, 3)] %>%
pcawg_sbs ::rename(sample = `Sample Names`) %>%
dplyr::select(-SBS43, -c(SBS45:SBS60))
dplyr
<- pcawg_dbs[, -c(1, 3)] %>%
pcawg_dbs ::rename(sample = `Sample Names`)
dplyr
<- pcawg_id[, -c(1, 3)] %>%
pcawg_id ::rename(sample = `Sample Names`)
dplyr
<- df_rel %>%
df_cns ::mutate_at(dplyr::vars(dplyr::starts_with("CNS")), ~ ifelse(. > 0.05, 1L, 0L)) %>%
dplyr::pivot_longer(
tidyrcols = dplyr::starts_with("CNS"),
names_to = "sig", values_to = "detectable"
)
<- df_rel %>%
df2_cns ::pivot_longer(
tidyrcols = dplyr::starts_with("CNS"),
names_to = "sig", values_to = "expo"
)
<- df_abs %>%
df3_cns ::mutate_at(dplyr::vars(dplyr::starts_with("CNS")), ~ ifelse(. > 15, 1L, 0L)) %>%
dplyr::pivot_longer(
tidyrcols = dplyr::starts_with("CNS"),
names_to = "sig", values_to = "segment_detect"
)
<- dplyr::left_join(df_cns, df2_cns,
df_cns by = c("sample", "cancer_type", "sig")
%>% dplyr::left_join(., df3_cns, by = c("sample", "cancer_type", "sig"))
)
<- df_cns %>%
df_type_cns ::group_by(cancer_type, sig) %>%
dplyr::summarise(
dplyrfreq = sum(segment_detect), # directly use count
expo = median(expo[detectable == 1]),
n = n(),
label = paste0(unique(cancer_type), " (n=", n, ")"),
.groups = "drop"
)
<- unique(df_type_cns[, c("cancer_type", "label")])
mps <- mps$label
mpss names(mpss) <- mps$cancer_type
<- df_cns %>%
df_detc_cns ::group_by(cancer_type, sample) %>%
dplyr::summarise(
dplyrsignumber = sum(segment_detect),
.groups = "drop"
)
<- df_detc_cns$signumber
num_CNS
# SBS
<- dplyr::inner_join(pcawg_sbs,
pcawg_sbs2 c("sample", "cancer_type")],
df_rel[, by = "sample"
)
<- pcawg_sbs2 %>%
df_sbs ::mutate_at(dplyr::vars(dplyr::starts_with("SBS")), ~ ifelse(. > 0, 1L, 0L)) %>%
dplyr::pivot_longer(
tidyrcols = dplyr::starts_with("SBS"),
names_to = "sig", values_to = "detectable"
)
<- pcawg_sbs2 %>%
df2_sbs ::pivot_longer(
tidyrcols = dplyr::starts_with("SBS"),
names_to = "sig", values_to = "expo"
)
<- dplyr::left_join(df_sbs, df2_sbs,
df_sbs by = c("sample", "cancer_type", "sig")
)
<- df_sbs %>%
df_type_sbs ::group_by(cancer_type, sig) %>%
dplyr::summarise(
dplyrfreq = sum(detectable), # directly use count
expo = median(expo[detectable == 1]),
n = n(),
label = paste0(unique(cancer_type), " (n=", n, ")"),
.groups = "drop"
)
<- df_sbs %>%
df_detc_sbs ::group_by(cancer_type, sample) %>%
dplyr::summarise(
dplyrsignumber = sum(detectable),
.groups = "drop"
)<- df_detc_sbs$signumber
num_SBS
# DBS
<- dplyr::inner_join(pcawg_dbs,
pcawg_dbs2 c("sample", "cancer_type")],
df_rel[, by = "sample"
)
<- pcawg_dbs2 %>%
df_dbs ::mutate_at(dplyr::vars(dplyr::starts_with("DBS")), ~ ifelse(. > 0, 1L, 0L)) %>%
dplyr::pivot_longer(
tidyrcols = dplyr::starts_with("DBS"),
names_to = "sig", values_to = "detectable"
)
<- pcawg_dbs2 %>%
df2_dbs ::pivot_longer(
tidyrcols = dplyr::starts_with("DBS"),
names_to = "sig", values_to = "expo"
)
<- dplyr::left_join(df_dbs, df2_dbs,
df_dbs by = c("sample", "cancer_type", "sig")
)
<- df_dbs %>%
df_type_dbs ::group_by(cancer_type, sig) %>%
dplyr::summarise(
dplyrfreq = sum(detectable), # directly use count
expo = median(expo[detectable == 1]),
n = n(),
label = paste0(unique(cancer_type), " (n=", n, ")"),
.groups = "drop"
)
<- df_dbs %>%
df_detc_dbs ::group_by(cancer_type, sample) %>%
dplyr::summarise(
dplyrsignumber = sum(detectable),
.groups = "drop"
)
<- df_detc_dbs$signumber
num_DBS
# ID
# DBS
<- dplyr::inner_join(pcawg_id,
pcawg_id2 c("sample", "cancer_type")],
df_rel[, by = "sample"
)
<- pcawg_id2 %>%
df_id ::mutate_at(dplyr::vars(dplyr::starts_with("ID")), ~ ifelse(. > 0, 1L, 0L)) %>%
dplyr::pivot_longer(
tidyrcols = dplyr::starts_with("ID"),
names_to = "sig", values_to = "detectable"
)
<- pcawg_id2 %>%
df2_id ::pivot_longer(
tidyrcols = dplyr::starts_with("ID"),
names_to = "sig", values_to = "expo"
)
<- dplyr::left_join(df_id, df2_id,
df_id by = c("sample", "cancer_type", "sig")
)
<- df_id %>%
df_type_id ::group_by(cancer_type, sig) %>%
dplyr::summarise(
dplyrfreq = sum(detectable), # directly use count
expo = median(expo[detectable == 1]),
n = n(),
label = paste0(unique(cancer_type), " (n=", n, ")"),
.groups = "drop"
)
<- df_id %>%
df_detc_id ::group_by(cancer_type, sample) %>%
dplyr::summarise(
dplyrsignumber = sum(detectable),
.groups = "drop"
)
<- df_detc_id$signumber
num_ID
# sum
<- dplyr::tibble(
df_num_all sig_type = rep(
c("CNS", "SBS", "DBS", "ID"),
c(length(num_CNS), length(num_SBS), length(num_DBS), length(num_ID))
),num = c(num_CNS, num_SBS, num_DBS, num_ID)
)# saveRDS(df_num_all, file = "/home/tzy/projects/CNX-method/data/df_num_all.rds")
Pan-Cancer signature number distribution.
library(ggpubr)
<- ggboxplot(df_num_all,
num x = "sig_type", y = "num",
ylab = "Signature number",
color = "sig_type",
xlab = FALSE,
legend = "none",
palette = "jco",
width = 0.3,
outlier.size = 0.05
+
) scale_color_manual(values = c("#0073C2", "#EFC000", "#CD534C", "#7AA6DC"))
num
Most tumors have 3-6 signatures.
<- ggplot(data = df_detc_cns, aes(x = cancer_type, y = signumber, fill = cancer_type)) +
cancer_num geom_boxplot() +
coord_flip() +
labs(x = NULL, y = "Signature number") +
theme_cowplot() +
theme(legend.position = "none")
cancer_num
From the landscape and distribution data, we know that many signatures activate in most of cancer types, but for a specified tumor, in general there are 2-4 signatures are detectable.
Cancer type associated enrichment
Run enrichment analysis.
<- group_enrichment(
enrich_result
df_abs,grp_vars = "cancer_type",
enrich_vars = paste0("CNS", 1:14),
co_method = "wilcox.test"
)
Show enrichment landscape.
$enrich_var <- factor(enrich_result$enrich_var, paste0("CNS", 1:14))
enrich_result<- show_group_enrichment(enrich_result, fill_by_p_value = TRUE, return_list = T)
p <- p$cancer_type + labs(x = NULL, y = NULL)
p p
ggsave("../output/CNS_PCAWG_enrichment_landscape.pdf",
plot = p,
height = 8, width = 8.5
)
To better visualize the enrichment results, we use binned color regions.
<- show_group_enrichment(
p
enrich_result,fill_by_p_value = TRUE,
cut_p_value = TRUE,
return_list = T
)<- p$cancer_type + labs(x = NULL, y = NULL)
p p
ggsave("../output/CNS_PCAWG_enrichment_landscape2.pdf",
plot = p,
height = 8, width = 8.5
)
We see cancer type SoftTissue-Liposarc
has pretty high enrichment on CNS4
. Let’s check the enrichment result.
== "SoftTissue-Liposarc"] enrich_result[grp1
grp_var enrich_var grp1 grp2 grp1_size grp1_pos_measure
1: cancer_type CNS1 SoftTissue-Liposarc Rest 19 96.315789
2: cancer_type CNS2 SoftTissue-Liposarc Rest 19 68.736842
3: cancer_type CNS3 SoftTissue-Liposarc Rest 19 11.947368
4: cancer_type CNS4 SoftTissue-Liposarc Rest 19 435.526316
5: cancer_type CNS5 SoftTissue-Liposarc Rest 19 17.684211
grp2_size grp2_pos_measure measure_observed measure_tested p_value
1: 2718 15.318985 6.2873482 NA 7.503689e-07
2: 2718 13.663355 5.0307439 NA 1.832154e-02
3: 2718 12.660412 0.9436793 NA 8.258332e-01
4: 2718 8.231052 52.9125928 NA 1.535689e-22
5: 2718 10.408389 1.6990344 NA 6.439534e-02
type method fdr
1: continuous wilcox.test 1.715129e-06
2: continuous wilcox.test 2.664952e-02
3: continuous wilcox.test 8.258332e-01
4: continuous wilcox.test 4.914204e-21
5: continuous wilcox.test 8.586046e-02
[ reached getOption("max.print") -- omitted 9 rows ]
Let’s go further plot the distribution for the two groups.
<- df_abs[, c("CNS4", "cancer_type")][
df_check
, .(cancer_type = ifelse(cancer_type == "SoftTissue-Liposarc",
"SoftTissue-Liposarc",
"Others"
),CNS4 = CNS4
) ]
# ggpubr::ggboxplot(
# df_check,
# x = "cancer_type", y = "CNS4",
# fill = "cancer_type",
# xlab = FALSE, width = 0.3, legend = "none")
show_group_distribution(
df_check,gvar = "cancer_type",
dvar = "CNS4",
order_by_fun = FALSE,
g_angle = 90,
ylab = "CNS4"
)
Check copy number distribution for the "SoftTissue-Liposarc"
samples.
<- df_abs[cancer_type == "SoftTissue-Liposarc"]$sample
samples
<- readRDS("../data/pcawg_cn_obj.rds")
pcawg_cn_obj <- subset(pcawg_cn_obj@data, sample %in% samples)
cn_dt $segLen <- cn_dt$end - cn_dt$start + 1 cn_dt
Copy number value:
boxplot(cn_dt$segVal, ylab = "Copy number value")
Segment length:
boxplot(cn_dt$segLen, ylab = "Segment length")
<- cn_dt[, .(nAMP = sum(segVal > 2)), by = sample] cn_dt_samp
boxplot(cn_dt_samp$nAMP, ylab = "Number of amplifications")
PART 4: Cancer subtyping and prognosis analysis
In this part, we would like to cluster all tumors based on CN signatures and use the clusters to explore Pan-Cancer heterogeneity.
Data integration
Many raw data have been processed by tools (or collected) and cleaned. This part I mainly describe how to combine them to construct the integrated dataset.
If you care about the data (source) shown below, please check the last section of PART 0.
library(pacman)
p_load(
tidyverse,
data.table,
sigminer,
IDConverter )
Load pre-processed data.
<- readRDS("../data/pcawg_cn_obj.rds")
pcawg_cn_obj <- readRDS("../data/pcawg_cn_sigs_CN176_signature.rds")
pcawg_cn_sig <- readRDS("../data/pcawg_samp_info_sp.rds")
pcawg_samp_info_sp <- pcawg_cn_obj@summary.per.sample
samp_summary $n_loh <- NULL samp_summary
Check if all samples are recorded.
all(pcawg_cn_obj@summary.per.sample$sample %in% pcawg_samp_info_sp$pcawg_specimen_histology_August2016_v9$icgc_specimen_id)
<- lapply(pcawg_samp_info_sp, function(x) {
pcawg_samp_info_sp colnames(x)[1] <- "sample"
x
})<- purrr::reduce(pcawg_samp_info_sp, dplyr::full_join, by = "sample")
pcawg_samp_info <- pcawg_samp_info %>% dplyr::filter(sample %in% pcawg_cn_obj@summary.per.sample$sample)
pcawg_samp_info
rm(pcawg_samp_info_sp)
Refer to PCAWG Mutational Signatures Working Group et al. (2020), some cancer types with small sample size should be combined.
table(pcawg_samp_info$histology_abbreviation)
<- pcawg_samp_info %>%
pcawg_samp_info mutate(
cancer_type = case_when(
startsWith(histology_abbreviation, "Breast") ~ "Breast",
startsWith(histology_abbreviation, "Biliary-AdenoCA") ~ "Biliary-AdenoCA",
%in% c("Bone-Epith", "Bone-Benign") ~ "Bone-Other",
histology_abbreviation startsWith(histology_abbreviation, "Cervix") ~ "Cervix",
%in% c("Myeloid-AML, Myeloid-MDS", "Myeloid-AML, Myeloid-MPN", "Myeloid-MDS", "Myeloid-MPN") ~ "Myeloid-MDS/MPN",
histology_abbreviation TRUE ~ histology_abbreviation
) )
Select important features.
<- pcawg_samp_info %>%
pcawg_samp_info ::select(c(
dplyr"sample", "_PATIENT", "donor_sex", "donor_age_at_diagnosis",
"donor_survival_time", "donor_vital_status", "first_therapy_type",
"tobacco_smoking_history_indicator", "alcohol_history",
"dcc_project_code", "dcc_specimen_type",
"histology_abbreviation", "cancer_type",
"tumour_stage",
"level_of_cellularity"
))colnames(pcawg_samp_info)[2] <- "donor_id"
# Remove duplicated samples
<- pcawg_samp_info[!duplicated(pcawg_samp_info$sample), ] pcawg_samp_info
Get signature activities.
<- readRDS("../data/pcawg_cn_sigs_CN176_activity.rds")
pcawg_activity
<- pcawg_activity$absolute[, lapply(.SD, function(x) if (is.numeric(x)) round(x, digits = 3) else x)]
expo_abs colnames(expo_abs) <- c("sample", paste0("Abs_", colnames(expo_abs)[-1]))
<- pcawg_activity$relative[, lapply(.SD, function(x) if (is.numeric(x)) round(x, digits = 3) else x)]
expo_rel colnames(expo_rel) <- c("sample", paste0("Rel_", colnames(expo_rel)[-1]))
<- dplyr::left_join(expo_abs, expo_rel, by = "sample")
pcawg_activity2 <- dplyr::left_join(pcawg_activity2, pcawg_activity$similarity, by = "sample")
pcawg_activity2
$keep <- pcawg_activity2$similarity > 0.75
pcawg_activity2<- pcawg_activity2
pcawg_activity
rm(pcawg_activity2, expo_abs, expo_rel)
Read tumor purity & ploidy & WGD status.
<- fread("../data/PCAWG/consensus.20170217.purity.ploidy_sp")
dt_pp <- dt_pp[, c("samplename", "purity", "ploidy", "wgd_status")]
dt_pp colnames(dt_pp)[1] <- "sample"
Calculate fusion events in all samples.
<- fread("../data/PCAWG/pcawg3_fusions_PKU_EBI.gene_centric.sp.xena")
fusion <- as.matrix(fusion[, -1])
fusion <- apply(fusion, 2, sum)
fusion <- dplyr::tibble(
fusion sample = names(fusion),
n_fusion = as.numeric(fusion)
)
Load HRD status in all samples.
<- readRDS("../data/PCAWG/pcawg_CHORD.rds")
HRD <- HRD %>%
HRD ::select(icgc_specimen_id, is_hrd, genotype_monoall) %>%
dplyrset_names(c("sample", "hrd_status", "hrd_genotype")) %>%
unique()
Read LOH data.
<- readRDS("../data/pcawg_loh.rds") LOH
Read Chromothripsis data and clean it.
# http://compbio.med.harvard.edu/chromothripsis/
<- fread("../data/PCAWG/PCAWG-ShatterSeek.txt.gz")
chromothripsis <- chromothripsis[, c("icgc_donor_id", "type_chromothripsis", "chromo_label")]
chromothripsis
# Check label
table(chromothripsis$chromo_label)
any(is.na(chromothripsis$chromo_label))
# Summarize
<- chromothripsis[
chromothripsis_summary
, .(n_chromo = sum(chromo_label != "No"),
chromo_type = paste0(na.omit(type_chromothripsis), collapse = "/")
),= icgc_donor_id
by
]
<- merge(chromothripsis_summary, pcawg_full[, c("icgc_donor_id", "icgc_specimen_id")],
chromothripsis_summary by = "icgc_donor_id"
%>% unique()
) $icgc_donor_id <- NULL
chromothripsis_summarycolnames(chromothripsis_summary)[3] <- "sample"
rm(chromothripsis)
Read AmpliconArchitect results.
<- readxl::read_excel("../data/PCAWG/ecDNA.xlsx", skip = 1)
aa1 <- readxl::read_excel("../data/PCAWG/ecDNA.xlsx", sheet = 3)
aa2
table(aa1$amplicon_classification)
table(aa2$sample_classification)
all(aa1$sample_barcode %in% aa2$sample_barcode)
<- aa1 %>%
aa_summary ::group_by(sample_barcode) %>%
dplyr::summarise(
dplyrn_amplicon_BFB = sum(amplicon_classification == "BFB", na.rm = TRUE),
n_amplicon_ecDNA = sum(amplicon_classification == "Circular", na.rm = TRUE),
n_amplicon_HR = sum(amplicon_classification == "Heavily-rearranged", na.rm = TRUE),
n_amplicon_Linear = sum(amplicon_classification == "Linear", na.rm = TRUE),
.groups = "drop"
)
::setDT(aa_summary)
data.table
<- IDConverter::pcawg_simple[, c("submitted_specimen_id", "icgc_specimen_id")]
custom_dt <- custom_dt[startsWith(submitted_specimen_id, "TCGA")]
custom_dt := substr(submitted_specimen_id, 1, 15)]
custom_dt[, submitted_specimen_id
aa_summary[:= ifelse(
, sample startsWith(sample_barcode, "SA"),
convert_pcawg(sample_barcode, from = "icgc_sample_id", to = "icgc_specimen_id"),
convert_custom(sample_barcode, from = "submitted_specimen_id", to = "icgc_specimen_id", dt = custom_dt)
)
]
## Filter out samples not in PCAWG
<- aa_summary[!is.na(sample)][, sample_barcode := NULL]
aa_summary
rm(aa1, aa2, custom_dt)
Read TelomereHunter results.
<- readxl::read_excel("../data/PCAWG/TelomereContent.xlsx")
TC <- TC %>% dplyr::select(c("icgc_specimen_id", "tel_content_log2"))
TC colnames(TC)[1] <- "sample"
::setDT(TC) data.table
<- read_tsv("../data/PCAWG/MAF_Aug31_2016_sorted_A3A_A3B_comparePlus.txt", col_types = cols())
apobec <- apobec %>%
apobec ::select(c(1, 5)) %>%
dplyr::as.data.table() %>%
data.tablesetNames(c("sample", "APOBEC_mutations"))
Combine and save result combined data.
<- purrr::reduce(list(
pcawg_samp_info
pcawg_samp_info,
pcawg_activity,
samp_summary,
dt_pp,
fusion,
HRD,
LOH,
chromothripsis_summary,
aa_summary,
TC,
apobec
),::left_join,
dplyrby = "sample"
)
saveRDS(pcawg_samp_info, file = "../data/pcawg_sample_tidy_info.rds")
## Remove all objects
rm(list = ls())
Clustering with signature activity
Here, we use recent consensus clustering toolkit cola.
We take 2 steps to obtain a robust clustering result:
- We sort all samples by their total signature activities and randomly select
500
samples for running multiple methods provided bycola
at the same time, then pick up the optimal method combination. - We run clustering for all samples by the method combination above and check the cola report to determine the suitable cluster number.
Step 1: select suitable method combination
There are 2 key arguments in cola
clustering function:
top_value_method
: used to extract rows (i.e. signatures here) with top values.partition_method
: used to select partition method.
We try all combinations with randomly selected 500
samples to explore the suitable setting.
library(cola)
library(tidyverse)
<- readRDS("../data/pcawg_cn_sigs_CN176_activity.rds")
act
<- purrr::reduce(
df list(
$absolute,
act$similarity
act
),::left_join,
dplyrby = "sample"
)
<- df %>%
mat filter(similarity > 0.75) %>%
select(sample, starts_with("CNS")) %>%
column_to_rownames("sample") %>%
t()
<- adjust_matrix(mat)
mat_adj # 1 rows have been removed with too low variance (sd < 0.05 quantile)
rownames(mat_adj)
# Select suitable parameters ----------------------------------------------
<- colSums(mat_adj)
ds boxplot(ds)
set.seed(123)
<- sample(names(sort(ds)), 500)
select_samps
boxplot(ds[select_samps])
<- run_all_consensus_partition_methods(mat_adj[, select_samps], top_n = 13, mc.cores = 8, max_k = 10)
rl_samp cola_report(rl_samp, output_dir = "../output/cola_report/pcawg_sigs_500_sampls", mc.cores = 8)
rm(rl_samp)
The cola
output report is very big and isn’t suitable to show, here I only include key figures to determine the parameter setting.
# Cluster number 2
::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-1-1.png") knitr
# Cluster number 3
::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-2-1.png") knitr
# Cluster number 4
::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-3-1.png") knitr
# Cluster number 5
::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-4-1.png") knitr
# Cluster number 6
::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-5-1.png") knitr
# Cluster number 7
::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-6-1.png") knitr
# Cluster number 8
::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-7-1.png") knitr
# Cluster number 9
::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-8-1.png") knitr
# Cluster number 10
::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-9-1.png") knitr
Step 2: select suitable cluster number
From figures above we do clearly see that the skmeans
partition method fits our data and any top_value_method
is okay (because all signatures are used). Finally, we choose combine skmeans
(partition method) and ATC
(top value method introduced firstly by cola
) to run clustering for all samples.
<- run_all_consensus_partition_methods(
final
mat_adj,top_value_method = "ATC",
partition_method = "skmeans",
top_n = 13, mc.cores = 8, max_k = 10
)cola_report(final, output_dir = "../output/cola_report/pcawg_sigs_all_sampls", mc.cores = 8)
saveRDS(final, file = "../data/pcawg_cola_result.rds")
The report can be viewed at here.
Next, we will load the data and visualize to select suitable cluster number.
library(cola)
library(ComplexHeatmap)
library(tidyverse)
<- readRDS("../data/pcawg_cola_result.rds")
cluster_res <- cluster_res["ATC", "skmeans"]
cluster_res cluster_res
A 'ConsensusPartition' object with k = 2, 3, 4, 5, 6, 7, 8, 9, 10.
On a matrix with 13 rows and 2737 columns.
Top rows (13) are extracted by 'ATC' method.
Subgroups are detected by 'skmeans' method.
Performed in total 450 partitions by row resampling.
Best k for subgroups seems to be 2.
Following methods can be applied to this 'ConsensusPartition' object:
[1] "cola_report" "collect_classes"
[3] "collect_plots" "collect_stats"
[5] "colnames" "compare_signatures"
[7] "consensus_heatmap" "dimension_reduction"
[9] "functional_enrichment" "get_anno_col"
[11] "get_anno" "get_classes"
[13] "get_consensus" "get_matrix"
[15] "get_membership" "get_param"
[17] "get_signatures" "get_stats"
[19] "is_best_k" "is_stable_k"
[21] "membership_heatmap" "ncol"
[23] "nrow" "plot_ecdf"
[25] "rownames" "select_partition_number"
[27] "show" "suggest_best_k"
[29] "test_to_known_factors"
Show different statistics along with different cluster number k
, which helps to determine the “best k.”
select_partition_number(cluster_res)
There are many details about all statistical measures shown above at cola vignette. The best k 2
is suggested by cola
package due to its better statistical measures. However, in this practice, I think 5
is more suitable.
I have the following reasons:
- Measure
1 - PAC
reaches convergence atk = 5
. - Measure
Silhouette
reaches local optimum atk = 5
. - Measure
Concordance
reaches local optimum atk = 5
.
I will draw another plot to show why the two k
s are suitable.
collect_classes(cluster_res)
We can see that the probability constitution of class assignment for subgroups for k = 6, 7, 8
are similar to k = 5
.
Draw all plots into one single page.
pdf(file = "../output/cola_all_plots.pdf", width = 16, height = 10)
collect_plots(cluster_res)
dev.off()
Now we get the cluster labels of samples for downstream analysis.
<- get_classes(cluster_res, k = 5)
pcawg_clusters saveRDS(pcawg_clusters, file = "../data/pcawg_clusters.rds")
Heatmap for clustering and genotype/phenotype features
Load tidy sample info.
<- readRDS("../data/pcawg_sample_tidy_info.rds") tidy_info
Clean some info.
<- function(x, p) {
detect_any if (length(p) == 1L) {
<- stringr::str_detect(x, p)
y else {
} <- purrr::map(p, ~ stringr::str_detect(x, .))
y <- purrr::reduce(y, `|`)
y
}is.na(y)] <- FALSE
y[
y
}
<- tidy_info %>%
tidy_info ::mutate(
dplyr# Roughly reassign the staging to TNM stages
# Basically follow 7th TNM staging version, see plot <https://bkimg.cdn.bcebos.com/pic/a8014c086e061d952019ec8773f40ad162d9ca36?x-bce-process=image/watermark,image_d2F0ZXIvYmFpa2U4MA==,g_7,xp_5,yp_5>
#
# Other references:
# https://www.cancer.org/treatment/understanding-your-diagnosis/staging.html
# https://www.cancer.gov/publications/dictionaries/cancer-terms/def/abcd-rating
# https://web.archive.org/web/20081004121942/http://www.oncologychannel.com/prostatecancer/stagingsystems.shtml
# https://baike.baidu.com/item/TNM%E5%88%86%E6%9C%9F%E7%B3%BB%E7%BB%9F/10700513
tumour_stage = case_when(
%in% c("1", "1a", "1b", "A", "I", "IA", "IB", "T1c", "T1N0") | detect_any(tumour_stage, c("T1N0M0", "T1aN0M0", "T1bN0M0", "T2aN0M0")) ~ "I",
tumour_stage %in% c("2", "2a", "2b", "B", "II", "IIA", "IIB", "T3a", "T3aN0") | detect_any(tumour_stage, c("T2bN0M0", "T3N0M0", "T[^34]N1.*M0")) ~ "II",
tumour_stage %in% c("3", "3a", "3b", "3c", "C", "III", "IIIA", "IIIB", "IIIC") | detect_any(tumour_stage, c("N[23].*M0", "T3N1.*M0", "T4.*M0")) ~ "III",
tumour_stage %in% c("4", "IV", "IVA") | detect_any(tumour_stage, "M[^0X]") ~ "IV",
tumour_stage TRUE ~ "Unknown"
),first_therapy_type = ifelse(first_therapy_type == "monoclonal antibodies (for liquid tumours)",
"other therapy", first_therapy_type
) )
Generate annotation data for plotting.
<- tidy_info %>%
anno_df ::filter(keep) %>%
dplyr::select(
dplyr
sample,
donor_age_at_diagnosis, n_of_amp, n_of_del, n_of_vchr, cna_burden, purity, ploidy,
n_LOH, n_fusion, n_chromo, n_amplicon_BFB, n_amplicon_ecDNA, n_amplicon_HR, n_amplicon_Linear,
tel_content_log2,
donor_sex, tumour_stage, wgd_status, hrd_status%>%
) ::mutate_at(
dplyrvars(n_fusion, n_chromo, n_amplicon_BFB, n_amplicon_ecDNA, n_amplicon_HR, n_amplicon_Linear),
~ ifelse(is.na(.), 0, .)
%>%
) ::mutate(hrd_status = ifelse(hrd_status, "hrd", "no_hrd")) %>%
dplyrrename(
Age = donor_age_at_diagnosis,
AMPs = n_of_amp,
DELs = n_of_del,
Affected_Chrs = n_of_vchr,
CNA_Burden = cna_burden,
Purity = purity,
Ploidy = ploidy,
LOHs = n_LOH,
Fusions = n_fusion,
Chromothripsis = n_chromo,
BFBs = n_amplicon_BFB,
ecDNAs = n_amplicon_ecDNA,
HRs = n_amplicon_HR,
Linear_Amplicons = n_amplicon_Linear,
`Telomere_Content(log2)` = tel_content_log2,
Sex = donor_sex,
Tumor_Stage = tumour_stage,
WGD_Status = wgd_status,
HRD_Status = hrd_status
%>%
) ::column_to_rownames("sample")
tibble
saveRDS(anno_df, file = "../data/pcawg_tidy_anno_df.rds")
<- rowAnnotation(foo = anno_text(paste0("CNS", 1:13))) left_annotation
Show heatmap for clusters based on signature activities.
get_signatures(cluster_res,
k = 5, silhouette_cutoff = 0.2, anno = anno_df,
left_annotation = left_annotation
)
* 2674/2737 samples (in 5 classes) remain after filtering by silhouette (>= 0.2).
* cache hash: 4b9a944dfe09dbf61fff67d0357a67aa (seed 888).
* calculating row difference between subgroups by Ftest.
* split rows into 4 groups by k-means clustering.
* 13 signatures (100.0%) under fdr < 0.05, group_diff > 0.
* making heatmaps for signatures.
Save to file.
pdf(file = "../output/pcawg_cluster_heatmap.pdf", width = 22, height = 10, onefile = FALSE)
get_signatures(cluster_res, k = 5, silhouette_cutoff = 0.2, anno = anno_df, left_annotation = left_annotation)
dev.off()
Exploration of patients’ prognosis difference across clusters
Clean data for survival analysis
library(ezcox)
library(tidyverse)
<- readRDS("../data/pcawg_clusters.rds") %>%
cluster_df as.data.frame() %>%
::rownames_to_column("sample")
tibble
<- readRDS("../data/pcawg_sample_tidy_info.rds")
tidy_info
<- tidy_info %>%
df_os ::filter(keep) %>%
dplyr::select(sample, donor_vital_status, donor_survival_time) %>%
dplyr::set_names(c("sample", "os", "time")) %>%
purrrna.omit() %>%
mutate(
os = ifelse(os == "deceased", 1, 0),
time = time / 365
%>%
) left_join(cluster_df, by = "sample")
colnames(df_os)[4] <- "cluster"
# Use the group with minimal CNV level as reference group
$cluster <- paste0("subgroup", df_os$cluster) df_os
Available sample number with OS data:
nrow(df_os)
[1] 1729
Forest plot
It’s pretty easy to visualize the relationship between subgroups and patient’s prognosis with forest plot by R package ezcox
developed by me before.
show_forest(df_os, covariates = "cluster", status = "os", add_caption = FALSE, merge_models = TRUE)
K-M plot
We can also check the OS difference with K-M plot.
library(survival)
library(survminer)
<- survfit(Surv(time, os) ~ cluster, data = df_os)
sfit ggsurvplot(sfit,
pval = TRUE,
fun = "pct",
xlab = "Time (in years)",
# palette = "jco",
legend.title = ggplot2::element_blank(),
legend.labs = paste0("subgroup", 1:5),
break.time.by = 5,
risk.table = TRUE,
tables.height = 0.4
)
The result has similar meaning as forest plot.
The difference of genotype/phenotype measures across clusters
Signatures
Load SBS/DBS/ID signature activities generated from PCAWG Nature Study.
<- read_csv("../data/PCAWG/PCAWG_sigProfiler_SBS_signatures_in_samples.csv")
pcawg_sbs <- read_csv("../data/PCAWG/PCAWG_sigProfiler_DBS_signatures_in_samples.csv")
pcawg_dbs <- read_csv("../data/PCAWG/PCAWG_SigProfiler_ID_signatures_in_samples.csv") pcawg_id
Merge data.
<- purrr::reduce(
df_merged list(
tidy_info,-c(1, 3)] %>% dplyr::rename(sample = `Sample Names`) %>%
pcawg_sbs[, # Drop Possible sequencing artefact associated signatures
::select(-SBS43, -c(SBS45:SBS60)),
dplyr-c(1, 3)] %>% dplyr::rename(sample = `Sample Names`),
pcawg_dbs[, -c(1, 3)] %>% dplyr::rename(sample = `Sample Names`),
pcawg_id[, 1:2] %>% purrr::set_names(c("sample", "cluster"))
cluster_df[,
),::left_join,
dplyrby = "sample"
%>%
) ::filter(!is.na(cluster)) %>%
dplyrmutate(cluster = paste0("subgroup", cluster))
colnames(df_merged) <- gsub("Abs_", "", colnames(df_merged))
Enrichment analysis for copy number signatures
library(sigminer)
<- group_enrichment(
enrich_result_cn
df_merged,grp_vars = "cluster",
enrich_vars = paste0("CNS", 1:14),
co_method = "wilcox.test"
)
$enrich_var <- factor(enrich_result_cn$enrich_var, paste0("CNS", 1:14))
enrich_result_cn
<- show_group_enrichment(
p
enrich_result_cn,fill_by_p_value = TRUE,
cut_p_value = TRUE,
return_list = T
)<- p$cluster + labs(x = NULL, y = NULL)
p + coord_flip() p
Enrichment analysis for SBS signatures
<- colnames(df_merged)[grepl("SBS", colnames(df_merged))]
nm_sbs
<- group_enrichment(
enrich_result_sbs
df_merged,grp_vars = "cluster",
enrich_vars = nm_sbs,
co_method = "wilcox.test"
)
$enrich_var <- factor(enrich_result_sbs$enrich_var, nm_sbs)
enrich_result_sbs
<- show_group_enrichment(
p
enrich_result_sbs,fill_by_p_value = TRUE,
cut_p_value = TRUE,
return_list = T
)<- p$cluster + labs(x = NULL, y = NULL)
p + coord_flip() p
Enrichment analysis for DBS signatures
<- colnames(df_merged)[grepl("DBS", colnames(df_merged))]
nm_dbs
<- group_enrichment(
enrich_result_dbs
df_merged,grp_vars = "cluster",
enrich_vars = nm_dbs,
co_method = "wilcox.test"
)
$enrich_var <- factor(enrich_result_dbs$enrich_var, nm_dbs)
enrich_result_dbs
<- show_group_enrichment(
p
enrich_result_dbs,fill_by_p_value = TRUE,
cut_p_value = TRUE,
return_list = T
)<- p$cluster + labs(x = NULL, y = NULL)
p + coord_flip() p
Enrichment analysis for ID signatures
<- colnames(df_merged)[grepl("^ID[0-9]+", colnames(df_merged))]
nm_id
<- group_enrichment(
enrich_result_id
df_merged,grp_vars = "cluster",
enrich_vars = nm_id,
co_method = "wilcox.test"
)
$enrich_var <- factor(enrich_result_id$enrich_var, nm_id)
enrich_result_id
<- show_group_enrichment(
p
enrich_result_id,fill_by_p_value = TRUE,
cut_p_value = TRUE,
return_list = T
)<- p$cluster + labs(x = NULL, y = NULL)
p + coord_flip() p
Combined COSMIC signature result
Merge all COSMIC signature results.
<- purrr::reduce(
enrich_result_cosmic list(enrich_result_sbs, enrich_result_dbs, enrich_result_id),
rbind )
移除带 NA 和 Inf 的 signatures 结果,然后绘制一个汇总图,对 signatures 进行聚类,以确定 signature 排序。
Word cloud plot for COSMIC signature etiologies
There are so many COSMIC signatures above, it is not easy to summarize the data. Considering all COSMIC signatures are labeled and many of them have been assigned to a specific etiology, here we try to use word cloud plot to summarize the result above.
Load etiologies of COSMIC signatures.
<- purrr::reduce(
cosmic_ets list(
::get_sig_db("SBS")$aetiology %>% tibble::rownames_to_column("sig_name"),
sigminer::get_sig_db("DBS")$aetiology %>% tibble::rownames_to_column("sig_name"),
sigminer::get_sig_db("ID")$aetiology %>% tibble::rownames_to_column("sig_name")
sigminer
),
rbind )
::datatable(cosmic_ets) DT
Generate a data.frame
for plotting. We only keep significantly and positively enriched signatures in each subgroup. We note that descriptions for many signatures are too long, thus we use short names.
<- enrich_result_cosmic %>%
df_ets ::filter(p_value < 0.05 & measure_observed > 1) %>%
dplyr::select(grp1, enrich_var) %>%
dplyr::left_join(cosmic_ets, by = c("enrich_var" = "sig_name")) %>%
dplyr::mutate(
dplyraetiology = dplyr::case_when(
grepl("APOBEC", aetiology) ~ "APOBEC",
grepl("Tobacco", aetiology) ~ "Tobacco",
grepl("clock", aetiology) ~ "Clock",
grepl("Ultraviolet", aetiology, ignore.case = TRUE) ~ "UV",
grepl("homologous recombination", aetiology) ~ "HRD",
grepl("mismatch repair", aetiology) ~ "dMMR",
grepl("Slippage", aetiology) ~ "Slippage",
grepl("base excision repair", aetiology) ~ "dBER",
grepl("NHEJ", aetiology) ~ "NHEJ repair",
grepl("reactive oxygen", aetiology) ~ "ROS",
grepl("Polymerase epsilon exonuclease", aetiology) ~ "Polymerase epsilon mutation",
grepl("eta somatic hypermutation", aetiology) ~ "Polimerase eta hypermutation",
TRUE ~ aetiology
)%>%
) ::count(grp1, aetiology) %>%
dplyr::filter(!aetiology %in% c("Unknown", "Possible sequencing artefact")) dplyr
Plotting.
library(ggwordcloud)
set.seed(42)
ggplot(df_ets, aes(label = aetiology, size = n)) +
geom_text_wordcloud_area(eccentricity = .35, shape = "square") +
scale_size_area(max_size = 14) +
facet_wrap(~grp1) +
theme_bw()
Other integrated variables
<- readRDS("../data/pcawg_tidy_anno_df.rds") %>%
df_others ::rownames_to_column("sample") %>%
tibble::left_join(
dplyr1:2] %>% purrr::set_names(c("sample", "cluster")),
cluster_df[, by = "sample"
%>%
) ::mutate(
dplyrTumor_Stage = dplyr::case_when(
== "I" ~ 1,
Tumor_Stage == "II" ~ 2,
Tumor_Stage == "III" ~ 3,
Tumor_Stage == "IV" ~ 4
Tumor_Stage
),isFemale = Sex == "female",
isWGD = WGD_Status == "wgd",
isHRD = HRD_Status == "hrd",
cluster = paste0("subgroup", cluster)
%>%
) ::select(-c(Sex, WGD_Status, HRD_Status)) dplyr
<- setdiff(colnames(df_others), c("sample", "cluster"))
nm_others
<- group_enrichment(
enrich_result_others
df_others,grp_vars = "cluster",
enrich_vars = nm_others,
co_method = "wilcox.test"
)
$enrich_var <- factor(enrich_result_others$enrich_var, nm_others)
enrich_result_others
<- show_group_enrichment(
p
enrich_result_others,fill_by_p_value = TRUE,
cut_p_value = TRUE,
return_list = T
)<- p$cluster + labs(x = NULL, y = NULL)
p + coord_flip() p
The
HR
here representsHeavily rearrangement
, don’t mix it withHRD
which meansDefective homologous recombination DNA damage repair
.
DNA-repair genes
We collect pathogeniv germline variants and somatic driver mutations in DNA-repair genes. Identification of pathogenic variants in human kinases from the PCAWG Drivers and Functional Interpretation Group (https://dcc.icgc.org/releases/PCAWG/driver_mutations). The file TableS3_panorama_driver_mutations_ICGC_samples.public.tsv.gz reported both somatic and pathogenic germline variants.
choose DNA-repair genes
<- c(
repair.select "ATM", "BRCA1", "BRCA2", "CDK12", "CHEK2",
"FANCA", "FANCC", "FANCD2", "FANCF",
# "MSH2",
"MSH3", "MSH4",
# "MSH5","MSH6",
"PALB2", "POLE", "RAD51B", "TP53"
)
select pathogenic gene and count for each sample
<- dplyr::inner_join(
pcawg_pathogenic read.csv2("../data/TableS3_panorama_driver_mutations_ICGC_samples.public.tsv.gz", sep = "\t"),
read.csv2("../data/pcawg_sample_sheet.tsv", sep = "\t")
%>%
::select(., aliquot_id, icgc_specimen_id, dcc_project_code) %>%
dplyr::rename(., sample_id = "aliquot_id"),
dplyrby = "sample_id"
%>%
) ::select(., gene, icgc_specimen_id) %>%
dplyr::rename(., Sample = "icgc_specimen_id") %>%
dplyrsubset(., gene %in% repair.select) %>%
!duplicated(.), ] %>%
.[::mutate(num = 1) dplyr
Driver genes
We have already observed that there are different mutational processes across different groups. Here we try to go further study if different driver genes activate in different groups.
We obtained coding driver mutations from UCSC Xena database. For known driver genes, we focus on driver gene list obtained from Martínez-Jiménez et al. (2020).
load("../data/pcawg_driver_info.RData")
<- dplyr::left_join(df_others[, c("sample", "cluster")],
df_mut
pcawg_driver_number,by = "sample"
)colnames(df_mut)[3] <- "Driver_variants"
$Driver_variants <- ifelse(is.na(df_mut$Driver_variants), 0, df_mut$Driver_variants)
df_mut
<- pcawg_driver_gene %>%
driver_gene_num ::group_by(sample, ROLE) %>%
dplyr::summarise(driver_genes = n(), .groups = "drop") %>%
dplyr::pivot_wider(names_from = "ROLE", values_from = "driver_genes", values_fill = 0)
tidyr
<- dplyr::left_join(df_mut, driver_gene_num, by = "sample")
df_mut colnames(df_mut)[4:5] <- c("Act_genes", "LoF_genes")
$Act_genes <- ifelse(is.na(df_mut$Act_genes), 0, df_mut$Act_genes)
df_mut$LoF_genes <- ifelse(is.na(df_mut$LoF_genes), 0, df_mut$LoF_genes) df_mut
Also merge into info of each driver gene.
<- df_mut %>%
df_mut2 ::left_join(
dplyr%>%
pcawg_driver_gene ::pivot_wider(names_from = "gene", values_from = "ROLE") %>%
tidyr::mutate_at(vars(-sample), ~ !is.na(.)),
dplyrby = "sample"
)
## Filter out driver genes with < 10 mutation
<- sort(colSums(df_mut2[, -c(1:5)], na.rm = TRUE), decreasing = TRUE)
gene_sum
# 基因按突变数量排序
<- df_mut2 %>% dplyr::select_at(c(colnames(df_mut2)[1:5], names(gene_sum[gene_sum >= 10]))) df_mut2
Save the data.
saveRDS(df_mut2, file = "../data/pcawg_mut_df.rds")
<- setdiff(colnames(df_mut2), c("sample", "cluster"))
nm_mut
<- group_enrichment(
enrich_result_mut
df_mut2,grp_vars = "cluster",
enrich_vars = nm_mut,
co_method = "wilcox.test"
)
Filter out genes insignificant in all subgroups.
<- enrich_result_mut[, .(count = sum(p_value < 0.05)), by = enrich_var]
keep_df <- keep_df[count == 0]$enrich_var
drop_vars
<- enrich_result_mut[!enrich_var %in% drop_vars] enrich_result_mut2
Plotting.
$enrich_var <- factor(enrich_result_mut2$enrich_var, setdiff(nm_mut, drop_vars))
enrich_result_mut2
<- setdiff(nm_mut, drop_vars)[-c(1:3)]
keep_vars <- c(nm_mut[1:3], paste0(keep_vars, " (", gene_sum[keep_vars], ")"))
labels
<- show_group_enrichment(
p
enrich_result_mut2,fill_by_p_value = TRUE,
cut_p_value = TRUE,
return_list = T
)<- p$cluster + labs(x = NULL, y = NULL)
p + coord_flip() + scale_x_discrete(labels = labels) p
rm(list = ls())
PART 5: Inference of copy number signature etiologies
In this part, we would like to explore CN signatures in more details with association analysis and summarize what we get to infer their etiologies.
Let’s merge load all data we got and combine them.
library(tidyverse)
library(sigminer)
<- readRDS("../data/pcawg_sample_tidy_info.rds")
tidy_info <- readRDS("../data/pcawg_tidy_anno_df.rds")
tidy_anno <- readRDS("../data/pcawg_mut_df.rds")
tidy_mut <- read_csv("../data/PCAWG/PCAWG_sigProfiler_SBS_signatures_in_samples.csv")
pcawg_sbs <- read_csv("../data/PCAWG/PCAWG_sigProfiler_DBS_signatures_in_samples.csv")
pcawg_dbs <- read_csv("../data/PCAWG/PCAWG_SigProfiler_ID_signatures_in_samples.csv")
pcawg_id
# Use signature relative contribution for analysis
<- pcawg_sbs[, -c(1, 3)] %>%
pcawg_sbs ::rename(sample = `Sample Names`) %>%
dplyr::mutate(s = rowSums(.[, -1])) %>%
dplyr::mutate_at(vars(starts_with("SBS")), ~ . / s) %>%
dplyr::select(-s) %>%
dplyr# Drop Possible sequencing artefact associated signatures
::select(-SBS43, -c(SBS45:SBS60))
dplyr
<- pcawg_dbs[, -c(1, 3)] %>%
pcawg_dbs ::rename(sample = `Sample Names`) %>%
dplyr::mutate(s = rowSums(.[, -1])) %>%
dplyr::mutate_at(vars(starts_with("DBS")), ~ . / s) %>%
dplyr::select(-s)
dplyr
<- pcawg_id[, -c(1, 3)] %>%
pcawg_id ::rename(sample = `Sample Names`) %>%
dplyr::mutate(s = rowSums(.[, -1])) %>%
dplyr::mutate_at(vars(starts_with("ID")), ~ . / s) %>%
dplyr::select(-s) dplyr
Merge data.
<- purrr::reduce(
df_merged list(
%>%
tidy_info ::filter(keep) %>%
dplyr::select(sample, starts_with("Abs"), starts_with("Rel")),
dplyr%>%
tidy_anno ::rownames_to_column("sample") %>%
tibble::mutate(
dplyrTumor_Stage = dplyr::case_when(
== "I" ~ 1,
Tumor_Stage == "II" ~ 2,
Tumor_Stage == "III" ~ 3,
Tumor_Stage == "IV" ~ 4
Tumor_Stage
),isFemale = Sex == "female",
isWGD = WGD_Status == "wgd",
isHRD = HRD_Status == "hrd"
%>%
) ::select(-c(Sex, WGD_Status, HRD_Status)),
dplyr
pcawg_sbs,
pcawg_dbs,
pcawg_id,%>% dplyr::select(-cluster)
tidy_mut
),::left_join,
dplyrby = "sample"
%>%
) ::mutate_if(
dplyr
is.logical,~ ifelse(is.na(.), FALSE, .)
)
colnames(df_merged) <- gsub("Rel_", "", colnames(df_merged))
saveRDS(df_merged, file = "../data/pcawg_merged_df.rds")
Association analysis for copy number signatures
Copy number signature activity self-association
show_cor(dplyr::select(df_merged, starts_with("CNS")))
We can find that most the signatures are correlated. Therefore, it will add difficulty in understanding etiologies by association analysis.
Visualize with full correlation network
library(see)
library(ggraph)
library(correlation)
<- dplyr::select(df_merged, starts_with("CNS")) %>%
cc correlation(partial = FALSE, method = "spearman") %>%
::rename(r = rho)
dplyrplot(cc)
Copy number signatures and COSMIC signatures
<- show_cor(
p_cosmic data = df_merged,
x_vars = colnames(df_merged)[grepl("^CNS|^SBS[0-9]+", colnames(df_merged))],
y_vars = colnames(df_merged)[grepl("^CNS|^SBS[0-9]+", colnames(df_merged))],
hc_order = FALSE,
test = TRUE,
lab_size = 2,
# type = "lower",
insig = "blank"
) p_cosmic
Just show specified signatures:
<- show_cor(
p_cosmic2 data = df_merged,
x_vars = colnames(df_merged)[grepl("^CNS", colnames(df_merged))],
y_vars = c(
"SBS1", "SBS5",
"SBS2", "SBS13", "DBS11",
"SBS3", "ID6", "ID8",
"SBS4", "DBS2",
"SBS9",
"SBS18",
"DBS7", "ID1", "ID2",
"SBS12", "SBS17a", "SBS17b", "DBS6", "ID5", "ID12"
),hc_order = FALSE,
test = FALSE,
lab_size = 2
)
p_cosmic2
Visualize signature with similar trends with network.
<- list(
siglist c("SBS1", "SBS5"),
c("SBS2", "SBS13", "DBS11"),
c("SBS3", "ID6", "ID8"),
c("SBS4", "DBS2"),
c("SBS9"),
c("SBS18"),
c("DBS7", "ID1", "ID2"),
c("SBS12", "SBS17a", "SBS17b", "DBS6", "ID5", "ID12")
)
for (i in siglist) {
<- correlation(
cc ::select(df_merged, starts_with("CNS")),
dplyr::select_at(df_merged, i),
dplyrpartial = FALSE, method = "spearman"
%>%
) ::rename(r = rho) %>%
dplyrplot()
print(cc)
}
Signature clustering
We can take a try to cluster signatures based on their activity association.
First get the correlation data.
<- show_cor(
p_all data = df_merged,
x_vars = colnames(df_merged)[grepl("^CNS|^SBS|^DBS|^ID[0-9]+", colnames(df_merged))],
y_vars = colnames(df_merged)[grepl("^CNS|^SBS|^DBS|^ID[0-9]+", colnames(df_merged))],
hc_order = FALSE,
test = FALSE,
lab_size = 3
)
Get the correlation matrix and convert it into distance matrix.
<- p_all$cor$cor_mat
p_all_cor <- 1 - p_all_cor
all_dist # Set the NA value to 2
is.na(all_dist)] <- 2 all_dist[
Only include copy number signatures
<- as.dist(all_dist[1:14, 1:14])
dist_cn <- hclust(dist_cn)
hc_cn plot(hc_cn)
Roughly there is 3 clusters.
::fviz_dend(hc_cn, k = 3, rect = T) factoextra
All signatures
<- as.dist(all_dist)
dist_all <- hclust(dist_all)
hc_all plot(hc_all)
Here we manually set 10 clusters for better visualization.
::fviz_dend(
factoextra
hc_all,k = 10,
cex = 0.6, rect = F,
type = "circular"
+ theme_void() +
) theme(
panel.border = element_blank()
)
# theme(
# panel.background = element_rect(fill = "transparent", colour = NA),
# plot.background = element_rect(fill = "transparent", colour = NA),
# panel.grid = element_blank(),
# panel.border = element_blank(),
# plot.margin = unit(c(0, 0, 0, 0), "null"),
# panel.margin = unit(c(0, 0, 0, 0), "null"),
# axis.ticks = element_blank(),
# axis.text = element_blank(),
# axis.title = element_blank(),
# axis.line = element_blank(),
# legend.position = "none",
# axis.ticks.length = unit(0, "null"),
# axis.ticks.margin = unit(0, "null"),
# legend.margin = unit(0, "null")
# )
Association between CNS and genotype/phenotype features
Association between CNS and continuous variables
<- show_cor(
p_cn_others data = df_merged,
x_vars = colnames(df_merged)[16:29],
y_vars = colnames(df_merged)[c(30:44, 125:127)],
lab_size = 3
) p_cn_others
Visualize it with correlation network one by one.
<- colnames(df_merged)[c(30:44, 125:127)]
var_names <- df_merged[, var_names] %>%
df_mut ::rename(
dplyrTelomere_Content = `Telomere_Content(log2)`
)<- colnames(df_mut)
var_names
# dir.create("../output/cor_network")
for (i in var_names) {
# pdf(file.path("../output/cor_network", paste0(i, ".pdf")), width = 6, height = 5)
print(i)
correlation(
drop = FALSE],
df_mut[, i, colnames(df_merged)[16:29]],
df_merged[, partial = FALSE, method = "spearman"
%>%
) ::rename(r = rho) %>%
dplyrplot() -> p
print(p)
# dev.off()
}
[1] "Age"
[1] "AMPs"
[1] "DELs"
[1] "Affected_Chrs"
[1] "CNA_Burden"
[1] "Purity"
[1] "Ploidy"
[1] "LOHs"
[1] "Fusions"
[1] "Chromothripsis"
[1] "BFBs"
[1] "ecDNAs"
[1] "HRs"
[1] "Linear_Amplicons"
[1] "Telomere_Content"
[1] "Driver_variants"
[1] "Act_genes"
[1] "LoF_genes"
Association between CNS and discrete variables
Here we directly compare the CNS activity between different discrete variable groups.
Here 3 variables including WGD_Status
, HRD_Status
and Tumor_Stage
are included.
<- tidy_info %>%
df_ds2 ::filter(keep) %>%
dplyr::select(sample, starts_with("Abs")) %>%
dplyr::mutate_if(is.numeric, ~ log2(. + 1)) %>%
dplyr::set_names(c("sample", paste0("CNS", 1:14))) %>%
purrr::left_join(
dplyr%>%
tidy_anno ::rownames_to_column("sample") %>%
tibble::select(sample, WGD_Status, HRD_Status, Tumor_Stage),
dplyrby = "sample"
%>%
) ::mutate(Tumor_Stage = ifelse(Tumor_Stage == "Unknown", NA, Tumor_Stage))
dplyr
<- df_ds2 %>%
df_ds ::pivot_longer(
tidyrstarts_with("CNS"),
names_to = "sig_name",
values_to = "activity"
)
library(ggpubr)
For WGD.
ggboxplot(
df_ds,x = "sig_name", y = "activity",
color = "WGD_Status",
palette = "jco", width = .3,
xlab = FALSE, ylab = "log2(Activity)",
outlier.size = 0.05
+
) stat_compare_means(aes(group = WGD_Status), label = "p.signif") +
rotate_x_text()
CNS3 and CNS9 are highly associated with WGD.
For HRD.
ggboxplot(
%>%
df_ds ::mutate(HRD_Status = factor(HRD_Status, c("no_hrd", "hrd"))),
dplyrx = "sig_name", y = "activity",
color = "HRD_Status",
palette = "jco", width = .3,
xlab = FALSE, ylab = "log2(Activity)",
outlier.size = 0.05
+
) stat_compare_means(aes(group = HRD_Status), label = "p.signif") +
rotate_x_text()
CNS14 is highly associated with HRD status.
See supplementary analysis for HRD and WGD prediction analysis.
For tumor stage.
ggboxplot(
%>% dplyr::filter(!is.na(Tumor_Stage)),
df_ds x = "sig_name", y = "activity",
color = "Tumor_Stage",
palette = "jco", width = .3,
xlab = FALSE, ylab = "log2(Activity)",
outlier.size = 0.05
+
) stat_compare_means(aes(group = Tumor_Stage),
label = "p.signif",
method = "anova"
+
) rotate_x_text()
Association between CNS and driver gene mutation
<- get_sig_feature_association(
asso_res
df_merged,cols_to_sigs = paste0("CNS", 1:14),
cols_to_features = colnames(df_merged)[128:291],
method_ca = wilcox.test,
min_n = 0
)
<- get_tidy_association(asso_res) asso_res_tidy
Plotting.
show_sig_feature_corrplot(
asso_res_tidy,sig_orders = paste0("CNS", 1:14),
p_val = 0.05,
breaks_count = NA,
ylab = NULL,
xlab = NULL
+ ggpubr::rotate_x_text(0, hjust = 0.5) +
) labs(color = "Difference\nin means of\nactivity") +
::rotate_x_text() ggpubr
Association between CNS and patients’ prognosis
Get tidy data fro survival analysis.
library(ezcox)
<- readRDS("../data/pcawg_clusters.rds") %>%
cluster_df as.data.frame() %>%
::rownames_to_column("sample")
tibble
<- tidy_info %>%
df_surv ::filter(keep) %>%
dplyr::select(
dplyr
sample, cancer_type, donor_survival_time, donor_vital_status,::starts_with("Abs_")
dplyr%>%
) ::left_join(
dplyr
cluster_df,by = "sample"
%>%
) na.omit() %>%
::rename(
dplyrtime = donor_survival_time,
status = donor_vital_status,
cluster = class
%>%
) ::mutate(
dplyrstatus = ifelse(status == "deceased", 1, 0)
%>%
) ::rename_at(vars(starts_with("Abs_")), ~ sub("Abs_", "", .)) %>%
dplyr::mutate_at(vars(starts_with("CNS")), ~ . / 10) dplyr
We divide all signature activity by 10 to explore the HR change per 10 records contributed by a signature.
Unicox analysis for CNS
Here we focus on the association between each CNS and patients’ overall survival time.
<- show_forest(df_surv,
p1 covariates = paste0("CNS", 1:14),
merge_models = TRUE, add_caption = FALSE, point_size = 2
) p1
Multivariate cox analysis for CNS
We also build a multivariate cox model to explore if all CNSs are included, which is more important?
<- show_forest(df_surv,
p2 covariates = "CNS1", controls = paste0("CNS", 2:14),
merge_models = TRUE, add_caption = FALSE, point_size = 2
) p2
Control the cancer types, and run Cox for each signatures:
<- show_forest(df_surv,
p3 covariates = paste0("CNS", 1:14), controls = "cancer_type",
vars_to_show = paste0("CNS", 1:14),
merge_models = TRUE, add_caption = FALSE, point_size = 2,
) p3
Control the cancer types, and run Cox for all signatures in one model:
<- show_forest(df_surv,
p4 covariates = "CNS1", controls = c(paste0("CNS", 2:14), "cancer_type"),
vars_to_show = paste0("CNS", 1:14),
merge_models = TRUE, add_caption = FALSE, point_size = 2,
) p4
Signature similarity comparison between different types of signatures
The similarity between signatures is a key parameter, it determines how hard it is to distinguish between signatures and the precision to assign the contribution to a signature. If you have read the paper Maura et al. (2019), you may know that 2 of 3 main issues proposed by the authors are actually because of high similarity between mutational signatures:
- the ambiguous signature assignment that occurs when different combinations of signatures can explain equally well the same mutational catalog.
- bleeding of signatures that signatures present in only part of the set are also erroneously assigned to the entire set.
In this section, we explore the similarity of CNSs discovered in this study and known COSMIC signatures.
Load data.
<- readRDS("../data/pcawg_cn_sigs_CN176_signature.rds")
CNS <- get_sig_db()
SBSv2 <- get_sig_db("SBS")
SBS <- get_sig_db("DBS")
DBS <- get_sig_db("ID") ID
Get signature similarity in each signature type.
<- get_sig_similarity(CNS, CNS)$similarity
sim_CNS <- get_sig_similarity(SBSv2$db, SBSv2$db)$similarity
sim_SBSv2 <- get_sig_similarity(SBS$db, SBS$db)$similarity
sim_SBS <- get_sig_similarity(DBS$db, DBS$db)$similarity
sim_DBS <- get_sig_similarity(ID$db, ID$db)$similarity sim_ID
Only use upper matrix to get similarity between one signature and non-self signatures.
<- sim_CNS[upper.tri(sim_CNS)]
a <- sim_SBSv2[upper.tri(sim_SBSv2)]
b <- sim_SBS[upper.tri(sim_SBS)]
c <- sim_DBS[upper.tri(sim_DBS)]
d <- sim_ID[upper.tri(sim_ID)]
e
<- dplyr::tibble(
df_sim sig_type = rep(
c("CNS", "SBSv2", "SBS", "DBS", "ID"),
c(length(a), length(b), length(c), length(d), length(e))
),similarity = c(a, b, c, d, e)
)
See header rows:
head(df_sim)
# A tibble: 6 x 2
sig_type similarity
<chr> <dbl>
1 CNS 0.34
2 CNS 0
3 CNS 0
4 CNS 0.112
5 CNS 0.018
6 CNS 0
Let’s visualize it and use ‘CNS’ as reference group to run statistical test (‘wilcox.test’ method).
library(ggpubr)
ggboxplot(df_sim,
x = "sig_type", y = "similarity",
color = "sig_type",
xlab = FALSE,
ylab = "Similarity between signatures",
legend = "none",
palette = "jco",
width = 0.3,
outlier.size = 0.05
+
) # stat_compare_means(method = "anova") +
stat_compare_means(
label = "p.signif", method = "wilcox.test",
ref.group = "CNS"
)
We can see that signature type CNS has lowest similarity level and CNS signatures are not likely similar to others. CNS has much lower similarity level than all other signature types except DBS.
Data summary and inference of signature etiologies
See our manuscript for the inference.
PART 6: Benchmark analysis
To evaluate the performance of our method across different platform and copy number calling tools. We carried out benchmark analysis in prostate cancer.
Compare the copy number calling algorithms
For WGS, we select copy number from ABSOLUTE and ACEseq. For WES, we select copy number from ABSOLUTE and ASCAT.
<- readRDS("../data/benchmark/tally_SNP_ABSOLUTE_matrix.rds")
tally_SNP_ABSOLUTE <- readRDS("../data/benchmark/tally_SNP_ASCAT2_matrix.rds")
tally_SNP_ASCAT2
# Compare different CNV algorithms in SNP
<- intersect(rownames(tally_SNP_ABSOLUTE), rownames(tally_SNP_ASCAT2))
SNP_sample <- subset(tally_SNP_ABSOLUTE, rownames(tally_SNP_ABSOLUTE) %in% SNP_sample)
tally_SNP_ABSOLUTE <- subset(tally_SNP_ASCAT2, rownames(tally_SNP_ASCAT2) %in% SNP_sample)
tally_SNP_ASCAT2 <- sapply(SNP_sample, function(i) {
sim_profile.SNP .1 <- tally_SNP_ASCAT2[i, ]
dt1.1 <- tally_SNP_ABSOLUTE[i, ]
dt2:::cosine(dt1.1, dt2.1)
sigminer%>%
}) subset(. > 0.4)
median(sim_profile.SNP)
[1] 0.8500122
hist(sim_profile.SNP,
breaks = 50, xlab = "",
main = "Copy number profile similarity from two methods(SNP)", xlim = range(0, 1)
)abline(v = 0.8500122, col = "red", lty = 2)
We compared the signature between tools in SNP.
<- readRDS("../data/benchmark/SNP_ABSOLUTE_SP_highmatch.rds")
SNP_ABSOLUTE_SP_sigs <- readRDS("../data/benchmark/SNP_ASCAT2_SP_highmatch.rds")
SNP_ASCAT2_SP_sigs colnames(SNP_ABSOLUTE_SP_sigs$solution_list$S4$Signature.norm) <- paste0("SNP_ABSOLUTE_sigs", seq(1:4))
colnames(SNP_ASCAT2_SP_sigs$solution_list$S4$Signature.norm) <- paste0("SNP_ASCAT_sigs", seq(1:4))
<- get_sig_similarity(SNP_ABSOLUTE_SP_sigs$solution_list$S4, SNP_ASCAT2_SP_sigs$solution_list$S4) sim
<- pheatmap::pheatmap(sim$similarity, cluster_cols = F, cluster_rows = F, display_numbers = TRUE) p
<- readRDS("../data/benchmark/tally_WGS_ABSOLUTE_matrix.rds")
tally_WGS_ABSOLUTE <- readRDS("../data/benchmark/tally_WGS_aceseq_matrix.rds")
tally_WGS_aceseq
# Compare different CNV algorithms in WGS
<- intersect(rownames(tally_WGS_ABSOLUTE), rownames(tally_WGS_aceseq))
WGS_sample <- subset(
tally_WGS_ABSOLUTE
tally_WGS_ABSOLUTE,rownames(tally_WGS_ABSOLUTE) %in% WGS_sample
)<- subset(tally_WGS_aceseq, rownames(tally_WGS_aceseq) %in% WGS_sample)
tally_WGS_aceseq <- sapply(WGS_sample, function(i) {
sim_profile.WGS .1 <- tally_WGS_ABSOLUTE[i, ]
dt1.1 <- tally_WGS_aceseq[i, ]
dt2:::cosine(dt1.1, dt2.1)
sigminer%>%
}) subset(. > 0.4)
hist(sim_profile.WGS,
breaks = 50, xlab = "",
main = "Copy number profile similarity from two methods(WGS)", xlim = range(0, 1)
)abline(v = 0.9725061, col = "red", lty = 2)
We compared the signature between tools in WGS.
<- readRDS("../data/benchmark/WGS_ABSOLUTE_SP_highmatch.rds")
WGS_ABSOLUTE_SP_sigs <- readRDS("../data/benchmark/wgs_aceseq_sig.rds")
WGS_aceseq_SP_sigs colnames(WGS_ABSOLUTE_SP_sigs$solution_list$S4$Signature.norm) <- paste0("WGS_ABSOLUTE_sigs", seq(1:4))
colnames(WGS_aceseq_SP_sigs$Signature.norm) <- paste0("WGS_ACEseq_sigs", seq(1:4))
<- get_sig_similarity(WGS_ABSOLUTE_SP_sigs$solution_list$S4, WGS_aceseq_SP_sigs) sim
<- pheatmap::pheatmap(sim$similarity, cluster_cols = F, cluster_rows = F, display_numbers = TRUE) p
Compare the data platforms
CNA signatures have been extracted independently from WGS, WES and SNP array derived prostate cancer CNA profiles. We compare the 4 copy number signatures extracted by Sigprofiler platform.
SNP vs WGS
<- readRDS("../data/benchmark/SNP_ABSOLUTE_SP_source.rds")
SNP_sigs_source <- readRDS("../data/benchmark/WGS_ABSOLUTE_SP_source.rds")
WGS_sigs_source <- readRDS("../data/benchmark/WES_FACETS_SP_source.rds")
WES_sigs_source
colnames(SNP_sigs_source$solution_list$S4$Signature.norm) <- paste0("SNP_sigs", seq(1:4))
colnames(WGS_sigs_source$solution_list$S4$Signature.norm) <- paste0("WGS_sigs", seq(1:4))
colnames(WES_sigs_source$solution_list$S4$Signature.norm) <- paste0("WES_sigs", seq(1:4))
<- get_sig_similarity(SNP_sigs_source$solution_list$S4, WGS_sigs_source$solution_list$S4) sim
<- pheatmap::pheatmap(sim$similarity, cluster_cols = F, cluster_rows = F, display_numbers = TRUE) p
SNP vs WES
<- get_sig_similarity(SNP_sigs_source$solution_list$S4, WES_sigs_source$solution_list$S4)
sim <- pheatmap::pheatmap(
p $similarity,
simcluster_cols = F,
cluster_rows = F,
display_numbers = TRUE,
legend_breaks = c(0.2, 0.4, 0.6, 0.8)
)
### WES vs WGS
<- get_sig_similarity(WES_sigs_source$solution_list$S4, WGS_sigs_source$solution_list$S4)
sim <- pheatmap::pheatmap(sim$similarity, cluster_cols = F, cluster_rows = F, display_numbers = TRUE) p
Fit PCAWG to TCGA
In this part, we fit PCAWG to TCGA signatures.
library(sigminer)
<- readRDS("../data/TCGA/tcga_cn_sigs_CN176_signature.rds")
tcga_cn_sigs_CN176_signature <- readRDS("../data/pcawg_cn_tally_X.rds")
pcawg_cn_tally_X <- t(pcawg_cn_tally_X$nmf_matrix)
pcawg_matrix <- sig_fit_bootstrap_batch(
pcawg_fit_qp
pcawg_matrix,sig = tcga_cn_sigs_CN176_signature,
methods = c("QP"),
n = 100L,
min_count = 1L,
p_val_thresholds = c(0.05),
use_parallel = 12,
type = c("absolute")
)saveRDS(pcawg_fit_qp, file = "../data/pcawg_fit_qp.rds")
observed segment catalog profile
<- readRDS("../data/pcawg_fit_qp.rds")
pcawg_fit_qp <- readRDS("../data/pcawg_sample_tidy_info.rds")
pcawg_tcga_sample <- readRDS("../data/pcawg_tcga_type.rds") %>% distinct()
pcawg_tcga <- subset(
choose_fit_sample "cosine"]],
pcawg_fit_qp[[%in% pcawg_tcga_sample$sample
sample
)<- "SP112827"
s <- readRDS("../data/pcawg_cn_tally_X.rds")
tally_x <- readRDS("../data/TCGA/tcga_cn_sigs_CN176_signature.rds")
tcga_sig <- readRDS("../data/pcawg_cn_sigs_CN176_signature.rds") pcawg_sig
<- show_catalogue(t(tally_x$nmf_matrix),
p samples = s,
style = "cosmic", mode = "copynumber", method = "X",
# by_context = TRUE, font_scale = 0.7,
normalize = "row"
) p
reconstructed catalog profile
<- pcawg_fit_qp$expo %>%
pcawg_fit_expo subset(., type == "optimal") %>%
::select(., sample, sig, exposure) %>%
dplyr::dcast(., sig ~ sample) %>%
reshape2::column_to_rownames(., var = "sig") %>%
tibbleas.matrix()
<- tcga_sig$Signature.norm %*% pcawg_fit_expo reconstructed_mat
<- show_catalogue(reconstructed_mat,
p samples = s,
style = "cosmic", mode = "copynumber", method = "X",
normalize = "row"
) p
The similarity of observed profile of reconstructed profile.
cosine(reconstructed_mat[, s], t(tally_x$nmf_matrix)[, s])
[1] 0.9695092
<- show_sig_profile(tcga_sig,
p sig_names = paste0("Sig", c(1, 2, 10, 13)),
style = "cosmic", mode = "copynumber", method = "X",
) p
<- readRDS("../data/pcawg_cn_sigs_CN176_signature.rds")
pcawg_sig <- show_sig_profile(
p
pcawg_sig,sig_names = factor(paste0("CNS", c(2, 3, 7, 8, 10, 11)),
levels = paste0("CNS", c(10, 3, 2, 8, 7, 11))
),style = "cosmic",
mode = "copynumber",
method = "X"
) p
Supplementary Analyses
In this part, we will add some supplementary analyses.
CNS validation
TCGA CNS
TCGA has ~10, 000 tumors and we can obtain allele specific copy number data from GDC portal, so it is the good resource to validate the CNS discovered in PCAWG.
Call signatures.
PBS
file content:
#PBS -l walltime=1000:00:00
#PBS -l nodes=1:ppn=12
#PBS -S /bin/bash
#PBS -l mem=100gb
#PBS -j oe
#PBS -M w_shixiang@163.com
#PBS -q pub_fast
# Please set PBS arguments above
cd /public/home/wangshx/wangshx/PCAWG-TCGA
module load apps/R/3.6.1
echo Extracting TCGA signatures...
# Following are commands/scripts you want to run
# If you do not know how to set this, please check README.md file
Rscript call_tcga.R
R script content:
library(sigminer)
<- readRDS("tcga_cn_tally_X.rds")
tally_X
sigprofiler_extract(tally_X$nmf_matrix,
output = "TCGA_CN176X",
range = 2:30,
nrun = 100,
cores = 36,
init_method = "random",
is_exome = FALSE,
use_conda = TRUE)
Metadata:
THIS FILE CONTAINS THE METADATA ABOUT SYSTEM AND RUNTIME
-------System Info-------
Operating System Name: Linux
Nodename: node34
Release: 3.10.0-693.el7.x86_64
Version: #1 SMP Tue Aug 22 21:09:27 UTC 2017
-------Python and Package Versions-------
Python Version: 3.8.5
Sigproextractor Version: 1.0.17
SigprofilerPlotting Version: 1.1.8
SigprofilerMatrixGenerator Version: 1.1.20
Pandas version: 1.1.2
Numpy version: 1.19.2
Scipy version: 1.5.2
Scikit-learn version: 0.23.2
--------------EXECUTION PARAMETERS--------------
INPUT DATA
input_type: matrix
output: TCGA_CN176X
input_data: /tmp/RtmphYcBmk/dir852158ab0364/sigprofiler_input.txt
reference_genome: GRCh37
context_types: SBS176
exome: False
NMF REPLICATES
minimum_signatures: 2
maximum_signatures: 30
NMF_replicates: 100
NMF ENGINE
NMF_init: random
precision: single
matrix_normalization: gmm
resample: True
seeds: random
min_NMF_iterations: 10,000
max_NMF_iterations: 1,000,000
NMF_test_conv: 10,000
NMF_tolerance: 1e-15
CLUSTERING
clustering_distance: cosine
EXECUTION
cpu: 36; Maximum number of CPU is 36
gpu: False
Solution Estimation
stability: 0.8
min_stability: 0.2
combined_stability: 1.0
COSMIC MATCH
opportunity_genome: GRCh37
nnls_add_penalty: 0.05
nnls_remove_penalty: 0.01
initial_remove_penalty: 0.05
de_novo_fit_penalty: 0.02
refit_denovo_signatures: False
-------Analysis Progress-------
[2020-11-29 17:39:12] Analysis started:
##################################
[2020-11-29 17:39:18] Analysis started for SBS176. Matrix size [176 rows x 10851 columns]
[2020-11-29 17:39:18] Normalization GMM with cutoff value set at 17600
[2020-11-29 19:12:03] SBS176 de novo extraction completed for a total of 2 signatures!
Execution time:1:32:45
[2020-11-29 23:58:10] SBS176 de novo extraction completed for a total of 3 signatures!
Execution time:4:46:06
[2020-11-30 08:32:21] SBS176 de novo extraction completed for a total of 4 signatures!
Execution time:8:34:11
[2020-11-30 19:51:18] SBS176 de novo extraction completed for a total of 5 signatures!
Execution time:11:18:56
[2020-12-01 07:10:18] SBS176 de novo extraction completed for a total of 6 signatures!
Execution time:11:18:59
[2020-12-01 21:14:24] SBS176 de novo extraction completed for a total of 7 signatures!
Execution time:14:04:06
[2020-12-02 15:48:43] SBS176 de novo extraction completed for a total of 8 signatures!
Execution time:18:34:19
[2020-12-03 12:53:25] SBS176 de novo extraction completed for a total of 9 signatures!
Execution time:21:04:41
[2020-12-04 11:25:25] SBS176 de novo extraction completed for a total of 10 signatures!
Execution time:22:31:59
[2020-12-05 10:20:15] SBS176 de novo extraction completed for a total of 11 signatures!
Execution time:22:54:50
[2020-12-06 11:36:03] SBS176 de novo extraction completed for a total of 12 signatures!
Execution time:1 day, 1:15:48
[2020-12-07 14:43:31] SBS176 de novo extraction completed for a total of 13 signatures!
Execution time:1 day, 3:07:28
[2020-12-09 01:15:22] SBS176 de novo extraction completed for a total of 14 signatures!
Execution time:1 day, 10:31:50
[2020-12-10 08:02:39] SBS176 de novo extraction completed for a total of 15 signatures!
Execution time:1 day, 6:47:17
[2020-12-11 16:05:31] SBS176 de novo extraction completed for a total of 16 signatures!
Execution time:1 day, 8:02:51
[2020-12-13 02:19:40] SBS176 de novo extraction completed for a total of 17 signatures!
Execution time:1 day, 10:14:09
[2020-12-14 14:56:13] SBS176 de novo extraction completed for a total of 18 signatures!
Execution time:1 day, 12:36:32
[2020-12-16 02:39:41] SBS176 de novo extraction completed for a total of 19 signatures!
Execution time:1 day, 11:43:27
[2020-12-18 02:54:09] SBS176 de novo extraction completed for a total of 20 signatures!
Execution time:2 days, 0:14:28
[2020-12-19 20:20:57] SBS176 de novo extraction completed for a total of 21 signatures!
Execution time:1 day, 17:26:47
[2020-12-21 19:30:07] SBS176 de novo extraction completed for a total of 22 signatures!
Execution time:1 day, 23:09:10
[2020-12-24 00:44:41] SBS176 de novo extraction completed for a total of 23 signatures!
Execution time:2 days, 5:14:34
[2020-12-25 19:03:07] SBS176 de novo extraction completed for a total of 24 signatures!
Execution time:1 day, 18:18:25
[2020-12-27 07:50:06] SBS176 de novo extraction completed for a total of 25 signatures!
Execution time:1 day, 12:46:58
[2020-12-29 04:40:22] SBS176 de novo extraction completed for a total of 26 signatures!
Execution time:1 day, 20:50:16
[2020-12-30 12:35:04] SBS176 de novo extraction completed for a total of 27 signatures!
Execution time:1 day, 7:54:41
[2020-12-31 19:04:04] SBS176 de novo extraction completed for a total of 28 signatures!
Execution time:1 day, 6:29:00
[2021-01-02 04:50:07] SBS176 de novo extraction completed for a total of 29 signatures!
Execution time:1 day, 9:46:02
[2021-01-03 14:15:32] SBS176 de novo extraction completed for a total of 30 signatures!
Execution time:1 day, 9:25:25
[2021-01-03 14:48:21] Analysis ended:
-------Job Status-------
Analysis of mutational signatures completed successfully!
Total execution time: 34 days, 21:09:09
Results can be found in: TCGA_CN176X folder
Next we determine the signature number.
<- sigprofiler_import("../SP/TCGA_CN176X/", order_by_expo = TRUE, type = "all")
SP_TCGA saveRDS(SP_TCGA, file = "../data/TCGA/tcga_cn_solutions_sp.rds")
<- readRDS("../data/TCGA/tcga_cn_solutions_sp.rds") SP_TCGA
show_sig_number_survey(
$all_stats %>%
SP_TCGA::rename(
dplyrs = `Stability (Avg Silhouette)`,
e = `Mean Cosine Distance`
%>%
) ::mutate(
dplyrSignatureNumber = as.integer(gsub("[^0-9]", "", SignatureNumber))
),x = "SignatureNumber",
left_y = "s", right_y = "e",
left_name = "Stability (Avg Silhouette)",
right_name = "Mean Cosine Distance",
highlight = 20
)
We pick up 20 signatures for TCGA data due to its relatively high stability and low distance.
<- SP_TCGA$solution_list$S20
tcga_sigs saveRDS(tcga_sigs, file = "../data/TCGA/tcga_cn_sigs_CN176_signature.rds")
Here we try know how well each sample catalog profile can be constructed from the signature combination.
Firstly we get the activity data.
<- readRDS("../data/TCGA/tcga_cn_sigs_CN176_signature.rds")
tcga_sigs <- list(
tcga_act absolute = get_sig_exposure(tcga_sigs, type = "absolute"),
relative = get_sig_exposure(tcga_sigs, type = "relative", rel_threshold = 0),
similarity = tcga_sigs$Stats$samples[, .(Samples, `Cosine Similarity`)]
)
colnames(tcga_act$similarity) <- c("sample", "similarity")
saveRDS(tcga_act, file = "../data/TCGA/tcga_cn_sigs_CN176_activity.rds")
Check the stat summary of reconstructed Cosine similarity.
summary(tcga_act$similarity$similarity)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.4350 0.9010 0.9430 0.9318 0.9710 0.9980
hist(tcga_act$similarity$similarity, breaks = 100, xlab = "Reconstructed similarity", main = NA)
Let’s go further check the similarities between PCAWG CNS and TCGA CNS.
<- readRDS("../data/pcawg_cn_sigs_CN176_signature.rds") pcawg_sigs
Run similarity analysis.
library(sigminer)
<- get_sig_similarity(pcawg_sigs, tcga_sigs) sim
::pheatmap(sim$similarity, cluster_rows = FALSE, cluster_cols = FALSE, display_numbers = TRUE) pheatmap
From the pheatmap, we can see that:
- Most of PCAWG copy number signatures can be matched to TCGA copy number signatures with
>0.8
similarity. - CNS4, CNS5, CNS6 and CNS14 from PCAWG has lower similarity to TCGA CNS. This indicates either they are not shared signatures or they have relatively different patterns in this two dataset.
It is not surprising to see that TCGA has more copy number signatures than PCAWG due to about 4 times more sample number.
This data remind us it is not easy to get perfect matched copy number signatures between different datasets due to the following reasons:
- TCGA copy number data are generated from SNP array and PCAWG copy number data are generated from WGS.
- They also use different copy number calling softwares. Considering PCAWG data are consensus copy number data from many known calling software, the quality of PCAWG data is much higher than that of TCGA data.
Let’s get the summary of match similarity.
summary(apply(sim$similarity, 1, max))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.4670 0.6625 0.8380 0.7727 0.8705 0.9620
TCGA CNS labeling and analysis
To better integrate PCAWG and TCGA result, here we use PCAWG CNS as reference and label TCGA copy number signatures by following the rule:
- Define the signature similarity level: >=0.8 (High, H), >=0.51(Medium, M), < 0.51(UNMATCH).
Based on the rule, we can get the labels for TCGA signatures according to the similarity matrix heatmap.
<- readr::read_csv("../data/PCAWG_TCGA_CNS_match.csv")
label_df $label <- paste(label_df$TCGA, label_df$match, sep = "-") label_df
::datatable(label_df) DT
TCGA survival analysis
Obtain signature activity and survival data.
Update TCGA signature names.
<- label_df$label
label_map names(label_map) <- label_df$TCGA
colnames(tcga_act$absolute)[-1] <- label_map[colnames(tcga_act$absolute)[-1]]
colnames(tcga_act$relative)[-1] <- label_map[colnames(tcga_act$relative)[-1]]
Do same operation like PCAWG: keep only samples with >75% reconstructed similarity and divide activity value by 10.
<- data.table::copy(tcga_act$absolute)
df_act <- df_act %>%
df_act ::mutate(dplyr::across(where(is.numeric), ~ . / 10))
dplyr
<- tcga_act$similarity %>%
keep_tcga_samples ::filter(similarity > 0.75) %>%
dplyr::pull(sample) dplyr
Read phenotype data.
# tcga_act <- readRDS(file = "../data/TCGA/tcga_cn_sigs_CN176_activity.rds")
<- readRDS("../data/TCGA/tcga_cli.rds")
tcga_cli <- readRDS("../data/TCGA/TCGA_type_info.rds") tcga_type
Merge the data.
library(purrr)
library(tidyverse)
<- reduce(list(
tcga_surv_df
tcga_type,
tcga_cli,
df_actby = "sample") %>%
), left_join, filter(sample %in% keep_tcga_samples) %>%
filter(!is.na(cancer_type))
Same as for PCAWG data, we build two multi-variable cox model here. However, we handle “OS” and “PFI” data here.
Control the cancer types, and run Cox for each signatures:
library(ezcox)
<- show_forest(tcga_surv_df,
p1 covariates = label_df$label, controls = "cancer_type",
vars_to_show = label_df$label,
merge_models = TRUE, add_caption = FALSE, point_size = 2,
time = "OS.time", status = "OS"
) p1
Control the cancer types, and run Cox for all signatures in one model:
<- show_forest(tcga_surv_df,
p2 covariates = label_df$label[1], controls = c(label_df$label[-1], "cancer_type"),
vars_to_show = label_df$label,
merge_models = TRUE, add_caption = FALSE, point_size = 2,
time = "OS.time", status = "OS"
) p2
Control the cancer types, and run Cox for each signatures:
<- show_forest(tcga_surv_df %>% filter(!is.na(PFI.time)),
p3 covariates = label_df$label, controls = "cancer_type",
vars_to_show = label_df$label,
merge_models = TRUE, add_caption = FALSE, point_size = 2,
time = "PFI.time", status = "PFI"
) p3
Control the cancer types, and run Cox for all signatures in one model:
<- show_forest(tcga_surv_df %>% filter(!is.na(PFI.time)),
p4 covariates = label_df$label[1], controls = c(label_df$label[-1], "cancer_type"),
vars_to_show = label_df$label,
merge_models = TRUE, add_caption = FALSE, point_size = 2,
time = "PFI.time", status = "PFI"
) p4
# run cox for all signatures for selected cancer type in TCGA
library(patchwork)
library(ggplot2)
library(tidyr)
library(ezcox)
<- names(
selected_cancer table(tcga_surv_df$cancer_type)[which(table(tcga_surv_df$cancer_type) >= 500)]
)
<- function(i) {
myplot <- subset(
tcga_surv_df2
tcga_surv_df,== i
cancer_type
)<- show_forest(tcga_surv_df2 %>% filter(!is.na(PFI.time)),
p4 covariates = label_df$label[1], controls = c(label_df$label[-1]),
vars_to_show = label_df$label,
merge_models = TRUE, add_caption = FALSE, point_size = 2,
time = "PFI.time", status = "PFI"
+ labs(title = i) + xlab("") + ylab("")
) }
<- mapply(myplot, i = selected_cancer, SIMPLIFY = FALSE)
p # plot
::multiplot(plotlist = p, layout = matrix(1:9, nrow = 3)) Rmisc
Show signature profile for unmatched/low-matched signatures
library(tidyverse)
<- sig_signature(tcga_sigs)
tcga_sig_mat
<- label_df %>% filter(
um_df %in% c("UNMATCH3", "UNMATCH4", "UNMATCH1", "UNMATCH2")
match
)<- um_df$label
um_names names(um_names) <- um_df$TCGA
<- tcga_sig_mat[, names(um_names)]
um_sig_mat colnames(um_sig_mat) <- as.character(um_names)
<- show_sig_profile(um_sig_mat,
p mode = "copynumber", method = "X",
check_sig_names = FALSE, style = "cosmic",
by_context = TRUE, font_scale = 0.6,
sig_orders = c(2, 3, 1, 4)
) p
Save the profile.
<- show_sig_profile_loop(um_sig_mat[, c(2, 3, 1, 4)],
p mode = "copynumber", method = "X",
check_sig_names = FALSE, style = "cosmic",
by_context = TRUE, font_scale = 0.6
)ggsave("../output/TCGA_unmatched_sig_profile_c176_by_context.pdf",
plot = p,
width = 14, height = 8
)
Show representative sample profile for unmatched signatures
Here we will draw copy number profile of 2 representative samples for each signature.
<- function(dt, sig) {
get_sample_from_sig <- head(dt[order(dt[[sig]], dt[[paste0("ABS_", sig)]], decreasing = TRUE)], 2L)
res
res }
Read CopyNumber object for TCGA data.
<- readRDS("../data/TCGA/tcga_cn_obj.rds")
tcga_cn_obj <- tcga_cn_obj@summary.per.sample
samp_summary <- tcga_act$relative
rel_activity <- tcga_act$absolute
abs_activity
<- rel_activity[, lapply(.SD, function(x) {
rel_activity if (is.numeric(x)) round(x, 2) else x
})]
colnames(abs_activity)[-1] <- paste0("ABS_", colnames(abs_activity)[-1])
<- merge(
act
rel_activity, abs_activity,by = "sample"
)
dir.create("../output/enrich_samples_tcga/", showWarnings = FALSE)
for (i in colnames(rel_activity)[-1]) {
cat(paste0("Most enriched in ", i, "\n"))
<- get_sample_from_sig(act, i)
s print(s)
<- show_cn_profile(tcga_cn_obj,
plist samples = s$sample,
show_title = TRUE,
return_plotlist = TRUE
)<- purrr::map2(plist, s$sample, function(p, x) {
plist <- samp_summary[sample == x]
s <- paste0(
text "n_of_seg:", s$n_of_seg, "\n",
"n_of_amp:", s$n_of_amp, "\n",
"n_of_del:", s$n_of_del, "\n",
"rel:", act[sample == x][[i]], "\n",
"abs:", act[sample == x][[paste0("ABS_", i)]], "\n"
)<- p + annotate("text",
p x = Inf, y = Inf, hjust = 1, vjust = 1,
label = text, color = "gray50"
)
p
})<- cowplot::plot_grid(plotlist = plist, nrow = 2)
p print(p)
::ggsave(file.path("../output/enrich_samples_tcga/", paste0(gsub("/", "_", i), ".pdf")),
ggplot2plot = p, width = 12, height = 6
) }
Most enriched in Sig1-CNS10(H)
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-E8-A417-01 0.80 0.1 0 0
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0 0 0 0 0.00
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0.00 0 0.00 0.00 0
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0.01 0.00 0.00 0.08 0
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0.01 67 8 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 0 0 0
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 0 0 0 0
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 0 0 0 1
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 0 7 0
ABS_Sig20-UNMATCH2
1: 1
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig2-CNS3(H)
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-FP-8209-01 0 0.89 0 0
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0.04 0 0 0 0
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0 0.02 0.02 0 0
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0.00 0.04 0 0.00 0
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0 0 47 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 2 0 0
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 0 0 0 1
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 1 0 0 0
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 2 0 0 0
ABS_Sig20-UNMATCH2
1: 0
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig3-CNS1(M)/CNS5(M)
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-ZG-A9LN-01 0 0 1.00 0
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0 0 0.00 0.00 0
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0 0 0 0 0
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0 0 0 0 0
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0.00 0 0 424
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 0 0 0
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 0 0 0 0
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 0 0 0 0
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 0 1 0
ABS_Sig20-UNMATCH2
1: 0
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig4-CNS2(H)
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-20-1684-01 0 0.07 0.23 0.42
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0 0 0.00 0 0
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0 0.09 0.07 0 0.11
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0 0 0.00 0.01 0
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0 0 46 156
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 283 0 2 0
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 0 0 0 63
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 47 0 78 0
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 0 4 0
ABS_Sig20-UNMATCH2
1: 0
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig5-CNS10(M)_2
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-D1-A1O7-01 0.20 0.02 0 0.00
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0.66 0 0 0 0
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0.1 0.01 0.00 0 0.00
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0.01 0 0.00 0.00 0.01
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0 25 2 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 82 0 0
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 0 0 12 1
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 0 0 0 1
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 0 0 1
ABS_Sig20-UNMATCH2
1: 0
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig6-CNS5(M)_2
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-DX-A1KZ-01 0 0 0.14 0.04
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0 0.66 0.02 0.03 0.07
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0 0.01 0 0 0
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0 0 0.01 0.02 0
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0.00 0 0 92
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 23 0 424 13
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 20 47 0 5
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 2 0 0 0
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 4 12 0
ABS_Sig20-UNMATCH2
1: 0
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig7-CNS9(H)
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-AB-3011-03 0 0 0 0
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0 0 0.96 0 0.03
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0 0 0.01 0 0
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0 0 0 0 0.00
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0 0 0 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 0 0 69
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 0 2 0 0
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 1 0 0 0
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 0 0 0
ABS_Sig20-UNMATCH2
1: 0
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig8-CNS5(M)_3
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-BQ-5876-01 0 0 0 0
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0 0.00 0.00 0.9 0.02
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0 0 0.02 0 0.07
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0.00 0 0 0 0
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0.00 0 0 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 0 0 0
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 109 2 0 0
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 2 0 8 0
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 0 0 0
ABS_Sig20-UNMATCH2
1: 0
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig9-UNMATCH1
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-ZB-A966-01 0 0 0.02 0.02
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0 0.00 0.03 0.02 0.89
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0 0 0 0 0.01
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0 0 0 0 0
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0.00 0 0 2
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 2 0 0 3
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 2 83 0 0
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 0 0 1 0
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 0 0 0
ABS_Sig20-UNMATCH2
1: 0
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig10-CNS8(H)
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-XK-AAJA-01 0.11 0.00 0 0
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0.10 0 0 0 0
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0.68 0.01 0 0.04 0
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0 0.06 0.00 0.00 0
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0 49 0 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 43 0 0
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 0 0 295 5
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 0 17 0 0
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 27 0 1 0
ABS_Sig20-UNMATCH2
1: 0
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig11-CNS7(H)
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-BR-6453-01 0 0.31 0.00 0.00
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0.00 0 0 0 0
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0 0.54 0 0 0.07
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0.01 0.01 0.06 0.01 0.00
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0 0 116 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 0 0 1
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 0 0 0 203
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 0 0 27 5
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 2 21 2 0
ABS_Sig20-UNMATCH2
1: 1
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig12-CNS6(M)_2
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-DX-AB30-01 0 0 0 0
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0.00 0.01 0.01 0.05 0.01
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0 0 0.86 0 0
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0 0.00 0.01 0.02 0.00
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0.04 0 0 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 0 2 2
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 8 1 0 0
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 143 0 0 0
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 1 3 0
ABS_Sig20-UNMATCH2
1: 6
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig13-CNS11(H)
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-D8-A1XU-01 0.08 0.16 0 0
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0.12 0 0 0 0.00
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0.08 0.09 0 0.43 0
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0.01 0 0.00 0 0
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0.03 10 19 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 14 0 0
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 0 0 10 11
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 0 52 0 1
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 0 0 0
ABS_Sig20-UNMATCH2
1: 3
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig14-CNS6(M)
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-BS-A0UA-01 0 0.06 0 0
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0 0 0 0 0
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0 0.03 0 0.00 0.88
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0.03 0 0 0 0
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0 0 2 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 0 0 0
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 0 0 0 1
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 0 0 28 1
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 0 0 0
ABS_Sig20-UNMATCH2
1: 0
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig15-UNMATCH3
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-UE-A6QT-01 0.05 0 0.00 0.03
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0.00 0 0.00 0 0.00
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0 0.04 0.03 0.03 0.03
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0.72 0 0.01 0 0.03
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0.04 4 0 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 2 0 0 0
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 0 0 0 3
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 2 2 2 55
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 1 0 2
ABS_Sig20-UNMATCH2
1: 3
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig16-CNS14(M)
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-CG-4441-01 0.05 0 0 0
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0.05 0 0 0 0
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0.35 0.00 0 0 0
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0.01 0.52 0.02 0 0
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0 36 0 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 34 0 0
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 0 0 232 0
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 0 0 0 6
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 342 10 0 0
ABS_Sig20-UNMATCH2
1: 0
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig17-CNS12(H)
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-AO-A12A-01 0 0.11 0 0.11
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0.00 0 0.04 0.06 0
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0.00 0.04 0.00 0 0
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0 0 0.50 0.01 0.04
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0.09 0 9 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 9 0 0 3
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 5 0 0 3
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 0 0 0 0
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 40 1 3
ABS_Sig20-UNMATCH2
1: 7
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig18-UNMATCH4
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-D1-A162-01 0.02 0.09 0.00 0
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0.04 0.01 0.00 0.02 0.08
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0.00 0.00 0 0.01 0
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0.01 0 0 0.71 0.02
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0.00 2 9 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 4 1 0
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 2 8 0 0
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 0 1 0 1
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 0 73 2
ABS_Sig20-UNMATCH2
1: 0
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig19-CNS13(H)
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-VD-A8KJ-01 0.02 0.03 0 0
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0.04 0.00 0.02 0 0.08
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0.00 0.02 0 0.00 0
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0 0 0 0.05 0.68
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0.06 13 17 0
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 27 0 12
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 0 48 2 12
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 0 0 0 1
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 1 0 30 415
ABS_Sig20-UNMATCH2
1: 35
[ reached getOption("max.print") -- omitted 1 row ]
Most enriched in Sig20-UNMATCH2
sample Sig1-CNS10(H) Sig2-CNS3(H) Sig3-CNS1(M)/CNS5(M) Sig4-CNS2(H)
1: TCGA-4V-A9QR-01 0 0 0.07 0.00
Sig5-CNS10(M)_2 Sig6-CNS5(M)_2 Sig7-CNS9(H) Sig8-CNS5(M)_3 Sig9-UNMATCH1
1: 0 0.01 0.02 0.06 0.32
Sig10-CNS8(H) Sig11-CNS7(H) Sig12-CNS6(M)_2 Sig13-CNS11(H) Sig14-CNS6(M)
1: 0 0.01 0 0.00 0.04
Sig15-UNMATCH3 Sig16-CNS14(M) Sig17-CNS12(H) Sig18-UNMATCH4 Sig19-CNS13(H)
1: 0.00 0 0.02 0.00 0.02
Sig20-UNMATCH2 ABS_Sig1-CNS10(H) ABS_Sig2-CNS3(H) ABS_Sig3-CNS1(M)/CNS5(M)
1: 0.43 0 0 8
ABS_Sig4-CNS2(H) ABS_Sig5-CNS10(M)_2 ABS_Sig6-CNS5(M)_2 ABS_Sig7-CNS9(H)
1: 0 0 1 2
ABS_Sig8-CNS5(M)_3 ABS_Sig9-UNMATCH1 ABS_Sig10-CNS8(H) ABS_Sig11-CNS7(H)
1: 7 39 0 1
ABS_Sig12-CNS6(M)_2 ABS_Sig13-CNS11(H) ABS_Sig14-CNS6(M) ABS_Sig15-UNMATCH3
1: 0 0 5 0
ABS_Sig16-CNS14(M) ABS_Sig17-CNS12(H) ABS_Sig18-UNMATCH4 ABS_Sig19-CNS13(H)
1: 0 3 0 3
ABS_Sig20-UNMATCH2
1: 52
[ reached getOption("max.print") -- omitted 1 row ]
Merged CNS from clustering of cancer-type associated CNS
To further validate the CNS (stability) identified in PCAWG whole dataset. We also tried the cancer-type-wise signature identification pipeline from Degasperi et al. (2020), we implemented the procedure in our package sigminer
by following the method description. A benchmark has been done to make sure our implementation works properly.
1000 NMF runs (20 bootstrap catalogs and 50 NMF runs for each bootstrap catalog) was applied for data from each cancer type, which is similar to Degasperi et al. (2020).
# This script is running on HPC
library(sigminer)
<- readRDS("pcawg_cn_tally_X.rds")
tally_X <- readRDS("pcawg_type_info.rds")
pcawg_types
for (i in unique(pcawg_types$cancer_type)) {
message("handling type: ", i)
<- file.path("BP", paste0("PCAWG_CN176_", gsub("/", "-", i), ".rds"))
fn
if (!file.exists(fn)) {
<- pcawg_types %>%
type_samples ::filter(cancer_type == i) %>%
dplyr::pull(sample)
dplyrmessage("Sample number: ", length(type_samples))
<- bp_extract_signatures(
sigs $nmf_matrix[type_samples, , drop = FALSE],
tally_Xrange = 2:min(20, length(type_samples) - 1),
cores = 30,
cores_solution = 20,
cache_dir = "BP/BP_PCAWG_types",
keep_cache = TRUE,
only_core_stats = TRUE
)saveRDS(sigs, file = fn)
} }
We then load the result and do manual inspection to determine the proper signature number for each cancer type.
library(sigminer)
<- list.files("../data/cancer_types/", pattern = "PCAWG_CN176", full.names = TRUE)
cancer_type_files <- lapply(cancer_type_files, readRDS)
pcawg
bp_show_survey2(pcawg[[1]]) # 11
bp_show_survey2(pcawg[[2]]) # 14
bp_show_survey2(pcawg[[3]]) # 13
bp_show_survey2(pcawg[[4]]) # 8
bp_show_survey2(pcawg[[5]]) # 14
bp_show_survey2(pcawg[[6]]) # 11
bp_show_survey2(pcawg[[7]]) # 12
bp_show_survey2(pcawg[[8]]) # 8
bp_show_survey2(pcawg[[9]]) # 7
bp_show_survey2(pcawg[[10]]) # 5
bp_show_survey2(pcawg[[11]]) # 13
bp_show_survey2(pcawg[[12]]) # 9
bp_show_survey2(pcawg[[13]]) # 15
bp_show_survey2(pcawg[[14]]) # 7
bp_show_survey2(pcawg[[15]]) # 11
bp_show_survey2(pcawg[[16]]) # 12
bp_show_survey2(pcawg[[17]]) # 12
bp_show_survey2(pcawg[[18]]) # 17
bp_show_survey2(pcawg[[19]]) # 12
bp_show_survey2(pcawg[[20]]) # 6
bp_show_survey2(pcawg[[21]]) # 3
bp_show_survey2(pcawg[[22]]) # 7
bp_show_survey2(pcawg[[23]]) # 11
bp_show_survey2(pcawg[[24]]) # 19
bp_show_survey2(pcawg[[25]]) # 8
bp_show_survey2(pcawg[[26]]) # 10
bp_show_survey2(pcawg[[27]]) # 11
bp_show_survey2(pcawg[[28]]) # 9
bp_show_survey2(pcawg[[29]]) # 9
bp_show_survey2(pcawg[[30]]) # 15
bp_show_survey2(pcawg[[31]]) # 6
bp_show_survey2(pcawg[[32]]) # 17
<- data.frame(
type_sig cancer_type = sub(".*_CN176_([^_]+).rds", "\\1", basename(cancer_type_files)),
signum = c(
11, 14, 13, 8, 14, 11, 12, 8,
7, 5, 13, 9, 15, 7, 11, 12,
12, 17, 12, 6, 3, 7, 11, 19,
8, 10, 11, 9, 9, 15, 6, 17
)
)
<- lapply(
pcawg_sig_list seq_along(pcawg),
function(i) {
bp_get_sig_obj(pcawg[[i]], signum = type_sig$signum[i])
}
)
names(pcawg_sig_list) <- type_sig$cancer_type
saveRDS(pcawg_sig_list, file = "data/pcawg_type_CNS.rds")
The signature number ranges from 3
to 19
.
Next we need to collapse all shared signatures from cancer types. To do this, we cluster all signatures based their cosine dissimilarity.
library(sigminer)
<- readRDS("../data/pcawg_type_CNS.rds")
pcawg_sig_list $`Biliary-AdenoCA` pcawg_sig_list
$Signature
Sig1 Sig2 Sig3 Sig4 Sig5
E:HH:0:AA 1.919365e-17 1.836909e-17 1.862198e-17 1.876080e-17 1.826403e-17
E:HH:0:BB 1.919365e-17 4.166667e-02 1.862198e-17 1.876080e-17 3.333333e-01
E:HH:1:AA 1.439106e+00 2.676199e+01 5.001386e+01 1.133406e+01 3.832072e+01
E:HH:1:BB 2.501214e+00 1.968315e+00 3.348985e+00 1.262798e+00 4.322378e+00
E:HH:2:AA 4.466717e+00 1.547829e+02 2.642123e+01 5.648644e+00 1.748853e+01
E:HH:2:BB 1.919365e-17 1.220585e+01 5.863062e-01 1.876080e-17 3.057059e+00
Sig6 Sig7 Sig8 Sig9 Sig10
E:HH:0:AA 1.782894e-17 1.822001e-17 1.780042e-17 1.780276e-17 1.754057e-17
E:HH:0:BB 1.782894e-17 1.666667e-01 2.500000e-01 1.780276e-17 1.754057e-17
E:HH:1:AA 1.965323e+01 2.988848e+01 3.066430e+01 1.351001e+01 1.994798e+01
E:HH:1:BB 9.366926e-01 1.306594e+00 4.723706e+00 2.207346e+00 1.315582e+00
E:HH:2:AA 8.329851e+01 1.482409e+01 3.108890e+01 1.371453e+01 1.312290e+01
E:HH:2:BB 5.839599e+00 6.961702e-01 4.629335e+00 6.959860e-01 3.094529e-02
Sig11
E:HH:0:AA 1.718942e-17
E:HH:0:BB 4.166667e-02
E:HH:1:AA 4.299592e+00
E:HH:1:BB 1.189723e+00
E:HH:2:AA 4.768073e+00
E:HH:2:BB 1.337466e-01
[ reached getOption("max.print") -- omitted 170 rows ]
$Signature.norm
Sig1 Sig2 Sig3 Sig4 Sig5
E:HH:0:AA 4.170979e-20 4.025733e-20 4.219701e-20 4.560529e-20 4.546751e-20
E:HH:0:BB 4.170979e-20 9.131585e-05 4.219701e-20 4.560529e-20 8.298187e-04
E:HH:1:AA 3.127326e-03 5.865106e-02 1.133303e-01 2.755176e-02 9.539775e-02
E:HH:1:BB 5.435396e-03 4.313720e-03 7.588728e-03 3.069712e-03 1.076037e-02
E:HH:2:AA 9.706637e-03 3.392191e-01 5.986994e-02 1.373119e-02 4.353692e-02
E:HH:2:BB 4.170979e-20 2.675010e-02 1.328557e-03 4.560529e-20 7.610416e-03
Sig6 Sig7 Sig8 Sig9 Sig10
E:HH:0:AA 4.505784e-20 4.702870e-20 4.662014e-20 4.735328e-20 4.690863e-20
E:HH:0:BB 4.505784e-20 4.301928e-04 6.547619e-04 4.735328e-20 4.690863e-20
E:HH:1:AA 4.966824e-02 7.714685e-02 8.031126e-02 3.593507e-02 5.334673e-02
E:HH:1:BB 2.367237e-03 3.372525e-03 1.237161e-02 5.871284e-03 3.518252e-03
E:HH:2:AA 2.105145e-01 3.826331e-02 8.142329e-02 3.647905e-02 3.509449e-02
E:HH:2:BB 1.475801e-02 1.796925e-03 1.212445e-02 1.851242e-03 8.275678e-05
Sig11
E:HH:0:AA 4.743997e-20
E:HH:0:BB 1.149931e-04
E:HH:1:AA 1.186616e-02
E:HH:1:BB 3.283438e-03
E:HH:2:AA 1.315909e-02
E:HH:2:BB 3.691185e-04
[ reached getOption("max.print") -- omitted 170 rows ]
$Exposure
SP117017 SP117031 SP117332 SP117425 SP117556
Sig1 0.1672924 1.197175e-12 1.197175e-12 6.971195e+01 1.197175e-12
Sig2 16.5815545 5.832088e+00 3.089812e+01 9.958335e+00 1.191513e+01
SP117621 SP117627 SP117712 SP117759 SP117930
Sig1 2.280120e+01 1.197175e-12 0.9674283 1.197175e-12 3.162089e+01
Sig2 1.576648e-10 8.451208e+00 9.7202684 1.218155e+01 1.228684e-12
SP99113 SP99185 SP99209 SP99213 SP99221
Sig1 1.197175e-12 1.859668e-01 1.036956 1.197175e-12 1.197175e-12
Sig2 1.374978e+01 4.361682e+01 27.334544 1.639245e+00 1.437495e+01
SP99225 SP99241 SP99287 SP99297 SP99301
Sig1 1.197175e-12 1.197175e-12 7.040598e-01 1.197175e-12 1.197175e-12
Sig2 1.287393e+01 2.741258e+00 8.065416e+00 1.107821e+01 1.441029e+01
SP99305 SP99309 SP99313 SP99317 SP99321
Sig1 1.197175e-12 1.197175e-12 1.582058e-11 1.197175e-12 1.197175e-12
Sig2 4.729177e+00 3.679884e+01 5.166667e+00 2.669124e+01 1.138996e+01
SP99325 SP99329 SP99333 SP99337 SP99341
Sig1 1.197175e-12 1.8473495 1.197175e-12 1.197175e-12 1.197175e-12
Sig2 6.065915e+01 3.4707960 4.141006e+00 1.200774e+01 1.381966e+01
SP99345 SP117655 SP117775 SP99293
Sig1 1.197175e-12 3.092667e+02 18.3954369 3.466154e+00
Sig2 1.919694e+01 7.187344e-01 2.0791266 2.457368e-12
[ reached getOption("max.print") -- omitted 9 rows ]
$Exposure.norm
SP117017 SP117031 SP117332 SP117425 SP117556
Sig1 0.001779718 1.786838e-14 2.029149e-14 2.916822e-01 9.976572e-15
Sig2 0.176400657 8.704661e-02 5.237070e-01 4.166673e-02 9.929391e-02
SP117621 SP117627 SP117712 SP117759 SP117930
Sig1 1.398845e-01 6.107916e-15 0.017275519 4.450592e-15 1.681940e-01
Sig2 9.672676e-13 4.311758e-02 0.173576348 4.528589e-02 6.535468e-15
SP99113 SP99185 SP99209 SP99213 SP99221
Sig1 5.441786e-14 2.582919e-03 0.01329435 6.046194e-15 8.675074e-15
Sig2 6.249994e-01 6.058002e-01 0.35044411 8.278819e-03 1.041650e-01
SP99225 SP99241 SP99287 SP99297 SP99301
Sig1 2.720856e-14 1.686136e-14 6.579953e-03 1.900301e-14 3.990636e-14
Sig2 2.925898e-01 3.860870e-02 7.537720e-02 1.758469e-01 4.803496e-01
SP99305 SP99309 SP99313 SP99317 SP99321
Sig1 2.029128e-14 1.496494e-14 1.275835e-13 2.302295e-14 8.430798e-15
Sig2 8.015629e-02 4.599934e-01 4.166609e-02 5.133011e-01 8.021093e-02
SP99325 SP99329 SP99333 SP99337 SP99341
Sig1 4.926634e-15 0.014661689 1.930950e-14 2.137816e-14 1.059437e-14
Sig2 2.496256e-01 0.027546348 6.679120e-02 2.144244e-01 1.222968e-01
SP99345 SP117655 SP117775 SP99293
Sig1 3.861913e-14 6.608248e-01 0.054747947 1.229137e-02
Sig2 6.192655e-01 1.535754e-03 0.006187834 8.714100e-15
[ reached getOption("max.print") -- omitted 9 rows ]
$K
[1] 11
attr(,"class")
[1] "Signature"
attr(,"used_runs")
[1] 24
attr(,"method")
[1] "brunet"
attr(,"call_method")
[1] "NMF with best practice"
attr(,"nrun")
[1] 1000
attr(,"seed")
[1] 123456
<- bp_cluster_iter_list(pcawg_sig_list, k = 2:30) cls
We try 2
to 30
signatures here to check how the silhouette changes in different cluster number k
.
$sil_summary cls
k min mean max sd
1 2 -0.4144666 0.2910101 0.5572367 0.1744367
2 3 -0.7191056 0.2049187 0.8931134 0.2275587
3 4 -0.7191056 0.1850721 0.8931134 0.2503893
4 5 -0.6641900 0.2973617 0.8884523 0.2848243
5 6 -0.6616768 0.2762899 0.8880718 0.2842627
6 7 -0.6600962 0.2767034 0.8797653 0.2863463
7 8 -0.6600962 0.2678768 0.8797653 0.2920637
8 9 -0.6990881 0.2568783 0.8797653 0.3143687
9 10 -0.6990881 0.2452582 0.8797653 0.3194973
10 11 -0.6800734 0.2446516 0.8797653 0.3149417
11 12 -0.6789815 0.2340692 0.8797653 0.3160896
12 13 -0.6789815 0.2310671 0.8710570 0.3172617
13 14 -0.6788359 0.2124811 0.8710570 0.3206350
14 15 -0.6788359 0.2043917 0.8710570 0.3230532
15 16 -0.6788359 0.1930057 0.8710570 0.3189946
[ reached 'max' / getOption("max.print") -- omitted 14 rows ]
plot(cls$sil_summary$k, cls$sil_summary$mean, type = "b", xlab = "Total signatures", ylab = "Mean silhouette")
In general, the mean silhouette is not high. But we can still see that 20
or 22
may be suitable. k=5
is not considered because it is too small to explain our data.
Signature similarity is the key parameter in our study. Here we use an average cosine distance 0.4
(i.e. signatures with similarity >=0.6
could be clustered as one) as a cutoff:
::fviz_dend(cls$cluster,
factoextrah = 0.4, cex = 0.15,
main = "",
xlab = "",
ylab = "1 - cosine similarity, average linkage"
)
Check the cluster number:
<- cutree(cls$cluster, h = 0.4)
cls_tree table(cls_tree)
cls_tree
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
16 57 71 108 20 3 1 29 2 4 7 1 6 1 3 4 2 1 1 1
21 22 23 24
1 1 1 1
This is close to 22
. We observe there are a few isolated clusters with only one signature, they may be artifacts or should be assigned to other clusters but cannot due to low similarity to others.
Next we obtain clustered (grouped) signatures by averaging signatures in a cluster.
<- bp_get_clustered_sigs(cls, cls_tree) grp_sigs
Next we load PCAWG global CNS and check their similarity.
<- readRDS("../data/pcawg_cn_sigs_CN176_signature.rds") pcawg_sigs
<- get_sig_similarity(pcawg_sigs, grp_sigs$grp_sigs)
sim1 <- sim1$similarity
sim_mat colnames(sim_mat) <- gsub("Sig", "Group-Sig", colnames(sim_mat))
::pheatmap(sim_mat, cluster_rows = FALSE, cluster_cols = FALSE, display_numbers = TRUE) pheatmap
We can see that most of PCAWG global signatures can be properly matched to group signatures. CNS2
和 CNS13
seems not stable and tend to be easier affected by extraction procedure.
Also, we observe almost all group signatures on the right side can be assigned to a PCAWG global signatures, although most of them have relatively low similarity. This data indicate the signatures may be redundant.
Let’s check the similarity between group signatures and best matched global signatures.
<- apply(sim1$similarity, 2, which.max)
mc <- apply(sim1$similarity, 2, max)
mc_sim mc_sim
Sig1 Sig14 Sig2 Sig5 Sig10 Sig7 Sig18 Sig6 Sig11 Sig21 Sig15 Sig13 Sig19
0.886 0.646 0.988 0.899 0.914 0.813 0.824 0.855 0.953 0.788 0.902 0.937 0.676
Sig9 Sig3 Sig4 Sig8 Sig12 Sig16 Sig17 Sig20 Sig22 Sig23 Sig24
0.901 0.727 0.569 0.667 0.598 0.815 0.717 0.693 0.579 0.477 0.467
Next let’s visualize the data along with the similarity of signatures within a group (cluster).
<- data.frame(
mc_df grp_sig = names(mc),
global_sig = paste0("CNS", mc),
sim = as.numeric(mc_sim)
%>%
) ::mutate(
dplyrlabel = paste0(grp_sig, "(", global_sig, ", ", sim, ")"),
grp_sig = factor(grp_sig, levels = paste0("Sig", seq_len(nrow(.))))
%>%
) ::arrange(grp_sig) dplyr
::ggboxplot(grp_sigs$sim_sig_to_grp_mean,
ggpubrx = "grp_sig", y = "sim", width = 0.3,
xlab = FALSE, ylab = "cosine similarity",
title = "Cosine similarity between signatures in each group"
+
) ::rotate_x_text() ggpubr
Add labels:
::ggboxplot(grp_sigs$sim_sig_to_grp_mean, # %>%
ggpubr# dplyr::filter(grp_sig %in% select_sigs),
x = "grp_sig", y = "sim", width = 0.3,
xlab = FALSE, ylab = "Cosine similarity",
title = "Cosine similarity between signatures in each group"
+
) ::rotate_x_text() +
ggpubrscale_x_discrete(labels = mc_df$label)
Here y axis indicates cosine similarity between the averaging signature of each group and the individual signatures that belong to each group.
PCAWG and TCGA Pan-Cancer copy number alteration profile and analysis
library(sigminer)
<- readRDS("../data/pcawg_cn_obj.rds")
pcawg_cn <- readRDS("../data/pcawg_type_info.rds")
pcawg_type <- data.table::fread("../data/PCAWG/PCAWG_sigProfiler_SBS_signatures_in_samples.csv")
pcawg_sbs <- pcawg_sbs %>%
pcawg_sbs ::mutate(
dplyrsbs = rowSums(pcawg_sbs[, grepl("SBS", colnames(pcawg_sbs)), with = FALSE], na.rm = TRUE)
%>%
) ::rename(
dplyrsample = `Sample Names`
%>%
) ::select(sample, sbs)
dplyr<- data.table::fread("../data/PCAWG/PCAWG_SigProfiler_ID_signatures_in_samples.csv")
pcawg_ids <- pcawg_ids %>%
pcawg_ids ::mutate(
dplyrids = rowSums(pcawg_ids[, grepl("ID", colnames(pcawg_ids)), with = FALSE], na.rm = TRUE)
%>%
) ::rename(
dplyrsample = `Sample Names`
%>%
) ::select(sample, ids)
dplyr
<- purrr::reduce(
pcawg_summary list(
@summary.per.sample,
pcawg_cn
pcawg_sbs,
pcawg_ids,
pcawg_type::left_join,
), dplyrby = "sample"
)
<- readRDS("../data/TCGA/tcga_cn_obj.rds")
tcga_cn <- readRDS("../data/TCGA/TCGA_type_info.rds")
tcga_type <- data.table::fread("../data/PCAWG/TCGA_WES_sigProfiler_SBS_signatures_in_samples.csv")
tcga_sbs <- tcga_sbs %>%
tcga_sbs ::mutate(
dplyrsbs = rowSums(tcga_sbs[, grepl("SBS", colnames(tcga_sbs)), with = FALSE], na.rm = TRUE)
%>%
) ::rename(
dplyrsample = `Sample Names`
%>%
) ::mutate(sample = substr(sample, 1, 15)) %>%
dplyr::select(sample, sbs)
dplyr<- data.table::fread("../data/PCAWG/TCGA_WES_sigProfiler_ID_signatures_in_samples.csv")
tcga_ids <- tcga_ids %>%
tcga_ids ::mutate(
dplyrids = rowSums(tcga_ids[, grepl("ID", colnames(tcga_ids)), with = FALSE], na.rm = TRUE)
%>%
) ::rename(
dplyrsample = `Sample Names`
%>%
) ::mutate(
dplyrsample = substr(sample, 1, 15)
%>%
) ::select(sample, ids)
dplyr
<- purrr::reduce(
tcga_summary list(
@summary.per.sample,
tcga_cn
tcga_sbs,
tcga_ids,
tcga_type::left_join,
), dplyrby = "sample"
)
Summary
SCNA length distribution
PCAWG:
show_cn_distribution(pcawg_cn)
TCGA:
show_cn_distribution(tcga_cn)
SCNA chromosome distribution
PCAWG:
show_cn_distribution(pcawg_cn, mode = "cd", fill = TRUE)
TCGA:
show_cn_distribution(tcga_cn, mode = "cd", fill = TRUE)
SCNA variation frequency
PCAWG:
show_cn_group_profile(pcawg_cn)
TCGA:
show_cn_group_profile(tcga_cn)
Summary data analysis
library(ggpubr)
Distribution
PCAWG
<- pcawg_summary %>%
data ::select(sample, n_of_cnv, cna_burden, sbs, ids) %>%
dplyr::mutate(
dplyrn_of_cnv = log10(n_of_cnv + 1),
sbs = log10(sbs + 1),
ids = log10(ids + 1)
%>%
) setNames(c("sample", "Total CNVs (log10)", "CNA burden", "Total SNVs (log10)", "Total Indels (log10)")) %>%
::pivot_longer(colnames(.)[-1])
tidyr
ggplot(data, aes(x = name, y = value)) +
geom_violin() +
geom_boxplot(width = 0.1, fill = "gold") +
facet_wrap(~name, scales = "free") +
labs(x = NULL, y = NULL) +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
)
library(ggridges)
ggplot(data, aes(x = value, y = name, fill = name)) +
geom_density_ridges() +
labs(x = NULL, y = NULL) +
theme_ridges() +
theme(legend.position = "none")
TCGA
<- tcga_summary %>%
data ::select(sample, n_of_cnv, cna_burden, sbs, ids) %>%
dplyr::mutate(
dplyrn_of_cnv = log10(n_of_cnv + 1),
sbs = log10(sbs + 1),
ids = log10(ids + 1)
%>%
) setNames(c("sample", "Total CNVs (log10)", "CNA burden", "Total SNVs (log10)", "Total Indels (log10)")) %>%
::pivot_longer(colnames(.)[-1])
tidyr
ggplot(data, aes(x = name, y = value)) +
geom_violin() +
geom_boxplot(width = 0.1, fill = "gold") +
facet_wrap(~name, scales = "free") +
labs(x = NULL, y = NULL) +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
)
ggplot(data, aes(x = value, y = name, fill = name)) +
geom_density_ridges() +
labs(x = NULL, y = NULL) +
theme_ridges() +
theme(legend.position = "none")
PCAWG CNA burden pancancer distribution.
<- ggplot(pcawg_summary, aes(x = cna_burden, y = cancer_type, fill = cancer_type)) +
p geom_density_ridges() +
labs(x = "CNA burden", y = NULL) +
theme_ridges() +
theme(legend.position = "none")
p
TCGA CNA burden pancancer distribution.
<- ggplot(
p2 %>%
tcga_summary ::filter(!is.na(cancer_type)),
dplyraes(x = cna_burden, y = cancer_type, fill = cancer_type)
+
) geom_density_ridges() +
labs(x = "CNA burden", y = NULL) +
theme_ridges() +
theme(legend.position = "none")
p2
Correlation
PCAWG:
<- pcawg_summary %>%
data ::select(n_of_cnv, cna_burden, sbs, ids) %>%
dplyr::mutate(
dplyrn_of_cnv = log10(n_of_cnv + 1),
sbs = log10(sbs + 1),
ids = log10(ids + 1)
%>%
) setNames(c("Total CNVs (log10)", "CNA burden", "Total SNVs (log10)", "Total Indels (log10)"))
show_cor(data)
Show some detail scatter plots.
library(ggpubr)
ggscatter(data,
x = "Total CNVs (log10)", y = "Total SNVs (log10)",
color = "black", shape = 21, size = 1, # Points color, shape and size
add = "reg.line", # Add regressin line
add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
cor.coeff.args = list(method = "spearman", label.x = 1.5, label.sep = "\n")
)
ggscatter(data,
x = "Total CNVs (log10)", y = "Total Indels (log10)",
color = "black", shape = 21, size = 1, # Points color, shape and size
add = "reg.line", # Add regressin line
add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
cor.coeff.args = list(method = "spearman", label.x = 2.5, label.sep = "\n")
)
TCGA:
<- tcga_summary %>%
data ::select(n_of_cnv, cna_burden, sbs, ids) %>%
dplyr::mutate(
dplyrn_of_cnv = log10(n_of_cnv + 1),
sbs = log10(sbs + 1),
ids = log10(ids + 1)
%>%
) setNames(c("Total CNVs (log10)", "CNA burden", "Total SNVs (log10)", "Total Indels (log10)"))
show_cor(data)
Show some detail scatter plots.
ggscatter(data,
x = "Total CNVs (log10)", y = "Total SNVs (log10)",
color = "black", shape = 21, size = 1, # Points color, shape and size
add = "reg.line", # Add regressin line
add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
cor.coeff.args = list(method = "spearman", label.x = 0.5, label.sep = "\n")
)
ggscatter(data,
x = "Total CNVs (log10)", y = "Total Indels (log10)",
color = "black", shape = 21, size = 1, # Points color, shape and size
add = "reg.line", # Add regressin line
add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
cor.coeff.args = list(method = "spearman", label.x = 0.5, label.sep = "\n")
)
Cancer type
PCAWG CNV number
show_group_distribution(
pcawg_summary,gvar = "cancer_type",
dvar = "n_of_cnv",
order_by_fun = FALSE,
g_angle = 90,
point_size = 0.3,
ylab = "Number of CNV"
)
PCAWG CNA burden
show_group_distribution(
pcawg_summary,gvar = "cancer_type",
dvar = "cna_burden",
order_by_fun = FALSE,
g_angle = 90,
point_size = 0.3,
ylab = "CNA burden"
)
PCAWG SBS number
show_group_distribution(
%>% dplyr::mutate(sbs = log10(sbs + 1)),
pcawg_summary gvar = "cancer_type",
dvar = "sbs",
order_by_fun = FALSE,
g_angle = 90,
point_size = 0.3,
ylab = "Number of SBS (log10)"
)
PCAWG Indel number
show_group_distribution(
%>% dplyr::mutate(ids = log10(ids + 1)),
pcawg_summary gvar = "cancer_type",
dvar = "ids",
order_by_fun = FALSE,
g_angle = 90,
point_size = 0.3,
ylab = "Number of Indel (log10)"
)
TCGA CNV number
show_group_distribution(
%>% dplyr::filter(!is.na(cancer_type)),
tcga_summary gvar = "cancer_type",
dvar = "n_of_cnv",
order_by_fun = FALSE,
g_angle = 90,
point_size = 0.3,
ylab = "Number of CNV"
)
TCGA CNA burden
show_group_distribution(
%>% dplyr::filter(!is.na(cancer_type)),
tcga_summary gvar = "cancer_type",
dvar = "cna_burden",
order_by_fun = FALSE,
g_angle = 90,
point_size = 0.3,
ylab = "CNA burden"
)
TCGA SBS number
show_group_distribution(
%>% dplyr::filter(!is.na(cancer_type)) %>%
tcga_summary ::mutate(sbs = log10(sbs + 1)),
dplyrgvar = "cancer_type",
dvar = "sbs",
order_by_fun = FALSE,
g_angle = 90,
point_size = 0.3,
ylab = "Number of SBS (log10)"
)
TCGA Indel number
show_group_distribution(
%>% dplyr::filter(!is.na(cancer_type)) %>%
tcga_summary ::mutate(
dplyrids = ifelse(is.na(ids), 0, ids),
ids = log10(ids + 1)
),gvar = "cancer_type",
dvar = "ids",
order_by_fun = FALSE,
g_angle = 90,
point_size = 0.3,
ylab = "Number of Indel (log10)"
)
Variation enrichment in cancer types
PCAWG:
<- group_enrichment(
enrich_pcawg %>%
pcawg_summary ::rename(
dplyr`CNA burden` = cna_burden,
`Total CNVs` = n_of_cnv,
`Total SNVs` = sbs,
`Total Indels` = ids
),grp_vars = "cancer_type",
enrich_vars = c("CNA burden", "Total CNVs", "Total SNVs", "Total Indels"),
co_method = "wilcox.test"
)
<- show_group_enrichment(enrich_pcawg, cut_p_value = TRUE, return_list = TRUE)
p $cancer_type + labs(x = NULL, y = NULL) + ggpubr::rotate_x_text(angle = 45) p
TCGA:
Grey squares will aesthetically replaced by white squares also represent NA.
<- group_enrichment(
enrich_tcga %>%
tcga_summary ::rename(
dplyr`CNA burden` = cna_burden,
`Total CNVs` = n_of_cnv,
`Total SNVs` = sbs,
`Total Indels` = ids
),grp_vars = "cancer_type",
enrich_vars = c("CNA burden", "Total CNVs", "Total SNVs", "Total Indels"),
co_method = "wilcox.test"
)
<- show_group_enrichment(enrich_tcga, cut_p_value = TRUE, return_list = TRUE)
p $cancer_type + labs(x = NULL, y = NULL) + ggpubr::rotate_x_text(angle = 45) p
Comparison between previous method devised by our group “W” and new classification method in this study “X”
We use several PCAWG cohorts including Breast and OV as example datasets to compare method W and X in two aspects:
- the similarity within copy number signatures. This result indicates if it is easy to distinguish each signature. The smaller similarity, the better to distinguish.
- the difference between estimated segments and observed segments. This result indicates the accuracy to estimate the total contribution of all signatures to a sample. The smaller difference, the better.
Here we set the signature number from 3 to 10, and see the measures generated from these two methods.
Prepare data
library(sigminer)
library(tidyverse)
<- readRDS("../data/pcawg_cn_tally_X.rds")
tally_X <- readRDS("../data/pcawg_type_info.rds")
pcawg_types
<- readRDS("../data/pcawg_cn_obj.rds")
cn_obj <- sig_tally(cn_obj,
tally_W method = "W",
cores = 10
)saveRDS(tally_W, file = "../data/pcawg_cn_tally_W.rds")
Extract specified signatures
<- list()
ResultX <- list()
ResultW <- c("Breast", "Ovary-AdenoCA", "Prost-AdenoCA", "Liver-HCC", "Skin-Melanoma")
types for (type in types) {
message("=> Handling type: ", type)
<- pcawg_types$sample[pcawg_types$cancer_type == type]
samples <- tally_X$nmf_matrix[samples, ]
matX <- tally_W$nmf_matrix[samples, ]
matW
<- map(3:10, ~ sig_extract(matX, n_sig = ., nrun = 50, cores = 10))
sigX <- map(3:10, ~ sig_extract(matW, n_sig = ., nrun = 50, cores = 10))
sigW
<- sigX
ResultX[[type]] <- sigW
ResultW[[type]] rm(sigX, sigW)
}
save(ResultX, ResultW, file = "../data/Comparison_Sig_Data.RData")
Comparison
Calculates measures.
# within similarity
<- function(x) {
get_within_similarity <- suppressMessages(get_sig_similarity(x, x, set_order = FALSE))$similarity
sim upper.tri(sim)]
sim[
}
<- map_df(ResultX, function(x) {
CrossSimX map(x, get_within_similarity) %>%
set_names(paste0(3:10, "_Sigs")) %>%
enframe("sig_type", "sim") %>%
unnest("sim")
.id = "cancer_type")
},
<- map_df(ResultW, function(x) {
CrossSimW map(x, get_within_similarity) %>%
set_names(paste0(3:10, "_Sigs")) %>%
enframe("sig_type", "sim") %>%
unnest("sim")
.id = "cancer_type")
},
<- bind_rows(
CrossSim %>% mutate(method = "X"),
CrossSimX %>% mutate(method = "W")
CrossSimW
)
<- cn_obj@summary.per.sample[, .(sample, n_of_seg)]
observed_count
<- function(x) {
get_estimated_count get_sig_exposure(x) %>%
mutate(count = rowSums(.[, -1])) %>%
select(sample, count)
}
<- map2_df(ResultX, ResultW, function(x, y) {
count_df <- map(x, get_estimated_count) %>%
countX set_names(paste0(3:10, "_Sigs")) %>%
::rbindlist(idcol = "sig_type")
data.table$method <- "X"
countX<- map(y, get_estimated_count) %>%
countW set_names(paste0(3:10, "_Sigs")) %>%
::rbindlist(idcol = "sig_type")
data.table$method <- "W"
countWrbind(countX, countW)
.id = "cancer_type")
},
<- left_join(count_df, observed_count, by = "sample")
count_df2
save(CrossSim, count_df2, file = "../data/Comparison_Result_Data.RData")
library(ggpubr)
$method <- ifelse(CrossSim$method == "W", "Wang et al method", "Method developed here")
CrossSim
<- ggboxplot(CrossSim %>%
p mutate(
sig_type = stringr::str_replace(sig_type, "_Sigs", "")
),x = "sig_type", y = "sim", fill = "method", facet.by = "cancer_type",
xlab = "Extracted signature number", ylab = "Similarity between each other"
+
) stat_compare_means(aes(group = method, label = ..p.signif..))
p
<- count_df2 %>%
count_summary group_by(cancer_type, sig_type, sample, method) %>%
summarise(
rmse = sqrt(sum((count - n_of_seg)^2) / length(count)),
change_frac = 100 * (rmse / n_of_seg),
.groups = "drop"
)
count_summary
# A tibble: 16,752 x 6
cancer_type sig_type sample method rmse change_frac
<chr> <chr> <chr> <chr> <dbl> <dbl>
1 Breast 10_Sigs SP10084 W 15.0 2.72
2 Breast 10_Sigs SP10084 X 0.000382 0.0000695
3 Breast 10_Sigs SP10150 W 5.89 5.36
4 Breast 10_Sigs SP10150 X 0.000142 0.000129
5 Breast 10_Sigs SP10470 W 5.46 0.540
6 Breast 10_Sigs SP10470 X 3.00 0.297
7 Breast 10_Sigs SP10563 W 25.7 5.64
8 Breast 10_Sigs SP10563 X 4.00 0.879
9 Breast 10_Sigs SP10635 W 1.12 2.29
10 Breast 10_Sigs SP10635 X 0.0000271 0.0000554
# … with 16,742 more rows
<- ggboxplot(count_summary %>%
p mutate(
sig_type = as.integer(stringr::str_replace(sig_type, "_Sigs", ""))
),x = "sig_type", y = "rmse", fill = "method", size = 0.5, width = 0.5, outlier.size = 0.5,
facet.by = "cancer_type", scales = "free_y",
xlab = "Extracted signature number", ylab = "RMSE between observed segments and estimated segments"
+
) stat_compare_means(aes(group = method, label = ..p.signif..))
p
<- ggboxplot(count_summary %>%
p mutate(
sig_type = as.integer(stringr::str_replace(sig_type, "_Sigs", ""))
),x = "sig_type", y = "change_frac", fill = "method", size = 0.5, width = 0.5, outlier.size = 0.5,
facet.by = "cancer_type", scales = "free_y",
xlab = "Extracted signature number", ylab = "Change between observed segments and estimated segments (%)"
+
) stat_compare_means(aes(group = method, label = ..p.signif..))
p
R Session:
Many thanks to authors and contributors of all packages used in this project.
::session_info() devtools
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.0.2 (2020-06-22)
os macOS 10.16
system x86_64, darwin17.0
ui X11
language EN
collate zh_CN.UTF-8
ctype zh_CN.UTF-8
tz Asia/Shanghai
date 2022-01-27
─ Packages ───────────────────────────────────────────────────────────────────
package * version date lib source
abind 1.4-5 2016-07-21 [1] CRAN (R 4.0.2)
annotate 1.66.0 2020-04-28 [1] Bioconductor
AnnotationDbi 1.50.3 2020-07-25 [1] Bioconductor
assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.2)
backports 1.1.10 2020-09-15 [1] CRAN (R 4.0.2)
bayestestR 0.10.5 2021-07-26 [1] CRAN (R 4.0.2)
bibtex 0.4.2.3 2020-09-19 [1] CRAN (R 4.0.2)
Biobase * 2.48.0 2020-04-27 [1] Bioconductor
BiocGenerics * 0.34.0 2020-04-27 [1] Bioconductor
BiocManager 1.30.10 2019-11-16 [1] CRAN (R 4.0.2)
bit 4.0.4 2020-08-04 [1] CRAN (R 4.0.2)
bit64 4.0.5 2020-08-30 [1] CRAN (R 4.0.2)
bitops 1.0-6 2013-08-17 [1] CRAN (R 4.0.2)
blob 1.2.1 2020-01-20 [1] CRAN (R 4.0.2)
bookdown 0.24 2021-09-02 [1] CRAN (R 4.0.2)
brew 1.0-6 2011-04-13 [1] CRAN (R 4.0.2)
broom 0.7.0 2020-07-09 [1] CRAN (R 4.0.2)
bslib 0.2.4 2021-01-25 [1] CRAN (R 4.0.2)
cachem 1.0.4 2021-02-13 [1] CRAN (R 4.0.2)
callr 3.7.0 2021-04-20 [1] CRAN (R 4.0.2)
car 3.0-10 2020-09-29 [1] CRAN (R 4.0.2)
carData 3.0-4 2020-05-22 [1] CRAN (R 4.0.2)
cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.0.2)
circlize 0.4.10 2020-06-15 [1] CRAN (R 4.0.2)
cli 3.1.0 2021-10-27 [1] CRAN (R 4.0.2)
clue 0.3-59 2021-04-16 [1] CRAN (R 4.0.2)
cluster 2.1.0 2019-06-19 [1] CRAN (R 4.0.2)
coda 0.19-4 2020-09-30 [1] CRAN (R 4.0.2)
codetools 0.2-16 2018-12-24 [1] CRAN (R 4.0.2)
cola * 1.4.1 2020-05-04 [1] Bioconductor
colorspace 1.4-1 2019-03-18 [1] CRAN (R 4.0.2)
ComplexHeatmap * 2.4.3 2020-07-25 [1] Bioconductor
correlation * 0.4.0 2020-09-27 [1] CRAN (R 4.0.2)
cowplot * 1.1.0 2020-09-08 [1] CRAN (R 4.0.2)
crayon 1.3.4 2017-09-16 [1] CRAN (R 4.0.2)
crosstalk 1.1.0.1 2020-03-13 [1] CRAN (R 4.0.2)
curl 4.3 2019-12-02 [1] CRAN (R 4.0.1)
data.table 1.13.0 2020-07-24 [1] CRAN (R 4.0.2)
datawizard 0.1.0 2021-06-18 [1] CRAN (R 4.0.2)
DBI 1.1.0 2019-12-15 [1] CRAN (R 4.0.2)
dbplyr 1.4.4 2020-05-27 [1] CRAN (R 4.0.2)
dendextend 1.15.1 2021-05-08 [1] CRAN (R 4.0.2)
desc 1.4.0 2021-09-28 [1] CRAN (R 4.0.2)
devtools 2.4.2 2021-06-07 [1] CRAN (R 4.0.2)
digest 0.6.27 2020-10-24 [1] CRAN (R 4.0.2)
doParallel 1.0.15 2019-08-02 [1] CRAN (R 4.0.2)
dplyr * 1.0.2 2020-08-18 [1] CRAN (R 4.0.2)
DT 0.19 2021-09-02 [1] CRAN (R 4.0.2)
effectsize 0.4.0 2020-10-25 [1] CRAN (R 4.0.2)
ellipsis 0.3.1 2020-05-15 [1] CRAN (R 4.0.2)
emmeans 1.6.3 2021-08-20 [1] CRAN (R 4.0.2)
estimability 1.3 2018-02-11 [1] CRAN (R 4.0.2)
eulerr 6.1.0 2020-03-09 [1] CRAN (R 4.0.2)
evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.1)
ezcox * 0.8.0 2020-09-25 [1] CRAN (R 4.0.2)
factoextra 1.0.7 2020-04-01 [1] CRAN (R 4.0.2)
fansi 0.4.1 2020-01-08 [1] CRAN (R 4.0.2)
farver 2.0.3 2020-01-16 [1] CRAN (R 4.0.2)
fastmap 1.0.1 2019-10-08 [1] CRAN (R 4.0.2)
forcats * 0.5.0 2020-03-01 [1] CRAN (R 4.0.2)
foreach 1.5.1 2020-10-15 [1] CRAN (R 4.0.2)
foreign 0.8-80 2020-05-24 [1] CRAN (R 4.0.2)
forestmodel 0.6.2 2020-07-19 [1] CRAN (R 4.0.2)
fs 1.5.0 2020-07-31 [1] CRAN (R 4.0.2)
furrr 0.2.1 2020-10-21 [1] CRAN (R 4.0.2)
future 1.19.1 2020-09-22 [1] CRAN (R 4.0.2)
genefilter 1.70.0 2020-04-27 [1] Bioconductor
generics 0.0.2 2018-11-29 [1] CRAN (R 4.0.2)
GetoptLong 1.0.5 2020-12-15 [1] CRAN (R 4.0.2)
ggcorrplot 0.1.3 2019-05-19 [1] CRAN (R 4.0.2)
ggforce 0.3.2 2020-06-23 [1] CRAN (R 4.0.2)
ggplot2 * 3.3.5 2021-06-25 [1] CRAN (R 4.0.2)
ggplotify 0.0.5 2020-03-12 [1] CRAN (R 4.0.2)
ggpubr * 0.4.0 2020-06-27 [1] CRAN (R 4.0.2)
ggraph * 2.0.4 2020-11-16 [1] CRAN (R 4.0.2)
ggrepel 0.8.2 2020-03-08 [1] CRAN (R 4.0.2)
ggridges * 0.5.2 2020-01-12 [1] CRAN (R 4.0.2)
ggsci * 2.9 2018-05-14 [1] CRAN (R 4.0.2)
ggsignif 0.6.0 2019-08-08 [1] CRAN (R 4.0.2)
ggwordcloud * 0.5.0 2019-06-02 [1] CRAN (R 4.0.2)
GlobalOptions 0.1.2 2020-06-10 [1] CRAN (R 4.0.2)
globals 0.14.0 2020-11-22 [1] CRAN (R 4.0.2)
glue 1.4.2 2020-08-27 [1] CRAN (R 4.0.2)
graphlayouts 0.7.1 2020-10-26 [1] CRAN (R 4.0.2)
gridBase 0.4-7 2014-02-24 [1] CRAN (R 4.0.2)
gridExtra 2.3 2017-09-09 [1] CRAN (R 4.0.2)
gridGraphics 0.5-0 2020-02-25 [1] CRAN (R 4.0.2)
gtable 0.3.0 2019-03-25 [1] CRAN (R 4.0.2)
haven 2.3.1 2020-06-01 [1] CRAN (R 4.0.2)
highr 0.8 2019-03-20 [1] CRAN (R 4.0.2)
hms 0.5.3 2020-01-08 [1] CRAN (R 4.0.2)
htmltools 0.5.1.1 2021-01-22 [1] CRAN (R 4.0.2)
htmlwidgets 1.5.1 2019-10-08 [1] CRAN (R 4.0.2)
httpuv 1.5.4 2020-06-06 [1] CRAN (R 4.0.2)
httr 1.4.2 2020-07-20 [1] CRAN (R 4.0.2)
igraph 1.2.6 2020-10-06 [1] CRAN (R 4.0.2)
impute 1.62.0 2020-04-27 [1] Bioconductor
insight 0.14.2 2021-06-22 [1] CRAN (R 4.0.2)
IRanges 2.22.2 2020-05-21 [1] Bioconductor
iterators 1.0.12 2019-07-26 [1] CRAN (R 4.0.2)
jquerylib 0.1.3 2020-12-17 [1] CRAN (R 4.0.2)
jsonlite 1.7.2 2020-12-09 [1] CRAN (R 4.0.2)
km.ci 0.5-2 2009-08-30 [1] CRAN (R 4.0.2)
KMsurv 0.1-5 2012-12-03 [1] CRAN (R 4.0.2)
knitr * 1.36 2021-09-29 [1] CRAN (R 4.0.2)
labeling 0.3 2014-08-23 [1] CRAN (R 4.0.2)
later 1.3.0 2021-08-18 [1] CRAN (R 4.0.2)
lattice 0.20-41 2020-04-02 [1] CRAN (R 4.0.2)
lifecycle 1.0.1 2021-09-24 [1] CRAN (R 4.0.2)
listenv 0.8.0 2019-12-05 [1] CRAN (R 4.0.2)
lubridate 1.7.9 2020-06-08 [1] CRAN (R 4.0.2)
magrittr 2.0.1 2020-11-17 [1] CRAN (R 4.0.2)
markdown 1.1 2019-08-07 [1] CRAN (R 4.0.2)
MASS 7.3-51.6 2020-04-26 [1] CRAN (R 4.0.2)
Matrix 1.2-18 2019-11-27 [1] CRAN (R 4.0.2)
matrixStats 0.57.0 2020-09-25 [1] CRAN (R 4.0.2)
mclust 5.4.7 2020-11-20 [1] CRAN (R 4.0.2)
memoise 2.0.0 2021-01-26 [1] CRAN (R 4.0.2)
mgcv 1.8-31 2019-11-09 [1] CRAN (R 4.0.2)
microbenchmark 1.4-7 2019-09-24 [1] CRAN (R 4.0.2)
mime 0.9 2020-02-04 [1] CRAN (R 4.0.2)
modelr 0.1.8 2020-05-19 [1] CRAN (R 4.0.2)
multcomp 1.4-14 2020-09-23 [1] CRAN (R 4.0.2)
munsell 0.5.0 2018-06-12 [1] CRAN (R 4.0.2)
mvtnorm 1.1-1 2020-06-09 [1] CRAN (R 4.0.2)
nlme 3.1-148 2020-05-24 [1] CRAN (R 4.0.2)
NMF 0.23.0 2020-08-01 [1] CRAN (R 4.0.2)
openxlsx 4.2.2 2020-09-17 [1] CRAN (R 4.0.2)
parameters 0.14.0 2021-05-29 [1] CRAN (R 4.0.2)
patchwork * 1.0.1 2020-06-22 [1] CRAN (R 4.0.2)
pheatmap 1.0.12 2019-01-04 [1] CRAN (R 4.0.2)
pillar 1.4.6 2020-07-10 [1] CRAN (R 4.0.2)
pkgbuild 1.2.0 2020-12-15 [1] CRAN (R 4.0.2)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.2)
pkgload 1.2.3 2021-10-13 [1] CRAN (R 4.0.2)
pkgmaker 0.31.1 2020-03-19 [1] CRAN (R 4.0.2)
plyr 1.8.6 2020-03-03 [1] CRAN (R 4.0.2)
png 0.1-7 2013-12-03 [1] CRAN (R 4.0.2)
polyclip 1.10-0 2019-03-14 [1] CRAN (R 4.0.2)
prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.0.2)
processx 3.5.2 2021-04-30 [1] CRAN (R 4.0.2)
promises 1.1.1 2020-06-09 [1] CRAN (R 4.0.2)
ps 1.3.4 2020-08-11 [1] CRAN (R 4.0.2)
purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.0.2)
R.cache 0.14.0 2019-12-06 [1] CRAN (R 4.0.2)
R.methodsS3 1.8.1 2020-08-26 [1] CRAN (R 4.0.2)
R.oo 1.24.0 2020-08-26 [1] CRAN (R 4.0.2)
R.utils 2.10.1 2020-08-26 [1] CRAN (R 4.0.2)
R6 2.5.0 2020-10-28 [1] CRAN (R 4.0.2)
RColorBrewer 1.1-2 2014-12-07 [1] CRAN (R 4.0.2)
Rcpp 1.0.5 2020-07-06 [1] CRAN (R 4.0.2)
RCurl 1.98-1.2 2020-04-18 [1] CRAN (R 4.0.2)
readr * 1.3.1 2018-12-21 [1] CRAN (R 4.0.2)
readxl 1.3.1 2019-03-13 [1] CRAN (R 4.0.2)
registry 0.5-1 2019-03-05 [1] CRAN (R 4.0.2)
rematch2 2.1.2 2020-05-01 [1] CRAN (R 4.0.2)
remotes 2.3.0 2021-04-01 [1] CRAN (R 4.0.2)
reprex 0.3.0 2019-05-16 [1] CRAN (R 4.0.2)
reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.0.2)
rio 0.5.16 2018-11-26 [1] CRAN (R 4.0.2)
rjson 0.2.20 2018-06-08 [1] CRAN (R 4.0.2)
rlang 0.4.10 2020-12-30 [1] CRAN (R 4.0.2)
rmarkdown 2.11 2021-09-14 [1] CRAN (R 4.0.2)
rmdformats * 1.0.2 2021-03-09 [1] Github (juba/rmdformats@982432f)
Rmisc 1.5 2013-10-22 [1] CRAN (R 4.0.2)
rngtools 1.5 2020-01-23 [1] CRAN (R 4.0.2)
rprojroot 1.3-2 2018-01-03 [1] CRAN (R 4.0.2)
RSQLite 2.2.1 2020-09-30 [1] CRAN (R 4.0.2)
rstatix 0.6.0 2020-06-18 [1] CRAN (R 4.0.2)
rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.0.2)
rvcheck 0.1.8 2020-03-01 [1] CRAN (R 4.0.2)
rvest 0.3.6 2020-07-25 [1] CRAN (R 4.0.2)
S4Vectors 0.26.1 2020-05-16 [1] Bioconductor
sandwich 3.0-0 2020-10-02 [1] CRAN (R 4.0.2)
sass 0.3.1 2021-01-24 [1] CRAN (R 4.0.2)
scales 1.1.1 2020-05-11 [1] CRAN (R 4.0.2)
see * 0.6.4 2021-05-29 [1] CRAN (R 4.0.2)
sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.2)
shape 1.4.5 2020-09-13 [1] CRAN (R 4.0.2)
shiny 1.6.0 2021-01-25 [1] CRAN (R 4.0.2)
sigminer * 2.0.3 2021-07-18 [1] CRAN (R 4.0.2)
skmeans 0.2-13 2020-12-03 [1] CRAN (R 4.0.2)
slam 0.1-48 2020-12-03 [1] CRAN (R 4.0.2)
stringi 1.5.3 2020-09-09 [1] CRAN (R 4.0.2)
stringr * 1.4.0 2019-02-10 [1] CRAN (R 4.0.2)
styler 1.3.2 2020-02-23 [1] CRAN (R 4.0.2)
survival * 3.1-12 2020-04-10 [1] CRAN (R 4.0.2)
survminer * 0.4.8 2020-07-25 [1] CRAN (R 4.0.2)
survMisc 0.5.5 2018-07-05 [1] CRAN (R 4.0.2)
testthat 3.1.0 2021-10-04 [1] CRAN (R 4.0.2)
TH.data 1.0-10 2019-01-21 [1] CRAN (R 4.0.2)
tibble * 3.0.3 2020-07-10 [1] CRAN (R 4.0.2)
tidygraph 1.2.0 2020-05-12 [1] CRAN (R 4.0.2)
tidyr * 1.1.2 2020-08-27 [1] CRAN (R 4.0.2)
tidyselect 1.1.0 2020-05-11 [1] CRAN (R 4.0.2)
tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 4.0.2)
tweenr 1.0.1 2018-12-14 [1] CRAN (R 4.0.2)
usethis 2.1.3 2021-10-27 [1] CRAN (R 4.0.2)
utf8 1.1.4 2018-05-24 [1] CRAN (R 4.0.2)
vctrs 0.3.4 2020-08-29 [1] CRAN (R 4.0.2)
viridis 0.5.1 2018-03-29 [1] CRAN (R 4.0.2)
viridisLite 0.3.0 2018-02-01 [1] CRAN (R 4.0.1)
withr 2.4.2 2021-04-18 [1] CRAN (R 4.0.2)
xfun 0.28 2021-11-04 [1] CRAN (R 4.0.2)
XML 3.99-0.5 2020-07-23 [1] CRAN (R 4.0.2)
xml2 1.3.2 2020-04-23 [1] CRAN (R 4.0.2)
xtable 1.8-4 2019-04-21 [1] CRAN (R 4.0.2)
yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.2)
zip 2.1.1 2020-08-27 [1] CRAN (R 4.0.2)
zoo 1.8-8 2020-05-02 [1] CRAN (R 4.0.2)
[1] /Library/Frameworks/R.framework/Versions/4.0/Resources/library