Deep learning reveals the metabolic vulnerability of hypoxic tumor cells and the critical function of FLAD1 mediated tumor hypoxia adaption
Dependencies
library(yardstick)
library(dplyr)
library(ggplot2)
library(ggprism)
library(patchwork)
library(ggpubr)
Network analysis
Sample specific GEMs were download in mat format and convert to xml
format by Matlab. Then we applied the R package Met2Graph to extract
enzyme network from GSMs. In this network, enzymes are the nodes
connected by edges represented by metabolites. Two enzymes are connected
if they catalyze two reactions which produce or consume a specific
metabolite. The code for data format convertion can be found in
code/convert_gems.R
.
We use Graph2Vec, an algorithm designed to learn vector
representations for whole graphs, to compute the embedding vector for
each sample’s enzyme network. Then we use cosine similarity to calculate
the similarity between pairs of enzyme networks (the code for
calculating graph similarity can be found in
code/graph_sim.ipynb
). Based on the Buffa hypoxia score,
samples are divided into hypoxic (score greater than the upper quartile)
and non-hypoxic groups (score less than the lower quartile). We compared
the differences in network similarity between samples with the same
hypoxia status and those with different hypoxia statuses.
<- data.table::fread("/home/data/sdb/wt/enzyme_net_sim.csv",data.table = F)
sim $V1 <- NULL
simrownames(sim) <- colnames(sim)
<- data.table::fread("~/hypoxia_target/data/Pancancer hypoxia scores.txt",
hypo_score data.table = F)
<- hypo_score %>%
hypo_score_summ group_by(tumour_type) %>%
summarise(up_quan = quantile(Buffa_hypoxia_score_intra_tumour_type)[4],
down_quan = quantile(Buffa_hypoxia_score_intra_tumour_type)[2]) %>%
ungroup()
<- hypo_score %>%
hypo_score mutate(patient_id = gsub("[.]","-",patient_id)) %>%
select(patient_id, tumour_type, Buffa_hypoxia_score_intra_tumour_type) %>%
rename(score = Buffa_hypoxia_score_intra_tumour_type) %>%
left_join(.,hypo_score_summ) %>%
rowwise() %>%
mutate(hypo_type = case_when(
< down_quan ~ "no-hypo",
score > up_quan ~ "hypo",
score TRUE ~ "others"
%>% ungroup()
)) #> Joining with `by = join_by(tumour_type)`
<- hypo_score %>% filter(hypo_type != "others")
hypo_score
<- hypo_score$patient_id[which(hypo_score$hypo_type == "hypo")]
hypo_samples <- hypo_score$patient_id[which(hypo_score$hypo_type == "no-hypo")]
no_hypo_samples
<- which(substr(colnames(sim),1,12) %in% hypo_samples)
hypo_idx <- which(substr(colnames(sim),1,12) %in% no_hypo_samples)
no_hypo_idx
<- sim[c(hypo_idx,no_hypo_idx),c(hypo_idx,no_hypo_idx)]
filter_sim $row_sample <- rownames(filter_sim)
filter_sim<- paste0(rownames(filter_sim),"_",colnames(filter_sim))
need_ids
<- filter_sim %>%
filter_sim select(row_sample,everything())
<- filter_sim %>%
filter_sim ::pivot_longer(cols = 2:ncol(filter_sim), names_to = "col_sample",
tidyrvalues_to = "sim")
<- data.frame(
dt ids = c(hypo_samples,no_hypo_samples),
type = c(rep("hypo",length(hypo_samples)),rep("nohypo",length(no_hypo_samples)))
)<- filter_sim %>%
filter_sim mutate(row_id = substr(row_sample,1,12),
col_id = substr(col_sample,1,12)) %>%
left_join(.,
%>% rename(row_id = ids,row_type=type)) %>%
dt left_join(.,dt %>% rename(col_id = ids,col_type=type))
#> Joining with `by = join_by(row_id)`
#> Joining with `by = join_by(col_id)`
<- filter_sim %>%
filter_sim mutate(comb_type = paste0(row_type,"-",col_type))
<- data.frame(
cancer_type ids = c(unique(filter_sim$row_id),unique(filter_sim$col_id))
%>% distinct_all()
) $cancer <- EasyBioinfo::get_cancer_type(cancer_type$ids)
cancer_type
<- filter_sim %>%
filter_sim left_join(.,
%>% rename(row_id = ids,row_cancer=cancer)) %>%
cancer_type left_join(.,cancer_type %>% rename(col_id = ids,col_cancer=cancer))
#> Joining with `by = join_by(row_id)`
#> Joining with `by = join_by(col_id)`
library(igraph)
#>
#> Attaching package: 'igraph'
#>
#> The following objects are masked from 'package:dplyr':
#>
#> as_data_frame, groups, union
#>
#> The following objects are masked from 'package:stats':
#>
#> decompose, spectrum
#>
#> The following object is masked from 'package:base':
#>
#> union
<- graph_from_data_frame(filter_sim %>% select(1,2),directed = F)
dt <- simplify(dt)
dt <- as_data_frame(dt) %>%
dt_unique mutate(ids = paste0(from,"_",to))
<- filter_sim %>%
dt mutate(combid = paste0(row_sample,"_",col_sample)) %>%
filter(combid %in% dt_unique$ids) %>%
filter(row_cancer == col_cancer) %>%
filter(row_id != col_id)
<- dt %>%
dt mutate(type2 = ifelse(col_type != row_type,"Type2","Type1"))
<- ggboxplot(dt,x="type2",y="sim",xlab = F,
p ylab = "Similarity")+
stat_compare_means(label="p.format")
facet(p, facet.by = "row_cancer", nrow = 3)
We also calculated metrics to measure the importance of gene nodes in the enzyme network, including degree centrality, which indicates the number of connections a gene node has with every other gene; betweenness centrality, quantifying the number of times a gene node appears on the shortest path between two other nodes; eigenvector centrality which quantifies a node’s influence in the network based on its connections to other high-scoring gene nodes and closeness centrality, which calculates the length of the shortest path between a gene and all other genes in the network. These metrics were calculated by R package igraph:
###计算所有样本网络的节点重要性得分
<- list.files("/home/data/sdb/wt/MetGraph/EnzGraphs/",
all_files pattern = "tsv")
library(doParallel)
library(foreach)
<- parallel::makeCluster(
my.cluster 60,
type = "PSOCK"
)::registerDoParallel(cl = my.cluster)
doParallel<- foreach(
res i = all_files,
.packages = c("igraph","dplyr")
%dopar% {
) <- paste0("/home/data/sdb/wt/MetGraph/EnzGraphs/",i)
tmp <- gsub("_enzymes_based_graph.tsv","",i)
sample <- data.table::fread(tmp, data.table = F)
dt <- dt %>% dplyr::select(from,to)
dt <- igraph::graph_from_data_frame(dt)
dt_g # Compute the degree centrality for our graph G.
<- igraph::degree(dt_g, v = V(dt_g), mode = "all", normalized = FALSE)
degr_cent <- igraph::degree(dt_g, v = V(dt_g), mode = "all", normalized = TRUE)
norm_degr_cent # Compute the eigenvector centrality of our network
<- igraph::eigen_centrality(dt_g, directed = TRUE)
eign_cent <- eign_cent$vector
eign_cent # Compute the closeness centraility
<- igraph::closeness(dt_g, normalized = FALSE)
clos_cent <- igraph::closeness(dt_g, normalized = TRUE)
norm_clos_cent # Compute betweeness centrality
<- igraph::betweenness(dt_g, directed = TRUE, normalized = FALSE)
betw_cent <- igraph::betweenness(dt_g, directed = TRUE, normalized = TRUE)
norm_betw_cent <- data.frame(vertex = names(V(dt_g)),
all_centrality degree = degr_cent,
norm_degree = norm_degr_cent,
eigen = eign_cent,
closeness = clos_cent,
norm_closeness = norm_clos_cent,
betweeness = betw_cent,
norm_betweeness = norm_betw_cent)
$sample <- sample
all_centrality
all_centrality
}::stopCluster(cl = my.cluster)
parallel
<- bind_rows(res)
res saveRDS(res, file = "/home/data/sdb/wt/all_centrality.rds")
We retrieved the gene-wise vector of centrality measures of each sample sepcific enzyme network and computed the euclidean distance between these vectors of pairwise samples.
####基于(共有)节点重要性分布计算相似性
<- readRDS("/home/data/sdb/wt/all_centrality.rds")
res
<- data.table::fread("~/hypoxia_target/data/Pancancer hypoxia scores.txt",
hypo_score data.table = F)
<- hypo_score %>%
hypo_score_summ group_by(tumour_type) %>%
summarise(up_quan = quantile(Buffa_hypoxia_score_intra_tumour_type)[4],
down_quan = quantile(Buffa_hypoxia_score_intra_tumour_type)[2]) %>%
ungroup()
<- hypo_score %>%
hypo_score mutate(patient_id = gsub("[.]","-",patient_id)) %>%
select(patient_id, tumour_type, Buffa_hypoxia_score_intra_tumour_type) %>%
rename(score = Buffa_hypoxia_score_intra_tumour_type) %>%
left_join(.,hypo_score_summ) %>%
rowwise() %>%
mutate(hypo_type = case_when(
< down_quan ~ "no-hypo",
score > up_quan ~ "hypo",
score TRUE ~ "others"
%>% ungroup()
)) <- hypo_score %>% filter(hypo_type != "others")
hypo_score <- hypo_score %>% filter(hypo_type != "others")
hypo_score <- hypo_score$patient_id[which(hypo_score$hypo_type == "hypo")]
hypo_samples <- hypo_score$patient_id[which(hypo_score$hypo_type == "no-hypo")]
no_hypo_samples
<- res %>% filter(substr(sample,1,12) %in% hypo_score$patient_id)
res
<- function(dt, var){
cal_sim <- dt %>%
dt_met select(vertex,sample,var) %>%
::pivot_wider(names_from = sample, values_from = var) %>%
tidyras.data.frame()
rownames(dt_met) <- dt_met$vertex
$vertex <- NULL
dt_met<- t(dt_met)
dt_met <- dist(dt_met) %>% as.matrix()
dis return(dis)
}
<- cal_sim(dt = res, var = "norm_betweeness")
bet_dis <- cal_sim(dt = res, var = "norm_closeness")
clo_dis <- cal_sim(dt = res, var = "eigen")
eig_dis <- cal_sim(dt = res, var = "norm_degree")
deg_dis
<- list(bet_dis = bet_dis, clo_dis = clo_dis,
sim_res eig_dis = eig_dis, deg_dis = deg_dis)
saveRDS(sim_res, file = "/home/data/sdb/wt/met_net_dist.rds")
The the differences in this network distance between samples with the same hypoxia status and those with different hypoxia statuses were calculated.
rm(list = ls())
gc()
#> used (Mb) gc trigger (Mb) max used (Mb)
#> Ncells 8911763 476.0 24229170 1294.0 21150860 1129.6
#> Vcells 23520092 179.5 162669053 1241.1 203303115 1551.1
<- data.table::fread("~/hypoxia_target/data/Pancancer hypoxia scores.txt",
hypo_score data.table = F)
<- hypo_score %>%
hypo_score_summ group_by(tumour_type) %>%
summarise(up_quan = quantile(Buffa_hypoxia_score_intra_tumour_type)[4],
down_quan = quantile(Buffa_hypoxia_score_intra_tumour_type)[2]) %>%
ungroup()
<- hypo_score %>%
hypo_score mutate(patient_id = gsub("[.]","-",patient_id)) %>%
select(patient_id, tumour_type, Buffa_hypoxia_score_intra_tumour_type) %>%
rename(score = Buffa_hypoxia_score_intra_tumour_type) %>%
left_join(.,hypo_score_summ) %>%
rowwise() %>%
mutate(hypo_type = case_when(
< down_quan ~ "no-hypo",
score > up_quan ~ "hypo",
score TRUE ~ "others"
%>% ungroup()
)) #> Joining with `by = join_by(tumour_type)`
<- hypo_score %>% filter(hypo_type != "others")
hypo_score <- hypo_score %>% filter(hypo_type != "others")
hypo_score <- hypo_score$patient_id[which(hypo_score$hypo_type == "hypo")]
hypo_samples <- hypo_score$patient_id[which(hypo_score$hypo_type == "no-hypo")]
no_hypo_samples
####
<- readRDS("/home/data/sdb/wt/met_net_dist.rds")
sim_res <- sim_res$bet_dis
bet_dis <- sim_res$clo_dis
clo_dis <- sim_res$eig_dis
eig_dis <- sim_res$deg_dis
deg_dis
<- function(sim_dt,hypo,nohypo){
get_sim <- as.data.frame(sim_dt)
sim_dt $row_sample <- rownames(sim_dt)
sim_dt<- sim_dt %>%
sim_dt select(row_sample,everything())
<- sim_dt %>%
sim_dt ::pivot_longer(cols = 2:ncol(sim_dt), names_to = "col_sample",
tidyrvalues_to = "sim")
<- data.frame(
dt ids = c(hypo,nohypo),
type = c(rep("hypo",length(hypo)),rep("nohypo",length(nohypo)))
)<- sim_dt %>%
sim_dt mutate(row_id = substr(row_sample,1,12),
col_id = substr(col_sample,1,12)) %>%
left_join(.,
%>% rename(row_id = ids,row_type=type)) %>%
dt left_join(.,dt %>% rename(col_id = ids,col_type=type))
<- data.frame(
cancer_type ids = c(unique(sim_dt$row_id),unique(sim_dt$col_id))
%>% distinct_all()
) $cancer <- EasyBioinfo::get_cancer_type(cancer_type$ids)
cancer_type
<- sim_dt %>%
sim_dt left_join(.,
%>% rename(row_id = ids,row_cancer=cancer)) %>%
cancer_type left_join(.,cancer_type %>% rename(col_id = ids,col_cancer=cancer))
<- graph_from_data_frame(sim_dt %>% select(1,2),directed = F)
dt <- simplify(dt)
dt <- as_data_frame(dt) %>% mutate(ids = paste0(from,"_",to))
dt_unique
<- sim_dt %>%
dt mutate(combid = paste0(row_sample,"_",col_sample)) %>%
filter(combid %in% dt_unique$ids) %>%
filter(row_cancer == col_cancer) %>%
filter(row_id != col_id)
<- dt %>%
dt mutate(type2 = ifelse(col_type != row_type,"Type2","Type1"))
return(dt)
}
<- get_sim(bet_dis, hypo_samples, no_hypo_samples)
bet_dt #> Joining with `by = join_by(row_id)`
#> Joining with `by = join_by(col_id)`
#> Joining with `by = join_by(row_id)`
#> Joining with `by = join_by(col_id)`
<- get_sim(clo_dis, hypo_samples, no_hypo_samples)
clo_dt #> Joining with `by = join_by(row_id)`
#> Joining with `by = join_by(col_id)`
#> Joining with `by = join_by(row_id)`
#> Joining with `by = join_by(col_id)`
<- get_sim(eig_dis, hypo_samples, no_hypo_samples)
eig_dt #> Joining with `by = join_by(row_id)`
#> Joining with `by = join_by(col_id)`
#> Joining with `by = join_by(row_id)`
#> Joining with `by = join_by(col_id)`
<- get_sim(deg_dis, hypo_samples, no_hypo_samples)
deg_dt #> Joining with `by = join_by(row_id)`
#> Joining with `by = join_by(col_id)`
#> Joining with `by = join_by(row_id)`
#> Joining with `by = join_by(col_id)`
<- ggboxplot(bet_dt,x="type2",y="sim",xlab = F,
p ylab = "Distance", title = "Betweeness")+
stat_compare_means(label="p.format")
facet(p, facet.by = "row_cancer", nrow = 3)
<- ggboxplot(clo_dt,x="type2",y="sim",xlab = F,
p ylab = "Distance", title = "Closeness")+
stat_compare_means(label="p.format")
facet(p, facet.by = "row_cancer", nrow = 3)
<- ggboxplot(eig_dt,x="type2",y="sim",xlab = F,
p ylab = "Distance", title = "Eigenvector")+
stat_compare_means(label="p.format")
facet(p, facet.by = "row_cancer", nrow = 3)
<- ggboxplot(deg_dt,x="type2",y="sim",xlab = F,
p ylab = "Distance", title = "Degree")+
stat_compare_means(label="p.format")
facet(p, facet.by = "row_cancer", nrow = 3)
For each gene, we calculated the difference in the previously mentioned centrality metrics between hypoxic and non-hypoxic tumors.
<- readRDS("/home/data/sdb/wt/all_centrality.rds")
res <- data.table::fread("data/Pancancer hypoxia scores.txt",data.table = F)
hypo_score <- hypo_score %>%
hypo_score_summ group_by(tumour_type) %>%
summarise(up_quan = quantile(Buffa_hypoxia_score_intra_tumour_type)[4],
down_quan = quantile(Buffa_hypoxia_score_intra_tumour_type)[2]) %>%
ungroup()
<- hypo_score %>%
hypo_score mutate(patient_id = gsub("[.]","-",patient_id)) %>%
select(patient_id, tumour_type, Buffa_hypoxia_score_intra_tumour_type) %>%
rename(score = Buffa_hypoxia_score_intra_tumour_type) %>%
left_join(.,hypo_score_summ) %>%
rowwise() %>%
mutate(hypo_type = case_when(
< down_quan ~ "no-hypo",
score > up_quan ~ "hypo",
score TRUE ~ "others"
%>% ungroup()
)) <- hypo_score %>% filter(hypo_type != "others")
hypo_score
<- res %>%
res filter(substr(sample,1,12) %in% hypo_score$patient_id)
<- res %>%
res mutate(patient_id = substr(sample, 1, 12)) %>%
left_join(.,
%>% select(patient_id,hypo_type))
hypo_score <- res %>% group_by(vertex) %>%
gene_summ summarise(counts = length(unique(hypo_type))) %>% ungroup() %>%
filter(counts > 1)
<- res %>%
res filter(vertex %in% gene_summ$vertex)
##
<- unique(res$vertex)
all_genes <- vector("list",length(all_genes))
diff_res for (i in 1:length(diff_res)){
<- res %>% filter(vertex == all_genes[i])
dt <- colnames(dt)[2:8]
metrics <- data.frame(
dt_res metrics = metrics,
p_value = NA,
diff = NA
)for (j in 1:nrow(dt_res)){
<- dt %>% select(dt_res$metrics[j], hypo_type) %>% na.omit()
tmp if (nrow(tmp) < 2){
$p_value[j] <- NA
dt_res$diff[j] <- NA
dt_reselse{
}<- wilcox.test(get(dt_res$metrics[j]) ~ hypo_type, data = dt)
dt_tmp $p_value[j] <- dt_tmp$p.value
dt_res$diff[j] <- median(dt[,dt_res$metrics[j]][which(dt$hypo_type == "hypo")],na.rm = T) - median(dt[,dt_res$metrics[j]][which(dt$hypo_type == "no-hypo")],na.rm = T)
dt_res
}
}$padj <- p.adjust(dt_res$p_value, method = "fdr")
dt_res$gene <- all_genes[i]
dt_res<- dt_res
diff_res[[i]] message("Complete ",i,"\n")
}<- bind_rows(diff_res)
diff_res saveRDS(diff_res, file = "~/hypoxia_target/data/hypo_gene_net_diff.rds")
We defined a gene as having a significant difference if it had an FDR value less than 0.05 (Wilcoxon rank-sum test) and the median of the centrality metric in hypoxic tumor samples was greater than in non-hypoxic tumor samples. We filtered for genes with at least one significantly up regulated centrality metric in hypoxic state and used these genes to perform over representation enrichment analysis of metabolic pathways.
rm(list = ls())
gc()
#> used (Mb) gc trigger (Mb) max used (Mb)
#> Ncells 8913634 476.1 24229170 1294.0 24229170 1294.0
#> Vcells 25391465 193.8 126067738 961.9 203303115 1551.1
<- readRDS("~/hypoxia_target/data/hypo_gene_net_diff.rds")
diff_res <- diff_res %>%
diff_res_sig filter(metrics %in% c("norm_betweeness","norm_closeness","eigen","norm_degree")) %>%
group_by(gene) %>%
summarise(sig_counts = sum(diff > 0 & padj < 0.05)) %>%
ungroup() %>% filter(sig_counts >= 1) ##只要有一个显著
<- unique(diff_res_sig$gene)
sig_genes <- genekitr::transId(sig_genes,"symbol")
trans #>
#> Some ID occurs one-to-many match, like "ENSG00000137843, ENSG00000140521, ENSG00000140650"...
#> 99.39% genes are mapped to symbol
<- trans[!duplicated(trans$input_id),]
trans <- inner_join(
diff_res_sig
diff_res_sig,%>% rename(gene = input_id)
trans
)#> Joining with `by = join_by(gene)`
<- readRDS("~/meta_target/data/kegg_all_pathway.rds")
kegg <- kegg %>%
kegg select(pathway,genes)
colnames(kegg) <- c("gs_name","input_id")
<- genekitr::transId(unique(kegg$input_id), transTo = "entrez")
enz_id #> Some ID occurs one-to-many match, like "A2M, AAAS, ABCB11"...
#> 99.81% genes are mapped to entrezid
<- left_join(kegg,enz_id)
kegg #> Joining with `by = join_by(input_id)`
#> Warning in left_join(kegg, enz_id): Detected an unexpected many-to-many relationship between `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 51 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set `relationship =
#> "many-to-many"` to silence this warning.
<- kegg %>%
kegg select(gs_name,entrezid) %>% rename(entrez_gene = entrezid)
<- genekitr::transId(unique(diff_res_sig$symbol), transTo = "entrez")
enz_id #> Some ID occurs one-to-many match, like "ABL1, ACAT2, ADA2"...
#> 100% genes are mapped to entrezid
<- clusterProfiler::enricher(enz_id$entrezid, TERM2GENE=kegg,
em pvalueCutoff = 1, qvalueCutoff =1) ##use all gene as background
<- em %>% as.data.frame()
em_res
<- readRDS("~/meta_target/data/kegg_all_pathway.rds")
kegg <- kegg %>%
kegg filter(grepl("Metabolism",class) | grepl("metabolism",pathway)) %>%
mutate(pathway = gsub(" \\- Homo sapiens \\(human\\)","",pathway))
<- em_res %>%
em_res mutate(pathway = gsub(" -.+","",ID)) %>%
filter(pathway %in% kegg$pathway)
library(enrichplot)
#>
#> Attaching package: 'enrichplot'
#> The following object is masked from 'package:ggpubr':
#>
#> color_palette
library(ggplot2)
<- filter(em, ID %in% em_res$ID)
em @result$Description <- gsub(" -.+","",em@result$Description)
embarplot(em, showCategory=15) +
scale_y_discrete(labels=function(x) stringr::str_wrap(x, width=40))
#> Scale for y is already present.
#> Adding another scale for y, which will replace the existing scale.
In silico perturbation
The idea is to use the activity of cell death-related gene sets to divide cells into dying cells and non-dying cells based on single-cell data, and then fine-tune the GeneFormer, which is a foundation transformer model pretrained on a large-scale corpus of ~30 million single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology, to predict cell dying state. We then determined the genes whose in silico deletion in hypoxic cells or non-hypoxic cells from non-dying cell state significantly shifted the fine- tuned Geneformer cell embeddings towards the dying cell states:
The raw expression counts matrix for lung cancer single-cell data was downloaded from the corresponding study and processed using the R package Seurat. To only retain high-quality data, we removed all cells that have fewer than 250 genes with mapped reads and contain more than 15% of mitochondrial specific reads. Then, we used TCfinder to predict cancer cells and retained only the cancer cells for downstream analysis.
library(Seurat)
<- readRDS("/home/data/sdb/wt/lung_cancer/RNA_rawcounts_matrix.rds")
lung
###remove cell with high MT
<- CreateSeuratObject(counts=lung)
scRNA_obj "percent_mt"]] <- PercentageFeatureSet(scRNA_obj,
scRNA_obj[[pattern = "^Mt\\.|^MT\\.|^mt\\.|^Mt-|^MT-|^mt-")
<- subset(scRNA_obj,
scRNA_obj subset = nFeature_RNA > 250 & percent_mt < 15)
##cancer cell pre
<- GetAssayData(scRNA_obj, assay="RNA", slot = "counts")
lung <- TCfinder::pathway_score(expr_data = lung,
ps normalized = FALSE)
rownames(ps) <- colnames(lung)
<- callr::r(func = function(conda_env, ps){
res ::use_python(paste0(conda_env,"/bin/python"))
reticulate<- TCfinder::predict_cell(path_score = ps)
predict_result return(predict_result)
args = list(conda_env = "/home/wt/miniconda3/envs/tcfinder/",
},ps = ps))
########
<- res$barcode[which(res$cell_type == "tumor")]
tumor_cells <- lung[,tumor_cells]
tumor saveRDS(tumor,file = "/home/data/sdb/wt/lung_cancer/lung_tumor_cell.rds")
###test
<- ReadMtx(
exp mtx = "/home/data/sdc/wt/single_cell/raw_counts/cell_research_2020/lung/LC_counts/matrix.mtx",
features = "/home/data/sdc/wt/single_cell/raw_counts/cell_research_2020/lung/LC_counts/genes.tsv",
cells = "/home/data/sdc/wt/single_cell/raw_counts/cell_research_2020/lung/LC_counts/barcodes.tsv"
)
###remove cell with high MT
<- CreateSeuratObject(counts=exp)
scRNA_obj "percent_mt"]] <- PercentageFeatureSet(scRNA_obj,
scRNA_obj[[pattern = "^Mt\\.|^MT\\.|^mt\\.|^Mt-|^MT-|^mt-")
<- subset(scRNA_obj,
scRNA_obj subset = nFeature_RNA > 250 & percent_mt < 15)
###find cancer cell
<- GetAssayData(scRNA_obj, assay="RNA", slot = "counts")
lung <- TCfinder::pathway_score(expr_data = exp,
ps normalized = FALSE)
rownames(ps) <- colnames(exp)
<- callr::r(func = function(conda_env, ps){
res ::use_python(paste0(conda_env,"/bin/python"))
reticulate<- TCfinder::predict_cell(path_score = ps)
predict_result return(predict_result)
args = list(conda_env = "/home/wt/miniconda3/envs/tcfinder/",
},ps = ps))
<- res$barcode[which(res$cell_type == "tumor")]
tumor_cells <- exp[,tumor_cells]
tumor
saveRDS(tumor,file = "/home/data/sdb/wt/lung_cancer/lung_tumor_cell_test.rds")
AUCell was used to calculate activity score of hypoxia gene set and cell death related gene set for each cell. Gaussian mixture model (GMM) was used to assign cells into high- and low-score group based on cells’ activity scores.
library(AUCell)
<- readRDS("/home/data/sdb/wt/lung_cancer/lung_tumor_cell.rds")
lung_tumor <- fgsea::gmtPathways("~/CHPF/exmaple/Hypoxia_geneset.gmt")
hypo_pathway <- hypo_pathway %>% unlist() %>% unname()
all_hypo_genes <- list(
all_hypo_genes hypo_sig = all_hypo_genes
)<- AUCell_run(lung_tumor, all_hypo_genes)
cells_AUC
<- AUCell_exploreThresholds(cells_AUC,
cells_assignment plotHist=TRUE, assign=TRUE)
<- getAUC(cells_AUC) %>% t() %>% as.data.frame()
auc_score
###
#GMM clustering
<- Mclust(auc_score[,1], G = 2)
fit_GMM <- list(fit_GMM["classification"])
Cluster $cell <- rownames(auc_score)
auc_score$cluster <- Cluster[[1]][[1]]
auc_scoresaveRDS(auc_score, file = "~/hypoxia_target/data/auc_hypo_cluster.rds")
###death
<- readRDS("/home/data/sdb/wt/lung_cancer/lung_tumor_cell.rds")
lung_tumor <- fgsea::gmtPathways("data/pcd_pathway.gmt")
pcd_pathway <- pcd_pathway %>% unlist() %>% unname()
all_pcd_genes <- list(
all_pcd_genes pcd_sig = all_pcd_genes
)<- AUCell_run(lung_tumor, all_pcd_genes)
cells_AUC
<- AUCell_exploreThresholds(cells_AUC,
cells_assignment plotHist=TRUE, assign=TRUE)
<- getAUC(cells_AUC) %>% t() %>% as.data.frame()
auc_score
#GMM clustering
<- Mclust(auc_score[,1], G = 2)
fit_GMM <- list(fit_GMM["classification"])
Cluster $cell <- rownames(auc_score)
auc_score$cluster <- Cluster[[1]][[1]]
auc_scoresaveRDS(auc_score, file = "~/hypoxia_target/data/auc_pcd_cluster.rds")
For GeneFormer fine-tuning, the data was saved as loom
format:
<- readRDS("/home/data/sdb/wt/lung_cancer/lung_tumor_cell.rds")
lung_tumor ###id 转化
<- rownames(lung_tumor)
id <- genekitr::transId(id, transTo = "ens")
ems_id #> Some ID occurs one-to-many match, like "A2M, A2MP1, AAAS"...
#> 44.48% genes are mapped to ensembl
<- ems_id[!duplicated(ems_id$input_id),]
ems_id <- ems_id[!duplicated(ems_id$ensembl),]
ems_id <- lung_tumor[ems_id$input_id,]
lung_tumor rownames(lung_tumor) <- ems_id$ensembl
<- readRDS("~/hypoxia_target/data/auc_pcd_cluster.rds")
auc_pcd_cluster <- readRDS("~/hypoxia_target/data/auc_hypo_cluster.rds")
auc_hypo_cluster
library(ggpubr)
ggboxplot(auc_pcd_cluster,x="cluster",y="pcd_sig")
ggboxplot(auc_hypo_cluster,x="cluster",y="hypo_sig")
The data from K.H. et al was used as training data, and 30% of the training data was applied to monitor training process and tune hyper-parameters.
<- auc_pcd_cluster %>%
auc_pcd_cluster mutate(pcd_type = ifelse(cluster == 2, "pcd","no_pcd")) %>%
select(cell, pcd_type)
<- auc_hypo_cluster %>%
auc_hypo_cluster mutate(hypo_type = ifelse(cluster == 2, "hypo","no_hypo")) %>%
select(cell, hypo_type)
library(Seurat,lib.loc = "~/seuratV4/")
<- data.frame(
meta cell = colnames(lung_tumor)
%>% inner_join(.,auc_pcd_cluster) %>%
) inner_join(.,auc_hypo_cluster)
<- CreateSeuratObject(counts = lung_tumor)
all_pcd_obj <- all_pcd_obj@meta.data
meta1 $orig.ident <- rownames(meta1)
meta1<- left_join(
meta1 %>% rename(orig.ident = cell)
meta1, meta %>% rename(cell_state_hypo = hypo_type,
) cell_state_pcd = pcd_type) %>% as.data.frame()
rownames(meta1) <- meta1$orig.ident
@meta.data <- meta1
all_pcd_obj###抽样训练集和测试集
set.seed(202403211)
<- rep("Test",ncol(all_pcd_obj))
pcd_idx sample(1:length(pcd_idx),length(pcd_idx)*0.7,replace = FALSE)] <- "Train"
pcd_idx[
$train_test <- pcd_idx
all_pcd_obj
library(SeuratDisk)
SaveLoom(all_pcd_obj,
filename = "/home/data/sdb/wt/pcd_obj.loom",
gene_col_name = "ensembl_id",
overwrite = TRUE)
library(loomR)
<- loomR::connect(filename = "/home/data/sdb/wt/pcd_obj.loom",
lfile mode = "r+",skip.validate = T)
$add.col.attribute(list(n_counts = lfile$col.attrs$nCount_RNA[]),
lfileoverwrite = TRUE)
$add.col.attribute(list(individual = 1:ncol(all_pcd_obj)),
lfileoverwrite = TRUE)
"row_attrs"]]
lfile[["col_attrs"]]
lfile[[$close_all() lfile
The hyperparameter tuning process is implemented by the Ray Tune
framework and used AUC score as evaluation metric. After hyper-parameter
tuning, we set following hyper-parameters: 3.152009e-04 for learning
rate, cosine for learning rate scheduler, 635.01 for warmup steps, 0.279
for weight decay and 24 for batch size. The Fine-tuned model were
independently validated on data sets from Qian, J. et al. The code for
data creation, fine-tuning, training and validation can be found in
script/dataset.ipynb
,
script/fine_tuning.ipynb
,
script/gene_former.ipynb
.
<- data.table::fread("~/hypoxia_target/data/geneformer_preds.csv",
pre data.table = F) %>%
select(-V1)
<- pre %>%
pre mutate(predictions = gsub("\\[|\\]","",predictions)) %>%
::separate_wider_delim(cols = predictions,delim = ", ",
tidyrnames = c("pre1","pre2")) %>%
mutate(pre1 = as.numeric(pre1),
pre2 = as.numeric(pre2))
library(yardstick)
<- pre %>%
pre mutate(truth = ifelse(label_ids == 0, "Class1","Class2"),
predicted = ifelse(pred_ids == 0, "Class1","Class2")) %>%
rename(Class1 = pre1,
Class2 = pre2)
$truth <- factor(pre$truth)
pre
<- pr_auc(pre, truth, Class1)[".estimate"] %>%
pr unlist() %>% unname() %>% round(.,2)
<- roc_auc(pre, truth, Class1)[".estimate"] %>%
roc unlist() %>% unname() %>% round(.,2)
<- roc_curve(pre, truth, Class1) %>%
p1 ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_path() +
coord_fixed(xlim = 0:1, ylim = 0:1) +
theme_bw() +
annotate(geom="text", x=0.75, y=0.7, label=paste0("ROC-AUC: ",roc),
size=5)
p1
In order to do in silico perturb, we merge the training set and test set to increase the sample size:
library(AUCell)
library(mclust)
<- function(exp_path,pathway_path){
cal_score <- readRDS(exp_path)
tumor_exp <- fgsea::gmtPathways(pathway_path)
pathway <- pathway %>% unlist() %>% unname()
all_genes <- list(
all_genes sig = all_genes
)<- AUCell_run(tumor_exp, all_genes)
cells_AUC <- getAUC(cells_AUC) %>% t() %>% as.data.frame()
auc_score <- Mclust(auc_score[,1], G = 2)
fit_GMM <- list(fit_GMM["classification"])
Cluster $cell <- rownames(auc_score)
auc_score$cluster <- Cluster[[1]][[1]]
auc_scorereturn(auc_score)
}##hypo
<- cal_score(exp_path = "/home/data/sdb/wt/lung_cancer/lung_tumor_cell_test.rds",
hypo_score pathway_path = "~/CHPF/exmaple/Hypoxia_geneset.gmt")
saveRDS(hypo_score,file = "~/hypoxia_target/data/hypo_score_test.rds")
##death
<- cal_score(exp_path = "/home/data/sdb/wt/lung_cancer/lung_tumor_cell_test.rds",
death_score pathway_path = "~/hypoxia_target/data/pcd_pathway.gmt")
saveRDS(death_score,file = "~/hypoxia_target/data/pcd_score_test.rds")
##combine train and test data
<- readRDS("/home/data/sdb/wt/lung_cancer/lung_tumor_cell_test.rds")
lung_tumor ###id 转化
<- rownames(lung_tumor)
id <- genekitr::transId(id, transTo = "ens")
ems_id <- ems_id[!duplicated(ems_id$input_id),]
ems_id <- ems_id[!duplicated(ems_id$ensembl),]
ems_id <- lung_tumor[ems_id$input_id,]
lung_tumor rownames(lung_tumor) <- ems_id$ensembl
<- readRDS("~/hypoxia_target/data/pcd_score_test.rds")
auc_pcd_cluster <- readRDS("~/hypoxia_target/data/hypo_score_test.rds")
auc_hypo_cluster <- auc_pcd_cluster %>%
auc_pcd_cluster mutate(pcd_type = ifelse(cluster == 2, "pcd","no_pcd")) %>%
select(cell, pcd_type)
<- auc_hypo_cluster %>%
auc_hypo_cluster mutate(hypo_type = ifelse(cluster == 2, "hypo","no_hypo")) %>%
select(cell, hypo_type)
<- data.frame(
meta cell = colnames(lung_tumor)
%>% inner_join(.,auc_pcd_cluster) %>%
) inner_join(.,auc_hypo_cluster)
<- CreateSeuratObject(counts = lung_tumor)
all_pcd_obj_test <- all_pcd_obj_test@meta.data
meta1 $orig.ident <- rownames(meta1)
meta1<- left_join(
meta1 %>% rename(orig.ident = cell)
meta1, meta %>% rename(cell_state_hypo = hypo_type,
) cell_state_pcd = pcd_type) %>% as.data.frame()
rownames(meta1) <- meta1$orig.ident
@meta.data <- meta1
all_pcd_obj_test###
<- merge(all_pcd_obj,all_pcd_obj_test)
combined library(SeuratDisk)
SaveLoom(combined,
filename = "/home/data/sdb/wt/pcd_obj_comb.loom",
gene_col_name = "ensembl_id",
overwrite = TRUE)
<- loomR::connect(filename = "/home/data/sdb/wt/pcd_obj_comb.loom",
lfile mode = "r+",skip.validate = T)
$add.col.attribute(list(n_counts = lfile$col.attrs$nCount_RNA[]),
lfileoverwrite = TRUE)
$add.col.attribute(list(individual = 1:ncol(combined)),
lfileoverwrite = TRUE)
"row_attrs"]]
lfile[["col_attrs"]]
lfile[[$close_all() lfile
In silico perturbation was achieved by removing the given gene from
the rank value encoding of the given single-cell transcriptome and
quantifying the cosine similarity between the original and perturbed
cell embeddings to determine the predicted deleterious impact of
deleting that gene in that cell. This impact was compared with the
random distribution drawn from the other genes to calculate p value and
corresponding FDR. The code for perturbation can be found in
script/in_silico_perturbation.ipynb
.
We first performed in silico perturbation on all single cells, identified genes whose deletion could significantly shift cells from the non-dying state to the dying state, and compared these genes with the dependent genes obtained by CRIPSR.
<- data.table::fread("~/hypoxia_target/data/all_pcd_res_all_cell.csv",
res data.table = F)
<- res %>%
res mutate(type = ifelse(Shift_to_goal_end > 0 & Goal_end_FDR < 0.0001,
"Hit","Not-Hit"))
<- read.csv("/home/data/sdb/wt/Model_2023Q4.csv")
cell_info <- cell_info %>%
cell_info filter((OncotreeLineage != "Normal") & (OncotreePrimaryDisease != "Non-Cancerous"))
<- cell_info %>% filter(OncotreeLineage == "Lung") %>%
lung_cell filter(OncotreeSubtype %in% c("Lung Adenocarcinoma","Non-Small Cell Lung Cancer",
"Lung Squamous Cell Carcinoma",
"Lung Adenosquamous Carcinoma"))
<- data.table::fread("/home/data/sdb/wt/CRISPRGeneDependency_2023Q4.csv",
dep_dt data.table = F)
colnames(dep_dt)[2:ncol(dep_dt)] <- gsub("\\s*\\([^\\)]+\\)","",
colnames(dep_dt)[2:ncol(dep_dt)])
<- dep_dt %>%
dep_dt ::pivot_longer(cols = colnames(dep_dt)[2:ncol(dep_dt)],names_to = "gene",
tidyrvalues_to = "score")
<- dep_dt %>% filter(V1 %in% lung_cell$ModelID) %>%
lung_dep filter(!is.na(score))
<- lung_dep %>% filter(gene %in% res$Gene_name)
dt <- inner_join(dt, res %>% rename(gene = Gene_name) %>% select(gene, type))
dt #> Joining with `by = join_by(gene)`
<- dt %>% group_by(gene) %>%
dt_summ summarise(pos_cell = sum(score > 0.8),
type = unique(type)) %>% ungroup()
<- dt_summ %>% mutate(depmap_type = case_when(
dt_summ > 20 ~ "yes",
pos_cell TRUE ~ "no"
))
###
library(ggprism)
library(ggplot2)
<- dt_summ %>%
df group_by(depmap_type,type) %>%
summarise(gene_counts = length(unique(gene))) %>%
ungroup() %>%
mutate(type = ifelse(type == "Hit","In Silico Perturbation Hit",
"In Silico Perturbation Not Hit"))
#> `summarise()` has grouped output by 'depmap_type'. You can override using the
#> `.groups` argument.
table(dt_summ$type,dt_summ$depmap_type) %>% chisq.test()
#>
#> Pearson's Chi-squared test with Yates' continuity correction
#>
#> data: .
#> X-squared = 731.72, df = 1, p-value < 2.2e-16
ggplot(data=df,aes(x=depmap_type,y=gene_counts,fill=type))+
geom_bar(stat = "identity",position="fill")+
theme_prism()+
labs(y="Percent of cases (%)",title = "Chi-squared test, P < 2.2e-16")+
scale_fill_manual(values=c("#FE0000","#00FDFE"))+
scale_x_discrete(labels=c("CRIPSR Not Hit","CRIPSR Hit"))+
theme(axis.title.x = element_blank())
ggplot(data=dt_summ, aes(x=pos_cell, group=type, fill=type)) +
geom_density(adjust=1.5, alpha=.4) +
theme_prism()+
labs(x="Counts of CRPSPR Positive Cell",y="Density")
We filtered the results of in silico perturbation by the following criteria: 1. Only retain metabolism-related genes (KEGG),2. The effect value obtained by delete the gene under hypoxia state was > 0, and the FDR was < 0.05 (the effect value refers to the magnitude of cell embedding shift from the non-dying state to the dying state after delete the specific gene), 3. Fold change (absolute value) of the effect size between hypoxic and non-hypoxic state was > 1, 4. Based on TCGA data, fold change of mRNA expression between hypoxic and non-hypoxic tumors was > 1 and corresponding FDR was < 0.05.
<- data.table::fread("~/hypoxia_target/data/0331_pcd_hypo_res.csv",
hypo_res data.table = F)
<- data.table::fread("~/hypoxia_target/data/0331pcd_no_hypo_res.csv",
no_hypo_res data.table = F)
<- inner_join(
all_res %>% select(Gene_name,Shift_to_goal_end,Goal_end_vs_random_pval,
hypo_res %>%
Goal_end_FDR) filter(nchar(Gene_name) > 0),
%>% select(Gene_name,Shift_to_goal_end) %>%
no_hypo_res rename(Shift_no_hypo = Shift_to_goal_end) %>%
filter(nchar(Gene_name) > 0)
%>%
) mutate(fc = Shift_to_goal_end/Shift_no_hypo)
<- readRDS("~/meta_target/data/kegg_all_pathway.rds")
kegg_all_pathway <- kegg_all_pathway %>%
kegg_all_pathway filter(grepl("Metabolism",class) | grepl("metabolism",pathway)) %>%
mutate(pathway = gsub(" \\- Homo sapiens \\(human\\)","",pathway))
<- unique(kegg_all_pathway$genes)
kegg_genes
<- all_res %>%
all_res_meta filter(Gene_name %in% kegg_genes) %>%
filter(Shift_to_goal_end > 0 & Goal_end_FDR < 0.05) %>%
mutate(abs_fc = abs(fc)) %>%
arrange(Goal_end_FDR, desc(abs_fc)) %>%
mutate(stat= (-log10(Goal_end_FDR)) * abs_fc) %>%
arrange(desc(stat))
saveRDS(all_res_meta,file = "~/hypoxia_target/data/all_res_meta0331.rds")
#####分析这些基因的表达
<- all_res_meta %>%
all_res_meta filter(abs_fc > 1)
<- genekitr::transId(all_res_meta$Gene_name, transTo = "ens")
ems_id <- left_join(
all_res_meta
all_res_meta,%>% rename(Gene_name = input_id)
ems_id
)<- data.table::fread("/home/data/sdb/wt/gencode.v22.annotation.gene.probeMap",data.table = F)
mapping <- mapping %>%
mapping filter(gsub("[.].+","",id) %in% ems_id$ensembl)
<- left_join(
all_res_meta
all_res_meta,%>% select(gene,id) %>%
mapping mutate(id2 = gsub("[.].+","",id)) %>%
rename(ensembl = id2)
)<- all_res_meta %>%
all_res_meta filter(!is.na(id))
<- all_res_meta %>%
dup_gene group_by(Gene_name) %>%
summarise(counts = n()) %>% ungroup() %>% filter(counts > 1)
<- all_res_meta %>%
all_res_meta filter(!(Gene_name %in% dup_gene$Gene_name) | ((Gene_name %in% dup_gene$Gene_name) & (Gene_name == gene)))
<- data.table::fread("/home/data/sdb/wt/TCGA-LUAD.htseq_fpkm-uq.tsv.gz",
luad data.table = F)
<- luad %>% filter(Ensembl_ID %in% all_res_meta$id)
luad rownames(luad) <- luad$Ensembl_ID
$Ensembl_ID <- NULL
luad<- data.table::fread("/home/data/sdb/wt/TCGA-LUSC.htseq_fpkm-uq.tsv.gz",
lusc data.table = F)
<- lusc %>% filter(Ensembl_ID %in% all_res_meta$id)
lusc rownames(lusc) <- lusc$Ensembl_ID
$Ensembl_ID <- NULL
lusc
<- c(colnames(luad),colnames(lusc))
all_samples <- all_samples[as.numeric(substr(all_samples,14,15)) < 11] %>% na.omit()
all_cancers
<- bind_cols(luad,lusc)
all_cancer_exp <- all_cancer_exp[,all_cancers]
all_cancer_exp <- t(all_cancer_exp) %>% as.data.frame()
all_cancer_exp <- apply(all_cancer_exp,2,function(x){(2^x)-1}) %>% as.data.frame()
all_cancer_exp
<- data.table::fread("data/Pancancer hypoxia scores.txt",data.table = F)
hypo_score <- hypo_score %>%
hypo_score mutate(patient_id = gsub("[.]","-",patient_id)) %>%
filter(tumour_type %in% c("LUAD","LUSC")) %>%
select(patient_id, tumour_type, Buffa_hypoxia_score_intra_tumour_type) %>%
mutate(type = case_when(
== "LUAD" & Buffa_hypoxia_score_intra_tumour_type > 21 ~ "hypo",
tumour_type == "LUAD" & Buffa_hypoxia_score_intra_tumour_type < (-21) ~ "normal",
tumour_type == "LUSC" & Buffa_hypoxia_score_intra_tumour_type > 15 ~ "hypo",
tumour_type == "LUSC" & Buffa_hypoxia_score_intra_tumour_type < (-13) ~ "normal",
tumour_type TRUE ~ "others"
))$sample <- substr(rownames(all_cancer_exp),1,12)
all_cancer_exp<- left_join(
all_cancer_exp
all_cancer_exp,%>% rename(sample = patient_id) %>% select(sample,type)
hypo_score
)<- all_cancer_exp %>% select(sample,type,everything())
all_cancer_exp <- all_cancer_exp %>% filter(type != "others")
all_cancer_exp <- data.frame(ids = colnames(all_cancer_exp)[3:ncol(all_cancer_exp)],
diff_res exp_fc = NA, p_value = NA)
for (i in 1:nrow(diff_res)){
<- all_cancer_exp %>%
dt select(type,diff_res$ids[i]) %>%
rename(exp = 2)
$type <- factor(dt$type,levels = c("hypo","normal"))
dt<- wilcox.test(exp ~ type, data=dt)
dt_res $exp_fc[i] <- mean(dt$exp[dt$type == "hypo"]) / mean(dt$exp[dt$type == "normal"])
diff_res$p_value[i] <- dt_res$p.value
diff_res
}
<- left_join(
all_res_meta
all_res_meta,%>% rename(id = ids)
diff_res
)$exp_diff_fdr <- p.adjust(all_res_meta$p_value,"fdr")
all_res_meta
saveRDS(all_res_meta, file = "~/hypoxia_target/data/filter_target0331.rds")
For each gene we define differential expression statistics and perturbation statistics:Differential Expression Stat = (-log10(Exp-FDR)) * Exp_FC; Perturbation Stat = (-log10(Perturbation-FDR)) * Abs_FC, and sort the genes according to these two statistics.
<- readRDS("~/hypoxia_target/data/filter_target0331.rds")
all_res_meta <- all_res_meta %>%
tt filter(exp_diff_fdr < 0.05 & exp_fc > 1) %>%
mutate(pertur_pre_stat = (-log10(Goal_end_FDR)) * abs_fc) %>%
mutate(exp_pre_stat = (-log10(exp_diff_fdr)) * exp_fc) %>%
mutate(pertur_stat = log10(pertur_pre_stat),
exp_stat = log10(exp_pre_stat)) %>%
arrange(desc(pertur_stat),desc(exp_stat))
library(ggtext)
<- tt %>% select(Gene_name,exp_stat,pertur_stat) %>%
dt rename(`Differential Expression Stat`=exp_stat,
`Perturbation Stat`=pertur_stat) %>%
::pivot_longer(cols = c("Differential Expression Stat",
tidyr"Perturbation Stat"),
names_to = "Score Type",values_to = "score")
ggbarplot(dt,x="Gene_name",y="score",fill = "Score Type",
position = position_dodge(0.9),xlab = F,ylab = "log10(Score)",
lab.size = 2) +
theme(axis.text.x = element_markdown(angle = 90, vjust = 0.5, hjust = 1))
Finally, we obtained 110 metabolic genes that met the criterias. We performed metabolic pathway over representation enrichment analysis on these genes and found that the most significantly enriched metabolic pathway was oxidative phosphorylation.
<- readRDS("~/meta_target/data/kegg_all_pathway.rds")
kegg <- kegg %>%
kegg select(pathway,genes)
colnames(kegg) <- c("gs_name","input_id")
<- genekitr::transId(unique(kegg$input_id), transTo = "entrez")
enz_id #> Some ID occurs one-to-many match, like "A2M, AAAS, ABCB11"...
#> 99.81% genes are mapped to entrezid
<- left_join(kegg,enz_id)
kegg #> Joining with `by = join_by(input_id)`
#> Warning in left_join(kegg, enz_id): Detected an unexpected many-to-many relationship between `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 51 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set `relationship =
#> "many-to-many"` to silence this warning.
<- kegg %>%
kegg select(gs_name,entrezid) %>% rename(entrez_gene = entrezid)
<- genekitr::transId(unique(tt$Gene_name), transTo = "entrez")
enz_id #> Some ID occurs one-to-many match, like "B3GALNT2, CES1, CHURC1-FNTB"...
#> 100% genes are mapped to entrezid
<- clusterProfiler::enricher(enz_id$entrezid, TERM2GENE=kegg,
em pvalueCutoff = 1, qvalueCutoff =1)##use all gene as background
<- em %>% as.data.frame()
em_res
<- readRDS("~/meta_target/data/kegg_all_pathway.rds")
kegg <- kegg %>%
kegg filter(grepl("Metabolism",class) | grepl("metabolism",pathway)) %>%
mutate(pathway = gsub(" \\- Homo sapiens \\(human\\)","",pathway))
<- em_res %>%
em_res mutate(pathway = gsub(" -.+","",ID)) %>%
filter(pathway %in% kegg$pathway) %>%
filter(p.adjust < 0.05)
library(enrichplot)
library(ggplot2)
<- filter(em, ID %in% em_res$ID)
em @result$Description <- gsub(" -.+","",em@result$Description)
embarplot(em, showCategory=15) +
scale_y_discrete(labels=function(x) stringr::str_wrap(x, width=40))
#> Scale for y is already present.
#> Adding another scale for y, which will replace the existing scale.
Subsequently, we extracted genes that have protein interactions with oxidative phosphorylation pathway genes from the STRING database to form a protein-protein interaction network.
library(rbioapi)
<- readRDS("~/meta_target/data/kegg_all_pathway.rds")
kegg <- kegg %>%
kegg filter(grepl("Metabolism",class) | grepl("metabolism",pathway)) %>%
mutate(pathway = gsub(" \\- Homo sapiens \\(human\\)","",pathway))
<- kegg %>% filter(pathway == "Oxidative phosphorylation")
oxi <- oxi %>% select(genes)
oxi <- oxi$genes
proteins ## map our protein IDs
<- rba_string_map_ids(ids = proteins, species = 9606)
proteins_mapped <- rba_string_interaction_partners(ids = proteins_mapped$stringId,
int_partners species = 9606,
required_score = 700)
saveRDS(int_partners, file = "~/hypoxia_target/data/ppi_oxi.rds")
The network has a total of 413 gene nodes, of which 33 genes are in our screening list. We extracted the smallest subgraph connecting these 33 gene nodes by minimum spanning tree based approximation (KB) algorithm.
<- readRDS("~/hypoxia_target/data/filter_target0331.rds")
all_res_meta <- readRDS("~/hypoxia_target/data/ppi_oxi.rds")
ppi_oxi <- all_res_meta %>%
all_res_meta filter(exp_diff_fdr < 0.05 & exp_fc > 1) %>%
mutate(pertur_stat = (-log10(Goal_end_FDR)) * log10(abs_fc)) %>%
mutate(exp_stat = (-log10(exp_diff_fdr)) * log10(exp_fc)) %>%
arrange(desc(pertur_stat),desc(exp_stat))
<- ppi_oxi %>%
dt filter(score > 0.9) %>%
select(preferredName_A, preferredName_B)
<- data.frame(
dt_in genes = c(unique(c(dt$preferredName_A,dt$preferredName_B)))
%>%
) mutate(Type = ifelse(genes %in% all_res_meta$Gene_name,"yes","no"))
$Type <- factor(dt_in$Type, levels = c("yes","no"))
dt_in
library(ggnetwork)
#> Registered S3 method overwritten by 'ggnetwork':
#> method from
#> fortify.igraph ggtree
<- simplify(graph_from_data_frame(d = dt,
graph vertices = dt_in,
directed = FALSE))
<- ggnetwork::ggnetwork(graph, arrow.gap = 0,
n layout= igraph::layout_with_fr(graph))
ggplot(n, aes(x = .data$x, y = .data$y,
xend = .data$xend,
yend = .data$yend)) +
geom_edges(color = "grey75",
alpha = 0.5,
curvature = 0,
show.legend = FALSE) +
geom_nodes(aes(color = .data$Type)) +
::theme_blank() ggnetwork
####提取子图
library(igraph)
library(SteinerNet)
library(qgraph)
<- graph_from_data_frame(dt)
ppi_g <- dt_in$genes[which(dt_in$Type == "yes")]
in_genes <- steinertree(type = "KB", terminals = in_genes,
out graph = ppi_g, color = FALSE)
<- as_data_frame(out[[1]])
out_dt ##plot
<- graph_from_data_frame(out_dt,directed = F)
out_g
<- readRDS("~/meta_target/data/kegg_all_pathway.rds")
kegg <- kegg %>%
kegg filter(grepl("Metabolism",class) | grepl("metabolism",pathway)) %>%
mutate(pathway = gsub(" \\- Homo sapiens \\(human\\)","",pathway))
<- kegg %>% filter(pathway == "Oxidative phosphorylation")
oxi <- oxi %>% select(genes)
oxi <- in_genes[which((in_genes %in% oxi$genes))]
oxi_gene <- in_genes[which(!(in_genes %in% oxi$genes))]
not_oxi_gene V(out_g)[oxi_gene]$color <- "#11325D"
V(out_g)[not_oxi_gene]$color <- "#F5A673"
<- get.edgelist(out_g,names=FALSE)
e <- qgraph.layout.fruchtermanreingold(e,vcount=vcount(out_g))
l par(mar=c(0,0,0,0)+.1)
plot(out_g,vertex.size=8, edge.arrow.size=0.3,
vertex.label = V(out_g)$name,
vertex.label.dist=1.5,layout=l, vertex.label.cex = 0.8)
We found that there are two genes in this subgraph that are not genes in the oxidative phosphorylation pathway (FLAD1 and SUCLG1). Since oxidative phosphorylation is also important in normal cells, these two genes may be potential hypoxia targets.
<- readRDS("~/hypoxia_target/data/filter_target0331.rds")
all_res_meta <- all_res_meta %>%
tt filter(Gene_name %in% in_genes) %>%
filter(exp_diff_fdr < 0.05 & exp_fc > 1) %>%
mutate(pertur_pre_stat = (-log10(Goal_end_FDR)) * abs_fc) %>%
mutate(exp_pre_stat = (-log10(exp_diff_fdr)) * exp_fc) %>%
mutate(pertur_stat = log10(pertur_pre_stat),
exp_stat = log10(exp_pre_stat)) %>%
arrange(desc(pertur_stat),desc(exp_stat))
<- tt %>% select(Gene_name,exp_stat,pertur_stat) %>%
dt rename(`Differential Expression Stat`=exp_stat,
`Perturbation Stat`=pertur_stat) %>%
::pivot_longer(cols = c("Differential Expression Stat",
tidyr"Perturbation Stat"),
names_to = "Score Type",values_to = "score")
<- "FLAD1"
x_to_highlight ggbarplot(dt,x="Gene_name",y="score",fill = "Score Type",
position = position_dodge(0.9),xlab = F,ylab = "log10(Score)") +
theme(axis.text.x = element_markdown(angle = 90, vjust = 0.5, hjust = 1))+
scale_x_discrete(labels = ~ case_when(
%in% x_to_highlight ~ paste0("<span style='color: red'><b>", .x, "</b></span>"),
.x TRUE ~ .x
))
We further compared the expression differences of these genes in tumors and normal tissues:
<- readRDS("~/hypoxia_target/data/filter_target0331.rds")
all_res_meta <- all_res_meta %>%
tt filter(Gene_name %in% in_genes) %>%
filter(exp_diff_fdr < 0.05 & exp_fc > 1) %>%
mutate(pertur_pre_stat = (-log10(Goal_end_FDR)) * abs_fc) %>%
mutate(exp_pre_stat = (-log10(exp_diff_fdr)) * exp_fc) %>%
mutate(pertur_stat = log10(pertur_pre_stat),
exp_stat = log10(exp_pre_stat)) %>%
arrange(desc(pertur_stat),desc(exp_stat))
<- data.table::fread("/home/data/sdb/wt/TCGA-LUAD.htseq_fpkm-uq.tsv.gz",
luad data.table = F)
<- luad %>% filter(Ensembl_ID %in% tt$id)
luad rownames(luad) <- luad$Ensembl_ID
$Ensembl_ID <- NULL
luad<- data.table::fread("/home/data/sdb/wt/TCGA-LUSC.htseq_fpkm-uq.tsv.gz",
lusc data.table = F)
<- lusc %>% filter(Ensembl_ID %in% tt$id)
lusc rownames(lusc) <- lusc$Ensembl_ID
$Ensembl_ID <- NULL
lusc
<- bind_cols(luad,lusc)
lung_exp <- t(lung_exp) %>% as.data.frame()
lung_exp $sample <- rownames(lung_exp)
lung_exp<- lung_exp %>%
lung_exp mutate(sample_type = ifelse(as.numeric(substr(sample,14,15)) < 11,"cancer","normal"))
<- lung_exp %>% select(sample,sample_type,everything())
lung_exp <- lung_exp %>% tidyr::pivot_longer(cols = 3:ncol(lung_exp),
lung_exp names_to = "gene",
values_to = "exp")
<- left_join(
lung_exp
lung_exp,%>% select(id,Gene_name) %>% rename(gene=id)
tt
)#> Joining with `by = join_by(gene)`
<- data.frame(ids = unique(lung_exp$Gene_name),
diff_res p_value = NA)
for (i in 1:nrow(diff_res)){
<- lung_exp %>%
dt select(sample_type,Gene_name,exp) %>%
filter(Gene_name == diff_res$ids[i])
$sample_type <- factor(dt$sample_type,levels = c("cancer","normal"))
dt<- wilcox.test(exp ~ sample_type, data=dt)
dt_res $p_value[i] <- dt_res$p.value
diff_res
}<- diff_res %>% arrange(p_value)
diff_res <- diff_res %>%
diff_res mutate(group1 = "cancer", group2 = "normal") %>%
rename(Gene_name = ids) %>%
mutate(y.position = 25)
$p.adj <- p.adjust(diff_res$p_value,"fdr")
diff_res$p.adj <- format(diff_res$p.adj,digits = 2)
diff_res
$Gene_name <- factor(lung_exp$Gene_name,levels = diff_res$Gene_name)
lung_exp$sample_type <- factor(lung_exp$sample_type,levels = c("normal","cancer"))
lung_exp
library(ggprism)
ggplot(lung_exp, aes(x = Gene_name, y = exp)) +
geom_boxplot(aes(fill = sample_type)) +
add_pvalue(diff_res, x = "Gene_name")+
theme_prism()+
rotate_x_text(90)+
labs(y="log2(FPKM-UQ + 1)")+
theme(axis.title.x = element_blank())