R script to:
(1) Map gene names to KEGG pathways. The code produces a table with the gene name in one column and the KEGG pathway ID in another column (gen2path).
(2) Map gene names to KEGG modules (gene2module).
Several intermediate datasets that might be of interest are also generated along the way, such as a dataset of modules, their definitions, and the pathways each module is involved in (keggModule).

```r
library(org.Hs.eg.db)
library(dplyr)
library(tidyr)
library(jsonlite)
library(KEGGREST)
# Look up the ENTREZ ID (plus symbol and gene name) for every gene symbol.
# snv$Gene is a data-frame column filled with gene SYMBOLs; `snv` is not
# created in this section and must already exist.
#
# AnnotationDbi::select() is called by its full name on purpose: dplyr is
# attached after org.Hs.eg.db, so a bare select() resolves to dplyr::select(),
# which has no method for an OrgDb object.
gn <- AnnotationDbi::select(
  org.Hs.eg.db,
  keys = unique(snv$Gene),
  columns = c("ENTREZID", "SYMBOL", "GENENAME"),
  keytype = "SYMBOL"
)
# KEGG gene IDs are the ENTREZ ID prefixed with "hsa:"; symbols without an
# ENTREZ ID get NA.
gn$keggGeneID <- ifelse(!is.na(gn$ENTREZID), paste0("hsa:", gn$ENTREZID), NA)
# Retrieve the KEGG pathways annotated for one KEGG gene ID.
#
# Args:
#   gene:  a single KEGG gene identifier such as "hsa:7157". An NA identifier
#          is not sent to KEGG; it yields one row of "N/A" placeholders.
#   delay: seconds to pause after each request (default 2), applied whether
#          the request succeeded or failed.
#
# Returns:
#   A data frame with columns gene_id, kegg_id and pathway_name: one row per
#   pathway, or a single row with kegg_id = pathway_name = "N/A" when the gene
#   has no PATHWAY entry or the lookup fails. In both of those cases gene_id
#   still holds the queried identifier, so failed genes can be found and
#   retried.
#
# Side effects: prints progress / error lines with cat(); performs a network
# request through KEGGREST::keggGet().
getPathway <- function(gene, delay = 2) {
  placeholderRow <- function(id) {
    data.frame(gene_id = id, kegg_id = "N/A", pathway_name = "N/A")
  }

  # Nothing to query for a missing identifier; skip the request entirely.
  if (length(gene) == 1L && is.na(gene)) {
    return(placeholderRow("N/A"))
  }

  # KEGGREST is distributed through Bioconductor, so install.packages() cannot
  # provide it; fail loudly rather than letting every lookup error out.
  if (!requireNamespace("KEGGREST", quietly = TRUE)) {
    stop(
      "Package 'KEGGREST' is required; install it with ",
      "BiocManager::install(\"KEGGREST\").",
      call. = FALSE
    )
  }

  cat("Fetching pathway for gene:", gene, "\n")
  # Pause between requests even when the current one fails.
  on.exit(Sys.sleep(delay), add = TRUE)

  tryCatch({
    q <- KEGGREST::keggGet(gene)
    pathway <- q[[1]][["PATHWAY"]]
    if (is.null(pathway)) {
      placeholderRow(gene)
    } else {
      # names(pathway) are the pathway IDs, the values are the pathway names
      data.frame(
        gene_id = rep(gene, length(pathway)),
        kegg_id = names(pathway),
        pathway_name = pathway
      )
    }
  }, error = function(e) {
    cat("Error fetching pathway for gene:", gene, "Error message:", e$message, "\n")
    placeholderRow(gene)
  })
}
# Query every distinct KEGG gene ID and stack the per-gene tables row-wise.
keggPaths <- dplyr::bind_rows(
  lapply(unique(gn$keggGeneID), getPathway)
)
# Retrieving KEGG module data (htext ko00002) as JSON
url <- "https://www.genome.jp/kegg-bin/download_htext?htext=ko00002&format=json&filedir="
keggJsonFile <- path.expand("~/keggM.json")
# Default download method is used so no external `curl` binary is required;
# mode = "wb" keeps the file byte-identical on every platform.
downloadStatus <- download.file(url, destfile = keggJsonFile, mode = "wb")
if (downloadStatus != 0) {
  stop("Download of the KEGG module file failed (status ", downloadStatus, ").",
       call. = FALSE)
}
# reading json: parse the copy that was just saved instead of fetching the
# URL a second time
document <- fromJSON(txt = keggJsonFile)
# parsing json: row-bind the top-level elements of the parsed document
df <- data.frame(Reduce(rbind, document))
# Flatten one branch of the parsed module hierarchy into a single data frame.
#
# Args:
#   categories: data frame whose first column holds a category name and whose
#               second column is a list of data frames (the sub-categories);
#               each sub-category in turn holds, in its second column, a list
#               of data frames with a `name` column (the modules).
#
# Returns:
#   The module rows of every sub-category stacked together, with added columns
#     module - first 7 characters of `name`
#     path   - text found between "[" and "]" in `name`, without "PATH:"
#              (NA when `name` has no bracketed part)
#     p1Path - name of the sub-category the module sits in
#     p2Path - name of the category that sub-category sits in
#   An empty data frame is returned when there is nothing to flatten.
#
# Each module is emitted exactly once and labelled with its own parent
# category. (Walking the tree once per category, as an extra outer loop would
# do, repeats every module and pairs it with unrelated category names.)
flattenModules <- function(categories) {
  perCategory <- lapply(seq_len(NROW(categories)), function(i) {
    subCategories <- categories[[2]][[i]]
    perSubCategory <- lapply(seq_len(NROW(subCategories)), function(j) {
      modules <- subCategories[[2]][[j]]
      if (NROW(modules) == 0L) {
        return(NULL)
      }
      modules$module <- substr(modules$name, 1, 7)
      modules$path <- stringr::str_extract(string = modules$name, pattern = "(?<=\\[)[^{}]+(?=\\])")
      modules$path <- sub("PATH:", "", modules$path)
      modules$p1Path <- subCategories[j, 1]
      modules$p2Path <- categories[i, 1]
      modules
    })
    do.call(rbind, perSubCategory)
  })
  # The leading empty data frame guarantees a data-frame result even when
  # every piece is NULL; rbind ignores zero-length arguments.
  do.call(rbind, c(list(data.frame()), perCategory))
}

# pathway modules (second row of the parsed document)
pathMod <- df[2, 2]
pathModDF <- data.frame(Reduce(rbind, pathMod))
pathway_modules <- flattenModules(pathModDF)

# signature modules (third row of the parsed document)
sigMod <- df[3, 2]
sigModDF <- data.frame(Reduce(rbind, sigMod))
sig_modules <- flattenModules(sigModDF)

# tag each table with its origin and combine them
pathway_modules$module_type <- "pathway"
sig_modules$module_type <- "signature"
keggModule <- rbind(pathway_modules, sig_modules)
# module matrix: column 1 is the module ID, the remaining columns hold the
# space-separated pathway IDs of keggModule$path, one ID per column.
# The number of pathway columns is taken from the data (never fewer than 7),
# so a module linked to more than 7 pathways is still split completely
# instead of leaving several IDs glued together in the last column.
nPathCols <- max(7L, lengths(strsplit(as.character(keggModule$path), " ", fixed = TRUE)))
modMat <- data.frame(cbind(keggModule[, 2], stringr::str_split_fixed(keggModule$path, " ", nPathCols)))
# empty padding cells become NA
modMat[!is.na(modMat) & modMat == ""] <- NA

# long data frame for module and pathway: one row per (module, pathway) pair,
# collected column by column and bound once at the end
pathCols <- seq_len(ncol(modMat))[-1L]
longParts <- lapply(pathCols, function(k) {
  linked <- !is.na(modMat[[k]])
  data.frame(module = modMat[[1L]][linked], path = modMat[[k]][linked])
})
path2Mod <- do.call(rbind, c(list(data.frame(module = c(), path = c())), longParts))

# deduplication: keep the first occurrence of every module/pathway pair
path2Mod <- path2Mod[!duplicated(paste0(path2Mod$module, path2Mod$path)), ]
# Joining datasets: attach the gene/pathway mapping to every module/pathway
# pair, keeping pairs that have no match.
# NOTE(review): mappedDF is not created in this section; it is presumably a
# gene-to-pathway table with KEGG_ID and ENTREZ_ID columns - confirm.
kegg <- merge(
  x = path2Mod,
  y = mappedDF,
  by.x = "path",
  by.y = "KEGG_ID",
  all.x = TRUE
)

# Removing duplicates in the kegg dataset: only the first row of each
# module/ENTREZ_ID combination is kept.
keggdedup <- kegg[!duplicated(with(kegg, paste0(module, ENTREZ_ID))), ]

# gene2module dataset: columns 2 and 3 of keggdedup are collapsed to one row
# per ENTREZ_ID, with the grouped values joined by toString().
gene2module <- aggregate(
  . ~ ENTREZ_ID,
  data = keggdedup[, c(2, 3)],
  FUN = toString,
  na.action = NULL
)
```