hamidghaedi / keggPath2Gene_Path2Modules

R script to map gene names to KEGG pathways and modules

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

keggPath2Gene_Path2Modules

R script to:

(1) Map gene names to KEGG pathways. The code produces a table with the gene name in one column and the KEGG pathway ID in another column (gen2path).

(2) Map gene names to KEGG modules (gene2module).

Several intermediate datasets that may be of interest are also generated along the way, such as a table of modules with their definitions and the pathways each module is involved in (keggModule).

Mapping genes to KEGG pathways

library(org.Hs.eg.db)
library(dplyr)
library(tidyr)
library(jsonlite)
library(KEGGREST)


# Look up the ENTREZ ID (and gene name) for each gene symbol.
# snv$Gene is a column of gene SYMBOLs in a data frame that must already exist.
# AnnotationDbi::select() is called explicitly: dplyr is attached after
# org.Hs.eg.db above, so a bare select() resolves to dplyr::select(), which has
# no method for OrgDb objects and fails.
gn <- AnnotationDbi::select(
  org.Hs.eg.db,
  keys = unique(snv$Gene),
  columns = c("ENTREZID", "SYMBOL", "GENENAME"),
  keytype = "SYMBOL"
)
# Build "hsa:<ENTREZ ID>" identifiers for the KEGG queries below; symbols
# without an ENTREZ ID stay NA.
gn$keggGeneID <- ifelse(
  !is.na(gn$ENTREZID),
  paste0("hsa:", gn$ENTREZID),
  NA_character_
)

#' Retrieve the KEGG pathways listed for one gene
#'
#' Queries KEGG with `KEGGREST::keggGet()` and returns one row per entry in the
#' PATHWAY field of the first record that comes back.
#'
#' @param gene A single KEGG gene identifier (the script passes values of the
#'   form "hsa:<ENTREZ ID>").
#' @param delay Seconds to pause after each request, successful or not, so that
#'   consecutive calls do not flood the KEGG server. Defaults to 2.
#' @return A data frame with columns `gene_id`, `kegg_id` and `pathway_name`.
#'   When the record has no PATHWAY field, or the request fails, a single row
#'   is returned with "N/A" in `kegg_id` and `pathway_name` and the queried
#'   gene in `gene_id`. A missing (`NA`) gene yields a row of "N/A" without
#'   contacting KEGG.
getPathway <- function(gene, delay = 2) {
  # A missing identifier cannot be looked up; answer without a network call.
  if (length(gene) == 1L && is.na(gene)) {
    return(data.frame(gene_id = "N/A", kegg_id = "N/A", pathway_name = "N/A"))
  }

  # KEGGREST is distributed through Bioconductor, so install.packages() cannot
  # fetch it; fail with instructions instead of attempting an install.
  if (!requireNamespace("KEGGREST", quietly = TRUE)) {
    stop(
      "Package 'KEGGREST' is required; install it with ",
      "BiocManager::install(\"KEGGREST\").",
      call. = FALSE
    )
  }

  cat("Fetching pathway for gene:", gene, "\n")
  # Pause on every exit path so failed requests are rate-limited too.
  on.exit(Sys.sleep(delay), add = TRUE)

  tryCatch({
    q <- KEGGREST::keggGet(gene)
    if ("PATHWAY" %in% names(q[[1]])) {
      # PATHWAY is used as a named vector: names become IDs, values the names.
      pathway <- q[[1]][["PATHWAY"]]
      kegg_id <- names(pathway)
      pathway_name <- pathway
      gene_id <- rep(gene, length(pathway))
    } else {
      kegg_id <- "N/A"
      pathway_name <- "N/A"
      gene_id <- gene
    }
    data.frame(gene_id = gene_id, kegg_id = kegg_id, pathway_name = pathway_name)
  }, error = function(e) {
    cat("Error fetching pathway for gene:", gene, "Error message:", e$message, "\n")
    # Keep the gene identifier so the failed lookup can be traced and retried.
    data.frame(gene_id = gene, kegg_id = "N/A", pathway_name = "N/A")
  })
}


# Query KEGG once per distinct gene identifier and stack the per-gene results.
# Symbols without an ENTREZ ID have an NA identifier; drop them so that NA is
# never sent to KEGG.
keggGeneIDs <- unique(gn$keggGeneID[!is.na(gn$keggGeneID)])
keggPaths <- purrr::map_dfr(keggGeneIDs, getPathway)

Map gene names to KEGG modules

    # Retrieve the KEGG module hierarchy (htext ko00002) as JSON
    url <- "https://www.genome.jp/kegg-bin/download_htext?htext=ko00002&format=json&filedir="
    # Keep a local copy and parse that copy, so the server is contacted only
    # once and the parsed data always matches the file on disk.
    jsonFile <- path.expand("~/keggM.json")
    download.file(url, destfile = jsonFile, method = "curl")

    # read the JSON from the downloaded file
    document <- fromJSON(txt = jsonFile)
    # Combine the top-level elements of the parsed document row-wise; the steps
    # below take rows 2 and 3 of column 2 as the pathway and signature groups.
    df <- data.frame(Reduce(rbind, document))
    # Flatten one module group into one row per module.
    # groupDF is used as a two-level nesting: its first column supplies the
    # label stored in p2Path and its second column holds one child data frame
    # per row; each child's first column supplies p1Path and its second column
    # holds the data frames of modules (with a `name` column).
    flattenModuleTree <- function(groupDF) {
      pieces <- list()
      for (i in seq_len(nrow(groupDF))) {
        subgroups <- groupDF[[2]][[i]]
        if (!is.data.frame(subgroups)) {
          next
        }
        for (j in seq_len(nrow(subgroups))) {
          modules <- subgroups[[2]][[j]]
          if (!is.data.frame(modules) || nrow(modules) == 0L) {
            next
          }
          # The module ID is the leading token of the name. NOTE(review): KEGG
          # module IDs look like "M00001" (6 characters), so the 7-character
          # slice presumably carries a trailing blank; trimws() removes it and
          # is a no-op otherwise.
          modules$module <- trimws(substr(modules$name, 1, 7))
          # Pathway IDs are listed in square brackets at the end of the name.
          modules$path <- stringr::str_extract(string = modules$name, pattern = "(?<=\\[)[^{}]+(?=\\])")
          modules$path <- sub("PATH:", "", modules$path)
          modules$p1Path <- subgroups[j, 1]
          # Label with the group this sub-tree belongs to (row i). Looping over
          # every top-level row here would repeat each module once per group
          # and attach the wrong label to all but one copy.
          modules$p2Path <- groupDF[i, 1]
          pieces[[length(pieces) + 1L]] <- modules
        }
      }
      if (length(pieces) == 0L) {
        return(data.frame())
      }
      do.call(rbind, pieces)
    }

    # pathway modules
    pathMod <- df[2, 2]
    pathModDF <- data.frame(Reduce(rbind, pathMod))
    pathway_modules <- flattenModuleTree(pathModDF)

    # signature modules
    sigMod <- df[3, 2]
    sigModDF <- data.frame(Reduce(rbind, sigMod))
    sig_modules <- flattenModuleTree(sigModDF)

    # rep() keeps the assignment valid even if a group produced no rows
    pathway_modules$module_type <- rep("pathway", nrow(pathway_modules))
    sig_modules$module_type <- rep("signature", nrow(sig_modules))

    keggModule <- rbind(pathway_modules, sig_modules)
    
    # module matrix: module ID in column 1, one pathway ID per following column.
    # The width adapts to the data (never fewer than the 7 pathway columns used
    # before); with a fixed width, a module listing more pathways would have
    # the surplus IDs glued together in the last cell.
    maxPaths <- max(7L, lengths(strsplit(keggModule$path, " ", fixed = TRUE)))
    modMat <- data.frame(cbind(
      keggModule$module,
      stringr::str_split_fixed(keggModule$path, " ", maxPaths)
    ))
    modMat[modMat == ""] <- NA

    # long data frame of module/pathway pairs: stack each pathway column under
    # the module column, skipping empty cells
    pathCols <- lapply(seq_len(ncol(modMat))[-1], function(i) {
      piece <- modMat[!is.na(modMat[[i]]), c(1, i)]
      names(piece) <- c("module", "path")
      piece
    })
    path2Mod <- do.call(rbind, pathCols)

    # deduplication on the (module, path) pair
    path2Mod <- path2Mod[!duplicated(path2Mod[, c("module", "path")]), ]
    
    # joining datasets
    # NOTE(review): mappedDF is not created anywhere in this script. It must be
    # supplied beforehand as a pathway-to-gene table with columns KEGG_ID
    # (matched against path2Mod$path) and ENTREZ_ID.
    if (!exists("mappedDF") ||
        !all(c("KEGG_ID", "ENTREZ_ID") %in% names(mappedDF))) {
      stop("mappedDF with columns KEGG_ID and ENTREZ_ID is required", call. = FALSE)
    }
    kegg <- merge(path2Mod, mappedDF, by.x = "path", by.y = "KEGG_ID", all.x = TRUE)
    # removing duplicated (module, gene) pairs in the kegg dataset
    keggdedup <- kegg[!duplicated(kegg[, c("module", "ENTREZ_ID")]), ]

    # gene2module dataset: one row per gene with its modules as a
    # comma-separated string. Columns are selected by name so the result does
    # not depend on the column order of mappedDF.
    gene2module <- aggregate(
      module ~ ENTREZ_ID,
      keggdedup[, c("module", "ENTREZ_ID")],
      FUN = function(x) toString(x),
      na.action = NULL
    )
     
     ```

About

R script to map gene names to KEGG pathways and modules