构建自己的注释包

作者: 多啦A梦的时光机_648d | 来源:发表于2020-11-06 17:01 被阅读0次
egg_f <- "annuum.eggnog.annotations"
egg <- read.csv(egg_f, sep = "\t")
egg[egg==""]<-NA #这个代码来自花花的指导(将空行变成NA,方便下面的去除)
gene_info <- egg %>%
  dplyr::select(GID = query_name, GENENAME = `eggNOG.free.text.desc.`) %>% na.omit()

gterms <- egg %>%
  dplyr::select(query_name, GOs) %>% na.omit()

gene2go <- data.frame(GID = character(),
                      GO = character(),
                      EVIDENCE = character())

# 然后向其中填充:注意到有的query_name对应多个GO,因此我们以GO号为标准,每一行只能有一个GO号,但query_name和Evidence可以重复

for (row in 1:nrow(gterms)) {
  gene_id <- gterms[row, "query_name"][[1]]
  gene_terms <- str_split(gterms[row,"GOs"], ",", simplify = FALSE)[[1]]
  df_temp <- data_frame(GID = rep(gene_id, length(gene_terms)),
                        GO = gene_terms,
                        EVIDENCE = rep("IEA", length(gene_terms)))
  gene2go <- rbind(gene2go, df_temp)
}

gene2go = gene2go[!duplicated(gene2go$GID), ]

gene2ko <- egg %>%
  dplyr::select(GID = query_name, Ko = KEGG_ko) %>%
  na.omit()

options(stringsAsFactors = F)
if(F){
  # 需要下载 json文件(这是是经常更新的)
  # https://www.genome.jp/kegg-bin/get_htext?ko00001
  # 代码来自:http://www.genek.tv/course/225/task/4861/show
  library(jsonlite)
  library(purrr)
  library(RCurl)
  
  update_kegg <- function(json = "ko00001.json") {
    pathway2name <- tibble(Pathway = character(), Name = character())
    ko2pathway <- tibble(Ko = character(), Pathway = character())
    
    kegg <- fromJSON(json)
    
    for (a in seq_along(kegg[["children"]][["children"]])) {
      A <- kegg[["children"]][["name"]][[a]]
      
      for (b in seq_along(kegg[["children"]][["children"]][[a]][["children"]])) {
        B <- kegg[["children"]][["children"]][[a]][["name"]][[b]] 
        
        for (c in seq_along(kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]])) {
          pathway_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["name"]][[c]]
          
          pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
          pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>% str_replace("[0-9]{5} ", "")
          pathway2name <- rbind(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name))
          
          kos_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]][[c]][["name"]]
          
          kos <- str_match(kos_info, "K[0-9]*")[,1]
          
          ko2pathway <- rbind(ko2pathway, tibble(Ko = kos, Pathway = rep(pathway_id, length(kos))))
        }
      }
    }
    
    save(pathway2name, ko2pathway, file = "kegg_info.RData")
  }
  
  update_kegg(json = "ko00001.json")
  
}

load(file = "kegg_info.RData")
gene2pathway <- gene2ko %>% left_join(ko2pathway, by = "Ko") %>% 
  dplyr::select(GID, Pathway) %>%
  na.omit()


tax_id = "4072"
genus = "Capsicum" 
species = "annuum"
author= "yuantao"
makeOrgPackage(gene_info=gene_info,
               go=gene2go,
               ko=gene2ko,
               pathway=gene2pathway,
               # gene2pathway=gene2pathway,
               version="0.0.2",
               maintainer=author,
               author=author,
               outputDir = ".",
               tax_id=tax_id,
               genus=genus,
               species=species,
               goTable="go")

相关文章

网友评论

    本文标题:构建自己的注释包

    本文链接:https://www.haomeiwen.com/subject/bcotbktx.html