Skip to content

Extract bibliographic data associated with nucleotide records

David Winter edited this page Sep 24, 2017 · 1 revision
library(rentrez)
library(XML)
library(dplyr)



# Guarantee that `data` carries every column named in `cname`.
#
# Args:
#   data:  a data frame (or list-like object with names).
#   cname: character vector of column names that must be present.
#
# Returns:
#   `data`, with each name from `cname` that was not already a column
#   appended as a new column filled with NA. Existing columns are untouched.
fncols <- function(data, cname) {
  absent <- match(cname, names(data), nomatch = 0L) == 0L
  to_add <- cname[absent]
  if (length(to_add) > 0) {
    data[to_add] <- NA
  }
  data
}

# Fetch one nucleotide record and return the bibliographic data of its
# first reference.
#
# Only the FIRST reference attached to the record is used: `$INSDReference`
# selects the first list element of that name, so any further references on
# the same record are ignored.
#
# Args:
#   i: a single nucleotide accession / ID, passed to entrez_fetch().
#
# Returns:
#   A one-row data frame with character columns `i`, `authors`, `journal`,
#   `year`, `title`, `pubmed_id` and `remark`. A field that is absent from
#   the record is NA (`authors` is "" when no authors are listed).
get_pub_info <- function(i) {
  record_xml <- entrez_fetch(db = "nucleotide", id = i,
                             rettype = "gbc", retmode = "xml", parsed = TRUE)
  first_ref <- xmlToList(record_xml)$INSDSeq$INSDSeq_references$INSDReference

  # Absent XML elements come back as NULL. Map them to NA so every result has
  # the same columns and data.frame() never receives a zero-length vector
  # (which would abort with "arguments imply differing number of rows").
  or_na <- function(x) if (is.null(x)) NA_character_ else x

  authors <- paste(unlist(first_ref$INSDReference_authors), collapse = "; ")
  journal <- or_na(first_ref$INSDReference_journal)

  # The year is taken from the last parenthesised group of the journal
  # string, e.g. "J. Virol. 87 (2), 1105-1114 (2013)" -> "2013". Without any
  # parenthesised group gsub() would hand back the whole journal string, so
  # report NA in that case instead.
  year_pattern <- ".*\\((.*)\\).*"
  year <- if (grepl(year_pattern, journal)) {
    gsub(year_pattern, "\\1", journal)
  } else {
    NA_character_
  }

  # The first column keeps the name `i`; callers rename it by position.
  data.frame(
    i = i,
    authors = authors,
    journal = journal,
    year = year,
    title = or_na(first_ref$INSDReference_title),
    pubmed_id = or_na(first_ref$INSDReference_pubmed),
    remark = or_na(first_ref$INSDReference_remark),
    stringsAsFactors = FALSE
  )
}
# Versioned accessions whose publication data is collected.
sequence_list <- c("AB687721.2", "AB600942.1", "AJ880277.1")
list_of_dfs <- lapply(sequence_list, get_pub_info) # one data frame per sequence
df_combine <- bind_rows(list_of_dfs)
colnames(df_combine)[1] <- "NCBI_idv"

# Split `remark` into: text before "DOI:", the DOI itself, and any text after
# the ";" that terminates the DOI.
#   fill = "right"  -> remarks without a DOI (or without trailing text) get NA
#                      in the new columns instead of raising a warning.
#   extra = "merge" -> anything after the first separator is kept rather than
#                      being dropped.
df_combine <- tidyr::separate(df_combine, remark, c("text", "doi"),
                              sep = "DOI:", extra = "merge", fill = "right")
df_combine <- tidyr::separate(df_combine, doi, c("doi", "text2"),
                              sep = ";", extra = "merge", fill = "right")
df_combine$doi <- trimws(df_combine$doi)

# Re-assemble the non-DOI part of the remark. paste() would turn NA pieces
# into the literal string "NA", so blank them first and store NA when nothing
# but the DOI was present.
before_doi <- ifelse(is.na(df_combine$text), "", df_combine$text)
after_doi <- ifelse(is.na(df_combine$text2), "", df_combine$text2)
remark_text <- trimws(paste(trimws(before_doi), trimws(after_doi)))
remark_text[remark_text == ""] <- NA_character_
df_combine$remark <- remark_text
df_combine$text <- NULL
df_combine$text2 <- NULL

head(df_combine[, c("pubmed_id", "doi", "journal")])
#>   pubmed_id                  doi                                    journal
#> 1  23135729 10.1128/JVI.02419-12         J. Virol. 87 (2), 1105-1114 (2013)
#> 2   8709862                 <NA> Microbiol. Immunol. 40 (4), 271-275 (1996)
#> 3  16525735                 <NA>           Virus Genes 32 (1), 49-57 (2006)