-
Notifications
You must be signed in to change notification settings - Fork 38
Extract bibliographic data associated with nucleotide records
David Winter edited this page Sep 24, 2017
·
1 revision
library(rentrez)
library(XML)
library(dplyr)
# Ensure that every column named in `cname` exists in `data`.
#
# Args:
#   data:  a data frame.
#   cname: character vector of column names that must be present.
#
# Returns:
#   `data`, with each name from `cname` that was not already a column added
#   as a new column filled with NA. Existing columns are left untouched.
fncols <- function(data, cname) {
  absent <- cname[is.na(match(cname, names(data)))]
  if (length(absent) > 0) {
    data[absent] <- NA
  }
  data
}
# Function to collate the publication details attached to one nucleotide
# record.
#
# Fetches record `i` from the NCBI "nucleotide" database as INSDSeq XML
# (rettype "gbc") and reads authors, title, journal, PubMed id and remark
# from its reference list. Only the first INSDReference is read, because `$`
# returns the first list element with a matching name.
#
# Args:
#   i: identifier handed to entrez_fetch() as `id`; the calling script passes
#      one accession.version string at a time.
#
# Returns:
#   A data frame holding the columns i, authors, journal and year, plus
#   title, pubmed_id and remark. Fields that are absent from the record are
#   NA (authors is "" when the reference lists none). The result is expected
#   to have one row -- this assumes every field is a single string.
get_pub_info <- function(i) {
  fetch2 <- entrez_fetch(db = "nucleotide", id = i,
                         rettype = "gbc", retmode = "xml", parsed = TRUE)
  xml_list2 <- xmlToList(fetch2)
  ref_list <- xml_list2$INSDSeq$INSDSeq_references
  first_ref <- ref_list$INSDReference

  # extract publication fields info
  authors <- paste(unlist(first_ref$INSDReference_authors), collapse = "; ")
  title <- first_ref$INSDReference_title
  journal <- first_ref$INSDReference_journal
  pm_id <- first_ref$INSDReference_pubmed
  remark <- first_ref$INSDReference_remark

  # A reference without a journal would otherwise drop the column and make
  # data.frame() fail on mismatched lengths.
  if (is.null(journal)) {
    journal <- NA_character_
  }

  # The year is the text of the last parenthesised group of the journal
  # string, returned verbatim. Without any parenthesised group the year is
  # NA rather than a copy of the whole journal string.
  year <- rep(NA_character_, length(journal))
  has_year <- grepl("\\(.*\\)", journal)
  year[has_year] <- gsub(".*\\((.*)\\).*", "\\1", journal[has_year])

  # create data frame of information; the first column keeps the name "i"
  pub.data <- data.frame(i, authors, journal, year, stringsAsFactors = FALSE)
  if (!is.null(title)) pub.data$title <- title
  if (!is.null(pm_id)) pub.data$pubmed_id <- pm_id
  if (!is.null(remark)) pub.data$remark <- remark

  # make sure the optional columns exist so that results can be row-bound
  pub.data <- fncols(pub.data, c("title", "pubmed_id", "remark"))
  pub.data
}
# Accessions (with version) whose publication details are looked up.
sequence_list <- c("AB687721.2", "AB600942.1", "AJ880277.1")

# run function on list of sequences, then stack the one-record data frames
list_of_dfs <- lapply(sequence_list, get_pub_info)
df_combine <- bind_rows(list_of_dfs)
colnames(df_combine)[1] <- "NCBI_idv"

# extract doi: split the remark into the text before "DOI:" and the rest,
# then split the rest at the first ";" into the doi and any trailing text.
# fill = "right" pads remarks without a DOI (or without a ";") with NA and
# extra = "merge" keeps everything after the first separator, so neither
# call warns or silently drops text.
df_combine <- tidyr::separate(df_combine, remark, c("text", "doi"),
                              sep = "DOI:", extra = "merge", fill = "right")
df_combine <- tidyr::separate(df_combine, doi, c("doi", "text2"),
                              sep = ";", extra = "merge", fill = "right")
df_combine$doi <- trimws(df_combine$doi)

# Rebuild the remark from the non-DOI pieces. Missing or empty pieces are
# skipped so the remark is NA when nothing is left, instead of containing
# the literal text "NA".
df_combine$remark <- vapply(seq_len(nrow(df_combine)), function(row) {
  parts <- trimws(c(df_combine$text[row], df_combine$text2[row]))
  parts <- parts[!is.na(parts) & nzchar(parts)]
  if (length(parts) == 0L) NA_character_ else paste(parts, collapse = " ")
}, character(1))
df_combine$text <- NULL
df_combine$text2 <- NULL

head(df_combine[, c("pubmed_id", "doi", "journal")])
pubmed_id doi journal
1 23135729 10.1128/JVI.02419-12 J. Virol. 87 (2), 1105-1114 (2013)
2 8709862 <NA> Microbiol. Immunol. 40 (4), 271-275 (1996)
3 16525735 <NA> Virus Genes 32 (1), 49-57 (2006)