This is useful number if you have hundreds of PMIDs and need specific fields from the pubmed/medline citation.
#################################
# Retrieve pubmed citation data #
# Ewen Harrison #
# March 2013 #
# www.datasurg.net #
#################################
# Sample data
pmid<-c(8339401,8485120,8427851,3418853,3195585,2643302,7947591,8274474,8243847)
#-----------------------------------------------------------------------------------------------------------------------------------
# Batch list of PMIDs into groups of 200
fn_batch<-function(pmid=pmid,n=200){
require(plyr)
b<-list()
max=ceiling(length(pmid)/n)
for (i in 1:max){
b[[i]]<-pmid[(1+((i-1)*n)):(n*i)]
}
b[[max]]<-b[[max]][!is.na(b[[max]])] # drop missing values in the final block
c<-llply(b, function(a){ # convert from list to comma separted list
paste(a, collapse=",")
})
return(c)
}
#-----------------------------------------------------------------------------------------------------------------------------------
# Run
pmid_batch<-fn_batch(pmid)
#-----------------------------------------------------------------------------------------------------------------------------------
# Function to fetch, parse and extract medline citation data. Use wrapper below.
get_pubmed<-function(f){
require(RCurl)
require(XML)
require(plyr)
# Post PMID (UID) numbers
url<-paste("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=", f, "&retmode=XML", sep="")
# Medline fetch
g<-llply(url, .progress = progress_tk(label="Fetching and parse Pubmed records ..."), function(x){
xmlTreeParse(x, useInternalNodes=TRUE)
})
# Using given format and xml tree structure, paste here the specific fields you wish to extract
k<-ldply(g, .progress = progress_tk(label="Creating dataframe ..."), function(x){
a<-getNodeSet(x, "/PubmedArticleSet/*/MedlineCitation")
pmid_l<-sapply (a, function(a) xpathSApply(a, "./PMID", xmlValue))
pmid<-lapply(pmid_l, function(x) ifelse(class(x)=="list" | class(x)=="NULL", NA, x))
nct_id_l<-sapply (a, function(a) xpathSApply(a, "./Article/DataBankList/DataBank/AccessionNumberList/AccessionNumber", xmlValue))
nct_id<-lapply(nct_id_l, function(x) ifelse(class(x)=="list" | class(x)=="NULL", NA, x))
year_l<-sapply (a, function(a) xpathSApply(a, "./Article/Journal/JournalIssue/PubDate/Year", xmlValue))
year<-lapply(year_l, function(x) ifelse(class(x)=="list" | class(x)=="NULL", NA, x))
month_l<-sapply (a, function(a) xpathSApply(a, "./Article/Journal/JournalIssue/PubDate/Month", xmlValue))
month<-lapply(month_l, function(x) ifelse(class(x)=="list" | class(x)=="NULL", NA, x))
day_l<-sapply (a, function(a) xpathSApply(a, "./Article/Journal/JournalIssue/PubDate/Day", xmlValue))
day<-lapply(day_l, function(x) ifelse(class(x)=="list" | class(x)=="NULL", NA, x))
year1_l<-sapply (a, function(a) xpathSApply(a, "./Article/ArticleDate/Year", xmlValue))
year1<-lapply(year1_l, function(x) ifelse(class(x)=="list" | class(x)=="NULL", NA, x))
month1_l<-sapply (a, function(a) xpathSApply(a, "./Article/ArticleDate/Month", xmlValue))
month1<-lapply(month1_l, function(x) ifelse(class(x)=="list" | class(x)=="NULL", NA, x))
day1_l<-sapply (a, function(a) xpathSApply(a, "./Article/ArticleDate/Day", xmlValue))
day1<-lapply(day1_l, function(x) ifelse(class(x)=="list" | class(x)=="NULL", NA, x))
medlinedate_l<-sapply (a, function(a) xpathSApply(a, "./Article/Journal/JournalIssue/PubDate/MedlineDate", xmlValue))
medlinedate<-lapply(medlinedate_l, function(x) ifelse(class(x)=="list" | class(x)=="NULL", NA, x))
journal_l<-sapply (a, function(a) xpathSApply(a, "./Article/Journal/Title", xmlValue))
journal<-lapply(journal_l, function(x) ifelse(class(x)=="list" | class(x)=="NULL", NA, x))
title_l<-sapply (a, function(a) xpathSApply(a, "./Article/ArticleTitle", xmlValue))
title<-lapply(title_l, function(x) ifelse(class(x)=="list" | class(x)=="NULL", NA, x))
author_l<-sapply (a, function(a) xpathSApply(a, "./Article/AuthorList/Author[1]/LastName", xmlValue))
author<-lapply(author_l, function(x) ifelse(class(x)=="list" | class(x)=="NULL", NA, x))
return(data.frame(nct_pm=unlist(nct_id), pmid=unlist(pmid), year=unlist(year), month=unlist(month), day=unlist(day),
year1=unlist(year1), month1=unlist(month1), day1=unlist(day1), medlinedate=unlist(medlinedate), journal=unlist(journal),
title=unlist(title), author=unlist(author) ))
})
}
#-----------------------------------------------------------------------------------------------------------------------------------
# Wrapper function uses batched PMID list and get_pubmed to run pubmed search
# Path takes desired name for folder to save data frames referred to as data files.
fn_pubmed<-function(pmid = pmid_batch, path="pmid_data", from=1, to=max){
require(plyr)
max<-length(pmid)
if (file.exists(path)==FALSE){
dir.create(path)
}
for (i in from:to){
df<-data.frame()
df<-get_pubmed(pmid[[i]])
file<-paste(path, "/data", i,".txt", sep="")
write.table(df, file=file, sep=";")
}
}
#-----------------------------------------------------------------------------------------------------------------------------------
fn_pubmed(pmid_batch)
#-----------------------------------------------------------------------------------------------------------------------------------
# Merge back saved tables
fn_merge<-function(path="pmid_data"){
require(plyr)
data_files<-list.files(path, full.names=T)
df<-ldply(data_files, function(x){
df1<-read.csv(x, header=TRUE, sep=";")
df2<-data.frame(data_files=gsub(pattern=paste(path, "/", sep=""), replacement="",x), df1)
return(df2)
})
return(df)
}
#-----------------------------------------------------------------------------------------------------------------------------------
# Run
data<-fn_merge()