Commit

Merge pull request #4 from philipskokoh/master
add script to download all open access and foaf:page in corpus.jsonld
ceteri authored Sep 27, 2019
2 parents 4e85430 + af35dbc commit d01d47c
Showing 2 changed files with 220 additions and 0 deletions.
216 changes: 216 additions & 0 deletions download_corpus_resources.py
@@ -0,0 +1,216 @@
import argparse
import json
import requests
import time

from bs4 import BeautifulSoup # type: ignore
from pathlib import Path
from pdfminer.pdfdocument import PDFDocument # type: ignore
from pdfminer.pdfpage import PDFTextExtractionNotAllowed # type: ignore
from pdfminer.pdfparser import PDFParser, PDFSyntaxError # type: ignore
from requests_html import HTMLSession # type: ignore
from tqdm import tqdm # type: ignore
from urllib.parse import urlparse

DEFAULT_CORPUS_FILE = 'corpus.jsonld'
DEFAULT_OUTPUT_RESOURCE = 'resources/'

MAX_DOWNLOAD_TRIAL = 3
PUB_PDF_PATH = 'pubs/pdf/'
DATASET_PAGE_PATH = 'dataset/'


def load_corpus(filename: str) -> list:
    """ Load a corpus file (in jsonld format) and return its '@graph'
        list of entities.
    """
with open(filename, 'r') as f:
corpus = json.load(f)
return corpus['@graph']


def is_valid_pdf_file(filename: str) -> bool:
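    """ Check that a downloaded file parses as a pdf (via pdfminer).
        Returns False on a pdf syntax error; raises
        PDFTextExtractionNotAllowed if the pdf forbids text extraction.
    """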
try:
with open(filename, 'rb') as f:
parser = PDFParser(f)
document = PDFDocument(parser, '')
if not document.is_extractable:
raise PDFTextExtractionNotAllowed(filename)
return True
except PDFSyntaxError as err:
print(err)
print(f'{filename} is not a valid pdf file.')
return False


def _download(uri: str, res_type: str,
output_path: Path, e_id: str) -> bool:
""" download a resource and store in a file
"""
if res_type not in ['pdf', 'html', 'unknown']:
raise ValueError(f'Invalid resource type: {res_type}')
trial = 0
headers = requests.utils.default_headers()
headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
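
    # Retry up to MAX_DOWNLOAD_TRIAL times; a couple of publishers need
    # special handling below because they serve the pdf behind an
    # intermediate page rather than at the given uri.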

while trial < MAX_DOWNLOAD_TRIAL:
try:
parsed_uri = urlparse(uri)
if parsed_uri.netloc == 'www.sciencedirect.com':
""" Special case: sciencedirect.com auto generates pdf
download link in an intermediate page
"""
session = HTMLSession()
r0 = session.get(uri)
res = session.get(list(r0.html.absolute_links)[0])
elif parsed_uri.netloc.endswith('onlinelibrary.wiley.com'):
""" Special case: wiley auto generates embed pdf to render pdf
"""
r0 = requests.get(uri)
soup = BeautifulSoup(r0.content, 'html5lib')
if soup.find('embed') is None:
print(f'Unexpected response for: {uri}')
trial += 1
continue
src = soup.find('embed')['src']
res = requests.get(parsed_uri.scheme + '://' +
parsed_uri.netloc + src)
else:
res = requests.get(uri, headers=headers, timeout=(10, 20))
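            # When the resource type is 'unknown', resolve it from the
            # Content-Type header; the response is saved as '<e_id>.<res_type>'.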
if res_type == 'unknown':
content_type = res.headers["content-type"]
res_type = 'html' if 'text/html' in content_type else 'pdf'
out_file = output_path / (e_id + '.' + res_type)
out_file.write_bytes(res.content)
if res_type == 'pdf':
if not is_valid_pdf_file(out_file.resolve().as_posix()):
out_file.unlink()
trial += 1
continue
return True
except requests.exceptions.RequestException as err:
print(err)
time.sleep(5)
trial += 1

if trial == MAX_DOWNLOAD_TRIAL:
print(f'Failed downloading {uri} after {MAX_DOWNLOAD_TRIAL} attempts.')
return False


def download_pub_resources(corpus: list,
output_path: Path,
force_download: bool = False) -> None:
""" Download all publications pdf files from corpus data (if not downloaded
yet). We use the entity id as filename.
All downloaded files are stored under `output_path` folder.
Input:
- corpus: corpus file containing a list of publications.
- output_path: path to store downloaded resources
- force_download: Always download resources if True. Default:
false.
"""
pub_pdf_full_path = output_path / PUB_PDF_PATH
if not pub_pdf_full_path.exists():
pub_pdf_full_path.mkdir(parents=True)
pubs = [e for e in corpus if e['@type'] == 'ResearchPublication']
downloaded_pubs = list(pub_pdf_full_path.glob('*.pdf'))
downloaded_pubs_id = set([f.stem for f in downloaded_pubs])

for entity in tqdm(pubs, ascii=True, desc='Fetch resources'):
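        # The entity id is the token after '-' in the fragment of the '@id'
        # URI; it doubles as the local filename.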
_id = urlparse(entity['@id']).fragment.split('-')[1]
_type = entity['@type']
downloaded_before = _id in downloaded_pubs_id
res_uri = entity['openAccess']['@value']
if force_download or not downloaded_before:
if not _download(res_uri, 'pdf',
pub_pdf_full_path, _id):
print(f'Failed to download {res_uri}')
time.sleep(0.5)


def download_dataset_resources(corpus: list,
output_path: Path,
force_download: bool = False) -> None:
""" Download all dataset 'foaf:page' files from corpus data (if not
downloaded yet). We use the entity id as filename.
All downloaded files are stored under `output_path` folder.
Input:
- corpus: corpus file containing a list of datasets.
- output_path: path to store downloaded resources
- force_download: Always download resources if True. Default:
false.
"""
dataset_page_full_path = output_path / DATASET_PAGE_PATH
if not dataset_page_full_path.exists():
dataset_page_full_path.mkdir(parents=True)
datasets = [e for e in corpus if e['@type'] == 'Dataset']
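    # Dataset pages may have been saved as .html or .pdf depending on the
    # response Content-Type, so match any extension here.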
downloaded_datasets = list(dataset_page_full_path.glob('*.*'))
downloaded_datasets_id = set([f.stem for f in downloaded_datasets])

for entity in tqdm(datasets, ascii=True, desc='Fetch resources'):
_id = urlparse(entity['@id']).fragment.split('-')[1]
_type = entity['@type']
downloaded_before = _id in downloaded_datasets_id
res_uri = entity['foaf:page']['@value']
if force_download or not downloaded_before:
if not _download(res_uri, 'unknown',
dataset_page_full_path, _id):
print(f'Failed to download {res_uri}')
time.sleep(0.5)


def get_resources_stats(corpus: list, output_dir: str) -> None:
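    """ Print how many publication pdfs and dataset pages have been downloaded
        under `output_dir`, and which entity ids are still missing.
    """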
print(f'Number of records in the corpus: {len(corpus)}')

output_path = Path(output_dir)
pubs = [e for e in corpus if e['@type'] == 'ResearchPublication']
pub_pdf_full_path = output_path / PUB_PDF_PATH
downloaded_pubs = list(pub_pdf_full_path.glob('*.pdf'))
downloaded_pubs_id = set([f.stem for f in downloaded_pubs])
missing_pub = set()
for entity in pubs:
_id = urlparse(entity['@id']).fragment.split('-')[1]
if _id not in downloaded_pubs_id:
missing_pub.add(_id)
print(f'Number of research publications: {len(pubs)}')
print(f'Successfully downloaded {len(pubs) - len(missing_pub)} pdf files.')
print(f'Missing publication resources: {missing_pub}')

datasets = [e for e in corpus if e['@type'] == 'Dataset']
dataset_page_full_path = output_path / DATASET_PAGE_PATH
downloaded_datasets = list(dataset_page_full_path.glob('*.*'))
downloaded_datasets_id = set([f.stem for f in downloaded_datasets])
missing_dataset_res = set()
for entity in datasets:
_id = urlparse(entity['@id']).fragment.split('-')[1]
if _id not in downloaded_datasets_id:
missing_dataset_res.add(_id)
print(f'Number of datasets: {len(datasets)}')
print(f'Successfully downloaded {len(datasets) - len(missing_dataset_res)} resource files.')
print(f'Missing dataset resources: {missing_dataset_res}')


def download_resources(corpus: list, output_dir: str) -> None:
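    """ Download both publication pdfs and dataset pages into `output_dir`.
    """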
output_path = Path(output_dir)
download_pub_resources(corpus, output_path)
download_dataset_resources(corpus, output_path)


def main(args):
corpus = load_corpus(args.input)
download_resources(corpus, args.output_dir)
get_resources_stats(corpus, args.output_dir)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Download publication open access pdfs and dataset '
                    'foaf:page resources from the rclc corpus')
parser.add_argument('--input', type=str,
default=DEFAULT_CORPUS_FILE,
help='rclc corpus file')
parser.add_argument('--output_dir', type=str,
default=DEFAULT_OUTPUT_RESOURCE,
help='path to store downloaded resources')
args = parser.parse_args()
main(args)
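
Usage note: a minimal invocation of the script above, relying only on the defaults it defines (corpus.jsonld as input, resources/ as output), is "python download_corpus_resources.py --input corpus.jsonld --output_dir resources/". This fetches publication pdfs into resources/pubs/pdf/, dataset pages into resources/dataset/, and then prints download statistics.
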
4 changes: 4 additions & 0 deletions requirements.txt
@@ -1,2 +1,6 @@
rdflib >= 4.2.2
rdflib-jsonld==0.4.0
mypy==0.730
pdfminer.six==20181108
requests-html==0.10.0
html5lib==1.0.1
