From d0be18a82c385c80338c88ddf5ae0005a84971c7 Mon Sep 17 00:00:00 2001
From: Scott Collins
Date: Thu, 9 Sep 2021 11:41:37 -0500
Subject: [PATCH 01/13] Added sanitize_json_string function to general_util.py

Function strips extraneous whitespace from strings to be written to a JSON
file to ensure they don't break the format
---
 src/pds_doi_service/core/util/general_util.py | 24 ++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/pds_doi_service/core/util/general_util.py b/src/pds_doi_service/core/util/general_util.py
index b6f9c602..04ae3040 100644
--- a/src/pds_doi_service/core/util/general_util.py
+++ b/src/pds_doi_service/core/util/general_util.py
@@ -13,11 +13,34 @@
 General utility functions for things like logging.
 """
 
+import re
 import logging
 
 from pds_doi_service.core.util.config_parser import DOIConfigUtil
 
 
+def sanitize_json_string(string):
+    """
+    Cleans up extraneous whitespace from the provided string so it may be
+    written to a JSON file. Extraneous whitespace includes any before or after
+    the provided string, as well as between words.
+
+    Parameters
+    ----------
+    string : str
+        The string to sanitize.
+
+    Returns
+    -------
+    string : str
+        The provided string, sanitized of extraneous whitespace.
+
+    """
+    # Clean up whitespace (including line breaks) both between words and
+    # at the ends of the string
+    return re.sub(r"\s+", " ", string, flags=re.UNICODE).strip()
+
+
 def get_logger(module_name=''):
     # If the user specifies the module name, we can use it.
     if module_name:
@@ -36,4 +59,3 @@ def get_logger(module_name=''):
 
     return logger
-

From 1b8182849fc7db5394498b524845e0a1f922caaa Mon Sep 17 00:00:00 2001
From: Scott Collins
Date: Thu, 9 Sep 2021 11:43:32 -0500
Subject: [PATCH 02/13] Added use of general_util.sanitize_json_string() to
 DataCite record creation for the keywords, title and description fields

---
 .../core/outputs/datacite/datacite_record.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/pds_doi_service/core/outputs/datacite/datacite_record.py b/src/pds_doi_service/core/outputs/datacite/datacite_record.py
index fe789524..84b72cdb 100644
--- a/src/pds_doi_service/core/outputs/datacite/datacite_record.py
+++ b/src/pds_doi_service/core/outputs/datacite/datacite_record.py
@@ -22,7 +22,7 @@
 from pds_doi_service.core.entities.doi import ProductType, Doi
 from pds_doi_service.core.outputs.doi_record import DOIRecord, CONTENT_TYPE_JSON
 from pds_doi_service.core.util.config_parser import DOIConfigUtil
-from pds_doi_service.core.util.general_util import get_logger
+from pds_doi_service.core.util.general_util import get_logger, sanitize_json_string
 
 logger = get_logger(__name__)
 
@@ -103,7 +103,7 @@ def create_doi_record(self, dois, content_type=CONTENT_TYPE_JSON):
             doi_fields['product_type'] = ProductType.Collection
 
         # Sort keywords so we can output them in the same order each time
-        doi_fields['keywords'] = sorted(doi.keywords)
+        doi_fields['keywords'] = sorted(map(sanitize_json_string, doi.keywords))
 
         # Convert datetime objects to isoformat strings
         if doi.date_record_added:
@@ -112,9 +112,13 @@ def create_doi_record(self, dois, content_type=CONTENT_TYPE_JSON):
         if doi.date_record_updated:
             doi_fields['date_record_updated'] = doi.date_record_updated.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
 
-        # Remove any extraneous whitespace from a provided description
+        # Clean up extra whitespace that could break the JSON format in the
+        # title and description
+        if doi.title:
+            doi_fields['title'] = sanitize_json_string(doi.title)
+
+        if doi.description:
-            doi_fields['description'] = str.strip(doi.description)
+            doi_fields['description'] = sanitize_json_string(doi.description)
 
         # Publication year is a must-have
         doi_fields['publication_year'] = doi.publication_date.strftime('%Y')

From 3e0dcf448d68a2278d93f36bad65ce8fce32a46b Mon Sep 17 00:00:00 2001
From: Scott Collins
Date: Thu, 9 Sep 2021 11:45:05 -0500
Subject: [PATCH 03/13] Removed default filtering of query results by client
 ID in DOIDataCiteWebClient.query_doi()

There are certain DOI records we need to query that were submitted by other
clients, so the default filtering was no longer valid behavior
---
 .../core/outputs/datacite/datacite_web_client.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/pds_doi_service/core/outputs/datacite/datacite_web_client.py b/src/pds_doi_service/core/outputs/datacite/datacite_web_client.py
index 7d1431f4..a1c6f474 100644
--- a/src/pds_doi_service/core/outputs/datacite/datacite_web_client.py
+++ b/src/pds_doi_service/core/outputs/datacite/datacite_web_client.py
@@ -100,9 +100,9 @@ def query_doi(self, query, url=None, username=None, password=None,
 
         Notes
         -----
-        Queries are automatically filtered by this method to only include
-        DOI entries associated with the PDS client ID, which corresponds to the
-        username used with the query request.
+        Queries are NOT automatically filtered by this method. Callers should
+        be prepared to filter the results if a query returns more entries
+        than expected.
 
         Parameters
         ----------
@@ -151,15 +151,13 @@ def query_doi(self, query, url=None, username=None, password=None,
             query_string = str(query)
 
         url = url or config.get('DATACITE', 'url')
-        client_id = (username or config.get('DATACITE', 'user')).lower()
 
         logger.debug('query_string: %s', query_string)
         logger.debug('url: %s', url)
-        logger.debug('client_id: %s', client_id)
 
         datacite_response = requests.request(
             WEB_METHOD_GET, url=url, auth=auth, headers=headers,
-            params={"query": query_string, "client-id": client_id}
+            params={"query": query_string}
         )
 
         try:

From 2643f92dccbb0feb5da613f7886980968893566a Mon Sep 17 00:00:00 2001
From: Scott Collins
Date: Thu, 9 Sep 2021 11:47:45 -0500
Subject: [PATCH 04/13] Added "Other" value to ProductType enum

---
 src/pds_doi_service/core/entities/doi.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pds_doi_service/core/entities/doi.py b/src/pds_doi_service/core/entities/doi.py
index 896572de..1828f6cf 100644
--- a/src/pds_doi_service/core/entities/doi.py
+++ b/src/pds_doi_service/core/entities/doi.py
@@ -25,6 +25,7 @@ class ProductType(str, Enum):
     Bundle = 'Bundle'
     Text = 'Text'
     Dataset = 'Dataset'
+    Other = 'Other'
 
 
 @unique

From 27f98c5b948917e34c25d7c7e23f19b728408efb Mon Sep 17 00:00:00 2001
From: Scott Collins
Date: Thu, 9 Sep 2021 11:48:32 -0500
Subject: [PATCH 05/13] Added an event field to the Doi object, with an
 associated enumeration

This field is now set by the release and reserve actions to ensure the
submission to DataCite performs the desired status change for the record(s).
Previously, this logic was encoded into the DataCite record template itself,
but this proved to be the incorrect approach.
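
A minimal sketch of the intended mapping, for illustration only. The actual
logic lives in the _complete_dois() methods of the release and reserve
actions; the standalone helper shown here is hypothetical:

    from pds_doi_service.core.entities.doi import Doi, DoiEvent

    def assign_event(doi: Doi, action: str, no_review=False, dry_run=False):
        # Annotate a Doi with the DataCite event for the requested action
        if action == 'release' and no_review:
            # Releasing without review publishes the DOI, moving it to the
            # findable state
            doi.event = DoiEvent.Publish
        elif action == 'reserve' and not dry_run:
            # A reserved DOI is hidden (kept in the registered state) so it
            # can still be modified
            doi.event = DoiEvent.Hide
        # Otherwise doi.event stays None and no state change is requested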
---
 src/pds_doi_service/core/actions/release.py |  7 ++++-
 src/pds_doi_service/core/actions/reserve.py | 30 ++++++++++++++++++---
 src/pds_doi_service/core/entities/doi.py    | 21 +++++++++++++++
 3 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/src/pds_doi_service/core/actions/release.py b/src/pds_doi_service/core/actions/release.py
index 69a7b732..2dae4c7d 100644
--- a/src/pds_doi_service/core/actions/release.py
+++ b/src/pds_doi_service/core/actions/release.py
@@ -14,7 +14,7 @@
 """
 
 from pds_doi_service.core.actions.action import DOICoreAction
-from pds_doi_service.core.entities.doi import DoiStatus
+from pds_doi_service.core.entities.doi import DoiEvent, DoiStatus
 from pds_doi_service.core.input.exceptions import (InputFormatException,
                                                    DuplicatedTitleDOIException,
                                                    UnexpectedDOIActionException,
@@ -137,6 +137,11 @@ def _complete_dois(self, dois):
             # Add 'status' field so the ranking in the workflow can be determined.
             doi.status = DoiStatus.Pending if self._no_review else DoiStatus.Review
 
+            if self._no_review:
+                # Add the event field to instruct DataCite to publish the DOI
+                # to the findable state (should have no effect for other providers)
+                doi.event = DoiEvent.Publish
+
         return dois
 
     def _validate_dois(self, dois):

diff --git a/src/pds_doi_service/core/actions/reserve.py b/src/pds_doi_service/core/actions/reserve.py
index dcefe1e9..3ef553bc 100644
--- a/src/pds_doi_service/core/actions/reserve.py
+++ b/src/pds_doi_service/core/actions/reserve.py
@@ -14,7 +14,7 @@
 """
 
 from pds_doi_service.core.actions.action import DOICoreAction
-from pds_doi_service.core.entities.doi import DoiStatus
+from pds_doi_service.core.entities.doi import DoiEvent, DoiStatus
 from pds_doi_service.core.input.exceptions import (CriticalDOIException,
                                                    DuplicatedTitleDOIException,
                                                    InputFormatException,
@@ -28,8 +28,8 @@
 from pds_doi_service.core.input.node_util import NodeUtil
 from pds_doi_service.core.outputs.doi_record import CONTENT_TYPE_JSON
 from pds_doi_service.core.outputs.doi_validator import DOIValidator
-from pds_doi_service.core.outputs.service import DOIServiceFactory
-from pds_doi_service.core.outputs.web_client import WEB_METHOD_POST
+from pds_doi_service.core.outputs.service import DOIServiceFactory, SERVICE_TYPE_DATACITE
+from pds_doi_service.core.outputs.web_client import WEB_METHOD_POST, WEB_METHOD_PUT
 from pds_doi_service.core.util.general_util import get_logger
 
 logger = get_logger(__name__)
@@ -136,6 +136,12 @@ def _complete_dois(self, dois):
             # Add 'status' field so the ranking in the workflow can be determined
             doi.status = DoiStatus.Reserved_not_submitted if self._dry_run else DoiStatus.Reserved
 
+            if not self._dry_run:
+                # Add the event field to instruct DataCite to make this entry
+                # hidden so it can be modified (should have no effect for other
+                # providers)
+                doi.event = DoiEvent.Hide
+
         return dois
 
     def _validate_dois(self, dois):
@@ -237,8 +243,24 @@ def run(self, **kwargs):
 
-            # Note that for both OSTI and DataCite, reserve requests should
-            # utilize the POST method
+            # Reserve requests to OSTI always use the POST method, as do
+            # initial submissions to DataCite. Submissions for records that
+            # already have a DataCite DOI must use PUT instead.
             if not self._dry_run:
+                service_type = DOIServiceFactory.get_service_type()
+
+                # If a DOI has already been assigned by DataCite,
+                # we need to use a PUT request on the URL associated with the DOI
+                if service_type == SERVICE_TYPE_DATACITE and doi.doi:
+                    method = WEB_METHOD_PUT
+                    url = '{url}/{doi}'.format(
+                        url=self._config.get('DATACITE', 'url'), doi=doi.doi
+                    )
+                # Otherwise, for both DataCite and OSTI, just a POST request
+                # on the default endpoint is sufficient
+                else:
+                    method = WEB_METHOD_POST
+                    url = 
self._config.get(service_type.upper(), 'url') + doi, o_doi_label = self._web_client.submit_content( - method=WEB_METHOD_POST, + method=method, + url=url, payload=io_doi_label, content_type=CONTENT_TYPE_JSON ) diff --git a/src/pds_doi_service/core/entities/doi.py b/src/pds_doi_service/core/entities/doi.py index 1828f6cf..e1342be5 100644 --- a/src/pds_doi_service/core/entities/doi.py +++ b/src/pds_doi_service/core/entities/doi.py @@ -75,6 +75,26 @@ class DoiStatus(str, Enum): Deactivated = 'deactivated' +@unique +class DoiEvent(str, Enum): + """ + Enumerates the possible DOI events that can be requested in a submission + to DataCite. + + Events consist of: + Publish - + Moves a DOI from draft or registered state to findable + Register - + Moves a DOI from draft to registered + Hide - + Moves a DOI from findable back to registered + + """ + Publish = 'publish' + Register = 'register' + Hide = 'hide' + + @dataclass class Doi: """The dataclass definition for a Doi object.""" @@ -97,3 +117,4 @@ class Doi: message: str = None date_record_added: datetime = None date_record_updated: datetime = None + event: DoiEvent = None From d0a3850819265f1656e4b6e4b1b25afdb58da123 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Thu, 9 Sep 2021 11:50:48 -0500 Subject: [PATCH 06/13] Fixed bug in DOIWebParser._get_identifier_from_site_url() where a URL containing no "&version=" token caused index errors --- src/pds_doi_service/core/outputs/web_parser.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/pds_doi_service/core/outputs/web_parser.py b/src/pds_doi_service/core/outputs/web_parser.py index da3a30c4..170359b0 100644 --- a/src/pds_doi_service/core/outputs/web_parser.py +++ b/src/pds_doi_service/core/outputs/web_parser.py @@ -40,16 +40,20 @@ def _get_identifier_from_site_url(site_url): """ # TODO: rewrite to utilize urlparse and support PDS3 labels + lid_vid_value = None + site_tokens = site_url.split("identifier=") identifier_tokens = site_tokens[1].split(";") lid_vid_tokens = identifier_tokens[0].split("&version=") - lid_value = lid_vid_tokens[0].replace("%3A", ":") - vid_value = lid_vid_tokens[1] - # Finally combine the lid and vid together. - lid_vid_value = lid_value + '::' + vid_value + if len(lid_vid_tokens) >= 2: + lid_value = lid_vid_tokens[0].replace("%3A", ":") + vid_value = lid_vid_tokens[1] + + # Finally combine the lid and vid together. 
+ lid_vid_value = lid_value + '::' + vid_value return lid_vid_value From 25b1fc63a1b5db5a6972264043f1fb462350ec75 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Thu, 9 Sep 2021 11:53:24 -0500 Subject: [PATCH 07/13] Added mappings for missing DoiStatus values to the workflow order defined in doi_validator.py --- src/pds_doi_service/core/outputs/doi_validator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/pds_doi_service/core/outputs/doi_validator.py b/src/pds_doi_service/core/outputs/doi_validator.py index 1ae998d2..83ece862 100644 --- a/src/pds_doi_service/core/outputs/doi_validator.py +++ b/src/pds_doi_service/core/outputs/doi_validator.py @@ -43,12 +43,16 @@ class DOIValidator: # The workflow_order dictionary contains the progression of the status of a DOI: m_workflow_order = { + DoiStatus.Error: 0, + DoiStatus.Unknown: 0, DoiStatus.Reserved_not_submitted: 0, DoiStatus.Reserved: 1, DoiStatus.Draft: 2, DoiStatus.Review: 3, DoiStatus.Pending: 4, - DoiStatus.Registered: 5 + DoiStatus.Registered: 5, + DoiStatus.Findable: 5, + DoiStatus.Deactivated: 5 } def __init__(self, db_name=None): From 14d1c5cfae316b3da8ca7657f6eba6425e59463f Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Thu, 9 Sep 2021 11:54:12 -0500 Subject: [PATCH 08/13] Added identifiers field to Doi object definition Field is useful for storing additional identifiers parsed from DataCite DOI records --- src/pds_doi_service/core/entities/doi.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pds_doi_service/core/entities/doi.py b/src/pds_doi_service/core/entities/doi.py index e1342be5..6b450970 100644 --- a/src/pds_doi_service/core/entities/doi.py +++ b/src/pds_doi_service/core/entities/doi.py @@ -103,6 +103,7 @@ class Doi: product_type: ProductType product_type_specific: str related_identifier: str + identifiers: list = field(default_factory=list) authors: list = None keywords: set = field(default_factory=set) editors: list = None From cda6f9d7805edea2787b54a65f9db9cdc9a5e389 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Thu, 9 Sep 2021 11:55:08 -0500 Subject: [PATCH 09/13] Overhauled the DOIDataCiteWebParser class to better support parsing of labels submitted by other PDS nodes --- .../outputs/datacite/datacite_web_parser.py | 203 ++++++++++-------- 1 file changed, 118 insertions(+), 85 deletions(-) diff --git a/src/pds_doi_service/core/outputs/datacite/datacite_web_parser.py b/src/pds_doi_service/core/outputs/datacite/datacite_web_parser.py index d262740a..89529ee5 100644 --- a/src/pds_doi_service/core/outputs/datacite/datacite_web_parser.py +++ b/src/pds_doi_service/core/outputs/datacite/datacite_web_parser.py @@ -37,14 +37,14 @@ class DOIDataCiteWebParser(DOIWebParser): This class only supports parsing records in JSON format. 
""" _optional_fields = [ - 'id', 'doi', 'description', 'keywords', 'authors', 'site_url', - 'editors', 'status', 'date_record_added', 'date_record_updated', - 'contributor', 'related_identifier' + 'id', 'doi', 'identifiers', 'description', 'keywords', 'authors', + 'site_url', 'editors', 'status', 'date_record_added', + 'date_record_updated', 'contributor' ] _mandatory_fields = [ 'title', 'publisher', 'publication_date', 'product_type', - 'product_type_specific' + 'product_type_specific', 'related_identifier' ] @staticmethod @@ -55,87 +55,114 @@ def _parse_id(record): else: # Parse the ID from the DOI field, it it's available return record.get('doi').split('/')[-1] - except (AttributeError, KeyError) as err: - logger.warning('Could not parse id from record, reason: %s %s', - err.__class__, err) + except (AttributeError, KeyError): + logger.warning('Could not parse optional field "id"') @staticmethod def _parse_doi(record): try: return record['doi'] - except KeyError as err: - logger.warning('Could not parse doi from record, reason: %s %s', - err.__class__, err) + except KeyError: + logger.warning('Could not parse optional field "doi"') + + @staticmethod + def _parse_identifiers(record): + try: + identifiers = filter( + lambda identifier: identifier["identifierType"] != "DOI", + record['identifiers'] + ) + return list(identifiers) + except KeyError: + logger.warning('Could not parse optional field "identifiers"') @staticmethod def _parse_description(record): try: return record['descriptions'][0]['description'] - except (IndexError, KeyError) as err: - logger.warning('Could not parse description from record, reason: %s %s', - err.__class__, err) + except (IndexError, KeyError): + logger.warning('Could not parse optional field "description"') @staticmethod def _parse_keywords(record): try: return set(sorted(subject['subject'] for subject in record['subjects'])) - except KeyError as err: - logger.warning('Could not parse keywords from record, reason: %s %s', - err.__class__, err) + except KeyError: + logger.warning('Could not parse optional field "keywords"') @staticmethod def _parse_authors(record): try: - return [{'first_name': creator['givenName'], - 'last_name': creator['familyName']} - for creator in record['creators']] - except KeyError as err: - logger.warning('Could not parse authors from record, reason: %s %s', - err.__class__, err) + authors = [] + + for creator in record['creators']: + if all(name_type in creator for name_type in ('givenName', 'familyName')): + name = f"{creator['givenName']} {creator['familyName']}" + else: + name = creator['name'] + + authors.append( + { + 'name': name, + 'name_type': creator['nameType'], + 'name_identifiers': creator.get('nameIdentifiers', []) + } + ) + + return authors + except KeyError: + logger.warning('Could not parse optional field "authors"') @staticmethod def _parse_site_url(record): try: return html.unescape(record['url']) - except (KeyError, TypeError) as err: - logger.warning('Could not parse site url from record, reason: %s %s', - err.__class__, err) + except (KeyError, TypeError): + logger.warning('Could not parse optional field "site_url"') @staticmethod def _parse_editors(record): try: - return [{'first_name': contributor['givenName'], - 'last_name': contributor['familyName']} - for contributor in record['contributors'] - if contributor['contributorType'] == 'Editor'] - except KeyError as err: - logger.warning('Could not parse editors from record, reason: %s %s', - err.__class__, err) + editors = [] + + for contributor in 
record['contributors']: + if contributor['contributorType'] == 'Editor': + if all(name_type in contributor for name_type in ('givenName', 'familyName')): + name = f"{contributor['givenName']} {contributor['familyName']}" + else: + name = contributor['name'] + + editors.append( + { + 'name': name, + 'name_identifiers': contributor.get('nameIdentifiers', []) + } + ) + return editors + except KeyError: + logger.warning('Could not parse optional field "editors"') @staticmethod def _parse_status(record): try: return DoiStatus(record['state']) - except (KeyError, ValueError) as err: - logger.warning('Could not parse status from record, reason: %s %s', - err.__class__, err) + except (KeyError, ValueError): + logger.warning('Could not parse optional field "status"') @staticmethod def _parse_date_record_added(record): try: return isoparse(record['created']) - except (KeyError, ValueError) as err: - logger.warning('Could not parse date added from record, reason: %s %s', - err.__class__, err) + except (KeyError, ValueError): + logger.warning('Could not parse optional field "date_record_added"') @staticmethod def _parse_date_record_updated(record): try: return isoparse(record['updated']) except (KeyError, ValueError) as err: - logger.warning('Could not parse date updated from record, reason: %s %s', - err.__class__, err) + logger.warning('Could not parse optional field "date_record_updated"') @staticmethod def _parse_contributor(record): @@ -152,9 +179,8 @@ def _parse_contributor(record): .strip()) return contributor - except (KeyError, StopIteration, ValueError) as err: - logger.warning('Could not parse a contributor from record, reason: %s %s', - err.__class__, err) + except (KeyError, StopIteration, ValueError): + logger.warning('Could not parse optional field "contributor"') @staticmethod def _parse_related_identifier(record): @@ -162,67 +188,66 @@ def _parse_related_identifier(record): try: identifier = record['relatedIdentifiers'][0]['relatedIdentifier'] - except (IndexError, KeyError) as err: - if 'url' in record: + except (IndexError, KeyError): + if 'identifiers' in record: + for identifier_record in record['identifiers']: + if identifier_record["identifier"].startswith('urn:'): + identifier = identifier_record["identifier"] + break + elif 'url' in record: logger.info('Parsing related identifier from URL') identifier = DOIWebParser._get_identifier_from_site_url(record['url']) - else: - logger.warning('Could not parse a related identifier from record, ' - 'reason: %s %s', err.__class__, err) - if identifier: - identifier = identifier.strip() + if identifier is None: + raise InputFormatException( + 'Failed to parse mandatory field "related_identifier"' + ) - return identifier + return identifier.strip() @staticmethod def _parse_title(record): try: return record['titles'][0]['title'] - except (IndexError, KeyError) as err: + except (IndexError, KeyError): raise InputFormatException( - f'Failed to parse title from provided record, reason: ' - f'{err.__class__} {err}' + 'Failed to parse mandatory field "title"' ) @staticmethod def _parse_publisher(record): try: return record['publisher'] - except KeyError as err: + except KeyError: raise InputFormatException( - f'Failed to parse publisher from provided record, reason: ' - f'{err.__class__} {err}' + 'Failed to parse mandatory field "publisher"' ) @staticmethod def _parse_publication_date(record): try: return datetime.strptime(str(record['publicationYear']), '%Y') - except (KeyError, ValueError) as err: + except (KeyError, ValueError): raise 
InputFormatException( - 'Failed to parse publication date from provided record, reason: ' - f'{err.__class__} {err}' + 'Failed to parse mandatory field "publication_date"' ) @staticmethod def _parse_product_type(record): try: return ProductType(record['types']['resourceTypeGeneral']) - except (KeyError, ValueError) as err: + except (KeyError, ValueError): raise InputFormatException( - 'Failed to parse product type from provided record, reason: ' - f'{err.__class__} {err}' + 'Failed to parse mandatory field "product_type"' ) @staticmethod def _parse_product_type_specific(record): try: return record['types']['resourceType'] - except KeyError as err: + except KeyError: raise InputFormatException( - 'Failed to parse product type specific from provided record, ' - f'reason: {err.__class__} {err}' + 'Failed to parse mandatory field "product_type_specific"' ) @staticmethod @@ -261,31 +286,39 @@ def parse_dois_from_label(label_text, content_type=CONTENT_TYPE_JSON): if not isinstance(datacite_records, list): datacite_records = [datacite_records] - for datacite_record in datacite_records: - doi_fields = {} + for index, datacite_record in enumerate(datacite_records): + try: + logger.info('Parsing record index %d', index) + doi_fields = {} + + # Everything we care about in a DataCite response is under + # attributes + datacite_record = datacite_record['attributes'] - # Everything we care about in a DataCite response is under - # attributes - datacite_record = datacite_record['attributes'] + for mandatory_field in DOIDataCiteWebParser._mandatory_fields: + doi_fields[mandatory_field] = getattr( + DOIDataCiteWebParser, f'_parse_{mandatory_field}')(datacite_record) + logger.debug('Parsed value %s for mandatory field %s', + doi_fields[mandatory_field], mandatory_field) - for mandatory_field in DOIDataCiteWebParser._mandatory_fields: - doi_fields[mandatory_field] = getattr( - DOIDataCiteWebParser, f'_parse_{mandatory_field}')(datacite_record) - logger.debug('Parsed value %s for mandatory field %s', - doi_fields[mandatory_field], mandatory_field) + for optional_field in DOIDataCiteWebParser._optional_fields: + parsed_value = getattr( + DOIDataCiteWebParser, f'_parse_{optional_field}')(datacite_record) - for optional_field in DOIDataCiteWebParser._optional_fields: - parsed_value = getattr( - DOIDataCiteWebParser, f'_parse_{optional_field}')(datacite_record) + if parsed_value is not None: + doi_fields[optional_field] = parsed_value + logger.debug('Parsed value %s for optional field %s', + parsed_value, optional_field) - if parsed_value is not None: - doi_fields[optional_field] = parsed_value - logger.debug('Parsed value %s for optional field %s', - parsed_value, optional_field) + doi = Doi(**doi_fields) - doi = Doi(**doi_fields) + dois.append(doi) + except InputFormatException as err: + logger.warning('Failed to parse a DOI object from record index %d ' + 'of the provided label, reason: %s', index, str(err)) + continue - dois.append(doi) + logger.info('Parsed %d DOI objects from %d records', len(dois), len(datacite_records)) return dois, errors From 88c4b6a9fe915ac1c473ba7988d36f338c13186f Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Thu, 9 Sep 2021 11:56:28 -0500 Subject: [PATCH 10/13] Improved the DataCite JSON label Jinja2 template to support additional fields --- ...DOI_DataCite_template_20210520-jinja2.json | 55 ++++++++++++++----- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/src/pds_doi_service/core/outputs/datacite/DOI_DataCite_template_20210520-jinja2.json 
b/src/pds_doi_service/core/outputs/datacite/DOI_DataCite_template_20210520-jinja2.json
index c5cd1bc7..427005b9 100644
--- a/src/pds_doi_service/core/outputs/datacite/DOI_DataCite_template_20210520-jinja2.json
+++ b/src/pds_doi_service/core/outputs/datacite/DOI_DataCite_template_20210520-jinja2.json
@@ -10,10 +10,8 @@
     {% endif %}
     "type": "dois",
     "attributes": {
-        {% if doi.status.value == "pending" %}
-        "event": "publish",
-        {% elif doi.status.value == "findable" %}
-        "event": "hide",
+        {% if doi.event %}
+        "event": "{{ doi.event.value }}",
         {% endif %}
         {% if doi.doi %}
         "doi": "{{ doi.doi }}",
@@ -24,6 +22,12 @@
         "suffix": "{{ doi.id }}",
         {% endif %}
         "identifiers": [
+            {% for identifier in doi.identifiers %}
+            {
+                "identifier": "{{ identifier.identifier.strip() }}",
+                "identifierType": "{{ identifier.identifierType }}"
+            },
+            {% endfor %}
             {
             {% if doi.doi %}
                 "identifier": "{{ doi.doi }}",
@@ -36,10 +40,25 @@
         "creators": [
             {% for author in doi.authors %}
             {
+                {% if author.name_type %}
+                "nameType": "{{author.name_type}}",
+                {% else %}
                 "nameType": "Personal",
-                "name": "{{ author['last_name'] }}, {{ author['first_name'] }}",
-                "givenName": "{{ author['first_name'] }}",
-                "familyName": "{{ author['last_name'] }}"
+                {% endif %}
+                {% if author.first_name and author.last_name %}
+                "name": "{{ author.first_name }} {{ author.last_name }}",
+                {% else %}
+                "name": "{{ author.name }}",
+                {% endif %}
+                "nameIdentifiers": [
+                {% for name_identifier in author.name_identifiers %}
+                    {
+                    {% for key, value in name_identifier.items() %}
+                        "{{key}}": "{{value}}"{% if not loop.last %},{% endif +%}
+                    {% endfor %}
+                    }{% if not loop.last %},{% endif +%}
+                {% endfor %}
+                ]
             }{% if not loop.last %},{% endif +%}
             {% endfor %}
         ],
@@ -60,9 +79,20 @@
             {% for editor in doi.editors %}
             {
                 "nameType": "Personal",
-                "name": "{{ editor['last_name'] }}, {{ editor['first_name'] }}",
-                "givenName": "{{ editor['first_name'] }}",
-                "familyName": "{{ editor['last_name'] }}",
+                {% if editor.first_name and editor.last_name %}
+                "name": "{{ editor.first_name }} {{ editor.last_name }}",
+                {% else %}
+                "name": "{{ editor.name }}",
+                {% endif %}
+                "nameIdentifiers": [
+                {% for name_identifier in editor.name_identifiers %}
+                    {
+                    {% for key, value in name_identifier.items() %}
+                        "{{key}}": "{{value}}"{% if not loop.last %},{% endif +%}
+                    {% endfor %}
+                    }{% if not loop.last %},{% endif +%}
+                {% endfor %}
+                ],
                 "contributorType": "Editor"
             },
             {% endfor %}
@@ -79,9 +109,8 @@
         "relatedIdentifiers": [
             {
                 "relatedIdentifier": "{{ doi.related_identifier }}",
-                "relatedIdentifierType": "URN",
-                "relationType": "HasMetadata",
-                "resourceTypeGeneral": "Text"
+                "relatedIdentifierType": {% if doi.related_identifier.lower().startswith("urn") %}"URN"{% else %}"Handle"{% endif %},
+                "relationType": "IsIdenticalTo"
             }
         ],
         {% if doi.description %}

From fce9afdf6c655a92d49f046e1705d26cf7e1bb95 Mon Sep 17 00:00:00 2001
From: Scott Collins
Date: Thu, 9 Sep 2021 11:59:04 -0500
Subject: [PATCH 11/13] Updated DataCite-specific regression tests to account
 for changes to how event field is used and how authors/editors are parsed

---
 .../core/outputs/test/datacite_test.py | 60 +++----------------
 1 file changed, 7 insertions(+), 53 deletions(-)

diff --git a/src/pds_doi_service/core/outputs/test/datacite_test.py b/src/pds_doi_service/core/outputs/test/datacite_test.py
index cdcd078a..c24d3c01 100644
--- a/src/pds_doi_service/core/outputs/test/datacite_test.py
+++ b/src/pds_doi_service/core/outputs/test/datacite_test.py
@@ -50,52 +50,6 @@ def test_create_datacite_label_json(self):
 
         self.assertDictEqual(input_doi_fields, output_doi_fields)
 
-    def 
test_assign_datacite_event(self): - """ - Test assignment of the event field when creating a label for a DOI - in the reserved or pending state - """ - # Create a dummy Doi object to be reserved - test_doi = Doi(title='InSight Cameras Bundle', - publication_date=datetime(2019, 1, 1, 0, 0), - product_type=ProductType.Dataset, - product_type_specific='PDS4 Refereed Data Bundle', - related_identifier='urn:nasa:pds:insight_cameras::1.0', - id='yzw2-vz66', - doi='10.13143/yzw2-vz66', - publisher='NASA Planetary Data System', - contributor='Engineering', - status=DoiStatus.Reserved) - - # Create the label to submit to DataCite, using the Pending (release) - # state, which should map to the "publish" event - test_doi.status = DoiStatus.Pending - - release_label = DOIDataCiteRecord().create_doi_record(test_doi) - release_label_dict = json.loads(release_label) - - self.assertIn('event', release_label_dict['data']['attributes']) - self.assertEqual(release_label_dict['data']['attributes']['event'], 'publish') - - # If updating a record that has already been published (findable), - # we need to move it back to the registered stage via the "hide" event - test_doi.status = DoiStatus.Findable - - back_to_reserve_label = DOIDataCiteRecord().create_doi_record(test_doi) - - back_to_reserve_label_dict = json.loads(back_to_reserve_label) - - self.assertIn('event', back_to_reserve_label_dict['data']['attributes']) - self.assertEqual(back_to_reserve_label_dict['data']['attributes']['event'], 'hide') - - # For any other state, we should not get an event field in the label - test_doi.status = DoiStatus.Reserved_not_submitted - - reserve_label = DOIDataCiteRecord().create_doi_record(test_doi) - reserve_label_dict = json.loads(reserve_label) - - self.assertNotIn('event', reserve_label_dict['data']['attributes']) - def requests_valid_request_patch(method, url, **kwargs): response = Response() @@ -175,13 +129,13 @@ def setUpClass(cls): join(cls.test_dir, os.pardir, os.pardir, os.pardir, os.pardir, os.pardir, 'input') ) - cls.expected_authors = [{'first_name': 'R.', 'last_name': 'Deen'}, - {'first_name': 'H.', 'last_name': 'Abarca'}, - {'first_name': 'P.', 'last_name': 'Zamani'}, - {'first_name': 'J.', 'last_name': 'Maki'}] - cls.expected_editors = [{'first_name': 'P. H.', 'last_name': 'Smith'}, - {'first_name': 'M.', 'last_name': 'Lemmon'}, - {'first_name': 'R. F.', 'last_name': 'Beebe'}] + cls.expected_authors = [{'name': 'R. Deen', 'name_identifiers': [], 'name_type': 'Personal'}, + {'name': 'H. Abarca', 'name_identifiers': [], 'name_type': 'Personal'}, + {'name': 'P. Zamani', 'name_identifiers': [], 'name_type': 'Personal'}, + {'name': 'J. Maki', 'name_identifiers': [], 'name_type': 'Personal'}] + cls.expected_editors = [{'name': 'P. H. Smith', 'name_identifiers': []}, + {'name': 'M. Lemmon', 'name_identifiers': []}, + {'name': 'R. F. 
Beebe', 'name_identifiers': []}] cls.expected_keywords = {'data', 'rdr', 'product', 'experiment', 'lander', 'context', 'PDS', 'raw', 'mars', 'record', 'reduced', 'science', 'edr', 'PDS4', 'camera', 'deployment', From 76ed4cb3ec72cdd94afd779b135befeead21f628 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Thu, 9 Sep 2021 11:59:49 -0500 Subject: [PATCH 12/13] Modified interfaces in DOIServiceFactory to allow a caller to specify the service type to return an object for --- src/pds_doi_service/core/outputs/service.py | 86 ++++++++++++++++----- 1 file changed, 67 insertions(+), 19 deletions(-) diff --git a/src/pds_doi_service/core/outputs/service.py b/src/pds_doi_service/core/outputs/service.py index 9748a68a..c7991859 100644 --- a/src/pds_doi_service/core/outputs/service.py +++ b/src/pds_doi_service/core/outputs/service.py @@ -77,15 +77,15 @@ class DOIServiceFactory: _config = DOIConfigUtil().get_config() @staticmethod - def get_service_type(): + def _check_service_type(service_type): """ - Returns the configured service type as defined within the INI config. + Checks if the provided service type is among the expected values. - Returns - ------- + Parameters + ---------- service_type : str - The service type to be used. This value is converted to lowercase - by this method before it is returned. + Service type to check. Is automatically converted to lowercase + to provide a case-insensitive check. Raises ------ @@ -94,24 +94,42 @@ def get_service_type(): class (or if no type is specified by the INI config at all). """ - service_type = DOIServiceFactory._config.get( - 'SERVICE', 'provider', fallback='unassigned' - ) - if service_type.lower() not in VALID_SERVICE_TYPES: raise ValueError( f'Unsupported service type "{service_type}" provided.\n' - f'Service type must be assigned to the SERVICE.provider field of ' + f'Service type should be assigned to the SERVICE.provider field of ' f'the INI config with one of the following values: {VALID_SERVICE_TYPES}' ) + @staticmethod + def get_service_type(): + """ + Returns the configured service type as defined within the INI config. + + Returns + ------- + service_type : str + The service type to be used. This value is converted to lowercase + by this method before it is returned. + + """ + service_type = DOIServiceFactory._config.get( + 'SERVICE', 'provider', fallback='unassigned' + ) + return service_type.lower() @staticmethod - def get_doi_record_service(): + def get_doi_record_service(service_type=None): """ Returns the appropriate DOIRecord subclass for the current service type. + Parameters + ---------- + service_type : str, optional + The service type to return a DOIRecord subclass for. Defaults to + the SERVICE.provider value of the INI config. + Returns ------- DOIRecord @@ -119,7 +137,10 @@ def get_doi_record_service(): DOI service type. """ - service_type = DOIServiceFactory.get_service_type() + if not service_type: + service_type = DOIServiceFactory.get_service_type() + + DOIServiceFactory._check_service_type(service_type) doi_record_class = DOIServiceFactory._DOI_RECORD_MAP[service_type] logger.debug('Returning instance of %s for service type %s', @@ -128,10 +149,16 @@ def get_doi_record_service(): return doi_record_class() @staticmethod - def get_validator_service(): + def get_validator_service(service_type=None): """ Returns the appropriate DOIValidator subclass for the current service type. + Parameters + ---------- + service_type : str, optional + The service type to return a DOIValidator subclass for. 
Defaults to + the SERVICE.provider value of the INI config. + Returns ------- DOIValidator @@ -139,7 +166,10 @@ def get_validator_service(): DOI service type. """ - service_type = DOIServiceFactory.get_service_type() + if not service_type: + service_type = DOIServiceFactory.get_service_type() + + DOIServiceFactory._check_service_type(service_type) doi_validator_class = DOIServiceFactory._SERVICE_VALIDATOR_MAP[service_type] logger.debug('Returning instance of %s for service type %s', @@ -148,10 +178,16 @@ def get_validator_service(): return doi_validator_class() @staticmethod - def get_web_client_service(): + def get_web_client_service(service_type=None): """ Returns the appropriate DOIWebClient subclass for the current service type. + Parameters + ---------- + service_type : str, optional + The service type to return a DOIWebClient subclass for. Defaults to + the SERVICE.provider value of the INI config. + Returns ------- DOIWebClient @@ -159,7 +195,10 @@ def get_web_client_service(): DOI service type. """ - service_type = DOIServiceFactory.get_service_type() + if not service_type: + service_type = DOIServiceFactory.get_service_type() + + DOIServiceFactory._check_service_type(service_type) web_client_class = DOIServiceFactory._WEB_CLIENT_MAP[service_type] logger.debug('Returning instance of %s for service type %s', @@ -168,10 +207,16 @@ def get_web_client_service(): return web_client_class() @staticmethod - def get_web_parser_service(): + def get_web_parser_service(service_type=None): """ Returns the appropriate DOIWebParser subclass for the current service type. + Parameters + ---------- + service_type : str, optional + The service type to return a DOIWebParser subclass for. Defaults to + the SERVICE.provider value of the INI config. + Returns ------- DOIWebParser @@ -179,7 +224,10 @@ def get_web_parser_service(): DOI service type. """ - service_type = DOIServiceFactory.get_service_type() + if not service_type: + service_type = DOIServiceFactory.get_service_type() + + DOIServiceFactory._check_service_type(service_type) web_parser_class = DOIServiceFactory._WEB_PARSER_MAP[service_type] logger.debug('Returning instance of %s for service type %s', From dd7b2814dc5fee644c4bd2831394056abe007f68 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Thu, 9 Sep 2021 12:01:08 -0500 Subject: [PATCH 13/13] Reworked the initialize_production_deployment.py script to better support syncing of records Changes include: * Addition of a --service argument to specify the source provider (DataCite or OSTI) to pull existing records from * Addition of a --prefix argument to specify a DOI prefix to query for, other than the default in the INI config * Fixes to ensure that queried DOI's are imported using the format expected by the service configured in the INI config * Misc code/documentation cleanup --- .../util/initialize_production_deployment.py | 301 ++++++++++++------ 1 file changed, 207 insertions(+), 94 deletions(-) diff --git a/src/pds_doi_service/core/util/initialize_production_deployment.py b/src/pds_doi_service/core/util/initialize_production_deployment.py index a5c37b34..27e4ae5f 100644 --- a/src/pds_doi_service/core/util/initialize_production_deployment.py +++ b/src/pds_doi_service/core/util/initialize_production_deployment.py @@ -11,18 +11,24 @@ initialize_production_deployment.py =================================== -Script used to import the available DOIs from the server provider into a local +Script used to import the available DOIs from a service provider into the local production database. 
""" # Parameters to this script: # +# The -S (optional) is the name of the DOI service provider to pull existing +# DOI records from. When used with the -i option, it should correspond +# to the format of the provided input file. Should be set to either osti +# or datacite. +# The -p (optional) may be used to specify a DOI prefix to query for. By +# default the prefix is obtained from the INI config. # The -s (required) is email of the PDS operator: -s pds-operator@jpl.nasa.gov # The -i is optional. If the input is provided and is a file, parse from it # The format of input file is the same format of text returned from # querying the server via a browser or curl command. # If provided,, this will override the url in the config file. -# The -d is optional. If provided it is the name of the database file to +# The -d is optional. If provided it is the name of the database file to # write records to: -d doi.db # If provided, this will override the db_name in the config file. # The --dry-run parameter allows the code to parse the input or querying the @@ -56,13 +62,16 @@ # with the NASA-PDS account. import argparse +import json import logging import os from datetime import datetime from pds_doi_service.core.input.exceptions import (InputFormatException, CriticalDOIException) -from pds_doi_service.core.outputs.service import DOIServiceFactory +from pds_doi_service.core.outputs.service import (DOIServiceFactory, + SERVICE_TYPE_DATACITE, + VALID_SERVICE_TYPES) from pds_doi_service.core.outputs.osti.osti_web_parser import DOIOstiXmlWebParser from pds_doi_service.core.outputs.doi_record import CONTENT_TYPE_JSON from pds_doi_service.core.outputs.transaction_builder import TransactionBuilder @@ -80,13 +89,26 @@ def create_cmd_parser(): parser = argparse.ArgumentParser( description='Script to bulk import existing DOIs into the local ' - 'transaction database.' + 'transaction database.', + epilog='Note: When DOI records are imported to the local transaction ' + 'database, the DOI service creates an associated output label ' + 'for each record under the transaction_history directory. The ' + 'format of this output label is driven by the SERVICE.provider ' + 'field of the INI. Please ensure the field is set appropriately ' + 'before using this script, as a mismatch could cause parsing ' + 'errors when using the DOI service after this script.' ) - parser.add_argument("-i", "--input", required=False, - help="Input file (XML or JSON) to import existing DOIs from. " - "If no value is provided, the server URL " - "specified by the DOI service configuration INI " - "file is used by default.") + parser.add_argument("-S", "--service", required=False, default=None, + help="Name of the service provider to pull existing DOI " + "records from. If not provided, the provider configured " + "by the DOI service configuration INI is used by " + "default. Should be one of: [{}]" + .format(", ".join(VALID_SERVICE_TYPES))) + parser.add_argument("-p", "--prefix", required=False, default=None, + help="Specify the DOI prefix value to query the service " + "provider for. If not provided, the prefix value " + "configured to the providing in the INI config is " + "used by default.") parser.add_argument("-s", "--submitter-email", required=False, default='pds-operator@jpl.nasa.gov', help="The email address of the user performing the " @@ -96,6 +118,11 @@ def create_cmd_parser(): help="Name of the SQLite3 database file name to commit " "DOI records to. 
If not provided, the file name is " "obtained from the DOI service INI config.") + parser.add_argument("-i", "--input-file", required=False, + help="Input file (XML or JSON) to import existing DOIs from. " + "If no value is provided, the server URL " + "specified by the DOI service configuration INI " + "file is used by default.") parser.add_argument("-o", "--output-file", required=False, default=None, help="Path to write out the DOI JSON labels as returned " "from the query. When created, this file can be used " @@ -112,7 +139,23 @@ def create_cmd_parser(): def _read_from_local_xml(path): - """Read from a local xml file containing output from a query.""" + """ + Read from a local xml file containing output from a query. + + Note that since the PDS DOI service only supports XML labels from OSTI, + that is the default parser used by this function. + + Parameters + ---------- + path : str + Path of the XML file to read and parse. + + Returns + ------- + dois : list of Doi + The DOI objects parsed from the XML label. + + """ try: with open(path, mode='r') as f: doi_xml = f.read() @@ -124,73 +167,139 @@ def _read_from_local_xml(path): return dois -def _read_from_local_json(path): - """Read from a local JSON file containing output from a query.""" +def _read_from_local_json(service, path): + """ + Read from a local JSON file containing output from a query. + + The appropriate JSON parser (OSTI or DataCite) is determined based on + the provided service type. + + Parameters + ---------- + service : str + The name of the service provider corresponding to the JSON format + to read and parse. + path : str + Path to the JSON file to read and parse. + + Returns + ------- + dois : list of Doi + The DOI objects parsed from the JSON label. + + """ try: with open(path, mode='r') as f: doi_json = f.read() except Exception as e: raise CriticalDOIException(str(e)) - web_parser = DOIServiceFactory.get_web_parser_service() + web_parser = DOIServiceFactory.get_web_parser_service(service) - dois, _ = web_parser.parse_dois_from_label( - doi_json, content_type=CONTENT_TYPE_JSON - ) + try: + dois, _ = web_parser.parse_dois_from_label( + doi_json, content_type=CONTENT_TYPE_JSON + ) + except Exception: + raise InputFormatException( + f"Unable to parse input file {path} using parser {web_parser.__name__}\n" + f"Please ensure the --service flag is set correctly to specify the " + f"correct parser type for the format." + ) return dois -def _read_from_path(path): +def _read_from_path(service, path): + """ + Reads the label at the provided path, using the appropriate parser for the + provided service type. + + Parameters + ---------- + service : str + The name of the service provider corresponding to the format + to read and parse. Only used for JSON labels. + path : str + Path to the label to read and parse. The label format (XML or JSON) is + derived from the path's file extension. + + Returns + ------- + dois : list of Doi + The DOI objects parsed from the label. + + Raises + ------ + InputFormatException + If the file path does not exist or does not correspond to an XML or + JSON file. + + """ + if not os.path.exists(path): + raise InputFormatException(f"Error reading file {path}. " + "File may not exist.") + if path.endswith('.xml'): return _read_from_local_xml(path) elif path.endswith('.json'): - return _read_from_local_json(path) + return _read_from_local_json(service, path) raise InputFormatException(f'File {path} is not supported. 
' f'Only .xml and .json are supported.') -def _parse_input(input_file): - if os.path.exists(input_file): - return _read_from_path(input_file) - - raise InputFormatException(f"Error reading file {input_file}. " - f"File may not exist.") - - -def get_dois_from_provider(output_file): +def get_dois_from_provider(service, prefix, output_file=None): """ Queries the service provider for all the current DOI associated with the - PDS-USER account. The server name is fetched from the config file with the - 'url' field in the service provider grouping. + provided prefix. - """ - query_dict = {} + Parameters + ---------- + service : str + Name of the service provider to pull DOI's from. + prefix : str + DOI prefix to query for. + output_file : str, optional + If provided, path to an output file to write the results of the DOI + query to. + + Returns + ------- + dois : list of Doi + The DOI objects obtained from the service provider. + server_url : str + The URL of the service provider endpoint. Helpful for logging purposes. - service_provider = DOIServiceFactory.get_service_type().upper() + """ + if service == SERVICE_TYPE_DATACITE: + query_dict = {'doi': f'{prefix}/*'} + else: + query_dict = {'doi': prefix} - o_server_url = m_config.get(service_provider, 'url') + server_url = m_config.get(service.upper(), 'url') - logger.info("Using %s server URL %s", service_provider, o_server_url) + logger.info("Using %s server URL %s", service, server_url) - web_client = DOIServiceFactory.get_web_client_service() + web_client = DOIServiceFactory.get_web_client_service(service) doi_json = web_client.query_doi( query=query_dict, content_type=CONTENT_TYPE_JSON ) if output_file: + logger.info("Writing query results to %s", output_file) + with open(output_file, 'w') as outfile: - outfile.write(doi_json) + json.dump(json.loads(doi_json), outfile, indent=4) - web_parser = DOIServiceFactory.get_web_parser_service() + web_parser = DOIServiceFactory.get_web_parser_service(service) dois, _ = web_parser.parse_dois_from_label( doi_json, content_type=CONTENT_TYPE_JSON ) - return dois, o_server_url + return dois, server_url def _get_node_id_from_contributors(doi_fields): @@ -198,48 +307,60 @@ def _get_node_id_from_contributors(doi_fields): Given a doi object, attempt to extract the node_id from contributors field. If unable to, return 'eng' as default. This function is a one-off as well so no fancy logic. + + Parameters + ---------- + doi_fields : dict + DOI metadata fields to obtain PDS node ID from. + + Returns + ------- + node_id : str + The three-character PDS identifier determined from the DOI's contributor + field. 
+ """ - o_node_id = 'eng' + node_id = 'eng' if doi_fields.get('contributor'): full_name_orig = doi_fields['contributor'] full_name = full_name_orig.lower() if 'atmospheres' in full_name: - o_node_id = 'atm' + node_id = 'atm' elif 'engineering' in full_name: - o_node_id = 'eng' + node_id = 'eng' elif 'geosciences' in full_name: - o_node_id = 'geo' + node_id = 'geo' elif 'imaging' in full_name: - o_node_id = 'img' + node_id = 'img' elif 'cartography' in full_name: - o_node_id = 'img' + node_id = 'img' # Some uses title: Navigation and Ancillary Information Facility Node # Some uses title: Navigational and Ancillary Information Facility # So check for both elif 'navigation' in full_name and 'ancillary' in full_name: - o_node_id = 'naif' + node_id = 'naif' elif 'navigational' in full_name and 'ancillary' in full_name: - o_node_id = 'naif' + node_id = 'naif' elif 'plasma' in full_name: - o_node_id = 'ppi' + node_id = 'ppi' elif 'ring' in full_name and 'moon' in full_name: - o_node_id = 'rms' + node_id = 'rms' elif 'small' in full_name or 'bodies' in full_name: - o_node_id = 'sbn' + node_id = 'sbn' logger.debug("Derived node ID %s from Contributor field %s", - o_node_id, full_name_orig) + node_id, full_name_orig) else: logger.warning("No Contributor field available for DOI %s, " - "defaulting to node ID %s", doi_fields['doi'], o_node_id) + "defaulting to node ID %s", doi_fields['doi'], node_id) - return o_node_id + return node_id -def perform_import_to_database(db_name, input_source, dry_run, submitter_email, - output_file): +def perform_import_to_database(service, prefix, db_name, input_source, dry_run, + submitter_email, output_file): """ Imports all records from the input source into a local database. The input source may either be an existing file containing DOIs to parse, @@ -250,6 +371,10 @@ def perform_import_to_database(db_name, input_source, dry_run, submitter_email, Parameters ---------- + service : str + Name of the service provider to import DOI's from. + prefix : str + DOI prefix value to query for. db_name : str Name of the database file to import DOI records to. input_source : str @@ -271,60 +396,41 @@ def perform_import_to_database(db_name, input_source, dry_run, submitter_email, o_records_written = 0 # Number of records actually written to database o_records_dois_skipped = 0 # Number of records skipped due to missing lidvid or invalid prefix - # If use_doi_filtering_flag is set to True, we will allow only DOIs that - # start with the configured PDS DOI token, e.g. '10.17189'. - # Servers may contain records other than expected, especially the test - # server. For normal operation use_doi_filtering_flag should be set to False. - # If set to True, the parameter doi_prefix in config/conf.ini - # should be set appropriately. - use_doi_filtering_flag = False + if not service: + service = DOIServiceFactory.get_service_type() - # If flag skip_db_write_flag set to True, will skip writing of records to - # database. Use by developer to skip database write action. - # For normal operation, skip_db_write_flag should be set to False. 
-    skip_db_write_flag = False
+    if not service:
+        service = DOIServiceFactory.get_service_type()
 
-    if dry_run:
-        skip_db_write_flag = True
+    logger.info("Using source service provider %s", service)
 
-    o_db_name = db_name
+    if not prefix:
+        prefix = m_config.get(service.upper(), 'doi_prefix')
+
+    logger.info("Using DOI prefix %s", prefix)
 
     # If db_name is not provided, get one from config file:
-    if not o_db_name:
+    if not db_name:
         # This is the local database we'll be writing to
-        o_db_name = m_config.get('OTHER', 'db_file')
+        db_name = m_config.get('OTHER', 'db_file')
 
-    logger.info("Using local database %s", o_db_name)
+    logger.info("Using local database %s", db_name)
 
-    transaction_builder = TransactionBuilder(o_db_name)
+    transaction_builder = TransactionBuilder(db_name)
 
     # If the input is provided, parse from it. Otherwise query the server.
     if input_source:
-        dois = _parse_input(input_source)
-        o_server_url = input_source
+        dois = _read_from_path(service, input_source)
+        server_url = input_source
     else:
         # Get the dois from the server.
-        # Note that because the name of the server is in the config file,
-        # it can be the OPS or TEST server.
-        dois, o_server_url = get_dois_from_provider(output_file)
+        # Note that because the server name is obtained from the config file,
+        # it could be the OPS or TEST server.
+        dois, server_url = get_dois_from_provider(service, prefix, output_file)
 
     o_records_found = len(dois)
 
-    logger.info("Parsed %d DOI(s) from %s", o_records_found, o_server_url)
+    logger.info("Parsed %d DOI(s) from %s", o_records_found, server_url)
 
     # Write each Doi object as a row into the database.
     for item_index, doi in enumerate(dois):
-        if use_doi_filtering_flag:
-            service_provider = DOIServiceFactory.get_service_type().upper()
-            o_pds_doi_token = m_config.get(service_provider, 'doi_prefix')
-
-            if doi.doi and not doi.doi.startswith(o_pds_doi_token):
-                logger.warning("Skipping non-PDS DOI %s, index %d", doi.doi,
-                               item_index)
-
-                o_records_dois_skipped += 1
-                continue
-
         # If the field 'related_identifier' is None, we cannot proceed since
         # it serves as the primary key for our transaction database.
         if not doi.related_identifier:
@@ -349,13 +455,16 @@
 
         o_records_processed += 1
 
-        if not skip_db_write_flag:
+        if not dry_run:
             # Write a row into the database and save an output label for each
-            # DOI to the local transaction history
+            # DOI to the local transaction history. The format (OSTI vs. DataCite)
+            # of the output label is based on the service provider setting in
+            # the INI config.
             transaction = transaction_builder.prepare_transaction(
                 node_id,
                 submitter_email,
-                doi
+                doi,
+                output_content_type=CONTENT_TYPE_JSON
            )
 
             transaction.log()
@@ -375,6 +484,8 @@ def main():
     parser = create_cmd_parser()
     arguments = parser.parse_args()
 
+    logger.setLevel(logging.INFO)
+
     if arguments.debug:
         logger.setLevel(logging.DEBUG)
 
     (records_found,
      records_processed,
      records_written,
-     records_skipped) = perform_import_to_database(arguments.db_name,
-                                                   arguments.input,
+     records_skipped) = perform_import_to_database(arguments.service,
+                                                   arguments.prefix,
+                                                   arguments.db_name,
+                                                   arguments.input_file,
                                                    arguments.dry_run,
                                                    arguments.submitter_email,
                                                    arguments.output_file)
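
A minimal usage sketch of the reworked entry point, for illustration only.
The keyword names match the signature introduced by this patch; the prefix
value is the example from the script's own comments ('10.17189'), the
submitter email is the script's documented default, and the file names are
hypothetical:

    from pds_doi_service.core.util.initialize_production_deployment import (
        perform_import_to_database
    )

    # Dry-run a sync of all DOIs under the given prefix from DataCite into a
    # local database, saving the raw query results for later reuse with -i
    (records_found,
     records_processed,
     records_written,
     records_skipped) = perform_import_to_database(
        service='datacite',               # source provider (osti or datacite)
        prefix='10.17189',                # DOI prefix to query for
        db_name='doi.db',                 # local SQLite3 transaction database
        input_source=None,                # no input file, so query the server
        dry_run=True,                     # parse only, skip database writes
        submitter_email='pds-operator@jpl.nasa.gov',
        output_file='datacite_dois.json'  # save raw query results here
    )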