diff --git a/src/pds_doi_service/core/actions/release.py b/src/pds_doi_service/core/actions/release.py
index 69a7b732..2dae4c7d 100644
--- a/src/pds_doi_service/core/actions/release.py
+++ b/src/pds_doi_service/core/actions/release.py
@@ -14,7 +14,7 @@
 """
 from pds_doi_service.core.actions.action import DOICoreAction
-from pds_doi_service.core.entities.doi import DoiStatus
+from pds_doi_service.core.entities.doi import DoiEvent, DoiStatus
 from pds_doi_service.core.input.exceptions import (InputFormatException,
                                                    DuplicatedTitleDOIException,
                                                    UnexpectedDOIActionException,
@@ -137,6 +137,11 @@ def _complete_dois(self, dois):
             # Add 'status' field so the ranking in the workflow can be determined.
             doi.status = DoiStatus.Pending if self._no_review else DoiStatus.Review

+            if self._no_review:
+                # Add the event field to instruct DataCite to publish the DOI
+                # to the findable state (should have no effect for other providers)
+                doi.event = DoiEvent.Publish
+
         return dois

     def _validate_dois(self, dois):
diff --git a/src/pds_doi_service/core/actions/reserve.py b/src/pds_doi_service/core/actions/reserve.py
index dcefe1e9..3ef553bc 100644
--- a/src/pds_doi_service/core/actions/reserve.py
+++ b/src/pds_doi_service/core/actions/reserve.py
@@ -14,7 +14,7 @@
 """
 from pds_doi_service.core.actions.action import DOICoreAction
-from pds_doi_service.core.entities.doi import DoiStatus
+from pds_doi_service.core.entities.doi import DoiEvent, DoiStatus
 from pds_doi_service.core.input.exceptions import (CriticalDOIException,
                                                    DuplicatedTitleDOIException,
                                                    InputFormatException,
@@ -28,8 +28,8 @@
 from pds_doi_service.core.input.node_util import NodeUtil
 from pds_doi_service.core.outputs.doi_record import CONTENT_TYPE_JSON
 from pds_doi_service.core.outputs.doi_validator import DOIValidator
-from pds_doi_service.core.outputs.service import DOIServiceFactory
-from pds_doi_service.core.outputs.web_client import WEB_METHOD_POST
+from pds_doi_service.core.outputs.service import DOIServiceFactory, SERVICE_TYPE_DATACITE
+from pds_doi_service.core.outputs.web_client import WEB_METHOD_POST, WEB_METHOD_PUT
 from pds_doi_service.core.util.general_util import get_logger

 logger = get_logger(__name__)
@@ -136,6 +136,12 @@ def _complete_dois(self, dois):
             # Add 'status' field so the ranking in the workflow can be determined
             doi.status = DoiStatus.Reserved_not_submitted if self._dry_run else DoiStatus.Reserved

+            if not self._dry_run:
+                # Add the event field to instruct DataCite to make this entry
+                # hidden so it can be modified (should have no effect for other
+                # providers)
+                doi.event = DoiEvent.Hide
+
         return dois

     def _validate_dois(self, dois):
@@ -237,8 +243,24 @@ def run(self, **kwargs):
             # Note that for both OSTI and DataCite, reserve requests should
             # utilize the POST method
             if not self._dry_run:
+                service_type = DOIServiceFactory.get_service_type()
+
+                # If a DOI has already been assigned by DataCite,
+                # we need to use a PUT request on the URL associated with the DOI
+                if service_type == SERVICE_TYPE_DATACITE and doi.doi:
+                    method = WEB_METHOD_PUT
+                    url = '{url}/{doi}'.format(
+                        url=self._config.get('DATACITE', 'url'), doi=doi.doi
+                    )
+                # Otherwise, for both DataCite and OSTI, just a POST request
+                # on the default endpoint is sufficient
+                else:
+                    method = WEB_METHOD_POST
+                    url = self._config.get(service_type.upper(), 'url')
+
                 doi, o_doi_label = self._web_client.submit_content(
-                    method=WEB_METHOD_POST,
+                    method=method,
+                    url=url,
                     payload=io_doi_label,
                     content_type=CONTENT_TYPE_JSON
                 )
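A condensed sketch (not part of the patch) of the request routing the new reserve branch implements. The helper name is hypothetical, and the config object is assumed to behave like the service's INI config that the real code reads via self._config:

    from pds_doi_service.core.outputs.service import (DOIServiceFactory,
                                                      SERVICE_TYPE_DATACITE)
    from pds_doi_service.core.outputs.web_client import (WEB_METHOD_POST,
                                                         WEB_METHOD_PUT)

    def select_reserve_request(config, doi):
        """Pick the HTTP method and URL for a reserve submission."""
        service_type = DOIServiceFactory.get_service_type()

        if service_type == SERVICE_TYPE_DATACITE and doi.doi:
            # DataCite already assigned this DOI, so update it in place
            # with a PUT against its record URL
            return WEB_METHOD_PUT, '{url}/{doi}'.format(
                url=config.get('DATACITE', 'url'), doi=doi.doi)

        # New submission (or OSTI): POST to the provider's base endpoint
        return WEB_METHOD_POST, config.get(service_type.upper(), 'url')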
diff --git a/src/pds_doi_service/core/entities/doi.py b/src/pds_doi_service/core/entities/doi.py
index 896572de..6b450970 100644
--- a/src/pds_doi_service/core/entities/doi.py
+++ b/src/pds_doi_service/core/entities/doi.py
@@ -25,6 +25,7 @@ class ProductType(str, Enum):
     Bundle = 'Bundle'
     Text = 'Text'
     Dataset = 'Dataset'
+    Other = 'Other'

 @unique
@@ -74,6 +75,26 @@ class DoiStatus(str, Enum):
     Deactivated = 'deactivated'

+@unique
+class DoiEvent(str, Enum):
+    """
+    Enumerates the possible DOI events that can be requested in a submission
+    to DataCite.
+
+    Events consist of:
+        Publish -
+            Moves a DOI from draft or registered state to findable
+        Register -
+            Moves a DOI from draft to registered
+        Hide -
+            Moves a DOI from findable back to registered
+
+    """
+    Publish = 'publish'
+    Register = 'register'
+    Hide = 'hide'
+
+
 @dataclass
 class Doi:
     """The dataclass definition for a Doi object."""
@@ -82,6 +103,7 @@ class Doi:
     product_type: ProductType
     product_type_specific: str
     related_identifier: str
+    identifiers: list = field(default_factory=list)
     authors: list = None
     keywords: set = field(default_factory=set)
     editors: list = None
@@ -96,3 +118,4 @@ class Doi:
     message: str = None
     date_record_added: datetime = None
     date_record_updated: datetime = None
+    event: DoiEvent = None
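To see how the new event field connects to the template below, a minimal sketch of building a label with an explicit event. The field values are illustrative (borrowed from the existing tests), and the DOIDataCiteRecord import path is assumed from this repo's layout:

    from datetime import datetime

    from pds_doi_service.core.entities.doi import Doi, DoiEvent, ProductType
    from pds_doi_service.core.outputs.datacite.datacite_record import DOIDataCiteRecord

    doi = Doi(title='InSight Cameras Bundle',
              publication_date=datetime(2019, 1, 1),
              product_type=ProductType.Dataset,
              product_type_specific='PDS4 Refereed Data Bundle',
              related_identifier='urn:nasa:pds:insight_cameras::1.0')

    # The release action sets this when --no-review is used; the template
    # then renders "event": "publish" so DataCite makes the DOI findable
    doi.event = DoiEvent.Publish

    label = DOIDataCiteRecord().create_doi_record(doi)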
diff --git a/src/pds_doi_service/core/outputs/datacite/DOI_DataCite_template_20210520-jinja2.json b/src/pds_doi_service/core/outputs/datacite/DOI_DataCite_template_20210520-jinja2.json
index c5cd1bc7..427005b9 100644
--- a/src/pds_doi_service/core/outputs/datacite/DOI_DataCite_template_20210520-jinja2.json
+++ b/src/pds_doi_service/core/outputs/datacite/DOI_DataCite_template_20210520-jinja2.json
@@ -10,10 +10,8 @@
     {% endif %}
     "type": "dois",
     "attributes": {
-      {% if doi.status.value == "pending" %}
-      "event": "publish",
-      {% elif doi.status.value == "findable" %}
-      "event": "hide",
+      {% if doi.event %}
+      "event": "{{ doi.event.value }}",
       {% endif %}
       {% if doi.doi %}
       "doi": "{{ doi.doi }}",
@@ -24,6 +22,12 @@
       "suffix": "{{ doi.id }}",
       {% endif %}
       "identifiers": [
+        {% for identifier in doi.identifiers %}
+        {
+          "identifier": "{{ identifier.identifier.strip() }}",
+          "identifierType": "{{ identifier.identifierType }}"
+        },
+        {% endfor %}
         {
           {% if doi.doi %}
           "identifier": "{{ doi.doi }}",
@@ -36,10 +40,25 @@
       "creators": [
         {% for author in doi.authors %}
         {
+          {% if author.name_type %}
+          "nameType": "{{author.name_type}}",
+          {% else %}
           "nameType": "Personal",
-          "name": "{{ author['last_name'] }}, {{ author['first_name'] }}",
-          "givenName": "{{ author['first_name'] }}",
-          "familyName": "{{ author['last_name'] }}"
+          {% endif %}
+          {% if author.first_name and author.last_name %}
+          "name": "{{ author.first_name }} {{ author.last_name }}",
+          {% else %}
+          "name": "{{ author.name }}",
+          {% endif %}
+          "nameIdentifiers": [
+            {% for name_identifier in author.name_identifiers %}
+            {
+              {% for key, value in name_identifier.items() %}
+              "{{key}}": "{{value}}"{% if not loop.last %},{% endif +%}
+              {% endfor %}
+            }
+            {% endfor %}
+          ]
         }{% if not loop.last %},{% endif +%}
         {% endfor %}
       ],
@@ -60,9 +79,20 @@
       {% for editor in doi.editors %}
       {
         "nameType": "Personal",
-        "name": "{{ editor['last_name'] }}, {{ editor['first_name'] }}",
-        "givenName": "{{ editor['first_name'] }}",
-        "familyName": "{{ editor['last_name'] }}",
+        {% if editor.first_name and editor.last_name %}
+        "name": "{{ editor.first_name }} {{ editor.last_name }}",
+        {% else %}
+        "name": "{{ editor.name }}",
+        {% endif %}
+        "nameIdentifiers": [
+          {% for name_identifier in editor.name_identifiers %}
+          {
+            {% for key, value in name_identifier.items() %}
+            "{{key}}": "{{value}}"{% if not loop.last %},{% endif +%}
+            {% endfor %}
+          }
+          {% endfor %}
+        ],
         "contributorType": "Editor"
       },
       {% endfor %}
@@ -79,9 +109,8 @@
       "relatedIdentifiers": [
         {
           "relatedIdentifier": "{{ doi.related_identifier }}",
-          "relatedIdentifierType": "URN",
-          "relationType": "HasMetadata",
-          "resourceTypeGeneral": "Text"
+          "relatedIdentifierType": {% if doi.related_identifier.lower().startswith("urn") %}"URN"{% else %}"Handle"{% endif %},
+          "relationType": "IsIdenticalTo"
         }
       ],
       {% if doi.description %}
diff --git a/src/pds_doi_service/core/outputs/datacite/datacite_record.py b/src/pds_doi_service/core/outputs/datacite/datacite_record.py
index fe789524..84b72cdb 100644
--- a/src/pds_doi_service/core/outputs/datacite/datacite_record.py
+++ b/src/pds_doi_service/core/outputs/datacite/datacite_record.py
@@ -22,7 +22,7 @@
 from pds_doi_service.core.entities.doi import ProductType, Doi
 from pds_doi_service.core.outputs.doi_record import DOIRecord, CONTENT_TYPE_JSON
 from pds_doi_service.core.util.config_parser import DOIConfigUtil
-from pds_doi_service.core.util.general_util import get_logger
+from pds_doi_service.core.util.general_util import get_logger, sanitize_json_string

 logger = get_logger(__name__)

@@ -103,7 +103,7 @@ def create_doi_record(self, dois, content_type=CONTENT_TYPE_JSON):
                 doi_fields['product_type'] = ProductType.Collection

             # Sort keywords so we can output them in the same order each time
-            doi_fields['keywords'] = sorted(doi.keywords)
+            doi_fields['keywords'] = sorted(map(sanitize_json_string, doi.keywords))

             # Convert datetime objects to isoformat strings
             if doi.date_record_added:
@@ -112,9 +112,13 @@ def create_doi_record(self, dois, content_type=CONTENT_TYPE_JSON):
             if doi.date_record_updated:
                 doi_fields['date_record_updated'] = doi.date_record_updated.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

-            # Remove any extraneous whitespace from a provided description
+            # Clean up extra whitespace in the title and description that
+            # could break the JSON format
+            if doi.title:
+                doi_fields['title'] = sanitize_json_string(doi.title)
+
             if doi.description:
-                doi_fields['description'] = str.strip(doi.description)
+                doi_fields['description'] = sanitize_json_string(doi.description)

             # Publication year is a must-have
             doi_fields['publication_year'] = doi.publication_date.strftime('%Y')
diff --git a/src/pds_doi_service/core/outputs/datacite/datacite_web_client.py b/src/pds_doi_service/core/outputs/datacite/datacite_web_client.py
index 7d1431f4..a1c6f474 100644
--- a/src/pds_doi_service/core/outputs/datacite/datacite_web_client.py
+++ b/src/pds_doi_service/core/outputs/datacite/datacite_web_client.py
@@ -100,9 +100,9 @@ def query_doi(self, query, url=None, username=None, password=None,

         Notes
         -----
-        Queries are automatically filtered by this method to only include
-        DOI entries associated with the PDS client ID, which corresponds to the
-        username used with the query request.
+        Queries are NOT automatically filtered by this method. Callers should
+        be prepared to filter the returned results as needed if a query
+        returns more entries than expected.
Parameters ---------- @@ -151,15 +151,13 @@ def query_doi(self, query, url=None, username=None, password=None, query_string = str(query) url = url or config.get('DATACITE', 'url') - client_id = (username or config.get('DATACITE', 'user')).lower() logger.debug('query_string: %s', query_string) logger.debug('url: %s', url) - logger.debug('client_id: %s', client_id) datacite_response = requests.request( WEB_METHOD_GET, url=url, auth=auth, headers=headers, - params={"query": query_string, "client-id": client_id} + params={"query": query_string} ) try: diff --git a/src/pds_doi_service/core/outputs/datacite/datacite_web_parser.py b/src/pds_doi_service/core/outputs/datacite/datacite_web_parser.py index d262740a..89529ee5 100644 --- a/src/pds_doi_service/core/outputs/datacite/datacite_web_parser.py +++ b/src/pds_doi_service/core/outputs/datacite/datacite_web_parser.py @@ -37,14 +37,14 @@ class DOIDataCiteWebParser(DOIWebParser): This class only supports parsing records in JSON format. """ _optional_fields = [ - 'id', 'doi', 'description', 'keywords', 'authors', 'site_url', - 'editors', 'status', 'date_record_added', 'date_record_updated', - 'contributor', 'related_identifier' + 'id', 'doi', 'identifiers', 'description', 'keywords', 'authors', + 'site_url', 'editors', 'status', 'date_record_added', + 'date_record_updated', 'contributor' ] _mandatory_fields = [ 'title', 'publisher', 'publication_date', 'product_type', - 'product_type_specific' + 'product_type_specific', 'related_identifier' ] @staticmethod @@ -55,87 +55,114 @@ def _parse_id(record): else: # Parse the ID from the DOI field, it it's available return record.get('doi').split('/')[-1] - except (AttributeError, KeyError) as err: - logger.warning('Could not parse id from record, reason: %s %s', - err.__class__, err) + except (AttributeError, KeyError): + logger.warning('Could not parse optional field "id"') @staticmethod def _parse_doi(record): try: return record['doi'] - except KeyError as err: - logger.warning('Could not parse doi from record, reason: %s %s', - err.__class__, err) + except KeyError: + logger.warning('Could not parse optional field "doi"') + + @staticmethod + def _parse_identifiers(record): + try: + identifiers = filter( + lambda identifier: identifier["identifierType"] != "DOI", + record['identifiers'] + ) + return list(identifiers) + except KeyError: + logger.warning('Could not parse optional field "identifiers"') @staticmethod def _parse_description(record): try: return record['descriptions'][0]['description'] - except (IndexError, KeyError) as err: - logger.warning('Could not parse description from record, reason: %s %s', - err.__class__, err) + except (IndexError, KeyError): + logger.warning('Could not parse optional field "description"') @staticmethod def _parse_keywords(record): try: return set(sorted(subject['subject'] for subject in record['subjects'])) - except KeyError as err: - logger.warning('Could not parse keywords from record, reason: %s %s', - err.__class__, err) + except KeyError: + logger.warning('Could not parse optional field "keywords"') @staticmethod def _parse_authors(record): try: - return [{'first_name': creator['givenName'], - 'last_name': creator['familyName']} - for creator in record['creators']] - except KeyError as err: - logger.warning('Could not parse authors from record, reason: %s %s', - err.__class__, err) + authors = [] + + for creator in record['creators']: + if all(name_type in creator for name_type in ('givenName', 'familyName')): + name = f"{creator['givenName']} 
{creator['familyName']}" + else: + name = creator['name'] + + authors.append( + { + 'name': name, + 'name_type': creator['nameType'], + 'name_identifiers': creator.get('nameIdentifiers', []) + } + ) + + return authors + except KeyError: + logger.warning('Could not parse optional field "authors"') @staticmethod def _parse_site_url(record): try: return html.unescape(record['url']) - except (KeyError, TypeError) as err: - logger.warning('Could not parse site url from record, reason: %s %s', - err.__class__, err) + except (KeyError, TypeError): + logger.warning('Could not parse optional field "site_url"') @staticmethod def _parse_editors(record): try: - return [{'first_name': contributor['givenName'], - 'last_name': contributor['familyName']} - for contributor in record['contributors'] - if contributor['contributorType'] == 'Editor'] - except KeyError as err: - logger.warning('Could not parse editors from record, reason: %s %s', - err.__class__, err) + editors = [] + + for contributor in record['contributors']: + if contributor['contributorType'] == 'Editor': + if all(name_type in contributor for name_type in ('givenName', 'familyName')): + name = f"{contributor['givenName']} {contributor['familyName']}" + else: + name = contributor['name'] + + editors.append( + { + 'name': name, + 'name_identifiers': contributor.get('nameIdentifiers', []) + } + ) + return editors + except KeyError: + logger.warning('Could not parse optional field "editors"') @staticmethod def _parse_status(record): try: return DoiStatus(record['state']) - except (KeyError, ValueError) as err: - logger.warning('Could not parse status from record, reason: %s %s', - err.__class__, err) + except (KeyError, ValueError): + logger.warning('Could not parse optional field "status"') @staticmethod def _parse_date_record_added(record): try: return isoparse(record['created']) - except (KeyError, ValueError) as err: - logger.warning('Could not parse date added from record, reason: %s %s', - err.__class__, err) + except (KeyError, ValueError): + logger.warning('Could not parse optional field "date_record_added"') @staticmethod def _parse_date_record_updated(record): try: return isoparse(record['updated']) except (KeyError, ValueError) as err: - logger.warning('Could not parse date updated from record, reason: %s %s', - err.__class__, err) + logger.warning('Could not parse optional field "date_record_updated"') @staticmethod def _parse_contributor(record): @@ -152,9 +179,8 @@ def _parse_contributor(record): .strip()) return contributor - except (KeyError, StopIteration, ValueError) as err: - logger.warning('Could not parse a contributor from record, reason: %s %s', - err.__class__, err) + except (KeyError, StopIteration, ValueError): + logger.warning('Could not parse optional field "contributor"') @staticmethod def _parse_related_identifier(record): @@ -162,67 +188,66 @@ def _parse_related_identifier(record): try: identifier = record['relatedIdentifiers'][0]['relatedIdentifier'] - except (IndexError, KeyError) as err: - if 'url' in record: + except (IndexError, KeyError): + if 'identifiers' in record: + for identifier_record in record['identifiers']: + if identifier_record["identifier"].startswith('urn:'): + identifier = identifier_record["identifier"] + break + elif 'url' in record: logger.info('Parsing related identifier from URL') identifier = DOIWebParser._get_identifier_from_site_url(record['url']) - else: - logger.warning('Could not parse a related identifier from record, ' - 'reason: %s %s', err.__class__, err) - if identifier: - 
identifier = identifier.strip() + if identifier is None: + raise InputFormatException( + 'Failed to parse mandatory field "related_identifier"' + ) - return identifier + return identifier.strip() @staticmethod def _parse_title(record): try: return record['titles'][0]['title'] - except (IndexError, KeyError) as err: + except (IndexError, KeyError): raise InputFormatException( - f'Failed to parse title from provided record, reason: ' - f'{err.__class__} {err}' + 'Failed to parse mandatory field "title"' ) @staticmethod def _parse_publisher(record): try: return record['publisher'] - except KeyError as err: + except KeyError: raise InputFormatException( - f'Failed to parse publisher from provided record, reason: ' - f'{err.__class__} {err}' + 'Failed to parse mandatory field "publisher"' ) @staticmethod def _parse_publication_date(record): try: return datetime.strptime(str(record['publicationYear']), '%Y') - except (KeyError, ValueError) as err: + except (KeyError, ValueError): raise InputFormatException( - 'Failed to parse publication date from provided record, reason: ' - f'{err.__class__} {err}' + 'Failed to parse mandatory field "publication_date"' ) @staticmethod def _parse_product_type(record): try: return ProductType(record['types']['resourceTypeGeneral']) - except (KeyError, ValueError) as err: + except (KeyError, ValueError): raise InputFormatException( - 'Failed to parse product type from provided record, reason: ' - f'{err.__class__} {err}' + 'Failed to parse mandatory field "product_type"' ) @staticmethod def _parse_product_type_specific(record): try: return record['types']['resourceType'] - except KeyError as err: + except KeyError: raise InputFormatException( - 'Failed to parse product type specific from provided record, ' - f'reason: {err.__class__} {err}' + 'Failed to parse mandatory field "product_type_specific"' ) @staticmethod @@ -261,31 +286,39 @@ def parse_dois_from_label(label_text, content_type=CONTENT_TYPE_JSON): if not isinstance(datacite_records, list): datacite_records = [datacite_records] - for datacite_record in datacite_records: - doi_fields = {} + for index, datacite_record in enumerate(datacite_records): + try: + logger.info('Parsing record index %d', index) + doi_fields = {} + + # Everything we care about in a DataCite response is under + # attributes + datacite_record = datacite_record['attributes'] - # Everything we care about in a DataCite response is under - # attributes - datacite_record = datacite_record['attributes'] + for mandatory_field in DOIDataCiteWebParser._mandatory_fields: + doi_fields[mandatory_field] = getattr( + DOIDataCiteWebParser, f'_parse_{mandatory_field}')(datacite_record) + logger.debug('Parsed value %s for mandatory field %s', + doi_fields[mandatory_field], mandatory_field) - for mandatory_field in DOIDataCiteWebParser._mandatory_fields: - doi_fields[mandatory_field] = getattr( - DOIDataCiteWebParser, f'_parse_{mandatory_field}')(datacite_record) - logger.debug('Parsed value %s for mandatory field %s', - doi_fields[mandatory_field], mandatory_field) + for optional_field in DOIDataCiteWebParser._optional_fields: + parsed_value = getattr( + DOIDataCiteWebParser, f'_parse_{optional_field}')(datacite_record) - for optional_field in DOIDataCiteWebParser._optional_fields: - parsed_value = getattr( - DOIDataCiteWebParser, f'_parse_{optional_field}')(datacite_record) + if parsed_value is not None: + doi_fields[optional_field] = parsed_value + logger.debug('Parsed value %s for optional field %s', + parsed_value, optional_field) - if 
parsed_value is not None: - doi_fields[optional_field] = parsed_value - logger.debug('Parsed value %s for optional field %s', - parsed_value, optional_field) + doi = Doi(**doi_fields) - doi = Doi(**doi_fields) + dois.append(doi) + except InputFormatException as err: + logger.warning('Failed to parse a DOI object from record index %d ' + 'of the provided label, reason: %s', index, str(err)) + continue - dois.append(doi) + logger.info('Parsed %d DOI objects from %d records', len(dois), len(datacite_records)) return dois, errors diff --git a/src/pds_doi_service/core/outputs/doi_validator.py b/src/pds_doi_service/core/outputs/doi_validator.py index 1ae998d2..83ece862 100644 --- a/src/pds_doi_service/core/outputs/doi_validator.py +++ b/src/pds_doi_service/core/outputs/doi_validator.py @@ -43,12 +43,16 @@ class DOIValidator: # The workflow_order dictionary contains the progression of the status of a DOI: m_workflow_order = { + DoiStatus.Error: 0, + DoiStatus.Unknown: 0, DoiStatus.Reserved_not_submitted: 0, DoiStatus.Reserved: 1, DoiStatus.Draft: 2, DoiStatus.Review: 3, DoiStatus.Pending: 4, - DoiStatus.Registered: 5 + DoiStatus.Registered: 5, + DoiStatus.Findable: 5, + DoiStatus.Deactivated: 5 } def __init__(self, db_name=None): diff --git a/src/pds_doi_service/core/outputs/service.py b/src/pds_doi_service/core/outputs/service.py index 9748a68a..c7991859 100644 --- a/src/pds_doi_service/core/outputs/service.py +++ b/src/pds_doi_service/core/outputs/service.py @@ -77,15 +77,15 @@ class DOIServiceFactory: _config = DOIConfigUtil().get_config() @staticmethod - def get_service_type(): + def _check_service_type(service_type): """ - Returns the configured service type as defined within the INI config. + Checks if the provided service type is among the expected values. - Returns - ------- + Parameters + ---------- service_type : str - The service type to be used. This value is converted to lowercase - by this method before it is returned. + Service type to check. Is automatically converted to lowercase + to provide a case-insensitive check. Raises ------ @@ -94,24 +94,42 @@ def get_service_type(): class (or if no type is specified by the INI config at all). """ - service_type = DOIServiceFactory._config.get( - 'SERVICE', 'provider', fallback='unassigned' - ) - if service_type.lower() not in VALID_SERVICE_TYPES: raise ValueError( f'Unsupported service type "{service_type}" provided.\n' - f'Service type must be assigned to the SERVICE.provider field of ' + f'Service type should be assigned to the SERVICE.provider field of ' f'the INI config with one of the following values: {VALID_SERVICE_TYPES}' ) + @staticmethod + def get_service_type(): + """ + Returns the configured service type as defined within the INI config. + + Returns + ------- + service_type : str + The service type to be used. This value is converted to lowercase + by this method before it is returned. + + """ + service_type = DOIServiceFactory._config.get( + 'SERVICE', 'provider', fallback='unassigned' + ) + return service_type.lower() @staticmethod - def get_doi_record_service(): + def get_doi_record_service(service_type=None): """ Returns the appropriate DOIRecord subclass for the current service type. + Parameters + ---------- + service_type : str, optional + The service type to return a DOIRecord subclass for. Defaults to + the SERVICE.provider value of the INI config. + Returns ------- DOIRecord @@ -119,7 +137,10 @@ def get_doi_record_service(): DOI service type. 
""" - service_type = DOIServiceFactory.get_service_type() + if not service_type: + service_type = DOIServiceFactory.get_service_type() + + DOIServiceFactory._check_service_type(service_type) doi_record_class = DOIServiceFactory._DOI_RECORD_MAP[service_type] logger.debug('Returning instance of %s for service type %s', @@ -128,10 +149,16 @@ def get_doi_record_service(): return doi_record_class() @staticmethod - def get_validator_service(): + def get_validator_service(service_type=None): """ Returns the appropriate DOIValidator subclass for the current service type. + Parameters + ---------- + service_type : str, optional + The service type to return a DOIValidator subclass for. Defaults to + the SERVICE.provider value of the INI config. + Returns ------- DOIValidator @@ -139,7 +166,10 @@ def get_validator_service(): DOI service type. """ - service_type = DOIServiceFactory.get_service_type() + if not service_type: + service_type = DOIServiceFactory.get_service_type() + + DOIServiceFactory._check_service_type(service_type) doi_validator_class = DOIServiceFactory._SERVICE_VALIDATOR_MAP[service_type] logger.debug('Returning instance of %s for service type %s', @@ -148,10 +178,16 @@ def get_validator_service(): return doi_validator_class() @staticmethod - def get_web_client_service(): + def get_web_client_service(service_type=None): """ Returns the appropriate DOIWebClient subclass for the current service type. + Parameters + ---------- + service_type : str, optional + The service type to return a DOIWebClient subclass for. Defaults to + the SERVICE.provider value of the INI config. + Returns ------- DOIWebClient @@ -159,7 +195,10 @@ def get_web_client_service(): DOI service type. """ - service_type = DOIServiceFactory.get_service_type() + if not service_type: + service_type = DOIServiceFactory.get_service_type() + + DOIServiceFactory._check_service_type(service_type) web_client_class = DOIServiceFactory._WEB_CLIENT_MAP[service_type] logger.debug('Returning instance of %s for service type %s', @@ -168,10 +207,16 @@ def get_web_client_service(): return web_client_class() @staticmethod - def get_web_parser_service(): + def get_web_parser_service(service_type=None): """ Returns the appropriate DOIWebParser subclass for the current service type. + Parameters + ---------- + service_type : str, optional + The service type to return a DOIWebParser subclass for. Defaults to + the SERVICE.provider value of the INI config. + Returns ------- DOIWebParser @@ -179,7 +224,10 @@ def get_web_parser_service(): DOI service type. 
""" - service_type = DOIServiceFactory.get_service_type() + if not service_type: + service_type = DOIServiceFactory.get_service_type() + + DOIServiceFactory._check_service_type(service_type) web_parser_class = DOIServiceFactory._WEB_PARSER_MAP[service_type] logger.debug('Returning instance of %s for service type %s', diff --git a/src/pds_doi_service/core/outputs/test/datacite_test.py b/src/pds_doi_service/core/outputs/test/datacite_test.py index cdcd078a..c24d3c01 100644 --- a/src/pds_doi_service/core/outputs/test/datacite_test.py +++ b/src/pds_doi_service/core/outputs/test/datacite_test.py @@ -50,52 +50,6 @@ def test_create_datacite_label_json(self): self.assertDictEqual(input_doi_fields, output_doi_fields) - def test_assign_datacite_event(self): - """ - Test assignment of the event field when creating a label for a DOI - in the reserved or pending state - """ - # Create a dummy Doi object to be reserved - test_doi = Doi(title='InSight Cameras Bundle', - publication_date=datetime(2019, 1, 1, 0, 0), - product_type=ProductType.Dataset, - product_type_specific='PDS4 Refereed Data Bundle', - related_identifier='urn:nasa:pds:insight_cameras::1.0', - id='yzw2-vz66', - doi='10.13143/yzw2-vz66', - publisher='NASA Planetary Data System', - contributor='Engineering', - status=DoiStatus.Reserved) - - # Create the label to submit to DataCite, using the Pending (release) - # state, which should map to the "publish" event - test_doi.status = DoiStatus.Pending - - release_label = DOIDataCiteRecord().create_doi_record(test_doi) - release_label_dict = json.loads(release_label) - - self.assertIn('event', release_label_dict['data']['attributes']) - self.assertEqual(release_label_dict['data']['attributes']['event'], 'publish') - - # If updating a record that has already been published (findable), - # we need to move it back to the registered stage via the "hide" event - test_doi.status = DoiStatus.Findable - - back_to_reserve_label = DOIDataCiteRecord().create_doi_record(test_doi) - - back_to_reserve_label_dict = json.loads(back_to_reserve_label) - - self.assertIn('event', back_to_reserve_label_dict['data']['attributes']) - self.assertEqual(back_to_reserve_label_dict['data']['attributes']['event'], 'hide') - - # For any other state, we should not get an event field in the label - test_doi.status = DoiStatus.Reserved_not_submitted - - reserve_label = DOIDataCiteRecord().create_doi_record(test_doi) - reserve_label_dict = json.loads(reserve_label) - - self.assertNotIn('event', reserve_label_dict['data']['attributes']) - def requests_valid_request_patch(method, url, **kwargs): response = Response() @@ -175,13 +129,13 @@ def setUpClass(cls): join(cls.test_dir, os.pardir, os.pardir, os.pardir, os.pardir, os.pardir, 'input') ) - cls.expected_authors = [{'first_name': 'R.', 'last_name': 'Deen'}, - {'first_name': 'H.', 'last_name': 'Abarca'}, - {'first_name': 'P.', 'last_name': 'Zamani'}, - {'first_name': 'J.', 'last_name': 'Maki'}] - cls.expected_editors = [{'first_name': 'P. H.', 'last_name': 'Smith'}, - {'first_name': 'M.', 'last_name': 'Lemmon'}, - {'first_name': 'R. F.', 'last_name': 'Beebe'}] + cls.expected_authors = [{'name': 'R. Deen', 'name_identifiers': [], 'name_type': 'Personal'}, + {'name': 'H. Abarca', 'name_identifiers': [], 'name_type': 'Personal'}, + {'name': 'P. Zamani', 'name_identifiers': [], 'name_type': 'Personal'}, + {'name': 'J. Maki', 'name_identifiers': [], 'name_type': 'Personal'}] + cls.expected_editors = [{'name': 'P. H. Smith', 'name_identifiers': []}, + {'name': 'M. 
Lemmon', 'name_identifiers': []},
+                               {'name': 'R. F. Beebe', 'name_identifiers': []}]
         cls.expected_keywords = {'data', 'rdr', 'product', 'experiment', 'lander',
                                  'context', 'PDS', 'raw', 'mars', 'record', 'reduced',
                                  'science', 'edr', 'PDS4', 'camera', 'deployment',
diff --git a/src/pds_doi_service/core/outputs/web_parser.py b/src/pds_doi_service/core/outputs/web_parser.py
index da3a30c4..170359b0 100644
--- a/src/pds_doi_service/core/outputs/web_parser.py
+++ b/src/pds_doi_service/core/outputs/web_parser.py
@@ -40,16 +40,20 @@ def _get_identifier_from_site_url(site_url):
         """
         # TODO: rewrite to utilize urlparse and support PDS3 labels
+        lid_vid_value = None
+
         site_tokens = site_url.split("identifier=")

         identifier_tokens = site_tokens[1].split(";")

         lid_vid_tokens = identifier_tokens[0].split("&version=")
-        lid_value = lid_vid_tokens[0].replace("%3A", ":")
-        vid_value = lid_vid_tokens[1]

-        # Finally combine the lid and vid together.
-        lid_vid_value = lid_value + '::' + vid_value
+        if len(lid_vid_tokens) >= 2:
+            lid_value = lid_vid_tokens[0].replace("%3A", ":")
+            vid_value = lid_vid_tokens[1]
+
+            # Finally combine the lid and vid together.
+            lid_vid_value = lid_value + '::' + vid_value

         return lid_vid_value
diff --git a/src/pds_doi_service/core/util/general_util.py b/src/pds_doi_service/core/util/general_util.py
index b6f9c602..04ae3040 100644
--- a/src/pds_doi_service/core/util/general_util.py
+++ b/src/pds_doi_service/core/util/general_util.py
@@ -13,11 +13,34 @@
 General utility functions for things like logging.
 """

+import re
 import logging

 from pds_doi_service.core.util.config_parser import DOIConfigUtil

+
+def sanitize_json_string(string):
+    """
+    Cleans up extraneous whitespace from the provided string so it may be
+    written to a JSON file. Extraneous whitespace includes any leading or
+    trailing whitespace, as well as extra whitespace between words.
+
+    Parameters
+    ----------
+    string : str
+        The string to sanitize.
+
+    Returns
+    -------
+    string : str
+        The provided string, sanitized of extraneous whitespace.
+
+    """
+    # Clean up whitespace (including line breaks) both between words and
+    # at the ends of the string
+    return re.sub(r"\s+", " ", string, flags=re.UNICODE).strip()
+
+
 def get_logger(module_name=''):
     # If the user specifies the module name, we can use it.
     if module_name:
@@ -36,4 +59,3 @@

     return logger
-
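A quick, hypothetical illustration of what sanitize_json_string does with a value containing line breaks and padding (the input string is made up):

    from pds_doi_service.core.util.general_util import sanitize_json_string

    raw = '  InSight Cameras\n   Bundle  '
    print(sanitize_json_string(raw))  # -> 'InSight Cameras Bundle'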
diff --git a/src/pds_doi_service/core/util/initialize_production_deployment.py b/src/pds_doi_service/core/util/initialize_production_deployment.py
index a5c37b34..27e4ae5f 100644
--- a/src/pds_doi_service/core/util/initialize_production_deployment.py
+++ b/src/pds_doi_service/core/util/initialize_production_deployment.py
@@ -11,18 +11,24 @@
 initialize_production_deployment.py
 ===================================

-Script used to import the available DOIs from the server provider into a local
+Script used to import the available DOIs from a service provider into the local
 production database.
 """

 # Parameters to this script:
 #
+# The -S (optional) is the name of the DOI service provider to pull existing
+#       DOI records from. When used with the -i option, it should correspond
+#       to the format of the provided input file. Should be set to either osti
+#       or datacite.
+# The -p (optional) may be used to specify a DOI prefix to query for. By
+#       default the prefix is obtained from the INI config.
 # The -s (required) is email of the PDS operator: -s pds-operator@jpl.nasa.gov
 # The -i is optional. If the input is provided and is a file, parse from it
 #       The format of input file is the same format of text returned from
 #       querying the server via a browser or curl command.
 #       If provided,, this will override the url in the config file.
-# The -d is optional. If provided it is the name of the database file to
+# The -d is optional. If provided it is the name of the database file to
 #       write records to: -d doi.db
 #       If provided, this will override the db_name in the config file.
 # The --dry-run parameter allows the code to parse the input or querying the
@@ -56,13 +62,16 @@
 # with the NASA-PDS account.

 import argparse
+import json
 import logging
 import os
 from datetime import datetime

 from pds_doi_service.core.input.exceptions import (InputFormatException,
                                                    CriticalDOIException)
-from pds_doi_service.core.outputs.service import DOIServiceFactory
+from pds_doi_service.core.outputs.service import (DOIServiceFactory,
+                                                  SERVICE_TYPE_DATACITE,
+                                                  VALID_SERVICE_TYPES)
 from pds_doi_service.core.outputs.osti.osti_web_parser import DOIOstiXmlWebParser
 from pds_doi_service.core.outputs.doi_record import CONTENT_TYPE_JSON
 from pds_doi_service.core.outputs.transaction_builder import TransactionBuilder
@@ -80,13 +89,26 @@ def create_cmd_parser():
     parser = argparse.ArgumentParser(
         description='Script to bulk import existing DOIs into the local '
-                    'transaction database.'
+                    'transaction database.',
+        epilog='Note: When DOI records are imported to the local transaction '
+               'database, the DOI service creates an associated output label '
+               'for each record under the transaction_history directory. The '
+               'format of this output label is driven by the SERVICE.provider '
+               'field of the INI. Please ensure the field is set appropriately '
+               'before using this script, as a mismatch could cause parsing '
+               'errors when using the DOI service after this script.'
     )
-    parser.add_argument("-i", "--input", required=False,
-                        help="Input file (XML or JSON) to import existing DOIs from. "
-                             "If no value is provided, the server URL "
-                             "specified by the DOI service configuration INI "
-                             "file is used by default.")
+    parser.add_argument("-S", "--service", required=False, default=None,
+                        help="Name of the service provider to pull existing DOI "
+                             "records from. If not provided, the provider configured "
+                             "by the DOI service configuration INI is used by "
+                             "default. Should be one of: [{}]"
+                             .format(", ".join(VALID_SERVICE_TYPES)))
+    parser.add_argument("-p", "--prefix", required=False, default=None,
+                        help="Specify the DOI prefix value to query the service "
+                             "provider for. If not provided, the prefix value "
+                             "configured for the provider in the INI config is "
+                             "used by default.")
     parser.add_argument("-s", "--submitter-email", required=False,
                         default='pds-operator@jpl.nasa.gov',
                         help="The email address of the user performing the "
@@ -96,6 +118,11 @@
                         help="Name of the SQLite3 database file name to commit "
                              "DOI records to. If not provided, the file name is "
                              "obtained from the DOI service INI config.")
+    parser.add_argument("-i", "--input-file", required=False,
+                        help="Input file (XML or JSON) to import existing DOIs from. "
+                             "If no value is provided, the server URL "
+                             "specified by the DOI service configuration INI "
+                             "file is used by default.")
     parser.add_argument("-o", "--output-file", required=False, default=None,
                         help="Path to write out the DOI JSON labels as returned "
                              "from the query. When created, this file can be used "
@@ -112,7 +139,23 @@
 def _read_from_local_xml(path):
-    """Read from a local xml file containing output from a query."""
+    """
+    Read from a local xml file containing output from a query.
+
+    Note that since the PDS DOI service only supports XML labels from OSTI,
+    the OSTI parser is always used by this function.
+
+    Parameters
+    ----------
+    path : str
+        Path of the XML file to read and parse.
+
+    Returns
+    -------
+    dois : list of Doi
+        The DOI objects parsed from the XML label.
+
+    """
     try:
         with open(path, mode='r') as f:
             doi_xml = f.read()
@@ -124,73 +167,139 @@
     return dois

-def _read_from_local_json(path):
-    """Read from a local JSON file containing output from a query."""
+def _read_from_local_json(service, path):
+    """
+    Read from a local JSON file containing output from a query.
+
+    The appropriate JSON parser (OSTI or DataCite) is determined based on
+    the provided service type.
+
+    Parameters
+    ----------
+    service : str
+        The name of the service provider corresponding to the JSON format
+        to read and parse.
+    path : str
+        Path to the JSON file to read and parse.
+
+    Returns
+    -------
+    dois : list of Doi
+        The DOI objects parsed from the JSON label.
+
+    """
     try:
         with open(path, mode='r') as f:
             doi_json = f.read()
     except Exception as e:
         raise CriticalDOIException(str(e))

-    web_parser = DOIServiceFactory.get_web_parser_service()
+    web_parser = DOIServiceFactory.get_web_parser_service(service)

-    dois, _ = web_parser.parse_dois_from_label(
-        doi_json, content_type=CONTENT_TYPE_JSON
-    )
+    try:
+        dois, _ = web_parser.parse_dois_from_label(
+            doi_json, content_type=CONTENT_TYPE_JSON
+        )
+    except Exception:
+        raise InputFormatException(
+            f"Unable to parse input file {path} using parser {web_parser.__name__}\n"
+            f"Please ensure the --service flag is set correctly to specify the "
+            f"correct parser type for the format."
+        )

     return dois

-def _read_from_path(path):
+def _read_from_path(service, path):
+    """
+    Reads the label at the provided path, using the appropriate parser for the
+    provided service type.
+
+    Parameters
+    ----------
+    service : str
+        The name of the service provider corresponding to the format
+        to read and parse. Only used for JSON labels.
+    path : str
+        Path to the label to read and parse. The label format (XML or JSON) is
+        derived from the path's file extension.
+
+    Returns
+    -------
+    dois : list of Doi
+        The DOI objects parsed from the label.
+
+    Raises
+    ------
+    InputFormatException
+        If the file path does not exist or does not correspond to an XML or
+        JSON file.
+
+    """
+    if not os.path.exists(path):
+        raise InputFormatException(f"Error reading file {path}. "
+                                   "File may not exist.")
+
     if path.endswith('.xml'):
         return _read_from_local_xml(path)
     elif path.endswith('.json'):
-        return _read_from_local_json(path)
+        return _read_from_local_json(service, path)

     raise InputFormatException(f'File {path} is not supported. '
                                f'Only .xml and .json are supported.')

-def _parse_input(input_file):
-    if os.path.exists(input_file):
-        return _read_from_path(input_file)
-
-    raise InputFormatException(f"Error reading file {input_file}. "
-                               f"File may not exist.")
-
-
-def get_dois_from_provider(output_file):
+def get_dois_from_provider(service, prefix, output_file=None):
     """
     Queries the service provider for all the current DOI associated with the
-    PDS-USER account. The server name is fetched from the config file with the
-    'url' field in the service provider grouping.
+    provided prefix.

-    """
-    query_dict = {}
+    Parameters
+    ----------
+    service : str
+        Name of the service provider to pull DOIs from.
+    prefix : str
+        DOI prefix to query for.
+    output_file : str, optional
+        If provided, path to an output file to write the results of the DOI
+        query to.
+
+    Returns
+    -------
+    dois : list of Doi
+        The DOI objects obtained from the service provider.
+    server_url : str
+        The URL of the service provider endpoint. Helpful for logging purposes.

-    service_provider = DOIServiceFactory.get_service_type().upper()
+    """
+    if service == SERVICE_TYPE_DATACITE:
+        query_dict = {'doi': f'{prefix}/*'}
+    else:
+        query_dict = {'doi': prefix}

-    o_server_url = m_config.get(service_provider, 'url')
+    server_url = m_config.get(service.upper(), 'url')

-    logger.info("Using %s server URL %s", service_provider, o_server_url)
+    logger.info("Using %s server URL %s", service, server_url)

-    web_client = DOIServiceFactory.get_web_client_service()
+    web_client = DOIServiceFactory.get_web_client_service(service)

     doi_json = web_client.query_doi(
         query=query_dict, content_type=CONTENT_TYPE_JSON
     )

     if output_file:
+        logger.info("Writing query results to %s", output_file)
+
         with open(output_file, 'w') as outfile:
-            outfile.write(doi_json)
+            json.dump(json.loads(doi_json), outfile, indent=4)

-    web_parser = DOIServiceFactory.get_web_parser_service()
+    web_parser = DOIServiceFactory.get_web_parser_service(service)

     dois, _ = web_parser.parse_dois_from_label(
         doi_json, content_type=CONTENT_TYPE_JSON
     )

-    return dois, o_server_url
+    return dois, server_url
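Since query_doi() no longer pins results to the client ID (see the web client change above), a caller that wants only PDS-prefixed records can filter the parsed results itself. A sketch, with '10.17189' standing in for whatever prefix applies:

    dois, server_url = get_dois_from_provider(SERVICE_TYPE_DATACITE, '10.17189')
    pds_dois = [doi for doi in dois if doi.doi and doi.doi.startswith('10.17189')]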
+ """ - o_node_id = 'eng' + node_id = 'eng' if doi_fields.get('contributor'): full_name_orig = doi_fields['contributor'] full_name = full_name_orig.lower() if 'atmospheres' in full_name: - o_node_id = 'atm' + node_id = 'atm' elif 'engineering' in full_name: - o_node_id = 'eng' + node_id = 'eng' elif 'geosciences' in full_name: - o_node_id = 'geo' + node_id = 'geo' elif 'imaging' in full_name: - o_node_id = 'img' + node_id = 'img' elif 'cartography' in full_name: - o_node_id = 'img' + node_id = 'img' # Some uses title: Navigation and Ancillary Information Facility Node # Some uses title: Navigational and Ancillary Information Facility # So check for both elif 'navigation' in full_name and 'ancillary' in full_name: - o_node_id = 'naif' + node_id = 'naif' elif 'navigational' in full_name and 'ancillary' in full_name: - o_node_id = 'naif' + node_id = 'naif' elif 'plasma' in full_name: - o_node_id = 'ppi' + node_id = 'ppi' elif 'ring' in full_name and 'moon' in full_name: - o_node_id = 'rms' + node_id = 'rms' elif 'small' in full_name or 'bodies' in full_name: - o_node_id = 'sbn' + node_id = 'sbn' logger.debug("Derived node ID %s from Contributor field %s", - o_node_id, full_name_orig) + node_id, full_name_orig) else: logger.warning("No Contributor field available for DOI %s, " - "defaulting to node ID %s", doi_fields['doi'], o_node_id) + "defaulting to node ID %s", doi_fields['doi'], node_id) - return o_node_id + return node_id -def perform_import_to_database(db_name, input_source, dry_run, submitter_email, - output_file): +def perform_import_to_database(service, prefix, db_name, input_source, dry_run, + submitter_email, output_file): """ Imports all records from the input source into a local database. The input source may either be an existing file containing DOIs to parse, @@ -250,6 +371,10 @@ def perform_import_to_database(db_name, input_source, dry_run, submitter_email, Parameters ---------- + service : str + Name of the service provider to import DOI's from. + prefix : str + DOI prefix value to query for. db_name : str Name of the database file to import DOI records to. input_source : str @@ -271,60 +396,41 @@ def perform_import_to_database(db_name, input_source, dry_run, submitter_email, o_records_written = 0 # Number of records actually written to database o_records_dois_skipped = 0 # Number of records skipped due to missing lidvid or invalid prefix - # If use_doi_filtering_flag is set to True, we will allow only DOIs that - # start with the configured PDS DOI token, e.g. '10.17189'. - # Servers may contain records other than expected, especially the test - # server. For normal operation use_doi_filtering_flag should be set to False. - # If set to True, the parameter doi_prefix in config/conf.ini - # should be set appropriately. - use_doi_filtering_flag = False + if not service: + service = DOIServiceFactory.get_service_type() - # If flag skip_db_write_flag set to True, will skip writing of records to - # database. Use by developer to skip database write action. - # For normal operation, skip_db_write_flag should be set to False. 
-    skip_db_write_flag = False
+    if not service:
+        service = DOIServiceFactory.get_service_type()

-    if dry_run:
-        skip_db_write_flag = True
+    logger.info("Using source service provider %s", service)

-    o_db_name = db_name
+    if not prefix:
+        prefix = m_config.get(service.upper(), 'doi_prefix')
+
+    logger.info("Using DOI prefix %s", prefix)

     # If db_name is not provided, get one from config file:
-    if not o_db_name:
+    if not db_name:
         # This is the local database we'll be writing to
-        o_db_name = m_config.get('OTHER', 'db_file')
+        db_name = m_config.get('OTHER', 'db_file')

-    logger.info("Using local database %s", o_db_name)
+    logger.info("Using local database %s", db_name)

-    transaction_builder = TransactionBuilder(o_db_name)
+    transaction_builder = TransactionBuilder(db_name)

     # If the input is provided, parse from it. Otherwise query the server.
     if input_source:
-        dois = _parse_input(input_source)
-        o_server_url = input_source
+        dois = _read_from_path(service, input_source)
+        server_url = input_source
     else:
         # Get the dois from the server.
-        # Note that because the name of the server is in the config file,
-        # it can be the OPS or TEST server.
-        dois, o_server_url = get_dois_from_provider(output_file)
+        # Note that because the name of the server is obtained from the config
+        # file, it could be the OPS or TEST server.
+        dois, server_url = get_dois_from_provider(service, prefix, output_file)

     o_records_found = len(dois)

-    logger.info("Parsed %d DOI(s) from %s", o_records_found, o_server_url)
+    logger.info("Parsed %d DOI(s) from %s", o_records_found, server_url)

     # Write each Doi object as a row into the database.
     for item_index, doi in enumerate(dois):
-        if use_doi_filtering_flag:
-            service_provider = DOIServiceFactory.get_service_type().upper()
-            o_pds_doi_token = m_config.get(service_provider, 'doi_prefix')
-
-            if doi.doi and not doi.doi.startswith(o_pds_doi_token):
-                logger.warning("Skipping non-PDS DOI %s, index %d", doi.doi,
-                               item_index)
-
-                o_records_dois_skipped += 1
-                continue
-
         # If the field 'related_identifier' is None, we cannot proceed since
         # it serves as the primary key for our transaction database.
         if not doi.related_identifier:
@@ -349,13 +455,16 @@

         o_records_processed += 1

-        if not skip_db_write_flag:
+        if not dry_run:
             # Write a row into the database and save an output label for each
-            # DOI to the local transaction history
+            # DOI to the local transaction history. The format (OSTI vs. DataCite)
+            # of the output label is based on the service provider setting in
+            # the INI config.
             transaction = transaction_builder.prepare_transaction(
                 node_id,
                 submitter_email,
-                doi
+                doi,
+                output_content_type=CONTENT_TYPE_JSON
             )

             transaction.log()
@@ -375,6 +484,8 @@ def main():
     parser = create_cmd_parser()
     arguments = parser.parse_args()

+    logger.setLevel(logging.INFO)
+
     if arguments.debug:
         logger.setLevel(logging.DEBUG)
@@ -385,8 +496,10 @@
     (records_found,
      records_processed,
      records_written,
-     records_skipped) = perform_import_to_database(arguments.db_name,
-                                                   arguments.input,
+     records_skipped) = perform_import_to_database(arguments.service,
+                                                   arguments.prefix,
+                                                   arguments.db_name,
+                                                   arguments.input_file,
                                                    arguments.dry_run,
                                                    arguments.submitter_email,
                                                    arguments.output_file)
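A small usage sketch of the new optional service_type argument on the factory methods (illustrative only; the INI-driven default behavior is unchanged):

    from pds_doi_service.core.outputs.service import (DOIServiceFactory,
                                                      SERVICE_TYPE_DATACITE)

    # Default: the provider comes from the SERVICE.provider INI field
    web_parser = DOIServiceFactory.get_web_parser_service()

    # Override: request the DataCite parser explicitly, as
    # initialize_production_deployment.py now does via its --service flag
    datacite_parser = DOIServiceFactory.get_web_parser_service(SERVICE_TYPE_DATACITE)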