From 9c6bcf4c8542182866034bd9cc5c21e386aa6d46 Mon Sep 17 00:00:00 2001 From: Qui T Chau Date: Tue, 29 Sep 2020 10:28:39 -0700 Subject: [PATCH] Validate site_url availability Changes to support issue #70: Add catching of warning exceptions from validate_osti_submission() when site_url is not online, comment out function to write output file from draft_test.py since it is not needed, fix bug with draft_test.py to use force=True instead of force_flag=True, fix bug with release_test.py by removing existing temporary database at initial run, fix bug with reserve_test.py to use force=True instead of force_flag=True, add SiteURNotExistException to exceptions.py, fix bug with fromisoformat() since Python version on linux is 3.68 and does not support it, add error handling of publication_date, add verification of site_url being online or not in doi_validator.py. Changes responding to first set of comments for pull request of issue #70: Add handling when a connection can be made to the server but the page does not exist, add _check_field_site_url() check for all actions and not just 'release'. change warning message --- pds_doi_core/actions/release.py | 51 ++++++++++++++++++++--- pds_doi_core/actions/test/draft_test.py | 28 ++++++------- pds_doi_core/actions/test/release_test.py | 10 +++-- pds_doi_core/actions/test/reserve_test.py | 10 ++--- pds_doi_core/input/exceptions.py | 3 ++ pds_doi_core/input/input_util.py | 15 ++++++- pds_doi_core/util/doi_validator.py | 32 +++++++++++++- 7 files changed, 119 insertions(+), 30 deletions(-) diff --git a/pds_doi_core/actions/release.py b/pds_doi_core/actions/release.py index 673cb29a..da6e848e 100644 --- a/pds_doi_core/actions/release.py +++ b/pds_doi_core/actions/release.py @@ -9,7 +9,7 @@ from pds_doi_core.actions.action import DOICoreAction, logger from pds_doi_core.input.exceptions import UnknownNodeException, InputFormatException, DuplicatedTitleDOIException, \ - UnexpectedDOIActionException, TitleDoesNotMatchProductTypeException, IllegalDOIActionException, WarningDOIException, \ + UnexpectedDOIActionException, TitleDoesNotMatchProductTypeException, SiteURNotExistException, IllegalDOIActionException, WarningDOIException, \ CriticalDOIException from pds_doi_core.input.osti_input_validator import OSTIInputValidator from pds_doi_core.outputs.osti import DOIOutputOsti @@ -70,6 +70,30 @@ def _parse_input(self, input): return o_doi_label + def _collect_exception_classes_and_messages(self, single_exception, io_exception_classes, io_exception_messages): + # Given a single exception, collect the exception class name and message. + # The variables io_exception_classes and io_exception_messages are both input and output. + + actual_class_name = type(single_exception).__name__ + logger.debug(f"actual_class_name,type(actual_class_name) {actual_class_name},{type(actual_class_name)}") + + io_exception_classes.append (actual_class_name) # SiteURNotExistException + io_exception_messages.append(str(single_exception)) # site_url http://mysite.example.com/link/to/my-dataset-id-25901.html not exist + + return io_exception_classes, io_exception_messages + + def _raise_warn_exceptions(self, exception_classes, exception_messages): + # Raise a WarningDOIException with all the class names and messages. + + message_to_raise = '' + for ii in range(len(exception_classes)): + if ii == 0: + message_to_raise = message_to_raise + exception_classes[ii] + ':' + exception_messages[ii] + else: + # Add a comma after every message. + message_to_raise = message_to_raise + ', ' + exception_classes[ii] + ':' + exception_messages[ii] + raise WarningDOIException(message_to_raise) + def _validate_doi(self, doi_label): """ Before submitting the user input, it has to be validated against the database so a Doi object need to be built. @@ -77,6 +101,8 @@ def _validate_doi(self, doi_label): :param doi_label: :return: """ + exception_classes = [] + exception_messages = [] try: dois = DOIOstiWebParser().response_get_parse_osti_xml(doi_label) @@ -87,15 +113,28 @@ def _validate_doi(self, doi_label): self._doi_validator.validate_against_xsd(single_doi_label) # Validate the label to ensure that no rules are violated. - self._doi_validator.validate_osti_submission(doi) - + # Put validate_osti_submission() within a try/except clause to catch all the exceptions instead of + # exiting after the first exception. This allow the user to see all the things that are wrong with + # all the DOIs instead of just the first one. + try: + self._doi_validator.validate_osti_submission(doi) + except (DuplicatedTitleDOIException, UnexpectedDOIActionException, + TitleDoesNotMatchProductTypeException, SiteURNotExistException, WarningDOIException) as e: + (exception_classes, exception_messages) = self._collect_exception_classes_and_messages(e, + exception_classes, exception_messages) except Exception as e: raise # Re-raise all exceptions. + # If there is at least one exception caught, raise a WarningDOIException with all the messages. + if len(exception_classes) > 0: + self._raise_warn_exceptions(exception_classes,exception_messages) + + return 1 + def run(self, **kwargs): """ Function performs a release of a DOI that has been previously reserved. The input is a - XML text file contains previously return output from a 'reserved' action. The only + XML text file contains previously return output from a 'reserve' or 'draft' action. The only required field is 'id'. Any other fields included will be considered a replace action. """ self.parse_arguments(kwargs) @@ -117,9 +156,9 @@ def run(self, **kwargs): # warnings except (DuplicatedTitleDOIException, UnexpectedDOIActionException, - TitleDoesNotMatchProductTypeException) as e: + TitleDoesNotMatchProductTypeException, SiteURNotExistException, WarningDOIException) as e: if not self._force: - # If the user did not use force_flag, re-raise the exception. + # If the user did not use --force parameter, re-raise the exception. raise WarningDOIException(str(e)) # errors except (IllegalDOIActionException, UnknownNodeException, InputFormatException) as e: diff --git a/pds_doi_core/actions/test/draft_test.py b/pds_doi_core/actions/test/draft_test.py index f77a312c..ff0d68e0 100644 --- a/pds_doi_core/actions/test/draft_test.py +++ b/pds_doi_core/actions/test/draft_test.py @@ -18,7 +18,7 @@ def create_temporary_output_file(doi_label,filename): class MyTestCase(unittest.TestCase): db_name = 'doi_temp.db' - # Because validation has been added to each action, the force_flag=True is required as the command line is not parsed for unit test. + # Because validation has been added to each action, the force=True is required as the command line is not parsed for unit test. @classmethod def setUp(self): @@ -43,21 +43,21 @@ def test_local_dir_one_file(self): logger.info("test local dir with one file") osti_doi = self._action.run(input='input/draft_dir_one_file', node='img', - submitter='my_user@my_node.gov',force_flag=True) + submitter='my_user@my_node.gov',force=True) logger.info(osti_doi) def test_local_dir_two_files(self): logger.info("test local dir with two files") osti_doi = self._action.run(input='input/draft_dir_two_files', node='img', - submitter='my_user@my_node.gov',force_flag=True) + submitter='my_user@my_node.gov',force=True) logger.info(osti_doi) def test_local_bundle(self): logger.info("test local bundle") osti_doi = self._action.run(input='input/bundle_in_with_contributors.xml', node='img', - submitter='my_user@my_node.gov',force_flag=True) + submitter='my_user@my_node.gov',force=True) logger.info(osti_doi) def test_remote_bundle(self): @@ -65,41 +65,41 @@ def test_remote_bundle(self): osti_doi = self._action.run( input='https://pds-imaging.jpl.nasa.gov/data/nsyt/insight_cameras/bundle.xml', node='img', - submitter='my_user@my_node.gov',force_flag=True) + submitter='my_user@my_node.gov',force=True) logger.info(osti_doi) - create_temporary_output_file(osti_doi, os.path.join(self._temporary_output_dir,'valid_bundle_doi.xml')) + #create_temporary_output_file(osti_doi, os.path.join(self._temporary_output_dir,'valid_bundle_doi.xml')) def test_remote_collection(self): logger.info("test remote collection") osti_doi = self._action.run( input='https://pds-imaging.jpl.nasa.gov/data/nsyt/insight_cameras/data/collection_data.xml', - node='img', submitter='my_user@my_node.gov',force_flag=True) + node='img', submitter='my_user@my_node.gov',force=True) logger.info(osti_doi) - create_temporary_output_file(osti_doi, os.path.join(self._temporary_output_dir,'valid_datacoll_doi.xml')) + #create_temporary_output_file(osti_doi, os.path.join(self._temporary_output_dir,'valid_datacoll_doi.xml')) def test_remote_browse_collection(self): logger.info("test remote browse collection") osti_doi = self._action.run( input='https://pds-imaging.jpl.nasa.gov/data/nsyt/insight_cameras/browse/collection_browse.xml', - node='img', submitter='my_user@my_node.gov',force_flag=True) + node='img', submitter='my_user@my_node.gov',force=True) logger.info(osti_doi) - create_temporary_output_file(osti_doi, os.path.join(self._temporary_output_dir,'valid_browsecoll_doi.xml')) + #create_temporary_output_file(osti_doi, os.path.join(self._temporary_output_dir,'valid_browsecoll_doi.xml')) def test_remote_calibration_collection(self): logger.info("test remote calibration collection") osti_doi = self._action.run( input='https://pds-imaging.jpl.nasa.gov/data/nsyt/insight_cameras/calibration/collection_calibration.xml', - node='img', submitter='my_user@my_node.gov',force_flag=True) + node='img', submitter='my_user@my_node.gov',force=True) logger.info(osti_doi) - create_temporary_output_file(osti_doi, os.path.join(self._temporary_output_dir,'valid_calibcoll_doi.xml')) + #create_temporary_output_file(osti_doi, os.path.join(self._temporary_output_dir,'valid_calibcoll_doi.xml')) def test_remote_document_collection(self): logger.info("test remote document collection") osti_doi = self._action.run( input='https://pds-imaging.jpl.nasa.gov/data/nsyt/insight_cameras/document/collection_document.xml', - node='img', submitter='my_user@my_node.gov',force_flag=True) + node='img', submitter='my_user@my_node.gov',force=True) logger.info(osti_doi) - create_temporary_output_file(osti_doi, os.path.join(self._temporary_output_dir,'valid_docucoll_doi.xml')) + #create_temporary_output_file(osti_doi, os.path.join(self._temporary_output_dir,'valid_docucoll_doi.xml')) if __name__ == '__main__': diff --git a/pds_doi_core/actions/test/release_test.py b/pds_doi_core/actions/test/release_test.py index be61a4a6..6b5ff869 100644 --- a/pds_doi_core/actions/test/release_test.py +++ b/pds_doi_core/actions/test/release_test.py @@ -15,10 +15,14 @@ class MyTestCase(unittest.TestCase): # to demonstrate that they have new status of 'Pending' or 'Registered'. If for some reason the server has been wiped clean, this unit test will still run # but won't show any status changed to 'Registered'. # - # Because validation has been added to each action, the force_flag=True is required as the command line is not parsed for unit test. + # Because validation has been added to each action, the force=True is required as the command line is not parsed for unit test. db_name = 'doi_temp.db' logger.info("Creating test artifact database file {self.db_name}") + # Remove db_name if exist to have a fresh start otherwise exception will be raised about using existing lidvid. + if os.path.isfile(db_name): + os.remove(db_name) + logger.info(f"Removed test artifact database file {db_name}") # Release some DOIs. # Move instantation of DOICoreActionRelease() object inside tests since the temporary file is removed after each test @@ -39,7 +43,7 @@ def test_reserve(self): # Instantiate DOICoreActionRelease() here so a new database file is created and removed for each test. # The setUp() function is called per test. logger.info("test release of document from 'reserve' action. This test would only work if the authentication for OSTI has been set up and DOIs exist.") - result_list = self._action.run(input='input/DOI_Release_20200727_from_reserve.xml',node='img',submitter='Qui.T.Chau@jpl.nasa.gov',force_flag=True) + result_list = self._action.run(input='input/DOI_Release_20200727_from_reserve.xml',node='img',submitter='Qui.T.Chau@jpl.nasa.gov',force=True) logger.info(result_list) # The tearDown() function is called per test. @@ -49,7 +53,7 @@ def test_draft(self): # The setUp() function is called per test. logger.info("test release of document from 'release' output. This test would only work if the authentication for OSTI has been set up and DOIs exist.") result_list = [] - result_list = self._action.run(input='input/DOI_Release_20200727_from_draft.xml',node='img',submitter='Qui.T.Chau@jpl.nasa.gov',force_flag=True) + result_list = self._action.run(input='input/DOI_Release_20200727_from_draft.xml',node='img',submitter='Qui.T.Chau@jpl.nasa.gov',force=True) logger.info(result_list) # The tearDown() function is called per test. diff --git a/pds_doi_core/actions/test/reserve_test.py b/pds_doi_core/actions/test/reserve_test.py index ec40f9f7..6f184cdf 100644 --- a/pds_doi_core/actions/test/reserve_test.py +++ b/pds_doi_core/actions/test/reserve_test.py @@ -15,7 +15,7 @@ class MyTestCase(unittest.TestCase): # Due to addition of validation of existing 'title', 'lidvid' and 'doi' fields from local database, # the DOICoreActionReserve must be instantiated per test and the tear down per test as well to remove temporary database. - # Because validation has been added to each action, the force_flag=True is required as the command line is not parsed for unit test. + # Because validation has been added to each action, the force=True is required as the command line is not parsed for unit test. db_name = 'doi_temp.db' def setUp(self): @@ -37,7 +37,7 @@ def test_reserve_xlsx(self): self._action.run( input='input/DOI_Reserved_GEO_200318_with_corrected_identifier.xlsx', node='img', submitter='my_user@my_node.gov', - dry_run=True,force_flag=True) + dry_run=True,force=True) # The tearDown() function is called per test. @@ -49,7 +49,7 @@ def test_reserve_xlsx_and_submit(self): self._action.run( input='input/DOI_Reserved_GEO_200318_with_corrected_identifier.xlsx', node='img', submitter='my_user@my_node.gov', - dry_run=True,force_flag=True) + dry_run=True,force=True) # The tearDown() function is called per test. @@ -60,7 +60,7 @@ def test_reserve_csv(self): osti_doi = self._action.run( input='input/DOI_Reserved_GEO_200318.csv', node='img', submitter='my_user@my_node.gov', - dry_run=True,force_flag=True) + dry_run=True,force=True) logger.info(osti_doi) def test_reserve_csv_and_submit(self): @@ -68,7 +68,7 @@ def test_reserve_csv_and_submit(self): osti_doi = self._action.run( input='input/DOI_Reserved_GEO_200318.csv', node='img', submitter='my_user@my_node.gov', - dry_run=False,force_flag=True) + dry_run=False,force=True) logger.info(osti_doi) # The tearDown() function is called per test. diff --git a/pds_doi_core/input/exceptions.py b/pds_doi_core/input/exceptions.py index 8fd03c84..305b3ad2 100644 --- a/pds_doi_core/input/exceptions.py +++ b/pds_doi_core/input/exceptions.py @@ -31,3 +31,6 @@ class CriticalDOIException(Exception): class WarningDOIException(Exception): pass +class SiteURNotExistException(Exception): + pass + diff --git a/pds_doi_core/input/input_util.py b/pds_doi_core/input/input_util.py index c144e4c2..80fdb2d2 100644 --- a/pds_doi_core/input/input_util.py +++ b/pds_doi_core/input/input_util.py @@ -59,8 +59,21 @@ def _parse_rows_to_doi_meta(self, xl_sheet): doi_records = [] for index, row in xl_sheet.iterrows(): + logger.debug(f"row {row}") + # It is possible that the length of row['publication_date'] is more than needed, we only need to get the first 10 characters + # '2020-08-01' from '2020-08-01 00:00:00' + if len(row['publication_date']) >= 10: + # It is possible that the format provided is not expected, put try/except clause to catch that. + try: + publication_date_value = datetime.strptime(row['publication_date'][0:10], '%Y-%m-%d') + except Exception: + logger.error("Expecting publication_date [" + row['publication_date'] + "] with format YYYY-mm-dd") + raise InputFormatException("Expecting publication_date [" + row['publication_date'] + "] with format YYYY-mm-dd") + else: + raise InputFormatException("Expecting publication_date to be at least 10 characters: [" + row['publication_date'] + "]") + doi = Doi(title=row['title'], - publication_date=datetime.fromisoformat(row['publication_date']), + publication_date=publication_date_value, product_type='Collection', product_type_specific=row['product_type_specific'], related_identifier=row['related_resource'], diff --git a/pds_doi_core/util/doi_validator.py b/pds_doi_core/util/doi_validator.py index 60e4f7d0..b1d0fb36 100644 --- a/pds_doi_core/util/doi_validator.py +++ b/pds_doi_core/util/doi_validator.py @@ -9,10 +9,12 @@ from lxml import etree +import requests + from pds_doi_core.db.doi_database import DOIDataBase from pds_doi_core.entities.doi import Doi from pds_doi_core.input.exceptions import DuplicatedTitleDOIException, \ - IllegalDOIActionException, UnexpectedDOIActionException, TitleDoesNotMatchProductTypeException + IllegalDOIActionException, UnexpectedDOIActionException, TitleDoesNotMatchProductTypeException, SiteURNotExistException from pds_doi_core.util.config_parser import DOIConfigUtil from pds_doi_core.util.general_util import get_logger @@ -46,6 +48,32 @@ def get_database_name(self): def __lidvid(columns, row): return f"{row[columns.index('lid')]}::{row[columns.index('vid')]}" + def _check_field_site_url(self, doi: Doi): + """ If the site_url field exist in doi object, check to see if it is online. If the site is not online, an exception will be thrown. + """ + + logger.debug(f"doi {doi}") + logger.info(f"doi.site_url {doi.site_url}") + + if doi.site_url: + try: + response = requests.get(doi.site_url,timeout=5) + status_code = response.status_code + logger.debug(f"from_request:status_code,site_url {status_code,doi.site_url}") + # Handle cases when a connection can be made to the server but the status is greater than or equal to 400. + if status_code >= 400: + # Need to check its an 404, 503, 500, 403 etc. + raise requests.HTTPError(f"status_code,site_url {status_code,doi.site_url}") + else: + logger.debug(f"site_url {doi.site_url} indeed exist") + except (requests.exceptions.ConnectionError, Exception) as e: + error_message = f"site_url {doi.site_url} not reachable" + # Although not being able to connect is an error, the message printed is a warning. + logger.warning(error_message) + raise SiteURNotExistException(error_message) + + return 1 + def _check_field_title_duplicate(self, doi: Doi): """ Check if the same title exist already in local database for a different lidvid """ @@ -188,6 +216,8 @@ def validate(self, doi: Doi): # TO DO check id and doi fields are consistent. + # Validate the site_url first to give the user a chance to make the correction. + self._check_field_site_url(doi) self._check_field_title_duplicate(doi) self._check_field_title_content(doi) self._check_field_workflow(doi)