diff --git a/.travis.yml b/.travis.yml index b5ac7c9..d980530 100644 --- a/.travis.yml +++ b/.travis.yml @@ -45,7 +45,7 @@ script: after_success: - coveralls - + branches: only: - master diff --git a/docs/architecture.png b/docs/architecture.png index f666ece..3040f44 100644 Binary files a/docs/architecture.png and b/docs/architecture.png differ diff --git a/docs/client_server.png b/docs/client_server.png index 74b8847..1ad656c 100644 Binary files a/docs/client_server.png and b/docs/client_server.png differ diff --git a/docs/client_server_tg.png b/docs/client_server_tg.png index c0affcf..36a6621 100644 Binary files a/docs/client_server_tg.png and b/docs/client_server_tg.png differ diff --git a/docs/code_architecture.png b/docs/code_architecture.png index decf4b0..f76a815 100644 Binary files a/docs/code_architecture.png and b/docs/code_architecture.png differ diff --git a/docs/headers_example.png b/docs/headers_example.png index 28a74af..e76703d 100644 Binary files a/docs/headers_example.png and b/docs/headers_example.png differ diff --git a/docs/sequence.png b/docs/sequence.png index a5c0abe..6637bf7 100644 Binary files a/docs/sequence.png and b/docs/sequence.png differ diff --git a/docs/uris_example.png b/docs/uris_example.png index f10b387..f13ea0b 100644 Binary files a/docs/uris_example.png and b/docs/uris_example.png differ diff --git a/tests/conftest.py b/tests/conftest.py index 4674207..57d05be 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,8 +23,10 @@ def app(tmpdir): return application.TimeGate(config=dict( HOST='http://localhost', BASE_URI='http://www.example.com/', - CACHE_USE=True, - CACHE_FILE=tmpdir.mkdir('cache').strpath, + CACHE_BACKEND='werkzeug.contrib.cache:FileSystemCache', + CACHE_OPTIONS={ + 'cache_dir': tmpdir.mkdir('cache').strpath, + }, )) diff --git a/tests/test_timegate.py b/tests/test_timegate.py index dc1f6b7..56e1e19 100644 --- a/tests/test_timegate.py +++ b/tests/test_timegate.py @@ -25,11 +25,11 @@ def test_version(): def test_initialization(): """Test TimeGate initialization.""" - from timegate.application import TimeGate + from timegate.application import TimeGate, request from timegate.examples.simple import ExampleHandler handler = ExampleHandler() app = TimeGate(config=dict(HANDLER_MODULE=handler)) - assert handler == app.handler + assert len(list(app.url_map.iter_rules())) == 2 def test_application(): @@ -42,6 +42,43 @@ def test_application(): assert client.get('/').status_code == 404 +def test_multi_handler(): + """Test simple request.""" + from timegate.application import TimeGate, request + from timegate.examples.simple import ExampleHandler + from werkzeug.test import Client + from werkzeug.wrappers import BaseResponse + + base1_uri = 'http://www.example.com/base1/' + base2_uri = 'http://www.example.com/base2/' + app = TimeGate(config=dict( + CACHE_BACKEND='werkzeug.contrib.cache.NullCache', + HANDLER_MODULE=ExampleHandler(base_uri=base1_uri), + BASE_URI=base1_uri, + HANDLERS={ + 'base2': dict( + HANDLER_MODULE=ExampleHandler(base_uri=base2_uri), + BASE_URI=base2_uri, + ) + }, + )) + client = Client(app, BaseResponse) + + assert len(list(app.url_map.iter_rules())) == 4 + + parameters = [ + ('', base1_uri), (base1_uri, base1_uri), (base2_uri, base2_uri) + ] + for request_base, response_base in parameters: + response = client.get( + '/timegate/{0}resourceA'.format(request_base) + ) + assert response.status_code == 302 + assert response.headers['Location'] == ( + '{0}resourceA_v3'.format(response_base) + ) + + def test_timemap_response(client): """Test timemap responses.""" response = client.get( diff --git a/timegate/application.py b/timegate/application.py index aaa81d4..3fe636a 100644 --- a/timegate/application.py +++ b/timegate/application.py @@ -15,12 +15,15 @@ import json import logging import os +import re from datetime import datetime from pkg_resources import iter_entry_points from dateutil.tz import tzutc from link_header import Link, LinkHeader +from pkg_resources import iter_entry_points +from werkzeug.datastructures import CombinedMultiDict from werkzeug.exceptions import HTTPException, abort from werkzeug.http import http_date, parse_date from werkzeug.local import Local, LocalManager @@ -43,8 +46,7 @@ request = local('request') """Proxy to request object.""" -# logging.getLogger(__name__) -# logging.basicConfig(level=logging.DEBUG) +_RE_HANDLER = re.compile('^((?P[^.]+)\.)?(?P[^.]+)$') def url_for(*args, **kwargs): @@ -74,13 +76,17 @@ def load_handler(name_or_path): class URIConverter(BaseConverter): """URI Converter.""" - def __init__(self, url_map, base_uri=None): + def __init__(self, url_map, base_uri=None, default=True): super(URIConverter, self).__init__(url_map) + assert base_uri or default, 'base_uri or default must be defined' self.base_uri = base_uri self.regex = ( - r"([^:/?#]+:)?(//[^/?#]*)?" - r"[^?#]*(\?[^#]*)?(#.*)?" + r'([^:/?#]+:)?(//[^/?#]*)?' + r'[^?#]*(\?[^#]*)?(#.*)?' + ) if default else ( + r'({0})(.*)'.format(base_uri) ) + self.weigth = 100 if default else 400 def to_python(self, value): """Return value with base URI prefix.""" @@ -102,21 +108,55 @@ class TimeGate(object): def __init__(self, config=None, cache=None): """Initialize application with handler.""" + self.handlers = {} # registry of handlers + self.rules = [] # list of URL rules self.config = Config(None) self.config.from_object(constants) self.config.update(config or {}) self.cache = None if cache: self.cache = cache - elif self.config['CACHE_USE']: + else: self._build_default_cache() @cached_property - def handler(self): - handler = load_handler(self.config['HANDLER_MODULE']) + def url_map(self): + """Build URL map.""" + for handler_name, config in self.config.get('HANDLERS', {}).items(): + if handler_name is None: + continue # we have already regitered default handler + self.register_handler( + handler_name, CombinedMultiDict([config, self.config]) + ) + # Default handler at the end in case the weights are same. + self.register_handler(None, CombinedMultiDict([ + self.config.get('HANDLERS', {}).get(None, {}), self.config + ])) + return Map(self.rules, converters={'uri': URIConverter}) + + def _build_default_cache(self): + """Build default cache object.""" + self.cache = Cache( + self.config.get('CACHE_BACKEND', + 'werkzeug.contrib.cache.NullCache'), + cache_refresh_time=self.config.get('CACHE_REFRESH_TIME', 86400), + **self.config.get('CACHE_OPTIONS', {}) + ) + + def __repr__(self): + """Representation of this class.""" + return '<{0} {1}>'.format( + self.__class__.__name__, ', '.join([ + h.__class__.__name__ for h in self.handlers.items() + ]) + ) + + def register_handler(self, handler_name, config): + """Register handler.""" + handler = load_handler(config['HANDLER_MODULE']) HAS_TIMEGATE = hasattr(handler, 'get_memento') HAS_TIMEMAP = hasattr(handler, 'get_all_mementos') - if self.config['USE_TIMEMAPS'] and (not HAS_TIMEMAP): + if config['USE_TIMEMAPS'] and (not HAS_TIMEMAP): logging.error( "Handler has no get_all_mementos() function " "but is suppose to serve timemaps.") @@ -125,36 +165,28 @@ def handler(self): raise NotImplementedError( "NotImplementedError: Handler has neither `get_memento` " "nor `get_all_mementos` method.") - return handler - @cached_property - def url_map(self): - """Build URL map.""" - base_uri = self.config['BASE_URI'] - rules = [ - Rule('/timegate/'.format(base_uri), - endpoint='timegate', methods=['GET', 'HEAD']), - Rule('/timemap//' - ''.format(base_uri), - endpoint='timemap', methods=['GET', 'HEAD']), - ] - return Map(rules, converters={'uri': URIConverter}) - - def _build_default_cache(self): - """Build default cache object.""" - self.cache = Cache( - self.config['CACHE_FILE'], - self.config['CACHE_TOLERANCE'], - self.config['CACHE_EXP'], - self.config['CACHE_MAX_VALUES'], + handler.use_timemaps = ( + hasattr(handler, 'get_all_mementos') and config['USE_TIMEMAPS'] ) + handler.resource_type = config['RESOURCE_TYPE'] - def __repr__(self): - """Representation of this class.""" - return '<{0} {1}>'.format( - self.__class__.__name__, self.handler.__class__.__name__ + endpoint_prefix = '{0}.'.format(handler_name) if handler_name else '' + uri_r = ''.format( + config['BASE_URI'], str(handler_name is None) ) + self.rules.extend([ + Rule('/timegate/{0}'.format(uri_r), + endpoint=endpoint_prefix + 'timegate', + methods=['GET', 'HEAD']), + Rule('/timemap//{0}'.format(uri_r), + endpoint=endpoint_prefix + 'timemap', + methods=['GET', 'HEAD']), + ]) + + self.handlers[handler_name] = handler + def dispatch_request(self, request): """Choose correct method.""" request.adapter = adapter = self.url_map.bind_to_environ( @@ -162,11 +194,13 @@ def dispatch_request(self, request): ) try: endpoint, values = adapter.match() - return getattr(self, endpoint)(**values) + parts = _RE_HANDLER.match(endpoint).groupdict() + request.handler = self.handlers[parts['handler_name']] + return getattr(self, parts['endpoint'])(**values) except HTTPException as e: return e finally: - self.adapter = None + request.adapter = request.handler = None def wsgi_app(self, environ, start_response): local.request = request = Request(environ) @@ -186,7 +220,7 @@ def get_memento(self, uri_r, accept_datetime): :param accept_datetime: Datetime object with requested time. :return: The TimeMap if it exists and is valid. """ - return parsed_request(self.handler.get_memento, + return parsed_request(request.handler.get_memento, uri_r, accept_datetime) def get_all_mementos(self, uri_r): @@ -201,7 +235,7 @@ def get_all_mementos(self, uri_r): if self.cache and request.cache_control != 'no-cache': mementos = self.cache.get_all(uri_r) if mementos is None: - mementos = parsed_request(self.handler.get_all_mementos, uri_r) + mementos = parsed_request(request.handler.get_all_mementos, uri_r) if self.cache: self.cache.set(uri_r, mementos) return mementos @@ -224,8 +258,7 @@ def timegate(self, uri_r): # Runs the handler's API request for the Memento mementos = first = last = None - HAS_TIMEMAP = hasattr(self.handler, 'get_all_mementos') - if HAS_TIMEMAP and self.config['USE_TIMEMAPS']: + if request.handler.use_timemaps: logging.debug('Using multiple-request mode.') mementos = self.get_all_mementos(uri_r) @@ -233,7 +266,7 @@ def timegate(self, uri_r): first = mementos[0] last = mementos[-1] memento = best(mementos, accept_datetime, - self.config['RESOURCE_TYPE']) + request.handler.resource_type) else: logging.debug('Using single-request mode.') memento = self.get_memento(uri_r, accept_datetime) @@ -244,7 +277,7 @@ def timegate(self, uri_r): uri_r, first, last, - has_timemap=HAS_TIMEMAP and self.config['USE_TIMEMAPS'], + has_timemap=request.handler.use_timemaps, ) def timemap(self, uri_r, response_type='link'): @@ -258,7 +291,7 @@ def timemap(self, uri_r, response_type='link'): :param start_response: WSGI callback function. :return: The body of the HTTP response. """ - if not self.config['USE_TIMEMAPS']: + if not request.handler.use_timemaps: abort(403) mementos = self.get_all_mementos(uri_r) diff --git a/timegate/cache.py b/timegate/cache.py index 3d9ea05..cb234a7 100644 --- a/timegate/cache.py +++ b/timegate/cache.py @@ -14,65 +14,32 @@ import logging import os +import sys from datetime import datetime from dateutil.relativedelta import relativedelta from dateutil.tz import tzutc from werkzeug.contrib.cache import FileSystemCache, md5 - -from . import utils as timegate_utils -from .errors import CacheError +from werkzeug.utils import import_string class Cache(object): """Base class for TimeGate caches.""" - def __init__(self, path, tolerance, expiration, max_values, - run_tests=True, max_file_size=0): + def __init__(self, cache_backend, cache_refresh_time=86400, + max_file_size=0, **kwargs): """Constructor method. - :param path: The path of the cache database file. - :param tolerance: The tolerance, in seconds to which a TimeMap is - considered young enough to be used as is. - :param expiration: How long, in seconds, the cache entries are stored - every get will be a CACHE MISS. - :param max_values: The maximum number of TimeMaps stored in cache - before some are deleted - :param run_tests: (Optional) Tests the cache at initialization. + :param cache_backend: Importable string pointing to cache class. :param max_file_size: (Optional) The maximum size (in Bytes) for a TimeMap cache value. When max_file_size=0, there is no limit to a cache value. When max_file_size=X > 0, the cache will not store TimeMap that require more than X Bytes on disk. """ - # Parameters Check - if tolerance <= 0 or expiration <= 0 or max_values <= 0: - raise CacheError('Cannot create cache: all parameters must be > 0') - - self.tolerance = relativedelta(seconds=tolerance) - self.path = path.rstrip('/') + self.tolerance = relativedelta(seconds=cache_refresh_time) self.max_file_size = max(max_file_size, 0) self.CHECK_SIZE = self.max_file_size > 0 - self.max_values = max_values - self.backend = FileSystemCache(path, - threshold=self.max_values, - default_timeout=expiration) - - # Testing cache - if run_tests: - try: - key = b'1' - val = 1 - self.backend.set(key, val) - assert (not self.CHECK_SIZE) or self._check_size(key) > 0 - assert self.backend.get(key) == val - os.remove(os.path.join(self.path, md5(key).hexdigest())) - except Exception as e: - raise CacheError('Error testing cache: %s' % e) - - logging.debug( - 'Cache created. max_files = %d. Expiration = %d. ' - 'max_file_size = %d' % ( - self.max_values, expiration, self.max_file_size)) + self.backend = import_string(cache_backend)(**kwargs) def get_until(self, uri_r, date): """Returns the TimeMap (memento,datetime)-list for the requested @@ -88,28 +55,11 @@ def get_until(self, uri_r, date): None otherwise. """ # Query the backend for stored cache values to that memento - key = uri_r - try: - val = self.backend.get(key) - except Exception as e: - logging.error('Exception loading cache content: %s' % e) - return None - - if val: - # There is a value in the cache + val = self.backend.get(uri_r) + if val: # There is a value in the cache timestamp, timemap = val - logging.info('Cached value exists for %s' % uri_r) - if date > timestamp + self.tolerance: - logging.info('Cache MISS: value outdated for %s' % uri_r) - timemap = None - else: - logging.info('Cache HIT: found value for %s' % uri_r) - else: - # Cache MISS: No value - logging.info('Cache MISS: No cached value for %s' % uri_r) - timemap = None - - return timemap + if date <= timestamp + self.tolerance: + return timemap def get_all(self, uri_r): """Request the whole TimeMap for that uri. @@ -130,42 +80,21 @@ def set(self, uri_r, timemap): :param timemap: The value to cache. :return: The backend setter method return value. """ - logging.info('Updating cache for %s' % uri_r) timestamp = datetime.utcnow().replace(tzinfo=tzutc()) val = (timestamp, timemap) - key = uri_r - try: - self.backend.set(key, val) - if self.CHECK_SIZE: - self._check_size(uri_r) - except Exception as e: - logging.error('Error setting cache value: %s' % e) + if self._check_size(val): + self.backend.set(uri_r, val) - def _check_size(self, key, delete=True): - """Check the size that a specific TimeMap value is using on disk. + def _check_size(self, val): + """Check the size that a specific TimeMap value is using in memory. It deletes if it is more than the maximum size. - :param key: The TimeMap original resource. - :param delete: (Optional) When true, the value is deleted. - Else only a warning is raised. - :return: The size of the value on disk (0 if it was deleted). + :param val: The cached object. + :return: The True if it can be stored. """ - try: - fname = md5(key).hexdigest() # werkzeug key - fpath = self.path + '/' + fname - size = os.path.getsize(fpath) - if size > self.max_file_size and delete: - message = ('Cache value too big (%dB, max %dB) ' - 'for the TimeMap of %s') - if delete: - message += '. Deleting cached value.' - os.remove(fpath) - size = 0 - logging.warning(message % (size, self.max_file_size, key)) - return size - except Exception as e: - logging.error( - 'Exception checking cache value size for TimeMap of %s ' - 'Exception: %s' % (key, e)) - return 0 + if self.CHECK_SIZE: + size = sys.getsizeof(val) + if size > self.max_file_size: + return False + return True diff --git a/timegate/conf/config.ini b/timegate/conf/config.ini index e30dec7..75598b7 100644 --- a/timegate/conf/config.ini +++ b/timegate/conf/config.ini @@ -45,11 +45,9 @@ base_uri = [cache] -# cache_activated -# When true, the cache stores TimeMaps from API that allows batch (get_all_mementos) requests, except for requests with `Cache-Control: no-cache` header, which will always return fresh Mementos. -# When false, no cache file will be created -# Default true -cache_activated = false +# cache_backend +# For disabling cache use werkzeug.contrib.cache.NullCache +cache_backend = werkzeug.contrib.cache:FileSystemCache # cache_refresh_time # Time in seconds, for which it is assumed that a TimeMap didn't change. Any TimeGate request for a datetime past this period (or any TimeMap request past this period) will trigger a refresh of the cached value. @@ -59,10 +57,10 @@ cache_refresh_time = 86400 # cache_directory # Cache directory relative path for data files. Make sure that this directory is empty or else the cache will start deleting random files. # Default cache/ -cache_directory = cache +cache_dir = cache -# cache_max_values +# threshold # Maximum number of stored TimeMaps in the cache. # Tweak this depending on how big your TimeMaps can become (number of elements and length of URIs) # Default 250 -cache_max_values = 250 +threshold = 250 diff --git a/timegate/config.py b/timegate/config.py index febda41..d84c7f7 100644 --- a/timegate/config.py +++ b/timegate/config.py @@ -11,6 +11,8 @@ from __future__ import absolute_import, print_function +import re + from configparser import ConfigParser from ._compat import string_types @@ -56,33 +58,62 @@ def from_inifile(self, filename, silent=True): self['API_TIME_OUT'] = conf.getfloat('server', 'api_time_out') # Handler configuration - if conf.has_option('handler', 'handler_class'): - self['HANDLER_MODULE'] = conf.get('handler', 'handler_class') - if conf.has_option('handler', 'base_uri'): - self['BASE_URI'] = conf.get('handler', 'base_uri') - if conf.getboolean('handler', 'is_vcs'): - self['RESOURCE_TYPE'] = 'vcs' - else: - self['RESOURCE_TYPE'] = 'snapshot' - - if conf.has_option('handler', 'use_timemap'): - self['USE_TIMEMAPS'] = conf.getboolean('handler', 'use_timemap') - else: - self['USE_TIMEMAPS'] = False + def build_handler(section): + """Build handler configuration.""" + output = {} + if conf.has_option(section, 'handler_class'): + output['HANDLER_MODULE'] = conf.get(section, 'handler_class') + if conf.has_option(section, 'base_uri'): + output['BASE_URI'] = conf.get(section, 'base_uri') + if conf.getboolean(section, 'is_vcs'): + output['RESOURCE_TYPE'] = 'vcs' + else: + output['RESOURCE_TYPE'] = 'snapshot' + + if conf.has_option(section, 'use_timemap'): + output['USE_TIMEMAPS'] = conf.getboolean(section, + 'use_timemap') + else: + output['USE_TIMEMAPS'] = False + return output + + self.setdefault('HANDLERS', {}) + re_handler = re.compile('^handler(:(?P.+))?') + for section_name in conf.sections(): + handler = re_handler.match(section_name) + if handler: + handler_name = handler.groupdict()['handler_name'] + section = build_handler(section_name) + if handler_name or handler.groups()[0]: + self['HANDLERS'][handler_name] = section + else: + self.update(section) # Cache - # When False, all cache requests will be cache MISS - self['CACHE_USE'] = conf.getboolean('cache', 'cache_activated') + self['CACHE_BACKEND'] = conf.get('cache', 'cache_backend') # Time window in which the cache value is considered young # enough to be valid - self['CACHE_TOLERANCE'] = conf.getint('cache', 'cache_refresh_time') - # Cache files paths - self['CACHE_DIRECTORY'] = conf.get( - 'cache', 'cache_directory').rstrip('/') - # Maximum number of TimeMaps stored in cache - self['CACHE_MAX_VALUES'] = conf.getint('cache', 'cache_max_values') - # Cache files paths - self['CACHE_FILE'] = self['CACHE_DIRECTORY'] # + '/cache_data' + self['CACHE_REFRESH_TIME'] = conf.getint('cache', 'cache_refresh_time') + + options = { + 'cache_backend': None, + 'cache_refresh_time': None, + 'default_timeout': 'getint', + 'mode': 'getint', + 'port': 'getint', + 'threshold': 'getint', + } + self.setdefault('CACHE_OPTIONS', {}) + + for key in conf.options('cache'): + if key in options: + getter = options[key] + if getter: + self['CACHE_OPTIONS'][key] = getattr(conf, getter)( + 'cache', key + ) + else: + self['CACHE_OPTIONS'][key] = conf.get('cache', key) def from_object(self, obj): """Update config with values from given object. diff --git a/timegate/examples/simple.py b/timegate/examples/simple.py index 04bb8c0..f22fd8e 100644 --- a/timegate/examples/simple.py +++ b/timegate/examples/simple.py @@ -23,13 +23,13 @@ class ExampleHandler(Handler): - def __init__(self): + def __init__(self, base_uri='http://www.example.com/'): Handler.__init__(self) # Initialization code here. This part is run only once versions_a = [ - 'http://www.example.com/resourceA_v1', - 'http://www.example.com/resourceA_v2', - 'http://www.example.com/resourceA_v3' + '{0}resourceA_v1'.format(base_uri), + '{0}resourceA_v2'.format(base_uri), + '{0}resourceA_v3'.format(base_uri) ] date_times_a = [ @@ -38,8 +38,8 @@ def __init__(self): '2015-01-03T22:00:00Z' ] versions_b = [ - 'http://www.example.com/resourceB_v1', - 'http://www.example.com/resourceB_v2', + '{0}resourceB_v1'.format(base_uri), + '{0}resourceB_v2'.format(base_uri), ] date_times_b = [ @@ -47,16 +47,16 @@ def __init__(self): '2000-11-08T19:05:09Z' ] self.archives = { - 'http://www.example.com/resourceA': versions_a, - 'http://www.example.com/resourceB': versions_b, - 'http://www.example.com/resource%20space': [ - 'http://www.example.com/space', + '{0}resourceA'.format(base_uri): versions_a, + '{0}resourceB'.format(base_uri): versions_b, + '{0}resource%20space'.format(base_uri): [ + '{0}space'.format(base_uri), ], } self.dates = { - 'http://www.example.com/resourceA': date_times_a, - 'http://www.example.com/resourceB': date_times_b, - 'http://www.example.com/resource%20space': [ + '{0}resourceA'.format(base_uri): date_times_a, + '{0}resourceB'.format(base_uri): date_times_b, + '{0}resource%20space'.format(base_uri): [ '1970-01-01T00:00:00Z' ], }