From f4180681d36680353ea93b67333cfac94c2e3f81 Mon Sep 17 00:00:00 2001 From: chreman Date: Wed, 24 Apr 2024 14:14:55 +0200 Subject: [PATCH 01/75] wip --- server/workers/api/src/app.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/workers/api/src/app.py b/server/workers/api/src/app.py index 2d38f533d..787d42274 100644 --- a/server/workers/api/src/app.py +++ b/server/workers/api/src/app.py @@ -11,6 +11,7 @@ from apis.openaire import openaire_ns from apis.create_vis import vis_ns from apis.export import export_ns +from apis.orcid import orcid_ns class ReverseProxied(object): @@ -72,6 +73,7 @@ def api_patches(app): api.add_namespace(openaire_ns, path='/openaire') api.add_namespace(vis_ns, path='/vis') api.add_namespace(export_ns, path='/export') +api.add_namespace(orcid_ns, path='/orcid') app.logger.debug(app.config) app.logger.debug(app.url_map) From b7633651802e86805cac2cb50c34e278cc582182 Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 13 Jun 2024 20:31:29 +0200 Subject: [PATCH 02/75] wip --- server/workers/orcid/.dockerignore | 10 ++++++++++ server/workers/orcid/Dockerfile | 11 +++++++++++ server/workers/orcid/requirements.txt | 22 ++++++++++++++++++++++ server/workers/orcid/run_orcid.py | 20 ++++++++++++++++++++ 4 files changed, 63 insertions(+) create mode 100644 server/workers/orcid/.dockerignore create mode 100644 server/workers/orcid/Dockerfile create mode 100644 server/workers/orcid/requirements.txt create mode 100644 server/workers/orcid/run_orcid.py diff --git a/server/workers/orcid/.dockerignore b/server/workers/orcid/.dockerignore new file mode 100644 index 000000000..2a5c729b7 --- /dev/null +++ b/server/workers/orcid/.dockerignore @@ -0,0 +1,10 @@ +renv +__pycache__ +.cache +.pytest_cache +.Rproj.user +.RData +.Rhistory +*.Rproj +.pynb_checkpoints +*.ipynb \ No newline at end of file diff --git a/server/workers/orcid/Dockerfile b/server/workers/orcid/Dockerfile new file mode 100644 index 000000000..c19b1b035 --- /dev/null +++ b/server/workers/orcid/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.8 +MAINTAINER Chris Kittel "christopher.kittel@openknowledgemaps.org" + +RUN apt-get update +RUN apt-get install -y gcc git libpq-dev + +WORKDIR /orcid +COPY workers/orcid/requirements.txt . 
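+# Upgrade pip, then install the pinned requirements before copying the worker sources so dependency layers stay cached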
+RUN pip install --upgrade pip +RUN pip install --no-cache-dir -r requirements.txt +COPY workers/orcid/src/ ./ \ No newline at end of file diff --git a/server/workers/orcid/requirements.txt b/server/workers/orcid/requirements.txt new file mode 100644 index 000000000..a2fe8f11f --- /dev/null +++ b/server/workers/orcid/requirements.txt @@ -0,0 +1,22 @@ +asn1crypto==0.24.0 +async-timeout==4.0.2 +cryptography==2.1.4 +idna==2.6 +importlib-metadata==4.8.3 +keyring==10.6.0 +keyrings.alt==3.0 +numpy==1.19.5 +packaging==21.3 +pandas==1.1.5 +pycrypto==2.6.1 +pygobject==3.26.1 +pyparsing==3.1.1 +python-dateutil==2.8.2 +pytz==2023.3.post1 +pyxdg==0.25 +redis==4.3.6 +SecretStorage==2.3.1 +six==1.11.0 +typing-extensions==4.1.1 +zipp==3.6.0 +pyorcid==1.2.0 \ No newline at end of file diff --git a/server/workers/orcid/run_orcid.py b/server/workers/orcid/run_orcid.py new file mode 100644 index 000000000..b280e2a2b --- /dev/null +++ b/server/workers/orcid/run_orcid.py @@ -0,0 +1,20 @@ +import os +# import json +import redis +from base.src.base import BaseClient + + +if __name__ == '__main__': + redis_config = { + "host": os.getenv("REDIS_HOST"), + "port": os.getenv("REDIS_PORT"), + "db": os.getenv("REDIS_DB"), + "password": os.getenv("REDIS_PASSWORD"), + "client_name": "orcid_retrieval" + } + + redis_store = redis.StrictRedis(**redis_config) + wrapper = BaseClient(None, None, redis_store, + "english", + os.environ.get("LOGLEVEL", "INFO")) + wrapper.run() From cf9dfdb19edd6a7b14ff763a1be0b1657493e901 Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 13 Jun 2024 20:38:04 +0200 Subject: [PATCH 03/75] deployment updates --- docker-compose.yml | 23 +++++++++++++++++++++++ server/workers/build_docker_images.sh | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index c7c721434..0d6bcc2c6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -80,6 +80,29 @@ services: networks: - headstart + orcid: + build: + context: server + dockerfile: workers/orcid/Dockerfile + restart: unless-stopped + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" + LOGLEVEL: "${LOGLEVEL}" + LC_ALL: "en_US.UTF-8" + LANG: "en_US.UTF-8" + PYTHONIOENCODING: "utf-8" + ORCID_CLIENT_ID: "${ORCID_CLIENT_ID}" + ORCID_CLIENT_SECRET: "${ORCID_CLIENT_SECRET}" + command: ["python", "app.py"] + depends_on: + - redis + networks: + - headstart + dataprocessing: build: context: server diff --git a/server/workers/build_docker_images.sh b/server/workers/build_docker_images.sh index 1d93912a6..52feed689 100755 --- a/server/workers/build_docker_images.sh +++ b/server/workers/build_docker_images.sh @@ -1,6 +1,6 @@ #!/bin/bash SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -services=("api" "persistence" "dataprocessing" "base" "pubmed" "openaire") +services=("api" "persistence" "dataprocessing" "base" "pubmed" "openaire" "orcid") service_version="`git rev-parse HEAD`" echo "" echo "Building services with version $service_version" From 6401c1f818b80fa38f02d25113f689099a378134 Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 20 Jun 2024 13:18:49 +0200 Subject: [PATCH 04/75] orcid wip --- server/workers/orcid/__init__.py | 0 server/workers/orcid/requirements.txt | 25 ++-- server/workers/orcid/run_orcid.py | 4 +- server/workers/orcid/src/__init__.py | 0 server/workers/orcid/src/orcid.py | 179 ++++++++++++++++++++++++++ 5 files changed, 195 
insertions(+), 13 deletions(-) create mode 100644 server/workers/orcid/__init__.py create mode 100644 server/workers/orcid/src/__init__.py create mode 100644 server/workers/orcid/src/orcid.py diff --git a/server/workers/orcid/__init__.py b/server/workers/orcid/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/server/workers/orcid/requirements.txt b/server/workers/orcid/requirements.txt index a2fe8f11f..aabcbd129 100644 --- a/server/workers/orcid/requirements.txt +++ b/server/workers/orcid/requirements.txt @@ -1,22 +1,25 @@ -asn1crypto==0.24.0 +aioredis==2.0.1 +aniso8601==9.0.1 async-timeout==4.0.2 -cryptography==2.1.4 -idna==2.6 +attrs==22.2.0 +click==8.1.3 +dataclasses>=0.6 +hiredis==2.0.0 importlib-metadata==4.8.3 -keyring==10.6.0 -keyrings.alt==3.0 +importlib-resources==5.4.0 +itsdangerous==2.1.2 +jsonschema==3.2.0 +MarkupSafe==2.1.3 +mistune==2.0.5 numpy==1.19.5 packaging==21.3 pandas==1.1.5 -pycrypto==2.6.1 -pygobject==3.26.1 pyparsing==3.1.1 +pyrsistent==0.18.0 python-dateutil==2.8.2 pytz==2023.3.post1 -pyxdg==0.25 +PyYAML==6.0.1 redis==4.3.6 -SecretStorage==2.3.1 -six==1.11.0 +six==1.16.0 typing-extensions==4.1.1 zipp==3.6.0 -pyorcid==1.2.0 \ No newline at end of file diff --git a/server/workers/orcid/run_orcid.py b/server/workers/orcid/run_orcid.py index b280e2a2b..71d4e041e 100644 --- a/server/workers/orcid/run_orcid.py +++ b/server/workers/orcid/run_orcid.py @@ -1,7 +1,7 @@ import os # import json import redis -from base.src.base import BaseClient +from orcid.src.orcid import OrcidClient if __name__ == '__main__': @@ -14,7 +14,7 @@ } redis_store = redis.StrictRedis(**redis_config) - wrapper = BaseClient(None, None, redis_store, + wrapper = OrcidClient(None, None, redis_store, "english", os.environ.get("LOGLEVEL", "INFO")) wrapper.run() diff --git a/server/workers/orcid/src/__init__.py b/server/workers/orcid/src/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py new file mode 100644 index 000000000..799013fbe --- /dev/null +++ b/server/workers/orcid/src/orcid.py @@ -0,0 +1,179 @@ +import os +import json +import subprocess +import pandas as pd +import logging +from datetime import timedelta +from dateutil.parser import parse +import re +from redis.exceptions import LockError +import time +import numpy as np +from pyorcid import OrcidAuthentication, Orcid + + + +formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') + + + +class OrcidClient(): + + def __init__(self): + self.separation = 0.1 + self.rate_key = 'orcid-ratelimit' + self.ORCID_CLIENT_ID = os.getenv("ORCID_CLIENT_ID") + self.ORCID_CLIENT_SECRET = os.getenv("ORCID_CLIENT_SECRET") + self.access_token = self.authenticate() + if os.getenv("FLASK_ENV") == "dev": + self.sandbox = True + else: + self.sandbox = False + + def authenticate(self): + orcid_auth = OrcidAuthentication(client_id=self.ORCID_CLIENT_ID, + client_secret=self.ORCID_CLIENT_SECRET) + access_token = orcid_auth.get_public_access_token() + return access_token + + + def next_item(self): + queue, msg = self.redis_store.blpop("orcid") + msg = json.loads(msg.decode('utf-8')) + k = msg.get('id') + params = msg.get('params') + params["service"] = "orcid" + endpoint = msg.get('endpoint') + return k, params, endpoint + + + def orcid_rate_limit_reached(self): + """ + This implementation is inspired by an implementation of + Generic Cell Rate Algorithm based rate limiting, + seen on 
https://dev.to/astagi/rate-limiting-using-python-and-redis-58gk. + It has been simplified and adjusted to our use case. + + BASE demands one request per second (1 QPS), per + https://www.base-search.net/about/download/base_interface.pdf + """ + + t = self.redis_store.time()[0] + self.redis_store.setnx(self.rate_key, 0) + try: + with self.redis_store.lock('lock:' + self.rate_key, blocking_timeout=5) as lock: + theoretical_arrival_time = max(float(self.redis_store.get(self.rate_key)), t) + if theoretical_arrival_time - t <= 0: + new_theoretical_arrival_time = max(theoretical_arrival_time, t) + self.separation + self.redis_store.set(self.rate_key, new_theoretical_arrival_time) + return False + return True + # the locking mechanism is needed if a key is requested multiple times at the same time + except LockError: + return True + + def run(self): + while True: + while self.base_rate_limit_reached(): + self.logger.debug('🛑 Request is limited') + time.sleep(0.1) + k, params, endpoint = self.next_item() + self.logger.debug(k) + self.logger.debug(params) + if endpoint == "search": + try: + res = self.execute_search(params) + res["id"] = k + if res.get("status") == "error" or params.get('raw') is True: + self.redis_store.set(k+"_output", json.dumps(res)) + else: + self.redis_store.rpush("input_data", json.dumps(res).encode('utf8')) + q_len = self.redis_store.llen("input_data") + self.logger.debug("Queue length: %s %d %s" %("input_data", q_len, k)) + except Exception as e: + self.logger.exception("Exception during data retrieval.") + self.logger.error(params) + self.logger.error(e) + + def execute_search(self, params): + q = params.get('q') + service = params.get('service') + data = {} + data["params"] = params + orcid_id = params.get("orcid") + try: + orcid = Orcid(orcid_id=orcid_id, orcid_access_token=self.access_token, state = "public", sandbox=self.sandbox) + works = retrieve_full_works_metadata(orcid) + metadata = apply_metadata_schema(works) + # in BASE it is ["title", "paper_abstract", "subject_orig", "published_in", "sanitized_authors"] + text = pd.concat([metadata.id, metadata[["title", "paper_abstract"]] + .apply(lambda x: " ".join(x), axis=1)], axis=1) + text.columns = ["id", "content"] + input_data = {} + input_data["metadata"] = metadata.to_json(orient='records') + input_data["text"] = text.to_json(orient='records') + res = {} + res["input_data"] = input_data + res["params"] = params + return res + except Exception as e: + self.logger.error(e) + raise + +def retrieve_full_works_metadata(orcid): + raw_works = pd.DataFrame(orcid.works()[1]["group"]).explode("work-summary") + works = pd.json_normalize(pd.DataFrame(raw_works["work-summary"])) + works["publication-date"] = works.apply(get_publication_date, axis=1) + works["doi"] = works.apply(extract_dois, axis=1) + return works + +def apply_metadata_schema(works): + works.rename(columns=works_mapping, inplace=True) + metadata = works + return metadata + +def filter_dicts_by_value(dicts, key, value): + return [d for d in dicts if d.get(key) == value] + +def extract_dois(work): + external_ids = work["external-ids.external-id"] + external_ids = external_ids if isinstance(external_ids, list) else [] + external_ids = (filter_dicts_by_value( + external_ids, + key="external-id-type", + value="doi") if len(external_ids)>0 else "") + doi = external_ids[0].get("external-id-value", "") if len(external_ids)>0 else "" + return doi + + +def get_publication_date(work): + year = work["publication-date.year.value"] + month = 
work["publication-date.month.value"] + day = work["publication-date.day.value"] + publication_date = "" + parsed_publication_date = publication_date + if year is not pd.np.NaN: + publication_date+=str(int(year)) + parsed_publication_date = publication_date + if month is not pd.np.NaN: + publication_date+=("-"+str(int(month))) + date_obj = parse(publication_date) + parsed_publication_date = date_obj.strftime('%Y-%m') + if day is not pd.np.NaN: + publication_date+=("-"+str(int(day))) + date_obj = parse(publication_date) + parsed_publication_date = date_obj.strftime('%Y-%m-%d') + return parsed_publication_date + + +works_mapping = { + "put-code": "id", + "title.title.value": "title", + "short-description": "paper_abstract", + "publication-date": "year", + "work-contributors": "authors", + "type": "resulttype", + "url.value": "link", + "journal-title.value": "published_in" +} \ No newline at end of file From 6c98409b23d105eab163486da4ca3bcfb0d85206 Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 20 Jun 2024 13:19:41 +0200 Subject: [PATCH 05/75] orcid wip --- docker-compose.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 0d6bcc2c6..0af7ee29d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -97,7 +97,6 @@ services: PYTHONIOENCODING: "utf-8" ORCID_CLIENT_ID: "${ORCID_CLIENT_ID}" ORCID_CLIENT_SECRET: "${ORCID_CLIENT_SECRET}" - command: ["python", "app.py"] depends_on: - redis networks: From 62bcb36fda8651dd6e997b51dd5a7ac7c2544778 Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 20 Jun 2024 13:20:01 +0200 Subject: [PATCH 06/75] gsheets cleanup --- server/services/GSheetUpdateAvailable.php | 44 ---------------- server/services/createNewGSheet.php | 45 ---------------- server/services/getGSheetsMap.php | 63 ----------------------- server/services/updateGSheetsMap.php | 44 ---------------- 4 files changed, 196 deletions(-) delete mode 100644 server/services/GSheetUpdateAvailable.php delete mode 100644 server/services/createNewGSheet.php delete mode 100644 server/services/getGSheetsMap.php delete mode 100644 server/services/updateGSheetsMap.php diff --git a/server/services/GSheetUpdateAvailable.php b/server/services/GSheetUpdateAvailable.php deleted file mode 100644 index 37bdfc2af..000000000 --- a/server/services/GSheetUpdateAvailable.php +++ /dev/null @@ -1,44 +0,0 @@ - $vis_id, "details" => false, "context" => true)); - $res = $apiclient->call_persistence("getLastVersion", $payload); - if ($res["httpcode"] != 200) { - library\CommUtils::echoOrCallback($res, $_GET); - } else { - $data = json_decode($res["result"], true); - $rev_data = json_decode($data["rev_data"], true); - $timestamp_old = $rev_data["last_update"]; - $update_available = ($timestamp_old != $gsheet_last_updated) ? true : false; - $return_data = array("update_available" => $update_available); - $jsonData = json_encode($return_data); - library\CommUtils::echoOrCallback($jsonData, $_GET); - } -} else { - $data = $persistence->getLastVersion($vis_id, $details = false, $context = true)[0]; - $rev_data = json_decode($data["rev_data"], true); - $timestamp_old = $rev_data["last_update"]; - $update_available = ($timestamp_old != $gsheet_last_updated) ? 
true : false; - $return_data = array("update_available" => $update_available); - $jsonData = json_encode($return_data); - library\CommUtils::echoOrCallback($jsonData, $_GET); -} - -?> diff --git a/server/services/createNewGSheet.php b/server/services/createNewGSheet.php deleted file mode 100644 index 9d7fc3f5b..000000000 --- a/server/services/createNewGSheet.php +++ /dev/null @@ -1,45 +0,0 @@ - $sheet_name, - "project_name" => $project_name, - "main_curator_email" => $main_curator_email, - "knowledge_base_template_id" => $knowledge_base_template_id)); -$res = $apiclient->call_api("/gsheets" . "/createKnowledgebase", $payload); -if ($res["httpcode"] != 200) { - echo json_encode($res); -} else { - $result = json_decode($res["result"], true); - echo $result; -} - -?> diff --git a/server/services/getGSheetsMap.php b/server/services/getGSheetsMap.php deleted file mode 100644 index 896d9d356..000000000 --- a/server/services/getGSheetsMap.php +++ /dev/null @@ -1,63 +0,0 @@ - $vis_id, "details" => false, "context" => true)); - $res = $apiclient->call_persistence("getLastVersion", $payload); - if ($res["httpcode"] != 200) { - library\CommUtils::echoOrCallback($res, $_GET); - } else { - $data = json_decode($res["result"], true); - $rev_data = json_decode($data["rev_data"], true); - $context = array("id" => $data["rev_vis"], - "query" => $data["vis_query"], - "service" => $data["vis_title"], - "timestamp" => $data["rev_timestamp"], - "params" => $data["vis_params"], - "sheet_id" => $rev_data["sheet_id"], - "last_update" => $rev_data["last_update"]); - if (array_key_exists("additional_context", $rev_data)) { - $context = array_merge($context, $rev_data["additional_context"]); - } - $return_data = array("context" => $context, - "data" => $rev_data["data"], - "errors" => $rev_data["errors"]); - $jsonData = json_encode($return_data); - library\CommUtils::echoOrCallback($jsonData, $_GET); - } -} else { - $data = $persistence->getLastVersion($vis_id, $details = false, $context = true)[0]; - $rev_data = json_decode($data["rev_data"], true); - $context = array("id" => $data["rev_vis"], - "query" => $data["vis_query"], - "service" => $data["vis_title"], - "timestamp" => $data["rev_timestamp"], - "params" => $data["vis_params"], - "sheet_id" => $rev_data["sheet_id"], - "last_update" => $rev_data["last_update"]); - if (array_key_exists("additional_context", $rev_data)) { - $context = array_merge($context, $rev_data["additional_context"]); - } - $return_data = array("context" => $context, - "data" => $rev_data["data"], - "errors" => $rev_data["errors"]); - $jsonData = json_encode($return_data); - library\CommUtils::echoOrCallback($jsonData, $_GET); -} diff --git a/server/services/updateGSheetsMap.php b/server/services/updateGSheetsMap.php deleted file mode 100644 index 333ffcaac..000000000 --- a/server/services/updateGSheetsMap.php +++ /dev/null @@ -1,44 +0,0 @@ - $dirty_query, "sheet_id" => $sheet_id, "sheet_range" => "Resources!A1:AG200"); -if(isset($last_update)) { - $params["last_update"] = $last_update; -} - -$result = search("gsheets", $dirty_query - , $params, array("sheet_id") - , false - , false, null - , $sheet_id, false); - -echo $result; - -?> From 70d64ec5583a4ad340180b1a087bf087a0c765d8 Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 20 Jun 2024 13:20:18 +0200 Subject: [PATCH 07/75] orcid wip --- server/services/searchORCID.php | 38 ++++++++++++++++ server/workers/api/src/apis/orcid.py | 66 ++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 
server/services/searchORCID.php create mode 100644 server/workers/api/src/apis/orcid.py diff --git a/server/services/searchORCID.php b/server/services/searchORCID.php new file mode 100644 index 000000000..a820eb57c --- /dev/null +++ b/server/services/searchORCID.php @@ -0,0 +1,38 @@ + diff --git a/server/workers/api/src/apis/orcid.py b/server/workers/api/src/apis/orcid.py new file mode 100644 index 000000000..3342359f8 --- /dev/null +++ b/server/workers/api/src/apis/orcid.py @@ -0,0 +1,66 @@ +import os +import json +import uuid +import pandas as pd +import time + +from flask import request, make_response, jsonify, abort, g +from flask_restx import Namespace, Resource, fields +from .request_validators import SearchParamSchema +from apis.utils import get_key, redis_store + +orcid_ns = Namespace("orcid", description="ORCiD API operations") +search_param_schema = SearchParamSchema() + + +orcid_querymodel = orcid_ns.model("SearchQuery", + {"q": fields.String(example='', + description='query string', + required=True), + "orcid": fields.String(example='1234-5678-9012-3456', + description='ORCiD iD', + required=True)}) + + + +@orcid_ns.route('/search') +class Search(Resource): + @orcid_ns.doc(responses={200: 'OK', + 400: 'Invalid search parameters'}) + @orcid_ns.expect(orcid_querymodel) + @orcid_ns.produces(["application/json", "text/csv"]) + def post(self): + """ + """ + params = request.get_json() + orcid_ns.logger.debug(params) + if "optradio" in params: + del params["optradio"] + errors = search_param_schema.validate(params, partial=True) + orcid_ns.logger.debug(errors) + if errors: + abort(400, str(errors)) + k = str(uuid.uuid4()) + d = {"id": k, "params": params, + "endpoint": "search"} + orcid_ns.logger.debug(d) + redis_store.rpush("orcid", json.dumps(d)) + q_len = redis_store.llen("orcid") + orcid_ns.logger.debug("Queue length: %s %d %s" %("orcid", q_len, k)) + result = get_key(redis_store, k, 300) + try: + headers = {} + if request.headers["Accept"] == "application/json": + headers["Content-Type"] = "application/json" + return make_response(result, + 200, + headers) + except Exception as e: + orcid_ns.logger.error(e) + abort(500, "Problem encountered, check logs.") + +@orcid_ns.route('/service_version') +class ServiceVersion(Resource): + def get(self): + result = {"service_version": os.getenv("SERVICE_VERSION")} + return make_response(result, 200, {"Content-Type": "application/json"}) From 2960aa4c8e8eedef01d054f4850a0bc8d02117b2 Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 20 Jun 2024 13:23:20 +0200 Subject: [PATCH 08/75] gsheets cleanup --- server/services/search.php | 3 +- server/workers/gsheets/Dockerfile | 15 - server/workers/gsheets/example_gsheets.env | 1 - server/workers/gsheets/requirements.txt | 6 - server/workers/gsheets/run_gsheets.py | 17 - server/workers/gsheets/src/search_gsheets.py | 444 ------------------- 6 files changed, 1 insertion(+), 485 deletions(-) delete mode 100644 server/workers/gsheets/Dockerfile delete mode 100644 server/workers/gsheets/example_gsheets.env delete mode 100644 server/workers/gsheets/requirements.txt delete mode 100644 server/workers/gsheets/run_gsheets.py delete mode 100644 server/workers/gsheets/src/search_gsheets.py diff --git a/server/services/search.php b/server/services/search.php index 23808a5d3..31588bfbe 100644 --- a/server/services/search.php +++ b/server/services/search.php @@ -66,8 +66,7 @@ function search($service, $dirty_query , "pubmed" => "PubMed" , "doaj" => "DOAJ" , "base" => "BASE" - , "openaire" => "OpenAire" - , 
"gsheets" => "GSheets"); + , "openaire" => "OpenAire"); $query = ($do_clean_query === true) ? (cleanQuery($dirty_query, $transform_query_tolowercase, $add_slashes)) diff --git a/server/workers/gsheets/Dockerfile b/server/workers/gsheets/Dockerfile deleted file mode 100644 index 235e809c8..000000000 --- a/server/workers/gsheets/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM python:3.7 - -MAINTAINER Chris Kittel "christopher.kittel@openknowledgemaps.org" - -RUN apt update -RUN apt-get install -y gcc - -WORKDIR /headstart -COPY workers/gsheets/requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt -COPY workers/gsheets/src/ ./gsheets/src -COPY workers/gsheets/run_gsheets.py . -COPY workers/gsheets/token.pickle ./gsheets - -ENTRYPOINT python run_gsheets.py diff --git a/server/workers/gsheets/example_gsheets.env b/server/workers/gsheets/example_gsheets.env deleted file mode 100644 index cd4767a1e..000000000 --- a/server/workers/gsheets/example_gsheets.env +++ /dev/null @@ -1 +0,0 @@ -LOGLEVEL=DEBUG diff --git a/server/workers/gsheets/requirements.txt b/server/workers/gsheets/requirements.txt deleted file mode 100644 index fd3c0b49f..000000000 --- a/server/workers/gsheets/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -redis -google-api-python-client -google-auth-httplib2 -google-auth-oauthlib -pandas -pandas_schema diff --git a/server/workers/gsheets/run_gsheets.py b/server/workers/gsheets/run_gsheets.py deleted file mode 100644 index 63b1aef09..000000000 --- a/server/workers/gsheets/run_gsheets.py +++ /dev/null @@ -1,17 +0,0 @@ -import os -import json -import redis -from gsheets.src.search_gsheets import GSheetsClient - - -if __name__ == '__main__': - redis_config = { - "host": os.getenv("REDIS_HOST"), - "port": os.getenv("REDIS_PORT"), - "db": os.getenv("REDIS_DB"), - "password": os.getenv("REDIS_PASSWORD"), - "client_name": "gsheets_retrieval" - } - redis_store = redis.StrictRedis(**redis_config) - gc = GSheetsClient(redis_store, os.environ.get("LOGLEVEL", "INFO")) - gc.run() diff --git a/server/workers/gsheets/src/search_gsheets.py b/server/workers/gsheets/src/search_gsheets.py deleted file mode 100644 index 8790d7f07..000000000 --- a/server/workers/gsheets/src/search_gsheets.py +++ /dev/null @@ -1,444 +0,0 @@ -import os -import pathlib -import sys -import json -import time -import pickle -import uuid -import logging -from dateutil.parser import parse - -import redis - -from googleapiclient.discovery import build -from google_auth_oauthlib.flow import InstalledAppFlow -from google.auth.transport.requests import Request - -import pandas as pd -from pandas_schema import Column, Schema -from pandas_schema.validation import (InListValidation, - DateFormatValidation) - - -formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s', - datefmt='%Y-%m-%d %H:%M:%S') - - -def get_key(store, key): - while True: - res = store.get(key+"_output") - if res is None: - time.sleep(0.5) - else: - result = json.loads(res.decode('utf-8')) - store.delete(key) - store.delete(key+"_output") - break - return result - - -additional_context_fields = [ - "Project name", "Project website", "Topic", - "Main curator name", "Main curator e-mail" -] - - -schema = Schema([ - Column('ID', []), - Column('Title', []), - Column('Authors', []), - Column('Publication Venue', []), - Column('Publication Date', [DateFormatValidation("%Y-%m-%d")]), - Column('Abstract', []), - Column('Link to PDF', []), - Column('Type', []), - Column('Keywords', []), - Column('Tags', []), - Column('Access', []), - 
Column('Area', []), - Column('Comment 1', []), - Column('Author Comment 1', []), - Column('Comment 2', []), - Column('Author Comment 2', []), - Column('Comment 3', []), - Column('Author Comment 3', []), - Column('Comment 4', []), - Column('Author Comment 4', []) -]) - - -def process_comments(row): - row = row.tolist() - comments = [] - for i in range(0, len(row)-1, 2): - com = row[i] - aut = row[i+1] - if com is not None: - if aut is None: - aut = "" - comments.append({"comment": com, - "author": aut}) - return comments - - -class GSheetsClient(object): - - def __init__(self, redis_store, loglevel="INFO"): - # If modifying these scopes, delete the file token.pickle. - self.SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly', - 'https://www.googleapis.com/auth/drive.metadata.readonly', - 'https://www.googleapis.com/auth/drive'] - self.redis_store = redis_store - self.register_services() - self.last_updated = {} - self.logger = logging.getLogger(__name__) - self.logger.setLevel(loglevel) - handler = logging.StreamHandler(sys.stdout) - handler.setFormatter(formatter) - handler.setLevel(loglevel) - self.logger.addHandler(handler) - self.get_startPageToken() - - def authenticate(self): - creds = None - try: - tokenpath = os.path.join(pathlib.Path(__file__).parent.parent, "token.pickle") - credentialspath = os.path.join(pathlib.Path(__file__).parent.parent, "credentials.json") - except NameError: - tokenpath = os.path.join(os.getcwd(), "token.pickle") - credentialspath = os.path.join(os.getcwd(), "credentials.json") - if os.path.exists(tokenpath): - with open(tokenpath, 'rb') as token: - creds = pickle.load(token) - if not creds or not creds.valid: - if creds and creds.expired and creds.refresh_token: - creds.refresh(Request()) - else: - flow = InstalledAppFlow.from_client_secrets_file( - credentialspath, self.SCOPES) - creds = flow.run_local_server(port=0) - # Save the credentials for the next run - with open(tokenpath, 'wb') as token: - pickle.dump(creds, token) - return creds - - def register_services(self): - self.gsheets_service = build('sheets', 'v4', credentials=self.authenticate()) - self.drive_service = build('drive', 'v3', credentials=self.authenticate()) - self.sheet = self.gsheets_service.spreadsheets() - self.files = self.drive_service.files() - - def get_startPageToken(self): - res = self.drive_service.changes().getStartPageToken().execute() - self.startPageToken = res.get('startPageToken') - - def get_currentPageToken(self, sheet_id): - pageToken = self.last_updated.get(sheet_id).get("pageToken") if sheet_id in self.last_updated else self.startPageToken - return pageToken if pageToken is not None else self.startPageToken - - def get_changes(self, pageToken): - res = self.drive_service.changes().list(pageToken=pageToken, - spaces='drive').execute() - return res - - def sheet_has_changed(self, sheet_id): - self.logger.debug(self.last_updated) - pageToken = self.get_currentPageToken(sheet_id) - self.logger.debug(pageToken) - res = self.get_changes(pageToken) - if res is not None: - changes = res.get('changes') - while "nextPageToken" in res: - res = self.get_changes(res.get('nextPageToken')) - changes.extend(res.get('changes')) - filtered_changes = [c for c in changes if c.get('fileId') == sheet_id] - if len(filtered_changes) != 0: - self.logger.debug(filtered_changes) - last_change = filtered_changes[-1] - self.last_updated[sheet_id]["pageToken"] = res.get('newStartPageToken') - self.logger.debug(self.last_updated) - d = parse(last_change.get('time')) - 
last_update_timestamp_utc = d.strftime("%Y-%m-%d %H:%M:%S %Z") - self.last_updated[sheet_id]["timestamp_utc"] = last_update_timestamp_utc - return True - else: - return False - else: - return False - - def next_item(self): - queue, msg = self.redis_store.blpop("gsheets") - msg = json.loads(msg) - k = msg.get('id') - params = msg.get('params') - params["service"] = "gsheets" - endpoint = msg.get('endpoint') - return k, params, endpoint - - def get_sheet_content(self, sheet_id, sheet_range): - res = self.sheet.values().get(spreadsheetId=sheet_id, - range=sheet_range).execute() - raw = pd.DataFrame(res.get('values')) - return raw - - @staticmethod - def validate_data(df): - """ - errors: [ - { - row: 31, - column: "Publication date", - reason: "\"2020-04-0\" does not match the date format string \"%Y-%m-%d\"", - data: {Abstract: "Now testing changes API, looks good, does it register two changes?" ...} - }, - ] - """ - df.columns = df.iloc[0] - df.drop([0, 1, 2], inplace=True) - df = df[(df["Ready for publication?"] == "yes") & - (df["Included in map"] == "yes")] - errors = schema.validate(df, columns=schema.get_column_names()) - errors_index_rows = [e.row for e in errors] - error_columns = [e.column for e in errors] - error_reasons = [" ".join([str(e.value), str(e.message)]) for e in errors] - if errors_index_rows == [-1]: - clean_df = df - errors_df = pd.DataFrame(columns=["row", "column", "reason", "data"]) - else: - clean_df = df.drop(index=errors_index_rows) - errors_df = pd.DataFrame(columns=["row", "column", "reason", "data"]) - errors_df["row"] = errors_index_rows - errors_df["row"] += 1 # to align with google sheet rows - errors_df["column"] = error_columns - errors_df["reason"] = error_reasons - errors_df["data"] = df.loc[errors_index_rows].to_dict(orient="records") - return clean_df, errors, errors_df - - @staticmethod - def post_process(clean_df, result_df): - sorter = clean_df["ID"] - sorterIndex = dict(zip(sorter, range(len(sorter)))) - result_df["orig_order"] = result_df["id"].map(sorterIndex) - result_df.sort_values(["orig_order"], ascending=[True], inplace=True) - result_df.drop("orig_order", axis=1, inplace=True) - result_df.index = clean_df.index - result_df["area"] = clean_df.Area.map(lambda x: x.strip()) - uris = {a: i for i, a in enumerate(result_df.area.unique())} - result_df["area_uri"] = result_df.area.map(lambda x: uris.get(x)) - oa_mapper = {"closed": 0, - "open": 1, - "unknown": 2, - "free": 3} - result_df["oa_state"] = result_df["oa_state"].map(lambda x: oa_mapper.get(x, 2)) - return result_df - - @staticmethod - def create_input_data(df): - metadata = pd.DataFrame() - metadata["id"] = df.ID - metadata["title"] = df.Title - metadata["authors"] = df.Authors - metadata["paper_abstract"] = df.Abstract.map(lambda x: x.replace("N/A", "") if isinstance(x, str) else "") - metadata["published_in"] = df["Publication Venue"] - metadata["year"] = df["Publication Date"] - metadata["url"] = df.ID - metadata["readers"] = 0 - metadata["subject_orig"] = df.Keywords - metadata["subject"] = metadata["subject_orig"] - metadata["oa_state"] = df.Access - metadata["link"] = df["Link to PDF"].map(lambda x: x.replace("N/A", "") if isinstance(x, str) else "") - metadata["relevance"] = df.index - metadata["comments"] = df.iloc[:, 16:25].apply(lambda x: process_comments(x), axis=1) - metadata["tags"] = df.Tags.map(lambda x: x.replace("N/A", "") if isinstance(x, str) else "") - metadata["resulttype"] = df.Type - if "Contact e-mail" in df: - metadata["author_email"] = df["Contact 
e-mail"].map(lambda x: x.replace("N/A", "") if isinstance(x, str) else "") - text = pd.DataFrame() - text["id"] = metadata["id"] - text["content"] = metadata.apply(lambda x: ". ".join(x[["title", - "paper_abstract", - "subject"]]), axis=1) - input_data = {} - input_data["metadata"] = metadata.to_json(orient='records') - input_data["text"] = text.to_json(orient='records') - return input_data - - def get_additional_context_data(self, df): - df.columns = df.iloc[0] - df.drop([0], inplace=True) - if all(field in df.columns for field in additional_context_fields): - additional_context = df[additional_context_fields].iloc[0].to_dict() - for k in additional_context_fields: - additional_context[k.lower().replace(" ", "_").replace("-", "")] = additional_context.pop(k) - return additional_context - else: - return None - - def get_spreadsheet_title(self, sheet_id): - res = self.sheet.get(spreadsheetId=sheet_id, fields='properties/title').execute() - return res.get('properties').get('title') - - def get_new_mapdata(self, sheet_id, sheet_range, params): - raw = self.get_sheet_content(sheet_id, sheet_range) - clean_df, errors, errors_df = self.validate_data(raw.copy()) - input_data = self.create_input_data(clean_df) - map_k = str(uuid.uuid4()) - map_input = {} - map_input["id"] = map_k - map_input["input_data"] = input_data - map_input["params"] = params - self.redis_store.rpush("input_data", json.dumps(map_input)) - result = get_key(self.redis_store, map_k) - result_df = self.post_process(clean_df, pd.DataFrame.from_records(json.loads(result))) - res = {} - res["data"] = result_df.to_json(orient="records") - res["errors"] = errors_df.to_dict(orient="records") - additional_context = self.get_additional_context_data(raw.copy()) - if additional_context: - res["additional_context"] = additional_context - res["additional_context"]["query"] = additional_context["topic"] - else: - # inject CoVis multi-map title from sheet title - res["additional_context"] = {} - res["additional_context"]["query"] = self.get_spreadsheet_title(sheet_id) - res["sheet_id"] = sheet_id - res["last_update"] = self.last_updated.get(sheet_id, {}).get("timestamp_utc") - return res - - def update(self, params): - res = {"status": "No update required"} - sheet_id = params.get('sheet_id') - sheet_range = params.get('sheet_range') - last_known_update = params.get('last_update') - if sheet_id not in self.last_updated: - self.last_updated[sheet_id] = {} - last_change = self.files.get(fileId=sheet_id, - fields='modifiedTime', - supportsAllDrives=True).execute().get('modifiedTime') - d = parse(last_change) - last_update_timestamp_utc = d.strftime("%Y-%m-%d %H:%M:%S %Z") - self.last_updated[sheet_id]["timestamp_utc"] = last_update_timestamp_utc - sheet_has_changed = self.sheet_has_changed(sheet_id) - if (last_known_update is not None - and last_known_update != self.last_updated[sheet_id]["timestamp_utc"]): - res = self.get_new_mapdata(sheet_id, sheet_range, params) - if sheet_has_changed is True: - res = self.get_new_mapdata(sheet_id, sheet_range, params) - return res - - def create_knowledgebase(self, params): - try: - sheet_name = params.get('sheet_name') - project_name = params.get('project_name') - main_curator_email = params.get('main_curator_email') - knowledge_base_template_id = params.get('knowledge_base_template_id') - new_drive = self.create_new_drive(project_name) - new_drive_id = new_drive.get('id') - new_kb = self.duplicate_knowledgebase( - knowledge_base_template_id, sheet_name, - new_drive_id) - 
self.set_new_kb_permissions(new_drive, new_kb, main_curator_email) - self.prefill_additional_context(new_kb, params) - res = {"status": "success"} - except Exception as e: - res = {"status": "error", "msg": str(e)} - return res - - def create_new_drive(self, project_name): - drive_metadata = {'name': project_name} - request_id = str(uuid.uuid4()) - new_drive = self.drive_service.drives().create(body=drive_metadata, - requestId=request_id, - fields='id').execute() - return new_drive - - def duplicate_knowledgebase(self, knowledge_base_template_id, sheet_name, - target_folder_id): - file_metadata = {'name': sheet_name, 'parents': [target_folder_id]} - new_kb = self.files.copy(fileId=knowledge_base_template_id, - body=file_metadata, - supportsAllDrives=True).execute() - return new_kb - - def prefill_additional_context(self, new_kb, params): - context_range = 'Resources!Y2:AC2' - value_input_option = 'RAW' # USER_ENTERED - values = [ - [params.get('project_name', ''), - params.get('project_website', ''), - params.get('topic', ''), - params.get('main_curator_name', ''), - params.get('main_curator_email', '')] - ] - body = {'values': values} - result = self.sheet.values().update( - spreadsheetId=new_kb.get('id'), range=context_range, - valueInputOption=value_input_option, body=body).execute() - - - - def set_new_kb_permissions(self, new_drive, new_kb, main_curator_email): - # set folder rights for okmaps - new_domain_permission = { - 'type': 'domain', - 'role': 'organizer', - 'domain': 'openknowledgemaps.org' - } - permission = self.drive_service.permissions().create( - fileId=new_drive.get('id'), - body=new_domain_permission, - supportsAllDrives=True - ).execute() - # set folder rights for main curator - new_organizer_permission = { - 'type': 'user', - 'role': 'organizer', - 'emailAddress': main_curator_email - } - permission = self.drive_service.permissions().create( - fileId=new_drive.get('id'), - body=new_organizer_permission, - supportsAllDrives=True - ).execute() - # set file rights for main curator - new_fileorganizer_permission = { - 'type': 'user', - 'role': 'writer', - 'emailAddress': main_curator_email - } - permission = self.drive_service.permissions().create( - fileId=new_kb.get('id'), - body=new_fileorganizer_permission, - supportsAllDrives=True - ).execute() - # set file rights for info okmaps - new_fileorganizer_permission = { - 'type': 'user', - 'role': 'writer', - 'emailAddress': 'info@openknowledgemaps.org' - } - permission = self.drive_service.permissions().create( - fileId=new_kb.get('id'), - body=new_fileorganizer_permission, - supportsAllDrives=True - ).execute() - - def run(self): - while True: - k, params, endpoint = self.next_item() - self.logger.debug(k) - self.logger.debug(params) - if endpoint == "search": - try: - res = self.update(params) - self.redis_store.set(k+"_output", json.dumps(res)) - except Exception as e: - self.logger.error(e) - self.logger.error(params) - if endpoint == "create_kb": - try: - res = self.create_knowledgebase(params) - self.redis_store.set(k+"_output", json.dumps(res)) - except Exception as e: - self.logger.error(e) - self.logger.error(params) From 4cb0132455f01d342be7f4665463a30faa80a356 Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 20 Jun 2024 13:38:14 +0200 Subject: [PATCH 09/75] gsheets cleanup --- examples/gsheets/config.example.php | 10 -- examples/gsheets/data-config.js | 46 ------ examples/gsheets/index.php | 214 ---------------------------- examples/gsheets/map.css | 64 --------- 4 files changed, 334 deletions(-) delete mode 
100644 examples/gsheets/config.example.php delete mode 100644 examples/gsheets/data-config.js delete mode 100644 examples/gsheets/index.php delete mode 100644 examples/gsheets/map.css diff --git a/examples/gsheets/config.example.php b/examples/gsheets/config.example.php deleted file mode 100644 index d1ba22a19..000000000 --- a/examples/gsheets/config.example.php +++ /dev/null @@ -1,10 +0,0 @@ - diff --git a/examples/gsheets/data-config.js b/examples/gsheets/data-config.js deleted file mode 100644 index 56fe1a85b..000000000 --- a/examples/gsheets/data-config.js +++ /dev/null @@ -1,46 +0,0 @@ -var data_config = { - tag: "visualization", - mode: "gsheets", - - bubble_min_scale: 1.1, - bubble_max_scale: 1.1, - - paper_min_scale: 1, - - input_format: "json", - use_area_uri: true, - preview_type: "pdf", - use_hypothesis: true, - - show_intro: false, - show_list:true, - is_force_papers: true, - - show_context: true, - create_title_from_context: true, - show_context_timestamp: true, - show_loading_screen: true, - - scale_toolbar: false, - - content_based: true, - is_evaluation: true, - - is_force_areas: true, - area_force_alpha: 0.03, - papers_force_alpha: 0.2, - - language: "eng_gsheets", - - sort_options: ["year", "title", "area"], - filter_options: ["all", "open_access"], - - show_keywords: true, - hide_keywords_overview: false, - show_resulttype: true, - - filter_menu_dropdown: true, - - embed_modal: true, - share_modal: false -}; diff --git a/examples/gsheets/index.php b/examples/gsheets/index.php deleted file mode 100644 index 6e485b78a..000000000 --- a/examples/gsheets/index.php +++ /dev/null @@ -1,214 +0,0 @@ - - - - - - - - - - - Knowledge Map of <?php echo $topic ?> - - - -
-
- - - - - - - - - - - diff --git a/examples/gsheets/map.css b/examples/gsheets/map.css deleted file mode 100644 index 873caea5d..000000000 --- a/examples/gsheets/map.css +++ /dev/null @@ -1,64 +0,0 @@ -.errors-container { - margin: 20px 10px; - display: none; - padding-left: 50px; -} - -.show-errors { - display: block; -} - -.topheader { - padding-top: 59px; -} - -.error-row-top, .error-row-entry { - padding: 5px; -} - -.errors-info { - line-height: 110% !important; - z-index: 99; - font-family: 'Lato', sans-serif !important; - position: relative; -} - -.errors-info, .expand-icon { - cursor: pointer; - text-decoration: underline; -} - -.errors-table-hidden { - display: none; -} - -.dismiss-reload { - color: white; - text-decoration: underline; -} - -.dismiss-reload:hover { - color: white; - text-decoration: none; -} - -.show-reload-button { - display: block; -} - -.hide-reload-text { - display: none; -} - -@media screen and (max-width: 1150px) { - .reload-button { - padding: 10px 12px; - max-width: 180px; - } -} - -@media screen and (max-width: 640px) { - .errors-container { - padding-left: 0px; - } -} \ No newline at end of file From 6e7a1d623f20c13afd597e11a91f4b76e0bdc6e9 Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 20 Jun 2024 13:46:06 +0200 Subject: [PATCH 10/75] gsheets cleanup --- server/workers/api/src/apis/gsheets.py | 101 ------------------ .../dataprocessing/fetchers/FetcherFactory.js | 3 - .../dataprocessing/fetchers/GsheetsFetcher.js | 21 ---- vis/js/default-config.js | 89 --------------- vis/js/templates/modals/InfoModal.jsx | 2 - vis/test/component/modals.test.js | 88 --------------- 6 files changed, 304 deletions(-) delete mode 100644 server/workers/api/src/apis/gsheets.py delete mode 100644 vis/js/dataprocessing/fetchers/GsheetsFetcher.js diff --git a/server/workers/api/src/apis/gsheets.py b/server/workers/api/src/apis/gsheets.py deleted file mode 100644 index e37928420..000000000 --- a/server/workers/api/src/apis/gsheets.py +++ /dev/null @@ -1,101 +0,0 @@ -import os -import json -import uuid -from datetime import datetime - -from flask import request, make_response, jsonify, abort, render_template -from flask_restx import Namespace, Resource, fields -from apis.utils import get_key, redis_store - -gsheets_ns = Namespace("google_sheets", description="Google Sheets API operations") - - -search_query = gsheets_ns.model("SearchQuery", - {"sheet_id": fields.String(example='1csx2x9DxoEd8Bi67mGxKkAVMB8d_A', - description='sheet ID to update', - required=True), - "sheet_range": fields.String(example="Resources!A1:N200", - description="Sheet name and data range to retrieve from", - required=True), - "q": fields.String(example='covid19', - description='hardcoded vis name'), - "last_update": fields.String(example='2020-07-20 11:47:50 UTC', - description='timestamp of last known GSheets edit')}) - - -create_kb_query = gsheets_ns.model("CreateKnowledgebaseQuery", - {"sheet_name": fields.String(example='CoVis', - description='Title of the knowledge base', - required=True), - "project_name": fields.String(example="Collaborative Visualization", - description="Project Name", - required=True), - "main_curator_email": fields.String(example='name@gmail.com', - description='gmail account of main curator', - required=True), - "knowledge_base_template_id": fields.String(example='1q0947856', - description='google drive ID of source template', - required=True)}) - - -@gsheets_ns.route('/search') -class Search(Resource): - @gsheets_ns.doc(responses={200: 'OK', - 400: 'Invalid search 
parameters'}) - @gsheets_ns.expect(search_query) - @gsheets_ns.produces(["application/json"]) - def post(self): - """ - """ - params = request.get_json() - # fill default params - params["vis_type"] = "overview" - params["service"] = "gsheets" - gsheets_ns.logger.debug(params) - k = str(uuid.uuid4()) - d = {"id": k, "params": params, - "endpoint": "search"} - gsheets_ns.logger.debug(d) - redis_store.rpush("gsheets", json.dumps(d)) - result = get_key(redis_store, k) - try: - headers = {} - headers["Content-Type"] = "application/json" - return make_response(result, - 200, - headers) - except Exception as e: - gsheets_ns.logger.error(e) - abort(500, "Problem encountered during processing, sorry.") - - -@gsheets_ns.route('/createKnowledgebase') -class createKnowledgebase(Resource): - @gsheets_ns.doc(responses={200: 'OK', - 400: 'Invalid parameters'}) - @gsheets_ns.expect(create_kb_query) - @gsheets_ns.produces(["application/json"]) - def post(self): - params = request.get_json() - gsheets_ns.logger.debug(params) - k = str(uuid.uuid4()) - d = {"id": k, "params": params, - "endpoint": "create_kb"} - gsheets_ns.logger.debug(d) - redis_store.rpush("gsheets", json.dumps(d)) - result = get_key(redis_store, k) - try: - headers = {} - headers["Content-Type"] = "application/json" - return make_response(result, - 200, - headers) - except Exception as e: - gsheets_ns.logger.error(e) - abort(500, "Problem encountered during processing, sorry.") - -@gsheets_ns.route('/service_version') -class ServiceVersion(Resource): - def get(self): - result = {"service_version": os.getenv("SERVICE_VERSION")} - return make_response(result, 200, {"Content-Type": "application/json"}) \ No newline at end of file diff --git a/vis/js/dataprocessing/fetchers/FetcherFactory.js b/vis/js/dataprocessing/fetchers/FetcherFactory.js index ee4d436f1..a1a7d6bfc 100644 --- a/vis/js/dataprocessing/fetchers/FetcherFactory.js +++ b/vis/js/dataprocessing/fetchers/FetcherFactory.js @@ -1,5 +1,4 @@ import Fetcher from "./Fetcher"; -import GsheetsFetcher from "./GsheetsFetcher"; import LocalFetcher from "./LocalFetcher"; import ServerFetcher from "./ServerFetcher"; @@ -8,8 +7,6 @@ class FetcherFactory { switch (type) { case "local_files": return new LocalFetcher(config); - case "gsheets": - return new GsheetsFetcher(config); case "search_repos": return new ServerFetcher(config); default: diff --git a/vis/js/dataprocessing/fetchers/GsheetsFetcher.js b/vis/js/dataprocessing/fetchers/GsheetsFetcher.js deleted file mode 100644 index 49a82760e..000000000 --- a/vis/js/dataprocessing/fetchers/GsheetsFetcher.js +++ /dev/null @@ -1,21 +0,0 @@ -import Fetcher from "./Fetcher"; - -class GsheetsFetcher extends Fetcher { - async getData() { - const url = - this.config.serverUrl + - "services/getGSheetsMap.php?vis_id=" + - this.config.files[0].file + - "&q=" + - this.config.files[0].title + - "&context=true&streamgraph=" + - this.config.isStreamgraph; - - const response = await fetch(url); - const data = await response.json(); - - return data; - } -} - -export default GsheetsFetcher; diff --git a/vis/js/default-config.js b/vis/js/default-config.js index 57662d598..9dce4fea4 100644 --- a/vis/js/default-config.js +++ b/vis/js/default-config.js @@ -910,95 +910,6 @@ var config = { please_note: "Please note", citation_warning: "we were not able to verify whether this citation is formatted correctly based on the metadata received. Please check before reuse.", }, - eng_gsheets: { - loading: "Updating and retrieving map. 
This may take a few seconds, please hold on.", - search_placeholder: "Search within visualization...", - show_list: "Show list", - hide_list: "Hide list", - intro_label: "[more info]", - relevance: "relevance", - readers: "citations", - year: "date", - authors: "authors", - title: "title", - area: "Area", - backlink: "← Back to overview", - backlink_list: "Show all documents in area", - backlink_list_streamgraph: "Show all documents", - backlink_list_streamgraph_stream_selected: "Show all documents in stream", - keywords: "Keywords", - doctypes: "Document type(s)", - unknown: "Unknown", - no_keywords: "not available", - not_available: "not available", - no_title: "No title", - overview_label: 'Knowledge Map of', - streamgraph_label: 'Streamgraph of', - overview_authors_label: 'Overview of the works of', - streamgraph_authors_label: 'Streamgraph of the works of', - articles_label: 'resources', - most_recent_label: 'most recent', - most_relevant_label: 'most relevant', - most_relevant_tooltip: 'To determine the most relevant documents, we use the relevance ranking provided by the data source e.g. BASE. Data sources mainly use text similarity between your query and the article metadata to determine the relevance ranking. Please consult the FAQs for more information.', - most_relevant_tooltip_sg: "In this streamgraph you find the most relevant documents matching your query related to the top keywords. To determine the most relevant documents, we use the relevance ranking provided by the data source e.g. BASE. Data sources mainly use text similarity between your query and the article metadata to determine the relevance ranking. Please consult the FAQs for more information.", - source_label: 'Data source', - resulttype_label: 'Document type', - documenttypes_label: 'Document types', - timestamp_label: 'Last updated', - documenttypes_tooltip: 'The following document types were taken into consideration in the creation of this visualization (not all of them may appear in the visualization):', - default_area: "No area", - default_author: "", - default_id: "defaultid", - default_hash: "hashHash", - default_abstract: "No abstract available", - default_paper_title: "No title available", - default_authors: "No authors available", - default_published_in: "", - default_readers: 0, - default_url: "", - default_x: 1., - default_y: 1., - default_year: "", - sort_by_label: 'sort by:', - filter_by_label: 'show: ', - all: "any", - open_access: "Open Access", - "Journal Article": "Journal article", - Preprint: "Preprint", - ReFigure: "ReFigure", - Review: "Review", - link: 'link', - items: "items", - comment_by_label: "by", - pdf_not_loaded: "Sorry, we were not able to retrieve the PDF for this publication. You can get it directly from", - pdf_not_loaded_linktext: "this website", - share_button_title: "Share this knowledge map", - share_button_title_sg: "Share this streamgraph", - embed_button_title: "Embed this knowledge map on other websites", - embed_button_title_sg: "Embed this streamgraph on other websites", - embed_button_text: 'Copy', - copied_button_text: 'Copied', - embed_title: 'embed visualization', - embed_body_text: 'You can use this code to embed the visualization on your own website or in a dashboard.', - area_streamgraph: "Stream", - stream_year: "Year", - stream_doc_num: "Number of documents", - stream_docs: "Documents", - stream_total: "Total documents in stream", - empty_area_warning: "No matches found. 
Please reset your filter options above.", - lang_all: "All languages", - cite: "Cite", - cite_title_km: "Cite this knowledge map", - cite_title_sg: "Cite this streamgraph", - citation_template: "Open Knowledge Maps (${year}). ${type} for research on ${query}. Retrieved from ${source} [${date}].", - cite_vis_km: "Please cite this knowledge map as follows", - cite_vis_sg: "Please cite this streamgraph as follows", - cite_paper: "Cite this document as", - export_paper: "Export this document", - download: "Download", - please_note: "Please note", - citation_warning: "we were not able to verify whether this citation is formatted correctly based on the metadata received. Please check before reuse.", - }, }, scale_types: [], diff --git a/vis/js/templates/modals/InfoModal.jsx b/vis/js/templates/modals/InfoModal.jsx index f19ce6edf..5cdec65e8 100644 --- a/vis/js/templates/modals/InfoModal.jsx +++ b/vis/js/templates/modals/InfoModal.jsx @@ -34,8 +34,6 @@ const getInfoTemplate = (service, isStreamgraph, modalType) => { return TripleKMInfo; case "triple_sg": return TripleSGInfo; - case "gsheets": - return GsheetsInfo; case "covis": return CovisInfo; default: diff --git a/vis/test/component/modals.test.js b/vis/test/component/modals.test.js index 6b554b9b8..f5e30f1ae 100644 --- a/vis/test/component/modals.test.js +++ b/vis/test/component/modals.test.js @@ -49,7 +49,6 @@ const setup = (overrideModalsObject = {}, overrideStoreObject = {}) => { reloadLastUpdate: null, apiProperties: { headstartPath: null, - sheetID: null, persistenceBackend: null, }, openInfoModal: false, @@ -742,93 +741,6 @@ describe("Modals component", () => { ); }); - it("gsheets info modal renders", () => { - const storeObject = setup( - { - openInfoModal: true, - infoParams: { - main_curator_name: "John Doe", - main_curator_email: "john@doe.com", - project_name: "John's research", - project_website: null, - sheet_id: "xyz123", - }, - }, - { service: "gsheets" } - ); - const store = mockStore(storeObject); - - act(() => { - render( - - - - - , - container - ); - }); - - expect(document.querySelector("#info-title").textContent).toEqual( - "About this knowledge map" - ); - }); - - it("gsheets info modal renders with project website", () => { - const storeObject = setup( - { - openInfoModal: true, - infoParams: { - main_curator_name: "John Doe", - main_curator_email: "john@doe.com", - project_name: "John's research", - project_website: "johnswebsite.com", - sheet_id: "xyz123", - }, - }, - { service: "gsheets" } - ); - const store = mockStore(storeObject); - - act(() => { - render( - - - - - , - container - ); - }); - - expect(document.querySelector("#info-title").textContent).toEqual( - "About this knowledge map" - ); - }); - - it("covis knowledge map info modal renders", () => { - const storeObject = setup( - { openInfoModal: true }, - { service: "gsheets", isCovis: true, chartType: KNOWLEDGEMAP_MODE } - ); - const store = mockStore(storeObject); - - act(() => { - render( - - - - - , - container - ); - }); - - expect(document.querySelector("#info-title").textContent).toEqual( - "KNOWLEDGE MAP OF COVID-19 RESEARCH CURATED BY EXPERTS" - ); - }); - it("default knowledge map info modal renders", () => { const storeObject = setup( { openInfoModal: true }, From ed5206af6d64aeeb602cdbd030766be654bce828 Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 20 Jun 2024 18:34:40 +0200 Subject: [PATCH 11/75] orcid metadata mapping --- .../other-scripts/test/params_base.json | 2 +- .../other-scripts/test/params_openaire.json | 4 +- 
.../other-scripts/test/test_base.R | 2 +- server/workers/api/src/app.py | 2 +- server/workers/orcid/src/orcid.py | 70 +++++++++++++++---- 5 files changed, 63 insertions(+), 17 deletions(-) diff --git a/server/preprocessing/other-scripts/test/params_base.json b/server/preprocessing/other-scripts/test/params_base.json index c8db5e389..69185a479 100644 --- a/server/preprocessing/other-scripts/test/params_base.json +++ b/server/preprocessing/other-scripts/test/params_base.json @@ -7,5 +7,5 @@ "min_descsize": 300, "limit": 120, "list_size": 100, - "repo": "ftunivbern" + "q_advanced": "dcdoi:(\"10.5281/zenodo.1247473\" OR \"10.5281/zenodo.1065507\" OR \"10.3389/frma.2017.00013\" OR \"10.31263/voebm.v69i3.1733\" OR \"10.12685/027.7-4-2-157\" OR \"10.1007/s11192-016-1887-4\" OR \"10.5281/zenodo.50729\" OR \"10.5281/zenodo.50715\" OR \"10.1515/iwp-2015-0025\" OR \"10.1016/j.joi.2014.12.003\" OR \"10.5281/zenodo.35401\" OR \"10.1504/ijtel.2015.071922\" OR \"10.6084/m9.figshare.1320834\" OR \"10.1007/s11192-014-1365-9\" OR \"10.6084/m9.figshare.1091372\" OR \"10.1145/2494188.2494208\" OR \"10.6084/m9.figshare.156030.v1\" OR \"10.6084/m9.figshare.156030\" OR \"10.6084/m9.figshare.156030.v2\" OR \"10.1145/2187980.2188236\" OR \"10.1007/978-3-642-23985-4_18\" OR \"10.1504/ijtel.2011.045454\" OR \"10.1007/978-3-642-16020-2_59\" OR \"10.1007/978-3-540-88411-8_17\" OR \"10.3217/jucs-016-16-2214\")" } diff --git a/server/preprocessing/other-scripts/test/params_openaire.json b/server/preprocessing/other-scripts/test/params_openaire.json index 3e0b4076c..741aea717 100644 --- a/server/preprocessing/other-scripts/test/params_openaire.json +++ b/server/preprocessing/other-scripts/test/params_openaire.json @@ -1,4 +1,4 @@ -{"funder":"EC", - "project_id":"643410", +{"funder":"ARC", + "project_id":"DP0878177", "list_size": -1, "vis_id": "TEST_ID"} diff --git a/server/preprocessing/other-scripts/test/test_base.R b/server/preprocessing/other-scripts/test/test_base.R index 550684982..4fbe8e4cd 100644 --- a/server/preprocessing/other-scripts/test/test_base.R +++ b/server/preprocessing/other-scripts/test/test_base.R @@ -7,7 +7,7 @@ options(warn=1) wd <- dirname(dirname(rstudioapi::getActiveDocumentContext()$path)) setwd(wd) #Don't forget to set your working directory -query <- "parzival" #args[2] +query <- NULL #args[2] service <- "base" params <- NULL params_file <- "test/params_base.json" diff --git a/server/workers/api/src/app.py b/server/workers/api/src/app.py index 787d42274..34365ab2a 100644 --- a/server/workers/api/src/app.py +++ b/server/workers/api/src/app.py @@ -9,9 +9,9 @@ from apis.base import base_ns from apis.pubmed import pubmed_ns from apis.openaire import openaire_ns +from apis.orcid import orcid_ns from apis.create_vis import vis_ns from apis.export import export_ns -from apis.orcid import orcid_ns class ReverseProxied(object): diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 799013fbe..99cbb0f72 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -20,7 +20,7 @@ class OrcidClient(): - def __init__(self): + def __init__(self) -> None: self.separation = 0.1 self.rate_key = 'orcid-ratelimit' self.ORCID_CLIENT_ID = os.getenv("ORCID_CLIENT_ID") @@ -31,14 +31,14 @@ def __init__(self): else: self.sandbox = False - def authenticate(self): + def authenticate(self) -> str: orcid_auth = OrcidAuthentication(client_id=self.ORCID_CLIENT_ID, client_secret=self.ORCID_CLIENT_SECRET) access_token = orcid_auth.get_public_access_token() return 
access_token - def next_item(self): + def next_item(self) -> tuple: queue, msg = self.redis_store.blpop("orcid") msg = json.loads(msg.decode('utf-8')) k = msg.get('id') @@ -48,7 +48,7 @@ def next_item(self): return k, params, endpoint - def orcid_rate_limit_reached(self): + def orcid_rate_limit_reached(self) -> bool: """ This implementation is inspired by an implementation of Generic Cell Rate Algorithm based rate limiting, @@ -73,7 +73,7 @@ def orcid_rate_limit_reached(self): except LockError: return True - def run(self): + def run(self) -> None: while True: while self.base_rate_limit_reached(): self.logger.debug('🛑 Request is limited') @@ -96,7 +96,7 @@ def run(self): self.logger.error(params) self.logger.error(e) - def execute_search(self, params): + def execute_search(self, params) -> dict: q = params.get('q') service = params.get('service') data = {} @@ -121,22 +121,68 @@ def execute_search(self, params): self.logger.error(e) raise -def retrieve_full_works_metadata(orcid): +def extract_author_info(orcid) -> dict: + personal_details = orcid.personal_details() + orcid_id = orcid._orcid_id + author_name = " ".join( + [personal_details.get("name", {}).get("given-names", {}).get("value", ""), + personal_details.get("name", {}).get("family-name", {}).get("value", "")] + ) + author_keywords = ", ".join(orcid.keywords()[0]) + biography = personal_details.get("biography", {}).get("content", "") \ + if personal_details.get("biography", {}).get("visibility") == "public" \ + else "" + external_identifiers = extract_external_identifiers(orcid) + countries = extract_countries(orcid) + websites = extract_websites(orcid) + author_info = { + "orcid_id": orcid_id, + "author_name": author_name, + "author_keywords": author_keywords, + "biography": biography, + "websites": websites, + "external_identifiers": external_identifiers, + "country": countries + } + return author_info + +def extract_countries(orcid) -> list: + countries = pd.DataFrame(orcid.address()["address"]) + countries = countries[countries["visibility"] == "public"] + countries["country"] = countries["country"].apply(lambda x: x.get("value")) + countries = countries["country"] + return countries.tolist() + +def extract_external_identifiers(orcid) -> list: + external_identifiers = pd.DataFrame(orcid.external_identifiers()["external-identifier"]) + external_identifiers = external_identifiers[external_identifiers["visibility"] == "public"] + external_identifiers["external-id-url"] = external_identifiers["external-id-url"].apply(lambda x: x.get("value")) + external_identifiers = external_identifiers[[ "external-id-type", "external-id-url", "external-id-value", "external-id-relationship"]] + return external_identifiers.to_dict(orient='records') + +def extract_websites(orcid) -> list: + urls = pd.DataFrame(orcid.researcher_urls()["researcher-url"]) + urls = urls[urls["visibility"] == "public"] + urls["url"] = urls["url"].apply(lambda x: x.get("value")) + urls = urls[[ "url-name", "url"]] + return urls.to_dict(orient='records') + +def retrieve_full_works_metadata(orcid) -> pd.DataFrame: raw_works = pd.DataFrame(orcid.works()[1]["group"]).explode("work-summary") works = pd.json_normalize(pd.DataFrame(raw_works["work-summary"])) works["publication-date"] = works.apply(get_publication_date, axis=1) works["doi"] = works.apply(extract_dois, axis=1) return works -def apply_metadata_schema(works): +def apply_metadata_schema(works) -> pd.DataFrame: works.rename(columns=works_mapping, inplace=True) metadata = works return metadata -def 
filter_dicts_by_value(dicts, key, value): +def filter_dicts_by_value(dicts, key, value) -> list: return [d for d in dicts if d.get(key) == value] -def extract_dois(work): +def extract_dois(work) -> str: external_ids = work["external-ids.external-id"] external_ids = external_ids if isinstance(external_ids, list) else [] external_ids = (filter_dicts_by_value( @@ -147,7 +193,7 @@ def extract_dois(work): return doi -def get_publication_date(work): +def get_publication_date(work) -> str: year = work["publication-date.year.value"] month = work["publication-date.month.value"] day = work["publication-date.day.value"] @@ -176,4 +222,4 @@ def get_publication_date(work): "type": "resulttype", "url.value": "link", "journal-title.value": "published_in" -} \ No newline at end of file +} From 27cc689ff5b428a4e17f5923915bff1bb658491e Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 20 Jun 2024 18:52:46 +0200 Subject: [PATCH 12/75] orcid rate limit bugfix --- server/workers/orcid/requirements.txt | 1 + server/workers/orcid/src/orcid.py | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/server/workers/orcid/requirements.txt b/server/workers/orcid/requirements.txt index aabcbd129..7bc223759 100644 --- a/server/workers/orcid/requirements.txt +++ b/server/workers/orcid/requirements.txt @@ -23,3 +23,4 @@ redis==4.3.6 six==1.16.0 typing-extensions==4.1.1 zipp==3.6.0 +pyorcid==1.2.0 \ No newline at end of file diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 99cbb0f72..6cd54258c 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -55,8 +55,8 @@ def orcid_rate_limit_reached(self) -> bool: seen on https://dev.to/astagi/rate-limiting-using-python-and-redis-58gk. It has been simplified and adjusted to our use case. - BASE demands one request per second (1 QPS), per - https://www.base-search.net/about/download/base_interface.pdf + ORCID allows 24 requests per second, with a burst limit of 40 requests. 
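(For orientation, a minimal sketch of the kind of Redis-backed separation check described in the docstring above. It is not the worker's implementation: the real orcid_rate_limit_reached() additionally wraps the check in a Redis lock, as the LockError handling in the surrounding hunks suggests, and uses the worker's configured separation of 0.1 seconds; the sketch also ignores ORCID's burst allowance. Function and variable names below are illustrative only.)

import time
import redis

def rate_limit_reached(store: redis.StrictRedis, key: str, separation: float) -> bool:
    secs, micros = store.time()        # Redis server clock: (seconds, microseconds)
    now = secs + micros / 1e6          # shared notion of "now" across all workers
    raw = store.get(key)
    next_allowed = float(raw.decode()) if raw else 0.0
    if now < next_allowed:
        return True                    # still inside the minimum separation window
    store.set(key, now + separation)   # earliest moment the next request may go out
    return False

# Example polling loop: a separation of 1 / 24 seconds would match ORCID's documented
# 24 requests per second; the worker is more conservative with 0.1 seconds.
# while rate_limit_reached(redis_store, "orcid-ratelimit", 1 / 24):
#     time.sleep(0.01)
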
See also: + https://info.orcid.org/ufaqs/what-are-the-api-limits/ """ t = self.redis_store.time()[0] @@ -75,7 +75,7 @@ def orcid_rate_limit_reached(self) -> bool: def run(self) -> None: while True: - while self.base_rate_limit_reached(): + while self.orcid_rate_limit_reached(): self.logger.debug('🛑 Request is limited') time.sleep(0.1) k, params, endpoint = self.next_item() @@ -104,6 +104,7 @@ def execute_search(self, params) -> dict: orcid_id = params.get("orcid") try: orcid = Orcid(orcid_id=orcid_id, orcid_access_token=self.access_token, state = "public", sandbox=self.sandbox) + author_info = extract_author_info(orcid) works = retrieve_full_works_metadata(orcid) metadata = apply_metadata_schema(works) # in BASE it is ["title", "paper_abstract", "subject_orig", "published_in", "sanitized_authors"] @@ -115,6 +116,8 @@ def execute_search(self, params) -> dict: input_data["text"] = text.to_json(orient='records') res = {} res["input_data"] = input_data + # merge author info into params + params = params.update(author_info) res["params"] = params return res except Exception as e: From df5761758f3cb3684a1f88d4e3c83fa78e3568dd Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 20 Jun 2024 18:57:50 +0200 Subject: [PATCH 13/75] orcid container deployment --- server/workers/orcid/Dockerfile | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/server/workers/orcid/Dockerfile b/server/workers/orcid/Dockerfile index c19b1b035..99ee65ef0 100644 --- a/server/workers/orcid/Dockerfile +++ b/server/workers/orcid/Dockerfile @@ -4,8 +4,17 @@ MAINTAINER Chris Kittel "christopher.kittel@openknowledgemaps.org" RUN apt-get update RUN apt-get install -y gcc git libpq-dev -WORKDIR /orcid +ENV PYTHONPATH="${PYTHONPATH}:/headstart/:/headstart/orcid/:/headstart/orcid/src/" + +WORKDIR /headstart COPY workers/orcid/requirements.txt . 
RUN pip install --upgrade pip RUN pip install --no-cache-dir -r requirements.txt -COPY workers/orcid/src/ ./ \ No newline at end of file + + +COPY workers/common ./common +COPY workers/orcid ./orcid +RUN mkdir -p /var/log/headstart && touch /var/log/headstart/headstart.log + +COPY workers/orcid/*.py ./ +ENTRYPOINT python3 run_orcid.py From ec96de54171a142b7fa30225ab5b4aa4c85c3e17 Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 20 Jun 2024 19:25:33 +0200 Subject: [PATCH 14/75] orcid client initiation bugfix --- server/workers/orcid/run_orcid.py | 2 +- server/workers/orcid/src/orcid.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/server/workers/orcid/run_orcid.py b/server/workers/orcid/run_orcid.py index 71d4e041e..bd9cdc45d 100644 --- a/server/workers/orcid/run_orcid.py +++ b/server/workers/orcid/run_orcid.py @@ -14,7 +14,7 @@ } redis_store = redis.StrictRedis(**redis_config) - wrapper = OrcidClient(None, None, redis_store, + wrapper = OrcidClient(redis_store, "english", os.environ.get("LOGLEVEL", "INFO")) wrapper.run() diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 6cd54258c..89920c483 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -20,7 +20,17 @@ class OrcidClient(): - def __init__(self) -> None: + def __init__(self, redis_store=None, language=None, loglevel="INFO") -> None: + self.redis_store = redis_store + self.default_params = {} + self.default_params["language"] = language + self.logger = logging.getLogger(__name__) + self.logger.setLevel(loglevel) + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter(formatter) + handler.setLevel(loglevel) + self.logger.addHandler(handler) + self.separation = 0.1 self.rate_key = 'orcid-ratelimit' self.ORCID_CLIENT_ID = os.getenv("ORCID_CLIENT_ID") From 15425f8b77cefc8b48c1d4294260004e397dfb19 Mon Sep 17 00:00:00 2001 From: chreman Date: Fri, 21 Jun 2024 15:16:43 +0200 Subject: [PATCH 15/75] orcid integration bugfixes --- examples/search_repos/search.js | 6 +++ server/services/search.php | 3 +- server/workers/api/src/apis/openaire.py | 6 +-- server/workers/api/src/apis/orcid.py | 8 ++-- server/workers/orcid/src/orcid.py | 50 ++++++++++++++++++------- 5 files changed, 52 insertions(+), 21 deletions(-) diff --git a/examples/search_repos/search.js b/examples/search_repos/search.js index e077997b3..bcf8b791f 100644 --- a/examples/search_repos/search.js +++ b/examples/search_repos/search.js @@ -30,6 +30,12 @@ switch (data_config.service) { service_name = "OpenAire"; options = options_base; break; + + case 'orcid': + service_url = data_config.server_url + "services/searchORCID.php" + service_name = "ORCiD"; + options = options_base; + break; } $(window).bind("pageshow", function () { diff --git a/server/services/search.php b/server/services/search.php index 31588bfbe..325f1b976 100644 --- a/server/services/search.php +++ b/server/services/search.php @@ -66,7 +66,8 @@ function search($service, $dirty_query , "pubmed" => "PubMed" , "doaj" => "DOAJ" , "base" => "BASE" - , "openaire" => "OpenAire"); + , "openaire" => "OpenAire" + , "orcid" => "ORCID",); $query = ($do_clean_query === true) ? 
(cleanQuery($dirty_query, $transform_query_tolowercase, $add_slashes)) diff --git a/server/workers/api/src/apis/openaire.py b/server/workers/api/src/apis/openaire.py index 8b727288d..00db702ef 100644 --- a/server/workers/api/src/apis/openaire.py +++ b/server/workers/api/src/apis/openaire.py @@ -49,12 +49,12 @@ def post(self): """ params = request.get_json() openaire_ns.logger.debug(params) - #errors = search_param_schema.validate(params, partial=True) - params["limit"] = 100 - params["list_size"] = -1 + # errors = search_param_schema.validate(params, partial=True) # openaire_ns.logger.debug(errors) # if errors: # abort(400, str(errors)) + params["limit"] = 100 + params["list_size"] = -1 k = str(uuid.uuid4()) d = {"id": k, "params": params, "endpoint": "search"} diff --git a/server/workers/api/src/apis/orcid.py b/server/workers/api/src/apis/orcid.py index 3342359f8..5c54c6860 100644 --- a/server/workers/api/src/apis/orcid.py +++ b/server/workers/api/src/apis/orcid.py @@ -36,10 +36,10 @@ def post(self): orcid_ns.logger.debug(params) if "optradio" in params: del params["optradio"] - errors = search_param_schema.validate(params, partial=True) - orcid_ns.logger.debug(errors) - if errors: - abort(400, str(errors)) + # errors = search_param_schema.validate(params, partial=True) + # orcid_ns.logger.debug(errors) + # if errors: + # abort(400, str(errors)) k = str(uuid.uuid4()) d = {"id": k, "params": params, "endpoint": "search"} diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 89920c483..621301d71 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -1,10 +1,11 @@ import os +import sys import json -import subprocess import pandas as pd import logging from datetime import timedelta from dateutil.parser import parse +from common.decorators import error_logging_aspect import re from redis.exceptions import LockError import time @@ -30,7 +31,7 @@ def __init__(self, redis_store=None, language=None, loglevel="INFO") -> None: handler.setFormatter(formatter) handler.setLevel(loglevel) self.logger.addHandler(handler) - + self.separation = 0.1 self.rate_key = 'orcid-ratelimit' self.ORCID_CLIENT_ID = os.getenv("ORCID_CLIENT_ID") @@ -41,12 +42,15 @@ def __init__(self, redis_store=None, language=None, loglevel="INFO") -> None: else: self.sandbox = False + @error_logging_aspect(log_level=logging.ERROR) def authenticate(self) -> str: - orcid_auth = OrcidAuthentication(client_id=self.ORCID_CLIENT_ID, - client_secret=self.ORCID_CLIENT_SECRET) - access_token = orcid_auth.get_public_access_token() - return access_token - + try: + orcid_auth = OrcidAuthentication(client_id=self.ORCID_CLIENT_ID, + client_secret=self.ORCID_CLIENT_SECRET) + access_token = orcid_auth.get_public_access_token() + return access_token + except Exception as e: + raise e def next_item(self) -> tuple: queue, msg = self.redis_store.blpop("orcid") @@ -57,7 +61,7 @@ def next_item(self) -> tuple: endpoint = msg.get('endpoint') return k, params, endpoint - + @error_logging_aspect(log_level=logging.INFO) def orcid_rate_limit_reached(self) -> bool: """ This implementation is inspired by an implementation of @@ -83,6 +87,7 @@ def orcid_rate_limit_reached(self) -> bool: except LockError: return True + @error_logging_aspect(log_level=logging.ERROR) def run(self) -> None: while True: while self.orcid_rate_limit_reached(): @@ -106,6 +111,7 @@ def run(self) -> None: self.logger.error(params) self.logger.error(e) + @error_logging_aspect(log_level=logging.ERROR) def 
execute_search(self, params) -> dict: q = params.get('q') service = params.get('service') @@ -116,24 +122,29 @@ def execute_search(self, params) -> dict: orcid = Orcid(orcid_id=orcid_id, orcid_access_token=self.access_token, state = "public", sandbox=self.sandbox) author_info = extract_author_info(orcid) works = retrieve_full_works_metadata(orcid) + self.logger.debug(works.columns) metadata = apply_metadata_schema(works) + self.logger.debug(metadata.columns) # in BASE it is ["title", "paper_abstract", "subject_orig", "published_in", "sanitized_authors"] text = pd.concat([metadata.id, metadata[["title", "paper_abstract"]] .apply(lambda x: " ".join(x), axis=1)], axis=1) text.columns = ["id", "content"] + self.logger.debug(metadata.head()) + self.logger.debug(text.head()) input_data = {} input_data["metadata"] = metadata.to_json(orient='records') input_data["text"] = text.to_json(orient='records') res = {} res["input_data"] = input_data # merge author info into params - params = params.update(author_info) + params.update(author_info) res["params"] = params return res except Exception as e: self.logger.error(e) raise +@error_logging_aspect(log_level=logging.ERROR) def extract_author_info(orcid) -> dict: personal_details = orcid.personal_details() orcid_id = orcid._orcid_id @@ -159,6 +170,7 @@ def extract_author_info(orcid) -> dict: } return author_info +@error_logging_aspect(log_level=logging.WARNING) def extract_countries(orcid) -> list: countries = pd.DataFrame(orcid.address()["address"]) countries = countries[countries["visibility"] == "public"] @@ -166,6 +178,7 @@ def extract_countries(orcid) -> list: countries = countries["country"] return countries.tolist() +@error_logging_aspect(log_level=logging.WARNING) def extract_external_identifiers(orcid) -> list: external_identifiers = pd.DataFrame(orcid.external_identifiers()["external-identifier"]) external_identifiers = external_identifiers[external_identifiers["visibility"] == "public"] @@ -173,6 +186,7 @@ def extract_external_identifiers(orcid) -> list: external_identifiers = external_identifiers[[ "external-id-type", "external-id-url", "external-id-value", "external-id-relationship"]] return external_identifiers.to_dict(orient='records') +@error_logging_aspect(log_level=logging.WARNING) def extract_websites(orcid) -> list: urls = pd.DataFrame(orcid.researcher_urls()["researcher-url"]) urls = urls[urls["visibility"] == "public"] @@ -180,13 +194,23 @@ def extract_websites(orcid) -> list: urls = urls[[ "url-name", "url"]] return urls.to_dict(orient='records') +@error_logging_aspect(log_level=logging.ERROR) def retrieve_full_works_metadata(orcid) -> pd.DataFrame: - raw_works = pd.DataFrame(orcid.works()[1]["group"]).explode("work-summary") - works = pd.json_normalize(pd.DataFrame(raw_works["work-summary"])) + works = pd.DataFrame(orcid.works()[1]["group"]).explode("work-summary") + works = pd.json_normalize(works["work-summary"]) works["publication-date"] = works.apply(get_publication_date, axis=1) works["doi"] = works.apply(extract_dois, axis=1) + # THIS IS EMPTY FOR NOW BECAUSE WE DON'T HAVE THIS INFO YET + works["short-description"] = "" + works["subject_orig"] = "" + works["subject_cleaned"] = "" + works["oa_state"] = 2 + works["resulttype"] = works["type"].map(lambda x: [x]) + works["subject"] = "" + works["sanitized_authors"] = "" return works +@error_logging_aspect(log_level=logging.ERROR) def apply_metadata_schema(works) -> pd.DataFrame: works.rename(columns=works_mapping, inplace=True) metadata = works @@ -195,6 +219,7 @@ def 
apply_metadata_schema(works) -> pd.DataFrame: def filter_dicts_by_value(dicts, key, value) -> list: return [d for d in dicts if d.get(key) == value] +@error_logging_aspect(log_level=logging.WARNING) def extract_dois(work) -> str: external_ids = work["external-ids.external-id"] external_ids = external_ids if isinstance(external_ids, list) else [] @@ -205,7 +230,7 @@ def extract_dois(work) -> str: doi = external_ids[0].get("external-id-value", "") if len(external_ids)>0 else "" return doi - +@error_logging_aspect(log_level=logging.WARNING) def get_publication_date(work) -> str: year = work["publication-date.year.value"] month = work["publication-date.month.value"] @@ -232,7 +257,6 @@ def get_publication_date(work) -> str: "short-description": "paper_abstract", "publication-date": "year", "work-contributors": "authors", - "type": "resulttype", "url.value": "link", "journal-title.value": "published_in" } From 3bc9c3b99ffdc502b00f3ebcbf0425446029c631 Mon Sep 17 00:00:00 2001 From: chreman Date: Sat, 22 Jun 2024 01:52:08 +0200 Subject: [PATCH 16/75] metadata sanitization --- server/workers/orcid/src/orcid.py | 70 ++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 621301d71..021c52dfc 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -11,6 +11,7 @@ import time import numpy as np from pyorcid import OrcidAuthentication, Orcid +from typing import Tuple @@ -52,7 +53,7 @@ def authenticate(self) -> str: except Exception as e: raise e - def next_item(self) -> tuple: + def next_item(self) -> Tuple[str, dict, str]: queue, msg = self.redis_store.blpop("orcid") msg = json.loads(msg.decode('utf-8')) k = msg.get('id') @@ -112,7 +113,7 @@ def run(self) -> None: self.logger.error(e) @error_logging_aspect(log_level=logging.ERROR) - def execute_search(self, params) -> dict: + def execute_search(self, params: dict) -> dict: q = params.get('q') service = params.get('service') data = {} @@ -124,6 +125,8 @@ def execute_search(self, params) -> dict: works = retrieve_full_works_metadata(orcid) self.logger.debug(works.columns) metadata = apply_metadata_schema(works) + metadata["authors"] = author_info["author_name"] + metadata = sanitize_metadata(metadata) self.logger.debug(metadata.columns) # in BASE it is ["title", "paper_abstract", "subject_orig", "published_in", "sanitized_authors"] text = pd.concat([metadata.id, metadata[["title", "paper_abstract"]] @@ -145,7 +148,7 @@ def execute_search(self, params) -> dict: raise @error_logging_aspect(log_level=logging.ERROR) -def extract_author_info(orcid) -> dict: +def extract_author_info(orcid: Orcid) -> dict: personal_details = orcid.personal_details() orcid_id = orcid._orcid_id author_name = " ".join( @@ -154,7 +157,7 @@ def extract_author_info(orcid) -> dict: ) author_keywords = ", ".join(orcid.keywords()[0]) biography = personal_details.get("biography", {}).get("content", "") \ - if personal_details.get("biography", {}).get("visibility") == "public" \ + if (personal_details.get("biography") and personal_details.get("biography", {}).get("visibility") == "public" )\ else "" external_identifiers = extract_external_identifiers(orcid) countries = extract_countries(orcid) @@ -171,31 +174,45 @@ def extract_author_info(orcid) -> dict: return author_info @error_logging_aspect(log_level=logging.WARNING) -def extract_countries(orcid) -> list: +def sanitize_metadata(metadata: pd.DataFrame) -> pd.DataFrame: + metadata["id"] = 
metadata["id"].astype(str) + return metadata + +@error_logging_aspect(log_level=logging.WARNING) +def extract_countries(orcid: Orcid) -> list: countries = pd.DataFrame(orcid.address()["address"]) - countries = countries[countries["visibility"] == "public"] - countries["country"] = countries["country"].apply(lambda x: x.get("value")) - countries = countries["country"] - return countries.tolist() + if not countries.empty: + countries = countries[countries["visibility"] == "public"] + countries["country"] = countries["country"].apply(lambda x: x.get("value")) + countries = countries["country"] + return countries.tolist() + else: + return [] @error_logging_aspect(log_level=logging.WARNING) -def extract_external_identifiers(orcid) -> list: +def extract_external_identifiers(orcid: Orcid) -> list: external_identifiers = pd.DataFrame(orcid.external_identifiers()["external-identifier"]) - external_identifiers = external_identifiers[external_identifiers["visibility"] == "public"] - external_identifiers["external-id-url"] = external_identifiers["external-id-url"].apply(lambda x: x.get("value")) - external_identifiers = external_identifiers[[ "external-id-type", "external-id-url", "external-id-value", "external-id-relationship"]] - return external_identifiers.to_dict(orient='records') + if not external_identifiers.empty: + external_identifiers = external_identifiers[external_identifiers["visibility"] == "public"] + external_identifiers["external-id-url"] = external_identifiers["external-id-url"].apply(lambda x: x.get("value")) + external_identifiers = external_identifiers[[ "external-id-type", "external-id-url", "external-id-value", "external-id-relationship"]] + return external_identifiers.to_dict(orient='records') + else: + return [] @error_logging_aspect(log_level=logging.WARNING) -def extract_websites(orcid) -> list: +def extract_websites(orcid: Orcid) -> list: urls = pd.DataFrame(orcid.researcher_urls()["researcher-url"]) - urls = urls[urls["visibility"] == "public"] - urls["url"] = urls["url"].apply(lambda x: x.get("value")) - urls = urls[[ "url-name", "url"]] - return urls.to_dict(orient='records') + if not urls.empty: + urls = urls[urls["visibility"] == "public"] + urls["url"] = urls["url"].apply(lambda x: x.get("value")) + urls = urls[[ "url-name", "url"]] + return urls.to_dict(orient='records') + else: + return [] @error_logging_aspect(log_level=logging.ERROR) -def retrieve_full_works_metadata(orcid) -> pd.DataFrame: +def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: works = pd.DataFrame(orcid.works()[1]["group"]).explode("work-summary") works = pd.json_normalize(works["work-summary"]) works["publication-date"] = works.apply(get_publication_date, axis=1) @@ -208,10 +225,13 @@ def retrieve_full_works_metadata(orcid) -> pd.DataFrame: works["resulttype"] = works["type"].map(lambda x: [x]) works["subject"] = "" works["sanitized_authors"] = "" + works["cited_by_tweeters_count"] = np.random.randint(0, 100, size=len(works)) + works["readers.mendeley"] = np.random.randint(0, 100, size=len(works)) + works["citation_count"] = np.random.randint(0, 100, size=len(works)) return works @error_logging_aspect(log_level=logging.ERROR) -def apply_metadata_schema(works) -> pd.DataFrame: +def apply_metadata_schema(works: pd.DataFrame) -> pd.DataFrame: works.rename(columns=works_mapping, inplace=True) metadata = works return metadata @@ -220,7 +240,7 @@ def filter_dicts_by_value(dicts, key, value) -> list: return [d for d in dicts if d.get(key) == value] 
@error_logging_aspect(log_level=logging.WARNING) -def extract_dois(work) -> str: +def extract_dois(work: pd.DataFrame) -> str: external_ids = work["external-ids.external-id"] external_ids = external_ids if isinstance(external_ids, list) else [] external_ids = (filter_dicts_by_value( @@ -237,14 +257,14 @@ def get_publication_date(work) -> str: day = work["publication-date.day.value"] publication_date = "" parsed_publication_date = publication_date - if year is not pd.np.NaN: + if year is not np.NaN: publication_date+=str(int(year)) parsed_publication_date = publication_date - if month is not pd.np.NaN: + if month is not np.NaN: publication_date+=("-"+str(int(month))) date_obj = parse(publication_date) parsed_publication_date = date_obj.strftime('%Y-%m') - if day is not pd.np.NaN: + if day is not np.NaN: publication_date+=("-"+str(int(day))) date_obj = parse(publication_date) parsed_publication_date = date_obj.strftime('%Y-%m-%d') From 3f7d3f21f40350e3cd72320ab8487ec00f383cc2 Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 25 Jun 2024 11:57:29 +0200 Subject: [PATCH 17/75] deduplication wip --- .../other-scripts/test/params_base.json | 3 +- .../other-scripts/test/test_base.R | 2 +- server/workers/base/src/base.py | 119 +----------------- server/workers/orcid/requirements.txt | 1 + server/workers/orcid/src/orcid.py | 31 ++++- 5 files changed, 39 insertions(+), 117 deletions(-) diff --git a/server/preprocessing/other-scripts/test/params_base.json b/server/preprocessing/other-scripts/test/params_base.json index 69185a479..01fb30795 100644 --- a/server/preprocessing/other-scripts/test/params_base.json +++ b/server/preprocessing/other-scripts/test/params_base.json @@ -6,6 +6,5 @@ "vis_id": "TEST_ID", "min_descsize": 300, "limit": 120, - "list_size": 100, - "q_advanced": "dcdoi:(\"10.5281/zenodo.1247473\" OR \"10.5281/zenodo.1065507\" OR \"10.3389/frma.2017.00013\" OR \"10.31263/voebm.v69i3.1733\" OR \"10.12685/027.7-4-2-157\" OR \"10.1007/s11192-016-1887-4\" OR \"10.5281/zenodo.50729\" OR \"10.5281/zenodo.50715\" OR \"10.1515/iwp-2015-0025\" OR \"10.1016/j.joi.2014.12.003\" OR \"10.5281/zenodo.35401\" OR \"10.1504/ijtel.2015.071922\" OR \"10.6084/m9.figshare.1320834\" OR \"10.1007/s11192-014-1365-9\" OR \"10.6084/m9.figshare.1091372\" OR \"10.1145/2494188.2494208\" OR \"10.6084/m9.figshare.156030.v1\" OR \"10.6084/m9.figshare.156030\" OR \"10.6084/m9.figshare.156030.v2\" OR \"10.1145/2187980.2188236\" OR \"10.1007/978-3-642-23985-4_18\" OR \"10.1504/ijtel.2011.045454\" OR \"10.1007/978-3-642-16020-2_59\" OR \"10.1007/978-3-540-88411-8_17\" OR \"10.3217/jucs-016-16-2214\")" + "list_size": 100 } diff --git a/server/preprocessing/other-scripts/test/test_base.R b/server/preprocessing/other-scripts/test/test_base.R index 4fbe8e4cd..468aacf8d 100644 --- a/server/preprocessing/other-scripts/test/test_base.R +++ b/server/preprocessing/other-scripts/test/test_base.R @@ -7,7 +7,7 @@ options(warn=1) wd <- dirname(dirname(rstudioapi::getActiveDocumentContext()$path)) setwd(wd) #Don't forget to set your working directory -query <- NULL #args[2] +query <- "machine learning" #args[2] service <- "base" params <- NULL params_file <- "test/params_base.json" diff --git a/server/workers/base/src/base.py b/server/workers/base/src/base.py index 3c7183fe0..6e05a2ff5 100644 --- a/server/workers/base/src/base.py +++ b/server/workers/base/src/base.py @@ -5,6 +5,12 @@ import logging from datetime import timedelta from common.r_wrapper import RWrapper +from common.deduplication import find_version_in_doi, 
get_unversioned_doi, get_publisher_doi, \ + find_duplicate_indexes, mark_duplicate_dois, mark_duplicate_links,\ + identify_relations, remove_false_positives_doi, remove_false_positives_link, \ + add_false_negatives, remove_textual_duplicates_from_different_sources, \ + mark_latest_doi, prioritize_OA_and_latest + import re from redis.exceptions import LockError import time @@ -186,121 +192,8 @@ def run(self): self.logger.error(params) self.logger.error(e) -pattern_doi = re.compile(r"\.v(\d)+$") pattern_annotations = re.compile(r"([A-Za-z]+:[\w'\- ]+);?") -def find_version_in_doi(doi): - m = pattern_doi.findall(doi) - if m: - return int(m[0]) - else: - return None - -def extract_doi_suffix(doi): - return doi.split("/")[4:] - -def get_unversioned_doi(doi): - doi = "/".join(doi.split("/")[3:6]) - return pattern_doi.sub("", doi) - -def get_publisher_doi(doi): - pdoi = re.findall(r"org/10\.(\d+)", doi) - if len(pdoi) > 0: - return pdoi[0] - else: - return "" - -def mark_duplicate_dois(df): - for doi, index in df.groupby("doi").groups.items(): - if doi: - if len(index) > 1: - df.loc[index, "doi_duplicate"] = True - return df - -def mark_duplicate_links(df): - for link, index in df.groupby("link").groups.items(): - if link: - if len(index) > 1: - df.loc[index, "link_duplicate"] = True - return df - -def identify_relations(df): - for udoi in df.unversioned_doi.unique(): - if udoi: - tmp = df[df.identifier.str.contains(udoi)] - if len(tmp) > 1: - relations = tmp.id - r = pd.Series([relations.values.tolist()]*len(tmp), index=relations.index) - df.loc[relations.index, "relations"] = r - df.loc[relations.index, "has_relations"] = True - return df - -def remove_false_positives_doi(df): - df.loc[df[(df.doi != "") & (df.is_duplicate) & (~df.doi_duplicate)].index, "is_duplicate"] = False - return df - -def remove_false_positives_link(df): - df.loc[df[(df.link != "") & (df.is_duplicate) & (~df.link_duplicate)].index, "is_duplicate"] = False - return df - -def add_false_negatives(df): - df.loc[df[(~df.is_duplicate) & (df.link_duplicate)].index, "is_duplicate"] = True - df.loc[df[(~df.is_duplicate) & (df.doi_duplicate)].index, "is_duplicate"] = True - return df - -def find_duplicate_indexes(df): - dupind = df.id.map(lambda x: df[df.duplicates.str.contains(x)].index) - tmp = pd.DataFrame(dupind).astype(str).drop_duplicates().index - return dupind[tmp] - -def mark_latest_doi(df, dupind): - for _, idx in dupind.iteritems(): - idx = df.index.intersection(idx) - tmp = df.loc[idx] - for udoi in list(filter(None, tmp.unversioned_doi.unique().tolist())): - tmp2 = tmp[tmp.unversioned_doi == udoi] - if len(tmp2) > 0: - df.loc[tmp2.index, "is_latest"] = False - df.loc[tmp2.index, "keep"] = False - versions = tmp2.id - latest = tmp2.sort_values("doi_version", ascending=False).head(1).id - v = [{"versions": versions.values.tolist(), "latest": latest.values.tolist()}]*len(tmp2) - df.loc[versions.index, "versions"] = v - df.loc[latest.index, "is_latest"] = True - df.loc[latest.index, "keep"] = True - return df - -def remove_textual_duplicates_from_different_sources(df, dupind): - for _, idx in dupind.iteritems(): - if len(idx) > 1: - tmp = df.loc[idx] - df.loc[tmp.index, "is_duplicate"] = True - df.loc[tmp.index, "is_latest"] = False - publisher_dois = list(filter(None, tmp.publisher_doi.unique().tolist())) - if len(publisher_dois) > 0: - # keep entry with doi - df.loc[idx, "keep"] = False - df.loc[tmp[tmp.publisher_doi!=""].index, "is_latest"] = True - df.loc[tmp[tmp.publisher_doi!=""].index, "keep"] = True - else: - 
df.loc[tmp.sort_values(["doi", "year"], ascending=[False, False]).head(1).index, "is_latest"] = True - df.loc[tmp.sort_values(["doi", "year"], ascending=[False, False]).head(1).index, "keep"] = True - return df - -def prioritize_OA_and_latest(df, dupind): - for _, idx in dupind.iteritems(): - idx = df.index.intersection(idx) - if len(idx) > 1: - tmp = df.loc[idx] - df.loc[idx, "keep"] = False - df.loc[idx, "is_latest"] = False - if len(tmp[tmp.oa_state=="1"]) > 0: - df.loc[tmp[tmp.oa_state=="1"].sort_values("year", ascending=False).head(1).index, "keep"] = True - df.loc[tmp[tmp.oa_state=="1"].sort_values("year", ascending=False).head(1).index, "is_latest"] = True - else: - df.loc[tmp.sort_values("year", ascending=False).head(1).index, "keep"] = True - df.loc[tmp.sort_values("year", ascending=False).head(1).index, "is_latest"] = True - return df def filter_duplicates(df): df.drop_duplicates("id", inplace=True, keep="first") diff --git a/server/workers/orcid/requirements.txt b/server/workers/orcid/requirements.txt index 7bc223759..55dd8627c 100644 --- a/server/workers/orcid/requirements.txt +++ b/server/workers/orcid/requirements.txt @@ -10,6 +10,7 @@ importlib-resources==5.4.0 itsdangerous==2.1.2 jsonschema==3.2.0 MarkupSafe==2.1.3 +Levenshtein==0.21.1 mistune==2.0.5 numpy==1.19.5 packaging==21.3 diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 021c52dfc..0cf2b083e 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -6,6 +6,8 @@ from datetime import timedelta from dateutil.parser import parse from common.decorators import error_logging_aspect +from common.deduplication import find_duplicate_indexes,\ + prioritize_OA_and_latest, mark_latest_doi, mark_duplicates import re from redis.exceptions import LockError import time @@ -126,6 +128,8 @@ def execute_search(self, params: dict) -> dict: self.logger.debug(works.columns) metadata = apply_metadata_schema(works) metadata["authors"] = author_info["author_name"] + #metadata = mark_duplicates(metadata) + #metadata = filter_duplicates(metadata) metadata = sanitize_metadata(metadata) self.logger.debug(metadata.columns) # in BASE it is ["title", "paper_abstract", "subject_orig", "published_in", "sanitized_authors"] @@ -143,6 +147,13 @@ def execute_search(self, params: dict) -> dict: params.update(author_info) res["params"] = params return res + except ValueError as e: + self.logger.error(e) + self.logger.error(params) + res = {} + res["params"] = params + res["status"] = "error" + res["reason"] = ["invalid orcid id"] except Exception as e: self.logger.error(e) raise @@ -269,7 +280,25 @@ def get_publication_date(work) -> str: date_obj = parse(publication_date) parsed_publication_date = date_obj.strftime('%Y-%m-%d') return parsed_publication_date - + +@error_logging_aspect(log_level=logging.ERROR) +def filter_duplicates(df: pd.DataFrame) -> pd.DataFrame: + df.drop_duplicates("id", inplace=True, keep="first") + df["is_latest"] = True + df["doi_duplicate"] = False + df["has_relations"] = False + df["link_duplicate"] = False + df["keep"] = False + dupind = find_duplicate_indexes(df) + pure_datasets = df[df.type == "data-set"] + non_datasets = df.loc[df.index.difference(pure_datasets.index)] + non_datasets = prioritize_OA_and_latest(non_datasets, dupind) + pure_datasets = mark_latest_doi(pure_datasets, dupind) + filtered_non_datasets = non_datasets[non_datasets.is_latest==True] + filtered_datasets = pure_datasets[(pure_datasets.keep==True) | (pure_datasets.is_duplicate==False)] + 
filtered = pd.concat([filtered_non_datasets, filtered_datasets]) + filtered.sort_index(inplace=True) + return filtered works_mapping = { "put-code": "id", From ea0b0f349d409f183f4f68f22ef20b0aa736d19d Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 25 Jun 2024 12:57:27 +0200 Subject: [PATCH 18/75] doc type tags --- server/workers/orcid/requirements.txt | 1 + server/workers/orcid/src/orcid.py | 50 ++++++++++++++++++- .../templates/listentry/StandardListEntry.jsx | 2 +- 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/server/workers/orcid/requirements.txt b/server/workers/orcid/requirements.txt index 55dd8627c..1b995d3e4 100644 --- a/server/workers/orcid/requirements.txt +++ b/server/workers/orcid/requirements.txt @@ -21,6 +21,7 @@ python-dateutil==2.8.2 pytz==2023.3.post1 PyYAML==6.0.1 redis==4.3.6 +scikit-learn==0.24.2 six==1.16.0 typing-extensions==4.1.1 zipp==3.6.0 diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 0cf2b083e..f47615a73 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -233,7 +233,7 @@ def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: works["subject_orig"] = "" works["subject_cleaned"] = "" works["oa_state"] = 2 - works["resulttype"] = works["type"].map(lambda x: [x]) + works["resulttype"] = works.type.map(lambda x: [doc_type_mapping.get(x)]) works["subject"] = "" works["sanitized_authors"] = "" works["cited_by_tweeters_count"] = np.random.randint(0, 100, size=len(works)) @@ -309,3 +309,51 @@ def filter_duplicates(df: pd.DataFrame) -> pd.DataFrame: "url.value": "link", "journal-title.value": "published_in" } + +doc_type_mapping = { + "book": "Book", + "book-chapter": "Book chapter", + "book-review": "Book review", + "dictionary-entry": "Dictionary entry", + "dissertation": "Dissertation", + "dissertation-thesis": "Dissertation thesis", + "enyclopaedia-entry": "Encyclopedia entry", + "edited-book": "Edited book", + "journal-article": "Journal article", + "journal-issue": "Journal issue", + "magazine-article": "Magazine article", + "manual": "Manual", + "online-resource": "Online resource", + "newsletter-article": "Newsletter article", + "newspaper-article": "Newspaper article", + "preprint": "Preprint", + "report": "Report", + "review": "Review", + "research-tool": "Research tool", + "supervised-student-publication": "Supervised student publication", + "test": "Test", + "translation": "Translation", + "website": "Website", + "working-paper": "Working paper", + "conference-abstract": "Conference abstract", + "conference-paper": "Conference paper", + "conference-poster": "Conference poster", + "disclosure": "Disclosure", + "license": "License", + "patent": "Patent", + "registered-copyright": "Registered copyright", + "trademark": "Trademark", + "annotation": "Annotation", + "artistic-performance": "Artistic performance", + "data-management-plan": "Data management plan", + "data-set": "Data set", + "invention": "Invention", + "lecture-speech": "Lecture speech", + "physical-object": "Physical object", + "research-technique": "Research technique", + "software": "Software", + "spin-off-company": "Spin-off company", + "standards-and-policy": "Standards and policy", + "technical-standard": "Technical standard", + "other": "Other" +} \ No newline at end of file diff --git a/vis/js/templates/listentry/StandardListEntry.jsx b/vis/js/templates/listentry/StandardListEntry.jsx index b5d722496..042331953 100644 --- a/vis/js/templates/listentry/StandardListEntry.jsx +++ 
b/vis/js/templates/listentry/StandardListEntry.jsx @@ -107,7 +107,7 @@ const mapStateToProps = (state) => ({ isStreamgraph: state.chartType === STREAMGRAPH_MODE, showBacklink: state.chartType === STREAMGRAPH_MODE && !!state.selectedPaper, isInStreamBacklink: !!state.selectedBubble, - showDocTags: state.service === "base", + showDocTags: state.service === "base" || state.service === "orcid", showAllDocTypes: state.service === "base" && !!state.selectedPaper, }); From ad32c7fe643138bd382ad5fcceac78e8f2b53edb Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 25 Jun 2024 14:01:03 +0200 Subject: [PATCH 19/75] date handling --- server/workers/orcid/src/orcid.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index f47615a73..c8c11e9ca 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -263,9 +263,18 @@ def extract_dois(work: pd.DataFrame) -> str: @error_logging_aspect(log_level=logging.WARNING) def get_publication_date(work) -> str: - year = work["publication-date.year.value"] - month = work["publication-date.month.value"] - day = work["publication-date.day.value"] + try: + year = work["publication-date.year.value"] + except KeyError: + year = np.NaN + try: + month = work["publication-date.month.value"] + except KeyError: + month = np.NaN + try: + day = work["publication-date.day.value"] + except KeyError: + day = np.NaN publication_date = "" parsed_publication_date = publication_date if year is not np.NaN: From 2a543f2a9ba6b1da83f1751f541ab25e392d651a Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 25 Jun 2024 14:07:25 +0200 Subject: [PATCH 20/75] add today param to orcid --- server/services/searchORCID.php | 2 +- server/workers/base/requirements.txt | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/server/services/searchORCID.php b/server/services/searchORCID.php index a820eb57c..ab6a5eb30 100644 --- a/server/services/searchORCID.php +++ b/server/services/searchORCID.php @@ -10,7 +10,7 @@ $dirty_query = library\CommUtils::getParameter($_POST, "orcid"); $precomputed_id = (isset($_POST["unique_id"]))?($_POST["unique_id"]):(null); -$params_array = array("orcid"); +$params_array = array("orcid", "today"); $optional_get_params = []; function filterEmptyString($value) diff --git a/server/workers/base/requirements.txt b/server/workers/base/requirements.txt index 614689b2a..af70f14cb 100644 --- a/server/workers/base/requirements.txt +++ b/server/workers/base/requirements.txt @@ -5,6 +5,7 @@ idna==2.6 importlib-metadata==4.8.3 keyring==10.6.0 keyrings.alt==3.0 +Levenshtein==0.21.1 numpy==1.19.5 packaging==21.3 pandas==1.1.5 @@ -15,6 +16,7 @@ python-dateutil==2.8.2 pytz==2023.3.post1 pyxdg==0.25 redis==4.3.6 +scikit-learn==0.24.2 SecretStorage==2.3.1 six==1.11.0 typing-extensions==4.1.1 From bf9311d9ff3e264e2366f32c9100e948741752dd Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 25 Jun 2024 14:14:39 +0200 Subject: [PATCH 21/75] logging cleanup --- server/workers/orcid/src/orcid.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index c8c11e9ca..2df31ff08 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -125,19 +125,15 @@ def execute_search(self, params: dict) -> dict: orcid = Orcid(orcid_id=orcid_id, orcid_access_token=self.access_token, state = "public", sandbox=self.sandbox) author_info = extract_author_info(orcid) works = 
retrieve_full_works_metadata(orcid) - self.logger.debug(works.columns) metadata = apply_metadata_schema(works) metadata["authors"] = author_info["author_name"] #metadata = mark_duplicates(metadata) #metadata = filter_duplicates(metadata) metadata = sanitize_metadata(metadata) - self.logger.debug(metadata.columns) # in BASE it is ["title", "paper_abstract", "subject_orig", "published_in", "sanitized_authors"] text = pd.concat([metadata.id, metadata[["title", "paper_abstract"]] .apply(lambda x: " ".join(x), axis=1)], axis=1) text.columns = ["id", "content"] - self.logger.debug(metadata.head()) - self.logger.debug(text.head()) input_data = {} input_data["metadata"] = metadata.to_json(orient='records') input_data["text"] = text.to_json(orient='records') From a22e3218cfad51ee148ef4e79199c586c749ae04 Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 25 Jun 2024 18:12:26 +0200 Subject: [PATCH 22/75] display doc types in orcid integration --- vis/js/templates/listentry/StandardListEntry.jsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vis/js/templates/listentry/StandardListEntry.jsx b/vis/js/templates/listentry/StandardListEntry.jsx index 042331953..fddba23db 100644 --- a/vis/js/templates/listentry/StandardListEntry.jsx +++ b/vis/js/templates/listentry/StandardListEntry.jsx @@ -108,7 +108,7 @@ const mapStateToProps = (state) => ({ showBacklink: state.chartType === STREAMGRAPH_MODE && !!state.selectedPaper, isInStreamBacklink: !!state.selectedBubble, showDocTags: state.service === "base" || state.service === "orcid", - showAllDocTypes: state.service === "base" && !!state.selectedPaper, + showAllDocTypes: (state.service === "base" || state.service === "orcid") && !!state.selectedPaper, }); export default connect( From 3f4c7b77cdb39f762b4c0b9a1e220ce7b7b019c3 Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 2 Jul 2024 11:33:16 +0200 Subject: [PATCH 23/75] temporary change for dev process --- server/services/searchORCID.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/services/searchORCID.php b/server/services/searchORCID.php index ab6a5eb30..6b57077e0 100644 --- a/server/services/searchORCID.php +++ b/server/services/searchORCID.php @@ -31,7 +31,8 @@ function filterEmptyString($value) $result = search("orcid", $dirty_query , $post_params, $params_array , true - , true, null + # TODO: set back to true before deployment + , false, null , $precomputed_id, false); echo $result From e5976569674e63663491f998e3fdc42b0b71816f Mon Sep 17 00:00:00 2001 From: chreman Date: Fri, 5 Jul 2024 11:54:23 +0200 Subject: [PATCH 24/75] added missing files --- server/workers/common/decorators.py | 25 ++++ server/workers/common/deduplication.py | 181 +++++++++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 server/workers/common/decorators.py create mode 100644 server/workers/common/deduplication.py diff --git a/server/workers/common/decorators.py b/server/workers/common/decorators.py new file mode 100644 index 000000000..bb1c390c3 --- /dev/null +++ b/server/workers/common/decorators.py @@ -0,0 +1,25 @@ +import os +import sys +import logging + + +# Configure the logging +logging.basicConfig(level=os.getenv("LOGLEVEL", "INFO"), + format='%(asctime)s %(levelname)-8s %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler(sys.stdout) +logger.addHandler(handler) + +def error_logging_aspect(log_level=logging.ERROR): + def decorator(func): + def wrapper(*args, **kwargs): + try: + return 
func(*args, **kwargs) + except Exception as e: + logger.log(log_level, f"Error in {func.__name__}: {e}", exc_info=True) + # Optionally, re-raise the exception if you want the calling code to handle it + raise + return wrapper + return decorator \ No newline at end of file diff --git a/server/workers/common/deduplication.py b/server/workers/common/deduplication.py new file mode 100644 index 000000000..37ca7aec0 --- /dev/null +++ b/server/workers/common/deduplication.py @@ -0,0 +1,181 @@ +import re +import numpy as np +import pandas as pd +import Levenshtein +from sklearn.metrics import pairwise_distances + +pattern_doi = re.compile(r"\.v(\d)+$") + +def find_version_in_doi(doi): + m = pattern_doi.findall(doi) + if m: + return int(m[0]) + else: + return None + +def get_unversioned_doi(doi): + doi = "/".join(doi.split("/")[3:6]) + return pattern_doi.sub("", doi) + +def get_publisher_doi(doi): + pdoi = re.findall(r"org/10\.(\d+)", doi) + if len(pdoi) > 0: + return pdoi[0] + else: + return "" + +def find_duplicate_indexes(df): + dupind = df.id.map(lambda x: df[df.duplicates.str.contains(x)].index) + tmp = pd.DataFrame(dupind).astype(str).drop_duplicates().index + return dupind[tmp] + +def mark_duplicate_dois(df): + for doi, index in df.groupby("doi").groups.items(): + if doi: + if len(index) > 1: + df.loc[index, "doi_duplicate"] = True + return df + +def mark_duplicate_links(df): + for link, index in df.groupby("link").groups.items(): + if link: + if len(index) > 1: + df.loc[index, "link_duplicate"] = True + return df + + +def identify_relations(df): + for udoi in df.unversioned_doi.unique(): + if udoi: + tmp = df[df.identifier.str.contains(udoi)] + if len(tmp) > 1: + relations = tmp.id + r = pd.Series([relations.values.tolist()]*len(tmp), index=relations.index) + df.loc[relations.index, "relations"] = r + df.loc[relations.index, "has_relations"] = True + return df + +def remove_false_positives_doi(df): + df.loc[df[(df.doi != "") & (df.is_duplicate) & (~df.doi_duplicate)].index, "is_duplicate"] = False + return df + +def remove_false_positives_link(df): + df.loc[df[(df.link != "") & (df.is_duplicate) & (~df.link_duplicate)].index, "is_duplicate"] = False + return df + +def add_false_negatives(df): + df.loc[df[(~df.is_duplicate) & (df.link_duplicate)].index, "is_duplicate"] = True + df.loc[df[(~df.is_duplicate) & (df.doi_duplicate)].index, "is_duplicate"] = True + return df + +def remove_textual_duplicates_from_different_sources(df, dupind): + for _, idx in dupind.iteritems(): + if len(idx) > 1: + tmp = df.loc[idx] + df.loc[tmp.index, "is_duplicate"] = True + df.loc[tmp.index, "is_latest"] = False + publisher_dois = list(filter(None, tmp.publisher_doi.unique().tolist())) + if len(publisher_dois) > 0: + # keep entry with doi + df.loc[idx, "keep"] = False + df.loc[tmp[tmp.publisher_doi!=""].index, "is_latest"] = True + df.loc[tmp[tmp.publisher_doi!=""].index, "keep"] = True + else: + df.loc[tmp.sort_values(["doi", "year"], ascending=[False, False]).head(1).index, "is_latest"] = True + df.loc[tmp.sort_values(["doi", "year"], ascending=[False, False]).head(1).index, "keep"] = True + return df + +def mark_latest_doi(df, dupind): + for _, idx in dupind.iteritems(): + idx = df.index.intersection(idx) + tmp = df.loc[idx] + for udoi in list(filter(None, tmp.unversioned_doi.unique().tolist())): + tmp2 = tmp[tmp.unversioned_doi == udoi] + if len(tmp2) > 0: + df.loc[tmp2.index, "is_latest"] = False + df.loc[tmp2.index, "keep"] = False + versions = tmp2.id + latest = tmp2.sort_values("doi_version", 
ascending=False).head(1).id + v = [{"versions": versions.values.tolist(), "latest": latest.values.tolist()}]*len(tmp2) + df.loc[versions.index, "versions"] = v + df.loc[latest.index, "is_latest"] = True + df.loc[latest.index, "keep"] = True + return df + +def prioritize_OA_and_latest(df, dupind): + for _, idx in dupind.iteritems(): + idx = df.index.intersection(idx) + if len(idx) > 1: + tmp = df.loc[idx] + df.loc[idx, "keep"] = False + df.loc[idx, "is_latest"] = False + if len(tmp[tmp.oa_state=="1"]) > 0: + df.loc[tmp[tmp.oa_state=="1"].sort_values("year", ascending=False).head(1).index, "keep"] = True + df.loc[tmp[tmp.oa_state=="1"].sort_values("year", ascending=False).head(1).index, "is_latest"] = True + else: + df.loc[tmp.sort_values("year", ascending=False).head(1).index, "keep"] = True + df.loc[tmp.sort_values("year", ascending=False).head(1).index, "is_latest"] = True + return df + +def mark_duplicates(metadata): + dt = deduplicate_titles(metadata, 0) + duplicate_candidates = dt["duplicate_candidates"] + metadata["is_duplicate"] = metadata["id"].map(lambda x: x in duplicate_candidates) + +def deduplicate_titles(metadata, list_size=-1): + duplicate_candidates = [] + + metadata['oa_state'] = metadata['oa_state'].replace("2", 0) + metadata = metadata.sort_values(by=['oa_state', 'subject', 'paper_abstract', 'authors', 'published_in'], + ascending=[False, False, False, False, False]) + + index = (metadata['title'].str.contains(" ") == False) | (metadata['title'].str.len() < 15) + metadata.loc[index, 'title'] = metadata.loc[index, 'title'] + " " + metadata['authors'] + + num_items = len(metadata) + max_replacements = num_items - list_size if num_items > list_size else -1 + + ids = metadata['id'].tolist() + titles = metadata['title'].str.lower().tolist() + count = 1 + + # create a matrix with the Levenshtein distance between all titles + # first create a dataframe with all pairwise titles in rows and columns + + + lv_matrix = compute_lv_matrix(titles, num_items) + length_matrix = metadata['title'].str.len().values + n = len(length_matrix) + str_matrix = np.tile(length_matrix, (n, 1)) + str_matrix_t = str_matrix.T + str_max_matrix = np.maximum(str_matrix, str_matrix_t) + lv_ratio_matrix = lv_matrix / str_max_matrix + + duplicates = lv_ratio_matrix < 1 / 15.83 + strict_duplicates = lv_ratio_matrix < 0.03 + tmp = strict_duplicates.copy() + np.fill_diagonal(tmp, False) + + identified_duplicates = [] + for col in range(tmp.shape[1]): + duplicate_ids = [str(ids[i]) for i in np.where(tmp[:, col])[0]] + if len(duplicate_ids) > 0: + identified_duplicates.append(",".join(duplicate_ids)) + else: + identified_duplicates.append("") + + if len(identified_duplicates) > 0: + identified_duplicates_df = pd.DataFrame({'id': ids, 'duplicates': identified_duplicates}) + else: + identified_duplicates_df = pd.DataFrame({'id': ids, 'duplicates': [""] * len(ids)}) + + return {"duplicate_candidates": duplicate_candidates, "identified_duplicates": identified_duplicates_df} + +def compute_lv_matrix(titles, n): + distance_matrix = np.zeros((n, n)) + for i in range(n): + for j in range(i + 1, n): # Only compute upper triangle + dist = Levenshtein.distance(titles[i], titles[j]) + distance_matrix[i, j] = dist + distance_matrix[j, i] = dist # Symmetric matrix + return distance_matrix \ No newline at end of file From 407544992b783369a0c174cccfa0d570783315a9 Mon Sep 17 00:00:00 2001 From: chreman Date: Sat, 6 Jul 2024 11:37:17 +0200 Subject: [PATCH 25/75] metadata enrichment wip --- server/services/getPDF.php | 2 +- 
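(Stepping back to the deduplication helpers added above in common/deduplication.py: deduplicate_titles() lowercases titles, appends the author names to single-word or very short titles, computes pairwise Levenshtein distances via compute_lv_matrix(), and divides each distance by the longer title's length; a pair counts as a duplicate candidate below a ratio of 1/15.83 and as a strict duplicate below 0.03. A toy example with two made-up title strings:)

import Levenshtein

title_a = "visualizing research on machine learning"
title_b = "visualizing research on machine learning."   # differs by one trailing character

dist = Levenshtein.distance(title_a, title_b)    # 1
ratio = dist / max(len(title_a), len(title_b))   # 1 / 41, roughly 0.024
is_duplicate_candidate = ratio < 1 / 15.83       # True
is_strict_duplicate = ratio < 0.03               # True: the pair is recorded as duplicates of each other
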
server/services/searchORCID.php | 2 +- server/workers/orcid/src/orcid.py | 17 ++++++++++++----- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/server/services/getPDF.php b/server/services/getPDF.php index 253964abc..17f990c2b 100644 --- a/server/services/getPDF.php +++ b/server/services/getPDF.php @@ -88,7 +88,7 @@ function getContentFromURL($link) { curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); - curl_setopt($ch, CURLOPT_VERBOSE, true); + curl_setopt($ch, CURLOPT_VERBOSE, false); curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13'); $response = curl_exec($ch); $redir = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL); diff --git a/server/services/searchORCID.php b/server/services/searchORCID.php index 6b57077e0..edfbacf07 100644 --- a/server/services/searchORCID.php +++ b/server/services/searchORCID.php @@ -11,7 +11,7 @@ $precomputed_id = (isset($_POST["unique_id"]))?($_POST["unique_id"]):(null); $params_array = array("orcid", "today"); -$optional_get_params = []; +$optional_get_params = array("limit"); function filterEmptyString($value) { diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 2df31ff08..1845d05e5 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -130,8 +130,9 @@ def execute_search(self, params: dict) -> dict: #metadata = mark_duplicates(metadata) #metadata = filter_duplicates(metadata) metadata = sanitize_metadata(metadata) + metadata = metadata.head(params.get("limit")) # in BASE it is ["title", "paper_abstract", "subject_orig", "published_in", "sanitized_authors"] - text = pd.concat([metadata.id, metadata[["title", "paper_abstract"]] + text = pd.concat([metadata.id, metadata[["title", "paper_abstract", "subtitle", "published_in"]] .apply(lambda x: " ".join(x), axis=1)], axis=1) text.columns = ["id", "content"] input_data = {} @@ -152,6 +153,8 @@ def execute_search(self, params: dict) -> dict: res["reason"] = ["invalid orcid id"] except Exception as e: self.logger.error(e) + res["params"] = params + res["status"] = "error" raise @error_logging_aspect(log_level=logging.ERROR) @@ -183,6 +186,8 @@ def extract_author_info(orcid: Orcid) -> dict: @error_logging_aspect(log_level=logging.WARNING) def sanitize_metadata(metadata: pd.DataFrame) -> pd.DataFrame: metadata["id"] = metadata["id"].astype(str) + metadata["title"] = metadata["title"].fillna("").astype(str) + metadata["subtitle"] = metadata["subtitle"].fillna("").astype(str) return metadata @error_logging_aspect(log_level=logging.WARNING) @@ -228,12 +233,14 @@ def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: works["short-description"] = "" works["subject_orig"] = "" works["subject_cleaned"] = "" - works["oa_state"] = 2 + works["subtitle"] = works["title.subtitle"] works["resulttype"] = works.type.map(lambda x: [doc_type_mapping.get(x)]) + works["oa_state"] = 2 + works["link"] = works["url.value"].fillna("").str.lower().map(lambda x: x if x.endswith(".pdf") else "").to_list() + works["oa_state"] = works["link"].map(lambda x: 1 if x else 2) works["subject"] = "" - works["sanitized_authors"] = "" - works["cited_by_tweeters_count"] = np.random.randint(0, 100, size=len(works)) - works["readers.mendeley"] = np.random.randint(0, 100, size=len(works)) + works["sanitized_authors"] = "" + # TODO replace with data from rAltmetrics/rCrossref works["citation_count"] = 
np.random.randint(0, 100, size=len(works)) return works From 9e341b8ab988f5c4c1d3ece6f7baa4d673bf112b Mon Sep 17 00:00:00 2001 From: chreman Date: Sat, 6 Jul 2024 11:38:32 +0200 Subject: [PATCH 26/75] metadata enrichment wip --- server/workers/api/src/apis/orcid.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/workers/api/src/apis/orcid.py b/server/workers/api/src/apis/orcid.py index 5c54c6860..8cdab775a 100644 --- a/server/workers/api/src/apis/orcid.py +++ b/server/workers/api/src/apis/orcid.py @@ -36,6 +36,8 @@ def post(self): orcid_ns.logger.debug(params) if "optradio" in params: del params["optradio"] + if "limit" not in params: + params["limit"] = 200 # errors = search_param_schema.validate(params, partial=True) # orcid_ns.logger.debug(errors) # if errors: From 14ec80b0fb1feef35fdf9533c69b4a520cae686d Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Mon, 8 Jul 2024 16:49:55 +0200 Subject: [PATCH 27/75] feat: extend orcid pipeline (data extraction) --- server/workers/orcid/src/orcid.py | 119 +++++++++++++++++++++++++----- 1 file changed, 101 insertions(+), 18 deletions(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 2df31ff08..71becd545 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -218,25 +218,108 @@ def extract_websites(orcid: Orcid) -> list: else: return [] -@error_logging_aspect(log_level=logging.ERROR) +def get_short_description(work) -> str: + try: + short_description = work["short-description"] + except KeyError: + short_description = "" + return short_description + + +def get_authors(work) -> str: + try: + contributors = work.get("contributors", {}).get("contributor", []) + + authors = [] + + for contributor in contributors: + author = contributor.get("credit-name", {}).get("value") + if author: + authors.append(author) + + authors_str = "; ".join(authors) + except KeyError: + authors_str = "" + + return authors_str + + +def get_subjects(work) -> str: + try: + subjects = work["subject"] + except KeyError: + subjects = "" + return subjects + + +def get_title(work) -> str: + try: + title = work.get("title", {}) + value = title.get("title", {}).get("value", "") + except AttributeError: + value = "" + return value + + +def get_subtitle(work) -> str: + try: + title = work.get("title", {}) + subtitle = title.get("subtitle", {}).get("value", "") + except AttributeError: + subtitle = "" + return subtitle + + +def get_paper_abstract(work) -> str: + try: + paper_abstract = work["short-description"] + except KeyError: + paper_abstract = "" + return paper_abstract + + +def get_resulttype(work) -> str: + try: + resulttype = work["work-type"] + except KeyError: + resulttype = "" + return resulttype + + +def published_in(work) -> str: + try: + published_in = work["publication-date"]["year"]["value"] + except KeyError: + published_in = "" + return published_in + + def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: - works = pd.DataFrame(orcid.works()[1]["group"]).explode("work-summary") - works = pd.json_normalize(works["work-summary"]) - works["publication-date"] = works.apply(get_publication_date, axis=1) - works["doi"] = works.apply(extract_dois, axis=1) - # THIS IS EMPTY FOR NOW BECAUSE WE DON'T HAVE THIS INFO YET - works["short-description"] = "" - works["subject_orig"] = "" - works["subject_cleaned"] = "" - works["oa_state"] = 2 - works["resulttype"] = works.type.map(lambda x: [doc_type_mapping.get(x)]) - works["subject"] = "" - works["sanitized_authors"] = "" - 
works["cited_by_tweeters_count"] = np.random.randint(0, 100, size=len(works)) - works["readers.mendeley"] = np.random.randint(0, 100, size=len(works)) - works["citation_count"] = np.random.randint(0, 100, size=len(works)) - return works - + works_data = pd.DataFrame(orcid.works_full_metadata()) + # works["publication-date"] = works.apply(get_publication_date, axis=1) + # works["doi"] = works.apply(extract_dois, axis=1) + + new_works_data = pd.DataFrame() + + # Perform transformations and store in new DataFrame + new_works_data["title"] = works_data.apply(get_title, axis=1) + new_works_data["subtitle"] = works_data.apply(get_subtitle, axis=1) + new_works_data["authors"] = works_data.apply(get_authors, axis=1) + new_works_data["paper_abstract"] = works_data.apply(get_paper_abstract, axis=1) + new_works_data["year"] = works_data.apply(get_publication_date, axis=1) + new_works_data["published_in"] = works_data.apply(published_in, axis=1) + new_works_data["resulttype"] = works_data.apply(get_resulttype, axis=1) + new_works_data["oa_state"] = 2 + new_works_data["subject"] = works_data.apply(get_subjects, axis=1) + new_works_data["cited_by_tweeters_count"] = np.random.randint( + 0, 100, size=len(works_data) + ) + new_works_data["readers.mendeley"] = np.random.randint(0, 100, size=len(works_data)) + new_works_data["citation_count"] = np.random.randint(0, 100, size=len(works_data)) + + return new_works_data + + @error_logging_aspect(log_level=logging.ERROR) def apply_metadata_schema(works: pd.DataFrame) -> pd.DataFrame: works.rename(columns=works_mapping, inplace=True) From 1df2ec3b442145a2f56ea527f3d21ec208217e61 Mon Sep 17 00:00:00 2001 From: chreman Date: Mon, 8 Jul 2024 22:14:24 +0200 Subject: [PATCH 28/75] dataset bugfix; error handling bugfix --- server/workers/orcid/src/orcid.py | 57 +++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 1845d05e5..c0487fe5d 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -14,6 +14,7 @@ import numpy as np from pyorcid import OrcidAuthentication, Orcid from typing import Tuple +import requests @@ -151,11 +152,12 @@ def execute_search(self, params: dict) -> dict: res["params"] = params res["status"] = "error" res["reason"] = ["invalid orcid id"] + return res except Exception as e: self.logger.error(e) res["params"] = params res["status"] = "error" - raise + return res @error_logging_aspect(log_level=logging.ERROR) def extract_author_info(orcid: Orcid) -> dict: @@ -312,6 +314,57 @@ def filter_duplicates(df: pd.DataFrame) -> pd.DataFrame: filtered.sort_index(inplace=True) return filtered +@error_logging_aspect(log_level=logging.ERROR) +def enrich_from_BASE(metadata: pd.DataFrame) -> pd.DataFrame: + dois = metadata[metadata.doi.map(lambda x: len(x)>0)].doi.to_list() + doi_batches = batch_strings(dois) + url_BASE = "http://proxy-proxy-1/"+os.getenv("COMPOSE_PROJECT_NAME")+"/base/search" + params_BASE = { + "q": "", + "sorting": "most-relevant", + "document_types": ["4", "11", "111", "13", "16", "7", "5", + "12", "121", "122", "17", "19", "3", "52", + "2", "F", "1A", "14", "15", "6", "51", + "1", "18", "181", "183", "182"], + "from": "1665-01-01", + "to": pd.Timestamp.now().strftime("%Y-%m-%d"), + "vis_type": "overview", + "raw": True, + "list_size": 120, + "min_descsize": 0 + } + + tmp = [] + for batch in doi_batches: + try: + params_BASE["q_advanced"] = batch + response = requests.post(url_BASE, 
json=params_BASE) + data = response.json() + if "input_data" in data: + tmp.append(pd.DataFrame(json.loads(data['input_data']["metadata"]))) + except Exception as e: + logging.error(e) + enrichment_data = pd.concat(tmp) + enrichment_data = enrichment_data[enrichment_data.doi.str.contains("|".join(dois))] + return metadata + +def batch_strings(strings, limit=400): + batches = [] + current_batch = "" + + for string in strings: + substring = 'OR dcdoi:"'+string+'"' + if len(current_batch) + len(substring) + 1 > limit: # +1 for space or no space if first + batches.append("("+current_batch.strip()+")") # Add current batch to batches + current_batch = 'dcdoi:"'+string+'"' # Start a new batch with the current string + else: + current_batch += " " + substring if current_batch else substring # Add string to current batch + + if current_batch: # Add the last batch if it's not empty + batches.append("("+current_batch.strip()+")") + + return batches + works_mapping = { "put-code": "id", "title.title.value": "title", @@ -358,7 +411,7 @@ def filter_duplicates(df: pd.DataFrame) -> pd.DataFrame: "annotation": "Annotation", "artistic-performance": "Artistic performance", "data-management-plan": "Data management plan", - "data-set": "Data set", + "data-set": "Dataset", "invention": "Invention", "lecture-speech": "Lecture speech", "physical-object": "Physical object", From f43c492ee9d41c7007fbb2156137a073425470aa Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 9 Jul 2024 14:20:33 +0200 Subject: [PATCH 29/75] cleanup --- server/workers/orcid/src/orcid.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index c0487fe5d..27812c500 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -126,12 +126,17 @@ def execute_search(self, params: dict) -> dict: orcid = Orcid(orcid_id=orcid_id, orcid_access_token=self.access_token, state = "public", sandbox=self.sandbox) author_info = extract_author_info(orcid) works = retrieve_full_works_metadata(orcid) - metadata = apply_metadata_schema(works) + if len(works) == 0: + res = {} + res["params"] = params + res["status"] = "error" + res["reason"] = ["not enough results for orcid"] + return res metadata["authors"] = author_info["author_name"] #metadata = mark_duplicates(metadata) #metadata = filter_duplicates(metadata) metadata = sanitize_metadata(metadata) - metadata = metadata.head(params.get("limit")) + metadata = metadata.head(int(params.get("limit"))) # in BASE it is ["title", "paper_abstract", "subject_orig", "published_in", "sanitized_authors"] text = pd.concat([metadata.id, metadata[["title", "paper_abstract", "subtitle", "published_in"]] .apply(lambda x: " ".join(x), axis=1)], axis=1) @@ -155,8 +160,10 @@ def execute_search(self, params: dict) -> dict: return res except Exception as e: self.logger.error(e) + res = {} res["params"] = params res["status"] = "error" + res["reason"] = ["unexpected data processing error"] return res @error_logging_aspect(log_level=logging.ERROR) @@ -190,6 +197,7 @@ def sanitize_metadata(metadata: pd.DataFrame) -> pd.DataFrame: metadata["id"] = metadata["id"].astype(str) metadata["title"] = metadata["title"].fillna("").astype(str) metadata["subtitle"] = metadata["subtitle"].fillna("").astype(str) + metadata["published_in"] = metadata["published_in"].fillna("").astype(str) return metadata @error_logging_aspect(log_level=logging.WARNING) @@ -229,6 +237,7 @@ def extract_websites(orcid: Orcid) -> list: def 
retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: works = pd.DataFrame(orcid.works()[1]["group"]).explode("work-summary") works = pd.json_normalize(works["work-summary"]) + works.drop(columns=["url", "title"], inplace=True) works["publication-date"] = works.apply(get_publication_date, axis=1) works["doi"] = works.apply(extract_dois, axis=1) # THIS IS EMPTY FOR NOW BECAUSE WE DON'T HAVE THIS INFO YET @@ -371,7 +380,6 @@ def batch_strings(strings, limit=400): "short-description": "paper_abstract", "publication-date": "year", "work-contributors": "authors", - "url.value": "link", "journal-title.value": "published_in" } From 9a1467cc5df50ee63bdee07f4f1b489e76dd44db Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Tue, 9 Jul 2024 16:19:48 +0200 Subject: [PATCH 30/75] feat: add reference to pyorcid repo --- server/workers/orcid/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/workers/orcid/requirements.txt b/server/workers/orcid/requirements.txt index 1b995d3e4..12071581b 100644 --- a/server/workers/orcid/requirements.txt +++ b/server/workers/orcid/requirements.txt @@ -14,7 +14,7 @@ Levenshtein==0.21.1 mistune==2.0.5 numpy==1.19.5 packaging==21.3 -pandas==1.1.5 +pandas==1.3.0 pyparsing==3.1.1 pyrsistent==0.18.0 python-dateutil==2.8.2 @@ -25,4 +25,4 @@ scikit-learn==0.24.2 six==1.16.0 typing-extensions==4.1.1 zipp==3.6.0 -pyorcid==1.2.0 \ No newline at end of file +pyorcid @ git+https://github.com/OpenKnowledgeMaps/PyOrcid.git@main \ No newline at end of file From d33d8cacca1c03ef4c70af1392dbc9d94204298c Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 9 Jul 2024 17:06:34 +0200 Subject: [PATCH 31/75] updates for new metadata --- server/workers/orcid/src/orcid.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 683bb524f..1fcace656 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -125,14 +125,13 @@ def execute_search(self, params: dict) -> dict: try: orcid = Orcid(orcid_id=orcid_id, orcid_access_token=self.access_token, state = "public", sandbox=self.sandbox) author_info = extract_author_info(orcid) - works = retrieve_full_works_metadata(orcid) - if len(works) == 0: + metadata = retrieve_full_works_metadata(orcid) + if len(metadata) == 0: res = {} res["params"] = params res["status"] = "error" res["reason"] = ["not enough results for orcid"] return res - metadata["authors"] = author_info["author_name"] #metadata = mark_duplicates(metadata) #metadata = filter_duplicates(metadata) metadata = sanitize_metadata(metadata) @@ -197,6 +196,7 @@ def sanitize_metadata(metadata: pd.DataFrame) -> pd.DataFrame: metadata["id"] = metadata["id"].astype(str) metadata["title"] = metadata["title"].fillna("").astype(str) metadata["subtitle"] = metadata["subtitle"].fillna("").astype(str) + metadata["paper_abstract"] = metadata["paper_abstract"].fillna("").astype(str) metadata["published_in"] = metadata["published_in"].fillna("").astype(str) return metadata @@ -308,7 +308,7 @@ def published_in(work) -> str: published_in = "" return published_in - +@error_logging_aspect(log_level=logging.ERROR) def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: works_data = pd.DataFrame(orcid.works_full_metadata()) # works["publication-date"] = works.apply(get_publication_date, axis=1) @@ -325,11 +325,7 @@ def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: new_works_data["published_in"] = 
works_data.apply(published_in, axis=1) new_works_data["resulttype"] = works_data.apply(get_resulttype, axis=1) new_works_data["oa_state"] = 2 - new_works_data["subject"] = works_data.apply(get_subjects, axis=1) - new_works_data["cited_by_tweeters_count"] = np.random.randint( - 0, 100, size=len(works_data) - ) - new_works_data["readers.mendeley"] = np.random.randint(0, 100, size=len(works_data)) + new_works_data["subject"] = "" # this needs to come from BASE enrichment new_works_data["citation_count"] = np.random.randint(0, 100, size=len(works_data)) return new_works_data From c668ab7f9263989e9a52e76be8eb33df2ba38a0a Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 9 Jul 2024 17:24:00 +0200 Subject: [PATCH 32/75] added doc strings --- server/workers/orcid/src/orcid.py | 74 ++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 1fcace656..0ce7c24c7 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -93,6 +93,15 @@ def orcid_rate_limit_reached(self) -> bool: @error_logging_aspect(log_level=logging.ERROR) def run(self) -> None: + """ + This function is the main loop of the OrcidClient. It will continuously + check for new items in the Redis queue, process them, and store the results + back in Redis. + + The function will also check if the rate limit for ORCID requests is reached. + + return: None + """ while True: while self.orcid_rate_limit_reached(): self.logger.debug('🛑 Request is limited') @@ -117,6 +126,18 @@ def run(self) -> None: @error_logging_aspect(log_level=logging.ERROR) def execute_search(self, params: dict) -> dict: + """ + This function is the main function for the search endpoint. It will + retrieve the ORCID data for the given ORCID ID, extract the author + information and the works metadata, and return the results. + + Parameters: + - params (dict): The parameters for the search endpoint. The parameters + should contain the ORCID ID of the author. + + Returns: + - dict: The results of the search endpoint. + """ q = params.get('q') service = params.get('service') data = {} @@ -167,6 +188,15 @@ def execute_search(self, params: dict) -> dict: @error_logging_aspect(log_level=logging.ERROR) def extract_author_info(orcid: Orcid) -> dict: + """ + This function extracts the author information from the ORCID data. + + Parameters: + - orcid (Orcid): The Orcid object containing the ORCID data. + + Returns: + - dict: The author information extracted from the ORCID data. + """ personal_details = orcid.personal_details() orcid_id = orcid._orcid_id author_name = " ".join( @@ -193,6 +223,16 @@ def extract_author_info(orcid: Orcid) -> dict: @error_logging_aspect(log_level=logging.WARNING) def sanitize_metadata(metadata: pd.DataFrame) -> pd.DataFrame: + """ + This function sanitizes the metadata DataFrame by converting all columns + to string type and filling missing values with an empty string. + + Parameters: + - metadata (pd.DataFrame): The metadata DataFrame to sanitize. + + Returns: + - pd.DataFrame: The sanitized metadata DataFrame. 
+ """ metadata["id"] = metadata["id"].astype(str) metadata["title"] = metadata["title"].fillna("").astype(str) metadata["subtitle"] = metadata["subtitle"].fillna("").astype(str) @@ -310,6 +350,15 @@ def published_in(work) -> str: @error_logging_aspect(log_level=logging.ERROR) def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: + """ + This function retrieves the full works metadata from the ORCID data. + + Parameters: + - orcid (Orcid): The Orcid object containing the ORCID data. + + Returns: + - pd.DataFrame: The full works metadata retrieved from the ORCID data. + """ works_data = pd.DataFrame(orcid.works_full_metadata()) # works["publication-date"] = works.apply(get_publication_date, axis=1) # works["doi"] = works.apply(extract_dois, axis=1) @@ -401,8 +450,18 @@ def filter_duplicates(df: pd.DataFrame) -> pd.DataFrame: @error_logging_aspect(log_level=logging.ERROR) def enrich_from_BASE(metadata: pd.DataFrame) -> pd.DataFrame: + """ + This function enriches the metadata DataFrame with additional information + from the BASE database. + + Parameters: + - metadata (pd.DataFrame): The metadata DataFrame to enrich. + + Returns: + - pd.DataFrame: The enriched metadata DataFrame. + """ dois = metadata[metadata.doi.map(lambda x: len(x)>0)].doi.to_list() - doi_batches = batch_strings(dois) + doi_batches = batch_dois(dois) url_BASE = "http://proxy-proxy-1/"+os.getenv("COMPOSE_PROJECT_NAME")+"/base/search" params_BASE = { "q": "", @@ -433,7 +492,18 @@ def enrich_from_BASE(metadata: pd.DataFrame) -> pd.DataFrame: enrichment_data = enrichment_data[enrichment_data.doi.str.contains("|".join(dois))] return metadata -def batch_strings(strings, limit=400): +def batch_dois(strings, limit=400): + """ + This function batches a list of strings into groups of strings that + together are less than a specified limit. + It is used to batch DOIs for BASE enrichment. + + Parameters: + - strings (list): The list of strings to batch. + + Returns: + - list: The list of batches of strings. + """ batches = [] current_batch = "" From b217e2560b2e9e455e9900b586b43fe130512d3d Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 9 Jul 2024 23:51:42 +0200 Subject: [PATCH 33/75] bugfix handling missing metadata --- server/workers/orcid/src/orcid.py | 42 ++++++++++++++++++------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 0ce7c24c7..74c240591 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -130,6 +130,11 @@ def execute_search(self, params: dict) -> dict: This function is the main function for the search endpoint. It will retrieve the ORCID data for the given ORCID ID, extract the author information and the works metadata, and return the results. + In case of errors, it will return an error reason. Following errors + are possible: + - invalid orcid id + - not enough results for orcid + - unexpected data processing error Parameters: - params (dict): The parameters for the search endpoint. The parameters @@ -233,7 +238,6 @@ def sanitize_metadata(metadata: pd.DataFrame) -> pd.DataFrame: Returns: - pd.DataFrame: The sanitized metadata DataFrame. 
""" - metadata["id"] = metadata["id"].astype(str) metadata["title"] = metadata["title"].fillna("").astype(str) metadata["subtitle"] = metadata["subtitle"].fillna("").astype(str) metadata["paper_abstract"] = metadata["paper_abstract"].fillna("").astype(str) @@ -282,8 +286,9 @@ def get_short_description(work) -> str: def get_authors(work) -> str: - try: - contributors = work.get("contributors", {}).get("contributor", []) + try: + contributors = work["contributors"] + contributors = contributors.get("contributor", {}) if contributors else [] authors = [] @@ -335,16 +340,16 @@ def get_paper_abstract(work) -> str: def get_resulttype(work) -> str: try: - resulttype = work["work-type"] + resulttype = work["type"] except KeyError: - resulttype = "" + resulttype = "" return resulttype def published_in(work) -> str: try: - published_in = work["publication-date"]["year"]["value"] - except KeyError: + published_in = work["journal-title"]["value"] + except Exception: published_in = "" return published_in @@ -359,20 +364,22 @@ def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: Returns: - pd.DataFrame: The full works metadata retrieved from the ORCID data. """ - works_data = pd.DataFrame(orcid.works_full_metadata()) + works_data = pd.DataFrame(orcid.works_full_metadata(limit=1000)) # works["publication-date"] = works.apply(get_publication_date, axis=1) # works["doi"] = works.apply(extract_dois, axis=1) new_works_data = pd.DataFrame() # Perform transformations and store in new DataFrame + new_works_data["id"] = works_data["put-code"].astype(str) new_works_data["title"] = works_data.apply(get_title, axis=1) new_works_data["subtitle"] = works_data.apply(get_subtitle, axis=1) new_works_data["authors"] = works_data.apply(get_authors, axis=1) - new_works_data["paper_abstract"] = works_data.apply(get_paper_abstract, axis=1) + new_works_data["paper_abstract"] = works_data.apply(get_paper_abstract, axis=1).fillna("") new_works_data["year"] = works_data.apply(get_publication_date, axis=1) new_works_data["published_in"] = works_data.apply(published_in, axis=1) - new_works_data["resulttype"] = works_data.apply(get_resulttype, axis=1) + new_works_data["resulttype"] = works_data.apply(get_resulttype, axis=1).map(lambda x: doc_type_mapping.get(x, "")) + new_works_data["doi"] = works_data.apply(extract_dois, axis=1) new_works_data["oa_state"] = 2 new_works_data["subject"] = "" # this needs to come from BASE enrichment new_works_data["citation_count"] = np.random.randint(0, 100, size=len(works_data)) @@ -391,7 +398,8 @@ def filter_dicts_by_value(dicts, key, value) -> list: @error_logging_aspect(log_level=logging.WARNING) def extract_dois(work: pd.DataFrame) -> str: - external_ids = work["external-ids.external-id"] + external_ids = work["external-ids"] + external_ids = external_ids["external-id"] if external_ids else [] external_ids = external_ids if isinstance(external_ids, list) else [] external_ids = (filter_dicts_by_value( external_ids, @@ -403,16 +411,16 @@ def extract_dois(work: pd.DataFrame) -> str: @error_logging_aspect(log_level=logging.WARNING) def get_publication_date(work) -> str: try: - year = work["publication-date.year.value"] - except KeyError: + year = work["publication-date"]["year"]["value"] + except Exception: year = np.NaN try: - month = work["publication-date.month.value"] - except KeyError: + month = work["publication-date"]["month"]["value"] + except Exception: month = np.NaN try: - day = work["publication-date.day.value"] - except KeyError: + day = 
work["publication-date"]["day"]["value"] + except Exception: day = np.NaN publication_date = "" parsed_publication_date = publication_date From d7b5523e2f9b0c0aec225f6d06634606c0147e71 Mon Sep 17 00:00:00 2001 From: chreman Date: Wed, 10 Jul 2024 14:12:59 +0200 Subject: [PATCH 34/75] author name bugfix --- server/workers/orcid/src/orcid.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 74c240591..e060dfaea 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -152,6 +152,7 @@ def execute_search(self, params: dict) -> dict: orcid = Orcid(orcid_id=orcid_id, orcid_access_token=self.access_token, state = "public", sandbox=self.sandbox) author_info = extract_author_info(orcid) metadata = retrieve_full_works_metadata(orcid) + metadata["authors"] = metadata["authors"].map(lambda x: author_info["author_name"] if x=="" else x) if len(metadata) == 0: res = {} res["params"] = params @@ -163,7 +164,7 @@ def execute_search(self, params: dict) -> dict: metadata = sanitize_metadata(metadata) metadata = metadata.head(int(params.get("limit"))) # in BASE it is ["title", "paper_abstract", "subject_orig", "published_in", "sanitized_authors"] - text = pd.concat([metadata.id, metadata[["title", "paper_abstract", "subtitle", "published_in"]] + text = pd.concat([metadata.id, metadata[["title", "paper_abstract", "subtitle", "published_in", "authors"]] .apply(lambda x: " ".join(x), axis=1)], axis=1) text.columns = ["id", "content"] input_data = {} From c7d7e7d44cff53bc5437f8c7412f1e8715d86f8a Mon Sep 17 00:00:00 2001 From: chreman Date: Wed, 10 Jul 2024 15:10:21 +0200 Subject: [PATCH 35/75] robustify author metadata handling --- server/workers/orcid/src/orcid.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index e060dfaea..e519dd678 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -294,9 +294,12 @@ def get_authors(work) -> str: authors = [] for contributor in contributors: - author = contributor.get("credit-name", {}).get("value") - if author: - authors.append(author) + if contributor is not None: + credit_name = contributor.get("credit-name", {}) + if credit_name is not None: + author = credit_name.get("value") + if author: + authors.append(author) authors_str = "; ".join(authors) except KeyError: From d6636cc9466d25cac50774c102ad15de7485a147 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Mon, 15 Jul 2024 17:30:39 +0200 Subject: [PATCH 36/75] feat: (orcid) improvements in error handling and refactoring --- .gitignore | 4 +- .../proxy/templates/default.conf.template | 2 +- local_dev/searchflow-container/Dockerfile | 3 +- .../other-scripts/run_base_contentproviders.R | 17 +-- .../preprocessing/other-scripts/summarize.R | 35 ++++-- server/services/searchBASE.php | 27 +++-- server/workers/api/src/apis/orcid.py | 98 ++++++++++------- server/workers/api/src/apis/pubmed.py | 79 +++++++------- server/workers/api/src/apis/utils.py | 6 +- server/workers/orcid/src/orcid.py | 102 ++++++++++-------- server/workers/tests/Dockerfile_tests | 2 +- 11 files changed, 224 insertions(+), 151 deletions(-) diff --git a/.gitignore b/.gitignore index d70ade073..bb2f2ffb5 100644 --- a/.gitignore +++ b/.gitignore @@ -44,7 +44,6 @@ server/preprocessing/other-scripts/renv /*.Rproj .Rproj.user - # python files *.pyc *.pkl @@ -54,6 +53,7 @@ 
server/preprocessing/other-scripts/renv /server/nbproject/private/ .pytest_cache *__pycache__* +.venv # python files *.pyc @@ -63,4 +63,4 @@ server/preprocessing/other-scripts/renv /nbproject/private/ /server/nbproject/private/ .pytest_cache -*__pycache__* \ No newline at end of file +*__pycache__* diff --git a/local_dev/proxy/templates/default.conf.template b/local_dev/proxy/templates/default.conf.template index c9c375ee6..8cb86f749 100644 --- a/local_dev/proxy/templates/default.conf.template +++ b/local_dev/proxy/templates/default.conf.template @@ -16,4 +16,4 @@ server { proxy_pass http://dev-persistence-1:5001/api/persistence/; } } -} \ No newline at end of file +} \ No newline at end of file diff --git a/local_dev/searchflow-container/Dockerfile b/local_dev/searchflow-container/Dockerfile index a2da2d021..81ecdb00a 100644 --- a/local_dev/searchflow-container/Dockerfile +++ b/local_dev/searchflow-container/Dockerfile @@ -9,7 +9,8 @@ RUN docker-php-ext-install pdo pdo_sqlite mbstring xml fileinfo RUN apt-get install -y gconf-service libasound2 libatk1.0-0 libatk-bridge2.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget ENV NVM_DIR /usr/local/nvm ENV NODE_VERSION 10.17.0 -RUN curl https://raw.githubusercontent.com/creationix/nvm/v0.24.1/install.sh | bash \ +RUN export NVM_DIR="$NVM_DIR" && mkdir -p $NVM_DIR \ + && curl https://raw.githubusercontent.com/creationix/nvm/v0.39.7/install.sh | bash \ && . 
$NVM_DIR/nvm.sh \ && nvm install $NODE_VERSION \ && nvm alias default $NODE_VERSION \ diff --git a/server/preprocessing/other-scripts/run_base_contentproviders.R b/server/preprocessing/other-scripts/run_base_contentproviders.R index 51be0e36e..e47a67a65 100644 --- a/server/preprocessing/other-scripts/run_base_contentproviders.R +++ b/server/preprocessing/other-scripts/run_base_contentproviders.R @@ -25,18 +25,21 @@ if (DEBUG==TRUE){ setup_logging('INFO') } - log <- getLogger('base_repos') +failed <- list() +failed$status <- 'error' + tryCatch({ contentproviders <- bs_repositories("") - triple <- list(name="GoTriple", "internal_name"="fttriple") + if (is.null(contentproviders) || nrow(contentproviders) == 0) { + stop("No content providers retrieved.") + } + triple <- list(name = "GoTriple", internal_name = "fttriple") contentproviders <- rbind(contentproviders, triple) -}, error=function(err){ - log$error(paste("Contentprovider failed", "base", "retrieve_contentproviders", "", err, sep="||")) - failed <- list() - failed$reason <- list(err) - failed$status <- 'error' +}, error = function(err) { + log$error(paste("Content provider retrieval failed", "base", "retrieve_contentproviders", "", err, sep = "||")) + failed$reason <- list(err$message) }) diff --git a/server/preprocessing/other-scripts/summarize.R b/server/preprocessing/other-scripts/summarize.R index 4d56ec723..e28cf6b19 100644 --- a/server/preprocessing/other-scripts/summarize.R +++ b/server/preprocessing/other-scripts/summarize.R @@ -27,12 +27,12 @@ prune_ngrams <- function(ngrams, stops){ for (i in seq(1, total_length, batch_size)) { tokenized_ngrams = lapply(tokenized_ngrams, function(x) { Filter(function(tokens){ - !any(stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], tolower(tokens[[1]]))) + !any(stringi::stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], tolower(tokens[[1]]))) }, x)}) # remove ngrams ending with a stopword tokenized_ngrams = lapply(tokenized_ngrams, function(x) { Filter(function(tokens){ - !any(stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], tolower(tail(tokens,1)))) + !any(stringi::stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], tolower(tail(tokens,1)))) }, x)}) } # remove ngrams starting and ending with the same word @@ -215,17 +215,30 @@ another_prune_ngrams <- function(ngrams, stops){ # check if first token of ngrams in stopword list batch_size <- 1000 total_length <- length(stops) + for (i in seq(1, total_length, batch_size)) try({ - tokens = lapply(tokens, function(y){ - Filter(function(x){ - if (x[1] != "") !any(stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], x[1])) - }, y)}) - # check if last token of ngrams in stopword list - tokens = lapply(tokens, function(y){ - Filter(function(x){ - if (tail(x,1) != "") !any(stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], tail(x,1))) - }, y)}) + tokens = lapply(tokens, function(y){ + Filter(function(x){ + if (!is.na(x[1]) && x[1] != "") { + !any(stringi::stri_detect_fixed(stops[i:min(i+batch_size-1, total_length)], x[1])) + } else { + FALSE + } + }, y) + }) + # check if last token of ngrams in stopword list + tokens = lapply(tokens, function(y){ + Filter(function(x){ + last_token <- tail(x, 1) + if (!is.na(last_token) && last_token != "") { + !any(stringi::stri_detect_fixed(stops[i:min(i+batch_size-1, total_length)], last_token)) + } else { + FALSE + } + }, y) + }) }) + # check that first token is not the same as the last token tokens = lapply(tokens, function(y){ if(length(y) > 
1) { diff --git a/server/services/searchBASE.php b/server/services/searchBASE.php index febaaaa6d..7b60a5638 100644 --- a/server/services/searchBASE.php +++ b/server/services/searchBASE.php @@ -8,9 +8,10 @@ use headstart\library; $dirty_query = library\CommUtils::getParameter($_POST, "q"); -$precomputed_id = (isset($_POST["unique_id"]))?($_POST["unique_id"]):(null); -$params_array = array("from", "to", "document_types", "sorting", "min_descsize"); +$precomputed_id = $_POST["unique_id"] ?? null; + +$params_array = ["from", "to", "document_types", "sorting", "min_descsize"]; $optional_get_params = ["repo", "coll", "vis_type", "q_advanced", "lang_id", "custom_title", "custom_clustering"]; function filterEmptyString($value) @@ -19,8 +20,8 @@ function filterEmptyString($value) return $value !== ''; } -foreach($optional_get_params as $param) { - if(isset($_POST[$param])) { +foreach ($optional_get_params as $param) { + if (isset($_POST[$param])) { $params_array[] = $param; } } @@ -37,12 +38,18 @@ function filterEmptyString($value) } } +$result = search( + "base", + $dirty_query, + $post_params, + $params_array, + true, + true, + null, + $precomputed_id, + false +); -$result = search("base", $dirty_query - , $post_params, $params_array - , true - , true, null - , $precomputed_id, false); echo $result -?> +?> \ No newline at end of file diff --git a/server/workers/api/src/apis/orcid.py b/server/workers/api/src/apis/orcid.py index 8cdab775a..d1285d3cc 100644 --- a/server/workers/api/src/apis/orcid.py +++ b/server/workers/api/src/apis/orcid.py @@ -9,60 +9,84 @@ from .request_validators import SearchParamSchema from apis.utils import get_key, redis_store + +# Namespace setup orcid_ns = Namespace("orcid", description="ORCiD API operations") -search_param_schema = SearchParamSchema() +# Schema validator +search_param_schema = SearchParamSchema() -orcid_querymodel = orcid_ns.model("SearchQuery", - {"q": fields.String(example='', - description='query string', - required=True), - "orcid": fields.String(example='1234-5678-9012-3456', - description='ORCiD iD', - required=True)}) +# Constants +DEFAULT_LIMIT = 200 +REDIS_TIMEOUT = 300 +# Model definition +orcid_querymodel = orcid_ns.model( + "SearchQuery", + { + "q": fields.String( + example='', + description='query string', + required=True + ), + "orcid": fields.String( + example='1234-5678-9012-3456', + description='ORCiD iD', + required=True + ) + } +) -@orcid_ns.route('/search') +@orcid_ns.route("/search") class Search(Resource): - @orcid_ns.doc(responses={200: 'OK', - 400: 'Invalid search parameters'}) + @orcid_ns.doc(responses={200: "OK", 400: "Invalid search parameters"}) @orcid_ns.expect(orcid_querymodel) @orcid_ns.produces(["application/json", "text/csv"]) def post(self): """ + Perform a search query using ORCiD API. 
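+        The request body is normalised first (the "optradio" field is dropped and a
+        default "limit" of 200 is applied), the task is pushed onto the Redis
+        "orcid" queue, and the handler then blocks on the result key until the
+        worker responds or the 300-second timeout expires.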
""" - params = request.get_json() - orcid_ns.logger.debug(params) - if "optradio" in params: - del params["optradio"] - if "limit" not in params: - params["limit"] = 200 - # errors = search_param_schema.validate(params, partial=True) - # orcid_ns.logger.debug(errors) - # if errors: - # abort(400, str(errors)) - k = str(uuid.uuid4()) - d = {"id": k, "params": params, - "endpoint": "search"} - orcid_ns.logger.debug(d) - redis_store.rpush("orcid", json.dumps(d)) - q_len = redis_store.llen("orcid") - orcid_ns.logger.debug("Queue length: %s %d %s" %("orcid", q_len, k)) - result = get_key(redis_store, k, 300) try: - headers = {} - if request.headers["Accept"] == "application/json": - headers["Content-Type"] = "application/json" - return make_response(result, - 200, - headers) + params = request.get_json() + orcid_ns.logger.debug(params) + + self.clean_params(params) + + request_id = str(uuid.uuid4()) + task_data = {"id": request_id, "params": params, "endpoint": "search"} + orcid_ns.logger.debug(task_data) + redis_store.rpush("orcid", json.dumps(task_data)) + + queue_length = redis_store.llen("orcid") + orcid_ns.logger.debug(f"Queue length: orcid {queue_length} {request_id}") + + result = get_key(redis_store, request_id, REDIS_TIMEOUT) + headers = self.get_response_headers() + + return make_response(result, 200, headers) except Exception as e: orcid_ns.logger.error(e) abort(500, "Problem encountered, check logs.") -@orcid_ns.route('/service_version') + def clean_params(self, params): + if "optradio" in params: + del params["optradio"] + if "limit" not in params: + params["limit"] = DEFAULT_LIMIT + + def get_response_headers(self): + headers = {} + if request.headers["Accept"] == "application/json": + headers["Content-Type"] = "application/json" + return headers + + +@orcid_ns.route("/service_version") class ServiceVersion(Resource): def get(self): + """ + Get the current service version. + """ result = {"service_version": os.getenv("SERVICE_VERSION")} - return make_response(result, 200, {"Content-Type": "application/json"}) + return make_response(result, 200, {"Content-Type": "application/json"}) \ No newline at end of file diff --git a/server/workers/api/src/apis/pubmed.py b/server/workers/api/src/apis/pubmed.py index 2f85a77d7..d2964c056 100644 --- a/server/workers/api/src/apis/pubmed.py +++ b/server/workers/api/src/apis/pubmed.py @@ -9,45 +9,48 @@ from apis.utils import get_key, redis_store - pubmed_ns = Namespace("pubmed", description="PubMed API operations") search_param_schema = SearchParamSchema() -pubmed_querymodel = pubmed_ns.model("SearchQuery", - {"q": fields.String(example='feminicide', - description='query string', - required=True), - "sorting": fields.String(example='most-recent', - description='most-relevant or most-recent', - required=True), - "from": fields.String(example='2019-01-01', - description='yyyy-MM-dd', - required=True), - "to": fields.String(example='2019-12-31', - description='yyyy-MM-dd', - required=True), - "vis_type": fields.String(example='overview', - description='overview or timeline', - required=True), - "limit": fields.Integer(example=100, - description='max. 
number of results'), - "language": fields.String(example='en', - description='language code, optional', - required=False), - "raw": fields.Boolean(example="false", - description='raw results from ElasticSearch')}) +pubmed_querymodel = pubmed_ns.model( + "SearchQuery", + { + "q": fields.String( + example="feminicide", description="query string", required=True + ), + "sorting": fields.String( + example="most-recent", + description="most-relevant or most-recent", + required=True, + ), + "from": fields.String( + example="2019-01-01", description="yyyy-MM-dd", required=True + ), + "to": fields.String( + example="2019-12-31", description="yyyy-MM-dd", required=True + ), + "vis_type": fields.String( + example="overview", description="overview or timeline", required=True + ), + "limit": fields.Integer(example=100, description="max. number of results"), + "language": fields.String( + example="en", description="language code, optional", required=False + ), + "raw": fields.Boolean( + example="false", description="raw results from ElasticSearch" + ), + }, +) -@pubmed_ns.route('/search') +@pubmed_ns.route("/search") class Search(Resource): - @pubmed_ns.doc(responses={200: 'OK', - 400: 'Invalid search parameters'}) + @pubmed_ns.doc(responses={200: "OK", 400: "Invalid search parameters"}) @pubmed_ns.expect(pubmed_querymodel) @pubmed_ns.produces(["application/json", "text/csv"]) def post(self): - """ - """ + """ """ params = request.get_json() pubmed_ns.logger.debug(params) if "optradio" in params: @@ -59,12 +62,11 @@ def post(self): if errors: abort(400, str(errors)) k = str(uuid.uuid4()) - d = {"id": k, "params": params, - "endpoint": "search"} + d = {"id": k, "params": params, "endpoint": "search"} pubmed_ns.logger.debug(d) redis_store.rpush("pubmed", json.dumps(d)) q_len = redis_store.llen("pubmed") - pubmed_ns.logger.debug("Queue length: %s %d %s" %("pubmed", q_len, k)) + pubmed_ns.logger.debug("Queue length: %s %d %s" % ("pubmed", q_len, k)) result = get_key(redis_store, k) try: headers = {} @@ -77,18 +79,19 @@ def post(self): else: result = pd.read_json(json.loads(result)).to_csv() headers["Content-Type"] = "text/csv" - headers["Content-Disposition"] = "attachment; filename={0}.csv".format(k) + headers["Content-Disposition"] = "attachment; filename={0}.csv".format( + k + ) if params.get("raw") is True: headers["Content-Type"] = "application/json" - return make_response(result, - 200, - headers) + return make_response(result, 200, headers) except Exception as e: pubmed_ns.logger.error(e) abort(500, "Problem encountered, check logs.") -@pubmed_ns.route('/service_version') + +@pubmed_ns.route("/service_version") class ServiceVersion(Resource): def get(self): result = {"service_version": os.getenv("SERVICE_VERSION")} - return make_response(result, 200, {"Content-Type": "application/json"}) \ No newline at end of file + return make_response(result, 200, {"Content-Type": "application/json"}) diff --git a/server/workers/api/src/apis/utils.py b/server/workers/api/src/apis/utils.py index 606ba4562..7f8da3e71 100644 --- a/server/workers/api/src/apis/utils.py +++ b/server/workers/api/src/apis/utils.py @@ -7,6 +7,7 @@ import re import redis import pandas as pd +import pathlib redis_config = { "host": os.getenv("REDIS_HOST"), @@ -91,7 +92,10 @@ def get_or_create_contentprovider_lookup(): cp_dict = df.name.to_dict() return cp_dict except Exception as e: - df = pd.read_json("contentproviders.json") + df = pd.read_json( + pathlib.Path(__file__).parent.absolute() / + "contentproviders.json" + ) 
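+            # Resolving the bundled contentproviders.json relative to __file__ rather
+            # than the current working directory keeps this fallback working no matter
+            # where the service process is started from.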
df.set_index("internal_name", inplace=True) cp_dict = df.name.to_dict() return cp_dict diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 74c240591..43ef89cf5 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -102,6 +102,7 @@ def run(self) -> None: return: None """ + # TODO: add retry mechanism while True: while self.orcid_rate_limit_reached(): self.logger.debug('🛑 Request is limited') @@ -112,7 +113,9 @@ def run(self) -> None: if endpoint == "search": try: res = self.execute_search(params) + self.logger.debug(res) res["id"] = k + if res.get("status") == "error" or params.get('raw') is True: self.redis_store.set(k+"_output", json.dumps(res)) else: @@ -150,13 +153,31 @@ def execute_search(self, params: dict) -> dict: orcid_id = params.get("orcid") try: orcid = Orcid(orcid_id=orcid_id, orcid_access_token=self.access_token, state = "public", sandbox=self.sandbox) + except Exception as e: + self.logger.error(e) + res = {} + res["params"] = params + res["status"] = "error" + res["reason"] = ["invalid orcid id"] + self.logger.debug( + f"ORCID {orcid_id} is invalid." + ) + return res + + try: author_info = extract_author_info(orcid) metadata = retrieve_full_works_metadata(orcid) + self.logger.debug(f"metadata retrieved and length is: {len(metadata)}") + self.logger.debug(metadata) + if len(metadata) == 0: res = {} res["params"] = params res["status"] = "error" res["reason"] = ["not enough results for orcid"] + self.logger.debug( + f"ORCID {orcid_id} has no works metadata." + ) return res #metadata = mark_duplicates(metadata) #metadata = filter_duplicates(metadata) @@ -191,6 +212,27 @@ def execute_search(self, params: dict) -> dict: res["reason"] = ["unexpected data processing error"] return res + +# TODO: the following functions should be moved to a separate module +def get_nested_value(data, keys, default=None): + """ + Recursively retrieves a nested value from a dictionary. 
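+    For example (illustrative): get_nested_value({"a": {"b": 1}}, ["a", "b"])
+    returns 1, while get_nested_value({"a": {}}, ["a", "b"], "") falls back to "".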
+ + :param data: Dictionary to retrieve the value from + :param keys: List of keys to follow in the dictionary + :param default: Default value to return if any key is not found + :return: The retrieved value or the default value + """ + for key in keys: + try: + data = data.get(key) + if data is None: + return default + except AttributeError: + return default + return data + + @error_logging_aspect(log_level=logging.ERROR) def extract_author_info(orcid: Orcid) -> dict: """ @@ -278,22 +320,18 @@ def extract_websites(orcid: Orcid) -> list: return [] def get_short_description(work) -> str: - try: - short_description = work["short-description"] - except KeyError: - short_description = "" - return short_description + return get_nested_value(work, ["short-description"], "") def get_authors(work) -> str: - try: - contributors = work["contributors"] - contributors = contributors.get("contributor", {}) if contributors else [] + try: + contributors = get_nested_value(work, ["contributors", "contributor"], []) authors = [] for contributor in contributors: - author = contributor.get("credit-name", {}).get("value") + author = get_nested_value(contributor, ["credit-name", "value"], None) + if author: authors.append(author) @@ -305,53 +343,30 @@ def get_authors(work) -> str: def get_subjects(work) -> str: - try: - subjects = work["subject"] - except KeyError: - subjects = "" - return subjects + return get_nested_value(work, ["subject"], "") def get_title(work) -> str: - try: - title = work.get("title", {}) - value = title.get("title", {}).get("value", "") - except AttributeError: - value = "" - return value + return get_nested_value(work, ["title", "title", "value"], "") def get_subtitle(work) -> str: - try: - title = work.get("title", {}) - subtitle = title.get("subtitle", {}).get("value", "") - except AttributeError: - subtitle = "" - return subtitle + return get_nested_value(work, ["title", "subtitle", "value"], "") def get_paper_abstract(work) -> str: - try: - paper_abstract = work["short-description"] - except KeyError: - paper_abstract = "" - return paper_abstract + return get_nested_value(work, ["short-description"], "") def get_resulttype(work) -> str: - try: - resulttype = work["type"] - except KeyError: - resulttype = "" - return resulttype + return get_nested_value(work, ["type"], "") def published_in(work) -> str: - try: - published_in = work["journal-title"]["value"] - except Exception: - published_in = "" - return published_in + return get_nested_value(work, ["journal-title", "value"], "") + +def get_put_code(work) -> str: + return get_nested_value(work, ["put-code"], "") @error_logging_aspect(log_level=logging.ERROR) def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: @@ -370,8 +385,11 @@ def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: new_works_data = pd.DataFrame() + if works_data.empty: + return new_works_data + # Perform transformations and store in new DataFrame - new_works_data["id"] = works_data["put-code"].astype(str) + new_works_data["id"] = works_data.apply(get_put_code, axis=1) new_works_data["title"] = works_data.apply(get_title, axis=1) new_works_data["subtitle"] = works_data.apply(get_subtitle, axis=1) new_works_data["authors"] = works_data.apply(get_authors, axis=1) diff --git a/server/workers/tests/Dockerfile_tests b/server/workers/tests/Dockerfile_tests index 2e17dc03d..3ec2fd241 100644 --- a/server/workers/tests/Dockerfile_tests +++ b/server/workers/tests/Dockerfile_tests @@ -1,4 +1,4 @@ -FROM python:3.7 +FROM python:3.8 MAINTAINER Chris Kittel 
"christopher.kittel@openknowledgemaps.org" From 132ab1b095ece4496963e77c0dd51685a4d8e7b9 Mon Sep 17 00:00:00 2001 From: chreman Date: Wed, 24 Jul 2024 10:50:36 +0200 Subject: [PATCH 37/75] minor README update --- server/workers/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/workers/README.md b/server/workers/README.md index c0c42fbca..82b521e2e 100644 --- a/server/workers/README.md +++ b/server/workers/README.md @@ -79,11 +79,11 @@ PostgreSQL service: * Manual database creation for Postgres: -Enter container: `docker exec -it VARYINGNAME_db_1 psql -U headstart` +Enter database container as postgres user: `docker exec -it VARYINGNAME_db_1 psql -U headstart` Execute command: `CREATE DATABASE databasename;` -Exit the container and re-enter it as normal user: `docker exec -it VARYINGNAME_persistence_1 /bin/bash` +Exit the database container and enter persistence container as normal user: `docker exec -it VARYINGNAME_persistence_1 /bin/bash` Execute command: `python manage.py` From fa5ad312360eb3bd57544e7d42ab2934b2ee5455 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Wed, 24 Jul 2024 10:58:08 +0200 Subject: [PATCH 38/75] improve orcid error handling --- server/services/search.php | 3 ++- server/workers/orcid/src/orcid.py | 29 +++++++++++------------------ 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/server/services/search.php b/server/services/search.php index 325f1b976..c3e6f0807 100644 --- a/server/services/search.php +++ b/server/services/search.php @@ -53,7 +53,6 @@ function cleanQuery($dirty_query, $transform_query_tolowercase, $add_slashes) { return $query; } - function search($service, $dirty_query , $post_params, $param_types , $transform_query_tolowercase = true @@ -100,6 +99,7 @@ function search($service, $dirty_query $unique_id = ($precomputed_id === null)?($unique_id):($precomputed_id); $post_params["vis_id"] = $unique_id; if (array_key_exists("repo", $post_params)) { + // ? what it is? $payload = json_encode(array("repo" => $post_params["repo"])); $res = $apiclient->call_api($service . "/contentproviders", $payload); $res = $res["result"]; @@ -119,6 +119,7 @@ function search($service, $dirty_query } } + // ? here we are making requests, should we restructure a little bit our requests to handle errors better? $payload = json_encode($post_params); $res = $apiclient->call_api($service . 
"/search", $payload); if ($res["httpcode"] != 200) { diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 43ef89cf5..e5744ab3a 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -3,16 +3,14 @@ import json import pandas as pd import logging -from datetime import timedelta from dateutil.parser import parse from common.decorators import error_logging_aspect from common.deduplication import find_duplicate_indexes,\ - prioritize_OA_and_latest, mark_latest_doi, mark_duplicates -import re + prioritize_OA_and_latest, mark_latest_doi from redis.exceptions import LockError import time import numpy as np -from pyorcid import OrcidAuthentication, Orcid +from pyorcid import OrcidAuthentication, Orcid, errors as pyorcid_errors from typing import Tuple import requests @@ -151,18 +149,7 @@ def execute_search(self, params: dict) -> dict: data = {} data["params"] = params orcid_id = params.get("orcid") - try: - orcid = Orcid(orcid_id=orcid_id, orcid_access_token=self.access_token, state = "public", sandbox=self.sandbox) - except Exception as e: - self.logger.error(e) - res = {} - res["params"] = params - res["status"] = "error" - res["reason"] = ["invalid orcid id"] - self.logger.debug( - f"ORCID {orcid_id} is invalid." - ) - return res + orcid = Orcid(orcid_id=orcid_id, orcid_access_token=self.access_token, state = "public", sandbox=self.sandbox) try: author_info = extract_author_info(orcid) @@ -196,7 +183,11 @@ def execute_search(self, params: dict) -> dict: params.update(author_info) res["params"] = params return res - except ValueError as e: + except ( + pyorcid_errors.Forbidden, + pyorcid_errors.NotFound, + pyorcid_errors.BadRequest, + ) as e: self.logger.error(e) self.logger.error(params) res = {} @@ -204,8 +195,10 @@ def execute_search(self, params: dict) -> dict: res["status"] = "error" res["reason"] = ["invalid orcid id"] return res - except Exception as e: + # Unauthorized also should be internal server error, because we do not use client's credentials + except (pyorcid_errors.Unauthorized, Exception) as e: self.logger.error(e) + self.logger.error(params) res = {} res["params"] = params res["status"] = "error" From 9b908c22ec8dbf7a842144292fafbe2a5cb69213 Mon Sep 17 00:00:00 2001 From: chreman Date: Fri, 26 Jul 2024 18:21:50 +0200 Subject: [PATCH 39/75] get additional metadata for ORCID work URLs --- server/workers/orcid/src/orcid.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 6bc658755..e3fc22ecf 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -362,6 +362,25 @@ def published_in(work) -> str: def get_put_code(work) -> str: return get_nested_value(work, ["put-code"], "") +def get_url(work) -> str: + url = get_nested_value(work, ["url", "value"], "") + if url == "": + ids = get_nested_value(work, ["external-ids", "external-id"], "") + if ids: + for id in ids: + if id["external-id-value"].startswith("http"): + url = id["external-id-value"] + break + return url + +def get_link(work) -> str: + url = get_nested_value(work, ["url", "value"], "") + if url.lower().endswith(".pdf"): + link = url + else: + link = "" + return link + @error_logging_aspect(log_level=logging.ERROR) def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: """ @@ -395,6 +414,8 @@ def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: new_works_data["oa_state"] = 2 new_works_data["subject"] = "" # 
this needs to come from BASE enrichment new_works_data["citation_count"] = np.random.randint(0, 100, size=len(works_data)) + new_works_data["url"] = works_data.apply(get_url, axis=1) + new_works_data["link"] = works_data.apply(get_link, axis=1) return new_works_data From 7fca25254c2e6fca0814c6d72c29238b48b5a380 Mon Sep 17 00:00:00 2001 From: chreman Date: Mon, 29 Jul 2024 16:23:55 +0200 Subject: [PATCH 40/75] works id bugfix --- server/workers/orcid/src/orcid.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index e3fc22ecf..48c648100 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -402,7 +402,7 @@ def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: return new_works_data # Perform transformations and store in new DataFrame - new_works_data["id"] = works_data.apply(get_put_code, axis=1) + new_works_data["id"] = works_data.apply(get_put_code, axis=1).astype(str) new_works_data["title"] = works_data.apply(get_title, axis=1) new_works_data["subtitle"] = works_data.apply(get_subtitle, axis=1) new_works_data["authors"] = works_data.apply(get_authors, axis=1) @@ -413,7 +413,6 @@ def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: new_works_data["doi"] = works_data.apply(extract_dois, axis=1) new_works_data["oa_state"] = 2 new_works_data["subject"] = "" # this needs to come from BASE enrichment - new_works_data["citation_count"] = np.random.randint(0, 100, size=len(works_data)) new_works_data["url"] = works_data.apply(get_url, axis=1) new_works_data["link"] = works_data.apply(get_link, axis=1) From bd39d37bfe66c89dadcc4f7a4c1169046239f360 Mon Sep 17 00:00:00 2001 From: chreman Date: Mon, 29 Jul 2024 16:24:08 +0200 Subject: [PATCH 41/75] readme update --- server/workers/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/workers/README.md b/server/workers/README.md index 82b521e2e..c0c42fbca 100644 --- a/server/workers/README.md +++ b/server/workers/README.md @@ -79,11 +79,11 @@ PostgreSQL service: * Manual database creation for Postgres: -Enter database container as postgres user: `docker exec -it VARYINGNAME_db_1 psql -U headstart` +Enter container: `docker exec -it VARYINGNAME_db_1 psql -U headstart` Execute command: `CREATE DATABASE databasename;` -Exit the database container and enter persistence container as normal user: `docker exec -it VARYINGNAME_persistence_1 /bin/bash` +Exit the container and re-enter it as normal user: `docker exec -it VARYINGNAME_persistence_1 /bin/bash` Execute command: `python manage.py` From b7453640d49b78f738e42ddb14d2b233c5112700 Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 30 Jul 2024 13:48:19 +0200 Subject: [PATCH 42/75] orcid PDF OA state workaround --- server/workers/orcid/src/orcid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 48c648100..4dc064c23 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -411,10 +411,10 @@ def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: new_works_data["published_in"] = works_data.apply(published_in, axis=1) new_works_data["resulttype"] = works_data.apply(get_resulttype, axis=1).map(lambda x: doc_type_mapping.get(x, "")) new_works_data["doi"] = works_data.apply(extract_dois, axis=1) - new_works_data["oa_state"] = 2 new_works_data["subject"] = "" # this needs to come from BASE enrichment 
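+    # PDF-based open-access workaround: get_link() keeps a work's URL only when it
+    # ends in ".pdf", so oa_state below is set to 1 (open) when such a link exists
+    # and to 2 otherwise.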
new_works_data["url"] = works_data.apply(get_url, axis=1) new_works_data["link"] = works_data.apply(get_link, axis=1) + new_works_data["oa_state"] = new_works_data.link.map(lambda x: 1 if x else 2) return new_works_data From 9f87e5a136b8943a2bc06a9ad35bfaa8ff2d1311 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Tue, 30 Jul 2024 17:02:54 +0200 Subject: [PATCH 43/75] fix: orcid publication date parsing --- server/services/searchORCID.php | 24 ++++++++++++++-------- server/workers/orcid/src/orcid.py | 34 +++++++++++++++---------------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/server/services/searchORCID.php b/server/services/searchORCID.php index edfbacf07..aca7f0f5a 100644 --- a/server/services/searchORCID.php +++ b/server/services/searchORCID.php @@ -7,8 +7,9 @@ use headstart\library; -$dirty_query = library\CommUtils::getParameter($_POST, "orcid"); -$precomputed_id = (isset($_POST["unique_id"]))?($_POST["unique_id"]):(null); +// trim ORCID query +$dirty_query = trim(library\CommUtils::getParameter($_POST, "orcid")); +$precomputed_id = $_POST["unique_id"] ?? null; $params_array = array("orcid", "today"); $optional_get_params = array("limit"); @@ -28,12 +29,19 @@ function filterEmptyString($value) $post_params = $_POST; -$result = search("orcid", $dirty_query - , $post_params, $params_array - , true - # TODO: set back to true before deployment - , false, null - , $precomputed_id, false); +$result = search( + "orcid", + $dirty_query, + $post_params, + $params_array, + true, + // TODO: set back to true before deployment + false, + null, + $precomputed_id, + false +); + echo $result ?> diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index e5744ab3a..85272a03c 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -422,30 +422,30 @@ def extract_dois(work: pd.DataFrame) -> str: @error_logging_aspect(log_level=logging.WARNING) def get_publication_date(work) -> str: try: - year = work["publication-date"]["year"]["value"] - except Exception: - year = np.NaN + year = get_nested_value(work, ["publication-date", "year", "value"], np.nan) + except KeyError: + year = np.nan try: - month = work["publication-date"]["month"]["value"] - except Exception: - month = np.NaN + month = get_nested_value(work, ["publication-date", "month", "value"], np.nan) + except KeyError: + month = np.nan try: - day = work["publication-date"]["day"]["value"] - except Exception: - day = np.NaN + day = get_nested_value(work, ["publication-date", "day", "value"], np.nan) + except KeyError: + day = np.nan publication_date = "" parsed_publication_date = publication_date - if year is not np.NaN: - publication_date+=str(int(year)) + if year is not np.nan: + publication_date += str(int(year)) parsed_publication_date = publication_date - if month is not np.NaN: - publication_date+=("-"+str(int(month))) + if month is not np.nan: + publication_date += "-" + str(int(month)) date_obj = parse(publication_date) - parsed_publication_date = date_obj.strftime('%Y-%m') - if day is not np.NaN: - publication_date+=("-"+str(int(day))) + parsed_publication_date = date_obj.strftime("%Y-%m") + if day is not np.nan: + publication_date += "-" + str(int(day)) date_obj = parse(publication_date) - parsed_publication_date = date_obj.strftime('%Y-%m-%d') + parsed_publication_date = date_obj.strftime("%Y-%m-%d") return parsed_publication_date @error_logging_aspect(log_level=logging.ERROR) From aaa796969748744d1c62d0c5b16c72c964e4d0f1 Mon Sep 17 00:00:00 2001 
From: Krutilin Sergey Date: Thu, 1 Aug 2024 11:14:37 +0200 Subject: [PATCH 44/75] fix: get_publication_date for orcid --- server/workers/orcid/src/orcid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py index 99c766065..142441f6f 100644 --- a/server/workers/orcid/src/orcid.py +++ b/server/workers/orcid/src/orcid.py @@ -459,7 +459,7 @@ def get_publication_date(work) -> str: if year is not np.nan: publication_date += str(int(year)) parsed_publication_date = publication_date - if month is not np.nan: + if month is not np.nan and month != "00": publication_date += "-" + str(int(month)) date_obj = parse(publication_date) parsed_publication_date = date_obj.strftime("%Y-%m") From 01510c499eacb87c03ddda3de5ff0ff3f48fdb10 Mon Sep 17 00:00:00 2001 From: modsen-hedgehog <111496612+modsen-hedgehog@users.noreply.github.com> Date: Mon, 2 Sep 2024 17:18:57 +0300 Subject: [PATCH 45/75] Visconnect prototype metrics (#768) * wip * orcid PDF OA state workaround * metrics wip * cleanup * restructuring of common.utils * metrics for orcid wip * code restructuring * metrics enrichment for orcid * calculate aggregated author metrics * fix metrics datatype * bugfix * bugfix * add metrics runner in R * new backend data structure for author metadata * feat: orcid metracs integration (#765) * Visconnect prototype metrics updates (#766) * feat: orcid metracs integration * feat: refactor scale map * feat: refactor orcid client * refactor: metrics and base workers * feat: add orcid integration tests * feat: configure testing environment * fix: data transformation and orcid authentication * refactor: orcid client integration * refactor: orcid client integration * minor: logging * minor: logging * feat: fix academic age calculation * feat: resolve comments after review * feat: improve orcid structure * feat: improve orcid structure * refactor and improve data model --------- Co-authored-by: chreman --- .docker.test.env | 26 + .flake8 | 5 + .gitignore | 11 + docker-compose-dataworker.yml | 19 - docker-compose-end2endtest.yml | 320 +++++++-- docker-compose-phptest.yml | 3 - docker-compose.yml | 37 +- local_dev/dev.env | 24 + local_dev/proxy/docker-compose.yml | 2 - local_dev/searchflow-container/Dockerfile | 2 +- pyproject.toml | 25 + requirements-dev.txt | 6 + .../classes/headstart/library/APIClient.php | 6 +- .../preprocessing/other-scripts/altmetrics.R | 80 --- server/preprocessing/other-scripts/metrics.R | 79 +++ .../preprocessing/other-scripts/run_metrics.R | 72 ++ .../other-scripts/test/params_base.json | 7 +- .../other-scripts/test/test_openaire.R | 4 +- .../other-scripts/text_similarity.R | 4 +- server/workers/api/.dockerignore | 11 + server/workers/api/Dockerfile | 21 +- server/workers/api/pyproject.toml | 4 + server/workers/api/requirements-e.txt | 2 + server/workers/api/requirements.txt | 8 +- server/workers/api/src/apis/base.py | 43 +- server/workers/api/src/apis/create_vis.py | 45 +- server/workers/api/src/apis/openaire.py | 70 +- server/workers/api/src/apis/orcid.py | 2 +- server/workers/api/src/apis/pubmed.py | 58 +- .../api/src/apis/request_validators.py | 13 +- server/workers/api/src/app.py | 33 +- server/workers/api/test.sh | 12 + .../workers/{common => api/tests}/__init__.py | 0 .../{orcid => api/tests/e2e}/__init__.py | 0 server/workers/api/tests/e2e/conftest.py | 11 + server/workers/api/tests/e2e/test_base.py | 10 + server/workers/api/tests/e2e/test_openaire.py | 10 + server/workers/api/tests/e2e/test_orcid.py | 10 
+ server/workers/api/tests/e2e/test_pubmed.py | 10 + server/workers/api/tests/mock_app.py | 59 ++ .../tests/test_data/digital-education.json | 10 + server/workers/base/Dockerfile | 9 +- server/workers/base/pyproject.toml | 4 + server/workers/base/requirements-e.txt | 2 + server/workers/base/src/base.py | 34 +- server/workers/common/common/__init__.py | 0 .../common}/contentproviders.json | 0 .../workers/common/{ => common}/decorators.py | 0 .../common/{ => common}/deduplication.py | 7 +- server/workers/common/common/proxy.py | 31 + .../workers/common/{ => common}/r_wrapper.py | 1 - server/workers/common/common/rate_limiter.py | 39 + .../{api/src/apis => common/common}/utils.py | 24 +- server/workers/common/pyproject.toml | 14 + server/workers/common/setup.py | 7 + server/workers/dataprocessing/.dockerignore | 10 + server/workers/dataprocessing/Dockerfile | 7 +- .../workers/dataprocessing/requirements-e.txt | 2 + .../workers/dataprocessing/src/headstart.py | 10 +- server/workers/metrics/.dockerignore | 10 + server/workers/metrics/Dockerfile | 170 +++++ server/workers/metrics/__init__.py | 0 server/workers/metrics/activate.R | 668 ++++++++++++++++++ server/workers/metrics/dependencies.R | 7 + server/workers/metrics/example_metrics.env | 8 + server/workers/metrics/renv.lock | 559 +++++++++++++++ server/workers/metrics/requirements-e.txt | 2 + server/workers/metrics/requirements.txt | 21 + .../run_orcid.py => metrics/run_metrics.py} | 17 +- server/workers/metrics/src/__init__.py | 0 server/workers/metrics/src/metrics.py | 108 +++ server/workers/openaire/.dockerignore | 10 + server/workers/openaire/Dockerfile | 16 +- server/workers/openaire/requirements-e.txt | 2 + server/workers/orcid/Dockerfile | 17 +- server/workers/orcid/pyproject.toml | 8 + server/workers/orcid/requirements-e.txt | 2 + server/workers/orcid/requirements.txt | 7 +- server/workers/orcid/src/config.py | 43 ++ server/workers/orcid/src/main.py | 52 ++ server/workers/orcid/src/model.py | 69 ++ server/workers/orcid/src/orcid.py | 618 ---------------- server/workers/orcid/src/orcid_service.py | 225 ++++++ .../orcid/src/repositories/__init__.py | 0 .../orcid/src/repositories/author_info.py | 130 ++++ .../workers/orcid/src/repositories/works.py | 186 +++++ server/workers/orcid/src/worker.py | 64 ++ server/workers/orcid/tests/__init__.py | 0 server/workers/orcid/tests/unit/__init__.py | 0 .../orcid/tests/unit/orcid_addresses.json | 116 +++ .../tests/unit/orcid_researcher_urls.json | 38 + .../workers/orcid/tests/unit/orcid_work.json | 172 +++++ .../orcid/tests/unit/personal_details.json | 30 + .../orcid/tests/unit/test_transform.py | 64 ++ server/workers/persistence/.dockerignore | 10 + server/workers/persistence/Dockerfile | 10 +- server/workers/persistence/__init__.py | 0 server/workers/persistence/requirements-e.txt | 2 + server/workers/persistence/src/app.py | 33 +- server/workers/persistence/src/database.py | 76 +- server/workers/pubmed/.dockerignore | 10 + server/workers/pubmed/Dockerfile | 10 +- server/workers/pubmed/requirements-e.txt | 2 + ...{Dockerfile_backend => Dockerfile.backend} | 0 .../{Dockerfile_tests => Dockerfile.tests} | 3 +- server/workers/tests/README.md | 4 +- server/workers/tests/mock_app.py | 26 +- server/workers/tests/test_end2end.py | 257 +++---- server/workers/tests/test_orcid.py | 183 +++++ vis/js/HeadstartRunner.js | 3 +- vis/js/actions/index.js | 7 +- vis/js/components/ContextLine.js | 136 ++-- vis/js/components/KnowledgeMap.js | 1 - vis/js/components/Modals.js | 2 + vis/js/components/Toolbar.js | 5 +- 
vis/js/dataprocessing/managers/DataManager.js | 87 ++- vis/js/default-config.js | 235 +----- vis/js/reducers/author.js | 29 + vis/js/reducers/index.js | 2 + vis/js/reducers/modals.js | 12 + .../contextfeatures/ResearcherInfo.jsx | 42 ++ vis/js/templates/listentry/OrcidMetrics.jsx | 62 ++ .../templates/listentry/StandardListEntry.jsx | 17 +- vis/js/templates/modals/InfoModal.jsx | 1 - .../templates/modals/ResearcherInfoModal.jsx | 65 ++ .../researcher-modal/OrcidResearcherInfo.jsx | 77 ++ vis/js/utils/data.js | 4 +- vis/stylesheets/modules/_map.scss | 2 +- vis/stylesheets/modules/list/_entry.scss | 2 +- vis/stylesheets/modules/list/_header.scss | 2 +- vis/stylesheets/modules/map/_header.scss | 5 + 131 files changed, 4730 insertions(+), 1508 deletions(-) create mode 100644 .docker.test.env create mode 100644 .flake8 delete mode 100644 docker-compose-dataworker.yml create mode 100644 local_dev/dev.env create mode 100644 pyproject.toml create mode 100644 requirements-dev.txt delete mode 100644 server/preprocessing/other-scripts/altmetrics.R create mode 100644 server/preprocessing/other-scripts/metrics.R create mode 100644 server/preprocessing/other-scripts/run_metrics.R create mode 100644 server/workers/api/.dockerignore create mode 100644 server/workers/api/pyproject.toml create mode 100644 server/workers/api/requirements-e.txt create mode 100755 server/workers/api/test.sh rename server/workers/{common => api/tests}/__init__.py (100%) rename server/workers/{orcid => api/tests/e2e}/__init__.py (100%) create mode 100644 server/workers/api/tests/e2e/conftest.py create mode 100644 server/workers/api/tests/e2e/test_base.py create mode 100644 server/workers/api/tests/e2e/test_openaire.py create mode 100644 server/workers/api/tests/e2e/test_orcid.py create mode 100644 server/workers/api/tests/e2e/test_pubmed.py create mode 100644 server/workers/api/tests/mock_app.py create mode 100644 server/workers/api/tests/test_data/digital-education.json create mode 100644 server/workers/base/pyproject.toml create mode 100644 server/workers/base/requirements-e.txt create mode 100644 server/workers/common/common/__init__.py rename server/workers/{api/src/apis => common/common}/contentproviders.json (100%) rename server/workers/common/{ => common}/decorators.py (100%) rename server/workers/common/{ => common}/deduplication.py (97%) create mode 100644 server/workers/common/common/proxy.py rename server/workers/common/{ => common}/r_wrapper.py (99%) create mode 100644 server/workers/common/common/rate_limiter.py rename server/workers/{api/src/apis => common/common}/utils.py (83%) create mode 100644 server/workers/common/pyproject.toml create mode 100644 server/workers/common/setup.py create mode 100644 server/workers/dataprocessing/.dockerignore create mode 100644 server/workers/dataprocessing/requirements-e.txt create mode 100644 server/workers/metrics/.dockerignore create mode 100644 server/workers/metrics/Dockerfile create mode 100644 server/workers/metrics/__init__.py create mode 100644 server/workers/metrics/activate.R create mode 100644 server/workers/metrics/dependencies.R create mode 100644 server/workers/metrics/example_metrics.env create mode 100644 server/workers/metrics/renv.lock create mode 100644 server/workers/metrics/requirements-e.txt create mode 100644 server/workers/metrics/requirements.txt rename server/workers/{orcid/run_orcid.py => metrics/run_metrics.py} (51%) create mode 100644 server/workers/metrics/src/__init__.py create mode 100644 server/workers/metrics/src/metrics.py create mode 100644 
server/workers/openaire/.dockerignore create mode 100644 server/workers/openaire/requirements-e.txt create mode 100644 server/workers/orcid/pyproject.toml create mode 100644 server/workers/orcid/requirements-e.txt create mode 100644 server/workers/orcid/src/config.py create mode 100644 server/workers/orcid/src/main.py create mode 100644 server/workers/orcid/src/model.py delete mode 100644 server/workers/orcid/src/orcid.py create mode 100644 server/workers/orcid/src/orcid_service.py create mode 100644 server/workers/orcid/src/repositories/__init__.py create mode 100644 server/workers/orcid/src/repositories/author_info.py create mode 100644 server/workers/orcid/src/repositories/works.py create mode 100644 server/workers/orcid/src/worker.py create mode 100644 server/workers/orcid/tests/__init__.py create mode 100644 server/workers/orcid/tests/unit/__init__.py create mode 100644 server/workers/orcid/tests/unit/orcid_addresses.json create mode 100644 server/workers/orcid/tests/unit/orcid_researcher_urls.json create mode 100644 server/workers/orcid/tests/unit/orcid_work.json create mode 100644 server/workers/orcid/tests/unit/personal_details.json create mode 100644 server/workers/orcid/tests/unit/test_transform.py create mode 100644 server/workers/persistence/.dockerignore create mode 100644 server/workers/persistence/__init__.py create mode 100644 server/workers/persistence/requirements-e.txt create mode 100644 server/workers/pubmed/.dockerignore create mode 100644 server/workers/pubmed/requirements-e.txt rename server/workers/tests/{Dockerfile_backend => Dockerfile.backend} (100%) rename server/workers/tests/{Dockerfile_tests => Dockerfile.tests} (79%) create mode 100644 server/workers/tests/test_orcid.py create mode 100644 vis/js/reducers/author.js create mode 100644 vis/js/templates/contextfeatures/ResearcherInfo.jsx create mode 100644 vis/js/templates/listentry/OrcidMetrics.jsx create mode 100644 vis/js/templates/modals/ResearcherInfoModal.jsx create mode 100644 vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx diff --git a/.docker.test.env b/.docker.test.env new file mode 100644 index 000000000..c6fb25bec --- /dev/null +++ b/.docker.test.env @@ -0,0 +1,26 @@ +# APP +FLASK_ENV=development +BEHIND_PROXY=False +SERVICE_VERSION=7d60186f594d420f82901f0514eb3c7e6b6e62d5 +LOGLEVEL=DEBUG +LOGFILE="/var/log/headstart/headstart.log" + +# DB +POSTGRES_USER=testuser +POSTGRES_PASSWORD=postgres +POSTGRES_HOST=db_server +POSTGRES_PORT=5434 +POSTGRES_DATABASE=testdb + +# REDIS +REDIS_HOST=redis-1 +REDIS_PORT=6355 +REDIS_PASSWORD=testredispassword +REDIS_DB=0 + +# R +R_BASE_APIKEY=5812aa4367eb1dc3d366d99fdaaef0e3 + +# ORCID +ORCID_CLIENT_ID="APP-DL8ZAR72EZWW15NX" +ORCID_CLIENT_SECRET="a3389b1a-19c0-4f78-857a-4aba19f5fa46" \ No newline at end of file diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..3867c3f86 --- /dev/null +++ b/.flake8 @@ -0,0 +1,5 @@ +[flake8] +max-line-length = 100 +# required for compatibility with Black: +extend-ignore = E203 +exclude = .venv \ No newline at end of file diff --git a/.gitignore b/.gitignore index bb2f2ffb5..e5b535348 100644 --- a/.gitignore +++ b/.gitignore @@ -63,4 +63,15 @@ server/preprocessing/other-scripts/renv /nbproject/private/ /server/nbproject/private/ .pytest_cache +*.egg-info/ *__pycache__* + +# Mac files +.DS_Store + +# personal +.vscode/settings.json +# temporal +local_dev/config_local_headstart.ini +local_dev/config_local_searchflow.ini + diff --git a/docker-compose-dataworker.yml b/docker-compose-dataworker.yml deleted 
file mode 100644 index 9ca1e3804..000000000 --- a/docker-compose-dataworker.yml +++ /dev/null @@ -1,19 +0,0 @@ -version: '3.7' - -services: - - dataprocessing: - image: dataprocessing:${SERVICE_VERSION} - env_file: - - server/workers/dataprocessing/dataprocessing.env - environment: - SERVICE_VERSION: "${SERVICE_VERSION}" - REDIS_HOST: "${REDIS_HOST}" - REDIS_PORT: "${REDIS_PORT}" - REDIS_DB: "${REDIS_DB}" - REDIS_PASSWORD: "${REDIS_PASSWORD}" - restart: always - volumes: - - /opt/local/renv/cache:/renv/cache - - /var/log/headstart:/var/log/headstart - network_mode: host \ No newline at end of file diff --git a/docker-compose-end2endtest.yml b/docker-compose-end2endtest.yml index f6d07a283..891230de3 100644 --- a/docker-compose-end2endtest.yml +++ b/docker-compose-end2endtest.yml @@ -1,23 +1,22 @@ -# docker-compose-end2endtest.yml - -version: '3.7' services: end2endtest: build: context: ./server/workers/tests - dockerfile: ./Dockerfile_tests + dockerfile: ./Dockerfile.tests container_name: end2endtest hostname: "end2endtest" environment: - POSTGRES_USER: "testuser" - POSTGRES_PASSWORD: "testpassword" - POSTGRES_HOST: "test_db" - POSTGRES_PORT: 5432 - DEFAULT_DATABASE: "testdb" + POSTGRES_USER: ${POSTGRES_USER} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_HOST: ${POSTGRES_HOST} + POSTGRES_PORT: ${POSTGRES_PORT} + POSTGRES_HOSTS: ${POSTGRES_HOST} + POSTGRES_PORTS: ${POSTGRES_PORT} + DEFAULT_DATABASE: ${POSTGRES_DATABASE} SERVICE_VERSION: "test_version" ports: - - "0.0.0.0:5000:5000" + - "7575:80" volumes: - ./server/:/app depends_on: @@ -28,40 +27,72 @@ services: networks: - test - backend: - container_name: backend - hostname: "backend" - build: - context: ./server/workers/tests - dockerfile: ./Dockerfile_backend - volumes: - - ./server/:/var/www/html/server - - ./server/workers/tests/test_data/test.sqlite:/var/www/localstorage/test.sqlite + db: + container_name: ${POSTGRES_HOST} + image: 'postgres:12.2-alpine' restart: "no" + hostname: ${POSTGRES_HOST} + environment: + POSTGRES_USER: ${POSTGRES_USER} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_PORT: ${POSTGRES_PORT} + DEFAULT_DATABASE: ${POSTGRES_DATABASE} + command: postgres -c config_file=/etc/postgresql.conf -c hba_file=/etc/pg_hba.conf + volumes: + - ./server/workers/tests/test_data/pg_hba_test_local.conf:/etc/pg_hba.conf + - ./server/workers/tests/test_data/postgresql_test_local.conf:/etc/postgresql.conf + ports: + - "${POSTGRES_PORT}:${POSTGRES_PORT}" networks: - test - ports: - - "80:80" + + redis: + image: 'redis:6.0-alpine' + restart: unless-stopped + hostname: "${REDIS_HOST}" + # container_name: "${REDIS_HOST}" + environment: + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" + command: ["redis-server", "/etc/redis/redis.conf", "--bind", "${REDIS_HOST}", "--port", "${REDIS_PORT}"] + volumes: + - 'redis:/var/lib/redis/data' + - ./local_dev/redis.conf:/etc/redis/redis.conf + ports: + - "127.0.0.1:${REDIS_PORT}:${REDIS_PORT}" + networks: + - test api: build: - context: server - dockerfile: workers/api/Dockerfile + context: server/workers + dockerfile: api/Dockerfile + container_name: api + hostname: "api" restart: unless-stopped environment: - SERVICE_VERSION: "test" - BEHIND_PROXY: "false" - DEFAULT_DATABASE: "testdb" - FLASK_ENV: "development" - volumes: - - ./server/workers/tests/mock_app.py:/app/mock_app.py - command: ["python", "mock_app.py"] + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_PASSWORD: 
"${REDIS_PASSWORD}" + REDIS_DB: "${REDIS_DB}" + BEHIND_PROXY: "${BEHIND_PROXY}" + FLASK_ENV: "${FLASK_ENV}" + POSTGRES_USER: "${POSTGRES_USER}" + POSTGRES_PASSWORD: "${POSTGRES_PASSWORD}" + POSTGRES_HOST: "${POSTGRES_HOST}" + DEFAULT_DATABASE: "${POSTGRES_DATABASE}" + command: ["python", "api/tests/mock_app.py"] + depends_on: + - redis + - db networks: - test persistence: - container_name: api - hostname: "test_api" + container_name: persistence + hostname: "persistence" build: context: server dockerfile: workers/persistence/Dockerfile @@ -69,37 +100,230 @@ services: environment: SERVICE_VERSION: "test" BEHIND_PROXY: "false" - DEFAULT_DATABASE: "testdb" FLASK_ENV: "development" + POSTGRES_HOSTS: ${POSTGRES_HOST} + POSTGRES_PORTS: ${POSTGRES_PORT} + POSTGRES_USER: ${POSTGRES_USER} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + DEFAULT_DATABASE: ${POSTGRES_DATABASE} volumes: - - ./server/workers/tests/mock_app.py:/app/mock_app.py + - ./server/workers/tests/mock_app.py:/persistence/mock_app.py command: ["python", "mock_app.py"] networks: - test - db: - container_name: test_db - image: 'postgres:12.2-alpine' + backend: + build: + context: ./server/workers/tests + dockerfile: ./Dockerfile.backend + container_name: backend + hostname: "backend" + volumes: + - ./server/:/var/www/html/server + - ./server/workers/tests/test_data/test.sqlite:/var/www/localstorage/test.sqlite restart: "no" - hostname: "db_server" + networks: + - test + depends_on: + - api + ports: + - "80:80" + + orcid: + build: + context: server + dockerfile: workers/orcid/Dockerfile + restart: unless-stopped environment: - POSTGRES_USER: "testuser" - POSTGRES_PASSWORD: "testpassword" - POSTGRES_HOST: "db_server" - POSTGRES_PORT: 5432 - DEFAULT_DATABASE: "testdb" - command: postgres -c config_file=/etc/postgresql.conf -c hba_file=/etc/pg_hba.conf + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" + LOGLEVEL: "${LOGLEVEL}" + LC_ALL: "en_US.UTF-8" + LANG: "en_US.UTF-8" + PYTHONIOENCODING: "utf-8" + ORCID_CLIENT_ID: "${ORCID_CLIENT_ID}" + ORCID_CLIENT_SECRET: "${ORCID_CLIENT_SECRET}" + depends_on: + - redis + networks: + - test + + dataprocessing: + build: + context: server + dockerfile: workers/dataprocessing/Dockerfile + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" + LOGLEVEL: "${LOGLEVEL}" + LOGFILE: "${LOGFILE}" + RENV_VERSION: 0.14.0-5 + CRAN_REPOS: https://cran.wu.ac.at + LC_ALL: "en_US.UTF-8" + LANG: "en_US.UTF-8" + RENV_PATHS_CACHE: /renv/cache + PYTHONIOENCODING: "utf-8" + restart: unless-stopped volumes: - # - db_data:/var/lib/postgresql/data - - ./server/workers/tests/test_data/pg_hba_test_local.conf:/etc/pg_hba.conf - - ./server/workers/tests/test_data/postgresql_test_local.conf:/etc/postgresql.conf + - ./local_dev/renv/cache:/renv/cache + - /var/log/headstart:/var/log/headstart + - ./server/preprocessing/other-scripts:/headstart/other-scripts + depends_on: + - redis + networks: + - test + + base: + build: + context: server + dockerfile: workers/base/Dockerfile + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" + LOGLEVEL: "${LOGLEVEL}" + LOGFILE: "/var/log/headstart/headstart.log" + RENV_VERSION: 0.14.0-5 + CRAN_REPOS: https://cran.wu.ac.at + LC_ALL: 
"en_US.UTF-8" + LANG: "en_US.UTF-8" + RENV_PATHS_CACHE: /renv/cache + PYTHONIOENCODING: "utf-8" + R_BASE_APIKEY: "${R_BASE_APIKEY}" + restart: unless-stopped + volumes: + - ./local_dev/renv/cache:/renv/cache + - /var/log/headstart:/var/log/headstart + - ./server/preprocessing/other-scripts:/headstart/other-scripts + depends_on: + - redis + networks: + - test + + pubmed: + build: + context: server + dockerfile: workers/pubmed/Dockerfile + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" + LOGLEVEL: "${LOGLEVEL}" + LOGFILE: "${LOGFILE}" + RENV_VERSION: 0.14.0-5 + CRAN_REPOS: https://cran.wu.ac.at + LC_ALL: "en_US.UTF-8" + LANG: "en_US.UTF-8" + RENV_PATHS_CACHE: /renv/cache + PYTHONIOENCODING: "utf-8" + restart: unless-stopped + volumes: + - ./local_dev/renv/cache:/renv/cache + - /var/log/headstart:/var/log/headstart + - ./server/preprocessing/other-scripts:/headstart/other-scripts + depends_on: + - redis + networks: + - test + + openaire: + build: + context: server + dockerfile: workers/openaire/Dockerfile + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" + LOGLEVEL: "${LOGLEVEL}" + LOGFILE: "${LOGFILE}" + RENV_VERSION: 0.14.0-5 + CRAN_REPOS: https://cran.wu.ac.at + LC_ALL: "en_US.UTF-8" + LANG: "en_US.UTF-8" + RENV_PATHS_CACHE: /renv/cache + PYTHONIOENCODING: "utf-8" + restart: unless-stopped + volumes: + - ./local_dev/renv/cache:/renv/cache + - /var/log/headstart:/var/log/headstart + - ./server/preprocessing/other-scripts:/headstart/other-scripts + depends_on: + - redis + networks: + - test + + metrics: + build: + context: server + dockerfile: workers/metrics/Dockerfile + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" + LOGLEVEL: "${LOGLEVEL}" + LOGFILE: "${LOGFILE}" + RENV_VERSION: 0.14.0-5 + CRAN_REPOS: https://cran.wu.ac.at + LC_ALL: "en_US.UTF-8" + LANG: "en_US.UTF-8" + RENV_PATHS_CACHE: /renv/cache + PYTHONIOENCODING: "utf-8" + restart: unless-stopped + volumes: + - ./local_dev/renv/cache:/renv/cache + - /var/log/headstart:/var/log/headstart + - ./server/preprocessing/other-scripts:/headstart/other-scripts + depends_on: + - redis + networks: + - test + + searchflow: + build: local_dev/searchflow-container + volumes: + - ../project-website:/var/www/html + - ./local_dev/config_local_projectwebsite.php:/var/www/html/config_local.php + - ../search-flow/:/var/www/html/search-flow + - ./local_dev/config_local_searchflow.ini:/var/www/html/search-flow/config_local.ini + - ../Headstart:/var/www/html/headstart + - ./local_dev/config_local_headstart.ini:/var/www/html/headstart/server/preprocessing/conf/config_local.ini + - ./local_dev/entrypoint.php:/var/www/html/entrypoint.php ports: - - "5432:5432" + - 127.0.0.1:8085:80 networks: - test + # proxy: + # image: 'nginx' + # volumes: + # - ./templates:/etc/nginx/templates + # environment: + # - NGINX_PORT=80 + # ports: + # - '127.0.0.1:8081:80' + # networks: + # - dev_headstart + volumes: + redis: db_data: + driver: local + api_cache: + driver: local networks: test: diff --git a/docker-compose-phptest.yml b/docker-compose-phptest.yml index 6c423a42d..d6835f546 100644 --- a/docker-compose-phptest.yml +++ b/docker-compose-phptest.yml @@ -1,6 +1,3 @@ -# 
docker-compose-phptest.yml - -version: '3.7' services: composer: image: composer:2.5.8 diff --git a/docker-compose.yml b/docker-compose.yml index 0af7ee29d..cdd88e267 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.7' - services: db: @@ -29,7 +27,7 @@ services: - 'redis:/var/lib/redis/data' - ./local_dev/redis.conf:/etc/redis/redis.conf ports: - - "127.0.0.1:${REDIS_PORT}:6379" + - "127.0.0.1:${REDIS_PORT}:${REDIS_PORT}" networks: - headstart @@ -47,15 +45,16 @@ services: BEHIND_PROXY: "${BEHIND_PROXY}" DEFAULT_DATABASE: "${DEFAULT_DATABASE}" FLASK_ENV: "${FLASK_ENV}" - command: ["python", "app.py"] volumes: - ./api_cache:/var/api_cache - - ./server/workers/api/src:/api + - ./server/workers/api/src:/app/api/src + - ./server/workers/common:/app/common depends_on: - redis - base - pubmed - openaire + - orcid networks: - headstart @@ -215,6 +214,34 @@ services: networks: - headstart + metrics: + build: + context: server + dockerfile: workers/metrics/Dockerfile + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" + LOGLEVEL: "${LOGLEVEL}" + LOGFILE: "${LOGFILE}" + RENV_VERSION: 0.14.0-5 + CRAN_REPOS: https://cran.wu.ac.at + LC_ALL: "en_US.UTF-8" + LANG: "en_US.UTF-8" + RENV_PATHS_CACHE: /renv/cache + PYTHONIOENCODING: "utf-8" + restart: unless-stopped + volumes: + - ./local_dev/renv/cache:/renv/cache + - /var/log/headstart:/var/log/headstart + - ./server/preprocessing/other-scripts:/headstart/other-scripts + depends_on: + - redis + networks: + - headstart + searchflow: build: local_dev/searchflow-container volumes: diff --git a/local_dev/dev.env b/local_dev/dev.env new file mode 100644 index 000000000..5a6371873 --- /dev/null +++ b/local_dev/dev.env @@ -0,0 +1,24 @@ +COMPOSE_PROJECT_NAME=dev +SERVICE_VERSION=7d60186f594d420f82901f0514eb3c7e6b6e62d5 +POSTGRES_DB=postgres +POSTGRES_USER=headstart +POSTGRES_PASSWORD=testpgpassword +POSTGRES_HOSTS=dev-db-1 +POSTGRES_PORTS=5432 +POSTGRES_HOSTNAME=dev-db-1 +API_PORT=5001 +REDIS_HOST=dev-redis-1 +REDIS_PORT=6379 +REDIS_DB=0 +REDIS_PASSWORD=testredispassword +LOGLEVEL=DEBUG +LOGFILE="/var/log/headstart/headstart.log" + +BEHIND_PROXY=True +DEFAULT_DATABASE=dev +FLASK_ENV=development +COMPOSE_HTTP_TIMEOUT=300 + +R_BASE_APIKEY=5812aa4367eb1dc3d366d99fdaaef0e3 +ORCID_CLIENT_ID="APP-DL8ZAR72EZWW15NX" +ORCID_CLIENT_SECRET="a3389b1a-19c0-4f78-857a-4aba19f5fa46" \ No newline at end of file diff --git a/local_dev/proxy/docker-compose.yml b/local_dev/proxy/docker-compose.yml index aeb843da2..95bbf6f28 100644 --- a/local_dev/proxy/docker-compose.yml +++ b/local_dev/proxy/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.7' - services: proxy: diff --git a/local_dev/searchflow-container/Dockerfile b/local_dev/searchflow-container/Dockerfile index 81ecdb00a..6b5c16486 100644 --- a/local_dev/searchflow-container/Dockerfile +++ b/local_dev/searchflow-container/Dockerfile @@ -1,6 +1,6 @@ FROM php:8.0-apache -MAINTAINER Chris Kittel "christopher.kittel@openknowledgemaps.org" +LABEL maintainer="Chris Kittel " RUN a2enmod rewrite RUN apt-get update && apt-get install -y curl libsqlite3-dev php7.4-sqlite libonig-dev libxml2-dev diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..1b89800d7 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.black] +line-length = 100 +target-version 
= ['py38'] + +[tool.pylint."messages control"] +ignore = ["setup.py", "__init__.py"] +disable = "all" +enable = [ + "empty-docstring", + "missing-class-docstring", + "missing-function-docstring", + "missing-module-docstring" +] + +[tool.isort] +profile = "black" +known_first_party = ["mycorp"] # see package configuration below + +[tool.pyright] +reportMissingTypeArgument = true # Report generic classes used without type arguments +strictListInference = true # Use union types when inferring types of lists elements, instead of Any diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..1366d6300 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,6 @@ +black==22.3.0 +flake8==4.0.1 +isort==5.10.1 +pyright==1.1.239 +pytest==7.0.1 +pytest-cov==3.0.0 # Coverage extension \ No newline at end of file diff --git a/server/classes/headstart/library/APIClient.php b/server/classes/headstart/library/APIClient.php index bac1a0482..a49ab3334 100644 --- a/server/classes/headstart/library/APIClient.php +++ b/server/classes/headstart/library/APIClient.php @@ -15,10 +15,8 @@ public function load_configs($ini_array) { $this->settings = $this->ini_array["general"]; $this->database = $this->ini_array["connection"]["database"]; $api_url = $this->ini_array["general"]["api_url"]; - $api_flavor = isset($this->ini_array["general"]["api_flavor"]) - ? ($this->ini_array["general"]["api_flavor"]) - : "stable"; - $this->base_route = $api_url . $api_flavor . "/"; + $api_flavor = $this->ini_array["general"]["api_flavor"] ?? ""; + $this->base_route = $api_url . ($api_flavor ? $api_flavor . "/" : ""); } public function call_api($endpoint, $payload) { diff --git a/server/preprocessing/other-scripts/altmetrics.R b/server/preprocessing/other-scripts/altmetrics.R deleted file mode 100644 index 816031f93..000000000 --- a/server/preprocessing/other-scripts/altmetrics.R +++ /dev/null @@ -1,80 +0,0 @@ -library('rAltmetric') -library('rcrossref') - -alog <- getLogger('altmetrics') - - -enrich_output_json <- function(output_json){ - start.time <- Sys.time() - - output<- fromJSON(output_json) - - results <- get_altmetrics(output$doi) - - if (nrow(results) > 0){ - if (!("cited_by_tweeters_count" %in% names(results))) { - results[["cited_by_tweeters_count"]] = NA - } - if (!("readers.mendeley" %in% names(results))) { - results[["readers.mendeley"]] = NA - } - results <- results[c('doi', 'cited_by_tweeters_count', 'readers.mendeley')] - output <- merge(x = output, y = results, by='doi', all.x=TRUE) - } else { - output$'cited_by_tweeters_count' <- NA - output$'readers.mendeley' <- NA - alog$info("No altmetrics found for any paper in this dataset.") - } - output <- add_citations(output) - - #Remove duplicate lines - TODO: check for root of this problem - output = unique(output) - - output$cited_by_tweeters_count[is.na(output$cited_by_tweeters_count)] <- "N/A" - output$citation_count[is.na(output$citation_count)] <- "N/A" - output$'readers.mendeley'[is.na(output$'readers.mendeley')] <- "N/A" - - output_json <- toJSON(output) - - end.time <- Sys.time() - time.taken <- end.time - start.time - alog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Time taken:", time.taken, sep=" ")) - - return (output_json) -} - -get_altmetrics <- function(dois){ - valid_dois <- which(dois!="") - ids <- list(c(dois[valid_dois])) - results <- data.frame() - for (doi in dois[valid_dois]){ - tryCatch({ - metrics <- altmetric_data(altmetrics(doi=doi)) - results <- rbind.fill(results, metrics) - }, error = function(err){ - 
alog$debug(gsub("[\r\n]", "", paste(err, doi, sep=" "))) - }) - } - return (results) -} - -add_citations <- function(output){ - dois <- output$doi - valid_dois <- which(dois!="") - - # doc_parse_raw exception hotfix - #cit_count = cr_citation_count(doi=dois[valid_dois], async=TRUE) - cit_count = data.frame() - for (doi in dois[valid_dois]){ - cc <- tryCatch({ - cr_citation_count(doi=doi, async=TRUE) - }, error = function(err){ - alog$debug(gsub("[\r\n]", "", paste(err, doi, sep=" "))) - return(list(doi=doi, count=NA)) - }) - cit_count <- rbind(cit_count, cc) - } - output = merge(x=output, y=cit_count, by='doi', all.x = TRUE) - names(output)[names(output)=="count"] <- "citation_count" - return (output) -} diff --git a/server/preprocessing/other-scripts/metrics.R b/server/preprocessing/other-scripts/metrics.R new file mode 100644 index 000000000..119550db6 --- /dev/null +++ b/server/preprocessing/other-scripts/metrics.R @@ -0,0 +1,79 @@ +library('rAltmetric') +library('rcrossref') +library("plyr") + +alog <- getLogger('metrics') + + +enrich_metadata_metrics <- function(metadata){ + start.time <- Sys.time() + + results <- get_altmetrics(metadata$doi) + requested_metrics <- c("cited_by_wikipedia_count", + "cited_by_msm_count", + "cited_by_policies_count", + "cited_by_patents_count", + "cited_by_accounts_count") + + if (nrow(results) > 0){ + for (metric in requested_metrics){ + if (!(metric %in% names(results))){ + results[[metric]] = NA + } + } + requested_metrics <- c("doi", requested_metrics) + results <- results[requested_metrics] + # only add the metrics that are requested where the DOI exists + # merge the metadata with the results of the altmetrics + # don't remove any rows from the metadata, just add the altmetrics to the + # output + output <- merge(x=metadata, y=results, by='doi', all.x = TRUE, all.y = FALSE) + } else { + for (metric in requested_metrics){ + metadata[[metric]] = NA + } + alog$info("No altmetrics found for any paper in this dataset.") + output <- metadata + } + output <- add_citations(output) + + #Remove duplicate lines - TODO: check for root of this problem + output = unique(output) + + output_json <- toJSON(output) + + end.time <- Sys.time() + time.taken <- end.time - start.time + alog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Time taken:", time.taken, sep=" ")) + + return (output_json) +} + +get_altmetrics <- function(dois){ + valid_dois <- unique(dois[which(dois!="")]) + results <- data.frame() + for (doi in valid_dois){ + tryCatch({ + metrics <- altmetric_data(altmetrics(doi=doi, apikey="")) + results <- rbind.fill(results, metrics) + }, error = function(err){ + alog$debug(gsub("[\r\n]", "", paste(err, doi, sep=" "))) + }) + } + return (results) +} + +add_citations <- function(output){ + dois <- output$doi + valid_dois <- unique(dois[which(dois!="")]) + + cc <- tryCatch({ + cr_citation_count(doi=valid_dois, async=TRUE) + }, error = function(err){ + alog$debug(gsub("[\r\n]", "", paste(err, doi, sep=" "))) + return(list(doi=dois, count=NA)) + }) + names(cc)[names(cc)=="count"] <- "citation_count" + output = merge(x=output, y=cc, by='doi', all.x = TRUE) + return (output) +} diff --git a/server/preprocessing/other-scripts/run_metrics.R b/server/preprocessing/other-scripts/run_metrics.R new file mode 100644 index 000000000..f33e00a7f --- /dev/null +++ b/server/preprocessing/other-scripts/run_metrics.R @@ -0,0 +1,72 @@ +rm(list = ls()) + +args <- commandArgs(TRUE) +wd <- args[1] +query <- args[2] +service <- args[3] + +setwd(wd) #Don't forget to set your working 
directory + +renv::activate() +renv::restore(lockfile = '../renv.lock') +Sys.setlocale(category="LC_ALL", locale = "en_US.UTF-8") + +library(jsonlite) +library(logging) +library(doParallel) + +source('utils.R') +if (Sys.getenv("LOGLEVEL") == "DEBUG") { + DEBUG <- FALSE +} else { + DEBUG <- TRUE +} + +if (DEBUG==TRUE){ + setup_logging('DEBUG') +} else { + setup_logging('INFO') +} + + +tslog <- getLogger('ts') + +f <- file("stdin") +open(f) +data = fromJSON(readLines(f)) +params <- data$params +if ("q_advanced" %in% names(params)) { + params$q_advanced <- sanitize_query(params$q_advanced)$sanitized_query +} +metadata <- fromJSON(data$metadata) + + +if (!is.null(params$lang_id)) { + lang_id <- params$lang_id +} else { + lang_id <- 'all' +} + +source("utils.R") +source('metrics.R') + +registerDoParallel(detectCores(all.tests = FALSE, logical = TRUE)-1) +.GlobalEnv$VIS_ID <- params$vis_id + +failed <- list(params=params) +tryCatch({ + metadata <- enrich_metadata_metrics(metadata) +}, error=function(err){ + tslog$error(gsub("\n", " ", paste("Metric enrichment failed", service, paste(params, collapse=" "), err, sep="||"))) + failed$query <<- params$q + failed$query_reason <<- err$message +}) + +if (exists('metadata')) { + print(toJSON(metadata)) + print(toJSON(metadata)) +} else { + output_json <- detect_error(failed, service, params) + print(output_json) + print(output_json) +} diff --git a/server/preprocessing/other-scripts/test/params_base.json b/server/preprocessing/other-scripts/test/params_base.json index 01fb30795..4cc496534 100644 --- a/server/preprocessing/other-scripts/test/params_base.json +++ b/server/preprocessing/other-scripts/test/params_base.json @@ -1,10 +1,11 @@ { - "document_types":["121", "7", "13", "14", "15", "16", "17", "18", "6"], + "document_types":["4", "11", "111", "13", "16", "7", "5", "12", "121", "122", "17", "19", "3", "52", "2", "F", "1A", "14", "15", "6", "51", "1", "18", "181", "183", "182"], "from":"1665-01-01", "to":"2023-12-07", "sorting":"most-relevant", "vis_id": "TEST_ID", - "min_descsize": 300, + "min_descsize": 0, "limit": 120, - "list_size": 100 + "list_size": 100, + "q_advanced": "(dcdoi:\"10.5281/zenodo.3999345\" OR dcdoi:\"10.5281/zenodo.3935964\" OR dcdoi:\"10.5281/zenodo.3935963\" OR dcdoi:\"10.1371/journal.pcbi.1007704\" OR dcdoi:\"10.5281/zenodo.4317253\" OR dcdoi:\"10.5281/zenodo.3641795\")" } diff --git a/server/preprocessing/other-scripts/test/test_openaire.R b/server/preprocessing/other-scripts/test/test_openaire.R index 187387b94..c7a7ec743 100644 --- a/server/preprocessing/other-scripts/test/test_openaire.R +++ b/server/preprocessing/other-scripts/test/test_openaire.R @@ -27,7 +27,7 @@ tslog <- getLogger('ts') source("vis_layout.R") source('openaire.R') -source('altmetrics.R') +source('metrics.R') MAX_CLUSTERS = 15 @@ -68,7 +68,7 @@ if(exists('input_data')) { if (!exists('output_json')) { output_json <- detect_error(failed, service, params) } else if (service=='openaire' && exists('output_json')) { - output_json <- enrich_output_json(output_json) + output_json <- enrich_metadata_metrics(output_json) } #print(output_json) diff --git a/server/preprocessing/other-scripts/text_similarity.R b/server/preprocessing/other-scripts/text_similarity.R index 9395dae55..effb5d0fe 100644 --- a/server/preprocessing/other-scripts/text_similarity.R +++ b/server/preprocessing/other-scripts/text_similarity.R @@ -20,7 +20,7 @@ if (DEBUG==TRUE){ tslog <- getLogger('ts') source(paste("../other-scripts/vis_layout.R", sep="")) 
-source('../other-scripts/altmetrics.R') +source('../other-scripts/metrics.R') if(!is.null(params_file) && !is.na(params_file)) { @@ -94,7 +94,7 @@ if(exists('input_data')) { if (!exists('output_json')) { output_json <- detect_error(failed, service, params) } else if (service=='openaire' && exists('output_json')) { - output_json <- enrich_output_json(output_json) + output_json <- enrich_metadata_metrics(output_json) } print(output_json) diff --git a/server/workers/api/.dockerignore b/server/workers/api/.dockerignore new file mode 100644 index 000000000..86d0f8499 --- /dev/null +++ b/server/workers/api/.dockerignore @@ -0,0 +1,11 @@ +.cache + +# python +.venv +__pycache__ +.pytest_cache +*.ipynb +.pynb_checkpoints + +# macOS +.DS_Store \ No newline at end of file diff --git a/server/workers/api/Dockerfile b/server/workers/api/Dockerfile index 01418d71f..54c2befc8 100644 --- a/server/workers/api/Dockerfile +++ b/server/workers/api/Dockerfile @@ -1,14 +1,23 @@ FROM python:3.8 -MAINTAINER Chris Kittel "christopher.kittel@openknowledgemaps.org" +LABEL maintainer="Chris Kittel " RUN apt-get update RUN apt-get install -y gcc git libpq-dev -WORKDIR /api -COPY workers/api/requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt -RUN pip install git+https://github.com/python-restx/flask-restx -COPY workers/api/src/ ./ +WORKDIR /app +COPY ./workers/api/requirements.txt ./api/requirements.txt +RUN pip install --no-cache-dir -r ./api/requirements.txt + +COPY ./workers/common ./common + +COPY ./workers/api/requirements-e.txt ./api/requirements-e.txt + +RUN cd api && pip install --no-cache-dir -r requirements-e.txt && cd .. + +COPY ./workers/api/src ./api/src +COPY ./workers/api/tests ./api/tests + +CMD ["python", "api/src/app.py"] diff --git a/server/workers/api/pyproject.toml b/server/workers/api/pyproject.toml new file mode 100644 index 000000000..a650eb415 --- /dev/null +++ b/server/workers/api/pyproject.toml @@ -0,0 +1,4 @@ +[project] +name = "api" +version = "0.1.0" +description = "API microservice." 
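Note on the editable install introduced here: requirements-e.txt (shown in the next file diff) pulls in the sibling common package with "-e ../common", and the rewritten api Dockerfile copies ./workers/common into the image before running that install. The contents of server/workers/common/setup.py are created by this patch but not included in this excerpt; the following is only a minimal sketch of what such a shim could look like, where the package name, version, and the bundled contentproviders.json are assumptions rather than text taken from the patch:

# hypothetical server/workers/common/setup.py -- sketch, not part of this patch excerpt
from setuptools import setup, find_packages

setup(
    name="common",
    version="0.1.0",
    packages=find_packages(include=["common", "common.*"]),
    # assumption: the JSON lookup consumed by get_or_create_contentprovider_lookup()
    # is shipped inside the package
    package_data={"common": ["contentproviders.json"]},
    include_package_data=True,
)

With a shim along these lines in place, "pip install -r requirements-e.txt" inside the api image makes imports such as "from common.utils import get_key, redis_store" and "from common.proxy import ReverseProxied" in the refactored API modules resolvable.
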
diff --git a/server/workers/api/requirements-e.txt b/server/workers/api/requirements-e.txt new file mode 100644 index 000000000..a35d9cfef --- /dev/null +++ b/server/workers/api/requirements-e.txt @@ -0,0 +1,2 @@ +# Include the common package from the local file system +-e ../common \ No newline at end of file diff --git a/server/workers/api/requirements.txt b/server/workers/api/requirements.txt index 397f84dbd..df7b256f6 100644 --- a/server/workers/api/requirements.txt +++ b/server/workers/api/requirements.txt @@ -19,9 +19,9 @@ jsonschema==3.2.0 MarkupSafe==2.1.3 marshmallow==3.14.1 mistune==2.0.5 -numpy==1.19.5 +numpy==1.24.4 packaging==21.3 -pandas==1.1.5 +pandas==1.3.1 psycopg2-binary==2.9.8 pyparsing==3.1.1 pyrsistent==0.18.0 @@ -30,6 +30,8 @@ pytz==2023.3.post1 PyYAML==6.0.1 redis==4.3.6 six==1.16.0 -typing-extensions==4.1.1 +typing-extensions==4.2.0 Werkzeug==3.0.1 zipp==3.6.0 +# # Include the common package from the local file system +# -e ../common \ No newline at end of file diff --git a/server/workers/api/src/apis/base.py b/server/workers/api/src/apis/base.py index d2a91f642..a45fcf81a 100644 --- a/server/workers/api/src/apis/base.py +++ b/server/workers/api/src/apis/base.py @@ -7,9 +7,9 @@ from flask import request, make_response, jsonify, abort, g from flask_restx import Namespace, Resource, fields from .request_validators import SearchParamSchema -from apis.utils import get_key, redis_store, contentprovider_lookup - +from common.utils import get_key, redis_store, get_or_create_contentprovider_lookup +contentprovider_lookup = get_or_create_contentprovider_lookup() base_ns = Namespace("base", description="BASE API operations") search_param_schema = SearchParamSchema() @@ -40,7 +40,16 @@ "raw": fields.Boolean(example="false", description='raw results from ElasticSearch')}) - +# Utility function to set response headers +def set_response_headers(accept_header, is_raw, result, filename): + headers = {} + if accept_header == "application/json": + headers["Content-Type"] = "application/json" + elif accept_header == "text/csv": + result = pd.read_json(json.loads(result)).to_csv() if is_raw else pd.read_json(json.loads(result)).to_csv() + headers["Content-Type"] = "text/csv" + headers["Content-Disposition"] = f"attachment; filename={filename}.csv" + return result, headers @base_ns.route('/search') class Search(Resource): @@ -69,31 +78,17 @@ def post(self): base_ns.logger.debug(errors) if errors: abort(400, str(errors)) - k = str(uuid.uuid4()) - d = {"id": k, "params": params, + request_id = str(uuid.uuid4()) + d = {"id": request_id, "params": params, "endpoint": "search"} base_ns.logger.debug(d) redis_store.rpush("base", json.dumps(d)) q_len = redis_store.llen("base") - base_ns.logger.debug("Queue length: %s %d %s" %("base", q_len, k)) - result = get_key(redis_store, k, 300) + base_ns.logger.debug("Queue length: %s %d %s" %("base", q_len, request_id)) + result = get_key(redis_store, request_id, 300) try: - headers = {} - if request.headers["Accept"] == "application/json": - headers["Content-Type"] = "application/json" - if request.headers["Accept"] == "text/csv": - if params.get("raw") is True: - df = pd.read_json(json.loads(result)) - result = df.to_csv() - else: - result = pd.read_json(json.loads(result)).to_csv() - headers["Content-Type"] = "text/csv" - headers["Content-Disposition"] = "attachment; filename={0}.csv".format(k) - if params.get("raw") is True: - headers["Content-Type"] = "application/json" - return make_response(result, - 200, - headers) + result, headers = 
set_response_headers(request.headers["Accept"], params.get("raw"), result, request_id) + return make_response(result, 200, headers) except Exception as e: base_ns.logger.error(e) abort(500, "Problem encountered, check logs.") @@ -127,8 +122,6 @@ def post(self): abort(500, "Problem encountered, check logs.") - -@base_ns.route('/service_version') class ServiceVersion(Resource): def get(self): result = {"service_version": os.getenv("SERVICE_VERSION")} diff --git a/server/workers/api/src/apis/create_vis.py b/server/workers/api/src/apis/create_vis.py index b4b089b5a..583b8303c 100644 --- a/server/workers/api/src/apis/create_vis.py +++ b/server/workers/api/src/apis/create_vis.py @@ -10,7 +10,7 @@ from flask import request, make_response, jsonify, abort from flask_restx import Namespace, Resource, fields from .request_validators import SearchParamSchema -from apis.utils import get_key +from common.utils import get_key from apis.base import base_querymodel @@ -24,10 +24,23 @@ } redis_store = redis.StrictRedis(**redis_config) -input_model = vis_ns.model("InputModel", - {"params": fields.Nested(base_querymodel), - "input_data": fields.String()}) +input_model = vis_ns.model( + "InputModel", + { + "params": fields.Nested(base_querymodel), + "input_data": fields.String() + }, +) +def set_response_headers(accept_header, is_raw, result, filename): + headers = {} + if accept_header == "application/json": + headers["Content-Type"] = "application/json" + elif accept_header == "text/csv": + result = pd.read_json(json.loads(result)).to_csv() if is_raw else pd.read_json(json.loads(result)).to_csv() + headers["Content-Type"] = "text/csv" + headers["Content-Disposition"] = f"attachment; filename={filename}.csv" + return result, headers @vis_ns.route('/create') class Create(Resource): @@ -42,28 +55,16 @@ def post(self): params = data["params"] vis_ns.logger.debug(params) input_data = data["input_data"] - k = str(uuid.uuid4()) - d = {"id": k, "params": params, + request_id = str(uuid.uuid4()) + d = {"id": request_id, "params": params, "input_data": input_data} redis_store.rpush("input_data", json.dumps(d).encode('utf8')) q_len = redis_store.llen("input_data") - vis_ns.logger.debug("Queue length: %s %d %s" %("input_data", q_len, k)) - result = get_key(redis_store, k) + vis_ns.logger.debug("Queue length: %s %d %s" %("input_data", q_len, request_id)) + result = get_key(redis_store, request_id) try: - headers = {} - if request.headers["Accept"] == "application/json": - headers["Content-Type"] = "application/json" - if request.headers["Accept"] == "text/csv": - if params.get("raw") is True: - df = pd.read_json(json.loads(result)) - result = df.to_csv() - else: - result = pd.read_json(json.loads(result)).to_csv() - headers["Content-Type"] = "text/csv" - headers["Content-Disposition"] = "attachment; filename={0}.csv".format(k) - return make_response(result, - 200, - headers) + result, headers = set_response_headers(request.headers["Accept"], params.get("raw"), result, request_id) + return make_response(result, 200, headers) except Exception as e: vis_ns.logger.error(e) abort(500, "Problem encountered, check logs.") diff --git a/server/workers/api/src/apis/openaire.py b/server/workers/api/src/apis/openaire.py index 00db702ef..1d6a1a52a 100644 --- a/server/workers/api/src/apis/openaire.py +++ b/server/workers/api/src/apis/openaire.py @@ -6,38 +6,55 @@ from flask import Blueprint, request, make_response, jsonify, abort from flask_restx import Namespace, Resource, fields from .request_validators import SearchParamSchema 
-from apis.utils import get_key, redis_store +from common.utils import get_key, redis_store openaire_ns = Namespace("openaire", description="OpenAIRE API operations") search_param_schema = SearchParamSchema() -search_query = openaire_ns.model("SearchQuery", - {"q": fields.String(example='feminicide', - description='query string', - required=True), - "sorting": fields.String(example='most-recent', - description='most-relevant or most-recent', - required=True), - "from": fields.String(example='2019-01-01', - description='yyyy-MM-dd', - required=True), - "to": fields.String(example='2019-12-31', - description='yyyy-MM-dd', - required=True), - "vis_type": fields.String(example='overview', - description='overview or timeline', - required=True), - "limit": fields.Integer(example=100, - description='max. number of results'), - "language": fields.String(example='en', - description='language code, optional', - required=False), - "raw": fields.Boolean(example="false", - description='raw results from ElasticSearch')}) +search_query = openaire_ns.model( + "SearchQuery", + { + "q": fields.String( + example="feminicide", description="query string", required=True + ), + "sorting": fields.String( + example="most-recent", + description="most-relevant or most-recent", + required=True, + ), + "from": fields.String( + example="2019-01-01", description="yyyy-MM-dd", required=True + ), + "to": fields.String( + example="2019-12-31", description="yyyy-MM-dd", required=True + ), + "vis_type": fields.String( + example="overview", description="overview or timeline", required=True + ), + "limit": fields.Integer(example=100, description="max. number of results"), + "language": fields.String( + example="en", description="language code, optional", required=False + ), + "raw": fields.Boolean( + example="false", description="raw results from ElasticSearch" + ), + }, +) +# Utility function to set response headers +def set_response_headers(accept_header, is_raw, result, filename): + headers = {} + if accept_header == "application/json": + headers["Content-Type"] = "application/json" + elif accept_header == "text/csv": + result = pd.read_json(json.loads(result)).to_csv() if is_raw else pd.read_json(json.loads(result)).to_csv() + headers["Content-Type"] = "text/csv" + headers["Content-Disposition"] = f"attachment; filename={filename}.csv" + return result, headers + @openaire_ns.route('/search') class Search(Resource): @openaire_ns.doc(responses={200: 'OK', @@ -64,13 +81,14 @@ def post(self): openaire_ns.logger.debug("Queue length: %s %d %s" %("openaire", q_len, k)) result = get_key(redis_store, k, 300) try: + result, headers = set_response_headers(request.headers["Accept"], params.get("raw"), result, k) + headers = {} if request.headers["Accept"] == "application/json": headers["Content-Type"] = "application/json" if request.headers["Accept"] == "text/csv": if params.get("raw") is True: - df = pd.read_json(json.loads(result)) - result = df.to_csv() + result = pd.read_json(json.loads(result)).to_csv() else: result = pd.read_json(json.loads(result)).to_csv() headers["Content-Type"] = "text/csv" diff --git a/server/workers/api/src/apis/orcid.py b/server/workers/api/src/apis/orcid.py index d1285d3cc..1c30aaf64 100644 --- a/server/workers/api/src/apis/orcid.py +++ b/server/workers/api/src/apis/orcid.py @@ -7,7 +7,7 @@ from flask import request, make_response, jsonify, abort, g from flask_restx import Namespace, Resource, fields from .request_validators import SearchParamSchema -from apis.utils import get_key, redis_store +from 
common.utils import get_key, redis_store # Namespace setup diff --git a/server/workers/api/src/apis/pubmed.py b/server/workers/api/src/apis/pubmed.py index d2964c056..51dedf1a9 100644 --- a/server/workers/api/src/apis/pubmed.py +++ b/server/workers/api/src/apis/pubmed.py @@ -6,13 +6,12 @@ from flask import Blueprint, request, make_response, jsonify, abort from flask_restx import Namespace, Resource, fields from .request_validators import SearchParamSchema -from apis.utils import get_key, redis_store +from common.utils import get_key, redis_store pubmed_ns = Namespace("pubmed", description="PubMed API operations") search_param_schema = SearchParamSchema() - pubmed_querymodel = pubmed_ns.model( "SearchQuery", { @@ -43,6 +42,16 @@ }, ) +# Utility function to set response headers +def set_response_headers(accept_header, is_raw, result, filename): + headers = {} + if accept_header == "application/json": + headers["Content-Type"] = "application/json" + elif accept_header == "text/csv": + result = pd.read_json(json.loads(result)).to_csv() if is_raw else pd.read_json(json.loads(result)).to_csv() + headers["Content-Type"] = "text/csv" + headers["Content-Disposition"] = f"attachment; filename={filename}.csv" + return result, headers @pubmed_ns.route("/search") class Search(Resource): @@ -51,39 +60,30 @@ class Search(Resource): @pubmed_ns.produces(["application/json", "text/csv"]) def post(self): """ """ - params = request.get_json() + params: dict[str, str] = request.get_json() pubmed_ns.logger.debug(params) - if "optradio" in params: - del params["optradio"] - errors = search_param_schema.validate(params, partial=True) + + # Clean and validate params + params.pop("optradio", None) params["limit"] = 100 params["list_size"] = 100 - pubmed_ns.logger.debug(errors) + errors = search_param_schema.validate(params, partial=True) if errors: + pubmed_ns.logger.debug(errors) abort(400, str(errors)) - k = str(uuid.uuid4()) - d = {"id": k, "params": params, "endpoint": "search"} - pubmed_ns.logger.debug(d) - redis_store.rpush("pubmed", json.dumps(d)) - q_len = redis_store.llen("pubmed") - pubmed_ns.logger.debug("Queue length: %s %d %s" % ("pubmed", q_len, k)) - result = get_key(redis_store, k) + + # Log and queue the request + request_id = str(uuid.uuid4()) + request_data = {"id": request_id, "params": params, "endpoint": "search"} + pubmed_ns.logger.debug(request_data) + redis_store.rpush("pubmed", json.dumps(request_data)) + queue_length = redis_store.llen("pubmed") + pubmed_ns.logger.debug("Queue length: %s %d %s" % ("pubmed", queue_length, request_id)) + + # Get the result + result = get_key(redis_store, request_id) try: - headers = {} - if request.headers["Accept"] == "application/json": - headers["Content-Type"] = "application/json" - if request.headers["Accept"] == "text/csv": - if params.get("raw") is True: - df = pd.read_json(json.loads(result)) - result = df.to_csv() - else: - result = pd.read_json(json.loads(result)).to_csv() - headers["Content-Type"] = "text/csv" - headers["Content-Disposition"] = "attachment; filename={0}.csv".format( - k - ) - if params.get("raw") is True: - headers["Content-Type"] = "application/json" + result, headers = set_response_headers(request.headers.get("Accept"), params.get("raw"), result, request_id) return make_response(result, 200, headers) except Exception as e: pubmed_ns.logger.error(e) diff --git a/server/workers/api/src/apis/request_validators.py b/server/workers/api/src/apis/request_validators.py index 81a0f5be2..2ca6bd67f 100644 --- 
a/server/workers/api/src/apis/request_validators.py +++ b/server/workers/api/src/apis/request_validators.py @@ -10,7 +10,7 @@ class SearchParamSchema(Schema): format="%Y-%m-%d") to = fields.Date(required=True, format="%Y-%m-%d") - vis_type = fields.Str(require=True) + vis_type = fields.Str(required=True) limit = fields.Int() year_range = fields.Str() today = fields.Str() @@ -38,11 +38,12 @@ class SearchParamSchema(Schema): @pre_load def fix_years(self, in_data, **kwargs): - if len(in_data.get('from')) == 4: - in_data["from"] = in_data["from"]+"-01-01" - if len(in_data.get('to')) == 4: - in_data["to"] = in_data["to"]+"-12-31" - return in_data + from_date = in_data.get('from') + to_date = in_data.get('to') + if from_date and len(from_date) == 4: + in_data["from"] = from_date + "-01-01" + if to_date and len(to_date) == 4: + in_data["to"] = to_date + "-12-31" @pre_load def fix_limit(self, in_data, **kwargs): diff --git a/server/workers/api/src/app.py b/server/workers/api/src/app.py index 34365ab2a..546eacb2b 100644 --- a/server/workers/api/src/app.py +++ b/server/workers/api/src/app.py @@ -13,38 +13,7 @@ from apis.create_vis import vis_ns from apis.export import export_ns - -class ReverseProxied(object): - '''Wrap the application in this middleware and configure the - front-end server to add these headers, to let you quietly bind - this to a URL other than / and to an HTTP scheme that is - different than what is used locally. - - location /myprefix { - proxy_pass http://192.168.0.1:5001; - proxy_set_header Host $host; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Scheme $scheme; - proxy_set_header X-Script-Name /myprefix; - } - - :param app: the WSGI application - ''' - def __init__(self, app): - self.app = app - - def __call__(self, environ, start_response): - script_name = environ.get('HTTP_X_SCRIPT_NAME', '') - if script_name: - environ['SCRIPT_NAME'] = script_name - path_info = environ['PATH_INFO'] - if path_info.startswith(script_name): - environ['PATH_INFO'] = path_info[len(script_name):] - - scheme = environ.get('HTTP_X_SCHEME', '') - if scheme: - environ['wsgi.url_scheme'] = scheme - return self.app(environ, start_response) +from common.proxy import ReverseProxied def api_patches(app): diff --git a/server/workers/api/test.sh b/server/workers/api/test.sh new file mode 100755 index 000000000..e64a8a0c9 --- /dev/null +++ b/server/workers/api/test.sh @@ -0,0 +1,12 @@ +export REDIS_HOST=localhost +export REDIS_PORT=6355 +export REDIS_PASSWORD=testredispassword +export REDIS_DB=0 + +export POSTGRES_USER=testuser +export POSTGRES_PASSWORD=postgres +export POSTGRES_HOST=localhost +export POSTGRES_PORT=5434 +export POSTGRES_DATABASE=testdb + +pytest -v \ No newline at end of file diff --git a/server/workers/common/__init__.py b/server/workers/api/tests/__init__.py similarity index 100% rename from server/workers/common/__init__.py rename to server/workers/api/tests/__init__.py diff --git a/server/workers/orcid/__init__.py b/server/workers/api/tests/e2e/__init__.py similarity index 100% rename from server/workers/orcid/__init__.py rename to server/workers/api/tests/e2e/__init__.py diff --git a/server/workers/api/tests/e2e/conftest.py b/server/workers/api/tests/e2e/conftest.py new file mode 100644 index 000000000..c48e541ad --- /dev/null +++ b/server/workers/api/tests/e2e/conftest.py @@ -0,0 +1,11 @@ +import pytest +from pytest import MonkeyPatch + +from src.app import app + +@pytest.fixture +def client(monkeypatch: MonkeyPatch): + 
diff --git a/server/workers/api/tests/e2e/test_base.py b/server/workers/api/tests/e2e/test_base.py
new file mode 100644
index 000000000..ba8278bfe
--- /dev/null
+++ b/server/workers/api/tests/e2e/test_base.py
@@ -0,0 +1,10 @@
+from flask.testing import FlaskClient
+
+def test_base(client: FlaskClient):
+    response = client.get('/api/base/service_version')
+    data = response.data
+    print(data)
+    assert response.status_code == 200
+    assert response.headers['Content-Type'] == 'application/json'
+    assert response.json == {"service_version": "1.0.0"}
+
diff --git a/server/workers/api/tests/e2e/test_openaire.py b/server/workers/api/tests/e2e/test_openaire.py
new file mode 100644
index 000000000..b657e890f
--- /dev/null
+++ b/server/workers/api/tests/e2e/test_openaire.py
@@ -0,0 +1,10 @@
+from flask.testing import FlaskClient
+
+def test_openaire(client: FlaskClient):
+    response = client.get('/api/openaire/service_version')
+    data = response.data
+    print(data)
+    assert response.status_code == 200
+    assert response.headers['Content-Type'] == 'application/json'
+    assert response.json == {"service_version": "1.0.0"}
+
diff --git a/server/workers/api/tests/e2e/test_orcid.py b/server/workers/api/tests/e2e/test_orcid.py
new file mode 100644
index 000000000..89b07a8b3
--- /dev/null
+++ b/server/workers/api/tests/e2e/test_orcid.py
@@ -0,0 +1,10 @@
+from flask.testing import FlaskClient
+
+def test_orcid(client: FlaskClient):
+    response = client.get('/api/orcid/service_version')
+    data = response.data
+    print(data)
+    assert response.status_code == 200
+    assert response.headers['Content-Type'] == 'application/json'
+    assert response.json == {"service_version": "1.0.0"}
+
diff --git a/server/workers/api/tests/e2e/test_pubmed.py b/server/workers/api/tests/e2e/test_pubmed.py
new file mode 100644
index 000000000..b9b4401e5
--- /dev/null
+++ b/server/workers/api/tests/e2e/test_pubmed.py
@@ -0,0 +1,10 @@
+from flask.testing import FlaskClient
+
+def test_pubmed(client: FlaskClient):
+    response = client.get('/api/pubmed/service_version')
+    data = response.data
+    print(data)
+    assert response.status_code == 200
+    assert response.headers['Content-Type'] == 'application/json'
+    assert response.json == {"service_version": "1.0.0"}
+
diff --git a/server/workers/api/tests/mock_app.py b/server/workers/api/tests/mock_app.py
new file mode 100644
index 000000000..480a22ecc
--- /dev/null
+++ b/server/workers/api/tests/mock_app.py
@@ -0,0 +1,59 @@
+import os
+import json
+import sys
+import logging
+from flask import Flask, make_response
+from flask_restx import Api
+
+def create_app(config_name):
+    app = Flask(__name__)
+
+    bind_params = {
+        "user": os.getenv("POSTGRES_USER"),
+        "pw": os.getenv("POSTGRES_PASSWORD"),
+        "host": os.getenv("POSTGRES_HOST"),
+        "port": os.getenv("POSTGRES_PORT"),
+        "db": os.getenv("DEFAULT_DATABASE")
+    }
+    app.config['SQLALCHEMY_DATABASE_URI'] = 'postgresql://%(user)s:%(pw)s@%(host)s:%(port)s/%(db)s' % bind_params
+    app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
+    app.config['PORT'] = 80
+    app.config['HOST'] = '0.0.0.0'
+    app.config['DEBUG'] = True
+    app.config['TESTING'] = True
+    app.config['ENV'] = 'development'
+
+    # Add any configuration settings based on `config_name` (e.g., development, testing, production)
+    # ...
+
+    @app.route('/hello')
+    def hello_world():
+        return "Hello, World!"
+ + @app.route('/api/stable/base/search') + def base_search(): + try: + with open("/app/workers/api/test_data/digital-education.json") as f: + data = json.load(f) + headers = {} + headers["Content-Type"] = "application/json" + return make_response(data, 200, headers) + except Exception as e: + print(e) + return make_response("Error", 500) + + + api = Api(app) + + handler = logging.StreamHandler(sys.stderr) + handler.setLevel(logging.DEBUG) + app.logger.setLevel(logging.DEBUG) + app.logger.addHandler(handler) + # app.logger.debug(app.config) + # app.logger.debug(app.url_map) + + return app + +if __name__ == '__main__': + app = create_app(config_name="testing") + app.run(host="0.0.0.0", port=80, debug=True) diff --git a/server/workers/api/tests/test_data/digital-education.json b/server/workers/api/tests/test_data/digital-education.json new file mode 100644 index 000000000..3bc79c404 --- /dev/null +++ b/server/workers/api/tests/test_data/digital-education.json @@ -0,0 +1,10 @@ +{ + "context": { + "id": "530133cf1768e6606f63c641a1a96768", + "query": "digital education", + "service": "base", + "timestamp": "2020-07-09 18:20:14", + "params": "{\"from\":\"1665-01-01\",\"to\":\"2020-07-09\",\"document_types\":[\"121\"],\"sorting\":\"most-relevant\",\"lang_id\":[\"all-lang\"]}" + }, + "data": "[{\"id\":\"001a8aa44f55f1d133d4ed2b0e7b4a1e4125901c84d5086b9648dded9251ed1c\",\"relation\":\"doi:10.1080\\/18377122.2016.1222238; issn:1837-7122; issn:1837-7130; orcid:https:\\/\\/orcid.org\\/0000-0002-2888-4974; orcid:https:\\/\\/orcid.org\\/0000-0001-7206-4781\",\"identifier\":\"https:\\/\\/espace.library.uq.edu.au\\/view\\/UQ:403365\",\"title\":\"Computer says no: an analysis of three digital food education resources\",\"paper_abstract\":\"What kind of thing will food education become in digitisedclassrooms? Drawn from a broader research project concernedwith the\\u2018e turn\\u2019in school health and physical education, thispaper analyses three examples of digital food education (DEF).This is done by considering the role of digital technology inchanging\\u2013or not changing\\u2013earlier forms of food education. Ineach case, these processes are viewed as portals of connectionthrough which knowledge claims are produced, copied, merged,manipulated, juxtaposed and re-represented. Food education is,therefore, conceptualised not as the distillation of scientificknowledge, but as the uses to which this knowledge can be put.Our overall finding\\u2013that in many ways DEF is not very differentfrom that which preceded it\\u2013echoes other scholars; nutritionismdressed in digital garb is still nutritionism. 
However, rather thanarguing that DEF needs to adhere more faithfully to nutritionalscience, we argue the reverse; that digital technology has the as yetunmet potential to move food education away from nutritionalscience towards something more intellectually rich and educationallyengaging\",\"published_in\":\"\",\"year\":\"2016-08-26\",\"subject_orig\":\"Digital games; Digital food education; Digitised classrooms; Health and physical education; Actor network theory; 2732 Orthopedics and Sports Medicine; 3304 Education; 3612 Physical Therapy; Sports Therapy and Rehabilitation\",\"subject\":\"Digital games; Digital food education; Digitised classrooms; Health and physical education; Actor network theory; Sports Therapy and Rehabilitation\",\"authors\":\"Gard, Michael; Enright, Eimear\",\"link\":\"https:\\/\\/espace.library.uq.edu.au\\/view\\/UQ:403365\",\"oa_state\":\"2\",\"url\":\"001a8aa44f55f1d133d4ed2b0e7b4a1e4125901c84d5086b9648dded9251ed1c\",\"relevance\":51,\"lang_detected\":\"english\",\"cluster_labels\":\"Decision support, Digital food education, Education revolution\",\"x\":\"0.0336057408338733\",\"y\":\"-0.016385020362244\",\"area_uri\":3,\"area\":\"Decision support, Digital food education, Education revolution\"},{\"id\":\"008ea92dafd41bdb55abf7cb8b4f43deb52ac003a2b15a8c5eb8743ae021533d\",\"relation\":\"doi:10.5281\\/zenodo.1292856; https:\\/\\/doi.org\\/10.5281\\/zenodo.1292855; https:\\/\\/zenodo.org\\/record\\/1292856\",\"identifier\":\"https:\\/\\/doi.org\\/10.5281\\/zenodo.1292855; https:\\/\\/zenodo.org\\/record\\/1292856\",\"title\":\"Digital Education And Learning: The Growing Trend In Academic And Business Spaces\\u2014An International Overview\",\"paper_abstract\":\"Abstract ; The world becomes Digital day by day and thus activities, features and sectors and different spaces are highly associated with Digital Tools, Techniques and Technologies. Education domain becomes highly technology enabled in recent past and this strategy is rising out and as a result, various concepts, areas, and domains have been created viz. Education Technology, E-Learning, Online Education, Blended Learning and as a whole this concept and this area may be called as a Digital Education\\/ Digital Learning. Internationally many universities have started educational programs leading to Bachelors and Masters Degree in respect of Digital Education and its subfields (mentioned above). The awards are offered in different subjects and are tagged with concentration and major in this area. The Digital Education becomes an important area of research as well due to its importance, many universities have started research program leading to PhD and other professional doctorate degrees. This study is concentrated on Masters degrees in the field of Digital Education and Digital Learning which are available internationally, based on selected research methodologies. This paper emphasizes the role, growth and values of Digital Education and Learning including future growth and stakeholders in this field.\",\"published_in\":\"\",\"year\":\"2018\",\"subject_orig\":\"Digital Education; E-Learning; Higher Education; Digitalization; MSc (Digital Education); International Universities; Professional Degrees\",\"subject\":\"Digital Education; E-Learning; Higher Education; Digitalization; MSc (Digital Education); International Universities; Professional Degrees\",\"authors\":\"P. K. Paul; P. S. 
Aithal\",\"link\":\"https:\\/\\/doi.org\\/10.5281\\/zenodo.1292855\",\"oa_state\":\"1\",\"url\":\"008ea92dafd41bdb55abf7cb8b4f43deb52ac003a2b15a8c5eb8743ae021533d\",\"relevance\":118,\"lang_detected\":\"english\",\"cluster_labels\":\"Digital citizenship, Digital education revolution, Digital literacies\",\"x\":\"-0.036157510495911\",\"y\":\"0.00378080835898564\",\"area_uri\":1,\"area\":\"Digital citizenship, Digital education revolution, Digital literacies\"},{\"id\":\"0526533f14f3af97c71d809791195c453141782efc037245ec56295d92dec108\",\"relation\":\"https:\\/\\/eprints.soton.ac.uk\\/71809\\/1\\/index.html; Seale, Jane, Draffan, E.A. and Wald, Mike (2010) Digital agility and digital decision-making: conceptualising digital inclusion in the context of disabled learners in higher education. Studies in Higher Education, 35 (4), 445-461. (doi:10.1080\\/03075070903131628 ).\",\"identifier\":\"https:\\/\\/eprints.soton.ac.uk\\/71809\\/; https:\\/\\/eprints.soton.ac.uk\\/71809\\/1\\/index.html\",\"title\":\"Digital agility and digital decision-making: conceptualising digital inclusion in the context of disabled learners in higher education\",\"paper_abstract\":\"Digital inclusion in higher education has tended to be understood solely in terms of accessibility, which does little to further our understanding of the role technology plays in the learning experiences of disabled students. In this article, the authors propose a conceptual framework for exploring digital inclusion in higher education that attempts to broaden the way in which it is understood. The conceptual framework encompasses two strands: one that focuses on technology, personal and contextual factors, and one that focuses on resources and choices. This framework will be used to present and discuss the results of a study which aimed to explore the e-learning experiences of disabled students at one higher education institution. The discussion will focus particularly on concepts of digital agility and digital decision-making, and will consider the potential implications for the empowerment of disabled students.\",\"published_in\":\"\",\"year\":\"2010\",\"subject_orig\":\"\",\"subject\":\"agility digital; conceptualising digital; context disabled\",\"authors\":\"Seale, Jane; Draffan, E.A.; Wald, Mike\",\"link\":\"https:\\/\\/eprints.soton.ac.uk\\/71809\\/\",\"oa_state\":\"2\",\"url\":\"0526533f14f3af97c71d809791195c453141782efc037245ec56295d92dec108\",\"relevance\":99,\"lang_detected\":\"english\",\"cluster_labels\":\"Higher education institutions, Digital inclusion, Higher education students\",\"x\":\"-0.13240233339121\",\"y\":\"0.124595505005699\",\"area_uri\":5,\"area\":\"Higher education institutions, Digital inclusion, Higher education students\"}]" +} diff --git a/server/workers/base/Dockerfile b/server/workers/base/Dockerfile index 30382ee24..7d99f853d 100644 --- a/server/workers/base/Dockerfile +++ b/server/workers/base/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:18.04 -MAINTAINER Chris Kittel "christopher.kittel@openknowledgemaps.org" +LABEL maintainer="Chris Kittel " ENV DEBIAN_FRONTEND=noninteractive @@ -154,11 +154,14 @@ COPY workers/base/activate.R . RUN R -e 'renv::consent(provided = TRUE)' && \ R -e 'setwd("./"); renv::activate(); renv::restore(lockfile = "./renv.lock")' -COPY workers/common ./common +COPY workers/common ../common +COPY workers/base/requirements-e.txt . 
+RUN pip3 install --no-cache-dir -r requirements-e.txt + COPY workers/base ./base COPY preprocessing/resources ./resources COPY preprocessing/other-scripts ./other-scripts RUN mkdir -p /var/log/headstart && touch /var/log/headstart/headstart.log COPY workers/base/*.py ./ -ENTRYPOINT python3 run_base.py +CMD ["python3", "run_base.py"] diff --git a/server/workers/base/pyproject.toml b/server/workers/base/pyproject.toml new file mode 100644 index 000000000..a650eb415 --- /dev/null +++ b/server/workers/base/pyproject.toml @@ -0,0 +1,4 @@ +[project] +name = "api" +version = "0.1.0" +description = "API microservice." diff --git a/server/workers/base/requirements-e.txt b/server/workers/base/requirements-e.txt new file mode 100644 index 000000000..d01580003 --- /dev/null +++ b/server/workers/base/requirements-e.txt @@ -0,0 +1,2 @@ +# Include the common package from the local file system +-e ../common diff --git a/server/workers/base/src/base.py b/server/workers/base/src/base.py index 6e05a2ff5..af7a603b3 100644 --- a/server/workers/base/src/base.py +++ b/server/workers/base/src/base.py @@ -3,7 +3,6 @@ import subprocess import pandas as pd import logging -from datetime import timedelta from common.r_wrapper import RWrapper from common.deduplication import find_version_in_doi, get_unversioned_doi, get_publisher_doi, \ find_duplicate_indexes, mark_duplicate_dois, mark_duplicate_links,\ @@ -12,10 +11,9 @@ mark_latest_doi, prioritize_OA_and_latest import re -from redis.exceptions import LockError import time -import numpy as np from parsers import improved_df_parsing +from common.rate_limiter import RateLimiter formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %H:%M:%S') @@ -28,8 +26,7 @@ def __init__(self, *args): # set separation for requests # respecting BASE rate limit of 1 qps # separation = round(period_in_seconds / rate limit per second) - self.separation = 1.5 - self.rate_key = 'base-ratelimit' + self.rate_limiter = RateLimiter(self.redis_store, 'base-ratelimit', 1.5) try: result = self.get_contentproviders() @@ -50,31 +47,6 @@ def next_item(self): endpoint = msg.get('endpoint') return k, params, endpoint - def base_rate_limit_reached(self): - """ - This implementation is inspired by an implementation of - Generic Cell Rate Algorithm based rate limiting, - seen on https://dev.to/astagi/rate-limiting-using-python-and-redis-58gk. - It has been simplified and adjusted to our use case. 
- - BASE demands one request per second (1 QPS), per - https://www.base-search.net/about/download/base_interface.pdf - """ - - t = self.redis_store.time()[0] - self.redis_store.setnx(self.rate_key, 0) - try: - with self.redis_store.lock('lock:' + self.rate_key, blocking_timeout=5) as lock: - theoretical_arrival_time = max(float(self.redis_store.get(self.rate_key)), t) - if theoretical_arrival_time - t <= 0: - new_theoretical_arrival_time = max(theoretical_arrival_time, t) + self.separation - self.redis_store.set(self.rate_key, new_theoretical_arrival_time) - return False - return True - # the locking mechanism is needed if a key is requested multiple times at the same time - except LockError: - return True - def execute_search(self, params): q = params.get('q') service = params.get('service') @@ -161,7 +133,7 @@ def get_contentproviders(self): def run(self): while True: - while self.base_rate_limit_reached(): + while self.rate_limiter.rate_limit_reached(): self.logger.debug('🛑 Request is limited') time.sleep(0.1) k, params, endpoint = self.next_item() diff --git a/server/workers/common/common/__init__.py b/server/workers/common/common/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/server/workers/api/src/apis/contentproviders.json b/server/workers/common/common/contentproviders.json similarity index 100% rename from server/workers/api/src/apis/contentproviders.json rename to server/workers/common/common/contentproviders.json diff --git a/server/workers/common/decorators.py b/server/workers/common/common/decorators.py similarity index 100% rename from server/workers/common/decorators.py rename to server/workers/common/common/decorators.py diff --git a/server/workers/common/deduplication.py b/server/workers/common/common/deduplication.py similarity index 97% rename from server/workers/common/deduplication.py rename to server/workers/common/common/deduplication.py index 37ca7aec0..70fe663ba 100644 --- a/server/workers/common/deduplication.py +++ b/server/workers/common/common/deduplication.py @@ -2,7 +2,6 @@ import numpy as np import pandas as pd import Levenshtein -from sklearn.metrics import pairwise_distances pattern_doi = re.compile(r"\.v(\d)+$") @@ -69,7 +68,7 @@ def add_false_negatives(df): return df def remove_textual_duplicates_from_different_sources(df, dupind): - for _, idx in dupind.iteritems(): + for _, idx in dupind.items(): if len(idx) > 1: tmp = df.loc[idx] df.loc[tmp.index, "is_duplicate"] = True @@ -86,7 +85,7 @@ def remove_textual_duplicates_from_different_sources(df, dupind): return df def mark_latest_doi(df, dupind): - for _, idx in dupind.iteritems(): + for _, idx in dupind.items(): idx = df.index.intersection(idx) tmp = df.loc[idx] for udoi in list(filter(None, tmp.unversioned_doi.unique().tolist())): @@ -103,7 +102,7 @@ def mark_latest_doi(df, dupind): return df def prioritize_OA_and_latest(df, dupind): - for _, idx in dupind.iteritems(): + for _, idx in dupind.items(): idx = df.index.intersection(idx) if len(idx) > 1: tmp = df.loc[idx] diff --git a/server/workers/common/common/proxy.py b/server/workers/common/common/proxy.py new file mode 100644 index 000000000..deac405bd --- /dev/null +++ b/server/workers/common/common/proxy.py @@ -0,0 +1,31 @@ +class ReverseProxied(object): + '''Wrap the application in this middleware and configure the + front-end server to add these headers, to let you quietly bind + this to a URL other than / and to an HTTP scheme that is + different than what is used locally. 
+ + location /myprefix { + proxy_pass http://192.168.0.1:5001; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Scheme $scheme; + proxy_set_header X-Script-Name /myprefix; + } + + :param app: the WSGI application + ''' + def __init__(self, app): + self.app = app + + def __call__(self, environ, start_response): + script_name = environ.get('HTTP_X_SCRIPT_NAME', '') + if script_name: + environ['SCRIPT_NAME'] = script_name + path_info = environ['PATH_INFO'] + if path_info.startswith(script_name): + environ['PATH_INFO'] = path_info[len(script_name):] + + scheme = environ.get('HTTP_X_SCHEME', '') + if scheme: + environ['wsgi.url_scheme'] = scheme + return self.app(environ, start_response) diff --git a/server/workers/common/r_wrapper.py b/server/workers/common/common/r_wrapper.py similarity index 99% rename from server/workers/common/r_wrapper.py rename to server/workers/common/common/r_wrapper.py index 870b2c4cd..8332700a5 100644 --- a/server/workers/common/r_wrapper.py +++ b/server/workers/common/common/r_wrapper.py @@ -2,7 +2,6 @@ import sys import copy import json -import redis import logging diff --git a/server/workers/common/common/rate_limiter.py b/server/workers/common/common/rate_limiter.py new file mode 100644 index 000000000..7a63d3d66 --- /dev/null +++ b/server/workers/common/common/rate_limiter.py @@ -0,0 +1,39 @@ +import logging +from redis.exceptions import LockError +from redis import StrictRedis +from common.decorators import error_logging_aspect + +class RateLimiter: + """ + RateLimiter class to limit the number of requests to the ORCID API. + """ + def __init__(self, redis_store: StrictRedis, rate_key, separation = 0.1): + self.redis_store = redis_store + self.rate_key = rate_key + self.separation = separation + + @error_logging_aspect(log_level=logging.INFO) + def rate_limit_reached(self) -> bool: + """ + This implementation is inspired by an implementation of + Generic Cell Rate Algorithm based rate limiting, + seen on https://dev.to/astagi/rate-limiting-using-python-and-redis-58gk. + It has been simplified and adjusted to our use case. + + ORCID allows 24 requests per second, with a burst limit of 40 requests. 
See also: + https://info.orcid.org/ufaqs/what-are-the-api-limits/ + """ + + t = self.redis_store.time()[0] + self.redis_store.setnx(self.rate_key, 0) + try: + with self.redis_store.lock('lock:' + self.rate_key, blocking_timeout=5) as _: + theoretical_arrival_time = max(float(self.redis_store.get(self.rate_key)), t) + if theoretical_arrival_time - t <= 0: + new_theoretical_arrival_time = max(theoretical_arrival_time, t) + self.separation + self.redis_store.set(self.rate_key, new_theoretical_arrival_time) + return False + return True + # the locking mechanism is needed if a key is requested multiple times at the same time + except LockError: + return True \ No newline at end of file diff --git a/server/workers/api/src/apis/utils.py b/server/workers/common/common/utils.py similarity index 83% rename from server/workers/api/src/apis/utils.py rename to server/workers/common/common/utils.py index 7f8da3e71..aa0e240f3 100644 --- a/server/workers/api/src/apis/utils.py +++ b/server/workers/common/common/utils.py @@ -9,6 +9,7 @@ import pandas as pd import pathlib + redis_config = { "host": os.getenv("REDIS_HOST"), "port": os.getenv("REDIS_PORT"), @@ -16,8 +17,10 @@ "password": os.getenv("REDIS_PASSWORD"), "client_name": "api" } -redis_store = redis.StrictRedis(**redis_config) +print("Connecting to Redis with config: ", redis_config) + +redis_store = redis.StrictRedis(**redis_config) def get_key(store, key, timeout=180): wait_s = 1 @@ -80,7 +83,7 @@ def get_or_create_contentprovider_lookup(): k = str(uuid.uuid4()) d = {"id": k, "params": {},"endpoint": "contentproviders"} redis_store.rpush("base", json.dumps(d)) - result = get_key(redis_store, k) + result = get_key(redis_store, k, 10) if result.get("status") == "error": df = pd.read_json("contentproviders.json") df.set_index("internal_name", inplace=True) @@ -100,4 +103,19 @@ def get_or_create_contentprovider_lookup(): cp_dict = df.name.to_dict() return cp_dict -contentprovider_lookup = get_or_create_contentprovider_lookup() \ No newline at end of file +def get_nested_value(data, keys, default=None): + """ + Recursively retrieves a nested value from a dictionary. + + :param data: Dictionary to retrieve the value from + :param keys: List of keys to follow in the dictionary + :param default: Default value to return if any key is not found + :return: The retrieved value or the default value + """ + for key in keys: + if not hasattr(data, 'get'): + return default + data = data.get(key) + if data is None: + return default + return data diff --git a/server/workers/common/pyproject.toml b/server/workers/common/pyproject.toml new file mode 100644 index 000000000..cd8f542c5 --- /dev/null +++ b/server/workers/common/pyproject.toml @@ -0,0 +1,14 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "common" +version = "0.1.0" +description = "A common utility package shared across microservices." 
+ +[tool.setuptools.packages.find] +where = ["."] + +[tool.setuptools] +package-dir = {"" = "."} \ No newline at end of file diff --git a/server/workers/common/setup.py b/server/workers/common/setup.py new file mode 100644 index 000000000..19657e87b --- /dev/null +++ b/server/workers/common/setup.py @@ -0,0 +1,7 @@ +from setuptools import setup, find_packages + +setup( + name="your_project_name", + version="0.1", + packages=find_packages(), +) \ No newline at end of file diff --git a/server/workers/dataprocessing/.dockerignore b/server/workers/dataprocessing/.dockerignore new file mode 100644 index 000000000..2a5c729b7 --- /dev/null +++ b/server/workers/dataprocessing/.dockerignore @@ -0,0 +1,10 @@ +renv +__pycache__ +.cache +.pytest_cache +.Rproj.user +.RData +.Rhistory +*.Rproj +.pynb_checkpoints +*.ipynb \ No newline at end of file diff --git a/server/workers/dataprocessing/Dockerfile b/server/workers/dataprocessing/Dockerfile index 2f529795a..604bfb448 100644 --- a/server/workers/dataprocessing/Dockerfile +++ b/server/workers/dataprocessing/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:18.04 -MAINTAINER Chris Kittel "christopher.kittel@openknowledgemaps.org" +LABEL maintainer="Chris Kittel " ENV DEBIAN_FRONTEND=noninteractive @@ -155,7 +155,10 @@ COPY workers/dataprocessing/activate.R . RUN R -e 'renv::consent(provided = TRUE)' && \ R -e 'setwd("./"); renv::activate(); renv::restore(lockfile = "./renv.lock")' -COPY workers/common ./common +COPY workers/common ../common +COPY workers/dataprocessing/requirements-e.txt . +RUN pip3 install --no-cache-dir -r requirements-e.txt + COPY workers/dataprocessing ./dataprocessing COPY preprocessing/resources ./resources COPY preprocessing/other-scripts ./other-scripts diff --git a/server/workers/dataprocessing/requirements-e.txt b/server/workers/dataprocessing/requirements-e.txt new file mode 100644 index 000000000..a35d9cfef --- /dev/null +++ b/server/workers/dataprocessing/requirements-e.txt @@ -0,0 +1,2 @@ +# Include the common package from the local file system +-e ../common \ No newline at end of file diff --git a/server/workers/dataprocessing/src/headstart.py b/server/workers/dataprocessing/src/headstart.py index 5978737f6..aefc674e7 100644 --- a/server/workers/dataprocessing/src/headstart.py +++ b/server/workers/dataprocessing/src/headstart.py @@ -50,7 +50,8 @@ def next_item(self): k = msg.get('id') params = self.add_default_params(msg.get('params')) input_data = msg.get('input_data') - return k, params, input_data + author = msg.get('author') + return k, params, input_data, author def execute_search(self, params, input_data): q = params.get('q') @@ -84,7 +85,7 @@ def run(self): time.sleep(30) while self.tunnel_open: try: - k, params, input_data = self.next_item() + k, params, input_data, author = self.next_item() self.logger.debug(k) self.logger.debug(params) except (RedisError, ConnectionRefusedError): @@ -103,7 +104,10 @@ def run(self): res["status"] = "success" self.redis_store.set(k+"_output", json.dumps(res)) else: - res = self.execute_search(params, input_data) + res = {} + documents = self.execute_search(params, input_data) + res["documents"] = documents + res["author"] = author self.redis_store.set(k+"_output", json.dumps(res)) except ValueError as e: self.logger.error(params) diff --git a/server/workers/metrics/.dockerignore b/server/workers/metrics/.dockerignore new file mode 100644 index 000000000..2a5c729b7 --- /dev/null +++ b/server/workers/metrics/.dockerignore @@ -0,0 +1,10 @@ +renv +__pycache__ +.cache +.pytest_cache +.Rproj.user 
+.RData +.Rhistory +*.Rproj +.pynb_checkpoints +*.ipynb \ No newline at end of file diff --git a/server/workers/metrics/Dockerfile b/server/workers/metrics/Dockerfile new file mode 100644 index 000000000..d42d901b0 --- /dev/null +++ b/server/workers/metrics/Dockerfile @@ -0,0 +1,170 @@ +FROM ubuntu:20.04 + +LABEL maintainer="Chris Kittel " + +ENV DEBIAN_FRONTEND=noninteractive + +ARG R_VERSION +ARG BUILD_DATE +ARG CRAN +## Setting a BUILD_DATE will set CRAN to the matching MRAN date +## No BUILD_DATE means that CRAN will default to latest +ENV R_VERSION=${R_VERSION:-3.6.3} \ + CRAN=${CRAN:-https://cran.rstudio.com} + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + bash-completion \ + ca-certificates \ + file \ + fonts-texgyre \ + g++ \ + gfortran \ + gsfonts \ + libblas-dev \ + libbz2-1.0 \ + libcurl4 \ + libjpeg-turbo8-dev \ + libopenblas-dev \ + libpangocairo-1.0-0 \ + libpcre3 \ + libpng16-16 \ + libreadline-dev \ + libtiff5 \ + liblzma5 \ + locales \ + make \ + unzip \ + zip \ + zlib1g \ + && BUILDDEPS="curl \ + default-jdk \ + libbz2-dev \ + libcairo2-dev \ + libcurl4-openssl-dev \ + libpango1.0-dev \ + libjpeg-dev \ + libpcre3-dev \ + libpng-dev \ + libreadline-dev \ + libtiff5-dev \ + liblzma-dev \ + libx11-dev \ + libxt-dev \ + perl \ + tcl8.6-dev \ + tk8.6-dev \ + x11proto-core-dev \ + xauth \ + xfonts-base \ + xvfb \ + zlib1g-dev" \ + && apt-get install -y --no-install-recommends $BUILDDEPS \ + && cd tmp/ \ + ## Download source code + && curl -O https://cran.r-project.org/src/base/R-3/R-${R_VERSION}.tar.gz \ + ## Extract source code + && tar -xf R-${R_VERSION}.tar.gz \ + && cd R-${R_VERSION} \ + ## Set compiler flags + && R_PAPERSIZE=letter \ + R_BATCHSAVE="--no-save --no-restore" \ + R_BROWSER=xdg-open \ + PAGER=/usr/bin/pager \ + PERL=/usr/bin/perl \ + R_UNZIPCMD=/usr/bin/unzip \ + R_ZIPCMD=/usr/bin/zip \ + R_PRINTCMD=/usr/bin/lpr \ + LIBnn=lib \ + AWK=/usr/bin/awk \ + CFLAGS="-g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g" \ + CXXFLAGS="-g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g" \ + ## Configure options + ./configure --enable-R-shlib \ + --enable-memory-profiling \ + --with-readline \ + --with-blas \ + --with-tcltk \ + --disable-nls \ + --with-recommended-packages \ + ## Build and install + && make \ + && make install \ + ## Add a library directory (for user-installed packages) + && mkdir -p /usr/local/lib/R/site-library \ + && chown root:staff /usr/local/lib/R/site-library \ + && chmod g+ws /usr/local/lib/R/site-library \ + ## Fix library path + && sed -i '/^R_LIBS_USER=.*$/d' /usr/local/lib/R/etc/Renviron \ + && echo "R_LIBS_USER=\${R_LIBS_USER-'/usr/local/lib/R/site-library'}" >> /usr/local/lib/R/etc/Renviron \ + && echo "R_LIBS=\${R_LIBS-'/usr/local/lib/R/site-library:/usr/local/lib/R/library:/usr/lib/R/library'}" >> /usr/local/lib/R/etc/Renviron \ + ## Set configured CRAN mirror + && if [ -z "$BUILD_DATE" ]; then MRAN=$CRAN; \ + else MRAN=https://mran.microsoft.com/snapshot/${BUILD_DATE}; fi \ + && echo MRAN=$MRAN >> /etc/environment \ + && echo "options(repos = c(CRAN='$MRAN'), download.file.method = 'libcurl')" >> /usr/local/lib/R/etc/Rprofile.site \ + ## Use littler installation scripts + && Rscript -e "install.packages(c('littler', 'docopt'), repo = '$CRAN')" \ + && ln -s /usr/local/lib/R/site-library/littler/examples/install2.r /usr/local/bin/install2.r \ + && ln -s /usr/local/lib/R/site-library/littler/examples/installGithub.r 
/usr/local/bin/installGithub.r \ + && ln -s /usr/local/lib/R/site-library/littler/bin/r /usr/local/bin/r \ + ## Clean up from R source install + && cd / \ + && rm -rf /tmp/* \ + && apt-get remove --purge -y $BUILDDEPS \ + && apt-get autoremove -y \ + && apt-get autoclean -y \ + && rm -rf /var/lib/apt/lists/* + +RUN locale-gen en_US.UTF-8 && \ + update-locale LANG=en_US.UTF-8 && \ + update-locale LC_ALL=en_US.UTF-8 && \ + export LANGUAGE=en_US.UTF-8 && \ + export LANG=en_US.UTF-8 && \ + export LC_ALL=en_US.UTF-8 && \ + dpkg-reconfigure locales + +RUN apt update && apt full-upgrade -y && \ + apt install -y links curl vim libcurl4-openssl-dev \ + libxml2-dev libz-dev libpoppler-cpp-dev \ + libopenmpi-dev libzmq3-dev build-essential python3-dev \ + libssl1.1 libssl-dev && \ + apt clean && \ + rm -f /etc/localtime && \ + ln -s /usr/share/zoneinfo/Europe/Vienna /etc/localtime && \ + dpkg --configure -a + +RUN apt-get -y install python3 python3-pip + +RUN apt-get -y install automake +RUN R -e 'options(repos="https://cran.wu.ac.at")' && \ + R -e 'install.packages("remotes")' && \ + R -e 'install.packages("renv", version="0.14.0-5")' + +ENV PYTHONPATH="/headstart/:/headstart/metrics/:/headstart/metrics/src/" + +WORKDIR /headstart +COPY workers/metrics/requirements.txt . +RUN pip3 install --no-cache-dir Cython +RUN pip3 install --upgrade pip +RUN pip3 install --no-cache-dir -r requirements.txt + +COPY workers/metrics/renv.lock . +COPY workers/metrics/activate.R . + +RUN apt-get -y install automake +RUN R -e 'renv::consent(provided = TRUE)' && \ + R -e 'setwd("./"); renv::activate(); renv::restore(lockfile = "./renv.lock")' + +COPY workers/common ../common +COPY workers/metrics/requirements-e.txt . +RUN pip3 install --no-cache-dir -r requirements-e.txt + +COPY workers/metrics ./metrics +COPY preprocessing/resources ./resources +COPY preprocessing/other-scripts ./other-scripts +RUN mkdir -p /var/log/headstart && touch /var/log/headstart/headstart.log + +COPY workers/metrics/*.py ./ + +CMD ["python3", "run_metrics.py"] diff --git a/server/workers/metrics/__init__.py b/server/workers/metrics/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/server/workers/metrics/activate.R b/server/workers/metrics/activate.R new file mode 100644 index 000000000..304fd900a --- /dev/null +++ b/server/workers/metrics/activate.R @@ -0,0 +1,668 @@ + +local({ + + # the requested version of renv + version <- "0.14.0" + + # the project directory + project <- getwd() + + # allow environment variable to control activation + activate <- Sys.getenv("RENV_ACTIVATE_PROJECT") + if (!nzchar(activate)) { + + # don't auto-activate when R CMD INSTALL is running + if (nzchar(Sys.getenv("R_INSTALL_PKG"))) + return(FALSE) + + } + + # bail if activation was explicitly disabled + if (tolower(activate) %in% c("false", "f", "0")) + return(FALSE) + + # avoid recursion + if (nzchar(Sys.getenv("RENV_R_INITIALIZING"))) + return(invisible(TRUE)) + + # signal that we're loading renv during R startup + Sys.setenv("RENV_R_INITIALIZING" = "true") + on.exit(Sys.unsetenv("RENV_R_INITIALIZING"), add = TRUE) + + # signal that we've consented to use renv + options(renv.consent = TRUE) + + # load the 'utils' package eagerly -- this ensures that renv shims, which + # mask 'utils' packages, will come first on the search path + library(utils, lib.loc = .Library) + + # check to see if renv has already been loaded + if ("renv" %in% loadedNamespaces()) { + + # if renv has already been loaded, and it's the requested version of renv, + # 
nothing to do + spec <- .getNamespaceInfo(.getNamespace("renv"), "spec") + if (identical(spec[["version"]], version)) + return(invisible(TRUE)) + + # otherwise, unload and attempt to load the correct version of renv + unloadNamespace("renv") + + } + + # load bootstrap tools + bootstrap <- function(version, library) { + + # attempt to download renv + tarball <- tryCatch(renv_bootstrap_download(version), error = identity) + if (inherits(tarball, "error")) + stop("failed to download renv ", version) + + # now attempt to install + status <- tryCatch(renv_bootstrap_install(version, tarball, library), error = identity) + if (inherits(status, "error")) + stop("failed to install renv ", version) + + } + + renv_bootstrap_tests_running <- function() { + getOption("renv.tests.running", default = FALSE) + } + + renv_bootstrap_repos <- function() { + + # check for repos override + repos <- Sys.getenv("RENV_CONFIG_REPOS_OVERRIDE", unset = NA) + if (!is.na(repos)) + return(repos) + + # if we're testing, re-use the test repositories + if (renv_bootstrap_tests_running()) + return(getOption("renv.tests.repos")) + + # retrieve current repos + repos <- getOption("repos") + + # ensure @CRAN@ entries are resolved + repos[repos == "@CRAN@"] <- getOption( + "renv.repos.cran", + "https://cloud.r-project.org" + ) + + # add in renv.bootstrap.repos if set + default <- c(FALLBACK = "https://cloud.r-project.org") + extra <- getOption("renv.bootstrap.repos", default = default) + repos <- c(repos, extra) + + # remove duplicates that might've snuck in + dupes <- duplicated(repos) | duplicated(names(repos)) + repos[!dupes] + + } + + renv_bootstrap_download <- function(version) { + + # if the renv version number has 4 components, assume it must + # be retrieved via github + nv <- numeric_version(version) + components <- unclass(nv)[[1]] + + methods <- if (length(components) == 4L) { + list( + renv_bootstrap_download_github + ) + } else { + list( + renv_bootstrap_download_cran_latest, + renv_bootstrap_download_cran_archive + ) + } + + for (method in methods) { + path <- tryCatch(method(version), error = identity) + if (is.character(path) && file.exists(path)) + return(path) + } + + stop("failed to download renv ", version) + + } + + renv_bootstrap_download_impl <- function(url, destfile) { + + mode <- "wb" + + # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17715 + fixup <- + Sys.info()[["sysname"]] == "Windows" && + substring(url, 1L, 5L) == "file:" + + if (fixup) + mode <- "w+b" + + utils::download.file( + url = url, + destfile = destfile, + mode = mode, + quiet = TRUE + ) + + } + + renv_bootstrap_download_cran_latest <- function(version) { + + spec <- renv_bootstrap_download_cran_latest_find(version) + + message("* Downloading renv ", version, " ... 
", appendLF = FALSE) + + type <- spec$type + repos <- spec$repos + + info <- tryCatch( + utils::download.packages( + pkgs = "renv", + destdir = tempdir(), + repos = repos, + type = type, + quiet = TRUE + ), + condition = identity + ) + + if (inherits(info, "condition")) { + message("FAILED") + return(FALSE) + } + + # report success and return + message("OK (downloaded ", type, ")") + info[1, 2] + + } + + renv_bootstrap_download_cran_latest_find <- function(version) { + + # check whether binaries are supported on this system + binary <- + getOption("renv.bootstrap.binary", default = TRUE) && + !identical(.Platform$pkgType, "source") && + !identical(getOption("pkgType"), "source") && + Sys.info()[["sysname"]] %in% c("Darwin", "Windows") + + types <- c(if (binary) "binary", "source") + + # iterate over types + repositories + for (type in types) { + for (repos in renv_bootstrap_repos()) { + + # retrieve package database + db <- tryCatch( + as.data.frame( + utils::available.packages(type = type, repos = repos), + stringsAsFactors = FALSE + ), + error = identity + ) + + if (inherits(db, "error")) + next + + # check for compatible entry + entry <- db[db$Package %in% "renv" & db$Version %in% version, ] + if (nrow(entry) == 0) + next + + # found it; return spec to caller + spec <- list(entry = entry, type = type, repos = repos) + return(spec) + + } + } + + # if we got here, we failed to find renv + fmt <- "renv %s is not available from your declared package repositories" + stop(sprintf(fmt, version)) + + } + + renv_bootstrap_download_cran_archive <- function(version) { + + name <- sprintf("renv_%s.tar.gz", version) + repos <- renv_bootstrap_repos() + urls <- file.path(repos, "src/contrib/Archive/renv", name) + destfile <- file.path(tempdir(), name) + + message("* Downloading renv ", version, " ... ", appendLF = FALSE) + + for (url in urls) { + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (identical(status, 0L)) { + message("OK") + return(destfile) + } + + } + + message("FAILED") + return(FALSE) + + } + + renv_bootstrap_download_github <- function(version) { + + enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") + if (!identical(enabled, "TRUE")) + return(FALSE) + + # prepare download options + pat <- Sys.getenv("GITHUB_PAT") + if (nzchar(Sys.which("curl")) && nzchar(pat)) { + fmt <- "--location --fail --header \"Authorization: token %s\"" + extra <- sprintf(fmt, pat) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "curl", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { + fmt <- "--header=\"Authorization: token %s\"" + extra <- sprintf(fmt, pat) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "wget", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } + + message("* Downloading renv ", version, " from GitHub ... 
", appendLF = FALSE) + + url <- file.path("https://api.github.com/repos/rstudio/renv/tarball", version) + name <- sprintf("renv_%s.tar.gz", version) + destfile <- file.path(tempdir(), name) + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (!identical(status, 0L)) { + message("FAILED") + return(FALSE) + } + + message("OK") + return(destfile) + + } + + renv_bootstrap_install <- function(version, tarball, library) { + + # attempt to install it into project library + message("* Installing renv ", version, " ... ", appendLF = FALSE) + dir.create(library, showWarnings = FALSE, recursive = TRUE) + + # invoke using system2 so we can capture and report output + bin <- R.home("bin") + exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R" + r <- file.path(bin, exe) + args <- c("--vanilla", "CMD", "INSTALL", "-l", shQuote(library), shQuote(tarball)) + output <- system2(r, args, stdout = TRUE, stderr = TRUE) + message("Done!") + + # check for successful install + status <- attr(output, "status") + if (is.numeric(status) && !identical(status, 0L)) { + header <- "Error installing renv:" + lines <- paste(rep.int("=", nchar(header)), collapse = "") + text <- c(header, lines, output) + writeLines(text, con = stderr()) + } + + status + + } + + renv_bootstrap_platform_prefix <- function() { + + # construct version prefix + version <- paste(R.version$major, R.version$minor, sep = ".") + prefix <- paste("R", numeric_version(version)[1, 1:2], sep = "-") + + # include SVN revision for development versions of R + # (to avoid sharing platform-specific artefacts with released versions of R) + devel <- + identical(R.version[["status"]], "Under development (unstable)") || + identical(R.version[["nickname"]], "Unsuffered Consequences") + + if (devel) + prefix <- paste(prefix, R.version[["svn rev"]], sep = "-r") + + # build list of path components + components <- c(prefix, R.version$platform) + + # include prefix if provided by user + prefix <- renv_bootstrap_platform_prefix_impl() + if (!is.na(prefix) && nzchar(prefix)) + components <- c(prefix, components) + + # build prefix + paste(components, collapse = "/") + + } + + renv_bootstrap_platform_prefix_impl <- function() { + + # if an explicit prefix has been supplied, use it + prefix <- Sys.getenv("RENV_PATHS_PREFIX", unset = NA) + if (!is.na(prefix)) + return(prefix) + + # if the user has requested an automatic prefix, generate it + auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA) + if (auto %in% c("TRUE", "True", "true", "1")) + return(renv_bootstrap_platform_prefix_auto()) + + # empty string on failure + "" + + } + + renv_bootstrap_platform_prefix_auto <- function() { + + prefix <- tryCatch(renv_bootstrap_platform_os(), error = identity) + if (inherits(prefix, "error") || prefix %in% "unknown") { + + msg <- paste( + "failed to infer current operating system", + "please file a bug report at https://github.com/rstudio/renv/issues", + sep = "; " + ) + + warning(msg) + + } + + prefix + + } + + renv_bootstrap_platform_os <- function() { + + sysinfo <- Sys.info() + sysname <- sysinfo[["sysname"]] + + # handle Windows + macOS up front + if (sysname == "Windows") + return("windows") + else if (sysname == "Darwin") + return("macos") + + # check for os-release files + for (file in c("/etc/os-release", "/usr/lib/os-release")) + if (file.exists(file)) + return(renv_bootstrap_platform_os_via_os_release(file, sysinfo)) + + # check for redhat-release files + if (file.exists("/etc/redhat-release")) + 
return(renv_bootstrap_platform_os_via_redhat_release()) + + "unknown" + + } + + renv_bootstrap_platform_os_via_os_release <- function(file, sysinfo) { + + # read /etc/os-release + release <- utils::read.table( + file = file, + sep = "=", + quote = c("\"", "'"), + col.names = c("Key", "Value"), + comment.char = "#", + stringsAsFactors = FALSE + ) + + vars <- as.list(release$Value) + names(vars) <- release$Key + + # get os name + os <- tolower(sysinfo[["sysname"]]) + + # read id + id <- "unknown" + for (field in c("ID", "ID_LIKE")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + id <- vars[[field]] + break + } + } + + # read version + version <- "unknown" + for (field in c("UBUNTU_CODENAME", "VERSION_CODENAME", "VERSION_ID", "BUILD_ID")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + version <- vars[[field]] + break + } + } + + # join together + paste(c(os, id, version), collapse = "-") + + } + + renv_bootstrap_platform_os_via_redhat_release <- function() { + + # read /etc/redhat-release + contents <- readLines("/etc/redhat-release", warn = FALSE) + + # infer id + id <- if (grepl("centos", contents, ignore.case = TRUE)) + "centos" + else if (grepl("redhat", contents, ignore.case = TRUE)) + "redhat" + else + "unknown" + + # try to find a version component (very hacky) + version <- "unknown" + + parts <- strsplit(contents, "[[:space:]]")[[1L]] + for (part in parts) { + + nv <- tryCatch(numeric_version(part), error = identity) + if (inherits(nv, "error")) + next + + version <- nv[1, 1] + break + + } + + paste(c("linux", id, version), collapse = "-") + + } + + renv_bootstrap_library_root_name <- function(project) { + + # use project name as-is if requested + asis <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT_ASIS", unset = "FALSE") + if (asis) + return(basename(project)) + + # otherwise, disambiguate based on project's path + id <- substring(renv_bootstrap_hash_text(project), 1L, 8L) + paste(basename(project), id, sep = "-") + + } + + renv_bootstrap_library_root <- function(project) { + + path <- Sys.getenv("RENV_PATHS_LIBRARY", unset = NA) + if (!is.na(path)) + return(path) + + path <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT", unset = NA) + if (!is.na(path)) { + name <- renv_bootstrap_library_root_name(project) + return(file.path(path, name)) + } + + prefix <- renv_bootstrap_profile_prefix() + paste(c(project, prefix, "renv/library"), collapse = "/") + + } + + renv_bootstrap_validate_version <- function(version) { + + loadedversion <- utils::packageDescription("renv", fields = "Version") + if (version == loadedversion) + return(TRUE) + + # assume four-component versions are from GitHub; three-component + # versions are from CRAN + components <- strsplit(loadedversion, "[.-]")[[1]] + remote <- if (length(components) == 4L) + paste("rstudio/renv", loadedversion, sep = "@") + else + paste("renv", loadedversion, sep = "@") + + fmt <- paste( + "renv %1$s was loaded from project library, but this project is configured to use renv %2$s.", + "Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile.", + "Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library.", + sep = "\n" + ) + + msg <- sprintf(fmt, loadedversion, version, remote) + warning(msg, call. 
= FALSE) + + FALSE + + } + + renv_bootstrap_hash_text <- function(text) { + + hashfile <- tempfile("renv-hash-") + on.exit(unlink(hashfile), add = TRUE) + + writeLines(text, con = hashfile) + tools::md5sum(hashfile) + + } + + renv_bootstrap_load <- function(project, libpath, version) { + + # try to load renv from the project library + if (!requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) + return(FALSE) + + # warn if the version of renv loaded does not match + renv_bootstrap_validate_version(version) + + # load the project + renv::load(project) + + TRUE + + } + + renv_bootstrap_profile_load <- function(project) { + + # if RENV_PROFILE is already set, just use that + profile <- Sys.getenv("RENV_PROFILE", unset = NA) + if (!is.na(profile) && nzchar(profile)) + return(profile) + + # check for a profile file (nothing to do if it doesn't exist) + path <- file.path(project, "renv/local/profile") + if (!file.exists(path)) + return(NULL) + + # read the profile, and set it if it exists + contents <- readLines(path, warn = FALSE) + if (length(contents) == 0L) + return(NULL) + + # set RENV_PROFILE + profile <- contents[[1L]] + if (nzchar(profile)) + Sys.setenv(RENV_PROFILE = profile) + + profile + + } + + renv_bootstrap_profile_prefix <- function() { + profile <- renv_bootstrap_profile_get() + if (!is.null(profile)) + return(file.path("renv/profiles", profile)) + } + + renv_bootstrap_profile_get <- function() { + profile <- Sys.getenv("RENV_PROFILE", unset = "") + renv_bootstrap_profile_normalize(profile) + } + + renv_bootstrap_profile_set <- function(profile) { + profile <- renv_bootstrap_profile_normalize(profile) + if (is.null(profile)) + Sys.unsetenv("RENV_PROFILE") + else + Sys.setenv(RENV_PROFILE = profile) + } + + renv_bootstrap_profile_normalize <- function(profile) { + + if (is.null(profile) || profile %in% c("", "default")) + return(NULL) + + profile + + } + + # load the renv profile, if any + renv_bootstrap_profile_load(project) + + # construct path to library root + root <- renv_bootstrap_library_root(project) + + # construct library prefix for platform + prefix <- renv_bootstrap_platform_prefix() + + # construct full libpath + libpath <- file.path(root, prefix) + + # attempt to load + if (renv_bootstrap_load(project, libpath, version)) + return(TRUE) + + # load failed; inform user we're about to bootstrap + prefix <- paste("# Bootstrapping renv", version) + postfix <- paste(rep.int("-", 77L - nchar(prefix)), collapse = "") + header <- paste(prefix, postfix) + message(header) + + # perform bootstrap + bootstrap(version, libpath) + + # exit early if we're just testing bootstrap + if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA))) + return(TRUE) + + # try again to load + if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { + message("* Successfully installed and loaded renv ", version, ".") + return(renv::load()) + } + + # failed to download or load renv; warn the user + msg <- c( + "Failed to find an renv installation: the project will not be loaded.", + "Use `renv::activate()` to re-initialize the project." + ) + + warning(paste(msg, collapse = "\n"), call. 
= FALSE) + +}) diff --git a/server/workers/metrics/dependencies.R b/server/workers/metrics/dependencies.R new file mode 100644 index 000000000..814daa1ec --- /dev/null +++ b/server/workers/metrics/dependencies.R @@ -0,0 +1,7 @@ +library(rAltmetric) +library(rcrossref) +library(logging) +library(jsonlite) +library(doParallel) +library(dplyr) +library(stringdist) \ No newline at end of file diff --git a/server/workers/metrics/example_metrics.env b/server/workers/metrics/example_metrics.env new file mode 100644 index 000000000..568972967 --- /dev/null +++ b/server/workers/metrics/example_metrics.env @@ -0,0 +1,8 @@ +LOGFILE=/var/log/headstart/headstart.log +RENV_VERSION=0.14.0-5 +CRAN_REPOS=https://cran.wu.ac.at +LOGLEVEL=DEBUG +LC_ALL="en_US.UTF-8", +LANG="en_US.UTF-8" +RENV_PATHS_CACHE=/renv/cache +PYTHONIOENCODING="utf-8" \ No newline at end of file diff --git a/server/workers/metrics/renv.lock b/server/workers/metrics/renv.lock new file mode 100644 index 000000000..d3b212761 --- /dev/null +++ b/server/workers/metrics/renv.lock @@ -0,0 +1,559 @@ +{ + "R": { + "Version": "3.6.3", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cloud.r-project.org" + } + ] + }, + "Packages": { + "DT": { + "Package": "DT", + "Version": "0.33", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "64ff3427f559ce3f2597a4fe13255cb6" + }, + "R6": { + "Package": "R6", + "Version": "2.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "470851b6d5d0ac559e9d01bb352b4021" + }, + "Rcpp": { + "Package": "Rcpp", + "Version": "1.0.13", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "f27411eb6d9c3dada5edd444b8416675" + }, + "XML": { + "Package": "XML", + "Version": "3.98-1.20", + "Source": "URL", + "Repository": "CRAN", + "RemoteType": "url", + "RemoteUrl": "https://cran.r-project.org/src/contrib/Archive/XML/XML_3.98-1.20.tar.gz", + "Hash": "9de1032729115c474bd8a49838f46731" + }, + "askpass": { + "Package": "askpass", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "cad6cf7f1d5f6e906700b9d3e718c796" + }, + "base64enc": { + "Package": "base64enc", + "Version": "0.1-3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "543776ae6848fde2f48ff3816d0628bc" + }, + "bslib": { + "Package": "bslib", + "Version": "0.8.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b299c6741ca9746fb227debcb0f9fb6c" + }, + "cachem": { + "Package": "cachem", + "Version": "1.1.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "cd9a672193789068eb5a2aad65a0dedf" + }, + "cli": { + "Package": "cli", + "Version": "3.6.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b21916dd77a27642b447374a5d30ecf3" + }, + "codetools": { + "Package": "codetools", + "Version": "0.2-16", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "89cf4b8207269ccf82fbeb6473fd662b" + }, + "commonmark": { + "Package": "commonmark", + "Version": "1.9.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5d8225445acb167abf7797de48b2ee3c" + }, + "crayon": { + "Package": "crayon", + "Version": "1.5.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "859d96e65ef198fd43e82b9628d593ef" + }, + "crosstalk": { + "Package": "crosstalk", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "ab12c7b080a57475248a30f4db6298c0" + }, + "crul": { + "Package": "crul", + "Version": "1.5.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "22e70c5046981d39b7bf7af74433e396" + }, + "curl": { + 
"Package": "curl", + "Version": "5.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "411ca2c03b1ce5f548345d2fc2685f7a" + }, + "data.table": { + "Package": "data.table", + "Version": "1.15.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8ee9ac56ef633d0c7cab8b2ca87d683e" + }, + "digest": { + "Package": "digest", + "Version": "0.6.36", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "fd6824ad91ede64151e93af67df6376b" + }, + "doParallel": { + "Package": "doParallel", + "Version": "1.0.16", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "2dc413572eb42475179bfe0afabd2adf" + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" + }, + "evaluate": { + "Package": "evaluate", + "Version": "0.23", + "Source": "URL", + "Repository": "CRAN", + "RemoteType": "url", + "RemoteUrl": "https://cran.r-project.org/src/contrib/Archive/evaluate/evaluate_0.23.tar.gz", + "Hash": "7171a28eaa45639b4ea80bb6beaaf0a1" + }, + "fansi": { + "Package": "fansi", + "Version": "1.0.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "962174cf2aeb5b9eea581522286a911f" + }, + "fastmap": { + "Package": "fastmap", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "aa5e1cd11c2d15497494c5292d7ffcc8" + }, + "fontawesome": { + "Package": "fontawesome", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c2efdd5f0bcd1ea861c2d4e2a883a67d" + }, + "foreach": { + "Package": "foreach", + "Version": "1.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "e32cfc0973caba11b65b1fa691b4d8c9" + }, + "fs": { + "Package": "fs", + "Version": "1.6.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "15aeb8c27f5ea5161f9f6a641fafd93a" + }, + "generics": { + "Package": "generics", + "Version": "0.1.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "15e9634c0fcd294799e9b2e929ed1b86" + }, + "glue": { + "Package": "glue", + "Version": "1.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "e0b3a53876554bd45879e596cdb10a52" + }, + "highr": { + "Package": "highr", + "Version": "0.11", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d65ba49117ca223614f71b60d85b8ab7" + }, + "htmltools": { + "Package": "htmltools", + "Version": "0.5.8.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "81d371a9cc60640e74e4ab6ac46dcedc" + }, + "htmlwidgets": { + "Package": "htmlwidgets", + "Version": "1.6.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "04291cc45198225444a397606810ac37" + }, + "httpcode": { + "Package": "httpcode", + "Version": "0.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "13641a1c6d2cc98801b76764078e17ea" + }, + "httpuv": { + "Package": "httpuv", + "Version": "1.6.15", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d55aa087c47a63ead0f6fc10f8fa1ee0" + }, + "httr": { + "Package": "httr", + "Version": "1.4.7", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "ac107251d9d9fd72f0ca8049988f1d7f" + }, + "iterators": { + "Package": "iterators", + "Version": "1.0.13", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "64778782a89480e9a644f69aad9a2877" + }, + "jquerylib": { + "Package": "jquerylib", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5aab57a3bd297eee1c1d862735972182" + }, + "jsonlite": { + "Package": "jsonlite", + "Version": "1.8.8", + 
"Source": "Repository", + "Repository": "CRAN", + "Hash": "e1b9c55281c5adc4dd113652d9e26768" + }, + "knitr": { + "Package": "knitr", + "Version": "1.48", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "acf380f300c721da9fde7df115a5f86f" + }, + "later": { + "Package": "later", + "Version": "1.3.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "a3e051d405326b8b0012377434c62b37" + }, + "lazyeval": { + "Package": "lazyeval", + "Version": "0.2.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d908914ae53b04d4c0c0fd72ecc35370" + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b8552d117e1b808b09a832f589b79035" + }, + "logging": { + "Package": "logging", + "Version": "0.10-108", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "fbef2ca79e23f11e033e89317b4c4770" + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "7ce2733a9826b3aeb1775d56fd305472" + }, + "memoise": { + "Package": "memoise", + "Version": "2.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "e2817ccf4a065c5d9d7f2cfbe7c1d78c" + }, + "mime": { + "Package": "mime", + "Version": "0.12", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "18e9c28c1d3ca1560ce30658b22ce104" + }, + "miniUI": { + "Package": "miniUI", + "Version": "0.1.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "fec5f52652d60615fdb3957b3d74324a" + }, + "openssl": { + "Package": "openssl", + "Version": "2.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "2bcca3848e4734eb3b16103bc9aa4b8e" + }, + "pillar": { + "Package": "pillar", + "Version": "1.9.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "15da5a8412f317beeee6175fbc76f4bb" + }, + "pkgconfig": { + "Package": "pkgconfig", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "01f28d4278f15c76cddbea05899c5d6f" + }, + "plyr": { + "Package": "plyr", + "Version": "1.8.9", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "6b8177fd19982f0020743fadbfdbd933" + }, + "promises": { + "Package": "promises", + "Version": "1.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "434cd5388a3979e74be5c219bcd6e77d" + }, + "rAltmetric": { + "Package": "rAltmetric", + "Version": "0.7.9000", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "rAltmetric", + "RemoteUsername": "OpenKnowledgeMaps", + "RemoteRef": "HEAD", + "RemoteSha": "2e0fec44a736a2c6bacc0fbec6d899d7bda12c24", + "Hash": "003ad437f2a08a189a4babc0c07cba89" + }, + "rappdirs": { + "Package": "rappdirs", + "Version": "0.3.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5e3c5dc0b071b21fa128676560dbe94d" + }, + "rcrossref": { + "Package": "rcrossref", + "Version": "1.2.009", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "rcrossref", + "RemoteUsername": "OpenKnowledgeMaps", + "RemoteRef": "HEAD", + "RemoteSha": "3af60e484155fde4781d36eb8d30b94b6da03b77", + "Hash": "7214795f0df1a8bc7c2361adad27126b" + }, + "renv": { + "Package": "renv", + "Version": "0.14.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "30e5eba91b67f7f4d75d31de14bbfbdc" + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "3eec01f8b1dee337674b2e34ab1f9bc1" + }, + "rlist": { + "Package": 
"rlist", + "Version": "0.4.6.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "290c8ea0700d2e7258082d0025386e68" + }, + "rmarkdown": { + "Package": "rmarkdown", + "Version": "2.27", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "27f9502e1cdbfa195f94e03b0f517484" + }, + "sass": { + "Package": "sass", + "Version": "0.4.9", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d53dbfddf695303ea4ad66f86e99b95d" + }, + "shiny": { + "Package": "shiny", + "Version": "1.9.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "01666113b04d504390d60736f39bd0d0" + }, + "sourcetools": { + "Package": "sourcetools", + "Version": "0.1.7-1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5f5a7629f956619d519205ec475fe647" + }, + "stringdist": { + "Package": "stringdist", + "Version": "0.9.12", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "f360720fa3feb7db9d4133b31ebb067f" + }, + "stringi": { + "Package": "stringi", + "Version": "1.8.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "39e1144fd75428983dc3f63aa53dfa91" + }, + "stringr": { + "Package": "stringr", + "Version": "1.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "960e2ae9e09656611e0b8214ad543207" + }, + "sys": { + "Package": "sys", + "Version": "3.4.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "3a1be13d68d47a8cd0bfd74739ca1555" + }, + "tibble": { + "Package": "tibble", + "Version": "3.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "a84e2cc86d07289b3b6f5069df7a004c" + }, + "tidyselect": { + "Package": "tidyselect", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "829f27b9c4919c16b593794a6344d6c0" + }, + "tinytex": { + "Package": "tinytex", + "Version": "0.52", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "cfbad971a71f0e27cec22e544a08bc3b" + }, + "triebeard": { + "Package": "triebeard", + "Version": "0.4.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "642507a148b0dd9b5620177e0a044413" + }, + "urltools": { + "Package": "urltools", + "Version": "1.7.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "e86a704261a105f4703f653e05defa3e" + }, + "utf8": { + "Package": "utf8", + "Version": "1.2.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "62b65c52671e6665f803ff02954446e9" + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c03fa420630029418f7e6da3667aac4a" + }, + "withr": { + "Package": "withr", + "Version": "3.0.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d31b6c62c10dcf11ec530ca6b0dd5d35" + }, + "xfun": { + "Package": "xfun", + "Version": "0.46", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "00ce32f398db0415dde61abfef11300c" + }, + "xml2": { + "Package": "xml2", + "Version": "1.3.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "1d0336142f4cd25d8d23cd3ba7a8fb61" + }, + "xtable": { + "Package": "xtable", + "Version": "1.8-4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b8acdf8af494d9ec19ccb2481a9b11c2" + }, + "yaml": { + "Package": "yaml", + "Version": "2.3.10", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "51dab85c6c98e50a18d7551e9d49f76c" + } + } +} diff --git a/server/workers/metrics/requirements-e.txt b/server/workers/metrics/requirements-e.txt new file mode 100644 index 000000000..a35d9cfef --- /dev/null +++ b/server/workers/metrics/requirements-e.txt @@ -0,0 
+1,2 @@ +# Include the common package from the local file system +-e ../common \ No newline at end of file diff --git a/server/workers/metrics/requirements.txt b/server/workers/metrics/requirements.txt new file mode 100644 index 000000000..7679debf7 --- /dev/null +++ b/server/workers/metrics/requirements.txt @@ -0,0 +1,21 @@ +asn1crypto==0.24.0 +async-timeout==4.0.2 +cryptography==2.1.4 +idna==2.6 +importlib-metadata==4.8.3 +keyring==10.6.0 +keyrings.alt==3.0 +Levenshtein==0.21.1 +numpy==1.19.5 +packaging==21.3 +pandas==1.1.5 +pycrypto==2.6.1 +pyparsing==3.1.1 +python-dateutil==2.8.2 +pytz==2023.3.post1 +pyxdg==0.25 +redis==4.3.6 +SecretStorage==2.3.1 +six==1.11.0 +typing-extensions==4.1.1 +zipp==3.6.0 diff --git a/server/workers/orcid/run_orcid.py b/server/workers/metrics/run_metrics.py similarity index 51% rename from server/workers/orcid/run_orcid.py rename to server/workers/metrics/run_metrics.py index bd9cdc45d..2d71e936c 100644 --- a/server/workers/orcid/run_orcid.py +++ b/server/workers/metrics/run_metrics.py @@ -1,20 +1,25 @@ import os + # import json import redis -from orcid.src.orcid import OrcidClient +from metrics.src.metrics import MetricsClient -if __name__ == '__main__': +if __name__ == "__main__": redis_config = { "host": os.getenv("REDIS_HOST"), "port": os.getenv("REDIS_PORT"), "db": os.getenv("REDIS_DB"), "password": os.getenv("REDIS_PASSWORD"), - "client_name": "orcid_retrieval" + "client_name": "metrics_retrieval", } redis_store = redis.StrictRedis(**redis_config) - wrapper = OrcidClient(redis_store, - "english", - os.environ.get("LOGLEVEL", "INFO")) + wrapper = MetricsClient( + "./other-scripts", + "run_metrics.R", + redis_store, + "english", + os.environ.get("LOGLEVEL", "INFO"), + ) wrapper.run() diff --git a/server/workers/metrics/src/__init__.py b/server/workers/metrics/src/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/server/workers/metrics/src/metrics.py b/server/workers/metrics/src/metrics.py new file mode 100644 index 000000000..67155157a --- /dev/null +++ b/server/workers/metrics/src/metrics.py @@ -0,0 +1,108 @@ +import time +import json +import subprocess +import logging +from common.r_wrapper import RWrapper +from common.decorators import error_logging_aspect +from common.rate_limiter import RateLimiter + +formatter = logging.Formatter( + fmt='%(asctime)s %(levelname)-8s %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' +) + + +class MetricsClient(RWrapper): + SEPARATION_TIME = 1.5 # Separation time for rate limiting in seconds + + def __init__(self, *args): + super().__init__(*args) + self.rate_key = 'metrics-ratelimit' + self.rate_limiter = RateLimiter( + self.redis_store, + self.rate_key, + self.SEPARATION_TIME + ) + + def next_item(self): + queue, msg = self.redis_store.blpop("metrics") + msg = json.loads(msg.decode('utf-8')) + item_id = msg.get('id') + params = self.add_default_params(msg.get('params')) + metadata = msg.get('metadata') + return item_id, params, metadata + + @error_logging_aspect(log_level=logging.ERROR) + def execute_search(self, params: dict, metadata: str) -> dict: + command = [ + self.command, + self.runner, + self.wd, + params.get('q'), + params.get('service') + ] + + self.logger.debug(f"Executing command: {command}") + + data = { + "params": params, + "metadata": metadata + } + + try: + proc = subprocess.Popen( + command, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + encoding='utf-8' + ) + stdout, stderr = proc.communicate(json.dumps(data)) + + output = [line for line in 
stdout.split('\n') if line] + errors = [line for line in stderr.split('\n') if line] + + if not output: + raise ValueError("No output received from the subprocess") + + raw_metadata = json.loads(output[-2])[0] + + if isinstance(raw_metadata, dict) and raw_metadata.get('status') == "error": + return raw_metadata + + return { + "input_data": raw_metadata, + "params": params + } + + except Exception as e: + self.logger.error(f"Error during command execution: {e}") + if errors: + self.logger.error(f"Stderr: {errors}") + raise + + @error_logging_aspect(log_level=logging.ERROR) + def run(self): + while True: + while self.rate_limiter.rate_limit_reached(): + self.logger.debug('🛑 Request is limited') + time.sleep(0.1) + continue + + item_id, params, metadata = self.next_item() + self.logger.debug(f"Processing item: {item_id}") + self.logger.debug(f"Params: {params}") + + try: + result = self.execute_search(params, metadata) + result["id"] = item_id + + output_key = f"{item_id}_output" + self.logger.debug(f"Storing result in key: {output_key}") + self.logger.debug(f"Result: {result}") + self.redis_store.set(output_key, json.dumps(result)) + + except Exception as e: + self.logger.exception("Exception during metrics enrichment.") + self.logger.error(f"Params: {params}") + self.logger.error(f"Error: {e}") \ No newline at end of file diff --git a/server/workers/openaire/.dockerignore b/server/workers/openaire/.dockerignore new file mode 100644 index 000000000..2a5c729b7 --- /dev/null +++ b/server/workers/openaire/.dockerignore @@ -0,0 +1,10 @@ +renv +__pycache__ +.cache +.pytest_cache +.Rproj.user +.RData +.Rhistory +*.Rproj +.pynb_checkpoints +*.ipynb \ No newline at end of file diff --git a/server/workers/openaire/Dockerfile b/server/workers/openaire/Dockerfile index 0e8372b95..918081c51 100644 --- a/server/workers/openaire/Dockerfile +++ b/server/workers/openaire/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:20.04 -MAINTAINER Chris Kittel "christopher.kittel@openknowledgemaps.org" +LABEL maintainer="Chris Kittel " ENV DEBIAN_FRONTEND=noninteractive @@ -127,7 +127,7 @@ RUN locale-gen en_US.UTF-8 && \ RUN apt update && apt full-upgrade -y && \ apt install -y links curl vim libcurl4-openssl-dev \ libxml2-dev libz-dev libpoppler-cpp-dev \ - libopenmpi-dev libzmq3-dev \ + libopenmpi-dev libzmq3-dev build-essential python3-dev \ libssl1.1 libssl-dev && \ apt clean && \ rm -f /etc/localtime && \ @@ -141,8 +141,12 @@ RUN R -e 'options(repos="https://cran.wu.ac.at")' && \ R -e 'install.packages("remotes")' && \ R -e 'install.packages("renv", version="0.14.0-5")' +ENV PYTHONPATH="/headstart/:/headstart/openaire/:/headstart/openaire/src/" + WORKDIR /headstart COPY workers/openaire/requirements.txt . +RUN pip3 install --no-cache-dir Cython +RUN pip3 install --upgrade pip RUN pip3 install --no-cache-dir -r requirements.txt COPY workers/openaire/renv.lock . @@ -152,11 +156,15 @@ RUN apt-get -y install automake RUN R -e 'renv::consent(provided = TRUE)' && \ R -e 'setwd("./"); renv::activate(); renv::restore(lockfile = "./renv.lock")' -COPY workers/common ./common +COPY workers/common ../common +COPY workers/openaire/requirements-e.txt . 
+RUN pip3 install --no-cache-dir -r requirements-e.txt + COPY workers/openaire ./openaire COPY preprocessing/resources ./resources COPY preprocessing/other-scripts ./other-scripts RUN mkdir -p /var/log/headstart && touch /var/log/headstart/headstart.log COPY workers/openaire/*.py ./ -ENTRYPOINT python3 run_openaire.py + +CMD ["python3", "run_openaire.py"] diff --git a/server/workers/openaire/requirements-e.txt b/server/workers/openaire/requirements-e.txt new file mode 100644 index 000000000..a35d9cfef --- /dev/null +++ b/server/workers/openaire/requirements-e.txt @@ -0,0 +1,2 @@ +# Include the common package from the local file system +-e ../common \ No newline at end of file diff --git a/server/workers/orcid/Dockerfile b/server/workers/orcid/Dockerfile index 99ee65ef0..227420bcd 100644 --- a/server/workers/orcid/Dockerfile +++ b/server/workers/orcid/Dockerfile @@ -1,20 +1,25 @@ FROM python:3.8 -MAINTAINER Chris Kittel "christopher.kittel@openknowledgemaps.org" + +LABEL maintainer="Chris Kittel " RUN apt-get update RUN apt-get install -y gcc git libpq-dev -ENV PYTHONPATH="${PYTHONPATH}:/headstart/:/headstart/orcid/:/headstart/orcid/src/" +ENV PYTHONPATH="/headstart/:/headstart/orcid/:/headstart/orcid/src/" WORKDIR /headstart + +# Install core dependencies COPY workers/orcid/requirements.txt . RUN pip install --upgrade pip RUN pip install --no-cache-dir -r requirements.txt +# Install shared dependencies +COPY workers/common ../common +COPY workers/orcid/requirements-e.txt . +RUN pip3 install --no-cache-dir -r requirements-e.txt -COPY workers/common ./common -COPY workers/orcid ./orcid +COPY workers/orcid ./ RUN mkdir -p /var/log/headstart && touch /var/log/headstart/headstart.log -COPY workers/orcid/*.py ./ -ENTRYPOINT python3 run_orcid.py +CMD ["python3", "src/main.py"] diff --git a/server/workers/orcid/pyproject.toml b/server/workers/orcid/pyproject.toml new file mode 100644 index 000000000..d21e13ce8 --- /dev/null +++ b/server/workers/orcid/pyproject.toml @@ -0,0 +1,8 @@ +[tool.pytest.ini_options] +pythonpath = [ + '.', + 'src' +] +testpaths = [ + 'tests' +] \ No newline at end of file diff --git a/server/workers/orcid/requirements-e.txt b/server/workers/orcid/requirements-e.txt new file mode 100644 index 000000000..a35d9cfef --- /dev/null +++ b/server/workers/orcid/requirements-e.txt @@ -0,0 +1,2 @@ +# Include the common package from the local file system +-e ../common \ No newline at end of file diff --git a/server/workers/orcid/requirements.txt b/server/workers/orcid/requirements.txt index 12071581b..dd278cc32 100644 --- a/server/workers/orcid/requirements.txt +++ b/server/workers/orcid/requirements.txt @@ -12,17 +12,16 @@ jsonschema==3.2.0 MarkupSafe==2.1.3 Levenshtein==0.21.1 mistune==2.0.5 -numpy==1.19.5 +numpy==1.24.4 packaging==21.3 -pandas==1.3.0 +pandas==1.3.1 pyparsing==3.1.1 pyrsistent==0.18.0 python-dateutil==2.8.2 pytz==2023.3.post1 PyYAML==6.0.1 redis==4.3.6 -scikit-learn==0.24.2 six==1.16.0 -typing-extensions==4.1.1 +typing-extensions==4.2.0 zipp==3.6.0 pyorcid @ git+https://github.com/OpenKnowledgeMaps/PyOrcid.git@main \ No newline at end of file diff --git a/server/workers/orcid/src/config.py b/server/workers/orcid/src/config.py new file mode 100644 index 000000000..92eb37737 --- /dev/null +++ b/server/workers/orcid/src/config.py @@ -0,0 +1,43 @@ +import os +from typing import TypedDict, Optional + +# Define types for the configurations +class LoggingConfig(TypedDict): + level: str + format: str + datefmt: str + +class RedisConfig(TypedDict): + host: str + port: int + 
db: int + password: Optional[str] + client_name: str + +class OrcidConfig(TypedDict): + client_id: Optional[str] + client_secret: Optional[str] + sandbox: bool + +# Logging configuration +LOGGING_CONFIG: LoggingConfig = { + "level": os.getenv("LOG_LEVEL", "INFO"), + "format": "%(asctime)s %(levelname)-8s %(message)s", + "datefmt": "%Y-%m-%d %H:%M:%S" +} + +# Redis configuration +REDIS_CONFIG: RedisConfig = { + "host": os.getenv("REDIS_HOST", "localhost"), + "port": int(os.getenv("REDIS_PORT", 6379)), + "db": int(os.getenv("REDIS_DB", 0)), + "password": os.getenv("REDIS_PASSWORD"), + "client_name": "orcid_retrieval", +} + +# ORCID API configuration +ORCID_CONFIG: OrcidConfig = { + "client_id": os.getenv("ORCID_CLIENT_ID"), + "client_secret": os.getenv("ORCID_CLIENT_SECRET"), + "sandbox": os.getenv("ORCID_SANDBOX", "False").lower() == "true", +} \ No newline at end of file diff --git a/server/workers/orcid/src/main.py b/server/workers/orcid/src/main.py new file mode 100644 index 000000000..ee10c79a4 --- /dev/null +++ b/server/workers/orcid/src/main.py @@ -0,0 +1,52 @@ +import logging + +import redis + +from config import LOGGING_CONFIG, REDIS_CONFIG, ORCID_CONFIG +from common.rate_limiter import RateLimiter +from orcid_service import OrcidService +from worker import OrcidWorker + +def setup_logging(): + logging.basicConfig( + level=LOGGING_CONFIG["level"], + format=LOGGING_CONFIG["format"], + datefmt=LOGGING_CONFIG["datefmt"] + ) + logger = logging.getLogger(__name__) + logger.info("Logging is configured.") + return logger + +def create_redis_store(): + redis_store = redis.StrictRedis(**REDIS_CONFIG) + return redis_store + +def create_data_retriever(redis_store: redis.StrictRedis): + data_retriever = OrcidService.create( + orcid_client_id=ORCID_CONFIG['client_id'], + orcid_client_secret=ORCID_CONFIG['client_secret'], + sandbox=ORCID_CONFIG['sandbox'], + redis_store=redis_store + ) + return data_retriever + +def main(): + # Set up logging + logger = setup_logging() + + # Set up Redis + redis_store = create_redis_store() + + # Initialize ORCID data retriever + data_retriever = create_data_retriever(redis_store) + + rate_limiter = RateLimiter(redis_store, "orcid-ratelimit") + + # Initialize the worker + worker = OrcidWorker(redis_store, data_retriever, rate_limiter) + + logger.info("Starting ORCID worker") + worker.run() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/server/workers/orcid/src/model.py b/server/workers/orcid/src/model.py new file mode 100644 index 000000000..9741084b8 --- /dev/null +++ b/server/workers/orcid/src/model.py @@ -0,0 +1,69 @@ + +from dataclasses import dataclass, field +from typing import TypedDict, List, Optional + + +@dataclass +class Website(TypedDict): + url_name: str + url: str + + +@dataclass +class ExternalIdentifier(TypedDict): + type: str + url: str + value: str + relationship: str + + +@dataclass +class AuthorInfo: + # TODO: consider renaming it to something more generic, + # TODO: so that it will be possible to reuse between different client integrations + orcid_id: str + # TODO: consider storing First Name and Last Name separately, usually it's better + # TODO: for data computation. 
The only issue we may face with this approach is that some + # TODO: data integrations may return author name as a single field + author_name: Optional[str] = None + biography: Optional[str] = None + author_keywords: Optional[str] = None + academic_age: Optional[str] = None + websites: List['Website'] = field(default_factory=list) + external_identifiers: List['ExternalIdentifier'] = field(default_factory=list) + countries: List[str] = field(default_factory=list) + total_citations: Optional[int] = None + total_unique_social_media_mentions: Optional[int] = None + total_neppr: Optional[int] = None + h_index: Optional[int] = None + normalized_h_index: Optional[int] = None + + +@dataclass +class Work: + id: str + identifier: str + authors: str + title: str + subtitle: str + paper_abstract: str + year: int + oa_state: str + subject_orig: str + subject_cleaned: str + relevance: str + link: str + published_in: str + fulltext: str + language: str + subject: str + url: str + relation: str + resulttype: str + doi: str + citation_count: int + cited_by_accounts_count: int + cited_by_wikipedia_count: int + cited_by_msm_count: int + cited_by_policies_count: int + cited_by_patents_count: int diff --git a/server/workers/orcid/src/orcid.py b/server/workers/orcid/src/orcid.py deleted file mode 100644 index 142441f6f..000000000 --- a/server/workers/orcid/src/orcid.py +++ /dev/null @@ -1,618 +0,0 @@ -import os -import sys -import json -import pandas as pd -import logging -from dateutil.parser import parse -from common.decorators import error_logging_aspect -from common.deduplication import find_duplicate_indexes,\ - prioritize_OA_and_latest, mark_latest_doi -from redis.exceptions import LockError -import time -import numpy as np -from pyorcid import OrcidAuthentication, Orcid, errors as pyorcid_errors -from typing import Tuple -import requests - - - -formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s', - datefmt='%Y-%m-%d %H:%M:%S') - - - -class OrcidClient(): - - def __init__(self, redis_store=None, language=None, loglevel="INFO") -> None: - self.redis_store = redis_store - self.default_params = {} - self.default_params["language"] = language - self.logger = logging.getLogger(__name__) - self.logger.setLevel(loglevel) - handler = logging.StreamHandler(sys.stdout) - handler.setFormatter(formatter) - handler.setLevel(loglevel) - self.logger.addHandler(handler) - - self.separation = 0.1 - self.rate_key = 'orcid-ratelimit' - self.ORCID_CLIENT_ID = os.getenv("ORCID_CLIENT_ID") - self.ORCID_CLIENT_SECRET = os.getenv("ORCID_CLIENT_SECRET") - self.access_token = self.authenticate() - if os.getenv("FLASK_ENV") == "dev": - self.sandbox = True - else: - self.sandbox = False - - @error_logging_aspect(log_level=logging.ERROR) - def authenticate(self) -> str: - try: - orcid_auth = OrcidAuthentication(client_id=self.ORCID_CLIENT_ID, - client_secret=self.ORCID_CLIENT_SECRET) - access_token = orcid_auth.get_public_access_token() - return access_token - except Exception as e: - raise e - - def next_item(self) -> Tuple[str, dict, str]: - queue, msg = self.redis_store.blpop("orcid") - msg = json.loads(msg.decode('utf-8')) - k = msg.get('id') - params = msg.get('params') - params["service"] = "orcid" - endpoint = msg.get('endpoint') - return k, params, endpoint - - @error_logging_aspect(log_level=logging.INFO) - def orcid_rate_limit_reached(self) -> bool: - """ - This implementation is inspired by an implementation of - Generic Cell Rate Algorithm based rate limiting, - seen on 
https://dev.to/astagi/rate-limiting-using-python-and-redis-58gk. - It has been simplified and adjusted to our use case. - - ORCID allows 24 requests per second, with a burst limit of 40 requests. See also: - https://info.orcid.org/ufaqs/what-are-the-api-limits/ - """ - - t = self.redis_store.time()[0] - self.redis_store.setnx(self.rate_key, 0) - try: - with self.redis_store.lock('lock:' + self.rate_key, blocking_timeout=5) as lock: - theoretical_arrival_time = max(float(self.redis_store.get(self.rate_key)), t) - if theoretical_arrival_time - t <= 0: - new_theoretical_arrival_time = max(theoretical_arrival_time, t) + self.separation - self.redis_store.set(self.rate_key, new_theoretical_arrival_time) - return False - return True - # the locking mechanism is needed if a key is requested multiple times at the same time - except LockError: - return True - - @error_logging_aspect(log_level=logging.ERROR) - def run(self) -> None: - """ - This function is the main loop of the OrcidClient. It will continuously - check for new items in the Redis queue, process them, and store the results - back in Redis. - - The function will also check if the rate limit for ORCID requests is reached. - - return: None - """ - # TODO: add retry mechanism - while True: - while self.orcid_rate_limit_reached(): - self.logger.debug('🛑 Request is limited') - time.sleep(0.1) - k, params, endpoint = self.next_item() - self.logger.debug(k) - self.logger.debug(params) - if endpoint == "search": - try: - res = self.execute_search(params) - self.logger.debug(res) - res["id"] = k - - if res.get("status") == "error" or params.get('raw') is True: - self.redis_store.set(k+"_output", json.dumps(res)) - else: - self.redis_store.rpush("input_data", json.dumps(res).encode('utf8')) - q_len = self.redis_store.llen("input_data") - self.logger.debug("Queue length: %s %d %s" %("input_data", q_len, k)) - except Exception as e: - self.logger.exception("Exception during data retrieval.") - self.logger.error(params) - self.logger.error(e) - - @error_logging_aspect(log_level=logging.ERROR) - def execute_search(self, params: dict) -> dict: - """ - This function is the main function for the search endpoint. It will - retrieve the ORCID data for the given ORCID ID, extract the author - information and the works metadata, and return the results. - In case of errors, it will return an error reason. Following errors - are possible: - - invalid orcid id - - not enough results for orcid - - unexpected data processing error - - Parameters: - - params (dict): The parameters for the search endpoint. The parameters - should contain the ORCID ID of the author. - - Returns: - - dict: The results of the search endpoint. - """ - q = params.get('q') - service = params.get('service') - data = {} - data["params"] = params - orcid_id = params.get("orcid") - orcid = Orcid(orcid_id=orcid_id, orcid_access_token=self.access_token, state = "public", sandbox=self.sandbox) - - try: - author_info = extract_author_info(orcid) - metadata = retrieve_full_works_metadata(orcid) - self.logger.debug(f"metadata retrieved and length is: {len(metadata)}") - self.logger.debug(metadata) - - if len(metadata) == 0: - res = {} - res["params"] = params - res["status"] = "error" - res["reason"] = ["not enough results for orcid"] - self.logger.debug( - f"ORCID {orcid_id} has no works metadata." 
- ) - return res - metadata["authors"] = metadata["authors"].map(lambda x: author_info["author_name"] if x=="" else x) - #metadata = mark_duplicates(metadata) - #metadata = filter_duplicates(metadata) - metadata = sanitize_metadata(metadata) - metadata = metadata.head(int(params.get("limit"))) - # in BASE it is ["title", "paper_abstract", "subject_orig", "published_in", "sanitized_authors"] - text = pd.concat([metadata.id, metadata[["title", "paper_abstract", "subtitle", "published_in", "authors"]] - .apply(lambda x: " ".join(x), axis=1)], axis=1) - text.columns = ["id", "content"] - input_data = {} - input_data["metadata"] = metadata.to_json(orient='records') - input_data["text"] = text.to_json(orient='records') - res = {} - res["input_data"] = input_data - # merge author info into params - params.update(author_info) - res["params"] = params - return res - except ( - pyorcid_errors.Forbidden, - pyorcid_errors.NotFound, - pyorcid_errors.BadRequest, - ) as e: - self.logger.error(e) - self.logger.error(params) - res = {} - res["params"] = params - res["status"] = "error" - res["reason"] = ["invalid orcid id"] - return res - # Unauthorized also should be internal server error, because we do not use client's credentials - except (pyorcid_errors.Unauthorized, Exception) as e: - self.logger.error(e) - self.logger.error(params) - res = {} - res["params"] = params - res["status"] = "error" - res["reason"] = ["unexpected data processing error"] - return res - - -# TODO: the following functions should be moved to a separate module -def get_nested_value(data, keys, default=None): - """ - Recursively retrieves a nested value from a dictionary. - - :param data: Dictionary to retrieve the value from - :param keys: List of keys to follow in the dictionary - :param default: Default value to return if any key is not found - :return: The retrieved value or the default value - """ - for key in keys: - try: - data = data.get(key) - if data is None: - return default - except AttributeError: - return default - return data - - -@error_logging_aspect(log_level=logging.ERROR) -def extract_author_info(orcid: Orcid) -> dict: - """ - This function extracts the author information from the ORCID data. - - Parameters: - - orcid (Orcid): The Orcid object containing the ORCID data. - - Returns: - - dict: The author information extracted from the ORCID data. - """ - personal_details = orcid.personal_details() - orcid_id = orcid._orcid_id - author_name = " ".join( - [personal_details.get("name", {}).get("given-names", {}).get("value", ""), - personal_details.get("name", {}).get("family-name", {}).get("value", "")] - ) - author_keywords = ", ".join(orcid.keywords()[0]) - biography = personal_details.get("biography", {}).get("content", "") \ - if (personal_details.get("biography") and personal_details.get("biography", {}).get("visibility") == "public" )\ - else "" - external_identifiers = extract_external_identifiers(orcid) - countries = extract_countries(orcid) - websites = extract_websites(orcid) - author_info = { - "orcid_id": orcid_id, - "author_name": author_name, - "author_keywords": author_keywords, - "biography": biography, - "websites": websites, - "external_identifiers": external_identifiers, - "country": countries - } - return author_info - -@error_logging_aspect(log_level=logging.WARNING) -def sanitize_metadata(metadata: pd.DataFrame) -> pd.DataFrame: - """ - This function sanitizes the metadata DataFrame by converting all columns - to string type and filling missing values with an empty string. 
- - Parameters: - - metadata (pd.DataFrame): The metadata DataFrame to sanitize. - - Returns: - - pd.DataFrame: The sanitized metadata DataFrame. - """ - metadata["title"] = metadata["title"].fillna("").astype(str) - metadata["subtitle"] = metadata["subtitle"].fillna("").astype(str) - metadata["paper_abstract"] = metadata["paper_abstract"].fillna("").astype(str) - metadata["published_in"] = metadata["published_in"].fillna("").astype(str) - return metadata - -@error_logging_aspect(log_level=logging.WARNING) -def extract_countries(orcid: Orcid) -> list: - countries = pd.DataFrame(orcid.address()["address"]) - if not countries.empty: - countries = countries[countries["visibility"] == "public"] - countries["country"] = countries["country"].apply(lambda x: x.get("value")) - countries = countries["country"] - return countries.tolist() - else: - return [] - -@error_logging_aspect(log_level=logging.WARNING) -def extract_external_identifiers(orcid: Orcid) -> list: - external_identifiers = pd.DataFrame(orcid.external_identifiers()["external-identifier"]) - if not external_identifiers.empty: - external_identifiers = external_identifiers[external_identifiers["visibility"] == "public"] - external_identifiers["external-id-url"] = external_identifiers["external-id-url"].apply(lambda x: x.get("value")) - external_identifiers = external_identifiers[[ "external-id-type", "external-id-url", "external-id-value", "external-id-relationship"]] - return external_identifiers.to_dict(orient='records') - else: - return [] - -@error_logging_aspect(log_level=logging.WARNING) -def extract_websites(orcid: Orcid) -> list: - urls = pd.DataFrame(orcid.researcher_urls()["researcher-url"]) - if not urls.empty: - urls = urls[urls["visibility"] == "public"] - urls["url"] = urls["url"].apply(lambda x: x.get("value")) - urls = urls[[ "url-name", "url"]] - return urls.to_dict(orient='records') - else: - return [] - -def get_short_description(work) -> str: - return get_nested_value(work, ["short-description"], "") - - -def get_authors(work) -> str: - try: - contributors = get_nested_value(work, ["contributors", "contributor"], []) - - authors = [] - - for contributor in contributors: - author = get_nested_value(contributor, ["credit-name", "value"], None) - - if author: - authors.append(author) - - authors_str = "; ".join(authors) - except KeyError: - authors_str = "" - - return authors_str - - -def get_subjects(work) -> str: - return get_nested_value(work, ["subject"], "") - - -def get_title(work) -> str: - return get_nested_value(work, ["title", "title", "value"], "") - - -def get_subtitle(work) -> str: - return get_nested_value(work, ["title", "subtitle", "value"], "") - - -def get_paper_abstract(work) -> str: - return get_nested_value(work, ["short-description"], "") - - -def get_resulttype(work) -> str: - return get_nested_value(work, ["type"], "") - - -def published_in(work) -> str: - return get_nested_value(work, ["journal-title", "value"], "") - -def get_put_code(work) -> str: - return get_nested_value(work, ["put-code"], "") - -def get_url(work) -> str: - url = get_nested_value(work, ["url", "value"], "") - if url == "": - ids = get_nested_value(work, ["external-ids", "external-id"], "") - if ids: - for id in ids: - if id["external-id-value"].startswith("http"): - url = id["external-id-value"] - break - return url - -def get_link(work) -> str: - url = get_nested_value(work, ["url", "value"], "") - if url.lower().endswith(".pdf"): - link = url - else: - link = "" - return link - 
-@error_logging_aspect(log_level=logging.ERROR) -def retrieve_full_works_metadata(orcid: Orcid) -> pd.DataFrame: - """ - This function retrieves the full works metadata from the ORCID data. - - Parameters: - - orcid (Orcid): The Orcid object containing the ORCID data. - - Returns: - - pd.DataFrame: The full works metadata retrieved from the ORCID data. - """ - works_data = pd.DataFrame(orcid.works_full_metadata(limit=1000)) - # works["publication-date"] = works.apply(get_publication_date, axis=1) - # works["doi"] = works.apply(extract_dois, axis=1) - - new_works_data = pd.DataFrame() - - if works_data.empty: - return new_works_data - - # Perform transformations and store in new DataFrame - new_works_data["id"] = works_data.apply(get_put_code, axis=1).astype(str) - new_works_data["title"] = works_data.apply(get_title, axis=1) - new_works_data["subtitle"] = works_data.apply(get_subtitle, axis=1) - new_works_data["authors"] = works_data.apply(get_authors, axis=1) - new_works_data["paper_abstract"] = works_data.apply(get_paper_abstract, axis=1).fillna("") - new_works_data["year"] = works_data.apply(get_publication_date, axis=1) - new_works_data["published_in"] = works_data.apply(published_in, axis=1) - new_works_data["resulttype"] = works_data.apply(get_resulttype, axis=1).map(lambda x: doc_type_mapping.get(x, "")) - new_works_data["doi"] = works_data.apply(extract_dois, axis=1) - new_works_data["oa_state"] = 2 - new_works_data["subject"] = "" # this needs to come from BASE enrichment - new_works_data["url"] = works_data.apply(get_url, axis=1) - new_works_data["link"] = works_data.apply(get_link, axis=1) - - return new_works_data - - -@error_logging_aspect(log_level=logging.ERROR) -def apply_metadata_schema(works: pd.DataFrame) -> pd.DataFrame: - works.rename(columns=works_mapping, inplace=True) - metadata = works - return metadata - -def filter_dicts_by_value(dicts, key, value) -> list: - return [d for d in dicts if d.get(key) == value] - -@error_logging_aspect(log_level=logging.WARNING) -def extract_dois(work: pd.DataFrame) -> str: - external_ids = work["external-ids"] - external_ids = external_ids["external-id"] if external_ids else [] - external_ids = external_ids if isinstance(external_ids, list) else [] - external_ids = (filter_dicts_by_value( - external_ids, - key="external-id-type", - value="doi") if len(external_ids)>0 else "") - doi = external_ids[0].get("external-id-value", "") if len(external_ids)>0 else "" - return doi - -@error_logging_aspect(log_level=logging.WARNING) -def get_publication_date(work) -> str: - try: - year = get_nested_value(work, ["publication-date", "year", "value"], np.nan) - except KeyError: - year = np.nan - try: - month = get_nested_value(work, ["publication-date", "month", "value"], np.nan) - except KeyError: - month = np.nan - try: - day = get_nested_value(work, ["publication-date", "day", "value"], np.nan) - except KeyError: - day = np.nan - publication_date = "" - parsed_publication_date = publication_date - if year is not np.nan: - publication_date += str(int(year)) - parsed_publication_date = publication_date - if month is not np.nan and month != "00": - publication_date += "-" + str(int(month)) - date_obj = parse(publication_date) - parsed_publication_date = date_obj.strftime("%Y-%m") - if day is not np.nan: - publication_date += "-" + str(int(day)) - date_obj = parse(publication_date) - parsed_publication_date = date_obj.strftime("%Y-%m-%d") - return parsed_publication_date - -@error_logging_aspect(log_level=logging.ERROR) -def filter_duplicates(df: 
pd.DataFrame) -> pd.DataFrame: - df.drop_duplicates("id", inplace=True, keep="first") - df["is_latest"] = True - df["doi_duplicate"] = False - df["has_relations"] = False - df["link_duplicate"] = False - df["keep"] = False - dupind = find_duplicate_indexes(df) - pure_datasets = df[df.type == "data-set"] - non_datasets = df.loc[df.index.difference(pure_datasets.index)] - non_datasets = prioritize_OA_and_latest(non_datasets, dupind) - pure_datasets = mark_latest_doi(pure_datasets, dupind) - filtered_non_datasets = non_datasets[non_datasets.is_latest==True] - filtered_datasets = pure_datasets[(pure_datasets.keep==True) | (pure_datasets.is_duplicate==False)] - filtered = pd.concat([filtered_non_datasets, filtered_datasets]) - filtered.sort_index(inplace=True) - return filtered - -@error_logging_aspect(log_level=logging.ERROR) -def enrich_from_BASE(metadata: pd.DataFrame) -> pd.DataFrame: - """ - This function enriches the metadata DataFrame with additional information - from the BASE database. - - Parameters: - - metadata (pd.DataFrame): The metadata DataFrame to enrich. - - Returns: - - pd.DataFrame: The enriched metadata DataFrame. - """ - dois = metadata[metadata.doi.map(lambda x: len(x)>0)].doi.to_list() - doi_batches = batch_dois(dois) - url_BASE = "http://proxy-proxy-1/"+os.getenv("COMPOSE_PROJECT_NAME")+"/base/search" - params_BASE = { - "q": "", - "sorting": "most-relevant", - "document_types": ["4", "11", "111", "13", "16", "7", "5", - "12", "121", "122", "17", "19", "3", "52", - "2", "F", "1A", "14", "15", "6", "51", - "1", "18", "181", "183", "182"], - "from": "1665-01-01", - "to": pd.Timestamp.now().strftime("%Y-%m-%d"), - "vis_type": "overview", - "raw": True, - "list_size": 120, - "min_descsize": 0 - } - - tmp = [] - for batch in doi_batches: - try: - params_BASE["q_advanced"] = batch - response = requests.post(url_BASE, json=params_BASE) - data = response.json() - if "input_data" in data: - tmp.append(pd.DataFrame(json.loads(data['input_data']["metadata"]))) - except Exception as e: - logging.error(e) - enrichment_data = pd.concat(tmp) - enrichment_data = enrichment_data[enrichment_data.doi.str.contains("|".join(dois))] - return metadata - -def batch_dois(strings, limit=400): - """ - This function batches a list of strings into groups of strings that - together are less than a specified limit. - It is used to batch DOIs for BASE enrichment. - - Parameters: - - strings (list): The list of strings to batch. - - Returns: - - list: The list of batches of strings. 
- """ - batches = [] - current_batch = "" - - for string in strings: - substring = 'OR dcdoi:"'+string+'"' - if len(current_batch) + len(substring) + 1 > limit: # +1 for space or no space if first - batches.append("("+current_batch.strip()+")") # Add current batch to batches - current_batch = 'dcdoi:"'+string+'"' # Start a new batch with the current string - else: - current_batch += " " + substring if current_batch else substring # Add string to current batch - - if current_batch: # Add the last batch if it's not empty - batches.append("("+current_batch.strip()+")") - - return batches - -works_mapping = { - "put-code": "id", - "title.title.value": "title", - "short-description": "paper_abstract", - "publication-date": "year", - "work-contributors": "authors", - "journal-title.value": "published_in" -} - -doc_type_mapping = { - "book": "Book", - "book-chapter": "Book chapter", - "book-review": "Book review", - "dictionary-entry": "Dictionary entry", - "dissertation": "Dissertation", - "dissertation-thesis": "Dissertation thesis", - "enyclopaedia-entry": "Encyclopedia entry", - "edited-book": "Edited book", - "journal-article": "Journal article", - "journal-issue": "Journal issue", - "magazine-article": "Magazine article", - "manual": "Manual", - "online-resource": "Online resource", - "newsletter-article": "Newsletter article", - "newspaper-article": "Newspaper article", - "preprint": "Preprint", - "report": "Report", - "review": "Review", - "research-tool": "Research tool", - "supervised-student-publication": "Supervised student publication", - "test": "Test", - "translation": "Translation", - "website": "Website", - "working-paper": "Working paper", - "conference-abstract": "Conference abstract", - "conference-paper": "Conference paper", - "conference-poster": "Conference poster", - "disclosure": "Disclosure", - "license": "License", - "patent": "Patent", - "registered-copyright": "Registered copyright", - "trademark": "Trademark", - "annotation": "Annotation", - "artistic-performance": "Artistic performance", - "data-management-plan": "Data management plan", - "data-set": "Dataset", - "invention": "Invention", - "lecture-speech": "Lecture speech", - "physical-object": "Physical object", - "research-technique": "Research technique", - "software": "Software", - "spin-off-company": "Spin-off company", - "standards-and-policy": "Standards and policy", - "technical-standard": "Technical standard", - "other": "Other" -} \ No newline at end of file diff --git a/server/workers/orcid/src/orcid_service.py b/server/workers/orcid/src/orcid_service.py new file mode 100644 index 000000000..f49d4bec1 --- /dev/null +++ b/server/workers/orcid/src/orcid_service.py @@ -0,0 +1,225 @@ +import logging +import json +import pandas as pd +import uuid +from common.decorators import error_logging_aspect +import numpy as np +from pyorcid import Orcid, errors as pyorcid_errors +from pyorcid.orcid_authentication import OrcidAuthentication +from typing import Tuple +from common.utils import get_key +from repositories.author_info import AuthorInfoRepository +from repositories.works import WorksRepository +from redis import StrictRedis +from typing import Dict +from model import AuthorInfo + +class OrcidService: + logger = logging.getLogger(__name__) + + def __init__( + self, + access_token: str, + sandbox: bool, + redis_store: StrictRedis, + ) -> None: + self.access_token = access_token + self.sandbox = sandbox + self.redis_store = redis_store + + @staticmethod + def create( + orcid_client_id: str, + 
orcid_client_secret: str, + sandbox: bool = False, + redis_store: StrictRedis = None, + ): + orcid_auth = OrcidAuthentication( + client_id=orcid_client_id, client_secret=orcid_client_secret + ) + access_token = orcid_auth.get_public_access_token() + + return OrcidService( + access_token=access_token, + sandbox=sandbox, + redis_store=redis_store, + ) + + @error_logging_aspect(log_level=logging.ERROR) + def execute_search(self, params: Dict[str, str]) -> Dict[str, str]: + try: + orcid_id = params.get("orcid") + # limit = params.get("limit") + orcid = self._initialize_orcid(orcid_id) + author_info, metadata = self._retrieve_author_info_and_metadata(orcid) + + if metadata.empty: + return self._handle_insufficient_results(params, orcid_id) + + + metadata = self._process_metadata(metadata, author_info, params) + + return self._format_response(data=metadata, author_info=author_info, params=params) + except ( + pyorcid_errors.Forbidden, + pyorcid_errors.NotFound, + pyorcid_errors.BadRequest, + ) as e: + return self._handle_error(params, "invalid orcid id", e) + except (pyorcid_errors.Unauthorized, Exception) as e: + return self._handle_error(params, "unexpected data processing error", e) + + def enrich_metadata(self, params: Dict[str, str], metadata: pd.DataFrame) -> pd.DataFrame: + """ + This function enriches the metadata DataFrame with additional information + from external sources, in this case crossref and altmetric. + The function will store the enriched metadata in the Redis queue for further + processing, from where it will be picked up by the metrics worker. + Returned data will be the original metadata enriched with additional + metadata columns from the external sources. + + Parameters: + - params (dict): The parameters for the search endpoint. + - metadata (pd.DataFrame): The metadata DataFrame to enrich. + + Returns: + - pd.DataFrame: The enriched metadata DataFrame. + """ + request_id = str(uuid.uuid4()) + task_data = { + "id": request_id, + "params": params, + "metadata": metadata.to_json(orient="records"), + } + self.redis_store.rpush("metrics", json.dumps(task_data)) + result = get_key(self.redis_store, request_id, 300) + metadata = pd.DataFrame(json.loads(result["input_data"])) + for c in [ + "citation_count", + "cited_by_wikipedia_count", + "cited_by_msm_count", + "cited_by_policies_count", + "cited_by_patents_count", + "cited_by_accounts_count", + ]: + if c not in metadata.columns: + metadata[c] = np.NaN + return metadata + + def enrich_author_info(self, author_info: AuthorInfo, metadata: pd.DataFrame) -> Dict[str, str]: + """ + This function enriches the author information with additional information. + Specifically, we extract and aggregate metrics data from the author's works, + such as citation counts and altmetric counts. + + Parameters: + - author_info (dict): The author information dictionary. + - metadata (pd.DataFrame): The metadata DataFrame containing the author's works. + + Returns: + - dict: The enriched author information dictionary. 
+ """ + + # Total citations + author_info.total_citations = int( + metadata["citation_count"].astype(float).sum() + ) + + # Total unique social media mentions + author_info.total_unique_social_media_mentions = int( + metadata["cited_by_accounts_count"].astype(float).sum() + ) + + # Total NEPPR (non-academic references) + author_info.total_neppr = int( + metadata[ + [ + "cited_by_wikipedia_count", + "cited_by_msm_count", + "cited_by_policies_count", + "cited_by_patents_count", + ] + ] + .astype(float) + .sum() + .sum() + ) + + # Calculate h-index + citation_counts = ( + metadata["citation_count"].astype(float).sort_values(ascending=False).values + ) + h_index = np.sum(citation_counts >= np.arange(1, len(citation_counts) + 1)) + author_info.h_index = int(h_index) + + def extract_year(value): + try: + # Attempt to extract the year assuming various formats + year_str = str(value) + if len(year_str) >= 4: + return int(year_str[:4]) + return None + except (ValueError, TypeError): + return None + + # Apply the function to extract the year + metadata["publication_year"] = metadata["year"].apply(extract_year) + + academic_age = author_info.academic_age + + # Calculate normalized h-index + author_info.normalized_h_index = ( + h_index / academic_age if academic_age & academic_age > 0 else 0 + ) + + return author_info + + def _initialize_orcid(self, orcid_id: str) -> Orcid: + self.logger.debug(f"Initializing ORCID {orcid_id} with access token {self.access_token}") + return Orcid( + orcid_id=orcid_id, + orcid_access_token=self.access_token, + state="public", + sandbox=self.sandbox, + ) + + def _retrieve_author_info_and_metadata(self, orcid: Orcid) -> Tuple[Dict[str, str], pd.DataFrame]: + author_info = AuthorInfoRepository(orcid).extract_author_info() + metadata = WorksRepository(orcid).get_full_works_metadata() + + return author_info, metadata + + def _process_metadata(self, metadata: pd.DataFrame, author_info: AuthorInfo, params: Dict[str, str]) -> pd.DataFrame: + metadata["authors"] = metadata["authors"].replace("", author_info.author_name) + metadata = self.enrich_metadata(params, metadata) + author_info = self.enrich_author_info(author_info, metadata) + metadata = metadata.head(int(params.get("limit"))) + return metadata + + def _format_response(self, data: pd.DataFrame, author_info: AuthorInfo, params: Dict[str, str]) -> Dict[str, str]: + text = pd.concat([data.id, data[["title", "paper_abstract", "subtitle", "published_in", "authors"]] + .apply(lambda x: " ".join(x), axis=1)], axis=1) + text.columns = ["id", "content"] + + response = { + "input_data": { + "metadata": data.to_json(orient='records'), + "text": text.to_json(orient='records') + }, + # TODO: consider to return model? 
+ "author": author_info.__dict__, + "params": params + } + return response + + def _handle_insufficient_results(self, params: Dict[str, str], orcid_id: str) -> Dict[str, str]: + self.logger.debug(f"ORCID {orcid_id} has no works metadata.") + return { + "params": params, + "status": "error", + "reason": ["not enough results for orcid"], + } + + def _handle_error(self, params: Dict[str, str], reason: str, exception: Exception) -> Dict[str, str]: + self.logger.error(exception) + return {"params": params, "status": "error", "reason": [reason]} \ No newline at end of file diff --git a/server/workers/orcid/src/repositories/__init__.py b/server/workers/orcid/src/repositories/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/server/workers/orcid/src/repositories/author_info.py b/server/workers/orcid/src/repositories/author_info.py new file mode 100644 index 000000000..a7dbc2d67 --- /dev/null +++ b/server/workers/orcid/src/repositories/author_info.py @@ -0,0 +1,130 @@ +from datetime import datetime +from pyorcid import Orcid +import pandas as pd +from common.utils import get_nested_value +from typing import List, Dict +from model import AuthorInfo, ExternalIdentifier, Website + + +class AuthorInfoRepository: + def __init__(self, orcid: Orcid) -> None: + self.orcid = orcid + + def extract_author_info(self) -> AuthorInfo: + author_info = AuthorInfo( + orcid_id=self.orcid._orcid_id + ) + + personal_details = self.orcid.personal_details() + author_info.author_name = self.extract_author_name(personal_details) + author_info.biography = self.extract_biography(personal_details) + + keywords, _ = self.orcid.keywords() + author_info.author_keywords = ", ".join(keywords) + + education, _ = self.orcid.educations() + author_info.academic_age = self.calculate_academic_age(education) + + external_identifiers = self.orcid.external_identifiers()["external-identifier"] + author_info.external_identifiers = self.extract_external_identifiers(external_identifiers) + + addresses = self.orcid.address()["address"] + author_info.countries = self.extract_countries(addresses) + + researcher_urls = self.orcid.researcher_urls()["researcher-url"] + author_info.websites = self.extract_websites(researcher_urls) + + return author_info + + def extract_author_name(self, personal_details: Dict[str, str]) -> str: + return " ".join( + [ + get_nested_value(personal_details, ["name", "given-names", "value"], ""), + get_nested_value(personal_details, ["name", "family-name", "value"], ""), + ] + ) + + def extract_biography(self, personal_details: Dict[str, str]) -> str: + return ( + get_nested_value(personal_details, ["biography", "content"], "") + if ( + get_nested_value(personal_details, ["biography", "visibility"], "") == "public" + ) + else "" + ) + + def extract_countries(self, addresses: List[Dict[str, str]]) -> List[str]: + countries = pd.DataFrame(addresses) + if countries.empty: + return [] + countries = countries[countries["visibility"] == "public"] + countries["country"] = countries["country"].apply(lambda x: x.get("value") if isinstance(x, dict) else "") + countries = countries["country"] + return countries.tolist() + + def calculate_academic_age(self, data: List[Dict[str, str]]) -> int: + # Possible terms for a PhD-equivalent role + doctoral_terms = [ + "phd", "dphil", "doctorate", "doctoral", + "edd", "dsc", "md-phd", "jd-phd", "dr.phil.", "dr.rer.nat.", + "doctor of science", "doctor of education", "doctor's degree" + ] + + # Find the PhD-equivalent end date + phd_end_date = None + for entry in data: + 
# Check if the Role matches any PhD-equivalent term + if any(term in entry["Role"].lower() for term in doctoral_terms): + phd_end_date = entry["end-date"] + break + + # If no PhD end date is found, return None + if not phd_end_date: + return None + + # Convert PhD end date to a datetime object + phd_end_date = datetime.strptime(phd_end_date, "%m/%Y") + + # Calculate the number of years since the PhD + current_date = datetime.now() + academic_age = current_date.year - phd_end_date.year - ((current_date.month, current_date.day) < (phd_end_date.month, 1)) + + return academic_age + + def extract_external_identifiers( + self, + data: List[Dict[str, str]] + ) -> List[ExternalIdentifier]: + external_identifiers = pd.DataFrame( + data + ) + + if external_identifiers.empty: + return [] + + external_identifiers = external_identifiers[ + external_identifiers["visibility"] == "public" + ] + external_identifiers["external-id-url"] = external_identifiers[ + "external-id-url" + ].apply(lambda x: x.get("value") if isinstance(x, dict) else "") + + return external_identifiers[ + [ + "external-id-type", + "external-id-url", + "external-id-value", + "external-id-relationship", + ] + ].to_dict(orient="records") + + def extract_websites(self, researcher_urls: List[Dict[str, str]]) -> List[Website]: + urls = pd.DataFrame(researcher_urls) + + if urls.empty: + return [] + + urls = urls[urls["visibility"] == "public"] + urls["url"] = urls["url"].apply(lambda x: x.get("value")) + urls = urls[["url-name", "url"]] + return urls.to_dict(orient="records") diff --git a/server/workers/orcid/src/repositories/works.py b/server/workers/orcid/src/repositories/works.py new file mode 100644 index 000000000..90df09e9c --- /dev/null +++ b/server/workers/orcid/src/repositories/works.py @@ -0,0 +1,186 @@ +from dateutil.parser import parse +from pyorcid import Orcid +import pandas as pd +import numpy as np +from common.utils import get_nested_value + +class WorksRepository: + def __init__(self, orcid: Orcid) -> None: + self.orcid = orcid + + def get_full_works_metadata(self, limit: int = 10000) -> pd.DataFrame: + """ + This function retrieves the full metadata for all works associated with an ORCID. + + Parameters: + - orcid (Orcid): The Orcid object to use for retrieving the works data. + + Returns: + - pd.DataFrame: The full metadata for all works associated with the ORCID. 
+ """ + + works_data = self.orcid.works_full_metadata(limit=limit) + return self.transform_works_metadata(pd.DataFrame(works_data)) + + def transform_works_metadata(self, works_data: pd.DataFrame) -> pd.DataFrame: + new_works_data = pd.DataFrame() + + if works_data.empty: + return new_works_data + + # Perform transformations and store in new DataFrame + new_works_data["id"] = works_data.apply(self.get_put_code, axis=1).astype(str) + new_works_data["title"] = works_data.apply(self.get_title, axis=1).astype(str) + new_works_data["subtitle"] = works_data.apply(self.get_subtitle, axis=1).astype(str) + new_works_data["authors"] = works_data.apply(self.get_authors, axis=1) + new_works_data["paper_abstract"] = works_data.apply( + self.get_paper_abstract, axis=1 + ).astype(str) + new_works_data["year"] = works_data.apply(self.get_publication_date, axis=1) + new_works_data["published_in"] = works_data.apply(self.published_in, axis=1).astype(str) + new_works_data["resulttype"] = works_data.apply(self.get_resulttype, axis=1).map( + lambda x: doc_type_mapping.get(x, "") + ) + new_works_data["doi"] = works_data.apply(self.extract_dois, axis=1) + new_works_data["subject"] = "" # this needs to come from BASE enrichment + new_works_data["url"] = works_data.apply(self.get_url, axis=1) + new_works_data["link"] = works_data.apply(self.get_link, axis=1) + new_works_data["oa_state"] = new_works_data.link.map(lambda x: 1 if x else 2) + + return new_works_data + + def get_authors(self, work) -> str: + contributors = get_nested_value(work, ["contributors", "contributor"], []) + + authors = [] + + for contributor in contributors: + author = get_nested_value(contributor, ["credit-name", "value"], None) + + if author: + authors.append(author) + + return "; ".join(authors) + + def get_title(self, work) -> str: + return get_nested_value(work, ["title", "title", "value"], "") + + def get_subtitle(self, work) -> str: + return get_nested_value(work, ["title", "subtitle", "value"], "") + + def get_paper_abstract(self, work) -> str: + return get_nested_value(work, ["short-description"], "") + + def get_resulttype(self, work) -> str: + return get_nested_value(work, ["type"], "") + + def published_in(self, work) -> str: + return get_nested_value(work, ["journal-title", "value"], "") + + def get_put_code(self, work) -> str: + return get_nested_value(work, ["put-code"], "") + + def get_url(self, work) -> str: + # Try to get the primary URL + url = get_nested_value(work, ["url", "value"], "") + if url: + return url + + # Fallback to checking external IDs if no URL was found + ids = get_nested_value(work, ["external-ids", "external-id"], []) + if isinstance(ids, list): + for id in ids: + external_url = id.get("external-id-value", "") + if external_url.startswith("http"): + return external_url + + return "" + + def get_link(self, work) -> str: + url = get_nested_value(work, ["url", "value"], "") + if url.lower().endswith(".pdf"): + return url + return "" + + def extract_dois(self, work: pd.DataFrame) -> str: + external_ids = get_nested_value(work, ["external-ids", "external-id"], []) + + if not isinstance(external_ids, list) or not external_ids: + return "" + + dois = [ + eid.get("external-id-value", "") + for eid in external_ids + if eid.get("external-id-type") == "doi" + ] + + return dois[0] if dois else "" + + def get_publication_date(self, work) -> str: + year = get_nested_value(work, ["publication-date", "year", "value"], np.nan) + month = get_nested_value(work, ["publication-date", "month", "value"], np.nan) + day = 
get_nested_value(work, ["publication-date", "day", "value"], np.nan) + + publication_date = "" + parsed_publication_date = publication_date + if year is not np.nan: + publication_date += str(int(year)) + parsed_publication_date = publication_date + if month is not np.nan and month != "00": + publication_date += "-" + str(int(month)) + date_obj = parse(publication_date) + parsed_publication_date = date_obj.strftime("%Y-%m") + if day is not np.nan: + publication_date += "-" + str(int(day)) + date_obj = parse(publication_date) + parsed_publication_date = date_obj.strftime("%Y-%m-%d") + return parsed_publication_date + + +doc_type_mapping = { + "book": "Book", + "book-chapter": "Book chapter", + "book-review": "Book review", + "dictionary-entry": "Dictionary entry", + "dissertation": "Dissertation", + "dissertation-thesis": "Dissertation thesis", + "enyclopaedia-entry": "Encyclopedia entry", + "edited-book": "Edited book", + "journal-article": "Journal article", + "journal-issue": "Journal issue", + "magazine-article": "Magazine article", + "manual": "Manual", + "online-resource": "Online resource", + "newsletter-article": "Newsletter article", + "newspaper-article": "Newspaper article", + "preprint": "Preprint", + "report": "Report", + "review": "Review", + "research-tool": "Research tool", + "supervised-student-publication": "Supervised student publication", + "test": "Test", + "translation": "Translation", + "website": "Website", + "working-paper": "Working paper", + "conference-abstract": "Conference abstract", + "conference-paper": "Conference paper", + "conference-poster": "Conference poster", + "disclosure": "Disclosure", + "license": "License", + "patent": "Patent", + "registered-copyright": "Registered copyright", + "trademark": "Trademark", + "annotation": "Annotation", + "artistic-performance": "Artistic performance", + "data-management-plan": "Data management plan", + "data-set": "Dataset", + "invention": "Invention", + "lecture-speech": "Lecture speech", + "physical-object": "Physical object", + "research-technique": "Research technique", + "software": "Software", + "spin-off-company": "Spin-off company", + "standards-and-policy": "Standards and policy", + "technical-standard": "Technical standard", + "other": "Other", +} diff --git a/server/workers/orcid/src/worker.py b/server/workers/orcid/src/worker.py new file mode 100644 index 000000000..190dee157 --- /dev/null +++ b/server/workers/orcid/src/worker.py @@ -0,0 +1,64 @@ +import json +import logging +from common.decorators import error_logging_aspect +import time +from typing import Tuple, Dict +from common.rate_limiter import RateLimiter +from redis import Redis +from orcid_service import OrcidService + +class OrcidWorker: + service = "orcid" + + logger = logging.getLogger(__name__) + + def __init__( + self, + redis_store: Redis, + data_retriever: OrcidService, + rate_limiter: RateLimiter, + ) -> None: + self.redis_store = redis_store + self.data_retriever = data_retriever + self.rate_limiter = rate_limiter + + def next_item(self) -> Tuple[str, Dict[str, str], str]: + _, message = self.redis_store.blpop(self.service) + try: + message_data: Dict[str, str] = json.loads(message.decode("utf-8")) + except (json.JSONDecodeError, AttributeError) as e: + raise ValueError(f"Failed to decode message: {e}") + + request_id = message_data.get("id") + params = message_data.get("params") + params["service"] = self.service + endpoint = message_data.get("endpoint") + return request_id, params, endpoint + + 
@error_logging_aspect(log_level=logging.ERROR) + def run(self) -> None: + while True: + while self.rate_limiter.rate_limit_reached(): + self.logger.debug("🛑 Request is limited") + time.sleep(0.1) + request_id, params, endpoint = self.next_item() + self.logger.debug(request_id) + self.logger.debug(params) + if endpoint == "search": + self.handle_search(request_id, params) + + def handle_search(self, request_id: str, params: Dict[str, str]) -> None: + try: + res = self.data_retriever.execute_search(params) + res["id"] = request_id + + if res.get("status") == "error" or params.get("raw") is True: + self.redis_store.set(request_id + "_output", json.dumps(res)) + else: + self.redis_store.rpush("input_data", json.dumps(res).encode("utf8")) + queue_length = self.redis_store.llen("input_data") + self.logger.debug(f"Queue length: input_data {queue_length} {request_id}") + except Exception as e: + self.logger.exception("Exception during data retrieval.") + self.logger.error(params) + self.logger.error(e) \ No newline at end of file diff --git a/server/workers/orcid/tests/__init__.py b/server/workers/orcid/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/server/workers/orcid/tests/unit/__init__.py b/server/workers/orcid/tests/unit/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/server/workers/orcid/tests/unit/orcid_addresses.json b/server/workers/orcid/tests/unit/orcid_addresses.json new file mode 100644 index 000000000..87fbd5996 --- /dev/null +++ b/server/workers/orcid/tests/unit/orcid_addresses.json @@ -0,0 +1,116 @@ +{ + "last-modified-date": { + "value": 1680340041403 + }, + "external-identifier": [ + { + "created-date": { + "value": 1431096210330 + }, + "last-modified-date": { + "value": 1517869567419 + }, + "source": { + "source-orcid": null, + "source-client-id": { + "uri": "https://orcid.org/client/0000-0003-1377-5676", + "path": "0000-0003-1377-5676", + "host": "orcid.org" + }, + "source-name": { + "value": "ResearcherID" + }, + "assertion-origin-orcid": { + "uri": "https://orcid.org/0000-0003-0444-7080", + "path": "0000-0003-0444-7080", + "host": "orcid.org" + }, + "assertion-origin-client-id": null, + "assertion-origin-name": { + "value": "Chao Xu" + } + }, + "external-id-type": "ResearcherID", + "external-id-value": "G-3885-2010", + "external-id-url": { + "value": "http://www.researcherid.com/rid/G-3885-2010" + }, + "external-id-relationship": "self", + "visibility": "public", + "path": "/0000-0003-0444-7080/external-identifiers/269622", + "put-code": 269622, + "display-index": 0 + }, + { + "created-date": { + "value": 1445094120127 + }, + "last-modified-date": { + "value": 1445094120127 + }, + "source": { + "source-orcid": null, + "source-client-id": { + "uri": "https://orcid.org/client/APP-4DPEW6SVSNQHKC8G", + "path": "APP-4DPEW6SVSNQHKC8G", + "host": "orcid.org" + }, + "source-name": { + "value": "Loop" + }, + "assertion-origin-orcid": null, + "assertion-origin-client-id": null, + "assertion-origin-name": null + }, + "external-id-type": "Loop profile", + "external-id-value": "285285", + "external-id-url": { + "value": "https://loop.frontiersin.org/people/285285" + }, + "external-id-relationship": "self", + "visibility": "public", + "path": "/0000-0003-0444-7080/external-identifiers/351949", + "put-code": 351949, + "display-index": 0 + }, + { + "created-date": { + "value": 1680340041403 + }, + "last-modified-date": { + "value": 1680340041403 + }, + "source": { + "source-orcid": null, + "source-client-id": { + "uri": 
"https://orcid.org/client/0000-0002-5982-8983", + "path": "0000-0002-5982-8983", + "host": "orcid.org" + }, + "source-name": { + "value": "Scopus - Elsevier" + }, + "assertion-origin-orcid": { + "uri": "https://orcid.org/0000-0003-0444-7080", + "path": "0000-0003-0444-7080", + "host": "orcid.org" + }, + "assertion-origin-client-id": null, + "assertion-origin-name": { + "value": "Chao Xu" + } + }, + "external-id-type": "Scopus Author ID", + "external-id-value": "56182786200", + "external-id-url": { + "value": "http://www.scopus.com/inward/authorDetails.url?authorID=56182786200&partnerID=MN8TOARS" + }, + "external-id-relationship": "self", + "visibility": "public", + "path": "/0000-0003-0444-7080/external-identifiers/2996962", + "put-code": 2996962, + "display-index": 0 + } + ], + "path": "/0000-0003-0444-7080/external-identifiers" +} \ No newline at end of file diff --git a/server/workers/orcid/tests/unit/orcid_researcher_urls.json b/server/workers/orcid/tests/unit/orcid_researcher_urls.json new file mode 100644 index 000000000..57ac1472d --- /dev/null +++ b/server/workers/orcid/tests/unit/orcid_researcher_urls.json @@ -0,0 +1,38 @@ +{ + "last-modified-date": { + "value": 1512458209207 + }, + "researcher-url": [ + { + "created-date": { + "value": 1512458209206 + }, + "last-modified-date": { + "value": 1512458209207 + }, + "source": { + "source-orcid": { + "uri": "https://orcid.org/0000-0003-0444-7080", + "path": "0000-0003-0444-7080", + "host": "orcid.org" + }, + "source-client-id": null, + "source-name": { + "value": "Chao Xu" + }, + "assertion-origin-orcid": null, + "assertion-origin-client-id": null, + "assertion-origin-name": null + }, + "url-name": "google scholar website", + "url": { + "value": "https://scholar.google.com/citations?user=-2P6PfoAAAAJ&hl=en" + }, + "visibility": "public", + "path": "/0000-0003-0444-7080/researcher-urls/1307414", + "put-code": 1307414, + "display-index": 1 + } + ], + "path": "/0000-0003-0444-7080/researcher-urls" +} \ No newline at end of file diff --git a/server/workers/orcid/tests/unit/orcid_work.json b/server/workers/orcid/tests/unit/orcid_work.json new file mode 100644 index 000000000..85b7117a5 --- /dev/null +++ b/server/workers/orcid/tests/unit/orcid_work.json @@ -0,0 +1,172 @@ +{ + "created-date": { + "value": 1633613405001 + }, + "last-modified-date": { + "value": 1674531292361 + }, + "source": { + "source-orcid": null, + "source-client-id": { + "uri": "https://orcid.org/client/0000-0001-9884-1913", + "path": "0000-0001-9884-1913", + "host": "orcid.org" + }, + "source-name": { + "value": "Crossref" + }, + "assertion-origin-orcid": null, + "assertion-origin-client-id": null, + "assertion-origin-name": null + }, + "put-code": 101107018, + "path": "/0000-0002-2499-6696/work/101107018", + "title": { + "title": { + "value": "Aquatic ecosystem responses to environmental and climatic changes in NE China since the last deglaciation (\\u223c17, 500\\u00a0cal\\u00a0yr BP) tracked by diatom assemblages from Lake Moon" + }, + "subtitle": null, + "translated-title": null + }, + "journal-title": { + "value": "Quaternary Science Reviews" + }, + "short-description": null, + "citation": null, + "type": "journal-article", + "publication-date": { + "year": { + "value": "2021" + }, + "month": { + "value": "11" + }, + "day": null + }, + "external-ids": { + "external-id": [ + { + "external-id-type": "doi", + "external-id-value": "10.1016/j.quascirev.2021.107218", + "external-id-normalized": { + "value": "10.1016/j.quascirev.2021.107218", + "transient": true + }, + 
"external-id-normalized-error": null, + "external-id-url": { + "value": "https://doi.org/10.1016/j.quascirev.2021.107218" + }, + "external-id-relationship": "self" + } + ] + }, + "url": { + "value": "https://doi.org/10.1016/j.quascirev.2021.107218" + }, + "contributors": { + "contributor": [ + { + "contributor-orcid": null, + "credit-name": { + "value": "Jie Chen" + }, + "contributor-email": null, + "contributor-attributes": { + "contributor-sequence": null, + "contributor-role": "author" + } + }, + { + "contributor-orcid": null, + "credit-name": { + "value": "Jianbao Liu" + }, + "contributor-email": null, + "contributor-attributes": { + "contributor-sequence": null, + "contributor-role": "author" + } + }, + { + "contributor-orcid": null, + "credit-name": { + "value": "Kathleen M. R\\u00fchland" + }, + "contributor-email": null, + "contributor-attributes": { + "contributor-sequence": null, + "contributor-role": "author" + } + }, + { + "contributor-orcid": null, + "credit-name": { + "value": "John P. Smol" + }, + "contributor-email": null, + "contributor-attributes": { + "contributor-sequence": null, + "contributor-role": "author" + } + }, + { + "contributor-orcid": null, + "credit-name": { + "value": "Xiaosen Zhang" + }, + "contributor-email": null, + "contributor-attributes": { + "contributor-sequence": null, + "contributor-role": "author" + } + }, + { + "contributor-orcid": null, + "credit-name": { + "value": "Zhiping Zhang" + }, + "contributor-email": null, + "contributor-attributes": { + "contributor-sequence": null, + "contributor-role": "author" + } + }, + { + "contributor-orcid": null, + "credit-name": { + "value": "Aifeng Zhou" + }, + "contributor-email": null, + "contributor-attributes": { + "contributor-sequence": null, + "contributor-role": "author" + } + }, + { + "contributor-orcid": null, + "credit-name": { + "value": "Zhongwei Shen" + }, + "contributor-email": null, + "contributor-attributes": { + "contributor-sequence": null, + "contributor-role": "author" + } + }, + { + "contributor-orcid": null, + "credit-name": { + "value": "Fahu Chen" + }, + "contributor-email": null, + "contributor-attributes": { + "contributor-sequence": null, + "contributor-role": "author" + } + } + ] + }, + "language-code": null, + "country": null, + "visibility": "public" +} \ No newline at end of file diff --git a/server/workers/orcid/tests/unit/personal_details.json b/server/workers/orcid/tests/unit/personal_details.json new file mode 100644 index 000000000..521c007a6 --- /dev/null +++ b/server/workers/orcid/tests/unit/personal_details.json @@ -0,0 +1,30 @@ +{ + "last-modified-date": { + "value": 1460763065672 + }, + "name": { + "created-date": { + "value": 1460763065672 + }, + "last-modified-date": { + "value": 1460763065672 + }, + "given-names": { + "value": "Chao" + }, + "family-name": { + "value": "Xu" + }, + "credit-name": null, + "source": null, + "visibility": "public", + "path": "0000-0003-0444-7080" + }, + "other-names": { + "last-modified-date": null, + "other-name": [], + "path": "/0000-0003-0444-7080/other-names" + }, + "biography": null, + "path": "/0000-0003-0444-7080/personal-details" +} \ No newline at end of file diff --git a/server/workers/orcid/tests/unit/test_transform.py b/server/workers/orcid/tests/unit/test_transform.py new file mode 100644 index 000000000..2e28a7211 --- /dev/null +++ b/server/workers/orcid/tests/unit/test_transform.py @@ -0,0 +1,64 @@ +import pytest +import pandas as pd +import numpy as np +from src.orcid_service import OrcidService + +# Sample test data 
and expected outcomes +@pytest.fixture +def sample_author_info(): + # Sample author information dictionary + return { + "orcid_id": "0000-0002-1825-0097", + "author_name": "John Doe", + "biography": "A short biography", + "author_keywords": "keyword1, keyword2", + "academic_age": 14, + "websites": [ + {"url_name": "personal", "url": "http://example.com"}, + {"url_name": "institutional", "url": "http://institution.com"} + ], + "external_identifiers": [ + {"type": "ResearcherID", "url": "http://researcherid.com", "value": "12345", "relationship": "self"}, + {"type": "Scopus", "url": "http://scopus.com", "value": "67890", "relationship": "self"} + ], + "countries": ["USA", "Canada"] + } + +@pytest.fixture +def sample_metadata(): + # Create a sample metadata DataFrame + data = { + 'citation_count': [10, 20, 30, 40], + 'cited_by_accounts_count': [5, 2, 7, 3], + 'cited_by_wikipedia_count': [1, 0, 2, 0], + 'cited_by_msm_count': [0, 1, 1, 0], + 'cited_by_policies_count': [0, 0, 1, 0], + 'cited_by_patents_count': [0, 1, 0, 0], + 'year': [2005, 2010, 2015, 2020] + } + return pd.DataFrame(data) + +def test_enrich_author_info(sample_author_info, sample_metadata): + # Mock an instance of OrcidService (if enrich_author_info is an instance method) + orcid_service = OrcidService(access_token="dummy_token", sandbox=True, redis_store=None) + + # Call the method to test + enriched_info = orcid_service.enrich_author_info(sample_author_info, sample_metadata) + + # Assertions to check expected outcomes + assert enriched_info["total_citations"] == 100 + assert enriched_info["total_unique_social_media_mentions"] == 17 + assert enriched_info["total_neppr"] == 7 + + # Calculate h-index for given sample data + citation_counts = sample_metadata["citation_count"].astype(float).sort_values(ascending=False).values + expected_h_index = int(np.sum(citation_counts >= np.arange(1, len(citation_counts) + 1))) + assert enriched_info["h_index"] == expected_h_index + + # Expected academic age calculation + expected_academic_age = 14 # Earliest year from the sample data + assert enriched_info["academic_age"] == expected_academic_age + + # Normalized h-index calculation + expected_normalized_h_index = (expected_h_index / expected_academic_age) if expected_academic_age > 0 else 0 + assert enriched_info["normalized_h_index"] == expected_normalized_h_index \ No newline at end of file diff --git a/server/workers/persistence/.dockerignore b/server/workers/persistence/.dockerignore new file mode 100644 index 000000000..2a5c729b7 --- /dev/null +++ b/server/workers/persistence/.dockerignore @@ -0,0 +1,10 @@ +renv +__pycache__ +.cache +.pytest_cache +.Rproj.user +.RData +.Rhistory +*.Rproj +.pynb_checkpoints +*.ipynb \ No newline at end of file diff --git a/server/workers/persistence/Dockerfile b/server/workers/persistence/Dockerfile index f682ba27b..6feefeec9 100644 --- a/server/workers/persistence/Dockerfile +++ b/server/workers/persistence/Dockerfile @@ -1,5 +1,5 @@ FROM python:3.8 -MAINTAINER Chris Kittel "christopher.kittel@openknowledgemaps.org" +LABEL maintainer="Chris Kittel " RUN apt-get update RUN apt-get install -y gcc git libpq-dev @@ -9,4 +9,10 @@ COPY workers/persistence/requirements.txt . 
RUN pip install --upgrade pip RUN pip install --no-cache-dir -r requirements.txt RUN pip install git+https://github.com/python-restx/flask-restx -COPY workers/persistence/src/ ./ \ No newline at end of file + +COPY workers/common ../common +COPY workers/persistence/requirements-e.txt ./ +RUN pip install --no-cache-dir -r requirements-e.txt +COPY workers/persistence/src/ ./ + +CMD ["python3", "app.py"] \ No newline at end of file diff --git a/server/workers/persistence/__init__.py b/server/workers/persistence/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/server/workers/persistence/requirements-e.txt b/server/workers/persistence/requirements-e.txt new file mode 100644 index 000000000..a35d9cfef --- /dev/null +++ b/server/workers/persistence/requirements-e.txt @@ -0,0 +1,2 @@ +# Include the common package from the local file system +-e ../common \ No newline at end of file diff --git a/server/workers/persistence/src/app.py b/server/workers/persistence/src/app.py index 48aafba9a..0ec5e2d05 100644 --- a/server/workers/persistence/src/app.py +++ b/server/workers/persistence/src/app.py @@ -8,38 +8,7 @@ from persistence import persistence_ns -class ReverseProxied(object): - '''Wrap the application in this middleware and configure the - front-end server to add these headers, to let you quietly bind - this to a URL other than / and to an HTTP scheme that is - different than what is used locally. - - location /myprefix { - proxy_pass http://192.168.0.1:5001; - proxy_set_header Host $host; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Scheme $scheme; - proxy_set_header X-Script-Name /myprefix; - } - - :param app: the WSGI application - ''' - def __init__(self, app): - self.app = app - - def __call__(self, environ, start_response): - script_name = environ.get('HTTP_X_SCRIPT_NAME', '') - if script_name: - environ['SCRIPT_NAME'] = script_name - path_info = environ['PATH_INFO'] - if path_info.startswith(script_name): - environ['PATH_INFO'] = path_info[len(script_name):] - - scheme = environ.get('HTTP_X_SCHEME', '') - if scheme: - environ['wsgi.url_scheme'] = scheme - return self.app(environ, start_response) - +from common.proxy import ReverseProxied def api_patches(app): api_fixed = Api( diff --git a/server/workers/persistence/src/database.py b/server/workers/persistence/src/database.py index 4d008ab7a..c4464d5bf 100644 --- a/server/workers/persistence/src/database.py +++ b/server/workers/persistence/src/database.py @@ -1,33 +1,53 @@ import os -from sqlalchemy.orm import sessionmaker +from sqlalchemy.orm import sessionmaker, declarative_base from sqlalchemy import create_engine -from sqlalchemy.ext.declarative import declarative_base - -hosts = [] -for host, port in zip(os.getenv("POSTGRES_HOSTS").split(","), - os.getenv("POSTGRES_PORTS").split(",")): - hosts.append(f"{host}:{port}") -host_string = "&host=".join(hosts) - -bind_params = { - "user": os.getenv("POSTGRES_USER"), - "pw": os.getenv("POSTGRES_PASSWORD"), - "host": host_string, - "db": os.getenv("DEFAULT_DATABASE") -} - -if len(hosts) == 1: - engine = create_engine('postgresql+psycopg2://%(user)s:%(pw)s@%(host)s/%(db)s' % bind_params, - max_overflow=5, - pool_pre_ping=True, - pool_recycle=600, - pool_size=5) -else: - engine = create_engine('postgresql+psycopg2://%(user)s:%(pw)s@/%(db)s?host=%(host)s&target_session_attrs=read-write' % bind_params, - max_overflow=5, - pool_pre_ping=True, - pool_recycle=600, - pool_size=5) +from sqlalchemy.engine.base import Engine +def 
get_host_string() -> str: + """Construct the host string from environment variables.""" + hosts = os.getenv("POSTGRES_HOSTS", "").split(",") + ports = os.getenv("POSTGRES_PORTS", "").split(",") + + if not hosts or not ports: + raise ValueError( + "POSTGRES_HOSTS and POSTGRES_PORTS environment variables must be set and non-empty." + ) + + return "&host=".join([f"{host}:{port}" for host, port in zip(hosts, ports)]) + +def create_db_engine() -> "Engine": + """Create the SQLAlchemy engine based on the provided environment variables.""" + user = os.getenv("POSTGRES_USER") + password = os.getenv("POSTGRES_PASSWORD") + database = os.getenv("DEFAULT_DATABASE") + hosts = os.getenv("POSTGRES_HOSTS").split(",") + + if not user or not password or not database: + raise ValueError( + "POSTGRES_USER, POSTGRES_PASSWORD, and DEFAULT_DATABASE environment variables must be set." + ) + + host_string = get_host_string() + + + if len(hosts) == 1: + connection_string = ( + f"postgresql+psycopg2://{user}:{password}@{host_string}/{database}" + ) + else: + connection_string = f"postgresql+psycopg2://{user}:{password}@/{database}?host={host_string}&target_session_attrs=read-write" + + return create_engine( + connection_string, + max_overflow=5, + pool_pre_ping=True, + pool_recycle=600, + pool_size=5, + ) + +# Create the SQLAlchemy engine +engine = create_db_engine() + +# Set up session and base Session = sessionmaker(bind=engine) Base = declarative_base() \ No newline at end of file diff --git a/server/workers/pubmed/.dockerignore b/server/workers/pubmed/.dockerignore new file mode 100644 index 000000000..2a5c729b7 --- /dev/null +++ b/server/workers/pubmed/.dockerignore @@ -0,0 +1,10 @@ +renv +__pycache__ +.cache +.pytest_cache +.Rproj.user +.RData +.Rhistory +*.Rproj +.pynb_checkpoints +*.ipynb \ No newline at end of file diff --git a/server/workers/pubmed/Dockerfile b/server/workers/pubmed/Dockerfile index f43846d7f..d3a45d1f0 100644 --- a/server/workers/pubmed/Dockerfile +++ b/server/workers/pubmed/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:18.04 -MAINTAINER Chris Kittel "christopher.kittel@openknowledgemaps.org" +LABEL maintainer="Chris Kittel " ENV DEBIAN_FRONTEND=noninteractive @@ -152,11 +152,15 @@ COPY workers/pubmed/activate.R . RUN R -e 'renv::consent(provided = TRUE)' && \ R -e 'setwd("./"); renv::activate(); renv::restore(lockfile = "./renv.lock")' -COPY workers/common ./common +COPY workers/common ../common +COPY workers/pubmed/requirements-e.txt . 
+RUN pip3 install --no-cache-dir -r requirements-e.txt + COPY workers/pubmed ./pubmed COPY preprocessing/resources ./resources COPY preprocessing/other-scripts ./other-scripts RUN mkdir -p /var/log/headstart && touch /var/log/headstart/headstart.log COPY workers/pubmed/*.py ./ -ENTRYPOINT python3 run_pubmed.py + +CMD ["python3", "run_pubmed.py"] diff --git a/server/workers/pubmed/requirements-e.txt b/server/workers/pubmed/requirements-e.txt new file mode 100644 index 000000000..a35d9cfef --- /dev/null +++ b/server/workers/pubmed/requirements-e.txt @@ -0,0 +1,2 @@ +# Include the common package from the local file system +-e ../common \ No newline at end of file diff --git a/server/workers/tests/Dockerfile_backend b/server/workers/tests/Dockerfile.backend similarity index 100% rename from server/workers/tests/Dockerfile_backend rename to server/workers/tests/Dockerfile.backend diff --git a/server/workers/tests/Dockerfile_tests b/server/workers/tests/Dockerfile.tests similarity index 79% rename from server/workers/tests/Dockerfile_tests rename to server/workers/tests/Dockerfile.tests index 3ec2fd241..3543c8731 100644 --- a/server/workers/tests/Dockerfile_tests +++ b/server/workers/tests/Dockerfile.tests @@ -1,11 +1,12 @@ FROM python:3.8 -MAINTAINER Chris Kittel "christopher.kittel@openknowledgemaps.org" +LABEL maintainer="Chris Kittel " RUN apt-get update RUN apt-get install -y --no-install-recommends gcc git python3-dev RUN apt-get install -y libpq-dev WORKDIR /app + RUN pip install git+https://github.com/python-restx/flask-restx RUN pip install pytest requests sqlalchemy sqlalchemy-utils psycopg2-binary \ No newline at end of file diff --git a/server/workers/tests/README.md b/server/workers/tests/README.md index 133d3669a..4c8906397 100644 --- a/server/workers/tests/README.md +++ b/server/workers/tests/README.md @@ -45,7 +45,7 @@ In `docker-compose-end2endtest.yml`, the POSTGRES_HOST should be the name of the end2endtest: build: context: ./server/workers/tests - dockerfile: ./Dockerfile_tests + dockerfile: ./Dockerfile.tests hostname: "end2endtest" environment: POSTGRES_USER: "testuser" @@ -61,7 +61,7 @@ listen_addresses = 'localhost,headstart_db_1,headstart-db-1' Run tests ``` -docker-compose -f docker-compose-end2endtest.yml run end2endtest +docker compose -f docker-compose-end2endtest.yml --env-file .docker.test.env run end2endtest ``` ## Notes about the end-to-end test setup diff --git a/server/workers/tests/mock_app.py b/server/workers/tests/mock_app.py index 274cc0d91..dcb24f0ec 100644 --- a/server/workers/tests/mock_app.py +++ b/server/workers/tests/mock_app.py @@ -2,27 +2,29 @@ import json import sys import logging -from flask import Flask, make_response, jsonify +from flask import Flask, make_response from flask_restx import Api sys.path.append("workers/persistence/src") -from persistence import persistence_ns -from database import sessions +import persistence as some_module + +print(some_module) def create_app(config_name): app = Flask(__name__) bind_params = { - "user": os.getenv("POSTGRES_USER"), - "pw": os.getenv("POSTGRES_PASSWORD"), - "host": os.getenv("POSTGRES_HOST"), - "port": os.getenv("POSTGRES_PORT"), - "db": os.getenv("DEFAULT_DATABASE") + "user": os.getenv("POSTGRES_USER"), + "pw": os.getenv("POSTGRES_PASSWORD"), + "host": os.getenv("POSTGRES_HOST"), + "port": os.getenv("POSTGRES_PORT"), + "db": os.getenv("DEFAULT_DATABASE") } app.config['SQLALCHEMY_DATABASE_URI'] = 'postgresql://%(user)s:%(pw)s@%(host)s:%(port)s/%(db)s' % bind_params 
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False - app.config['PORT'] = 5000 + app.config['PORT'] = 80 + app.config['HOST'] = '0.0.0.0' app.config['DEBUG'] = True app.config['TESTING'] = True app.config['ENV'] = 'development' @@ -48,7 +50,7 @@ def base_search(): api = Api(app) - api.add_namespace(persistence_ns, path='/api/stable/persistence') + # api.add_namespace(persistence_ns, path='/api/stable/persistence') handler = logging.StreamHandler(sys.stderr) handler.setLevel(logging.DEBUG) @@ -58,3 +60,7 @@ def base_search(): # app.logger.debug(app.url_map) return app + +if __name__ == '__main__': + app = create_app(config_name="testing") + app.run(host="0.0.0.0", port=5001, debug=True) diff --git a/server/workers/tests/test_end2end.py b/server/workers/tests/test_end2end.py index 82d3fd238..77ccd804b 100644 --- a/server/workers/tests/test_end2end.py +++ b/server/workers/tests/test_end2end.py @@ -2,26 +2,28 @@ import pytest import requests from workers.tests.mock_app import create_app -from sqlalchemy import create_engine, inspect -from sqlalchemy_utils import database_exists, create_database, drop_database +from sqlalchemy import create_engine, text +from sqlalchemy_utils import database_exists, create_database from workers.persistence.src.models import Base @pytest.fixture def app(): # Create a test Flask app with a temporary test database - # It will setup the Visualizations and Revisions tables - # each time the test runs and drop them when the test is done app = create_app(config_name="testing") engine = create_engine(app.config['SQLALCHEMY_DATABASE_URI']) + if not database_exists(engine.url): create_database(engine.url) + Base.metadata.bind = engine + with app.app_context(): Base.metadata.create_all(bind=engine) yield app - engine.execute('DROP TABLE IF EXISTS visualizations;') - engine.execute('DROP TABLE IF EXISTS revisions;') - drop_database(engine.url) + + with engine.connect() as connection: + connection.execute(text("DROP TABLE IF EXISTS visualizations;")) + connection.execute(text("DROP TABLE IF EXISTS revisions;")) @pytest.fixture def test_client(app): @@ -55,13 +57,20 @@ def load_test_data(): def test_hello_world(app, test_client): - response = test_client.get('/hello') + response = test_client.get('http://api/hello') assert response.status_code == 200 assert b"Hello, World!" 
in response.data def test_search_api_reachability(app, test_client): # Test that the base URL returns a 200 (OK) HTTP status code - response = test_client.get('/api/stable/base/search') + response = test_client.get('http://api/api/stable/base/search') + assert response.status_code == 200 + assert response.content_type == "application/json" + data = json.loads(response.get_data(as_text=True)) + +def test_search_api_reachability(app, test_client): + # Test that the base URL returns a 200 (OK) HTTP status code + response = test_client.get('http://api/api/stable/persistence/search') assert response.status_code == 200 assert response.content_type == "application/json" data = json.loads(response.get_data(as_text=True)) @@ -79,122 +88,126 @@ def test_search_endpoint(app): } response = requests.post(url, data=params) response.raise_for_status() + print(f"Text: {response.text}") + print(f"Status code: {response.status_code}") + print(f"Content type: {response.headers['Content-Type']}") data = response.json() assert "query" in data assert "id" in data assert "status" in data -def test_createID(app, test_client): - url = "/api/stable/persistence/createID/testdb" - params = { - "params": {"q": "digital education", - "from": "1809-01-01", - "to": "2023-07-28", - "document_types": ["121"], - "sorting": "most-recent"}, - "param_types": ["q", "from", "to", "document_types", "sorting"] - } - response = test_client.post(url, json=params) - assert b'{\n "unique_id": "dffef32544bc14a48e9f3aa2824e2513"\n}\n' in response.data +# def test_createID(app, test_client): +# url = "/api/stable/persistence/createID/testdb" +# params = { +# "params": {"q": "digital education", +# "from": "1809-01-01", +# "to": "2023-07-28", +# "document_types": ["121"], +# "sorting": "most-recent"}, +# "param_types": ["q", "from", "to", "document_types", "sorting"] +# } +# response = test_client.post(url, json=params) +# assert b'{\n "unique_id": "dffef32544bc14a48e9f3aa2824e2513"\n}\n' in response.data + +# def test_getLatestRevision(app): +# url = "http://backend/server/services/getLatestRevision.php" +# params = { +# "vis_id": "530133cf1768e6606f63c641a1a96768", +# } +# response = requests.get(url, params=params) +# response.raise_for_status() +# data = response.json() +# assert type(data) == list + +# def test_getLatestRevisionWithContext(app, populate_db): +# url = "http://backend/server/services/getLatestRevision.php" +# params = { +# "vis_id": "530133cf1768e6606f63c641a1a96768", +# "context": True +# } +# response = requests.get(url, params=params) +# response.raise_for_status() +# data = response.json() +# assert type(data) == dict + +# def test_getLatestRevisionWithDetails(app, populate_db): +# url = "http://backend/server/services/getLatestRevision.php" +# params = { +# "vis_id": "530133cf1768e6606f63c641a1a96768", +# "context": True, +# "details": True +# } +# response = requests.get(url, params=params) +# response.raise_for_status() +# data = response.json() +# assert type(data) == dict + +# def test_persistence_api(app, test_client): +# url = "/api/stable/persistence/service_version" +# response = test_client.get(url) +# assert response.status_code == 200 +# assert b"test_version" in response.data + +# def test_persistence_api_create_visualization(app, test_client): +# data, vis_params = load_test_data() +# url = "/api/stable/persistence/createVisualization/testdb" +# params = { +# "vis_id": "530133cf1768e6606f63c641a1a96768", +# "vis_title": "digital education", +# "data": data, +# "vis_clean_query": "digital education", 
+# "vis_query": "digital education", +# "vis_params": json.dumps({"context": vis_params}) +# } +# response = test_client.post(url, json=params) +# assert b'{\n "success": true\n}\n' in response.data +# url = "/api/stable/persistence/getLastVersion/testdb" +# params = { +# "vis_id": "530133cf1768e6606f63c641a1a96768", +# "context": True +# } +# response = test_client.post(url, json=params) +# assert "rev_data" in response.json.keys() +# assert "rev_timestamp" in response.json.keys() +# assert "rev_vis" in response.json.keys() +# assert "vis_params" in response.json.keys() +# assert "vis_query" in response.json.keys() +# assert "vis_title" in response.json.keys() +# return_data = response.json["rev_data"] +# assert data == return_data + +# def test_search_and_getLatestRevision(app, populate_db): +# """ +# This test will be able to test SQLite and Postgres persistence. +# """ +# url = "http://backend/server/services/searchBASE.php" +# params = { +# "unique_id": "530133cf1768e6606f63c641a1a96768", +# "from": "1809-01-01", +# "to": "2023-07-28", +# "document_types": ["121"], +# "q": "digital education", +# "sorting": "most-recent" +# } +# response = requests.post(url, data=params) +# response.raise_for_status() +# data = response.json() +# assert "query" in data +# assert "id" in data +# assert "status" in data +# url = "http://backend/server/services/getLatestRevision.php" +# params = { +# "vis_id": "530133cf1768e6606f63c641a1a96768", +# "context": True +# } +# response = requests.get(url, params=params) +# response.raise_for_status() +# r = response.json() +# assert type(r) == dict +# assert "context" in r.keys() +# assert "data" in r.keys() +# assert type(r["context"]) == dict +# assert type(r["data"]) == str +# data = json.loads(r["data"]) +# assert type(data) == list -def test_getLatestRevision(app): - url = "http://backend/server/services/getLatestRevision.php" - params = { - "vis_id": "530133cf1768e6606f63c641a1a96768", - } - response = requests.get(url, params=params) - response.raise_for_status() - data = response.json() - assert type(data) == list - -def test_getLatestRevisionWithContext(app, populate_db): - url = "http://backend/server/services/getLatestRevision.php" - params = { - "vis_id": "530133cf1768e6606f63c641a1a96768", - "context": True - } - response = requests.get(url, params=params) - response.raise_for_status() - data = response.json() - assert type(data) == dict - -def test_getLatestRevisionWithDetails(app, populate_db): - url = "http://backend/server/services/getLatestRevision.php" - params = { - "vis_id": "530133cf1768e6606f63c641a1a96768", - "context": True, - "details": True - } - response = requests.get(url, params=params) - response.raise_for_status() - data = response.json() - assert type(data) == dict - -def test_persistence_api(app, test_client): - url = "/api/stable/persistence/service_version" - response = test_client.get(url) - assert response.status_code == 200 - assert b"test_version" in response.data - -def test_persistence_api_create_visualization(app, test_client): - data, vis_params = load_test_data() - url = "/api/stable/persistence/createVisualization/testdb" - params = { - "vis_id": "530133cf1768e6606f63c641a1a96768", - "vis_title": "digital education", - "data": data, - "vis_clean_query": "digital education", - "vis_query": "digital education", - "vis_params": json.dumps({"context": vis_params}) - } - response = test_client.post(url, json=params) - assert b'{\n "success": true\n}\n' in response.data - url = 
"/api/stable/persistence/getLastVersion/testdb" - params = { - "vis_id": "530133cf1768e6606f63c641a1a96768", - "context": True - } - response = test_client.post(url, json=params) - assert "rev_data" in response.json.keys() - assert "rev_timestamp" in response.json.keys() - assert "rev_vis" in response.json.keys() - assert "vis_params" in response.json.keys() - assert "vis_query" in response.json.keys() - assert "vis_title" in response.json.keys() - return_data = response.json["rev_data"] - assert data == return_data - -def test_search_and_getLatestRevision(app, populate_db): - """ - This test will be able to test SQLite and Postgres persistence. - """ - url = "http://backend/server/services/searchBASE.php" - params = { - "unique_id": "530133cf1768e6606f63c641a1a96768", - "from": "1809-01-01", - "to": "2023-07-28", - "document_types": ["121"], - "q": "digital education", - "sorting": "most-recent" - } - response = requests.post(url, data=params) - response.raise_for_status() - data = response.json() - assert "query" in data - assert "id" in data - assert "status" in data - url = "http://backend/server/services/getLatestRevision.php" - params = { - "vis_id": "530133cf1768e6606f63c641a1a96768", - "context": True - } - response = requests.get(url, params=params) - response.raise_for_status() - r = response.json() - assert type(r) == dict - assert "context" in r.keys() - assert "data" in r.keys() - assert type(r["context"]) == dict - assert type(r["data"]) == str - data = json.loads(r["data"]) - assert type(data) == list diff --git a/server/workers/tests/test_orcid.py b/server/workers/tests/test_orcid.py new file mode 100644 index 000000000..f4c1cee3e --- /dev/null +++ b/server/workers/tests/test_orcid.py @@ -0,0 +1,183 @@ +import unittest +from unittest.mock import patch, MagicMock, Mock +import json +import pandas as pd +from redis import Redis +from orcid.src.worker import OrcidWorker +from pyorcid import errors as pyorcid_errors +import requests + +class TestOrcidClient(unittest.TestCase): + + def setUp(self): + self.mock_redis = MagicMock(spec=Redis) + self.client = OrcidWorker(redis_store=self.mock_redis) + + self.mock_author_info = { + "author_name": "John Doe", + "total_citations": 15, + } + self.mock_metadata = pd.DataFrame({ + "id": ["123"], + "authors": [["John Doe"]], + "title": ["Test Title"], + "subtitle": [""], + "paper_abstract": ["Test abstract"], + "published_in": ["Test Journal"], + "year": [2020], + "citation_count": [10], + }) + + self.mock_enriched_metadata = pd.DataFrame({ + "id": ["123"], + "authors": ["John Doe"], + "title": ["Test Title"], + "subtitle": [""], + "paper_abstract": ["Test abstract"], + "published_in": ["Test Journal"], + "year": [2020], + "citation_count": [10], + # after enirchment + "citation_count": [10], + "cited_by_wikipedia_count": [2], + "cited_by_msm_count": [0], + "cited_by_policies_count": [0], + "cited_by_patents_count": [1], + "cited_by_accounts_count": [3], + }) + + # @patch('pyorcid.orcid_authentication.OrcidAuthentication.get_public_access_token') + # @patch('pyorcid.orcid_authentication.OrcidAuthentication') + # def test_authenticate( + # self, + # mock_orcid_authentication, + # mock_get_public_access_token + # ): + # # Check initial access_token is None + # self.assertIsNone(self.client.access_token) + + # mock_get_public_access_token.return_value = "mock_access_token" + + # # Create an instance of OrcidClient and call authenticate + # client = OrcidClient() + # client.authenticate() + + # # Assert that get_public_access_token was called 
once + # mock_get_public_access_token.assert_called_once() + + # # Check that the access token was updated correctly + # self.assertEqual(client.access_token, 'mock_access_token') + + # # Ensure OrcidAuthentication was instantiated with the correct parameters + # mock_orcid_authentication.assert_called_once_with( + # client_id=client.ORCID_CLIENT_ID, + # client_secret=client.ORCID_CLIENT_SECRET + # ) + + def test_next_item(self): + # Set up the mock Redis to return a mock message + mock_message = json.dumps({"id": "123", "params": {"orcid": "0000-0002-1825-0097"}, "endpoint": "search"}).encode('utf-8') + self.mock_redis.blpop.return_value = (None, mock_message) + + item_id, params, endpoint = self.client.next_item() + + self.assertEqual(item_id, "123") + self.assertEqual(params["orcid"], "0000-0002-1825-0097") + self.assertEqual(endpoint, "search") + + @patch('orcid.src.orcid.OrcidClient.execute_search') + def test_handle_search_success(self, mock_execute_search): + # Set up the mocks + mock_execute_search.return_value = {"status": "success", "id": "123"} + mock_params = {"orcid": "0000-0002-1825-0097"} + + self.client.handle_search("123", mock_params) + + # Check that the data was pushed back to the Redis queue + self.mock_redis.rpush.assert_called_once_with( + "input_data", + json.dumps(mock_execute_search.return_value).encode("utf8") + ) + + @patch('orcid.src.orcid.OrcidClient.execute_search') + def test_handle_search_error(self, mock_execute_search): + # Simulate an error during search execution + mock_execute_search.side_effect = Exception("An error occurred") + + mock_params = {"orcid": "0000-0002-1825-0097"} + with self.assertLogs(self.client.logger, level="ERROR") as log: + self.client.handle_search("123", mock_params) + + # Check that the logger caught the error + self.assertIn("Exception during data retrieval.", log.output[0]) + + @patch('orcid.src.orcid.OrcidClient._initialize_orcid') # Adjust the path if necessary + def test_initialize_orcid_mock(self, mock_initialize_orcid): + mock_initialize_orcid.return_value = None + client = OrcidWorker() + orcid = client._initialize_orcid("some_orcid_id") + self.assertIsNone(orcid) + + + @patch('orcid.src.orcid.OrcidClient._initialize_orcid') + @patch('orcid.src.orcid.OrcidClient._retrieve_author_info_and_metadata') + @patch('orcid.src.orcid.OrcidClient._process_metadata') + def test_execute_search_success( + self, + mock_process_metadata_mock, + retrieve_author_info_and_metadata_mock, + mock_initialize_orcid, + ): + mock_initialize_orcid.return_value = None + retrieve_author_info_and_metadata_mock.return_value = (self.mock_author_info, self.mock_metadata) + mock_process_metadata_mock.return_value = self.mock_enriched_metadata + + params = {"orcid": "0000-0002-1825-0097", "limit": 1} + client = OrcidWorker() + result = client.execute_search(params) + + # Assert the returned result structure + self.assertIn("input_data", result) + self.assertIn("author", result) + self.assertEqual(result["author"]["author_name"], "John Doe") + self.assertEqual(result["author"]["total_citations"], 15) + + @patch('orcid.src.orcid.OrcidClient._handle_error') # Patch _handle_error directly + @patch('orcid.src.orcid.OrcidClient._initialize_orcid') # Patch _initialize_orcid directly + def test_execute_search_invalid_orcid(self, mock_initialize_orcid: MagicMock, mock_handle_error: MagicMock): + # Simulate NotFound error when _initialize_orcid is called + response = requests.Response() + response.status_code = 404 + mock_initialize_orcid.side_effect = 
pyorcid_errors.NotFound(response) + + # Prepare the client and the parameters + client = OrcidWorker() + params = {"orcid": "invalid-orcid"} + + # Execute the method under test + client.execute_search(params) + + # Ensure _handle_error is called with the correct parameters + mock_handle_error.assert_called_once_with(params, "invalid orcid id", mock_initialize_orcid.side_effect) + def test_enrich_author_info(self): + mock_author_info = {"author_name": "John Doe"} + mock_metadata = pd.DataFrame({ + "citation_count": [10, 5], + "cited_by_wikipedia_count": [2, 1], + "cited_by_msm_count": [0, 0], + "cited_by_policies_count": [0, 0], + "cited_by_patents_count": [1, 1], + "cited_by_accounts_count": [3, 2], + "year": [2018, 2020] + }) + + enriched_info = self.client.enrich_author_info(mock_author_info, mock_metadata) + + self.assertEqual(enriched_info["total_citations"], 15) + self.assertEqual(enriched_info["total_neppr"], 5) + self.assertEqual(enriched_info["h_index"], 2) + self.assertEqual(enriched_info["academic_age"], "6") + self.assertAlmostEqual(enriched_info["normalized_h_index"], 0.3333, places=4) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/vis/js/HeadstartRunner.js b/vis/js/HeadstartRunner.js index b7270a875..fb78c1149 100644 --- a/vis/js/HeadstartRunner.js +++ b/vis/js/HeadstartRunner.js @@ -119,7 +119,8 @@ class HeadstartRunner { width, height, list.height, - this.dataManager.scalingFactors + this.dataManager.scalingFactors, + this.dataManager.author ) ); diff --git a/vis/js/actions/index.js b/vis/js/actions/index.js index 054aa1bec..d13e0ae73 100644 --- a/vis/js/actions/index.js +++ b/vis/js/actions/index.js @@ -60,7 +60,8 @@ export const initializeStore = ( streamWidth, streamHeight, listHeight, - scalingFactors + scalingFactors, + author ) => ({ type: "INITIALIZE", configObject, @@ -73,6 +74,7 @@ export const initializeStore = ( streamHeight, listHeight, scalingFactors, + author }); export const toggleList = () => ({ type: "TOGGLE_LIST" }); @@ -150,6 +152,9 @@ export const closeViperEditModal = () => ({ type: "CLOSE_VIPER_EDIT_MODAL" }); export const openInfoModal = () => ({ type: "OPEN_INFO_MODAL" }); export const closeInfoModal = () => ({ type: "CLOSE_INFO_MODAL" }); +export const openResearcherModal = () => ({ type: "OPEN_RESEARCHER_MODAL" }); +export const closeResearcherModal = () => ({ type: "CLOSE_RESEARCHER_MODAL" }); + export const scaleMap = (value, baseUnit, contentBased, sort) => ({ type: "SCALE", value, diff --git a/vis/js/components/ContextLine.js b/vis/js/components/ContextLine.js index 49297668b..871d703b9 100644 --- a/vis/js/components/ContextLine.js +++ b/vis/js/components/ContextLine.js @@ -19,6 +19,7 @@ import MoreInfoLink from "../templates/contextfeatures/MoreInfoLink"; import MetadataQuality from "../templates/contextfeatures/MetadataQuality"; import ContextTimeFrame from "../templates/contextfeatures/ContextTimeFrame"; import DocumentLang from "../templates/contextfeatures/DocumentLang"; +import ResearcherInfo from "../templates/contextfeatures/ResearcherInfo"; const defined = (param) => param !== undefined && param !== null; @@ -38,75 +39,80 @@ class ContextLine extends React.Component { return ( - {params.showAuthor && ( - - )} - - - - {defined(params.dataSource) && ( - - )} - {defined(params.timespan) && - - - } - - {/* was an issue to left "All Languages" as default value in the context if no lang_id in parameters */} - + + + {defined(params.dataSource) && ( + - {defined(params.paperCount) && ( - - )} - 
{defined(params.datasetCount) && ( - - )} - {defined(params.funder) && {params.funder}} - {defined(params.projectRuntime) && ( - {params.projectRuntime} - )} - {defined(params.legacySearchLanguage) && ( - {params.legacySearchLanguage} - )} - {defined(params.timestamp) && ( - - )} - + + + )} + + {/* was an issue to left "All Languages" as default value in the context if no lang_id in parameters */} + + {defined(params.paperCount) && ( + + )} + {defined(params.datasetCount) && ( + + )} + {defined(params.funder) && {params.funder}} + {defined(params.projectRuntime) && ( + {params.projectRuntime} + )} + {defined(params.legacySearchLanguage) && ( + {params.legacySearchLanguage} + )} + {defined(params.timestamp) && ( + - {defined(params.searchLanguage) && ( - {params.searchLanguage} - )} - + )} + + {defined(params.searchLanguage) && ( + {params.searchLanguage} + )} + + ); } diff --git a/vis/js/components/KnowledgeMap.js b/vis/js/components/KnowledgeMap.js index 41d4eaf72..5f1c63c49 100644 --- a/vis/js/components/KnowledgeMap.js +++ b/vis/js/components/KnowledgeMap.js @@ -183,7 +183,6 @@ const KnowledgeMap = (props) => { hovered={hoveredBubble === bubble.area_uri} zoom={zoom} zoomed={zoomedBubbleUri === bubble.area_uri} - baseUnit={baseUnit} animation={animation} highlighted={highlightedBubbleUri === bubble.area_uri} /> diff --git a/vis/js/components/Modals.js b/vis/js/components/Modals.js index 29b5dec7f..18359af67 100644 --- a/vis/js/components/Modals.js +++ b/vis/js/components/Modals.js @@ -8,6 +8,7 @@ import CitePaperModal from "../templates/modals/CitePaperModal"; import EmbedModal from "../templates/modals/EmbedModal"; import ExportPaperModal from "../templates/modals/ExportPaperModal"; import InfoModal from "../templates/modals/InfoModal"; +import ResearcherInfoModal from "../templates/modals/ResearcherInfoModal"; import PdfModal from "../templates/modals/PdfModal"; import ViperEditModal from "../templates/modals/ViperEditModal"; import LocalizationProvider from "./LocalizationProvider"; @@ -46,6 +47,7 @@ const Modals = ({ /> )} + {showPDFPreview && ( { if (showScaleToolbar) { const handleScaleChange = (newScaleBy) => { @@ -33,6 +34,7 @@ const Toolbar = ({ value={scaleValue} showCredit={showCredit} onInfoClick={onInfoClick} + onResearcherClick={onResearcherClick} onChange={handleScaleChange} /> @@ -54,6 +56,7 @@ const mapStateToProps = (state) => ({ const mapDispatchToProps = (dispatch) => ({ onInfoClick: () => dispatch(openInfoModal()), + onResearcherClick: () => dispatch(openResearcherModal()), onScaleChange: (value, baseUnit, contentBased, sort) => dispatch(scaleMap(value, baseUnit, contentBased, sort)), }); diff --git a/vis/js/dataprocessing/managers/DataManager.js b/vis/js/dataprocessing/managers/DataManager.js index 8d1ee9804..3b2c1f114 100644 --- a/vis/js/dataprocessing/managers/DataManager.js +++ b/vis/js/dataprocessing/managers/DataManager.js @@ -34,6 +34,15 @@ class DataManager { scalingFactors = {}; streams = []; areas = []; + author = {}; + scale_base_unit = { + citations: "citations", + references: "references", + cited_by_accounts_count: "social", + citation_count: 'citations', + cited_by_tweeters_count: 'tweets', + 'readers.mendeley': 'readers' + }; constructor(config, scheme = DEFAULT_SCHEME) { this.config = config; @@ -45,6 +54,8 @@ class DataManager { this.__parseContext(backendData); // initialize this.papers this.__parsePapers(backendData); + // initialize this.authors + this.__parseAuthor(backendData); // initialize this.scalingFactors 
this.__computeScalingFactors(this.papers.length); @@ -71,6 +82,25 @@ class DataManager { } } + __parseAuthor(backendData) { + this.author = this.__getAuthor(backendData); + } + + __getAuthor(backendData) { + if (this.config.show_context) { + if (typeof backendData.data === "string") { + const data = JSON.parse(backendData.data); + return data.author ?? {}; + } + return backendData.data?.author ?? {}; + } + if (typeof backendData.data === "object") { + return backendData.data?.author ?? {}; + } + + return backendData; + } + __parsePapers(backendData) { this.papers = this.__getPapersArray(backendData); @@ -83,12 +113,13 @@ class DataManager { __getPapersArray(backendData) { if (this.config.show_context) { if (typeof backendData.data === "string") { - return JSON.parse(backendData.data); + const data = JSON.parse(backendData.data); + return data.documents ? JSON.parse(data?.documents) : []; } - return backendData.data; + return backendData.data?.documents ?? []; } if (typeof backendData.data === "object") { - return backendData.data; + return backendData.data?.documents ?? []; } return backendData; @@ -109,15 +140,15 @@ class DataManager { this.__escapeStrings(paper); this.__sanitizeTitle(paper); this.__parseAuthors(paper); + this.__parseAccess(paper); + this.__parseLink(paper); + this.__parseComments(paper); + this.__countMetrics(paper); this.__parseCoordinates(paper); while (blockedCoords[`${paper.x}:${paper.y}`]) { this.__adjustCoordinates(paper); } blockedCoords[`${paper.x}:${paper.y}`] = true; - this.__parseAccess(paper); - this.__parseLink(paper); - this.__parseComments(paper); - this.__countMetrics(paper); this.__parseTags(paper); this.__parseKeywords(paper); }); @@ -143,8 +174,8 @@ class DataManager { __parseAuthors(paper) { paper.authors_objects = extractAuthors(paper.authors); paper.authors_list = getAuthorsList( - paper.authors, - this.config.convert_author_names + paper.authors, + this.config.convert_author_names ); // old variable with all authors_string @@ -211,20 +242,40 @@ class DataManager { paper.num_readers = 0; paper.internal_readers = 1; + paper.readers = paper.num_readers; + paper.tweets = getVisibleMetric(paper, "cited_by_tweeters_count"); + paper.citations = getVisibleMetric(paper, "citation_count"); + paper.readers = getVisibleMetric(paper, "readers.mendeley"); + + paper.social = getVisibleMetric(paper, "cited_by_accounts_count"); + paper.references = [ + paper.cited_by_wikipedia_count, + paper.cited_by_msm_count, + paper.cited_by_policies_count, + paper.cited_by_patents_count, + ].reduce((acc, val) => { + if (typeof val === "string" || typeof val === "number") { + return (acc ?? 0) + +val; + } else if (val === undefined || val === null) { + return acc; + } + }, null); + if (!config.content_based && !config.scale_by) { paper.num_readers = getVisibleMetric(paper, "readers"); paper.internal_readers = getInternalMetric(paper, "readers") + 1; } - if (config.scale_by) { - paper.num_readers = getVisibleMetric(paper, config.scale_by); - paper.internal_readers = getInternalMetric(paper, config.scale_by) + 1; - } - paper.readers = paper.num_readers; - if (config.metric_list) { - paper.tweets = getVisibleMetric(paper, "cited_by_tweeters_count"); - paper.citations = getVisibleMetric(paper, "citation_count"); - paper.readers = getVisibleMetric(paper, "readers.mendeley"); + if (config.scale_by) { + paper.num_readers = getVisibleMetric( + paper, + this.scale_base_unit?.[config.scale_by] ?? 
config.scale_by + ); + paper.internal_readers = + getInternalMetric( + paper, + this.scale_base_unit?.[config.scale_by] ?? config.scale_by + ) + 1; } } diff --git a/vis/js/default-config.js b/vis/js/default-config.js index 9dce4fea4..85cb7003d 100644 --- a/vis/js/default-config.js +++ b/vis/js/default-config.js @@ -600,6 +600,25 @@ var config = { please_note: "Please note", citation_warning: "we were not able to verify whether this citation is formatted correctly based on the metadata received. Please check before reuse.", time_frame_context_sg: "Please note that we remove time intervals with only a few associated papers during the computation of your streamgraph to increase its readability. As a result the time on the x-axis may not align with the time range you selected.", + citations_count_label: "citations", + social_media_count_label: "social media mentions", + references_count_label: "references outside academia", + citations: "citations", + social: "social media mentions", + references: "references outside academia", + scale_by_infolink_label: 'Find out more', + metrics_label: "Metrics", + scale_by_label: "Scale map by:", + scale_by_explanation: "The size of the bubbles is relative to the number of documents related to them.", + scale_label: { + content_based: 'Documents', + citations: 'Citations', + cited_by_accounts_count: 'Social media mentions', + references: 'References outside academia', + citation_count: 'Citations', + cited_by_tweeters_count: 'Tweets', + 'readers.mendeley': 'Readers' + }, }, eng_openaire: { loading: "Loading...", @@ -698,217 +717,11 @@ var config = { please_note: "Please note", citation_warning: "we were not able to verify whether this citation is formatted correctly based on the metadata received. Please check before reuse.", faqs_url: "https://openknowledgemaps.org/faq", - }, - ger_cris: { - loading: "Wird geladen...", - search_placeholder: "Suchwort eingeben", - show_list: "Liste ausklappen", - hide_list: "Liste einklappen", - intro_label: "mehr Informationen", - intro_label_areas: "Verteilung der Respondenten", - intro_areas_title: "Verteilung der Respondenten für ", - readers: "Nennungen", - year: "Jahr", - authors: "Autor", - title: "alphabetisch", - default_title: 'Überblick über Artikel', - overview_label: 'Überblick über', - most_recent_label: 'neueste', - most_relevant_label: 'relevanteste', - articles_label: 'Artikel', - source_label: 'Quelle', - documenttypes_label: 'Publikationsarten', - documenttypes_tooltip: 'Die folgenden Publikationsarten wurden bei der Erstellung dieser Visualisierung in Betracht gezogen (nicht alle davon scheinen notwendigerweise in dieser Visualisierung auch auf):', - area: "Themenfeld", - backlink: "← Zurück zur Übersicht", - backlink_list: "← Zeige alle Themen im Themenfeld", - keywords: "Keywords", - doctypes: "Document type(s)", - unknown: "Unknown", - no_keywords: "nicht vorhanden", - not_available: "nicht vorhanden", - no_title: "Kein Titel", - default_area: "Kein Bereich", - default_author: "", - default_id: "defaultid", - default_hash: "hashHash", - default_abstract: "", - default_published_in: "", - default_readers: 0, - default_url: "", - default_x: 1., - default_y: 1., - default_year: "", - showmore_questions_label: "Alle", - showmore_questions_verb: "Fragen anzeigen", - distributions_label: "Verteilungen ", - show_verb_label: "ausklappen", - hide_verb_label: "einklappen", - sort_by_label: 'sortieren: ', - items: "Themen", - comment_by_label: "von", - scale_by_infolink_label: '', - scale_by_label: 'Verteilung 
für:', - credit_alt: "Created by Open Knowledge Maps", - area_streamgraph: "Schlagwort", - stream_year: "Jahr", - stream_doc_num: "Anzahl Dokumente", - stream_docs: "Dokumente", - stream_total: "Gesamt", - empty_area_warning: "Keine Dokumente gefunden. Setzen Sie bitte Ihre Filtereinstellungen zurück.", - cite: "Cite", - cite_title_km: "Cite this knowledge map", - cite_title_sg: "Cite this streamgraph", - citation_template: "Open Knowledge Maps (${year}). ${type} for research on ${query}. Retrieved from ${source} [${date}].", - cite_vis_km: "Please cite this knowledge map as follows", - cite_vis_sg: "Please cite this streamgraph as follows", - cite_paper: "Cite this document as", - export_paper: "Export this document", - download: "Download", - please_note: "Please note", - citation_warning: "we were not able to verify whether this citation is formatted correctly based on the metadata received. Please check before reuse.", - }, - ger_cris_2: { - loading: "Wird geladen...", - search_placeholder: "Suchwort eingeben", - show_list: "Liste ausklappen", - hide_list: "Liste einklappen", - intro_label: "mehr Informationen", - intro_label_areas: "Verteilung der Respondenten", - intro_areas_title: "Verteilung der Respondenten für ", - readers: "Anzahl Fragen", - year: "Jahr", - authors: "Autor", - title: "alphabetisch", - default_title: 'Überblick über Artikel', - overview_label: 'Überblick über', - most_recent_label: 'neueste', - most_relevant_label: 'relevanteste', - articles_label: 'Artikel', - source_label: 'Quelle', - documenttypes_label: 'Publikationsarten', - documenttypes_tooltip: 'Die folgenden Publikationsarten wurden bei der Erstellung dieser Visualisierung in Betracht gezogen (nicht alle davon scheinen notwendigerweise in dieser Visualisierung auch auf):', - area: "Themenfeld", - backlink: "← Zurück zur Übersicht", - backlink_list: "← Zeige alle Themen im Themenfeld", - keywords: "Keywords", - doctypes: "Document type(s)", - unknown: "Unknown", - no_keywords: "nicht vorhanden", - not_available: "nicht vorhanden", - no_title: "Kein Titel", - default_area: "Kein Bereich", - default_author: "", - default_id: "defaultid", - default_hash: "hashHash", - default_abstract: "", - default_published_in: "", - default_readers: 0, - default_url: "", - default_x: 1., - default_y: 1., - default_year: "", - showmore_questions_label: "Alle", - showmore_questions_verb: "Fragen anzeigen", - distributions_label: "Verteilungen ", - show_verb_label: "ausklappen", - hide_verb_label: "einklappen", - sort_by_label: 'sortieren: ', - items: "Themen", - comment_by_label: "von", - scale_by_infolink_label: '', - scale_by_label: 'Verteilung für:', - credit_alt: "Created by Open Knowledge Maps", - area_streamgraph: "Schlagwort", - stream_year: "Jahr", - stream_doc_num: "Anzahl Dokumente", - stream_docs: "Dokumente", - stream_total: "Gesamt", - empty_area_warning: "Keine Dokumente gefunden. Setzen Sie bitte Ihre Filtereinstellungen zurück.", - cite: "Cite", - cite_title_km: "Cite this knowledge map", - cite_title_sg: "Cite this streamgraph", - citation_template: "Open Knowledge Maps (${year}). ${type} for research on ${query}. 
Retrieved from ${source} [${date}].", - cite_vis_km: "Please cite this knowledge map as follows", - cite_vis_sg: "Please cite this streamgraph as follows", - cite_paper: "Cite this document as", - export_paper: "Export this document", - download: "Download", - please_note: "Please note", - citation_warning: "we were not able to verify whether this citation is formatted correctly based on the metadata received. Please check before reuse.", - }, - eng_cris_2: { - loading: "Loading...", - search_placeholder: "Search within visualization...", - show_list: "Show list", - hide_list: "Hide list", - intro_label: "more information", - intro_label_areas: "Distribution of respondents", - intro_areas_title: "Distribution of respondents for ", - readers: "no. questions", - year: "date", - authors: "authors", - title: "alphabetically", - default_title: 'Overview of documents', - overview_label: 'Knowledge Map of', - most_recent_label: 'most recent', - most_relevant_label: 'most relevant', - articles_label: 'documents', - source_label: 'Data source', - documenttypes_label: 'Document types', - documenttypes_tooltip: 'The following document types were taken into consideration in the creation of this visualization (not all of them may appear in the visualization):', - area: "Area", - backlink: "← Back to overview", - backlink_list: "← Show all topics in area", - keywords: "Keywords", - doctypes: "Document type(s)", - unknown: "Unknown", - no_keywords: "not available", - not_available: "not available", - no_title: "No title", - default_area: "No area", - default_author: "", - default_id: "defaultid", - default_hash: "hashHash", - default_abstract: "No abstract available", - default_paper_title: "No title available", - default_authors: "No authors available", - default_published_in: "", - default_readers: 0, - default_url: "", - default_x: 1., - default_y: 1., - default_year: "", - sort_by_label: 'sort by:', - comment_by_label: "by", - embed_body_text: 'You can use this code to embed the visualization on your own website or in a dashboard.', - showmore_questions_label: "Show all", - showmore_questions_verb: "questions", - distributions_label: "distributions ", - show_verb_label: "expand", - hide_verb_label: "collapse", - items: "topics", - scale_by_infolink_label: '', - scale_by_label: 'Distribution for:', - credit_alt: "Created by Open Knowledge Maps", - area_streamgraph: "Stream", - stream_year: "Year", - stream_doc_num: "Number of documents", - stream_docs: "Documents", - stream_total: "Total documents in stream", - empty_area_warning: "No matches found. Please reset your filter options above.", - lang_all: "All languages", - cite: "Cite", - cite_title_km: "Cite this knowledge map", - cite_title_sg: "Cite this streamgraph", - citation_template: "Open Knowledge Maps (${year}). ${type} for research on ${query}. Retrieved from ${source} [${date}].", - cite_vis_km: "Please cite this knowledge map as follows", - cite_vis_sg: "Please cite this streamgraph as follows", - cite_paper: "Cite this document as", - export_paper: "Export this document", - download: "Download", - please_note: "Please note", - citation_warning: "we were not able to verify whether this citation is formatted correctly based on the metadata received. 
Please check before reuse.", + // citations_count_label: "citations", + researcher_info: "Researcher information", + social_media_count_label: " social media mentions", + references_count_label: "references outside academia", + // scale_by_infolink_label: 'Find out more' }, }, diff --git a/vis/js/reducers/author.js b/vis/js/reducers/author.js new file mode 100644 index 000000000..7247b26de --- /dev/null +++ b/vis/js/reducers/author.js @@ -0,0 +1,29 @@ +const author = (state = null, action) => { + if (action.canceled || action.isStreamgraph) { + return state; + } + switch (action.type) { + case "INITIALIZE": + return { + ...state, + author_keywords: action.author?.author_keywords, + author_name: action.author?.author_name, + biography: action.author?.biography, + country: action.author?.country, + external_identifiers: action.author?.external_identifiers, + orcid_id: action.author?.orcid_id, + total_citations: action.author?.total_citations, + total_neppr: action.author?.total_neppr, + total_unique_social_media_mentions: + action.author?.total_unique_social_media_mentions, + websites: action.author?.websites, + h_index: action.author?.h_index, + academic_age: action.author?.academic_age, + normalized_h_index: action.author?.normalized_h_index + }; + default: + return state; + } +}; + +export default author; diff --git a/vis/js/reducers/index.js b/vis/js/reducers/index.js index aed113b1e..7db0dbd1c 100644 --- a/vis/js/reducers/index.js +++ b/vis/js/reducers/index.js @@ -5,6 +5,7 @@ import { combineReducers } from "redux"; import animation from "./animation"; import areas from "./areas"; +import author from "./author"; import bubbleOrder from "./bubbleOrder"; import chart from "./chart"; import chartType from "./chartType"; @@ -34,6 +35,7 @@ import modalInfoType from "./modalInfoType"; export default combineReducers({ animation, areas, + author, bubbleOrder, chart, chartType, diff --git a/vis/js/reducers/modals.js b/vis/js/reducers/modals.js index 9c1a03caa..403a45907 100644 --- a/vis/js/reducers/modals.js +++ b/vis/js/reducers/modals.js @@ -31,6 +31,7 @@ const modals = ( }, openInfoModal: state.openInfoModal !== undefined && !!action.configObject.show_intro, + openResearcherModal: false, infoParams: action.contextObject ? 
{ ...action.contextObject, @@ -78,6 +79,16 @@ const modals = ( ...state, openInfoModal: false, }; + case "OPEN_RESEARCHER_MODAL": + return { + ...state, + openResearcherModal: true, + }; + case "CLOSE_RESEARCHER_MODAL": + return { + ...state, + openResearcherModal: false, + }; case "SHOW_PREVIEW": return { ...state, @@ -123,6 +134,7 @@ const modals = ( return { ...state, openInfoModal: false, + openResearcherModal: false, openEmbedModal: false, openViperEditModal: false, openCitationModal: false, diff --git a/vis/js/templates/contextfeatures/ResearcherInfo.jsx b/vis/js/templates/contextfeatures/ResearcherInfo.jsx new file mode 100644 index 000000000..46dbd5fd1 --- /dev/null +++ b/vis/js/templates/contextfeatures/ResearcherInfo.jsx @@ -0,0 +1,42 @@ +import React from "react"; +import { connect } from "react-redux"; + +import useMatomo from "../../utils/useMatomo"; +import { useLocalizationContext } from "../../components/LocalizationProvider"; +import { openResearcherModal } from "../../actions"; + +const ResearcherInfo = ({ onClick }) => { + const loc = useLocalizationContext(); + const { trackEvent } = useMatomo(); + + const handleClick = () => { + onClick(); + + trackEvent( + "Title & Context line", + "Open researcher modal", + "More researcher info button" + ); + }; + + return ( + // html template starts here + + + {loc.metrics_label} + + + + // html template ends here + ); +}; + +const mapDispatchToProps = (dispatch) => ({ + onClick: () => dispatch(openResearcherModal()), +}); + +export default connect(null, mapDispatchToProps)(ResearcherInfo); diff --git a/vis/js/templates/listentry/OrcidMetrics.jsx b/vis/js/templates/listentry/OrcidMetrics.jsx new file mode 100644 index 000000000..9f84661e1 --- /dev/null +++ b/vis/js/templates/listentry/OrcidMetrics.jsx @@ -0,0 +1,62 @@ +import React from "react"; + +import { useLocalizationContext } from "../../components/LocalizationProvider"; + +// it is either some value or zero +const isDefined = (param) => !!param || parseInt(param) === 0; + +const OrcidMetrics = ({ + citations, + social_media, + references_outside_academia, + baseUnit, +}) => { + const localization = useLocalizationContext(); + + return ( + // html template starts here +
+ + + + {isDefined(citations) ? citations : "n/a"} + {" "} + {localization.citations_count_label} + + + + {isDefined(social_media) ? social_media : "n/a"} + {" "} + {localization.social_media_count_label} + + + + + {isDefined(references_outside_academia) + ? references_outside_academia + : "n/a"} + {" "} + {localization.references_count_label} + + +
+ // html template ends here + ); +}; + +export default OrcidMetrics; diff --git a/vis/js/templates/listentry/StandardListEntry.jsx b/vis/js/templates/listentry/StandardListEntry.jsx index fddba23db..5b6b82df3 100644 --- a/vis/js/templates/listentry/StandardListEntry.jsx +++ b/vis/js/templates/listentry/StandardListEntry.jsx @@ -17,6 +17,7 @@ import EntryBacklink from "./EntryBacklink"; import Keywords from "./Keywords"; import Link from "./Link"; import Metrics from "./Metrics"; +import OrcidMetrics from './OrcidMetrics'; import Title from "./Title"; /** @@ -36,6 +37,7 @@ const StandardListEntry = ({ isInStreamBacklink, showDocTags, showAllDocTypes, + service, // event handlers handleBacklinkClick, }) => { @@ -52,6 +54,8 @@ const StandardListEntry = ({ !showMetrics && (!!citations || parseInt(citations) === 0); + console.log(paper); + return ( // html template starts here
@@ -73,7 +77,8 @@ const StandardListEntry = ({ {paper.comments.length > 0 && } {showKeywords && {paper.keywords}} {showAllDocTypes && } - {showMetrics && ( + + {service !== "orcid" && showMetrics && ( )} + + {service === "orcid" && showMetrics && ( + + )} {!isStreamgraph && } {showCitations && } @@ -109,6 +123,7 @@ const mapStateToProps = (state) => ({ isInStreamBacklink: !!state.selectedBubble, showDocTags: state.service === "base" || state.service === "orcid", showAllDocTypes: (state.service === "base" || state.service === "orcid") && !!state.selectedPaper, + service: state.service, }); export default connect( diff --git a/vis/js/templates/modals/InfoModal.jsx b/vis/js/templates/modals/InfoModal.jsx index 5cdec65e8..15e70955e 100644 --- a/vis/js/templates/modals/InfoModal.jsx +++ b/vis/js/templates/modals/InfoModal.jsx @@ -9,7 +9,6 @@ import BaseInfo from "./infomodal/BaseInfo"; import CovisInfo from "./infomodal/CovisInfo"; import DefaultKMInfo from "./infomodal/DefaultKMInfo"; import DefaultSGInfo from "./infomodal/DefaultSGInfo"; -import GsheetsInfo from "./infomodal/GsheetsInfo"; import PubMedInfo from "./infomodal/PubMedInfo"; import TripleKMInfo from "./infomodal/TripleKMInfo"; import TripleSGInfo from "./infomodal/TripleSGInfo"; diff --git a/vis/js/templates/modals/ResearcherInfoModal.jsx b/vis/js/templates/modals/ResearcherInfoModal.jsx new file mode 100644 index 000000000..1b4675685 --- /dev/null +++ b/vis/js/templates/modals/ResearcherInfoModal.jsx @@ -0,0 +1,65 @@ +import React from "react"; +import { connect } from "react-redux"; +import { Modal } from "react-bootstrap"; + +import { closeResearcherModal } from "../../actions"; +import { STREAMGRAPH_MODE } from "../../reducers/chartType"; + +import ResearcherInfo from "./researcher-modal/OrcidResearcherInfo"; + +const getResearcherInfoTemplate = (service, isStreamgraph, modalType) => { + switch (service) { + case "orcid": + return ResearcherInfo; + default: + return null; + } +}; + +const ResearcherInfoModal = ({open, onClose, params, service, isStreamgraph, modalInfoType}) => { + const ResearcherInfoTemplate = getResearcherInfoTemplate(service, isStreamgraph, modalInfoType); + + return ( + // html template starts here + + + + // html template ends here + ); +}; + + +const mapStateToProps = (state) => ({ + open: state.modals.openResearcherModal, + params: { + ...state.modals.infoParams, + query: state.query.text, + customTitle: state.heading.customTitle, + q_advanced: state.q_advanced.text, + + author_name: state.author.author_name, + author_keywords: state.author.author_keywords, + biography: state.author.biography, + country: state.author.country, + external_identifiers: state.author.external_identifiers, + orcid_id: state.author.orcid_id, + total_citations: state.author.total_citations, + total_neppr: state.author.total_neppr, + total_unique_social_media_mentions: + state.author.total_unique_social_media_mentions, + websites: state.author.websites, + }, + service: state.isCovis ? 
"covis" : state.service, + isStreamgraph: state.chartType === STREAMGRAPH_MODE, + // new parameter from config to render correct type of info modal window + modalInfoType: state.modalInfoType, +}); + +const mapDispatchToProps = (dispatch) => ({ + onClose: () => dispatch(closeResearcherModal()), +}); + +export default connect( + mapStateToProps, + mapDispatchToProps +)(ResearcherInfoModal); diff --git a/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx b/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx new file mode 100644 index 000000000..bc789f29c --- /dev/null +++ b/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx @@ -0,0 +1,77 @@ +import React from "react"; +import { Modal } from "react-bootstrap"; +import { connect } from "react-redux"; + +const ResearcherInfo = ({ + params +}) => { + return ( + // html template starts here + <> + + Metrics + + +

METRICS

+

+ Normalised h-index:
+ {params.normalized_h_index?.toFixed(1)}
+

+

+ Academic age:
+ {params.academic_age}
+

+

+ h-index:
+ {params.h_index}
+

+

+ Number of total citations: {params.total_citations}
+

+

ALTMETRICS

+

+ Number of total unique social media mentions:{" "}
+ {params.total_unique_social_media_mentions}
+

+

+ Number of total news, encyclopaedia, patent and policy references:{" "}
+ {params.total_neppr}
+

+

NOTES ON METRICS

+

{params.biography}

+

OTHER IDs

+

+ {params.external_identifiers.map((external_id) => (
+

+
+ {external_id["type"]}: {external_id["value"]}
+
+

+ ))}
+

+
+ + // html template ends here + ); +}; + +const mapStateToProps = (state) => { + return { + params: { + total_citations: state.author.total_citations, + orcid_id: state.author.orcid_id, + total_unique_social_media_mentions: + state.author.total_unique_social_media_mentions, + total_neppr: state.author.total_neppr, + external_identifiers: state.author.external_identifiers, + h_index: state.author.h_index, + academic_age: state.author.academic_age, + normalized_h_index: state.author.normalized_h_index, + }, + }; +}; + +export default connect(mapStateToProps)(ResearcherInfo); diff --git a/vis/js/utils/data.js b/vis/js/utils/data.js index 27b198eec..22793c40a 100644 --- a/vis/js/utils/data.js +++ b/vis/js/utils/data.js @@ -158,7 +158,9 @@ const getParamSortFunction = (field) => { field === "relevance" || field === "citations" || field === "readers" || - field === "tweets" + field === "tweets" || + field === "social" || + field === "references" ) { return (a, b) => stringCompare(a[field], b[field], "desc"); } diff --git a/vis/stylesheets/modules/_map.scss b/vis/stylesheets/modules/_map.scss index 57e285e47..1edd51f9d 100644 --- a/vis/stylesheets/modules/_map.scss +++ b/vis/stylesheets/modules/_map.scss @@ -244,7 +244,7 @@ text { margin-right: 5px; margin-bottom: 10px; font-size: 9px; - width: 110px; + width: 140px; } } diff --git a/vis/stylesheets/modules/list/_entry.scss b/vis/stylesheets/modules/list/_entry.scss index c7e238750..d0dac0e1b 100644 --- a/vis/stylesheets/modules/list/_entry.scss +++ b/vis/stylesheets/modules/list/_entry.scss @@ -429,7 +429,7 @@ s.resulttype { color: $black; } -.list_metrics_citations, .list_metrics_tweets { +.list_metrics_citations, .list_metrics_tweets, .list_metrics_item { margin-right:10px; } diff --git a/vis/stylesheets/modules/list/_header.scss b/vis/stylesheets/modules/list/_header.scss index cdbcf7000..e5327aa21 100755 --- a/vis/stylesheets/modules/list/_header.scss +++ b/vis/stylesheets/modules/list/_header.scss @@ -35,7 +35,7 @@ #filter_parameter_container .dropdown-menu, #sort_container .dropdown-menu { width: 140px !important; - min-width:140px; + min-width: 140px; } .btn-default.active, .btn-default:active, .btn-default:focus, .btn-default.active:focus, .btn-default.active.focus, .btn-default.focus, .open > .dropdown-toggle.btn-default { diff --git a/vis/stylesheets/modules/map/_header.scss b/vis/stylesheets/modules/map/_header.scss index bf5e5570d..f888cafbb 100644 --- a/vis/stylesheets/modules/map/_header.scss +++ b/vis/stylesheets/modules/map/_header.scss @@ -102,6 +102,11 @@ #infolink { margin-left: 5px; + border-bottom: 1px solid #333; +} + +.headstart #infolink { + border-bottom: 1px solid #333; } #context-scale-toolbar #infolink { From 7ddf2e219070e5552409c4df47cc07090412e506 Mon Sep 17 00:00:00 2001 From: chreman Date: Mon, 2 Sep 2024 23:46:26 +0200 Subject: [PATCH 46/75] academic_age_offset param --- server/services/searchORCID.php | 2 +- server/workers/api/src/apis/request_validators.py | 1 + server/workers/orcid/src/orcid_service.py | 6 ++++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/server/services/searchORCID.php b/server/services/searchORCID.php index aca7f0f5a..6020e3d61 100644 --- a/server/services/searchORCID.php +++ b/server/services/searchORCID.php @@ -12,7 +12,7 @@ $precomputed_id = $_POST["unique_id"] ?? 
null; $params_array = array("orcid", "today"); -$optional_get_params = array("limit"); +$optional_get_params = array("limit", "academic_age_offset"); function filterEmptyString($value) { diff --git a/server/workers/api/src/apis/request_validators.py b/server/workers/api/src/apis/request_validators.py index 2ca6bd67f..4a253bcb8 100644 --- a/server/workers/api/src/apis/request_validators.py +++ b/server/workers/api/src/apis/request_validators.py @@ -34,6 +34,7 @@ class SearchParamSchema(Schema): list_size = fields.Int() custom_title = fields.Str() custom_clustering = fields.Str() + academic_age_offset = fields.Int() @pre_load diff --git a/server/workers/orcid/src/orcid_service.py b/server/workers/orcid/src/orcid_service.py index f49d4bec1..0ce250134 100644 --- a/server/workers/orcid/src/orcid_service.py +++ b/server/workers/orcid/src/orcid_service.py @@ -106,7 +106,7 @@ def enrich_metadata(self, params: Dict[str, str], metadata: pd.DataFrame) -> pd. metadata[c] = np.NaN return metadata - def enrich_author_info(self, author_info: AuthorInfo, metadata: pd.DataFrame) -> Dict[str, str]: + def enrich_author_info(self, author_info: AuthorInfo, metadata: pd.DataFrame, params: Dict[str, str]) -> Dict[str, str]: """ This function enriches the author information with additional information. Specifically, we extract and aggregate metrics data from the author's works, @@ -166,6 +166,8 @@ def extract_year(value): metadata["publication_year"] = metadata["year"].apply(extract_year) academic_age = author_info.academic_age + if academic_age & "academic_age_offset" in params: + academic_age += int(params.get("academic_age_offset")) # Calculate normalized h-index author_info.normalized_h_index = ( @@ -192,7 +194,7 @@ def _retrieve_author_info_and_metadata(self, orcid: Orcid) -> Tuple[Dict[str, st def _process_metadata(self, metadata: pd.DataFrame, author_info: AuthorInfo, params: Dict[str, str]) -> pd.DataFrame: metadata["authors"] = metadata["authors"].replace("", author_info.author_name) metadata = self.enrich_metadata(params, metadata) - author_info = self.enrich_author_info(author_info, metadata) + author_info = self.enrich_author_info(author_info, metadata, params) metadata = metadata.head(int(params.get("limit"))) return metadata From dadf5a2176c2f287374a294f6be81fd160a67c58 Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 3 Sep 2024 11:15:01 +0200 Subject: [PATCH 47/75] academic_age_offset param --- server/workers/orcid/src/orcid_service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/workers/orcid/src/orcid_service.py b/server/workers/orcid/src/orcid_service.py index 0ce250134..c1ebd0d9b 100644 --- a/server/workers/orcid/src/orcid_service.py +++ b/server/workers/orcid/src/orcid_service.py @@ -166,12 +166,12 @@ def extract_year(value): metadata["publication_year"] = metadata["year"].apply(extract_year) academic_age = author_info.academic_age - if academic_age & "academic_age_offset" in params: + if (academic_age and "academic_age_offset" in params): academic_age += int(params.get("academic_age_offset")) # Calculate normalized h-index author_info.normalized_h_index = ( - h_index / academic_age if academic_age & academic_age > 0 else 0 + h_index / academic_age if academic_age and academic_age > 0 else 0 ) return author_info From d877f0ba323999fc68a610b10d090108a923805f Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 3 Sep 2024 11:21:45 +0200 Subject: [PATCH 48/75] academic_age_offset param --- server/workers/api/src/apis/orcid.py | 3 +++ 1 file changed, 3 insertions(+) 
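
Patches 46 to 48 thread an optional academic_age_offset parameter from searchORCID.php through the request validator and the orcid API namespace into enrich_author_info, where it is added to the academic age before the h-index is normalised. A minimal standalone sketch of that calculation, assuming the field and parameter names used in the diffs above (the helper function itself is illustrative and not part of the patches):

    def compute_normalized_h_index(h_index, academic_age, params):
        # Shift the academic age by the request-supplied offset, if any
        if academic_age is not None and "academic_age_offset" in params:
            academic_age += int(params["academic_age_offset"])
        # Avoid dividing by a missing or non-positive academic age
        if not academic_age or academic_age <= 0:
            return 0
        return h_index / academic_age

For example, an h-index of 12 with an academic age of 8 and an offset of 2 gives 12 / (8 + 2) = 1.2; the patches do not spell out the intended semantics of the offset beyond adding it to the age before dividing.
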
diff --git a/server/workers/api/src/apis/orcid.py b/server/workers/api/src/apis/orcid.py index 1c30aaf64..f62cea0af 100644 --- a/server/workers/api/src/apis/orcid.py +++ b/server/workers/api/src/apis/orcid.py @@ -19,6 +19,7 @@ # Constants DEFAULT_LIMIT = 200 REDIS_TIMEOUT = 300 +DEFAULT_ACADEMIC_AGE_OFFSET = 0 # Model definition orcid_querymodel = orcid_ns.model( @@ -74,6 +75,8 @@ def clean_params(self, params): del params["optradio"] if "limit" not in params: params["limit"] = DEFAULT_LIMIT + if "academic_age_offset" not in params: + params["academic_age_offset"] = DEFAULT_ACADEMIC_AGE_OFFSET def get_response_headers(self): headers = {} From fcebe8e2e6b21f7b4d9737e238de995d792e91c5 Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 3 Sep 2024 12:23:31 +0200 Subject: [PATCH 49/75] academic_age_offset param --- server/workers/build_docker_images.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/workers/build_docker_images.sh b/server/workers/build_docker_images.sh index 52feed689..71a2855d0 100755 --- a/server/workers/build_docker_images.sh +++ b/server/workers/build_docker_images.sh @@ -1,6 +1,6 @@ #!/bin/bash SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -services=("api" "persistence" "dataprocessing" "base" "pubmed" "openaire" "orcid") +services=("api" "persistence" "dataprocessing" "base" "pubmed" "openaire" "orcid" "metrics") service_version="`git rev-parse HEAD`" echo "" echo "Building services with version $service_version" From e4242680195fc8cedb5fa3297769bb9d495c3e60 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Wed, 4 Sep 2024 09:22:42 +0200 Subject: [PATCH 50/75] fix: minor python --- server/workers/orcid/requirements.txt | 2 +- server/workers/orcid/src/repositories/works.py | 2 +- vis/js/templates/listentry/Details.jsx | 2 +- vis/js/templates/listentry/StandardListEntry.jsx | 2 -- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/server/workers/orcid/requirements.txt b/server/workers/orcid/requirements.txt index dd278cc32..584804b32 100644 --- a/server/workers/orcid/requirements.txt +++ b/server/workers/orcid/requirements.txt @@ -24,4 +24,4 @@ redis==4.3.6 six==1.16.0 typing-extensions==4.2.0 zipp==3.6.0 -pyorcid @ git+https://github.com/OpenKnowledgeMaps/PyOrcid.git@main \ No newline at end of file +pyorcid @ git+https://github.com/OpenKnowledgeMaps/PyOrcid.git@3d8b30cbc95c2c7bb34369145866d2b252c677b8 \ No newline at end of file diff --git a/server/workers/orcid/src/repositories/works.py b/server/workers/orcid/src/repositories/works.py index 90df09e9c..8fa80f64f 100644 --- a/server/workers/orcid/src/repositories/works.py +++ b/server/workers/orcid/src/repositories/works.py @@ -19,7 +19,7 @@ def get_full_works_metadata(self, limit: int = 10000) -> pd.DataFrame: - pd.DataFrame: The full metadata for all works associated with the ORCID. 
""" - works_data = self.orcid.works_full_metadata(limit=limit) + works_data = self.orcid.works_full_metadata() return self.transform_works_metadata(pd.DataFrame(works_data)) def transform_works_metadata(self, works_data: pd.DataFrame) -> pd.DataFrame: diff --git a/vis/js/templates/listentry/Details.jsx b/vis/js/templates/listentry/Details.jsx index a3fe2a230..fda8db68f 100644 --- a/vis/js/templates/listentry/Details.jsx +++ b/vis/js/templates/listentry/Details.jsx @@ -15,7 +15,7 @@ const Details = ({authors, source, isSelected}) => { ); // console.log("Details.jsx: authorsString: ", authorsString); - console.log("Details.jsx: loc.default_authors: ", loc.default_authors); + // console.log("Details.jsx: loc.default_authors: ", loc.default_authors); return ( // html template starts here diff --git a/vis/js/templates/listentry/StandardListEntry.jsx b/vis/js/templates/listentry/StandardListEntry.jsx index 5b6b82df3..7e04b53ab 100644 --- a/vis/js/templates/listentry/StandardListEntry.jsx +++ b/vis/js/templates/listentry/StandardListEntry.jsx @@ -54,8 +54,6 @@ const StandardListEntry = ({ !showMetrics && (!!citations || parseInt(citations) === 0); - console.log(paper); - return ( // html template starts here
From de22c7c48944f58f040d702b594c742fb3464175 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Wed, 4 Sep 2024 13:52:25 +0200 Subject: [PATCH 51/75] fix: dropdown stylings --- server/workers/orcid/src/orcid_service.py | 5 +++-- vis/js/templates/ScaleToolbar.jsx | 11 ++++++----- vis/js/templates/filtersort/FilterDropdown.jsx | 13 +++++++++---- vis/js/templates/filtersort/SortDropdown.jsx | 16 ++++++++++------ .../researcher-modal/OrcidResearcherInfo.jsx | 12 ++++++------ vis/stylesheets/modules/list/_entry.scss | 16 ++++++++++++++++ 6 files changed, 50 insertions(+), 23 deletions(-) diff --git a/server/workers/orcid/src/orcid_service.py b/server/workers/orcid/src/orcid_service.py index c1ebd0d9b..4285decd3 100644 --- a/server/workers/orcid/src/orcid_service.py +++ b/server/workers/orcid/src/orcid_service.py @@ -166,12 +166,13 @@ def extract_year(value): metadata["publication_year"] = metadata["year"].apply(extract_year) academic_age = author_info.academic_age - if (academic_age and "academic_age_offset" in params): + if (academic_age is not None and "academic_age_offset" in params): academic_age += int(params.get("academic_age_offset")) + author_info.academic_age = academic_age # Calculate normalized h-index author_info.normalized_h_index = ( - h_index / academic_age if academic_age and academic_age > 0 else 0 + h_index / academic_age if academic_age and academic_age > 0 else None ) return author_info diff --git a/vis/js/templates/ScaleToolbar.jsx b/vis/js/templates/ScaleToolbar.jsx index b7be695e1..b07122047 100644 --- a/vis/js/templates/ScaleToolbar.jsx +++ b/vis/js/templates/ScaleToolbar.jsx @@ -33,11 +33,12 @@ const ScaleToolbar = ({ id="scale-menu" noCaret title={ - <> - {localization.scale_by_label}{" "} - {labels[value]}{" "} - - +
+ + {localization.scale_by_label} {labels[value]} + + +
} > {options.map((key) => ( diff --git a/vis/js/templates/filtersort/FilterDropdown.jsx b/vis/js/templates/filtersort/FilterDropdown.jsx index 782f7ffcc..192703456 100644 --- a/vis/js/templates/filtersort/FilterDropdown.jsx +++ b/vis/js/templates/filtersort/FilterDropdown.jsx @@ -27,10 +27,15 @@ const FilterDropdown = ({ id="filter_params" noCaret title={ - <> - {label} {valueLabel}{" "} - - +
+ + {label} {valueLabel} + + +
} > {options.map((o) => ( diff --git a/vis/js/templates/filtersort/SortDropdown.jsx b/vis/js/templates/filtersort/SortDropdown.jsx index 92cec8499..471544a0d 100644 --- a/vis/js/templates/filtersort/SortDropdown.jsx +++ b/vis/js/templates/filtersort/SortDropdown.jsx @@ -24,10 +24,12 @@ const SortDropdown = ({ label, value, valueLabel, options, handleChange }) => { id="sort" noCaret title={ - <> - {label} {valueLabel}{" "} - - +
+ + {label} {valueLabel} + + +
} > {options.map((o) => ( @@ -38,7 +40,9 @@ const SortDropdown = ({ label, value, valueLabel, options, handleChange }) => { onSelect={handleSortChange} active={o.id === value} > - {o.label} +
+ {o.label} +
))} @@ -46,4 +50,4 @@ const SortDropdown = ({ label, value, valueLabel, options, handleChange }) => { ); }; -export default SortDropdown; +export default SortDropdown; \ No newline at end of file diff --git a/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx b/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx index bc789f29c..acc3be26f 100644 --- a/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx +++ b/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx @@ -15,30 +15,30 @@ const ResearcherInfo = ({

METRICS

Normalised h-index: - {params.normalized_h_index?.toFixed(1)} + {params.normalized_h_index ? params.normalized_h_index?.toFixed(1) : "N/A"}

Academic age: - {params.academic_age} + {params.academic_age ? params.academic_age : "N/A"}

h-index: - {params.h_index} + {params.h_index ? params.h_index : "N/A"}

- Number of total citations: {params.total_citations} + Number of total citations: {params.total_citations ? params.total_citations : 'N/A'}

ALTMETRICS

Number of total unique social media mentions:{" "} - {params.total_unique_social_media_mentions} + {params.total_unique_social_media_mentions ? params.total_unique_social_media_mentions : 'N/A'}

Number of total news encyclopaedia, patent and policy references:{" "} - {params.total_neppr} + {params.total_neppr ? params.total_neppr : 'N/A'}

NOTES ON METRICS

{params.biography}

diff --git a/vis/stylesheets/modules/list/_entry.scss b/vis/stylesheets/modules/list/_entry.scss index d0dac0e1b..ca4fcdbcb 100644 --- a/vis/stylesheets/modules/list/_entry.scss +++ b/vis/stylesheets/modules/list/_entry.scss @@ -909,3 +909,19 @@ img#preview_page { color: #cc3b6b; } } + +.dropdown-toggle { + .flex-container { + display: flex; + align-items: center; + justify-content: center; + } +} + +.truncate-text { + width: 100%; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + display: inline-block; +} \ No newline at end of file From 917bdff7faa86b1dcd72c3eb731dfe40d2747fd0 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Fri, 6 Sep 2024 16:53:43 +0200 Subject: [PATCH 52/75] fix: metadata parsing and sanitizing, tooltips --- server/workers/orcid/src/orcid_service.py | 17 +++- .../orcid/src/repositories/author_info.py | 68 +++++++++------- .../workers/orcid/src/repositories/works.py | 77 +++++++++++-------- vis/js/components/Footer.js | 2 +- .../dataprocessing/schemes/defaultScheme.js | 3 +- vis/js/templates/ScaleToolbar.jsx | 59 ++++++++------ .../templates/filtersort/FilterDropdown.jsx | 60 +++++++++------ vis/js/templates/filtersort/SortDropdown.jsx | 59 ++++++++------ 8 files changed, 211 insertions(+), 134 deletions(-) diff --git a/server/workers/orcid/src/orcid_service.py b/server/workers/orcid/src/orcid_service.py index 4285decd3..9c4c374f2 100644 --- a/server/workers/orcid/src/orcid_service.py +++ b/server/workers/orcid/src/orcid_service.py @@ -56,8 +56,9 @@ def execute_search(self, params: Dict[str, str]) -> Dict[str, str]: if metadata.empty: return self._handle_insufficient_results(params, orcid_id) - metadata = self._process_metadata(metadata, author_info, params) + + self.logger.debug('metadata processed inside of _process_metadata') return self._format_response(data=metadata, author_info=author_info, params=params) except ( @@ -200,10 +201,20 @@ def _process_metadata(self, metadata: pd.DataFrame, author_info: AuthorInfo, par return metadata def _format_response(self, data: pd.DataFrame, author_info: AuthorInfo, params: Dict[str, str]) -> Dict[str, str]: - text = pd.concat([data.id, data[["title", "paper_abstract", "subtitle", "published_in", "authors"]] - .apply(lambda x: " ".join(x), axis=1)], axis=1) + self.logger.debug(f"Formatting response for ORCID {params.get('orcid')}") + text = pd.concat( + [ + data.id, + data[["title", "paper_abstract", "subtitle", "published_in", "authors"]] + .fillna('') # Replace NaN values with empty string + .apply(lambda x: " ".join(x.astype(str)), axis=1) # Ensure all elements are strings before joining + ], + axis=1 + ) text.columns = ["id", "content"] + self.logger.debug(f"Returning response for ORCID {params.get('orcid')} len {len(data)}") + response = { "input_data": { "metadata": data.to_json(orient='records'), diff --git a/server/workers/orcid/src/repositories/author_info.py b/server/workers/orcid/src/repositories/author_info.py index a7dbc2d67..f72fef88b 100644 --- a/server/workers/orcid/src/repositories/author_info.py +++ b/server/workers/orcid/src/repositories/author_info.py @@ -1,3 +1,4 @@ +import logging from datetime import datetime from pyorcid import Orcid import pandas as pd @@ -7,6 +8,8 @@ class AuthorInfoRepository: + logger = logging.getLogger(__name__) + def __init__(self, orcid: Orcid) -> None: self.orcid = orcid @@ -72,9 +75,10 @@ def calculate_academic_age(self, data: List[Dict[str, str]]) -> int: # Find the PhD-equivalent end date phd_end_date = None - for entry in data: - # Check if the Role 
matches any PhD-equivalent term - if any(term in entry["Role"].lower() for term in doctoral_terms): + for entry in reversed(data): + # Check if Role exists and is not None, and if it matches any PhD-equivalent term + role = entry.get("Role", "").lower() if entry.get("Role") else "" + if any(term in role for term in doctoral_terms): phd_end_date = entry["end-date"] break @@ -83,7 +87,12 @@ def calculate_academic_age(self, data: List[Dict[str, str]]) -> int: return None # Convert PhD end date to a datetime object - phd_end_date = datetime.strptime(phd_end_date, "%m/%Y") + try: + # Try to parse using "month/year" format + phd_end_date = datetime.strptime(phd_end_date, "%m/%Y") + except ValueError: + # Fallback to parse using "year" format if the above fails + phd_end_date = datetime.strptime(phd_end_date, "%Y") # Calculate the number of years since the PhD current_date = datetime.now() @@ -92,31 +101,36 @@ def calculate_academic_age(self, data: List[Dict[str, str]]) -> int: return academic_age def extract_external_identifiers( - self, - data: List[Dict[str, str]] - ) -> List[ExternalIdentifier]: - external_identifiers = pd.DataFrame( - data - ) + self, + data: List[Dict[str, str]] + ) -> List[ExternalIdentifier]: + external_identifiers = pd.DataFrame(data) - if external_identifiers.empty: - return [] - - external_identifiers = external_identifiers[ - external_identifiers["visibility"] == "public" - ] - external_identifiers["external-id-url"] = external_identifiers[ - "external-id-url" - ].apply(lambda x: x.get("value") if isinstance(x, dict) else "") - - return external_identifiers[ - [ - "external-id-type", - "external-id-url", - "external-id-value", - "external-id-relationship", + if external_identifiers.empty: + return [] + + # Filter the rows where visibility is 'public' + external_identifiers = external_identifiers[ + external_identifiers["visibility"] == "public" ] - ].to_dict(orient="records") + + # Handle the 'external-id-url' column + external_identifiers["external-id-url"] = external_identifiers[ + "external-id-url" + ].apply(lambda x: x.get("value") if isinstance(x, dict) else "") + + # Rename columns by removing the 'external-id-' prefix + external_identifiers.rename(columns=lambda x: x.replace("external-id-", ""), inplace=True) + + # Return the required columns as a list of dictionaries + return external_identifiers[ + [ + "type", + "url", + "value", + "relationship", + ] + ].to_dict(orient="records") def extract_websites(self, researcher_urls: List[Dict[str, str]]) -> List[Website]: urls = pd.DataFrame(researcher_urls) diff --git a/server/workers/orcid/src/repositories/works.py b/server/workers/orcid/src/repositories/works.py index 8fa80f64f..1b1588f7f 100644 --- a/server/workers/orcid/src/repositories/works.py +++ b/server/workers/orcid/src/repositories/works.py @@ -1,10 +1,14 @@ +import logging from dateutil.parser import parse from pyorcid import Orcid import pandas as pd import numpy as np from common.utils import get_nested_value +from typing import Optional class WorksRepository: + logger = logging.getLogger(__name__) + def __init__(self, orcid: Orcid) -> None: self.orcid = orcid @@ -30,8 +34,8 @@ def transform_works_metadata(self, works_data: pd.DataFrame) -> pd.DataFrame: # Perform transformations and store in new DataFrame new_works_data["id"] = works_data.apply(self.get_put_code, axis=1).astype(str) - new_works_data["title"] = works_data.apply(self.get_title, axis=1).astype(str) - new_works_data["subtitle"] = works_data.apply(self.get_subtitle, axis=1).astype(str) + 
new_works_data["title"] = works_data.apply(self.get_title, axis=1) + new_works_data["subtitle"] = works_data.apply(self.get_subtitle, axis=1) new_works_data["authors"] = works_data.apply(self.get_authors, axis=1) new_works_data["paper_abstract"] = works_data.apply( self.get_paper_abstract, axis=1 @@ -41,8 +45,8 @@ def transform_works_metadata(self, works_data: pd.DataFrame) -> pd.DataFrame: new_works_data["resulttype"] = works_data.apply(self.get_resulttype, axis=1).map( lambda x: doc_type_mapping.get(x, "") ) - new_works_data["doi"] = works_data.apply(self.extract_dois, axis=1) new_works_data["subject"] = "" # this needs to come from BASE enrichment + new_works_data["doi"] = works_data.apply(self.extract_doi, axis=1) new_works_data["url"] = works_data.apply(self.get_url, axis=1) new_works_data["link"] = works_data.apply(self.get_link, axis=1) new_works_data["oa_state"] = new_works_data.link.map(lambda x: 1 if x else 2) @@ -63,58 +67,69 @@ def get_authors(self, work) -> str: return "; ".join(authors) def get_title(self, work) -> str: - return get_nested_value(work, ["title", "title", "value"], "") + return get_nested_value(work, ["title", "title", "value"], None) def get_subtitle(self, work) -> str: - return get_nested_value(work, ["title", "subtitle", "value"], "") + return get_nested_value(work, ["title", "subtitle", "value"], None) def get_paper_abstract(self, work) -> str: - return get_nested_value(work, ["short-description"], "") + return get_nested_value(work, ["short-description"], None) def get_resulttype(self, work) -> str: - return get_nested_value(work, ["type"], "") + return get_nested_value(work, ["type"], None) def published_in(self, work) -> str: - return get_nested_value(work, ["journal-title", "value"], "") + return get_nested_value(work, ["journal-title", "value"], None) def get_put_code(self, work) -> str: - return get_nested_value(work, ["put-code"], "") - - def get_url(self, work) -> str: - # Try to get the primary URL - url = get_nested_value(work, ["url", "value"], "") + put_code = get_nested_value(work, ["put-code"], None) + return str(put_code) if put_code else None + + def get_url(self, work) -> Optional[str]: + url = get_nested_value(work, ["url", "value"], None) if url: return url - - # Fallback to checking external IDs if no URL was found + ids = get_nested_value(work, ["external-ids", "external-id"], []) if isinstance(ids, list): for id in ids: - external_url = id.get("external-id-value", "") - if external_url.startswith("http"): - return external_url + external_id_value = id.get("external-id-value", None) + external_id_url = get_nested_value(id, ["external-id-url", "value"], None) + external_id_type = get_nested_value(id, ["external-id-type"], "").lower() + + if external_id_url: + return external_id_url + + if external_id_value.startswith("http"): + return external_id_value - return "" + if external_id_type == "doi": + return f"https://doi.org/{external_id_value}" + if external_id_type == "isbn": + return f"https://books.google.pl/books?vid=ISBN{external_id_value}&redir_esc=y&hl=en" + if external_id_type == "arxiv": + return f"https://arxiv.org/abs/{external_id_value}" + self.logger.warning(f"Unknown external id type: {external_id_type}") + + return None def get_link(self, work) -> str: url = get_nested_value(work, ["url", "value"], "") if url.lower().endswith(".pdf"): return url - return "" + return None - def extract_dois(self, work: pd.DataFrame) -> str: + def extract_doi(self, work: pd.DataFrame) -> Optional[str]: external_ids = get_nested_value(work, 
["external-ids", "external-id"], []) - + if not isinstance(external_ids, list) or not external_ids: - return "" - - dois = [ - eid.get("external-id-value", "") - for eid in external_ids - if eid.get("external-id-type") == "doi" - ] - - return dois[0] if dois else "" + return None + + for eid in external_ids: + if eid.get("external-id-type") == "doi": + return eid.get("external-id-value", None) + + return None def get_publication_date(self, work) -> str: year = get_nested_value(work, ["publication-date", "year", "value"], np.nan) diff --git a/vis/js/components/Footer.js b/vis/js/components/Footer.js index 76f08d0f6..15b39657b 100644 --- a/vis/js/components/Footer.js +++ b/vis/js/components/Footer.js @@ -9,7 +9,7 @@ const Footer = ({service, timestamp, faqsUrl, faqsUrlStr, isStreamgraph}) => { return null; } - if (service.startsWith("triple") || ["base", "pubmed", "openaire"].includes(service)) { + if (service.startsWith("triple") || ["base", "pubmed", "openaire", "orcid"].includes(service)) { return ; } diff --git a/vis/js/dataprocessing/schemes/defaultScheme.js b/vis/js/dataprocessing/schemes/defaultScheme.js index 48ea60406..672cd07c8 100644 --- a/vis/js/dataprocessing/schemes/defaultScheme.js +++ b/vis/js/dataprocessing/schemes/defaultScheme.js @@ -45,7 +45,7 @@ const DEFAULT_SCHEME = [ name: "title", required: true, type: ["string"], - fallback: () => "", + fallback: (loc) => loc?.default_paper_title, }, { name: "paper_abstract", @@ -145,6 +145,7 @@ const DEFAULT_SCHEME = [ }, { name: "cluster_labels", required: true }, { name: "file_hash", type: ["string"], fallback: (loc) => loc.default_hash }, + { name: "citation_count", type: ["number"], fallback: () => 'n/a' }, ]; export default DEFAULT_SCHEME; diff --git a/vis/js/templates/ScaleToolbar.jsx b/vis/js/templates/ScaleToolbar.jsx index b07122047..e8ca974ae 100644 --- a/vis/js/templates/ScaleToolbar.jsx +++ b/vis/js/templates/ScaleToolbar.jsx @@ -3,6 +3,7 @@ import { DropdownButton, MenuItem } from "react-bootstrap"; import { useLocalizationContext } from "../components/LocalizationProvider"; import useMatomo from "../utils/useMatomo"; +import HoverPopover from "./HoverPopover"; const ScaleToolbar = ({ value, @@ -29,30 +30,44 @@ const ScaleToolbar = ({ return (
- - - {localization.scale_by_label} {labels[value]} - - -
+ + {localization.scale_by_label}{" "} + {labels[value]} + } > - {options.map((key) => ( - - {labels[key]} - - ))} - + + + {localization.scale_by_label}{" "} + {labels[value]} + + +
+ } + > + {options.map((key) => ( + + {labels[key]} + + ))} + +
{explanations[value]} diff --git a/vis/js/templates/filtersort/FilterDropdown.jsx b/vis/js/templates/filtersort/FilterDropdown.jsx index 192703456..692c4334e 100644 --- a/vis/js/templates/filtersort/FilterDropdown.jsx +++ b/vis/js/templates/filtersort/FilterDropdown.jsx @@ -1,6 +1,7 @@ import React from "react"; import { DropdownButton, MenuItem } from "react-bootstrap"; import useMatomo from "../../utils/useMatomo"; +import HoverPopover from "../HoverPopover"; const FilterDropdown = ({ label, @@ -23,33 +24,42 @@ const FilterDropdown = ({ return (
- - - {label} {valueLabel} - - -
+ + {label} {valueLabel} + } > - {options.map((o) => ( - - {o.label} - - ))} - + + + {label} {valueLabel} + + +
+ } + > + {options.map((o) => ( + + {o.label} + + ))} + +
); }; diff --git a/vis/js/templates/filtersort/SortDropdown.jsx b/vis/js/templates/filtersort/SortDropdown.jsx index 471544a0d..812a9747a 100644 --- a/vis/js/templates/filtersort/SortDropdown.jsx +++ b/vis/js/templates/filtersort/SortDropdown.jsx @@ -1,6 +1,7 @@ import React from "react"; import { DropdownButton, MenuItem } from "react-bootstrap"; import useMatomo from "../../utils/useMatomo"; +import HoverPopover from "../HoverPopover"; const SortDropdown = ({ label, value, valueLabel, options, handleChange }) => { const { trackEvent } = useMatomo(); @@ -20,34 +21,44 @@ const SortDropdown = ({ label, value, valueLabel, options, handleChange }) => { id="sort_container" style={{ display: "inline-block" }} > - - - {label} {valueLabel} - - - + + {label} {valueLabel} + } > - {options.map((o) => ( - -
- {o.label} + + + {label} {valueLabel} + +
-
- ))} -
+ } + > + {options.map((o) => ( + +
{o.label}
+
+ ))} + + ); }; -export default SortDropdown; \ No newline at end of file +export default SortDropdown; From d0e61cda87d0db1802270a67faea28cc6da38383 Mon Sep 17 00:00:00 2001 From: chreman Date: Tue, 3 Sep 2024 13:46:53 +0200 Subject: [PATCH 53/75] deployment changes --- docker-compose.yml | 6 ++++-- server/workers/api/Dockerfile | 6 ++---- server/workers/persistence/Dockerfile | 4 +--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index cdd88e267..b7b695a58 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -45,16 +45,18 @@ services: BEHIND_PROXY: "${BEHIND_PROXY}" DEFAULT_DATABASE: "${DEFAULT_DATABASE}" FLASK_ENV: "${FLASK_ENV}" + command: ["python", "app.py"] volumes: - ./api_cache:/var/api_cache - - ./server/workers/api/src:/app/api/src - - ./server/workers/common:/app/common + - ./server/workers/api/src:/api + - ./server/workers/common:/api/common depends_on: - redis - base - pubmed - openaire - orcid + - metrics networks: - headstart diff --git a/server/workers/api/Dockerfile b/server/workers/api/Dockerfile index 54c2befc8..a46870ca7 100644 --- a/server/workers/api/Dockerfile +++ b/server/workers/api/Dockerfile @@ -5,7 +5,7 @@ LABEL maintainer="Chris Kittel " RUN apt-get update RUN apt-get install -y gcc git libpq-dev -WORKDIR /app +WORKDIR /api COPY ./workers/api/requirements.txt ./api/requirements.txt @@ -17,7 +17,5 @@ COPY ./workers/api/requirements-e.txt ./api/requirements-e.txt RUN cd api && pip install --no-cache-dir -r requirements-e.txt && cd .. -COPY ./workers/api/src ./api/src +COPY ./workers/api/src ./ COPY ./workers/api/tests ./api/tests - -CMD ["python", "api/src/app.py"] diff --git a/server/workers/persistence/Dockerfile b/server/workers/persistence/Dockerfile index 6feefeec9..b8861e7a3 100644 --- a/server/workers/persistence/Dockerfile +++ b/server/workers/persistence/Dockerfile @@ -13,6 +13,4 @@ RUN pip install git+https://github.com/python-restx/flask-restx COPY workers/common ../common COPY workers/persistence/requirements-e.txt ./ RUN pip install --no-cache-dir -r requirements-e.txt -COPY workers/persistence/src/ ./ - -CMD ["python3", "app.py"] \ No newline at end of file +COPY workers/persistence/src/ ./ \ No newline at end of file From f96f1610fdd93ac8fc2c24cc3355d0a9a0971b45 Mon Sep 17 00:00:00 2001 From: chreman Date: Fri, 6 Sep 2024 12:50:51 +0200 Subject: [PATCH 54/75] temporary PDF deactivation --- server/workers/orcid/src/repositories/works.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/workers/orcid/src/repositories/works.py b/server/workers/orcid/src/repositories/works.py index 1b1588f7f..0ea54e377 100644 --- a/server/workers/orcid/src/repositories/works.py +++ b/server/workers/orcid/src/repositories/works.py @@ -49,7 +49,8 @@ def transform_works_metadata(self, works_data: pd.DataFrame) -> pd.DataFrame: new_works_data["doi"] = works_data.apply(self.extract_doi, axis=1) new_works_data["url"] = works_data.apply(self.get_url, axis=1) new_works_data["link"] = works_data.apply(self.get_link, axis=1) - new_works_data["oa_state"] = new_works_data.link.map(lambda x: 1 if x else 2) + #new_works_data["oa_state"] = new_works_data.link.map(lambda x: 1 if x else 2) + new_works_data["oa_state"] = 2 return new_works_data From e54c709a922fed650d350049dbf85afd6beeaeda Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Tue, 10 Sep 2024 13:37:51 +0200 Subject: [PATCH 55/75] fix: bugs including logs --- server/preprocessing/other-scripts/metrics.R | 90 ++++++++++--------- 
.../api/src/apis/request_validators.py | 7 ++ server/workers/metrics/src/metrics.py | 4 + server/workers/orcid/src/orcid_service.py | 18 +++- .../orcid/src/repositories/author_info.py | 3 +- .../workers/orcid/src/repositories/works.py | 38 ++++---- vis/js/components/KnowledgeMap.js | 9 +- vis/js/default-config.js | 1 + vis/js/templates/Paper.jsx | 47 +++++++++- vis/js/utils/dimensions.js | 2 + 10 files changed, 155 insertions(+), 64 deletions(-) diff --git a/server/preprocessing/other-scripts/metrics.R b/server/preprocessing/other-scripts/metrics.R index 119550db6..fec1e6a32 100644 --- a/server/preprocessing/other-scripts/metrics.R +++ b/server/preprocessing/other-scripts/metrics.R @@ -1,24 +1,26 @@ -library('rAltmetric') -library('rcrossref') +library("rAltmetric") +library("rcrossref") library("plyr") -alog <- getLogger('metrics') +alog <- getLogger("metrics") -enrich_metadata_metrics <- function(metadata){ +enrich_metadata_metrics <- function(metadata) { start.time <- Sys.time() results <- get_altmetrics(metadata$doi) - requested_metrics <- c("cited_by_wikipedia_count", - "cited_by_msm_count", - "cited_by_policies_count", - "cited_by_patents_count", - "cited_by_accounts_count") + requested_metrics <- c( + "cited_by_wikipedia_count", + "cited_by_msm_count", + "cited_by_policies_count", + "cited_by_patents_count", + "cited_by_accounts_count" + ) - if (nrow(results) > 0){ - for (metric in requested_metrics){ - if (!(metric %in% names(results))){ - results[[metric]] = NA + if (nrow(results) > 0) { + for (metric in requested_metrics) { + if (!(metric %in% names(results))) { + results[[metric]] <- NA } } requested_metrics <- c("doi", requested_metrics) @@ -27,53 +29,59 @@ enrich_metadata_metrics <- function(metadata){ # merge the metadata with the results of the altmetrics # don't remove any rows from the metadata, just add the altmetrics to the # output - output <- merge(x=metadata, y=results, by='doi', all.x = TRUE, all.y = FALSE) + output <- merge(x = metadata, y = results, by = "doi", all.x = TRUE, all.y = FALSE) } else { - for (metric in requested_metrics){ - metadata[[metric]] = NA + for (metric in requested_metrics) { + metadata[[metric]] <- NA } alog$info("No altmetrics found for any paper in this dataset.") output <- metadata } output <- add_citations(output) - #Remove duplicate lines - TODO: check for root of this problem - output = unique(output) + # Remove duplicate lines - TODO: check for root of this problem + output <- unique(output) output_json <- toJSON(output) end.time <- Sys.time() time.taken <- end.time - start.time - alog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Time taken:", time.taken, sep=" ")) + alog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Time taken:", time.taken, sep = " ")) - return (output_json) + return(output_json) } -get_altmetrics <- function(dois){ - valid_dois <- unique(dois[which(dois!="")]) +get_altmetrics <- function(dois) { + valid_dois <- unique(dois[which(dois != "")]) results <- data.frame() - for (doi in valid_dois){ - tryCatch({ - metrics <- altmetric_data(altmetrics(doi=doi, apikey="")) - results <- rbind.fill(results, metrics) - }, error = function(err){ - alog$debug(gsub("[\r\n]", "", paste(err, doi, sep=" "))) - }) + for (doi in valid_dois) { + tryCatch( + { + metrics <- altmetric_data(altmetrics(doi = doi, apikey = "")) + results <- rbind.fill(results, metrics) + }, + error = function(err) { + alog$debug(gsub("[\r\n]", "", paste(err, doi, sep = " "))) + } + ) } - return (results) + return(results) } -add_citations <- function(output){ 
+add_citations <- function(output) { dois <- output$doi - valid_dois <- unique(dois[which(dois!="")]) + valid_dois <- unique(dois[which(dois != "")]) - cc <- tryCatch({ - cr_citation_count(doi=valid_dois, async=TRUE) - }, error = function(err){ - alog$debug(gsub("[\r\n]", "", paste(err, doi, sep=" "))) - return(list(doi=dois, count=NA)) - }) - names(cc)[names(cc)=="count"] <- "citation_count" - output = merge(x=output, y=cc, by='doi', all.x = TRUE) - return (output) + cc <- tryCatch( + { + cr_citation_count(doi = valid_dois, async = TRUE) + }, + error = function(err) { + alog$debug(gsub("[\r\n]", "", paste(err, doi, sep = " "))) + return(list(doi = dois, count = NA)) + } + ) + names(cc)[names(cc) == "count"] <- "citation_count" + output <- merge(x = output, y = cc, by = "doi", all.x = TRUE) + return(output) } diff --git a/server/workers/api/src/apis/request_validators.py b/server/workers/api/src/apis/request_validators.py index 4a253bcb8..eb11d84b1 100644 --- a/server/workers/api/src/apis/request_validators.py +++ b/server/workers/api/src/apis/request_validators.py @@ -39,6 +39,9 @@ class SearchParamSchema(Schema): @pre_load def fix_years(self, in_data, **kwargs): + if not in_data: + in_data = {} + from_date = in_data.get('from') to_date = in_data.get('to') if from_date and len(from_date) == 4: @@ -64,11 +67,15 @@ def filter_nonpublic(self, in_data, **kwargs): @pre_load def lang_id_empty_fallback(self, in_data, **kwargs): + if in_data is None: + in_data = {} # Define in_data as an empty dictionary + lang_id = in_data.get("lang_id") if lang_id: lang_id = list(filter(lambda x: x != "", lang_id)) if len(lang_id) == 0: in_data["lang_id"] = ["all-lang"] + return in_data @validates('from_') diff --git a/server/workers/metrics/src/metrics.py b/server/workers/metrics/src/metrics.py index 67155157a..2bf5ccb68 100644 --- a/server/workers/metrics/src/metrics.py +++ b/server/workers/metrics/src/metrics.py @@ -59,6 +59,8 @@ def execute_search(self, params: dict, metadata: str) -> dict: ) stdout, stderr = proc.communicate(json.dumps(data)) + self.logger.debug(f"Stdout: {stdout}") + output = [line for line in stdout.split('\n') if line] errors = [line for line in stderr.split('\n') if line] @@ -67,6 +69,8 @@ def execute_search(self, params: dict, metadata: str) -> dict: raw_metadata = json.loads(output[-2])[0] + self.logger.debug(f"Raw metadata: {raw_metadata}") + if isinstance(raw_metadata, dict) and raw_metadata.get('status') == "error": return raw_metadata diff --git a/server/workers/orcid/src/orcid_service.py b/server/workers/orcid/src/orcid_service.py index 9c4c374f2..50fab7cb3 100644 --- a/server/workers/orcid/src/orcid_service.py +++ b/server/workers/orcid/src/orcid_service.py @@ -92,8 +92,10 @@ def enrich_metadata(self, params: Dict[str, str], metadata: pd.DataFrame) -> pd. 
"params": params, "metadata": metadata.to_json(orient="records"), } + self.logger.debug(f"enrich metadata task data: {task_data}") self.redis_store.rpush("metrics", json.dumps(task_data)) result = get_key(self.redis_store, request_id, 300) + self.logger.debug(f"result: {result}") metadata = pd.DataFrame(json.loads(result["input_data"])) for c in [ "citation_count", @@ -195,20 +197,28 @@ def _retrieve_author_info_and_metadata(self, orcid: Orcid) -> Tuple[Dict[str, st def _process_metadata(self, metadata: pd.DataFrame, author_info: AuthorInfo, params: Dict[str, str]) -> pd.DataFrame: metadata["authors"] = metadata["authors"].replace("", author_info.author_name) + self.logger.debug(f"Enriching metadata for ORCID {params.get('orcid')}") metadata = self.enrich_metadata(params, metadata) + self.logger.debug(f"Enriching author info for ORCID {params.get('orcid')}") author_info = self.enrich_author_info(author_info, metadata, params) metadata = metadata.head(int(params.get("limit"))) return metadata def _format_response(self, data: pd.DataFrame, author_info: AuthorInfo, params: Dict[str, str]) -> Dict[str, str]: self.logger.debug(f"Formatting response for ORCID {params.get('orcid')}") + desired_columns = ["title", "paper_abstract", "subtitle", "published_in", "authors"] + + # Filter the columns to only those that exist in the DataFrame + existing_columns = [col for col in desired_columns if col in data.columns] + + # Proceed with the concatenation using only the existing columns text = pd.concat( [ - data.id, - data[["title", "paper_abstract", "subtitle", "published_in", "authors"]] - .fillna('') # Replace NaN values with empty string + data.id, + data[existing_columns] # Use only the existing columns + .fillna('') # Replace NaN values with an empty string .apply(lambda x: " ".join(x.astype(str)), axis=1) # Ensure all elements are strings before joining - ], + ], axis=1 ) text.columns = ["id", "content"] diff --git a/server/workers/orcid/src/repositories/author_info.py b/server/workers/orcid/src/repositories/author_info.py index f72fef88b..bce8d06a1 100644 --- a/server/workers/orcid/src/repositories/author_info.py +++ b/server/workers/orcid/src/repositories/author_info.py @@ -70,7 +70,8 @@ def calculate_academic_age(self, data: List[Dict[str, str]]) -> int: doctoral_terms = [ "phd", "dphil", "doctorate", "doctoral", "edd", "dsc", "md-phd", "jd-phd", "dr.phil.", "dr.rer.nat.", - "doctor of science", "doctor of education", "doctor's degree" + "doctor of science", "doctor of education", "doctor's degree", + "ph.d" ] # Find the PhD-equivalent end date diff --git a/server/workers/orcid/src/repositories/works.py b/server/workers/orcid/src/repositories/works.py index 1b1588f7f..2fa4590a1 100644 --- a/server/workers/orcid/src/repositories/works.py +++ b/server/workers/orcid/src/repositories/works.py @@ -5,6 +5,7 @@ import numpy as np from common.utils import get_nested_value from typing import Optional +import calendar class WorksRepository: logger = logging.getLogger(__name__) @@ -109,7 +110,7 @@ def get_url(self, work) -> Optional[str]: return f"https://books.google.pl/books?vid=ISBN{external_id_value}&redir_esc=y&hl=en" if external_id_type == "arxiv": return f"https://arxiv.org/abs/{external_id_value}" - self.logger.warning(f"Unknown external id type: {external_id_type}") + self.logger.warning(f"Unknown external id type: {external_id_type}. 
{id}") return None @@ -136,21 +137,28 @@ def get_publication_date(self, work) -> str: month = get_nested_value(work, ["publication-date", "month", "value"], np.nan) day = get_nested_value(work, ["publication-date", "day", "value"], np.nan) - publication_date = "" - parsed_publication_date = publication_date - if year is not np.nan: - publication_date += str(int(year)) - parsed_publication_date = publication_date - if month is not np.nan and month != "00": - publication_date += "-" + str(int(month)) - date_obj = parse(publication_date) - parsed_publication_date = date_obj.strftime("%Y-%m") - if day is not np.nan: - publication_date += "-" + str(int(day)) - date_obj = parse(publication_date) - parsed_publication_date = date_obj.strftime("%Y-%m-%d") - return parsed_publication_date + if year is np.nan or not (1 <= int(year) <= 9999): + return None + year = int(year) + result_date = str(year) + + if month is not np.nan: + month = int(month) + if 1 <= month <= 12: + result_date += f"-{month:02d}" + + if day is not np.nan: + day = int(day) + max_day = calendar.monthrange(year, month)[1] + if 1 <= day <= max_day: + result_date += f"-{day:02d}" + return result_date + return result_date + else: + return str(year) + else: + return str(year) doc_type_mapping = { "book": "Book", diff --git a/vis/js/components/KnowledgeMap.js b/vis/js/components/KnowledgeMap.js index 5f1c63c49..6883c4adf 100644 --- a/vis/js/components/KnowledgeMap.js +++ b/vis/js/components/KnowledgeMap.js @@ -8,6 +8,7 @@ import Paper from "../templates/Paper"; import { mapDispatchToMapEntriesProps } from "../utils/eventhandlers"; import { trackMatomoEvent } from "../utils/useMatomo"; +import { useLocalizationContext } from "./LocalizationProvider"; const KnowledgeMap = (props) => { const { data, areas, zoom, animation } = props; @@ -19,6 +20,7 @@ const KnowledgeMap = (props) => { const { hoveredBubble, bubbleOrder, changeBubbleOrder } = props; const { hoveredPaper, paperOrder, changePaperOrder } = props; const { trackMouseOver } = props; + const localization = useLocalizationContext(); // bubble section const handleAreaMouseOver = (area) => { @@ -131,7 +133,6 @@ const KnowledgeMap = (props) => { { animation={animation} maxSize={height / 2.0} enlargeFactor={enlargeFactor} + baseUnit={baseUnit} + socialMediaLabel={localization.social_media_count_label} + referencesLabel={localization.references_count_label} + citationsLabel={localization.citations_count_label} + readersLabel={localization.readers_count_label} + tweetsLabel={localization.tweets_count_label} /> ); }; diff --git a/vis/js/default-config.js b/vis/js/default-config.js index 85cb7003d..cb711696d 100644 --- a/vis/js/default-config.js +++ b/vis/js/default-config.js @@ -600,6 +600,7 @@ var config = { please_note: "Please note", citation_warning: "we were not able to verify whether this citation is formatted correctly based on the metadata received. Please check before reuse.", time_frame_context_sg: "Please note that we remove time intervals with only a few associated papers during the computation of your streamgraph to increase its readability. 
As a result the time on the x-axis may not align with the time range you selected.",
+        // metrics
         citations_count_label: "citations",
         social_media_count_label: "social media mentions",
         references_count_label: "references outside academia",
diff --git a/vis/js/templates/Paper.jsx b/vis/js/templates/Paper.jsx
index a212b16fc..5e348b10e 100644
--- a/vis/js/templates/Paper.jsx
+++ b/vis/js/templates/Paper.jsx
@@ -96,14 +96,17 @@ class Paper extends React.Component {
   }
 
   render() {
-    const { data, readersLabel, zoom, selected, hovered } = this.props;
+    const { data, zoom, selected, hovered } = this.props;
     const { maxSize, enlargeFactor } = this.props;
     const { onClick, onMouseOver, onMouseOut } = this.props;
     const {title, authors_string: authors, authors_list: authors_list, year, area} = data;
-    const { num_readers: readers, published_in: publisher } = data;
+    const { published_in: publisher } = data;
     const { x, y, width: baseWidth, height: baseHeight } = this.state;
     const { path: basePath, dogEar: baseDogEar } = this.state;
+
+    const { socialMediaLabel, referencesLabel, citationsLabel, readersLabel, tweetsLabel } = this.props;
+    const { num_readers: readers, tweets, citations, social, references } = data;
 
     let { width: realWidth, height: realHeight } =
       this.getCoordinatesAndDimensions();
 
@@ -263,6 +266,46 @@ class Paper extends React.Component {

)} + {!!socialMediaLabel && + typeof readers !== "undefined" && + readers !== null && ( +
+

+ {social} + {socialMediaLabel} +

+
+ )} + {!!citationsLabel && + typeof readers !== "undefined" && + readers !== null && ( +
+

+ {citations} + {citationsLabel} +

+
+ )} + {!!referencesLabel && + typeof readers !== "undefined" && + readers !== null && ( +
+

+ {references} + {referencesLabel} +

+
+ )} + {!!tweetsLabel && + typeof readers !== "undefined" && + readers !== null && ( +
+

+ {tweets} + {tweetsLabel} +

+
+ )} diff --git a/vis/js/utils/dimensions.js b/vis/js/utils/dimensions.js index a928c1b5e..23d543a94 100644 --- a/vis/js/utils/dimensions.js +++ b/vis/js/utils/dimensions.js @@ -1,5 +1,7 @@ import $ from "jquery"; +// ?: fix scaling here? + // these constants are hardcoded dimensions of various headstart parts const TITLE_HEIGHT = 60; const TOOLBAR_HEIGHT = 66; From e533bf6f308eae793407ef065fabc4d797db23c0 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Tue, 10 Sep 2024 13:46:48 +0200 Subject: [PATCH 56/75] fix: works --- server/workers/orcid/src/repositories/works.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/workers/orcid/src/repositories/works.py b/server/workers/orcid/src/repositories/works.py index 2fa4590a1..70da23ada 100644 --- a/server/workers/orcid/src/repositories/works.py +++ b/server/workers/orcid/src/repositories/works.py @@ -40,9 +40,9 @@ def transform_works_metadata(self, works_data: pd.DataFrame) -> pd.DataFrame: new_works_data["authors"] = works_data.apply(self.get_authors, axis=1) new_works_data["paper_abstract"] = works_data.apply( self.get_paper_abstract, axis=1 - ).astype(str) + ) new_works_data["year"] = works_data.apply(self.get_publication_date, axis=1) - new_works_data["published_in"] = works_data.apply(self.published_in, axis=1).astype(str) + new_works_data["published_in"] = works_data.apply(self.published_in, axis=1) new_works_data["resulttype"] = works_data.apply(self.get_resulttype, axis=1).map( lambda x: doc_type_mapping.get(x, "") ) From fd51a0482f8de99bbef2dfc92e432692625be550 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Tue, 10 Sep 2024 15:06:26 +0200 Subject: [PATCH 57/75] fix: paper --- .../dataprocessing/schemes/defaultScheme.js | 4 ++++ vis/js/templates/Paper.jsx | 24 ++++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/vis/js/dataprocessing/schemes/defaultScheme.js b/vis/js/dataprocessing/schemes/defaultScheme.js index 672cd07c8..f1d69c8a4 100644 --- a/vis/js/dataprocessing/schemes/defaultScheme.js +++ b/vis/js/dataprocessing/schemes/defaultScheme.js @@ -146,6 +146,10 @@ const DEFAULT_SCHEME = [ { name: "cluster_labels", required: true }, { name: "file_hash", type: ["string"], fallback: (loc) => loc.default_hash }, { name: "citation_count", type: ["number"], fallback: () => 'n/a' }, + { name: "tweets", fallback: (loc) => loc.default_readers }, + { name: "social", fallback: (loc) => loc.default_readers }, + { name: "references", fallback: (loc) => loc.default_readers }, + { name: "citations", fallback: (loc) => loc.default_readers } ]; export default DEFAULT_SCHEME; diff --git a/vis/js/templates/Paper.jsx b/vis/js/templates/Paper.jsx index 5e348b10e..0342ebfb4 100644 --- a/vis/js/templates/Paper.jsx +++ b/vis/js/templates/Paper.jsx @@ -217,7 +217,7 @@ class Paper extends React.Component {

{readers} - {readersLabel} + {readersLabel || 'n/a'}

)} @@ -272,7 +272,7 @@ class Paper extends React.Component {

{social} - {socialMediaLabel} + {socialMediaLabel || 'n/a'}

)} @@ -282,7 +282,7 @@ class Paper extends React.Component {

{citations} - {citationsLabel} + {citationsLabel || 'n/a'}

)} @@ -292,7 +292,7 @@ class Paper extends React.Component {

{references} - {referencesLabel} + {referencesLabel || 'n/a'}

)} @@ -302,7 +302,7 @@ class Paper extends React.Component {

{tweets} - {tweetsLabel} + {tweetsLabel || 'n/a'}

)} @@ -489,14 +489,10 @@ const getEnlargeFactor = (offsetWidth, scrollHeight) => { return (newWidth / offsetWidth) * (1.0 / (1 - DOGEAR_WIDTH)); }; -const getMetadataHeight = (realHeight, hasReaders, isZoomed) => { - let readersHeight = 0; - if (hasReaders) { - if (isZoomed) { - readersHeight = 22; - } else { - readersHeight = 12; - } +const getMetadataHeight = (realHeight, numOfLabels, isZoomed) => { + let readersHeight = 12; + if (numOfLabels && isZoomed) { + readersHeight =+ numOfLabels * 15; } const height = realHeight - readersHeight; From b949801ec0475e521a2e2624e76a8d5c3ec75732 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Wed, 11 Sep 2024 09:47:41 +0200 Subject: [PATCH 58/75] fix: paper n/a --- vis/js/templates/Paper.jsx | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vis/js/templates/Paper.jsx b/vis/js/templates/Paper.jsx index 0342ebfb4..d8fba7870 100644 --- a/vis/js/templates/Paper.jsx +++ b/vis/js/templates/Paper.jsx @@ -261,8 +261,8 @@ class Paper extends React.Component { readers !== null && (

- {readers} - {readersLabel || 'n/a'} + {readers || 'n/a'} + {readersLabel}

)} @@ -271,8 +271,8 @@ class Paper extends React.Component { readers !== null && (

- {social} - {socialMediaLabel || 'n/a'} + {social || 'n/a'} + {socialMediaLabel}

)} @@ -281,8 +281,8 @@ class Paper extends React.Component { readers !== null && (

- {citations} - {citationsLabel || 'n/a'} + {citations || 'n/a'} + {citationsLabel}

)} @@ -291,8 +291,8 @@ class Paper extends React.Component { readers !== null && (

- {references} - {referencesLabel || 'n/a'} + {references || 'n/a'} + {referencesLabel}

)} @@ -301,8 +301,8 @@ class Paper extends React.Component { readers !== null && (

- {tweets} - {tweetsLabel || 'n/a'} + {tweets || 'n/a'} + {tweetsLabel}

)} From 9d36728d3a5b3e0528b23b9d9e70580fab7c73f3 Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 12 Sep 2024 10:18:49 +0200 Subject: [PATCH 59/75] R metrics metadata handling bugfix --- server/preprocessing/other-scripts/metrics.R | 14 +++++------ .../preprocessing/other-scripts/run_metrics.R | 25 +++++++++++-------- .../workers/dataprocessing/src/headstart.py | 4 +++ server/workers/metrics/src/metrics.py | 2 +- server/workers/orcid/src/orcid_service.py | 2 +- .../workers/orcid/src/repositories/works.py | 4 +-- 6 files changed, 28 insertions(+), 23 deletions(-) diff --git a/server/preprocessing/other-scripts/metrics.R b/server/preprocessing/other-scripts/metrics.R index fec1e6a32..a16ee3795 100644 --- a/server/preprocessing/other-scripts/metrics.R +++ b/server/preprocessing/other-scripts/metrics.R @@ -2,7 +2,7 @@ library("rAltmetric") library("rcrossref") library("plyr") -alog <- getLogger("metrics") +mlog <- getLogger("metrics") enrich_metadata_metrics <- function(metadata) { @@ -34,7 +34,7 @@ enrich_metadata_metrics <- function(metadata) { for (metric in requested_metrics) { metadata[[metric]] <- NA } - alog$info("No altmetrics found for any paper in this dataset.") + mlog$info("No altmetrics found for any paper in this dataset.") output <- metadata } output <- add_citations(output) @@ -42,13 +42,11 @@ enrich_metadata_metrics <- function(metadata) { # Remove duplicate lines - TODO: check for root of this problem output <- unique(output) - output_json <- toJSON(output) - end.time <- Sys.time() time.taken <- end.time - start.time - alog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Time taken:", time.taken, sep = " ")) + mlog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Time taken:", time.taken, sep = " ")) - return(output_json) + return(output) } get_altmetrics <- function(dois) { @@ -61,7 +59,7 @@ get_altmetrics <- function(dois) { results <- rbind.fill(results, metrics) }, error = function(err) { - alog$debug(gsub("[\r\n]", "", paste(err, doi, sep = " "))) + mlog$debug(gsub("[\r\n]", "", paste(err, doi, sep = " "))) } ) } @@ -77,7 +75,7 @@ add_citations <- function(output) { cr_citation_count(doi = valid_dois, async = TRUE) }, error = function(err) { - alog$debug(gsub("[\r\n]", "", paste(err, doi, sep = " "))) + mlog$debug(gsub("[\r\n]", "", paste(err, doi, sep = " "))) return(list(doi = dois, count = NA)) } ) diff --git a/server/preprocessing/other-scripts/run_metrics.R b/server/preprocessing/other-scripts/run_metrics.R index f33e00a7f..c8a85eeea 100644 --- a/server/preprocessing/other-scripts/run_metrics.R +++ b/server/preprocessing/other-scripts/run_metrics.R @@ -29,7 +29,7 @@ if (DEBUG==TRUE){ } -tslog <- getLogger('ts') +mlog <- getLogger('metrics_runner') f <- file("stdin") open(f) @@ -47,7 +47,6 @@ if (!is.null(params$lang_id)) { lang_id <- 'all' } -source("utils.R") source('metrics.R') registerDoParallel(detectCores(all.tests = FALSE, logical = TRUE)-1) @@ -55,18 +54,22 @@ registerDoParallel(detectCores(all.tests = FALSE, logical = TRUE)-1) failed <- list(params=params) tryCatch({ - metadata <- enrich_metadata_metrics(metadata) + if ("doi" %in% names(metadata)) { + # only enrich metadata with metrics if at least one DOI is present + if (!all(is.na(metadata$doi))) { + output <- enrich_metadata_metrics(metadata) + } + } else { + mlog$warn("No DOIs found in metadata") + } }, error=function(err){ - tslog$error(gsub("\n", " ", paste("Metric enrichment failed", service, paste(params, collapse=" "), err, sep="||"))) - failed$query <<- params$q - failed$query_reason <<- err$message + 
mlog$error(gsub("\n", " ", paste("Metric enrichment failed", service, paste(params, collapse=" "), err, sep="||")))
 })
 
-if (exists('metadata')) {
+if (exists('output')) {
+  print(toJSON(output))
+  print(toJSON(output))
+} else {
   print(toJSON(metadata))
   print(toJSON(metadata))
-} else {
-  output_json <- detect_error(failed, service, params)
-  print(output_json)
-  print(output_json)
 }
diff --git a/server/workers/dataprocessing/src/headstart.py b/server/workers/dataprocessing/src/headstart.py
index aefc674e7..ffd3bad76 100644
--- a/server/workers/dataprocessing/src/headstart.py
+++ b/server/workers/dataprocessing/src/headstart.py
@@ -61,11 +61,15 @@ def execute_search(self, params, input_data):
         data["params"] = params
         cmd = [self.command, self.hs, self.wd, q, service]
+        self.logger.debug(f"Executing command: {cmd}")
+        self.logger.debug(f"Input data: {data}")
         proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 encoding="utf-8")
         stdout, stderr = proc.communicate(json.dumps(data))
+        self.logger.debug(f"Stdout: {stdout}")
         output = [o for o in stdout.split('\n') if len(o) > 0]
         error = [o.encode('ascii', errors='replace').decode() for o in stderr.split('\n') if len(o) > 0]
+        self.logger.debug(f"Raw output: {output}")
         self.logger.debug(error)
         try:
             res = pd.DataFrame(json.loads(output[-1])).to_json(orient="records")
diff --git a/server/workers/metrics/src/metrics.py b/server/workers/metrics/src/metrics.py
index 2bf5ccb68..6c55cc710 100644
--- a/server/workers/metrics/src/metrics.py
+++ b/server/workers/metrics/src/metrics.py
@@ -67,7 +67,7 @@ def execute_search(self, params: dict, metadata: str) -> dict:
         if not output:
             raise ValueError("No output received from the subprocess")
 
-        raw_metadata = json.loads(output[-2])[0]
+        raw_metadata = json.loads(output[-2])
 
         self.logger.debug(f"Raw metadata: {raw_metadata}")
 
diff --git a/server/workers/orcid/src/orcid_service.py b/server/workers/orcid/src/orcid_service.py
index 50fab7cb3..b25220a6e 100644
--- a/server/workers/orcid/src/orcid_service.py
+++ b/server/workers/orcid/src/orcid_service.py
@@ -96,7 +96,7 @@ def enrich_metadata(self, params: Dict[str, str], metadata: pd.DataFrame) -> pd.
self.redis_store.rpush("metrics", json.dumps(task_data)) result = get_key(self.redis_store, request_id, 300) self.logger.debug(f"result: {result}") - metadata = pd.DataFrame(json.loads(result["input_data"])) + metadata = pd.DataFrame(result["input_data"]) for c in [ "citation_count", "cited_by_wikipedia_count", diff --git a/server/workers/orcid/src/repositories/works.py b/server/workers/orcid/src/repositories/works.py index daaeea936..b48372a45 100644 --- a/server/workers/orcid/src/repositories/works.py +++ b/server/workers/orcid/src/repositories/works.py @@ -69,13 +69,13 @@ def get_authors(self, work) -> str: return "; ".join(authors) def get_title(self, work) -> str: - return get_nested_value(work, ["title", "title", "value"], None) + return get_nested_value(work, ["title", "title", "value"], "") def get_subtitle(self, work) -> str: return get_nested_value(work, ["title", "subtitle", "value"], None) def get_paper_abstract(self, work) -> str: - return get_nested_value(work, ["short-description"], None) + return get_nested_value(work, ["short-description"], "") def get_resulttype(self, work) -> str: return get_nested_value(work, ["type"], None) From ec52852403864d80850715f4a131fcc327731a59 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Fri, 13 Sep 2024 11:22:57 +0200 Subject: [PATCH 60/75] feat: implement researcher info modal --- server/workers/common/common/r_wrapper.py | 8 +- server/workers/orcid/src/model.py | 68 +++++++- server/workers/orcid/src/orcid_service.py | 10 +- .../orcid/src/repositories/author_info.py | 153 ++++++++++++++++-- .../workers/orcid/src/repositories/works.py | 1 - vis/js/actions/index.js | 3 + vis/js/components/ContextLine.js | 17 +- vis/js/components/Employment.jsx | 36 +++++ vis/js/components/Heading.js | 91 ++++++----- vis/js/components/Modals.js | 2 + vis/js/components/TitleContext.js | 2 +- vis/js/components/Toolbar.js | 7 +- vis/js/default-config.js | 14 +- vis/js/reducers/author.js | 8 +- vis/js/reducers/contextLine.js | 59 +++---- vis/js/reducers/modals.js | 12 ++ vis/js/templates/AuthorImage.jsx | 2 +- vis/js/templates/SubdisciplineTitle.jsx | 36 ++++- vis/js/templates/contextfeatures/Author.jsx | 23 +-- vis/js/templates/contextfeatures/Modifier.jsx | 2 +- .../templates/contextfeatures/NumArticles.jsx | 36 +++-- .../contextfeatures/ResearcherInfo.jsx | 10 +- .../contextfeatures/ResearcherMetricsInfo.jsx | 42 +++++ .../templates/modals/ResearcherInfoModal.jsx | 2 +- .../modals/ResearcherMetricsInfoModal.jsx | 65 ++++++++ .../researcher-modal/OrcidResearcherInfo.jsx | 78 +++++---- .../OrcidResearcherMetricsInfo.jsx | 77 +++++++++ vis/stylesheets/modules/map/_header.scss | 17 +- 28 files changed, 693 insertions(+), 188 deletions(-) create mode 100644 vis/js/components/Employment.jsx create mode 100644 vis/js/templates/contextfeatures/ResearcherMetricsInfo.jsx create mode 100644 vis/js/templates/modals/ResearcherMetricsInfoModal.jsx create mode 100644 vis/js/templates/modals/researcher-modal/OrcidResearcherMetricsInfo.jsx diff --git a/server/workers/common/common/r_wrapper.py b/server/workers/common/common/r_wrapper.py index 8332700a5..73f720db2 100644 --- a/server/workers/common/common/r_wrapper.py +++ b/server/workers/common/common/r_wrapper.py @@ -3,6 +3,8 @@ import copy import json import logging +from typing import Optional +from redis import StrictRedis formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s', @@ -11,10 +13,14 @@ class RWrapper(object): - def __init__(self, wd="./", script="", redis_store=None, + def 
__init__(self, wd="./", script="", redis_store: Optional[StrictRedis] = None, language=None, loglevel="INFO"): + # path should be to where in the docker container the Rscript are + if not redis_store: + raise ValueError("Redis store is required") + self.wd = wd self.command = 'Rscript' self.runner = os.path.abspath(os.path.join(self.wd, script)) diff --git a/server/workers/orcid/src/model.py b/server/workers/orcid/src/model.py index 9741084b8..efa5872ec 100644 --- a/server/workers/orcid/src/model.py +++ b/server/workers/orcid/src/model.py @@ -17,6 +17,63 @@ class ExternalIdentifier(TypedDict): relationship: str +@dataclass +class Employment: + organization: Optional[str] = None + organization_address: Optional[str] = None + department: Optional[str] = None + role: Optional[str] = None + start_date: Optional[str] = None + end_date: Optional[str] = None + +@dataclass +class Membership: + organization: Optional[str] = None + organization_address: Optional[str] = None + department: Optional[str] = None + role: Optional[str] = None + start_date: Optional[str] = None + end_date: Optional[str] = None + +@dataclass +class Amount: + currency: str + value: float + +@dataclass +class Funding: + title: str + type: str + start_date: str + end_date: str + organization: str + organization_address: str + url: str + amount: Optional[Amount] = None + +@dataclass +class Education: + department: Optional[str] = None + role: Optional[str] = None + start_date: Optional[str] = None + end_date: Optional[str] = None + organization: Optional[str] = None + organization_address: Optional[str] = None + url: Optional[str] = None + +# @data + +@dataclass +class PeerReview: + type: Optional[str] = None + role: Optional[str] = None + url: Optional[str] = None + completion_date: Optional[str] = None + organization: Optional[str] = None + organization_address: Optional[str] = None + # organization_address + + @dataclass class AuthorInfo: # TODO: consider renaming it to something more generic, @@ -28,16 +85,21 @@ class AuthorInfo: author_name: Optional[str] = None biography: Optional[str] = None author_keywords: Optional[str] = None - academic_age: Optional[str] = None + academic_age: Optional[int] = None websites: List['Website'] = field(default_factory=list) - external_identifiers: List['ExternalIdentifier'] = field(default_factory=list) + external_identifiers: List[ExternalIdentifier] = field(default_factory=list) countries: List[str] = field(default_factory=list) total_citations: Optional[int] = None total_unique_social_media_mentions: Optional[int] = None total_neppr: Optional[int] = None h_index: Optional[int] = None normalized_h_index: Optional[int] = None - + employment: Optional[Employment] = None + employments: List[Employment] = field(default_factory=list) + funds: List[Funding] = field(default_factory=list) + educations: List[Education] = field(default_factory=list) + memberships: List[Membership] = field(default_factory=list) + peer_reviews: List[PeerReview] = field(default_factory=list) @dataclass class Work: diff --git a/server/workers/orcid/src/orcid_service.py b/server/workers/orcid/src/orcid_service.py index b25220a6e..f028f12f4 100644 --- a/server/workers/orcid/src/orcid_service.py +++ b/server/workers/orcid/src/orcid_service.py @@ -13,6 +13,7 @@ from redis import StrictRedis from typing import Dict from model import AuthorInfo +from dataclasses import asdict class OrcidService: logger = logging.getLogger(__name__) @@ -49,7 +50,8 @@ def create( def execute_search(self, params: Dict[str, str]) -> Dict[str, 
str]: try: orcid_id = params.get("orcid") - # limit = params.get("limit") + if not orcid_id: + raise ValueError("ORCID ID is required.") orcid = self._initialize_orcid(orcid_id) author_info, metadata = self._retrieve_author_info_and_metadata(orcid) @@ -109,7 +111,7 @@ def enrich_metadata(self, params: Dict[str, str], metadata: pd.DataFrame) -> pd. metadata[c] = np.NaN return metadata - def enrich_author_info(self, author_info: AuthorInfo, metadata: pd.DataFrame, params: Dict[str, str]) -> Dict[str, str]: + def enrich_author_info(self, author_info: AuthorInfo, metadata: pd.DataFrame, params: Dict[str, str]) -> AuthorInfo: """ This function enriches the author information with additional information. Specifically, we extract and aggregate metrics data from the author's works, @@ -189,7 +191,7 @@ def _initialize_orcid(self, orcid_id: str) -> Orcid: sandbox=self.sandbox, ) - def _retrieve_author_info_and_metadata(self, orcid: Orcid) -> Tuple[Dict[str, str], pd.DataFrame]: + def _retrieve_author_info_and_metadata(self, orcid: Orcid) -> Tuple[AuthorInfo, pd.DataFrame]: author_info = AuthorInfoRepository(orcid).extract_author_info() metadata = WorksRepository(orcid).get_full_works_metadata() @@ -231,7 +233,7 @@ def _format_response(self, data: pd.DataFrame, author_info: AuthorInfo, params: "text": text.to_json(orient='records') }, # TODO: consider to return model? - "author": author_info.__dict__, + "author": asdict(author_info), "params": params } return response diff --git a/server/workers/orcid/src/repositories/author_info.py b/server/workers/orcid/src/repositories/author_info.py index bce8d06a1..f5452d09f 100644 --- a/server/workers/orcid/src/repositories/author_info.py +++ b/server/workers/orcid/src/repositories/author_info.py @@ -2,10 +2,12 @@ from datetime import datetime from pyorcid import Orcid import pandas as pd +import numpy as np from common.utils import get_nested_value from typing import List, Dict -from model import AuthorInfo, ExternalIdentifier, Website - +from model import AuthorInfo, ExternalIdentifier, Website, Employment, Funding, Education, Membership, PeerReview +from typing import Optional, Any +import calendar class AuthorInfoRepository: logger = logging.getLogger(__name__) @@ -26,6 +28,7 @@ def extract_author_info(self) -> AuthorInfo: author_info.author_keywords = ", ".join(keywords) education, _ = self.orcid.educations() + author_info.educations = self.extract_educations(education) author_info.academic_age = self.calculate_academic_age(education) external_identifiers = self.orcid.external_identifiers()["external-identifier"] @@ -37,21 +40,125 @@ def extract_author_info(self) -> AuthorInfo: researcher_urls = self.orcid.researcher_urls()["researcher-url"] author_info.websites = self.extract_websites(researcher_urls) + employments, _ = self.orcid.employments() + if employments: + author_info.employment = self.extract_employment(employments) + author_info.employments = self.extract_employments(employments) + + memberships, _ = self.orcid.memberships() + if memberships: + author_info.memberships = self.extract_memberships(memberships) + + grants, _ = self.orcid.fundings() + if grants: + author_info.funds = self.extract_funds(grants) + + peer_reviews = self.orcid.peer_reviews() + if peer_reviews: + author_info.peer_reviews = self.extract_peer_reviews(peer_reviews) + return author_info + + def extract_peer_reviews(self, peer_reviews: Any) -> List[PeerReview]: + peer_review_list = [] + + peer_review_groups = peer_reviews.get('group', []) + + for peer_review_group in 
peer_review_groups: + peer_reviews = peer_review_group.get('peer-review-group', []) + for peer_review in peer_reviews: + summary = peer_review.get('peer-review-summary', {}) + organization_name = get_nested_value(summary, ["convening-organization", "name"], None) + organization_address = get_nested_value(summary, ["convening-organization", "address"], {}) + organization_address_str = f"{organization_address.get('city', '')}, {organization_address.get('region', '')}, {organization_address.get('country', '')}" + + + peer_review_list.append(PeerReview( + role=summary.get('reviewer-role', None), + type=summary.get('review-type', None), + url=summary.get('review-url', None), + completion_date=self.get_completion_date(summary), + organization=organization_name, + organization_address=organization_address_str + )) + + return peer_review_list + + def extract_memberships(self, memberships: List[Dict[str, str]]) -> List[Membership]: + return [ + Membership( + organization=membership.get("organization", ""), + organization_address=membership.get("organization-address", ""), + department=membership.get("Department", ""), + role=membership.get("Role", ""), + start_date=membership.get("start-date", ""), + end_date=membership.get("end-date", ""), + ) + for membership in memberships + ] + + def extract_educations(self, educations: List[Dict[str, str]]) -> List[Education]: + return [ + Education( + department=education.get("Department", None), + role=education.get("Role", None), + start_date=education.get("start-date", ""), + end_date=education.get("end-date", ""), + organization=education.get("organization", ""), + organization_address=education.get("organization-address", ""), + url=education.get("url", "") + ) + for education in educations] + + def extract_funds(self, funds: List[Dict[str, str]]) -> List[Funding]: + return [ + Funding( + title=funding.get("title", ""), + type=funding.get("type", ""), + start_date=funding.get("start-date", ""), + end_date=funding.get("end-date", ""), + organization=funding.get("organization", ""), + organization_address=funding.get("organization-address", ""), + url=funding.get("url", "") + ) + for funding in funds + ] + + def extract_employment(self, employments: List[Dict[str, str]]) -> Optional[Employment]: + employment = employments[0] if employments else None + return Employment( + organization=employment.get("organization", None), + organization_address=employment.get("organization-address", None), + department=employment.get("department", None), + role=employment.get("Role", None), + start_date=employment.get("start-date", ""), + end_date=employment.get("end-date", "") + ) if employment else None + + def extract_employments(self, employments: List[Dict[str, str]]) -> List[Employment]: + return [ + Employment( + organization=employment.get("organization", None), + department=employment.get("department", None), + role=employment.get("Role", None), + start_date=employment.get("start-date", ""), + end_date=employment.get("end-date", "") + ) for employment in employments + ] def extract_author_name(self, personal_details: Dict[str, str]) -> str: return " ".join( [ - get_nested_value(personal_details, ["name", "given-names", "value"], ""), - get_nested_value(personal_details, ["name", "family-name", "value"], ""), + str(get_nested_value(personal_details, ["name", "given-names", "value"], "")), + str(get_nested_value(personal_details, ["name", "family-name", "value"], "")) ] ) def extract_biography(self, personal_details: Dict[str, str]) -> str: return ( - 
get_nested_value(personal_details, ["biography", "content"], "") + str(get_nested_value(personal_details, ["biography", "content"], "")) if ( - get_nested_value(personal_details, ["biography", "visibility"], "") == "public" + str(get_nested_value(personal_details, ["biography", "visibility"], "")) == "public" ) else "" ) @@ -65,7 +172,7 @@ def extract_countries(self, addresses: List[Dict[str, str]]) -> List[str]: countries = countries["country"] return countries.tolist() - def calculate_academic_age(self, data: List[Dict[str, str]]) -> int: + def calculate_academic_age(self, data: List[Dict[str, str]]) -> Optional[int]: # Possible terms for a PhD-equivalent role doctoral_terms = [ "phd", "dphil", "doctorate", "doctoral", @@ -131,7 +238,7 @@ def extract_external_identifiers( "value", "relationship", ] - ].to_dict(orient="records") + ].to_dict(orient="records") # type: ignore def extract_websites(self, researcher_urls: List[Dict[str, str]]) -> List[Website]: urls = pd.DataFrame(researcher_urls) @@ -142,4 +249,32 @@ def extract_websites(self, researcher_urls: List[Dict[str, str]]) -> List[Websit urls = urls[urls["visibility"] == "public"] urls["url"] = urls["url"].apply(lambda x: x.get("value")) urls = urls[["url-name", "url"]] - return urls.to_dict(orient="records") + return urls.to_dict(orient="records") # type: ignore + + def get_completion_date(self, summary) -> Optional[str]: + year = get_nested_value(summary, ["completion-date", "year", "value"], np.nan) + month = get_nested_value(summary, ["completion-date", "month", "value"], np.nan) + day = get_nested_value(summary, ["completion-date", "day", "value"], np.nan) + + if year is np.nan or not (1 <= int(year) <= 9999): + return None + + year = int(year) + result_date = str(year) + + if month is not np.nan: + month = int(month) + if 1 <= month <= 12: + result_date += f"-{month:02d}" + + if day is not np.nan: + day = int(day) + max_day = calendar.monthrange(year, month)[1] + if 1 <= day <= max_day: + result_date += f"-{day:02d}" + return result_date + return result_date + else: + return str(year) + else: + return str(year) \ No newline at end of file diff --git a/server/workers/orcid/src/repositories/works.py b/server/workers/orcid/src/repositories/works.py index b48372a45..ad65fd248 100644 --- a/server/workers/orcid/src/repositories/works.py +++ b/server/workers/orcid/src/repositories/works.py @@ -1,5 +1,4 @@ import logging -from dateutil.parser import parse from pyorcid import Orcid import pandas as pd import numpy as np diff --git a/vis/js/actions/index.js b/vis/js/actions/index.js index d13e0ae73..57fb1e771 100644 --- a/vis/js/actions/index.js +++ b/vis/js/actions/index.js @@ -155,6 +155,9 @@ export const closeInfoModal = () => ({ type: "CLOSE_INFO_MODAL" }); export const openResearcherModal = () => ({ type: "OPEN_RESEARCHER_MODAL" }); export const closeResearcherModal = () => ({ type: "CLOSE_RESEARCHER_MODAL" }); +export const openResearcherMetricsModal = () => ({ type: "OPEN_RESEARCHER_METRICS_MODAL" }); +export const closeResearcherMetricsModal = () => ({ type: "CLOSE_RESEARCHER_METRICS_MODAL" }); + export const scaleMap = (value, baseUnit, contentBased, sort) => ({ type: "SCALE", value, diff --git a/vis/js/components/ContextLine.js b/vis/js/components/ContextLine.js index 871d703b9..fa9fa06e8 100644 --- a/vis/js/components/ContextLine.js +++ b/vis/js/components/ContextLine.js @@ -19,6 +19,8 @@ import MoreInfoLink from "../templates/contextfeatures/MoreInfoLink"; import MetadataQuality from 
"../templates/contextfeatures/MetadataQuality"; import ContextTimeFrame from "../templates/contextfeatures/ContextTimeFrame"; import DocumentLang from "../templates/contextfeatures/DocumentLang"; +import ResearcherMetricsInfo from "../templates/contextfeatures/ResearcherMetricsInfo"; +import { Employment } from "./Employment"; import ResearcherInfo from "../templates/contextfeatures/ResearcherInfo"; const defined = (param) => param !== undefined && param !== null; @@ -28,10 +30,9 @@ const defined = (param) => param !== undefined && param !== null; * * It has to be a class component because of the popovers (they use 'this'). */ -class ContextLine extends React.Component { - render() { - const { params, localization, hidden, service } = this.props; - const { popoverContainer } = this.props; +export const ContextLine = (props) => { + const { author, params, localization, hidden, service } = props; + const { popoverContainer } = props; if (hidden) { return null; @@ -44,21 +45,24 @@ class ContextLine extends React.Component { bioLabel={localization.bio_link} livingDates={params.author.livingDates} link={"https://d-nb.info/gnd/" + params.author.id} + author={author} /> )} + {defined(params.dataSource) && ( )} {defined(params.timespan) && ( @@ -112,10 +116,10 @@ class ContextLine extends React.Component { {params.searchLanguage} )} + ); - } } const mapStateToProps = (state) => ({ @@ -126,6 +130,7 @@ const mapStateToProps = (state) => ({ }, service: state.service, localization: state.localization, + author: state.author, }); export default connect(mapStateToProps)(ContextLine); diff --git a/vis/js/components/Employment.jsx b/vis/js/components/Employment.jsx new file mode 100644 index 000000000..df6be8b2a --- /dev/null +++ b/vis/js/components/Employment.jsx @@ -0,0 +1,36 @@ +import React from "react"; +import HoverPopover from '../templates/HoverPopover'; + +export function Employment({ author }) { + return ( + <> + {author?.employment?.role ? ( + + + {author.employment.role} + + + ) : null} + + {author?.employment?.organization ? ( + + + {author.employment.organization} + + + ) : null} + + ); +} \ No newline at end of file diff --git a/vis/js/components/Heading.js b/vis/js/components/Heading.js index eb3651d86..0f7d1fdf1 100644 --- a/vis/js/components/Heading.js +++ b/vis/js/components/Heading.js @@ -1,7 +1,6 @@ import React from "react"; import { connect } from "react-redux"; - import { BasicTitle, ProjectTitle, @@ -13,43 +12,44 @@ import { STREAMGRAPH_MODE } from "../reducers/chartType"; import { queryConcatenator } from "../utils/data"; const Heading = ({ - localization, - zoomed, - query, - bubbleTitle, - headingParams, - streamgraph, - q_advanced, - service, - }) => { + localization, + zoomed, + query, + bubbleTitle, + headingParams, + streamgraph, + q_advanced, + service, + author +}) => { if (zoomed) { const label = streamgraph - ? localization.area_streamgraph - : localization.area; + ? localization.area_streamgraph + : localization.area; return ( - // html template starts here -

- {label}:{" "} - -

- // html template ends here + // html template starts here +

+ {label}:{" "} + +

+ // html template ends here ); } - let queryString = queryConcatenator([query, q_advanced]) + let queryString = queryConcatenator([query, q_advanced]); return ( - // html template starts here -
-

- {renderTitle(localization, queryString, headingParams, service)} -

-
- // html template ends here + // html template starts here +
+

+ {renderTitle(localization, queryString, headingParams, author, service)} +

+
+ // html template ends here ); }; @@ -59,13 +59,13 @@ const mapStateToProps = (state) => ({ query: state.query.text, bubbleTitle: state.selectedBubble ? state.selectedBubble.title : null, headingParams: state.heading, + author: state.author, streamgraph: state.chartType === STREAMGRAPH_MODE, q_advanced: state.q_advanced.text, - // get source BASE or PubMed - service: state.contextLine.dataSource, + // get source BASE or PubMed + service: state.contextLine.dataSource, }); - export default connect(mapStateToProps)(Heading); // This should probably make its way to a more global config @@ -75,8 +75,7 @@ const MAX_LENGTH_CUSTOM = 100; /** * Renders the title for the correct setup. */ -const renderTitle = (localization, query, headingParams, service) => { - +const renderTitle = (localization, query, headingParams, author, service) => { if (headingParams.presetTitle) { return ; } @@ -106,17 +105,29 @@ const renderTitle = (localization, query, headingParams, service) => { } // this condition for BASE service and custom title if its value exists in config params - if (service === "BASE") { - if (headingParams.customTitle) { - return ; - } + if (service === "BASE") { + if (headingParams.customTitle) { + return ( + + ); + } + } + + if (service === 'ORCID') { + if (author?.author_name && author?.orcid_id) { + return ( + + ); } + } return ; } - - return ; + return ; }; const renderViperTitle = (title, acronym, projectId) => { diff --git a/vis/js/components/Modals.js b/vis/js/components/Modals.js index 18359af67..8f6e4bfd8 100644 --- a/vis/js/components/Modals.js +++ b/vis/js/components/Modals.js @@ -9,6 +9,7 @@ import EmbedModal from "../templates/modals/EmbedModal"; import ExportPaperModal from "../templates/modals/ExportPaperModal"; import InfoModal from "../templates/modals/InfoModal"; import ResearcherInfoModal from "../templates/modals/ResearcherInfoModal"; +import ResearcherMetricsInfoModal from "../templates/modals/ResearcherMetricsInfoModal"; import PdfModal from "../templates/modals/PdfModal"; import ViperEditModal from "../templates/modals/ViperEditModal"; import LocalizationProvider from "./LocalizationProvider"; @@ -47,6 +48,7 @@ const Modals = ({ /> )} + {showPDFPreview && ( { return ( -
+
{showAuthor && }
diff --git a/vis/js/components/Toolbar.js b/vis/js/components/Toolbar.js index 789b3488d..30f51f719 100644 --- a/vis/js/components/Toolbar.js +++ b/vis/js/components/Toolbar.js @@ -2,7 +2,7 @@ import React from "react"; import { connect } from "react-redux"; import ScaleToolbar from "../templates/ScaleToolbar"; -import { openInfoModal, openResearcherModal, scaleMap } from "../actions"; +import { openInfoModal, scaleMap } from "../actions"; const Toolbar = ({ showScaleToolbar, @@ -13,8 +13,7 @@ const Toolbar = ({ scaleValue, showCredit, onInfoClick, - onScaleChange, - onResearcherClick, + onScaleChange }) => { if (showScaleToolbar) { const handleScaleChange = (newScaleBy) => { @@ -34,7 +33,6 @@ const Toolbar = ({ value={scaleValue} showCredit={showCredit} onInfoClick={onInfoClick} - onResearcherClick={onResearcherClick} onChange={handleScaleChange} />
@@ -56,7 +54,6 @@ const mapStateToProps = (state) => ({ const mapDispatchToProps = (dispatch) => ({ onInfoClick: () => dispatch(openInfoModal()), - onResearcherClick: () => dispatch(openResearcherModal()), onScaleChange: (value, baseUnit, contentBased, sort) => dispatch(scaleMap(value, baseUnit, contentBased, sort)), }); diff --git a/vis/js/default-config.js b/vis/js/default-config.js index cb711696d..582283a72 100644 --- a/vis/js/default-config.js +++ b/vis/js/default-config.js @@ -203,6 +203,7 @@ var config = { , openaire: "OpenAIRE" , triple_km: "GoTriple" , triple_sg: "GoTriple" + , orcid: 'ORCID' }, localization: { @@ -211,7 +212,7 @@ var config = { search_placeholder: "Search within visualization...", show_list: "Show list", hide_list: "Hide list", - intro_label: "More information", + intro_label: "About the map", readers: "readers", year: "date", authors: "authors", @@ -289,7 +290,7 @@ var config = { search_placeholder: "Suche in der Liste...", show_list: "Liste ausklappen", hide_list: "Liste einklappen", - intro_label: "Mehr Informationen", + intro_label: "Over de kaart", readers: "Leser", year: "Jahr", authors: "Autor", @@ -365,7 +366,7 @@ var config = { search_placeholder: "Suche in der Liste...", show_list: "Liste ausklappen", hide_list: "Liste einklappen", - intro_label: "Mehr Informationen", + intro_label: "Over de kaart", readers: "Leser", year: "Jahr", authors: "Autor", @@ -444,7 +445,7 @@ var config = { search_placeholder: "Search within visualization...", show_list: "Show list", hide_list: "Hide list", - intro_label: "More information", + intro_label: "About the map", readers: "views", year: "date", authors: "authors", @@ -514,7 +515,7 @@ var config = { search_placeholder: "Search within visualization...", show_list: "Show list", hide_list: "Hide list", - intro_label: "More information", + intro_label: "About the map", relevance: "relevance", readers: "citations", year: "year", @@ -609,6 +610,7 @@ var config = { references: "references outside academia", scale_by_infolink_label: 'Find out more', metrics_label: "Metrics", + researcher_details_label: "Researcher details", scale_by_label: "Scale map by:", scale_by_explanation: "The size of the bubbles is relative to the number of documents related to them.", scale_label: { @@ -626,7 +628,7 @@ var config = { search_placeholder: "Search within visualization...", show_list: "Show list", hide_list: "Hide list", - intro_label: "More information", + intro_label: "About the map", relevance: "relevance", readers: "readers", tweets: "tweets", diff --git a/vis/js/reducers/author.js b/vis/js/reducers/author.js index 7247b26de..81d9efc97 100644 --- a/vis/js/reducers/author.js +++ b/vis/js/reducers/author.js @@ -19,7 +19,13 @@ const author = (state = null, action) => { websites: action.author?.websites, h_index: action.author?.h_index, academic_age: action.author?.academic_age, - normalized_h_index: action.author?.normalized_h_index + normalized_h_index: action.author?.normalized_h_index, + // TODO: consider to remove employment? 
+ employment: action.author?.employment, + employments: action.author?.employments, + funds: action.author?.funds, + educations: action.author?.educations, + memberships: action.author?.memberships, }; default: return state; diff --git a/vis/js/reducers/contextLine.js b/vis/js/reducers/contextLine.js index 024bc1d3f..b60ff82ae 100644 --- a/vis/js/reducers/contextLine.js +++ b/vis/js/reducers/contextLine.js @@ -19,53 +19,47 @@ const contextLine = (state = {}, action) => { openAccessCount: config.show_context_oa_number ? papers.filter((p) => p.oa).length : null, - showAuthor: - !!config.is_authorview && - !!context.params && - exists(context.params.author_id) && - exists(context.params.living_dates) && - exists(context.params.image_link), + showAuthor: !!config.is_authorview, author: { - id: - context.params && context.params.author_id - ? String(context.params.author_id).replace(/\([^)]*\)/, "") - : null, - livingDates: context.params ? context.params.living_dates : null, - imageLink: context.params ? context.params.image_link : null, + id: context?.params?.author_id + ? String(context.params.author_id).replace(/\([^)]*\)/, "") + : null, + livingDates: context?.params?.living_dates ?? null, + imageLink: context?.params?.image_link ?? null, }, documentTypes: getDocumentTypes(config, context), dataSource: - typeof config.service_name !== "undefined" - ? config.service_name - : config.service_names[context.service], + typeof config.service_name !== "undefined" + ? config.service_name + : config.service_names[context.service], contentProvider: context.params ? context.params.repo_name : null, paperCount: - config.create_title_from_context_style === "viper" - ? papers.filter((p) => p.resulttype.includes("publication")).length - : null, + config.create_title_from_context_style === "viper" + ? papers.filter((p) => p.resulttype.includes("publication")).length + : null, datasetCount: - config.create_title_from_context_style === "viper" - ? papers.filter((p) => p.resulttype.includes("dataset")).length - : null, + config.create_title_from_context_style === "viper" + ? papers.filter((p) => p.resulttype.includes("dataset")).length + : null, funder: - config.create_title_from_context_style === "viper" && context.params - ? context.params.funder - : null, + config.create_title_from_context_style === "viper" && context.params + ? context.params.funder + : null, projectRuntime: getProjectRuntime(config, context), // probably deprecated, used in base in the past legacySearchLanguage: getLegacySearchLanguage(config, context), // new language version, used in triple searchLanguage: - context.params && context.params.language - ? context.params.language - : null, + context.params && context.params.language + ? context.params.language + : null, timestamp: getTimestamp(config, context), metadataQuality: getMetadataQuality(config, context), // documents language used in new search box documentLang: - context.params && context.params.lang_id - ? getDocumentLanguage(config, context) - : null, + context.params && context.params.lang_id + ? 
getDocumentLanguage(config, context) + : null, }; default: return state; @@ -205,10 +199,7 @@ const getMetadataQuality = (config, context) => { // get documents language from context parameters (from response) const getDocumentLanguage = (config, context) => { - if ( - !context.params || - !context.params.lang_id - ) { + if (!context.params || !context.params.lang_id) { return null; } diff --git a/vis/js/reducers/modals.js b/vis/js/reducers/modals.js index 403a45907..1951622c0 100644 --- a/vis/js/reducers/modals.js +++ b/vis/js/reducers/modals.js @@ -32,6 +32,7 @@ const modals = ( openInfoModal: state.openInfoModal !== undefined && !!action.configObject.show_intro, openResearcherModal: false, + openResearcherMetricsModal: false, infoParams: action.contextObject ? { ...action.contextObject, @@ -89,6 +90,16 @@ const modals = ( ...state, openResearcherModal: false, }; + case "OPEN_RESEARCHER_METRICS_MODAL": + return { + ...state, + openResearcherMetricsModal: true, + }; + case "CLOSE_RESEARCHER_METRICS_MODAL": + return { + ...state, + openResearcherMetricsModal: false, + }; case "SHOW_PREVIEW": return { ...state, @@ -135,6 +146,7 @@ const modals = ( ...state, openInfoModal: false, openResearcherModal: false, + openResearcherMetricsModal: false, openEmbedModal: false, openViperEditModal: false, openCitationModal: false, diff --git a/vis/js/templates/AuthorImage.jsx b/vis/js/templates/AuthorImage.jsx index 7a8d5a953..ac3d88cc3 100644 --- a/vis/js/templates/AuthorImage.jsx +++ b/vis/js/templates/AuthorImage.jsx @@ -4,7 +4,7 @@ import defaultImage from "../../images/author_default.png"; const AuthorImage = ({ url = "" }) => { let link = defaultImage; - if (url !== "") { + if (url) { link = url; } diff --git a/vis/js/templates/SubdisciplineTitle.jsx b/vis/js/templates/SubdisciplineTitle.jsx index a4d43a6b1..4de18ab12 100644 --- a/vis/js/templates/SubdisciplineTitle.jsx +++ b/vis/js/templates/SubdisciplineTitle.jsx @@ -7,13 +7,37 @@ import ContextLine from "../components/ContextLine"; class SubdisciplineTitle extends React.Component { render() { return ( -
- - - -
+ <> +
+ {/*
+
+ +
+
+
*/} + + + + {/*
*/} +
+ ); } } -export default SubdisciplineTitle; +export default SubdisciplineTitle; \ No newline at end of file diff --git a/vis/js/templates/contextfeatures/Author.jsx b/vis/js/templates/contextfeatures/Author.jsx index 4062c2252..33df0bca5 100644 --- a/vis/js/templates/contextfeatures/Author.jsx +++ b/vis/js/templates/contextfeatures/Author.jsx @@ -1,17 +1,22 @@ import React from "react"; -const Author = ({ bioLabel, livingDates, link }) => { +const Author = ({ bioLabel, livingDates, link, author }) => { return ( // html template starts here <> - - {livingDates} - - - - {bioLabel} - - + {livingDates ? ( + + {livingDates} + + ) : null} + + {bioLabel ? ( + + + {bioLabel} + + + ) : null} // html template ends here ); diff --git a/vis/js/templates/contextfeatures/Modifier.jsx b/vis/js/templates/contextfeatures/Modifier.jsx index 7cd0207bb..02c948770 100644 --- a/vis/js/templates/contextfeatures/Modifier.jsx +++ b/vis/js/templates/contextfeatures/Modifier.jsx @@ -16,7 +16,7 @@ const Modifier = ({ popoverContainer, modifier, isStreamgraph }) => { <> {localization.most_recent_label} - {" "} + ); } diff --git a/vis/js/templates/contextfeatures/NumArticles.jsx b/vis/js/templates/contextfeatures/NumArticles.jsx index 7d1c35caa..5f94e2431 100644 --- a/vis/js/templates/contextfeatures/NumArticles.jsx +++ b/vis/js/templates/contextfeatures/NumArticles.jsx @@ -4,17 +4,31 @@ const NumArticles = ({ articlesCount, articlesCountLabel, openAccessArticlesCount = null, + service, children, -}) => ( - // html template starts here - - {articlesCount} {children} - {articlesCountLabel}{" "} - {openAccessArticlesCount !== null && ( - <>({openAccessArticlesCount} open access) - )} - - // html template ends here -); +}) => { + let displayText = `${articlesCount} ${articlesCountLabel}`; + + if (service === 'orcid') { + if (articlesCount > 200) { + displayText = `200 ${children} works`; + } + } + + if (service === 'pubmed' || service === 'base') { + if ((service === 'base' || service === 'pubmed') && articlesCount > 100) { + displayText = `100 ${children} ${articlesCountLabel}`; + } + } + + return ( + + {displayText}{" "} + {openAccessArticlesCount !== null && ( + <>({openAccessArticlesCount} open access) + )} + + ); +}; export default NumArticles; diff --git a/vis/js/templates/contextfeatures/ResearcherInfo.jsx b/vis/js/templates/contextfeatures/ResearcherInfo.jsx index 46dbd5fd1..f90731503 100644 --- a/vis/js/templates/contextfeatures/ResearcherInfo.jsx +++ b/vis/js/templates/contextfeatures/ResearcherInfo.jsx @@ -3,7 +3,7 @@ import { connect } from "react-redux"; import useMatomo from "../../utils/useMatomo"; import { useLocalizationContext } from "../../components/LocalizationProvider"; -import { openResearcherModal } from "../../actions"; +import { openResearcherMetricsModal } from "../../actions"; const ResearcherInfo = ({ onClick }) => { const loc = useLocalizationContext(); @@ -22,12 +22,12 @@ const ResearcherInfo = ({ onClick }) => { return ( // html template starts here - {loc.metrics_label} + {loc.researcher_details_label} @@ -36,7 +36,7 @@ const ResearcherInfo = ({ onClick }) => { }; const mapDispatchToProps = (dispatch) => ({ - onClick: () => dispatch(openResearcherModal()), + onClick: () => dispatch(openResearcherMetricsModal()), }); export default connect(null, mapDispatchToProps)(ResearcherInfo); diff --git a/vis/js/templates/contextfeatures/ResearcherMetricsInfo.jsx b/vis/js/templates/contextfeatures/ResearcherMetricsInfo.jsx new file mode 100644 index 000000000..2147d7b68 --- /dev/null +++ 
b/vis/js/templates/contextfeatures/ResearcherMetricsInfo.jsx @@ -0,0 +1,42 @@ +import React from "react"; +import { connect } from "react-redux"; + +import useMatomo from "../../utils/useMatomo"; +import { useLocalizationContext } from "../../components/LocalizationProvider"; +import { openResearcherMetricsModal } from "../../actions"; + +const ResearcherMetricsInfo = ({ onClick }) => { + const loc = useLocalizationContext(); + const { trackEvent } = useMatomo(); + + const handleClick = () => { + onClick(); + + trackEvent( + "Title & Context line", + "Open researcher metrics modal", + "More researcher metrics info button" + ); + }; + + return ( + // html template starts here + + + {loc.metrics_label} + + + + // html template ends here + ); +}; + +const mapDispatchToProps = (dispatch) => ({ + onClick: () => dispatch(openResearcherMetricsModal()), +}); + +export default connect(null, mapDispatchToProps)(ResearcherMetricsInfo); diff --git a/vis/js/templates/modals/ResearcherInfoModal.jsx b/vis/js/templates/modals/ResearcherInfoModal.jsx index 1b4675685..bcd280e2b 100644 --- a/vis/js/templates/modals/ResearcherInfoModal.jsx +++ b/vis/js/templates/modals/ResearcherInfoModal.jsx @@ -30,7 +30,7 @@ const ResearcherInfoModal = ({open, onClose, params, service, isStreamgraph, mod const mapStateToProps = (state) => ({ - open: state.modals.openResearcherModal, + open: state.modals.openResearcherMetricsModal, params: { ...state.modals.infoParams, query: state.query.text, diff --git a/vis/js/templates/modals/ResearcherMetricsInfoModal.jsx b/vis/js/templates/modals/ResearcherMetricsInfoModal.jsx new file mode 100644 index 000000000..7708da4e0 --- /dev/null +++ b/vis/js/templates/modals/ResearcherMetricsInfoModal.jsx @@ -0,0 +1,65 @@ +import React from "react"; +import { connect } from "react-redux"; +import { Modal } from "react-bootstrap"; + +import { closeResearcherModal } from "../../actions"; +import { STREAMGRAPH_MODE } from "../../reducers/chartType"; + +import ResearcherMetricsInfo from "./researcher-modal/OrcidResearcherMetricsInfo"; + +const getResearcherInfoTemplate = (service, isStreamgraph, modalType) => { + switch (service) { + case "orcid": + return ResearcherMetricsInfo; + default: + return null; + } +}; + +const ResearcherMetricsInfoModal = ({open, onClose, params, service, isStreamgraph, modalInfoType}) => { + const ResearcherInfoTemplate = getResearcherInfoTemplate(service, isStreamgraph, modalInfoType); + + return ( + // html template starts here + + + + // html template ends here + ); +}; + + +const mapStateToProps = (state) => ({ + open: state.modals.openResearcherModal, + params: { + ...state.modals.infoParams, + query: state.query.text, + customTitle: state.heading.customTitle, + q_advanced: state.q_advanced.text, + + author_name: state.author.author_name, + author_keywords: state.author.author_keywords, + biography: state.author.biography, + country: state.author.country, + external_identifiers: state.author.external_identifiers, + orcid_id: state.author.orcid_id, + total_citations: state.author.total_citations, + total_neppr: state.author.total_neppr, + total_unique_social_media_mentions: + state.author.total_unique_social_media_mentions, + websites: state.author.websites, + }, + service: state.isCovis ? 
"covis" : state.service, + isStreamgraph: state.chartType === STREAMGRAPH_MODE, + // new parameter from config to render correct type of info modal window + modalInfoType: state.modalInfoType, +}); + +const mapDispatchToProps = (dispatch) => ({ + onClose: () => dispatch(closeResearcherModal()), +}); + +export default connect( + mapStateToProps, + mapDispatchToProps +)(ResearcherMetricsInfoModal); diff --git a/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx b/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx index acc3be26f..03ba10846 100644 --- a/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx +++ b/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx @@ -6,43 +6,30 @@ const ResearcherInfo = ({ params }) => { return ( - // html template starts here <> - Metrics + Researcher details -

METRICS

-

- Normalised h-index: - {params.normalized_h_index ? params.normalized_h_index?.toFixed(1) : "N/A"} - -

-

- Academic age: - {params.academic_age ? params.academic_age : "N/A"} - -

-

- h-index: - {params.h_index ? params.h_index : "N/A"} - -

-

- Number of total citations: {params.total_citations ? params.total_citations : 'N/A'} -

-

ALTMETRICS

-

- Number of total unique social media mentions:{" "} - {params.total_unique_social_media_mentions ? params.total_unique_social_media_mentions : 'N/A'} -

-

- Number of total news encyclopaedia, patent and policy references:{" "} - {params.total_neppr ? params.total_neppr : 'N/A'} -

-

NOTES ON METRICS

-

{params.biography}

-

OTHER IDs

+

EMPLOYMENT

+ {params.employments?.map((employment) => ( +

+ {employment.role} / {employment.start_date} - {employment.end_date} / {employment.organization} / {employment.organization_address} +

+ ))} +

EDUCATION & QUALIFICATION

+ {params.educations?.map((education) => ( +

+ {education.role} / {education.start_date} - {education.end_date} / {education.organization} / {education.organization_address} +

+ ))} +

GRANTS ({params.funds?.length})

+ {params.funds?.map((fund) => ( +

+ {fund.title} / {fund.start_date} - {fund.end_date} / Funder: {fund.organization} / Amount: {fund.amount?.value} {fund.amount?.currency} +

+ ))} +

LINKS

{params.external_identifiers.map((external_id) => (

@@ -52,24 +39,33 @@ const ResearcherInfo = ({

))}

+

DISTINCTIONS / AWARDS

+

MEMBERSHIPS

+ {params.memberships?.map((membership) => ( +

+ {membership.role} / {membership.start_date} - {membership.end_date} / {membership.organization} / {membership.organization_address} +

+ ))} +

PEER REVIEWS

+ {params.peer_reviews?.map((peer_review) => ( +

+ {peer_review.role} / {peer_review.completion_date} / {peer_review.organization} / {peer_review.organization_address} +

+ ))}
-    // html template ends here
   );
 };
 
 const mapStateToProps = (state) => {
   return {
     params: {
-      total_citations: state.author.total_citations,
       orcid_id: state.author.orcid_id,
-      total_unique_social_media_mentions:
-        state.author.total_unique_social_media_mentions,
-      total_neppr: state.author.total_neppr,
       external_identifiers: state.author.external_identifiers,
-      h_index: state.author.h_index,
-      academic_age: state.author.academic_age,
-      normalized_h_index: state.author.normalized_h_index,
+      funds: state.author.funds,
+      educations: state.author.educations,
+      employments: state.author.employments,
+      memberships: state.author.memberships,
     },
   };
 };
diff --git a/vis/js/templates/modals/researcher-modal/OrcidResearcherMetricsInfo.jsx b/vis/js/templates/modals/researcher-modal/OrcidResearcherMetricsInfo.jsx
new file mode 100644
index 000000000..3dc70a0cf
--- /dev/null
+++ b/vis/js/templates/modals/researcher-modal/OrcidResearcherMetricsInfo.jsx
@@ -0,0 +1,77 @@
+import React from "react";
+import { Modal } from "react-bootstrap";
+import { connect } from "react-redux";
+
+const ResearcherMetricsInfo = ({
+  params
+}) => {
+  return (
+    // html template starts here
+    <>
+      Metrics
+      METRICS
+      Normalised h-index: {params.normalized_h_index ? params.normalized_h_index?.toFixed(1) : "N/A"}
+      Academic age: {params.academic_age ? params.academic_age : "N/A"}
+      h-index: {params.h_index ? params.h_index : "N/A"}
+      Number of total citations: {params.total_citations ? params.total_citations : 'N/A'}
+      ALTMETRICS
+      Number of total unique social media mentions:{" "} {params.total_unique_social_media_mentions ? params.total_unique_social_media_mentions : 'N/A'}
+      Number of total news encyclopaedia, patent and policy references:{" "} {params.total_neppr ? params.total_neppr : 'N/A'}
+      NOTES ON METRICS
+      {params.biography}
+      OTHER IDs
+      {params.external_identifiers.map((external_id) => (
+        {external_id["type"]}: {external_id["value"]}
+      ))}
+ + // html template ends here + ); +}; + +const mapStateToProps = (state) => { + return { + params: { + total_citations: state.author.total_citations, + orcid_id: state.author.orcid_id, + total_unique_social_media_mentions: + state.author.total_unique_social_media_mentions, + total_neppr: state.author.total_neppr, + external_identifiers: state.author.external_identifiers, + h_index: state.author.h_index, + academic_age: state.author.academic_age, + normalized_h_index: state.author.normalized_h_index, + }, + }; +}; + +export default connect(mapStateToProps)(ResearcherMetricsInfo); diff --git a/vis/stylesheets/modules/map/_header.scss b/vis/stylesheets/modules/map/_header.scss index f888cafbb..2bbb1895a 100644 --- a/vis/stylesheets/modules/map/_header.scss +++ b/vis/stylesheets/modules/map/_header.scss @@ -1,7 +1,12 @@ -#subdiscipline_title { - margin: 0px; +#title_context { + display: flex; + align-items: center; padding-top: 15px; padding-left: 50px; +} + +#subdiscipline_title { + margin: 0px; font-family: $base-font-family; min-height: 54px; @@ -313,6 +318,14 @@ margin-right: 1%; height: 15px; margin-bottom: 6px; + + &.overflow-ellipsis { + text-overflow: ellipsis; + max-width: 65px; + overflow: hidden; + white-space: nowrap; + vertical-align: bottom; + } } //.context_metadata_high { From 74e7f99e58b595351d5eeb01f6b718723a63fa19 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Fri, 13 Sep 2024 12:35:29 +0200 Subject: [PATCH 61/75] fix: enrich researcher retrics and add missed data --- server/workers/orcid/src/model.py | 19 ++++++++-- .../orcid/src/repositories/author_info.py | 35 ++++++++++++++++++- vis/js/reducers/author.js | 1 + .../contextfeatures/ResearcherInfo.jsx | 4 +-- .../templates/modals/ResearcherInfoModal.jsx | 2 +- .../modals/ResearcherMetricsInfoModal.jsx | 6 ++-- .../researcher-modal/OrcidResearcherInfo.jsx | 15 +++++--- 7 files changed, 69 insertions(+), 13 deletions(-) diff --git a/server/workers/orcid/src/model.py b/server/workers/orcid/src/model.py index efa5872ec..f6bb1a678 100644 --- a/server/workers/orcid/src/model.py +++ b/server/workers/orcid/src/model.py @@ -19,15 +19,28 @@ class ExternalIdentifier(TypedDict): @dataclass class Employment: + id: str + organization: Optional[str] = None + organization_address: Optional[str] = None + department: Optional[str] = None + role: Optional[str] = None + start_date: Optional[str] = None + end_date: Optional[str] = None + +@dataclass +class Distinction: + id: str organization: Optional[str] = None organization_address: Optional[str] = None department: Optional[str] = None role: Optional[str] = None start_date: Optional[str] = None end_date: Optional[str] = None + url: Optional[str] = None @dataclass class Membership: + id: str organization: Optional[str] = None organization_address: Optional[str] = None department: Optional[str] = None @@ -42,6 +55,7 @@ class Amount: @dataclass class Funding: + id: str title: str type: str start_date: str @@ -53,6 +67,7 @@ class Funding: @dataclass class Education: + id: str department: Optional[str] = None role: Optional[str] = None start_date: Optional[str] = None @@ -61,10 +76,9 @@ class Education: organization_address: Optional[str] = None url: Optional[str] = None -# @data - @dataclass class PeerReview: + id: str type: Optional[str] = None role: Optional[str] = None url: Optional[str] = None @@ -100,6 +114,7 @@ class AuthorInfo: educations: List[Education] = field(default_factory=list) memberships: List[Membership] = field(default_factory=list) peer_reviews: List[PeerReview] = 
field(default_factory=list) + distinctions: List[Distinction] = field(default_factory=list) @dataclass class Work: diff --git a/server/workers/orcid/src/repositories/author_info.py b/server/workers/orcid/src/repositories/author_info.py index f5452d09f..40fe3d7fc 100644 --- a/server/workers/orcid/src/repositories/author_info.py +++ b/server/workers/orcid/src/repositories/author_info.py @@ -5,10 +5,19 @@ import numpy as np from common.utils import get_nested_value from typing import List, Dict -from model import AuthorInfo, ExternalIdentifier, Website, Employment, Funding, Education, Membership, PeerReview +from model import AuthorInfo, ExternalIdentifier, Website, Employment, Funding, Education, Membership, PeerReview, Distinction from typing import Optional, Any import calendar +import hashlib +import time + + +def unique_id(): + unique_string = str(time.time()).encode() # Encode a unique string (e.g., timestamp) + short_unique_id = hashlib.md5(unique_string).hexdigest()[:8] # Get first 8 characters of the hash + return short_unique_id + class AuthorInfoRepository: logger = logging.getLogger(__name__) @@ -57,6 +66,10 @@ def extract_author_info(self) -> AuthorInfo: if peer_reviews: author_info.peer_reviews = self.extract_peer_reviews(peer_reviews) + distinctions, _ = self.orcid.distinctions() + if distinctions: + author_info.distinctions = self.extract_distinctions(distinctions) + return author_info def extract_peer_reviews(self, peer_reviews: Any) -> List[PeerReview]: @@ -74,6 +87,7 @@ def extract_peer_reviews(self, peer_reviews: Any) -> List[PeerReview]: peer_review_list.append(PeerReview( + id=unique_id(), role=summary.get('reviewer-role', None), type=summary.get('review-type', None), url=summary.get('review-url', None), @@ -87,6 +101,7 @@ def extract_peer_reviews(self, peer_reviews: Any) -> List[PeerReview]: def extract_memberships(self, memberships: List[Dict[str, str]]) -> List[Membership]: return [ Membership( + id=unique_id(), organization=membership.get("organization", ""), organization_address=membership.get("organization-address", ""), department=membership.get("Department", ""), @@ -100,6 +115,7 @@ def extract_memberships(self, memberships: List[Dict[str, str]]) -> List[Members def extract_educations(self, educations: List[Dict[str, str]]) -> List[Education]: return [ Education( + id=unique_id(), department=education.get("Department", None), role=education.get("Role", None), start_date=education.get("start-date", ""), @@ -110,9 +126,24 @@ def extract_educations(self, educations: List[Dict[str, str]]) -> List[Education ) for education in educations] + def extract_distinctions(self, distinctions: List[Dict[str, str]]) -> List[Distinction]: + return [ + Distinction( + id=unique_id(), + department=education.get("Department", None), + role=education.get("Role", None), + start_date=education.get("start-date", ""), + end_date=education.get("end-date", ""), + organization=education.get("organization", ""), + organization_address=education.get("organization-address", ""), + url=education.get("url", "") + ) + for education in distinctions] + def extract_funds(self, funds: List[Dict[str, str]]) -> List[Funding]: return [ Funding( + id=unique_id(), title=funding.get("title", ""), type=funding.get("type", ""), start_date=funding.get("start-date", ""), @@ -127,6 +158,7 @@ def extract_funds(self, funds: List[Dict[str, str]]) -> List[Funding]: def extract_employment(self, employments: List[Dict[str, str]]) -> Optional[Employment]: employment = employments[0] if employments else None return 
Employment( + id=unique_id(), organization=employment.get("organization", None), organization_address=employment.get("organization-address", None), department=employment.get("department", None), @@ -138,6 +170,7 @@ def extract_employment(self, employments: List[Dict[str, str]]) -> Optional[Empl def extract_employments(self, employments: List[Dict[str, str]]) -> List[Employment]: return [ Employment( + id=unique_id(), organization=employment.get("organization", None), department=employment.get("department", None), role=employment.get("Role", None), diff --git a/vis/js/reducers/author.js b/vis/js/reducers/author.js index 81d9efc97..f8f1ddd47 100644 --- a/vis/js/reducers/author.js +++ b/vis/js/reducers/author.js @@ -26,6 +26,7 @@ const author = (state = null, action) => { funds: action.author?.funds, educations: action.author?.educations, memberships: action.author?.memberships, + distinctions: action.author?.distinctions, }; default: return state; diff --git a/vis/js/templates/contextfeatures/ResearcherInfo.jsx b/vis/js/templates/contextfeatures/ResearcherInfo.jsx index f90731503..7beb02a7e 100644 --- a/vis/js/templates/contextfeatures/ResearcherInfo.jsx +++ b/vis/js/templates/contextfeatures/ResearcherInfo.jsx @@ -3,7 +3,7 @@ import { connect } from "react-redux"; import useMatomo from "../../utils/useMatomo"; import { useLocalizationContext } from "../../components/LocalizationProvider"; -import { openResearcherMetricsModal } from "../../actions"; +import { openResearcherModal } from "../../actions"; const ResearcherInfo = ({ onClick }) => { const loc = useLocalizationContext(); @@ -36,7 +36,7 @@ const ResearcherInfo = ({ onClick }) => { }; const mapDispatchToProps = (dispatch) => ({ - onClick: () => dispatch(openResearcherMetricsModal()), + onClick: () => dispatch(openResearcherModal()), }); export default connect(null, mapDispatchToProps)(ResearcherInfo); diff --git a/vis/js/templates/modals/ResearcherInfoModal.jsx b/vis/js/templates/modals/ResearcherInfoModal.jsx index bcd280e2b..1b4675685 100644 --- a/vis/js/templates/modals/ResearcherInfoModal.jsx +++ b/vis/js/templates/modals/ResearcherInfoModal.jsx @@ -30,7 +30,7 @@ const ResearcherInfoModal = ({open, onClose, params, service, isStreamgraph, mod const mapStateToProps = (state) => ({ - open: state.modals.openResearcherMetricsModal, + open: state.modals.openResearcherModal, params: { ...state.modals.infoParams, query: state.query.text, diff --git a/vis/js/templates/modals/ResearcherMetricsInfoModal.jsx b/vis/js/templates/modals/ResearcherMetricsInfoModal.jsx index 7708da4e0..2c237d735 100644 --- a/vis/js/templates/modals/ResearcherMetricsInfoModal.jsx +++ b/vis/js/templates/modals/ResearcherMetricsInfoModal.jsx @@ -2,7 +2,7 @@ import React from "react"; import { connect } from "react-redux"; import { Modal } from "react-bootstrap"; -import { closeResearcherModal } from "../../actions"; +import { closeResearcherMetricsModal } from "../../actions"; import { STREAMGRAPH_MODE } from "../../reducers/chartType"; import ResearcherMetricsInfo from "./researcher-modal/OrcidResearcherMetricsInfo"; @@ -30,7 +30,7 @@ const ResearcherMetricsInfoModal = ({open, onClose, params, service, isStreamgra const mapStateToProps = (state) => ({ - open: state.modals.openResearcherModal, + open: state.modals.openResearcherMetricsModal, params: { ...state.modals.infoParams, query: state.query.text, @@ -56,7 +56,7 @@ const mapStateToProps = (state) => ({ }); const mapDispatchToProps = (dispatch) => ({ - onClose: () => dispatch(closeResearcherModal()), + 
onClose: () => dispatch(closeResearcherMetricsModal()), }); export default connect( diff --git a/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx b/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx index 03ba10846..1373d8cce 100644 --- a/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx +++ b/vis/js/templates/modals/researcher-modal/OrcidResearcherInfo.jsx @@ -23,7 +23,7 @@ const ResearcherInfo = ({ {education.role} / {education.start_date} - {education.end_date} / {education.organization} / {education.organization_address}

       ))}
-      GRANTS ({params.funds?.length})
+      GRANTS ({params.funds?.length || 0})
       {params.funds?.map((fund) => (
         {fund.title} / {fund.start_date} - {fund.end_date} / Funder: {fund.organization} / Amount: {fund.amount?.value} {fund.amount?.currency}
@@ -33,20 +33,26 @@ const ResearcherInfo = ({
       {params.external_identifiers.map((external_id) => (
+        {external_id["type"]}: {external_id["value"]}
       ))}
-      DISTINCTIONS / AWARDS
-      MEMBERSHIPS
+      DISTINCTIONS / AWARDS ({params.distinctions?.length || 0})
+      {params.distinctions?.map((distinction) => (
+        {distinction.title} / {distinction.start_date} - {distinction.end_date} / {distinction.organization} / {distinction.organization_address}
+      ))}
+      MEMBERSHIPS ({params.memberships?.length || 0})
       {params.memberships?.map((membership) => (
         {membership.role} / {membership.start_date} - {membership.end_date} / {membership.organization} / {membership.organization_address}
       ))}
-      PEER REVIEWS
+      PEER REVIEWS ({params.peer_reviews?.length || 0})
       {params.peer_reviews?.map((peer_review) => (

{peer_review.role} / {peer_review.completion_date} / {peer_review.organization} / {peer_review.organization_address} @@ -66,6 +72,7 @@ const mapStateToProps = (state) => { educations: state.author.educations, employments: state.author.employments, memberships: state.author.memberships, + distinctions: state.author.distinctions, }, }; }; From 57987cfede73681fac8f0f1ce9501e8d6eb3fccb Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Tue, 17 Sep 2024 11:40:28 +0200 Subject: [PATCH 62/75] fix: add omitted pyorcid update --- server/workers/orcid/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/workers/orcid/requirements.txt b/server/workers/orcid/requirements.txt index 584804b32..8e6caa442 100644 --- a/server/workers/orcid/requirements.txt +++ b/server/workers/orcid/requirements.txt @@ -24,4 +24,4 @@ redis==4.3.6 six==1.16.0 typing-extensions==4.2.0 zipp==3.6.0 -pyorcid @ git+https://github.com/OpenKnowledgeMaps/PyOrcid.git@3d8b30cbc95c2c7bb34369145866d2b252c677b8 \ No newline at end of file +pyorcid @ git+https://github.com/OpenKnowledgeMaps/PyOrcid.git@2de742394155266dc975823ea39f8baf37892cd1 \ No newline at end of file From c3fc8973f81cdad8ac324b0fe6887dd10e816f34 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Tue, 17 Sep 2024 11:43:25 +0200 Subject: [PATCH 63/75] feat: add some tests --- test.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 test.py diff --git a/test.py b/test.py new file mode 100644 index 000000000..9cb8f5a80 --- /dev/null +++ b/test.py @@ -0,0 +1,44 @@ +from dataclasses import dataclass +import json +from typing import Optional + +@dataclass +class AuthorInfo: + # TODO: consider to rename it to something more generic, + # TODO: so that it will be possible to reuse between different client integerations + orcid_id: str + # TODO: consider to store First Name and Last Name separately, usually it's better + # TODO: for data computation. 
The only issue we may face with this approach is that some + # TODO: data integrations may return author name as single field + author_name: Optional[str] = None + biography: Optional[str] = None + author_keywords: Optional[str] = None + academic_age: Optional[str] = None + total_citations: Optional[int] = None + total_unique_social_media_mentions: Optional[int] = None + total_neppr: Optional[int] = None + h_index: Optional[int] = None + normalized_h_index: Optional[int] = None + + def to_dict(self): + return { + 'orcid_id': self.orcid_id, + 'author_name': self.author_name, + 'biography': self.biography, + 'author_keywords': self.author_keywords, + 'academic_age': self.academic_age, + 'total_citations': self.total_citations, + 'total_unique_social_media_mentions': self.total_unique_social_media_mentions, + 'total_neppr': self.total_neppr, + 'h_index': self.h_index, + 'normalized_h_index': self.normalized_h_index + } + + +author = AuthorInfo( + orcid_id='0000-0002-1825-0097', +) + +author.total_unique_social_media_mentions = 20 + +print(json.dumps(vars(author))) \ No newline at end of file From c8cc5fcd1753323b6c3df1ce65d02b48d34ae376 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Tue, 17 Sep 2024 11:48:14 +0200 Subject: [PATCH 64/75] feat: add mac gitignore --- .gitignore | 2 ++ server/preprocessing/resources/additional_stopwords.txt | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index e5b535348..35adbffb4 100644 --- a/.gitignore +++ b/.gitignore @@ -75,3 +75,5 @@ server/preprocessing/other-scripts/renv local_dev/config_local_headstart.ini local_dev/config_local_searchflow.ini +# mac os +.DS_Store diff --git a/server/preprocessing/resources/additional_stopwords.txt b/server/preprocessing/resources/additional_stopwords.txt index 9ddc3f98a..2af358387 100644 --- a/server/preprocessing/resources/additional_stopwords.txt +++ b/server/preprocessing/resources/additional_stopwords.txt @@ -599,4 +599,4 @@ scipo psy relig anthro-se -socio \ No newline at end of file +socio From 69d554853f4870a9e6e949cd94cb27177ba2c194 Mon Sep 17 00:00:00 2001 From: Krutilin Sergey Date: Tue, 17 Sep 2024 12:48:39 +0200 Subject: [PATCH 65/75] fix: escaping --- .../orcid/src/repositories/author_info.py | 30 +++++++++---------- server/workers/orcid/src/worker.py | 3 +- vis/js/dataprocessing/managers/DataManager.js | 8 ++--- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/server/workers/orcid/src/repositories/author_info.py b/server/workers/orcid/src/repositories/author_info.py index 40fe3d7fc..10fbc9667 100644 --- a/server/workers/orcid/src/repositories/author_info.py +++ b/server/workers/orcid/src/repositories/author_info.py @@ -80,21 +80,21 @@ def extract_peer_reviews(self, peer_reviews: Any) -> List[PeerReview]: for peer_review_group in peer_review_groups: peer_reviews = peer_review_group.get('peer-review-group', []) for peer_review in peer_reviews: - summary = peer_review.get('peer-review-summary', {}) - organization_name = get_nested_value(summary, ["convening-organization", "name"], None) - organization_address = get_nested_value(summary, ["convening-organization", "address"], {}) - organization_address_str = f"{organization_address.get('city', '')}, {organization_address.get('region', '')}, {organization_address.get('country', '')}" - - - peer_review_list.append(PeerReview( - id=unique_id(), - role=summary.get('reviewer-role', None), - type=summary.get('review-type', None), - url=summary.get('review-url', None), - 
completion_date=self.get_completion_date(summary), - organization=organization_name, - organization_address=organization_address_str - )) + summaries = peer_review.get('peer-review-summary', []) + for summary in summaries: + organization_name = get_nested_value(summary, ["convening-organization", "name"], None) + organization_address = get_nested_value(summary, ["convening-organization", "address"], {}) + organization_address_str = f"{organization_address.get('city', '')}, {organization_address.get('region', '')}, {organization_address.get('country', '')}" + + peer_review_list.append(PeerReview( + id=unique_id(), + role=summary.get('reviewer-role', None), + type=summary.get('review-type', None), + url=summary.get('review-url', None), + completion_date=self.get_completion_date(summary), + organization=organization_name, + organization_address=organization_address_str + )) return peer_review_list diff --git a/server/workers/orcid/src/worker.py b/server/workers/orcid/src/worker.py index 190dee157..2f401e965 100644 --- a/server/workers/orcid/src/worker.py +++ b/server/workers/orcid/src/worker.py @@ -6,6 +6,7 @@ from common.rate_limiter import RateLimiter from redis import Redis from orcid_service import OrcidService +from typing import Optional class OrcidWorker: service = "orcid" @@ -22,7 +23,7 @@ def __init__( self.data_retriever = data_retriever self.rate_limiter = rate_limiter - def next_item(self) -> Tuple[str, Dict[str, str], str]: + def next_item(self) -> Tuple[Optional[str], Optional[Dict[str, str]], Optional[str]]: _, message = self.redis_store.blpop(self.service) try: message_data: Dict[str, str] = json.loads(message.decode("utf-8")) diff --git a/vis/js/dataprocessing/managers/DataManager.js b/vis/js/dataprocessing/managers/DataManager.js index 3b2c1f114..6d0b131ee 100644 --- a/vis/js/dataprocessing/managers/DataManager.js +++ b/vis/js/dataprocessing/managers/DataManager.js @@ -202,10 +202,10 @@ class DataManager { for (const field in paper) { if (typeof paper[field] === "string") { paper[field] = $("