From 67fc591ce32a6896f49cabf447ca930e6b541cda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 5 Dec 2024 15:28:08 +0000 Subject: [PATCH 001/310] Create .gitignore --- .gitignore | 378 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 378 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..2e75f7f963 --- /dev/null +++ b/.gitignore @@ -0,0 +1,378 @@ +# Created by https://www.toptal.com/developers/gitignore/api/osx,vim,java,linux,python,intellij +# Edit at https://www.toptal.com/developers/gitignore?templates=osx,vim,java,linux,python,intellij + +### Intellij ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### Intellij Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +# https://plugins.jetbrains.com/plugin/7973-sonarlint +.idea/**/sonarlint/ + +# SonarQube Plugin +# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator-enh.xml +.idea/**/markdown-navigator/ + +# Cache file creation bug +# See https://youtrack.jetbrains.com/issue/JBR-2257 +.idea/$CACHE_FILE$ + +# CodeStream plugin +# https://plugins.jetbrains.com/plugin/12206-codestream +.idea/codestream.xml + +# Azure Toolkit for IntelliJ plugin +# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij +.idea/**/azureSettings.xml + +### Java ### +# Compiled class file +*.class + +# Log file +*.log + +# BlueJ files +*.ctxt + +# Mobile Tools for Java (J2ME) +.mtj.tmp/ + +# Package Files # +*.jar +*.war +*.nar +*.ear +*.zip +*.tar.gz +*.rar + +# virtual machine crash logs, see 
http://www.java.com/en/download/help/error_hotspot.xml +hs_err_pid* +replay_pid* + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### OSX ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### Vim ### +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +# End of https://www.toptal.com/developers/gitignore/api/osx,vim,java,linux,python,intellij From cc9a269a05b0e524a161618a3ea2060542b1ae0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 5 Dec 2024 15:33:56 +0000 Subject: [PATCH 002/310] Add cost comparison between Neo4j and Neptune --- docs/neo4j_neptune_cost_comparison.md | 97 +++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 docs/neo4j_neptune_cost_comparison.md diff --git a/docs/neo4j_neptune_cost_comparison.md b/docs/neo4j_neptune_cost_comparison.md new file mode 100644 index 0000000000..00db6ca0c5 --- /dev/null +++ b/docs/neo4j_neptune_cost_comparison.md @@ -0,0 +1,97 @@ +# Capacity estimate + +## Storage estimate + +The graph will have a node for each work and for each concept. Our intention is to store _all_ LoC and MeSH concepts in the graph +(even though most of them are not referenced by any works) as this will make updating the graph with new edges much easier. + +**Number of nodes**: ~20 million (~14 million concepts + ~3 million works + headroom for other nodes)\ +**Storage per node**: ~2 KB (assuming we store 10 to 20 fields per node)\ +**Number of edges**: ~40 million (the graph will be quite sparse — some nodes will have many edges, but most will have 0 edges)\ +**Storage per edge**: ~100 B (there will only be a few fields associated with each edge)\ +=====\ +**Total storage**: 20 M * 2 K + 40 M + 0.1 K = 44 GB ≈ **50 GB of storage** + +## Memory & compute estimate + +We estimate that the database will receive up to **10 million requests** per month (~300,000 per day). On most days it will not receive this many requests, but during a reindex it will receive many more than this. + +Memory consumption depends on several factors (such as the complexity of the queries and how optimised they are) and is difficult to estimate. As an educated guess, we estimate that under heavy load, the database might require up to **20 GB of memory**. 
We also anticipate that most of the time (perhaps 80% of the time) the database will be more or less idle and only require a small amount of memory.
+
+# Cost estimate
+
+## Neptune serverless
+
+Neptune serverless automatically scales with usage. Unlike other serverless services, Neptune serverless does not scale to zero — the minimum provisioned capacity is 1 Neptune Capacity Unit (NCU). This means that a completely idle database still incurs costs of **~115 USD per month** plus storage.
+
+According to AWS, 1 NCU corresponds to approximately 2 GB of memory, so we would need 10 NCUs to provide 20 GB of memory.
+
+Using the capacity estimates from above, including the assumption that the database will be under heavy workloads 20% of the time, the monthly cost breakdown would be as follows:
+
+**Idle cost**: 1 NCU * 19.2 hours * 0.16 USD * 30 days ≈ 93 USD\
+**Scaled-up cost**: 10 NCUs * 4.8 hours * 0.16 USD * 30 days ≈ 231 USD\
+**Request cost**: 10 million * 0.0000002 = 2 USD\
+**Storage cost**: 50 GB * 0.1 USD = 5 USD\
+=====\
+**Total cost per month**: 93 + 231 + 2 + 5 = **331 USD**
+
+This calculation assumes we will only be running a single serverless instance. For high availability, we would need to provision read-only replicas across several availability zones, which would incur additional costs.
+
+## Neptune cluster
+
+AWS offers many instance types to choose from. Some relevant general-purpose instances and their associated costs are included in the table below.
+
+| Instance type                         | Cost per month |
+|--------------------------------------|----------------|
+| db.r6g.large (16 GB memory, 2 CPU)   | ~240 USD       |
+| db.r6g.xlarge (32 GB memory, 4 CPU)  | ~480 USD       |
+| db.r6g.2xlarge (64 GB memory, 8 CPU) | ~960 USD       |
+
+Based on the capacity calculations above, the instance type which is most likely to fit our use case costs ~480 USD per month. Request and storage costs are the same as in the serverless calculation above.
+
+**Instance cost**: ~480 USD\
+**Request cost**: 10 million * 0.0000002 = 2 USD\
+**Storage cost**: 50 GB * 0.1 USD = 5 USD\
+=====\
+**Total cost per month**: 480 + 2 + 5 = **487 USD**
+
+This calculation assumes we will only be running a single instance. For high availability, we would need to provision read-only replicas across several availability zones, which would incur additional costs.
+
+## Neo4j fully managed (AuraDB)
+
+Neo4j offers a fully managed database (called AuraDB) with fixed monthly costs based on plan and configuration. There are
+two paid plans — _Professional_ and _Business Critical_. The Business Critical plan offers high availability (with a 99.95% SLA) and premium 24x7 support. The Professional plan makes no availability guarantees.
+
+See table below for relevant configurations and their associated costs.
+
+| Configuration                      | Cost per month (Professional plan) | Cost per month (Business critical plan) |
+|------------------------------------|------------------------------------|-----------------------------------------|
+| 16 GB memory, 3 CPU, 32 GB storage | ~1,051 USD                         | 2,336 USD                               |
+| 24 GB memory, 5 CPU, 48 GB storage | ~1,577 USD                         | -                                       |
+| 32 GB memory, 6 CPU, 64 GB storage | ~2,102 USD                         | 4,672 USD                               |
+
+Based on the capacity calculations above, the configuration which is most likely to fit our use case costs **1,577 USD per month**. Additionally, it is unclear whether a fully managed AuraDB hosted in AWS can be easily integrated with our VPC to prevent data egress charges.
+ +Unlike Neptune, storage and compute scale together — if we needed more storage, we would need to pay for more memory and CPUs too. + +## Neo4j self-managed + +Neo4j offers two self-managed options — **Community Edition** and **Enterprise Edition**. They do not publicly disclose Enterprise Edition pricing, so +this calculation assumes we will use the Community Edition, which is free (but comes with limited features). + +We can host the database on an EC2 instance. One suitable instance type is *r6g.xlarge* (which is the EC2 equivalent of the *db.r6g.xlarge* type chosen in the Neptune cluster section above). + +With this instance, the cost breakdown would be as follows:\ +**Instance cost (on-demand)**: ~147 USD\ +**Storage costs (gp3 SSD)**: ~50 USD (highly dependent on throughput, IOPS, and snapshot frequency)\ +=====\ +**Total cost per month**: 147 + 50 = **197 USD** + +# Conclusion + +Based on the calculations above, hosting a self-managed Neo4j database in an EC2 instance comes with the lowest infrastructure costs (at approximately 200 USD per month). +However, this option comes with significant operational overhead in the form of software updates and security patches of the underlying operating system. Additionally, +scaling the database (either vertically by migrating to a larger instance, or horizontally by creating read replicas) would be cumbersome and time-consuming. + +Taking operational overhead into account, Neptune serverless is most likely a better fit. It is by far the most flexible option, incurring minimal costs when not in use and automatically scaling up to meet spiky workloads. +Additionally, its estimated costs are only about 50% more (~100 USD per month) than the self-managed option. \ No newline at end of file From b5e031b126e79ae76c1a2b9d74fbcef1ea7e7427 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 5 Dec 2024 15:55:10 +0000 Subject: [PATCH 003/310] Add a Jupyter notebook for running experimental queries --- scripts/neptune_experimental_queries.ipynb | 131 +++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 scripts/neptune_experimental_queries.ipynb diff --git a/scripts/neptune_experimental_queries.ipynb b/scripts/neptune_experimental_queries.ipynb new file mode 100644 index 0000000000..94a9574362 --- /dev/null +++ b/scripts/neptune_experimental_queries.ipynb @@ -0,0 +1,131 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "import requests\n", + "import boto3\n", + "from botocore.auth import SigV4Auth\n", + "from botocore.awsrequest import AWSRequest\n", + "\n", + "session = boto3.Session(profile_name=\"platform-developer\")\n", + "\n", + "\n", + "def _get_secret(secret_name: str):\n", + " secrets_manager_client = session.client('secretsmanager', region_name='eu-west-1')\n", + " response = secrets_manager_client.get_secret_value(SecretId=secret_name)\n", + "\n", + " return response['SecretString']\n", + "\n", + "# The experimental database is accessible from outside of the VPC via a Network Load Balancer (NLB) for testing purposes\n", + "LOAD_BALANCER_URL = _get_secret(\"NeptuneTest/LoadBalancerUrl\")\n", + "NEPTUNE_INSTANCE_ENDPOINT = _get_secret(\"NeptuneTest/InstanceEndpoint\")\n", + "\n", + "def run_open_cypher_query(query: str):\n", + " \"\"\"Run a Cypher query against an experimental serverless Neptune cluster\"\"\"\n", + " open_cypher_endpoint_url = f'{LOAD_BALANCER_URL}/openCypher'\n", + "\n", + 
" headers = {\n", + " \"Host\": NEPTUNE_INSTANCE_ENDPOINT,\n", + " \"Content-Type\": \"application/json\"\n", + " }\n", + " payload = {\"query\": query}\n", + " \n", + " # We use IAM database authentication, which means we need to authenticate the request using AWS Signature\n", + " request = AWSRequest(method=\"POST\", url=open_cypher_endpoint_url, data=json.dumps(payload), headers=headers)\n", + " SigV4Auth(session.get_credentials(), \"neptune-db\", \"eu-west-1\" ).add_auth(request)\n", + "\n", + " # We need to send a manual request rather than using boto3 since we are accessing the instance via a NLB\n", + " response = requests.post(\n", + " open_cypher_endpoint_url,\n", + " data=json.dumps(payload),\n", + " headers=dict(request.headers),\n", + " # We are using the default NLB DNS name, which does not support custom SSL certificates, so we need to disable SSL certificate verification.\n", + " # This increases the risks of a man-in-the-middle attack, which is acceptable for a testing database.\n", + " # In production, we will be connecting to the database directly from within the VPC.\n", + " verify=False\n", + " )\n", + "\n", + " if response.status_code != 200:\n", + " raise Exception(response.content)\n", + "\n", + " return response.json()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"\n", + "CREATE (n:Person {name: 'Alice', age: 30, city: 'New York'})\n", + "RETURN n\n", + "\"\"\"\n", + "\n", + "run_open_cypher_query(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/urllib3/connectionpool.py:1061: InsecureRequestWarning: Unverified HTTPS request is being made to host 'neptune-test-1de05f4795593a07.elb.eu-west-1.amazonaws.com'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "{'results': [{'n': {'~id': '22dcce2d-4577-416a-a07a-11c93b0cf734',\n", + " '~entityType': 'node',\n", + " '~labels': ['Person'],\n", + " '~properties': {'age': 30, 'city': 'New York', 'name': 'Alice'}}}]}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + "MATCH (n) RETURN n LIMIT 10\n", + "\"\"\"\n", + "\n", + "run_open_cypher_query(query)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From bfd067819d1bc31e6be8d3e7ca93e74c904c0d17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 6 Dec 2024 11:51:22 +0000 Subject: [PATCH 004/310] Define infrastructure in Terraform --- .gitignore | 2 + scripts/neptune_experimental_queries.ipynb | 59 ++++++++++++------ terraform/.terraform.lock.hcl | 24 ++++++++ terraform/load_balancer.tf | 71 ++++++++++++++++++++++ terraform/locals.tf | 9 +++ terraform/neptune.tf | 37 +++++++++++ terraform/provider.tf | 7 +++ terraform/terraform.tf | 23 +++++++ 8 files changed, 213 insertions(+), 19 deletions(-) create mode 100644 terraform/.terraform.lock.hcl create mode 100644 terraform/load_balancer.tf create mode 100644 terraform/locals.tf create mode 100644 terraform/neptune.tf create mode 100644 terraform/provider.tf create mode 100644 terraform/terraform.tf diff --git a/.gitignore b/.gitignore index 2e75f7f963..90a6243bee 100644 --- a/.gitignore +++ b/.gitignore @@ -376,3 +376,5 @@ tags [._]*.un~ # End of https://www.toptal.com/developers/gitignore/api/osx,vim,java,linux,python,intellij + +.terraform/ diff --git a/scripts/neptune_experimental_queries.ipynb b/scripts/neptune_experimental_queries.ipynb index 94a9574362..a210e5ff44 100644 --- a/scripts/neptune_experimental_queries.ipynb +++ b/scripts/neptune_experimental_queries.ipynb @@ -2,8 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-06T12:15:06.916144Z", + "start_time": "2024-12-06T12:15:05.140532Z" + } + }, "outputs": [], "source": [ "import json\n", @@ -13,6 +18,9 @@ "from botocore.auth import SigV4Auth\n", "from botocore.awsrequest import AWSRequest\n", "\n", + "import urllib3\n", + "urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)\n", + "\n", "session = boto3.Session(profile_name=\"platform-developer\")\n", "\n", "\n", @@ -73,27 +81,19 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-06T12:15:08.969Z", + "start_time": "2024-12-06T12:15:07.398518Z" + } + }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/urllib3/connectionpool.py:1061: InsecureRequestWarning: Unverified HTTPS request is being made to host 'neptune-test-1de05f4795593a07.elb.eu-west-1.amazonaws.com'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n" - ] - }, { "data": { - "text/plain": [ - "{'results': [{'n': {'~id': '22dcce2d-4577-416a-a07a-11c93b0cf734',\n", - " '~entityType': 'node',\n", - " '~labels': ['Person'],\n", - " '~properties': {'age': 30, 'city': 'New York', 'name': 'Alice'}}}]}" - ] + "text/plain": "{'results': [{'n': {'~id': '22dcce2d-4577-416a-a07a-11c93b0cf734',\n '~entityType': 'node',\n '~labels': ['Person'],\n '~properties': {'age': 30, 'city': 'New York', 'name': 'Alice'}}}]}" }, - "execution_count": 14, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -105,6 +105,27 @@ "\n", "run_open_cypher_query(query)" ] + }, + { + "cell_type": "code", + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-12-06T11:37:08.297024Z", + "start_time": "2024-12-06T11:37:08.294583Z" + } + }, + "execution_count": 10 + }, + { + "cell_type": "code", + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } } ], "metadata": { diff --git a/terraform/.terraform.lock.hcl b/terraform/.terraform.lock.hcl new file mode 100644 index 0000000000..ffb8432e70 --- /dev/null +++ b/terraform/.terraform.lock.hcl @@ -0,0 +1,24 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. + +provider "registry.terraform.io/hashicorp/aws" { + version = "5.80.0" + hashes = [ + "h1:q2pqSs7uPWvxunrBYjyirXARlxFIoxn2Lju42uJbxk4=", + "zh:0b1655e39639d60f2de2860a5df8642f9556ba0ca04529c1b861fde4935cb0df", + "zh:13dc0155e0a11edceee29ce687fc04c5a5a85f3324c67556472713cfd52e5807", + "zh:180f6cb2be44be14cfe329e0649121b774319f083b6e4e8fb749f85090d73121", + "zh:3158d44b74c67465f7f19f22c42b643840c8d18ce833e2ec86e8d93085b06926", + "zh:6351b5bf7cde5dc83e926944891570636069e05ca43341f4d1feda67773469bf", + "zh:6fa9db1532096ba50e842d369b6688979306d2295c7ead49b8a266b0d60962cc", + "zh:85d2fe75def7619ff2cc29102048875039cad088fafb62ecc14c3763e7b1e9d9", + "zh:9028d653f1d7341c6dfe2afe961b6541581e9043a474eac2faf90e6426a24f6d", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:9c4e248c442bc60f07f9f089e5361f19936833370dc3c04b27916672b765f0e1", + "zh:a710a3979596e3f3938c3ec6bb748e604724d3a4afa96ed2c14f0a245cc41a11", + "zh:c27936bdf447779d0c0833bf52a9ef618985f5ea8e3e243d6266513520ca31c4", + "zh:c7681134a123486e72eaedc3f8d2d75e267dbbfd45fa7de5aea8f757af57f89b", + "zh:ea717ebad3561fd02591f9eecf30f3df5635405556fba2bdbf29fd42691bebac", + "zh:f4e1e8f23c58c3e8f4371f9c3379a723ab4155246e6b6daad8eb99e16666b2cb", + ] +} diff --git a/terraform/load_balancer.tf b/terraform/load_balancer.tf new file mode 100644 index 0000000000..0d1301fd07 --- /dev/null +++ b/terraform/load_balancer.tf @@ -0,0 +1,71 @@ +# A Network Load Balancer for accessing the Neptune cluster from outside of the VPC. +# See https://aws-samples.github.io/aws-dbs-refarch-graph/src/connecting-using-a-load-balancer/. +# TODO: This only exists for testing purposes and should be destroyed before we switch to production. 
+ +resource "aws_lb" "neptune_experimental_network_lb" { + name = "neptune-test" + internal = false + load_balancer_type = "network" + security_groups = [aws_security_group.neptune_lb_security_group.id] + subnets = local.public_subnets +} + +# Create a new target group and attach the IP of the Neptune cluster +resource "aws_lb_target_group" "neptune_instance" { + name = "neptune-test-cluster" + port = 8182 + protocol = "TCP" + vpc_id = data.aws_vpc.vpc.id + target_type = "ip" +} + +resource "aws_lb_target_group_attachment" "neptune_instance_attachment" { + target_group_arn = aws_lb_target_group.neptune_instance.arn + # Hardcode the private IP of the Neptune cluster. AWS does not guarantee that the IP will stay static, so we might + # have to manually change this from time to time. I think this is okay for an experimental database, and overall + # this setup is still more convenient than only being able to connect from within the VPC. + # If it starts bothering us, we can create a Lambda function for dynamically updating the target group IP, as outlined + # here: https://aws-samples.github.io/aws-dbs-refarch-graph/src/connecting-using-a-load-balancer/ + target_id = "172.42.180.173" +} + + +# Forward traffic to the Neptune target group +resource "aws_lb_listener" "listener" { + load_balancer_arn = aws_lb.neptune_experimental_network_lb.arn + port = "8182" + protocol = "TCP" + + default_action { + type = "forward" + target_group_arn = aws_lb_target_group.neptune_instance.arn + } +} + +# Create a security group allowing all ingress traffic. Limit egress traffic to the developer VPC. +resource "aws_security_group" "neptune_lb_security_group" { + name = "neptune-load-balancer" + vpc_id = data.aws_vpc.vpc.id +} + +resource "aws_vpc_security_group_ingress_rule" "neptune_lb_ingress" { + security_group_id = aws_security_group.neptune_lb_security_group.id + cidr_ipv4 = "0.0.0.0/0" + ip_protocol = "-1" +} + +resource "aws_vpc_security_group_egress_rule" "neptune_lb_egress" { + security_group_id = aws_security_group.neptune_lb_security_group.id + cidr_ipv4 = data.aws_vpc.vpc.cidr_block + ip_protocol = "-1" +} + +resource "aws_secretsmanager_secret" "neptune_nlb_url" { + name = "NeptuneTest/LoadBalancerUrl" +} + +resource "aws_secretsmanager_secret_version" "neptune_nlb_endpoint_url" { + secret_id = aws_secretsmanager_secret.neptune_nlb_url.id + secret_string = "https://${aws_lb.neptune_experimental_network_lb.dns_name}:8182" +} + diff --git a/terraform/locals.tf b/terraform/locals.tf new file mode 100644 index 0000000000..6909632e7c --- /dev/null +++ b/terraform/locals.tf @@ -0,0 +1,9 @@ +locals { + vpc_id = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_id + private_subnets = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_private_subnets + public_subnets = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_public_subnets +} + +data "aws_vpc" "vpc" { + id = local.vpc_id +} diff --git a/terraform/neptune.tf b/terraform/neptune.tf new file mode 100644 index 0000000000..70445cbe2f --- /dev/null +++ b/terraform/neptune.tf @@ -0,0 +1,37 @@ +resource "aws_neptune_cluster" "experimental_graph_cluster" { + cluster_identifier = "neptune-test" + engine = "neptune" + engine_version = "1.3.2.1" + neptune_cluster_parameter_group_name = "default.neptune1.3" + iam_database_authentication_enabled = true + apply_immediately = true + storage_encrypted = true + vpc_security_group_ids = [aws_security_group.neptune_security_group.id] + + 
# Set minimum capacity to 1 NCU, and maximum capacity to 16 NCUs. These are the minimum possible values. + serverless_v2_scaling_configuration { + min_capacity = 1 + max_capacity = 16 + } +} + +resource "aws_security_group" "neptune_security_group" { + name = "neptune-test" + vpc_id = data.aws_vpc.vpc.id +} + +# Only allow ingress traffic from the developer VPC +resource "aws_vpc_security_group_ingress_rule" "neptune_ingress" { + security_group_id = aws_security_group.neptune_security_group.id + cidr_ipv4 = data.aws_vpc.vpc.cidr_block + ip_protocol = "-1" +} + +resource "aws_secretsmanager_secret" "neptune_cluster_endpoint" { + name = "NeptuneTest/InstanceEndpoint" +} + +resource "aws_secretsmanager_secret_version" "neptune_cluster_endpoint_value" { + secret_id = aws_secretsmanager_secret.neptune_cluster_endpoint.id + secret_string = aws_neptune_cluster.experimental_graph_cluster.endpoint +} diff --git a/terraform/provider.tf b/terraform/provider.tf new file mode 100644 index 0000000000..588bca7bc4 --- /dev/null +++ b/terraform/provider.tf @@ -0,0 +1,7 @@ +provider "aws" { + region = "eu-west-1" + + assume_role { + role_arn = "arn:aws:iam::760097843905:role/platform-developer" + } +} diff --git a/terraform/terraform.tf b/terraform/terraform.tf new file mode 100644 index 0000000000..e6835e41c0 --- /dev/null +++ b/terraform/terraform.tf @@ -0,0 +1,23 @@ +terraform { + required_version = ">= 0.11" + + backend "s3" { + role_arn = "arn:aws:iam::760097843905:role/platform-developer" + + bucket = "wellcomecollection-platform-infra" + key = "terraform/catalogue/graph.tfstate" + dynamodb_table = "terraform-locktable" + region = "eu-west-1" + } +} + +data "terraform_remote_state" "aws_account_infrastructure" { + backend = "s3" + + config = { + role_arn = "arn:aws:iam::760097843905:role/platform-read_only" + bucket = "wellcomecollection-platform-infra" + key = "terraform/aws-account-infrastructure/platform.tfstate" + region = "eu-west-1" + } +} From a1a7abdbb8a3d5da3a743bbce71a3d3c920bcc9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 9 Dec 2024 20:31:41 +0000 Subject: [PATCH 005/310] Initial pipeline work --- .gitignore | 1 + __init__.py | 0 clients/__init__.py | 0 clients/cypher_client.py | 75 +++++++++++++++ clients/neptune_client.py | 50 ++++++++++ extractors/__init__.py | 0 extractors/loc_concepts_extractor.py | 137 +++++++++++++++++++++++++++ models/__init__.py | 0 models/graph_edge.py | 19 ++++ models/graph_node.py | 18 ++++ test.py | 34 +++++++ 11 files changed, 334 insertions(+) create mode 100644 __init__.py create mode 100644 clients/__init__.py create mode 100644 clients/cypher_client.py create mode 100644 clients/neptune_client.py create mode 100644 extractors/__init__.py create mode 100644 extractors/loc_concepts_extractor.py create mode 100644 models/__init__.py create mode 100644 models/graph_edge.py create mode 100644 models/graph_node.py create mode 100644 test.py diff --git a/.gitignore b/.gitignore index 2e75f7f963..9220e7e944 100644 --- a/.gitignore +++ b/.gitignore @@ -376,3 +376,4 @@ tags [._]*.un~ # End of https://www.toptal.com/developers/gitignore/api/osx,vim,java,linux,python,intellij +/terraform/.terraform diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/clients/__init__.py b/clients/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/clients/cypher_client.py b/clients/cypher_client.py new file mode 100644 index 0000000000..f179f95afe --- /dev/null +++ 
b/clients/cypher_client.py @@ -0,0 +1,75 @@ +from pydantic import BaseModel + +from .neptune_client import NeptuneClient +from models.graph_node import SourceConcept +from models.graph_edge import BaseEdge + + +class CypherClient: + def __init__(self, neptune_client: NeptuneClient): + self.neptune_client = neptune_client + + def _value_to_cypher_value(self, raw_value: any): + if isinstance(raw_value, str): + escaped = raw_value.replace("'", "\\'") + value = f"'{escaped}'" + elif isinstance(raw_value, bool): + value = str(raw_value).lower() + elif isinstance(raw_value, list): + # if len(raw_value) == 0: + # value = "null" + # else: + # Neptune does not support lists, so we convert them to a single string with a `||` separator + value = self._value_to_cypher_value("||".join(raw_value)) + elif raw_value is None: + value = "null" + else: + raise TypeError(f"""Cannot convert type {type(raw_value)} (with value {repr(raw_value)}) into a Cypher data + type. Use a different type or add support for type {type(raw_value)} to CypherClient.""") + + return value + + def _pydantic_object_to_cypher_map(self, pydantic_object: BaseModel): + properties = [] + + for key, raw_value in pydantic_object.items(): + value = self._value_to_cypher_value(raw_value) + properties.append(f"{key}: {value}") + + return "{" + ", ".join(properties) + "}" + + def create_source_concept_nodes(self, source_concepts: list[SourceConcept]): + all_fields = SourceConcept.__fields__.keys() + field_set = [f"n.{f} = data.{f}" for f in all_fields] + field_set_statement = ", ".join(field_set) + + cypher_maps = [self._pydantic_object_to_cypher_map(concept) for concept in source_concepts] + joined_cypher_maps = ",\n".join(cypher_maps) + + query = f""" + UNWIND [ + {joined_cypher_maps} + ] AS data + MERGE (n:SourceConcept {{source_id: data.source_id}}) + ON CREATE SET {field_set_statement} + ON MATCH SET {field_set_statement} + """ + + return self.neptune_client.run_open_cypher_query(query) + + def upsert_edges(self, edges: list[BaseEdge]): + from_type = edges[0].from_type + to_type = edges[0].to_type + relationship = edges[0].relationship + + joined_cypher_maps = ",\n".join([self._pydantic_object_to_cypher_map(edge) for edge in edges]) + query = f""" + UNWIND [ + {joined_cypher_maps} + ] AS data + MATCH (a:{from_type} {{source_id: data.from_id}}) + MATCH (b:{to_type} {{source_id: data.to_id}}) + MERGE (a)-[r:{relationship}]->(b) + """ + + return self.neptune_client.run_open_cypher_query(query) diff --git a/clients/neptune_client.py b/clients/neptune_client.py new file mode 100644 index 0000000000..46e8409940 --- /dev/null +++ b/clients/neptune_client.py @@ -0,0 +1,50 @@ +import json + +import requests +import boto3 +from botocore.auth import SigV4Auth +from botocore.awsrequest import AWSRequest + +import urllib3 + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + +class NeptuneClient: + def __init__(self, load_balancer_url: str, neptune_endpoint: str): + self.load_balancer_url = load_balancer_url + self.neptune_endpoint = neptune_endpoint + self.session = boto3.Session(profile_name="platform-developer") + + def _make_request(self, method: str, relative_url: str, payload: dict = None): + url = f"{self.load_balancer_url}{relative_url}" + headers = { + "Host": self.neptune_endpoint, + "Content-Type": "application/json" + } + data = json.dumps(payload) + + # We use IAM database authentication, which means we need to authenticate the request using AWS Signature + request = AWSRequest(method=method, url=url, data=data, 
headers=headers) + SigV4Auth(self.session.get_credentials(), "neptune-db", "eu-west-1").add_auth(request) + + # We need to send a manual request rather than using boto3 since we are accessing the instance via a NLB + # We are using the default NLB DNS name, which does not support custom SSL certificates, so we need to + # disable SSL certificate verification. This increases the risks of a man-in-the-middle attack, + # which is acceptable for a testing database. In production, we will be connecting to the database + # directly from within the VPC. + response = requests.request(method, url, data=data, headers=request.headers, verify=False) + + if response.status_code != 200: + raise Exception(response.content) + + return response.json() + + def run_open_cypher_query(self, query: str): + """Run a Cypher query against an experimental serverless Neptune cluster""" + payload = {"query": query} + response = self._make_request("POST", "/openCypher", payload) + return response['results'] + + def get_graph_summary(self): + return self._make_request("GET", "/propertygraph/statistics/summary", {})["payload"]['graphSummary'] diff --git a/extractors/__init__.py b/extractors/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/extractors/loc_concepts_extractor.py b/extractors/loc_concepts_extractor.py new file mode 100644 index 0000000000..ce104f28fe --- /dev/null +++ b/extractors/loc_concepts_extractor.py @@ -0,0 +1,137 @@ +from typing import Optional + +import requests +import json +import gzip + +from collections.abc import Generator +from models.graph_node import SourceConcept +from models.graph_edge import SourceConceptNarrowerThan, BaseEdge + + +class RawLibraryOfCongressConcept: + def __init__(self, raw_concept: dict): + self.raw_concept = raw_concept + self.source_id = self._extract_source_id() + self.raw_concept_node = self._extract_concept_node() + + @staticmethod + def remove_id_prefix(raw_id: str): + return raw_id.removeprefix("/authorities/subjects/").removeprefix("http://id.loc.gov/authorities/subjects/") + + def _extract_source_id(self): + return self.remove_id_prefix(self.raw_concept["@id"]) + + def _extract_concept_node(self): + graph = self.raw_concept["@graph"] + concept_nodes = [node for node in graph if + self.source_id in node.get("@id") and node["@type"] == "skos:Concept"] + + # Some LoC concepts (e.g. deprecated concepts) do not store a concept node in their graph. + # When this happens, return `None` because there is no concept for us to extract. + if len(concept_nodes) == 0: + return None + + return concept_nodes[0] + + @staticmethod + def _extract_label(raw_label: str | dict): + # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. 
+ if isinstance(raw_label, str): + return raw_label + + return raw_label["@value"] + + def _extract_preferred_label(self): + raw_preferred_label = self.raw_concept_node["skos:prefLabel"] + return self._extract_label(raw_preferred_label) + + def _extract_alternative_labels(self): + raw_alternative_labels = self.raw_concept_node.get("skos:altLabel", []) + + # Raw alternative labels are either returned in a list of labels, or as a single label + # in the same format as `skos:prefLabel` + if isinstance(raw_alternative_labels, list): + return [self._extract_label(item) for item in raw_alternative_labels] + + return [self._extract_label(raw_alternative_labels)] + + def _extract_broader_concepts(self): + broader_concepts = self.raw_concept_node.get('skos:broader', []) + + # Sometimes broader concepts are returned as a list of concepts, and sometimes as just a single JSON + if isinstance(broader_concepts, dict): + broader_concepts = [broader_concepts] + + return broader_concepts + + def extract_edges(self) -> list[BaseEdge]: + if self.raw_concept_node is None: + return [] + + broader_concepts = self._extract_broader_concepts() + broader_ids = [self.remove_id_prefix(concept["@id"]) for concept in broader_concepts] + + edges = [] + for broader_id in broader_ids: + edges.append(SourceConceptNarrowerThan(from_id=self.source_id, to_id=broader_id)) + + return edges + + def transform_to_source_concept(self): + """Transforms the raw LoC concept into a SourceConcept""" + if self.raw_concept_node is None: + return None + + label = self._extract_preferred_label() + alternative_labels = self._extract_alternative_labels() + + return SourceConcept(source_id=self.source_id, + label=label, + source="lc-subjects", + alternative_ids=[], + alternative_labels=alternative_labels, + description=None) + + +class LibraryOfCongressConceptsExtractor: + def __init__(self, concepts_gzip_url: str): + self.url = concepts_gzip_url + + def _stream_raw_concepts(self) -> Generator[dict]: + response = requests.get(self.url, stream=True) + + with gzip.GzipFile(fileobj=response.raw) as file: + for line_bytes in file: + yield json.loads(line_bytes.decode('utf8')) + + def extract_sample_nodes(self, number: Optional[int] = 10) -> Generator[SourceConcept]: + counter = 0 + + for raw_concept in self._stream_raw_concepts(): + source_concept = RawLibraryOfCongressConcept(raw_concept).transform_to_source_concept() + + if source_concept: + yield source_concept + + counter += 1 + if counter == number: + return + + def extract_all_nodes(self) -> Generator[SourceConcept]: + return self.extract_sample_nodes(None) + + def extract_sample_edges(self, number: Optional[int] = 10) -> Generator[BaseEdge]: + counter = 0 + + for raw_concept in self._stream_raw_concepts(): + edges = RawLibraryOfCongressConcept(raw_concept).extract_edges() + for edge in edges: + counter += 1 + yield edge + + if counter == number: + return + + def extract_all_edges(self) -> Generator[BaseEdge]: + return self.extract_sample_edges(None) diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/models/graph_edge.py b/models/graph_edge.py new file mode 100644 index 0000000000..da0d5fddee --- /dev/null +++ b/models/graph_edge.py @@ -0,0 +1,19 @@ +from pydantic import BaseModel + + +class BaseEdge(BaseModel): + from_type: str + to_type: str + from_id: str + to_id: str + relationship: str + directed: bool + + +class SourceConceptNarrowerThan(BaseEdge): + from_type: str = "SourceConcept" + to_type: str = "SourceConcept" + 
from_id: str
+    to_id: str
+    relationship: str = "NARROWER_THAN"
+    directed: bool = True
diff --git a/models/graph_node.py b/models/graph_node.py
new file mode 100644
index 0000000000..2bad222758
--- /dev/null
+++ b/models/graph_node.py
@@ -0,0 +1,18 @@
+from pydantic import BaseModel
+from typing import Literal, Optional
+
+
+class SourceConcept(BaseModel):
+    source_id: str  # unique identifier provided by the source vocabulary
+    label: str  # label/preferred term from source vocabulary
+    source: Literal["nlm-mesh", "lc-subjects", "wikidata"]
+    alternative_ids: list[str]  # for example MeSH tree numbers or other identifiers from Wikidata
+    alternative_labels: list[str]  # LoC variants, MeSH concepts other than preferred term
+    description: Optional[str]  # Concept description, such as MeSH scope note or Wikidata description
+
+
+class Concept(BaseModel):
+    wellcome_id: str
+    label: str
+    type: Literal["Person", "Concept", "Organisation", "Place", "Agent", "Meeting", "Genre", "Period"]
+    source: Literal["label-derived", "nlm-mesh", "lc-subjects", "lc-names", "viaf", "fihrist"]
diff --git a/test.py b/test.py
new file mode 100644
index 0000000000..c21cd02a71
--- /dev/null
+++ b/test.py
@@ -0,0 +1,34 @@
+import boto3
+
+from extractors.loc_concepts_extractor import LibraryOfCongressConceptsExtractor
+from clients.neptune_client import NeptuneClient
+from clients.cypher_client import CypherClient
+
+import itertools
+
+
+def _get_secret(secret_name: str):
+    session = boto3.Session(profile_name="platform-developer")
+    secrets_manager_client = session.client('secretsmanager', region_name='eu-west-1')
+    response = secrets_manager_client.get_secret_value(SecretId=secret_name)
+
+    return response['SecretString']
+
+
+url = "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz"
+loc_extractor = LibraryOfCongressConceptsExtractor(url)
+
+neptune_client = NeptuneClient(_get_secret("NeptuneTest/LoadBalancerUrl"), _get_secret("NeptuneTest/InstanceEndpoint"))
+cypher_client = CypherClient(neptune_client)
+
+sample_nodes = loc_extractor.extract_sample_nodes(3000)
+sample_edges = loc_extractor.extract_sample_edges(300)
+
+while True:
+    chunk = list(itertools.islice(sample_edges, 1))
+    if chunk:
+        print(cypher_client.upsert_edges(chunk))
+        #print(cypher_client.create_source_concept_nodes(chunk))
+    else:
+        break
+

From eb4b00d09f38776113eae3f2ceda2a060542b1ae0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?=
Date: Tue, 10 Dec 2024 08:37:38 +0000
Subject: [PATCH 006/310] Add README

---
 README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000000..7fc9479d43
--- /dev/null
+++ b/README.md
@@ -0,0 +1,12 @@
+# Catalogue Graph
+
+Experimental repository for building a knowledge graph from catalogue concepts and works. The main goals are to:
+* Experiment with AWS Neptune and explore its features and potential limitations
+* Provision infrastructure for a knowledge graph pipeline
+* Write an initial pipeline in Python for populating the knowledge graph with concepts (LoC, MeSH, Wikidata)
+
+Eventually the contents of this repository might be merged into another repository, such as catalogue-pipeline or concepts-pipeline.
+ +See the following RFCs for more context: +* [RFC 062: Wellcome Collection Graph overview and next steps](https://github.com/wellcomecollection/docs/tree/main/rfcs/062-knowledge-graph) +* [RFC 064: Graph data model](https://github.com/wellcomecollection/docs/blob/rfc-064-graph-model/rfcs/064-graph-data-model/README.md) From 156d2cc9ea6561ce89bbbdd5d84bbc7079e28db8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 10 Dec 2024 09:23:34 +0000 Subject: [PATCH 007/310] Refactoring & formatting --- clients/cypher_client.py | 75 ---------- create_zip.sh | 12 ++ models/graph_node.py | 18 --- scripts/neptune_experimental_queries.ipynb | 45 ++++-- __init__.py => src/clients/__init__.py | 0 .../clients/local_neptune_client.py | 5 +- src/clients/neptune_client.py | 15 ++ src/extractor.py | 62 ++++++++ {clients => src/extractors/loc}/__init__.py | 0 .../extractors/loc/concepts_extractor.py | 117 +++++++-------- src/indexer.py | 33 +++++ {extractors => src/models}/__init__.py | 0 {models => src/models}/graph_edge.py | 0 src/models/graph_node.py | 39 +++++ {models => src/query_builders}/__init__.py | 0 src/query_builders/cypher.py | 78 ++++++++++ src/requirements.in | 3 + src/requirements.txt | 44 ++++++ src/sources/__init__.py | 0 src/sources/gzip.py | 16 +++ terraform/extractor_lambda.tf | 33 +++++ terraform/indexer_lambda.tf | 133 ++++++++++++++++++ terraform/load_balancer.tf | 2 +- terraform/neptune.tf | 20 ++- test.py | 34 ----- 25 files changed, 579 insertions(+), 205 deletions(-) delete mode 100644 clients/cypher_client.py create mode 100644 create_zip.sh delete mode 100644 models/graph_node.py rename __init__.py => src/clients/__init__.py (100%) rename clients/neptune_client.py => src/clients/local_neptune_client.py (93%) create mode 100644 src/clients/neptune_client.py create mode 100644 src/extractor.py rename {clients => src/extractors/loc}/__init__.py (100%) rename extractors/loc_concepts_extractor.py => src/extractors/loc/concepts_extractor.py (60%) create mode 100644 src/indexer.py rename {extractors => src/models}/__init__.py (100%) rename {models => src/models}/graph_edge.py (100%) create mode 100644 src/models/graph_node.py rename {models => src/query_builders}/__init__.py (100%) create mode 100644 src/query_builders/cypher.py create mode 100644 src/requirements.in create mode 100644 src/requirements.txt create mode 100644 src/sources/__init__.py create mode 100644 src/sources/gzip.py create mode 100644 terraform/extractor_lambda.tf create mode 100644 terraform/indexer_lambda.tf delete mode 100644 test.py diff --git a/clients/cypher_client.py b/clients/cypher_client.py deleted file mode 100644 index f179f95afe..0000000000 --- a/clients/cypher_client.py +++ /dev/null @@ -1,75 +0,0 @@ -from pydantic import BaseModel - -from .neptune_client import NeptuneClient -from models.graph_node import SourceConcept -from models.graph_edge import BaseEdge - - -class CypherClient: - def __init__(self, neptune_client: NeptuneClient): - self.neptune_client = neptune_client - - def _value_to_cypher_value(self, raw_value: any): - if isinstance(raw_value, str): - escaped = raw_value.replace("'", "\\'") - value = f"'{escaped}'" - elif isinstance(raw_value, bool): - value = str(raw_value).lower() - elif isinstance(raw_value, list): - # if len(raw_value) == 0: - # value = "null" - # else: - # Neptune does not support lists, so we convert them to a single string with a `||` separator - value = self._value_to_cypher_value("||".join(raw_value)) - elif raw_value is None: - 
value = "null" - else: - raise TypeError(f"""Cannot convert type {type(raw_value)} (with value {repr(raw_value)}) into a Cypher data - type. Use a different type or add support for type {type(raw_value)} to CypherClient.""") - - return value - - def _pydantic_object_to_cypher_map(self, pydantic_object: BaseModel): - properties = [] - - for key, raw_value in pydantic_object.items(): - value = self._value_to_cypher_value(raw_value) - properties.append(f"{key}: {value}") - - return "{" + ", ".join(properties) + "}" - - def create_source_concept_nodes(self, source_concepts: list[SourceConcept]): - all_fields = SourceConcept.__fields__.keys() - field_set = [f"n.{f} = data.{f}" for f in all_fields] - field_set_statement = ", ".join(field_set) - - cypher_maps = [self._pydantic_object_to_cypher_map(concept) for concept in source_concepts] - joined_cypher_maps = ",\n".join(cypher_maps) - - query = f""" - UNWIND [ - {joined_cypher_maps} - ] AS data - MERGE (n:SourceConcept {{source_id: data.source_id}}) - ON CREATE SET {field_set_statement} - ON MATCH SET {field_set_statement} - """ - - return self.neptune_client.run_open_cypher_query(query) - - def upsert_edges(self, edges: list[BaseEdge]): - from_type = edges[0].from_type - to_type = edges[0].to_type - relationship = edges[0].relationship - - joined_cypher_maps = ",\n".join([self._pydantic_object_to_cypher_map(edge) for edge in edges]) - query = f""" - UNWIND [ - {joined_cypher_maps} - ] AS data - MATCH (a:{from_type} {{source_id: data.from_id}}) - MATCH (b:{to_type} {{source_id: data.to_id}}) - MERGE (a)-[r:{relationship}]->(b) - """ - - return self.neptune_client.run_open_cypher_query(query) diff --git a/create_zip.sh b/create_zip.sh new file mode 100644 index 0000000000..49f578edbd --- /dev/null +++ b/create_zip.sh @@ -0,0 +1,12 @@ +# Create a temporary directory to hold source code and packages +mkdir -p temp + +cp -r src/* temp/ +pip3 install -r src/requirements.txt --platform manylinux2014_x86_64 --target temp/ --only-binary=:all: --python-version 3.10 + +cd temp +zip -r ../build.zip . +cd .. 
+ +# Clean up the temporary build directory +rm -rf temp diff --git a/models/graph_node.py b/models/graph_node.py deleted file mode 100644 index 2bad222758..0000000000 --- a/models/graph_node.py +++ /dev/null @@ -1,18 +0,0 @@ -from pydantic import BaseModel -from typing import Literal, Optional - - -class SourceConcept(BaseModel): - source_id: str # unique identifier provided by the source vocabulary - label: str # label/preferred term from source vocabulary - source: Literal["nlm-mesh", "lc-subjects", "wikidata"] - alternative_ids: list[str] # for example MeSH tree numbers or other identifiers from Wikidata - alternative_labels: list[str] # LoC variants, MeSH concepts other than preferred term - description: Optional[str] # Concept description, such as MeSH scope note or Wikidata description - - -class Concept(BaseModel): - wellcome_id: str - label: str - type: Literal["Person", "Concept": "Organisation", "Place", "Agent", "Meeting", "Genre", "Period"] - source: Literal["label-derived", "nlm-mesh", "lc-subjects", "lc-names", "viaf", "fihrist"] diff --git a/scripts/neptune_experimental_queries.ipynb b/scripts/neptune_experimental_queries.ipynb index a210e5ff44..ae2af04ada 100644 --- a/scripts/neptune_experimental_queries.ipynb +++ b/scripts/neptune_experimental_queries.ipynb @@ -2,11 +2,11 @@ "cells": [ { "cell_type": "code", - "execution_count": 24, + "execution_count": 59, "metadata": { "ExecuteTime": { - "end_time": "2024-12-06T12:15:06.916144Z", - "start_time": "2024-12-06T12:15:05.140532Z" + "end_time": "2024-12-10T09:46:24.193547Z", + "start_time": "2024-12-10T09:46:22.627832Z" } }, "outputs": [], @@ -81,26 +81,27 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 67, "metadata": { "ExecuteTime": { - "end_time": "2024-12-06T12:15:08.969Z", - "start_time": "2024-12-06T12:15:07.398518Z" + "end_time": "2024-12-10T09:51:35.288073Z", + "start_time": "2024-12-10T09:51:34.570581Z" } }, "outputs": [ { "data": { - "text/plain": "{'results': [{'n': {'~id': '22dcce2d-4577-416a-a07a-11c93b0cf734',\n '~entityType': 'node',\n '~labels': ['Person'],\n '~properties': {'age': 30, 'city': 'New York', 'name': 'Alice'}}}]}" + "text/plain": "{'results': []}" }, - "execution_count": 25, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = \"\"\"\n", - "MATCH (n) RETURN n LIMIT 10\n", + "MATCH (person:SourceConcept {id: \"sh00000011\"})-[:NARROWER_THAN]->(friend)\n", + "RETURN friend;\n", "\"\"\"\n", "\n", "run_open_cypher_query(query)" @@ -108,16 +109,32 @@ }, { "cell_type": "code", - "outputs": [], - "source": [], + "outputs": [ + { + "data": { + "text/plain": "{'results': []}" + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + "MATCH (person:SourceConcept {id: \"sh00000011\"})\n", + "RETURN person\n", + "\"\"\"\n", + "\n", + "run_open_cypher_query(query)" + ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-12-06T11:37:08.297024Z", - "start_time": "2024-12-06T11:37:08.294583Z" + "end_time": "2024-12-10T09:57:24.678422Z", + "start_time": "2024-12-10T09:57:24.401967Z" } }, - "execution_count": 10 + "execution_count": 72 }, { "cell_type": "code", diff --git a/__init__.py b/src/clients/__init__.py similarity index 100% rename from __init__.py rename to src/clients/__init__.py diff --git a/clients/neptune_client.py b/src/clients/local_neptune_client.py similarity index 93% rename from clients/neptune_client.py rename to 
src/clients/local_neptune_client.py index 46e8409940..c0057a4f99 100644 --- a/clients/neptune_client.py +++ b/src/clients/local_neptune_client.py @@ -16,7 +16,7 @@ def __init__(self, load_balancer_url: str, neptune_endpoint: str): self.neptune_endpoint = neptune_endpoint self.session = boto3.Session(profile_name="platform-developer") - def _make_request(self, method: str, relative_url: str, payload: dict = None): + def _make_request(self, method: str, relative_url: str, payload: dict): url = f"{self.load_balancer_url}{relative_url}" headers = { "Host": self.neptune_endpoint, @@ -47,4 +47,5 @@ def run_open_cypher_query(self, query: str): return response['results'] def get_graph_summary(self): - return self._make_request("GET", "/propertygraph/statistics/summary", {})["payload"]['graphSummary'] + response = self._make_request("GET", "/propertygraph/statistics/summary", {}) + return response["payload"]['graphSummary'] diff --git a/src/clients/neptune_client.py b/src/clients/neptune_client.py new file mode 100644 index 0000000000..6a22c39e6b --- /dev/null +++ b/src/clients/neptune_client.py @@ -0,0 +1,15 @@ +import boto3 + + +class NeptuneClient: + def __init__(self, neptune_endpoint: str): + endpoint_url = f"https://{neptune_endpoint}:8182" + self.client = boto3.client("neptunedata", endpoint_url=endpoint_url) + + def run_open_cypher_query(self, query: str): + """Run a Cypher query against the Neptune cluster""" + response = self.client.execute_open_cypher_query(openCypherQuery=query) + return response["results"] + + def get_graph_summary(self): + return self.client.get_propertygraph_summary(mode="detailed")["payload"] diff --git a/src/extractor.py b/src/extractor.py new file mode 100644 index 0000000000..5e1dac635e --- /dev/null +++ b/src/extractor.py @@ -0,0 +1,62 @@ +import itertools +import json +from collections.abc import Generator +import enum +from typing import Literal + +import boto3 + +from extractors.loc.concepts_extractor import LibraryOfCongressConceptsExtractor +import query_builders.cypher as cypher + + +CHUNK_SIZE = 100 +LOC_SH_URL = "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" + + +def publish_to_sns(query: str): + client = boto3.client("sns") + client.publish( + TopicArn="arn:aws:sns:eu-west-1:760097843905:catalogue_graph_queries", + Message=json.dumps({"default": query}), + MessageStructure="json", + ) + + +def _generator_to_chunks(items: Generator): + while True: + chunk = list(itertools.islice(items, CHUNK_SIZE)) + if chunk: + yield chunk + else: + return + + +class GraphExtractorType(enum.Enum): + LOC_SH = LibraryOfCongressConceptsExtractor(LOC_SH_URL) + LOC_LOCATION = LibraryOfCongressConceptsExtractor(LOC_SH_URL) + + +def extract_all( + extractor_type: GraphExtractorType, entity_type: Literal["nodes", "edges"] +): + extractor = GraphExtractorType[extractor_type.name].value + + if entity_type == "nodes": + entities = extractor.extract_nodes() + elif entity_type == "edges": + entities = extractor.extract_edges() + else: + raise ValueError("Unsupported entity type.") + + for chunk in _generator_to_chunks(entities): + query = cypher.construct_upsert_nodes_query(chunk) + publish_to_sns(query) + + +def lambda_handler(event: dict, context): + extract_all(GraphExtractorType.LOC_SH, "edges") + + +if __name__ == "__main__": + lambda_handler({}, None) diff --git a/clients/__init__.py b/src/extractors/loc/__init__.py similarity index 100% rename from clients/__init__.py rename to src/extractors/loc/__init__.py diff --git 
a/extractors/loc_concepts_extractor.py b/src/extractors/loc/concepts_extractor.py similarity index 60% rename from extractors/loc_concepts_extractor.py rename to src/extractors/loc/concepts_extractor.py index ce104f28fe..c421b1d8c9 100644 --- a/extractors/loc_concepts_extractor.py +++ b/src/extractors/loc/concepts_extractor.py @@ -1,12 +1,7 @@ -from typing import Optional - -import requests -import json -import gzip - from collections.abc import Generator from models.graph_node import SourceConcept from models.graph_edge import SourceConceptNarrowerThan, BaseEdge +from sources.gzip import GZipSource class RawLibraryOfCongressConcept: @@ -15,67 +10,76 @@ def __init__(self, raw_concept: dict): self.source_id = self._extract_source_id() self.raw_concept_node = self._extract_concept_node() - @staticmethod + @staticmethod def remove_id_prefix(raw_id: str): - return raw_id.removeprefix("/authorities/subjects/").removeprefix("http://id.loc.gov/authorities/subjects/") - + return raw_id.removeprefix("/authorities/subjects/").removeprefix( + "http://id.loc.gov/authorities/subjects/" + ) + def _extract_source_id(self): return self.remove_id_prefix(self.raw_concept["@id"]) def _extract_concept_node(self): graph = self.raw_concept["@graph"] - concept_nodes = [node for node in graph if - self.source_id in node.get("@id") and node["@type"] == "skos:Concept"] - + concept_nodes = [ + node + for node in graph + if self.source_id in node.get("@id") and node["@type"] == "skos:Concept" + ] + # Some LoC concepts (e.g. deprecated concepts) do not store a concept node in their graph. # When this happens, return `None` because there is no concept for us to extract. if len(concept_nodes) == 0: return None - + return concept_nodes[0] - + @staticmethod def _extract_label(raw_label: str | dict): - # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. + # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. 
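        # Editor's note: an illustrative sketch of the two shapes handled below (hypothetical
        # values, not taken from the LoC dataset):
        #   "Sanitation"                                   -> returned as-is
        #   {"@language": "en", "@value": "Sanitation"}    -> the "@value" string is returned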
if isinstance(raw_label, str): return raw_label return raw_label["@value"] - + def _extract_preferred_label(self): raw_preferred_label = self.raw_concept_node["skos:prefLabel"] return self._extract_label(raw_preferred_label) def _extract_alternative_labels(self): raw_alternative_labels = self.raw_concept_node.get("skos:altLabel", []) - - # Raw alternative labels are either returned in a list of labels, or as a single label + + # Raw alternative labels are either returned in a list of labels, or as a single label # in the same format as `skos:prefLabel` if isinstance(raw_alternative_labels, list): return [self._extract_label(item) for item in raw_alternative_labels] return [self._extract_label(raw_alternative_labels)] - + def _extract_broader_concepts(self): - broader_concepts = self.raw_concept_node.get('skos:broader', []) - + broader_concepts = self.raw_concept_node.get("skos:broader", []) + # Sometimes broader concepts are returned as a list of concepts, and sometimes as just a single JSON if isinstance(broader_concepts, dict): broader_concepts = [broader_concepts] - + return broader_concepts - + def extract_edges(self) -> list[BaseEdge]: if self.raw_concept_node is None: return [] - + broader_concepts = self._extract_broader_concepts() - broader_ids = [self.remove_id_prefix(concept["@id"]) for concept in broader_concepts] - + broader_ids = [ + self.remove_id_prefix(concept["@id"]) for concept in broader_concepts + ] + edges = [] for broader_id in broader_ids: - edges.append(SourceConceptNarrowerThan(from_id=self.source_id, to_id=broader_id)) - + edges.append( + SourceConceptNarrowerThan(from_id=self.source_id, to_id=broader_id) + ) + return edges def transform_to_source_concept(self): @@ -86,30 +90,31 @@ def transform_to_source_concept(self): label = self._extract_preferred_label() alternative_labels = self._extract_alternative_labels() - return SourceConcept(source_id=self.source_id, - label=label, - source="lc-subjects", - alternative_ids=[], - alternative_labels=alternative_labels, - description=None) + return SourceConcept( + id=self.source_id, + label=label, + source="lc-subjects", + alternative_ids=[], + alternative_labels=alternative_labels, + description=None, + ) class LibraryOfCongressConceptsExtractor: - def __init__(self, concepts_gzip_url: str): - self.url = concepts_gzip_url - - def _stream_raw_concepts(self) -> Generator[dict]: - response = requests.get(self.url, stream=True) - - with gzip.GzipFile(fileobj=response.raw) as file: - for line_bytes in file: - yield json.loads(line_bytes.decode('utf8')) - - def extract_sample_nodes(self, number: Optional[int] = 10) -> Generator[SourceConcept]: + def __init__(self, url: str): + self.source = GZipSource(url) + + def extract_nodes(self, number: int = None) -> Generator[SourceConcept]: + """ + Extracts and returns SourceConcept nodes from LoC Subject Headings. + Takes an optional parameter to only extract the first `number` nodes. 
+ """ counter = 0 - for raw_concept in self._stream_raw_concepts(): - source_concept = RawLibraryOfCongressConcept(raw_concept).transform_to_source_concept() + for raw_concept in self.source.stream_raw(): + source_concept = RawLibraryOfCongressConcept( + raw_concept + ).transform_to_source_concept() if source_concept: yield source_concept @@ -118,20 +123,18 @@ def extract_sample_nodes(self, number: Optional[int] = 10) -> Generator[SourceCo if counter == number: return - def extract_all_nodes(self) -> Generator[SourceConcept]: - return self.extract_sample_nodes(None) - - def extract_sample_edges(self, number: Optional[int] = 10) -> Generator[BaseEdge]: + def extract_edges(self, number: int = None) -> Generator[BaseEdge]: + """ + Extracts and returns SourceConceptNarrowerThan edges from LoC Subject Headings. + Takes an optional parameter to only extract the first `number` edges. + """ counter = 0 - - for raw_concept in self._stream_raw_concepts(): + + for raw_concept in self.source.stream_raw(): edges = RawLibraryOfCongressConcept(raw_concept).extract_edges() for edge in edges: counter += 1 yield edge - + if counter == number: return - - def extract_all_edges(self) -> Generator[BaseEdge]: - return self.extract_sample_edges(None) diff --git a/src/indexer.py b/src/indexer.py new file mode 100644 index 0000000000..fae07046aa --- /dev/null +++ b/src/indexer.py @@ -0,0 +1,33 @@ +import boto3 +import json + +from clients.neptune_client import NeptuneClient + + +def _get_secret(secret_name: str): + secrets_manager_client = boto3.client("secretsmanager", region_name="eu-west-1") + response = secrets_manager_client.get_secret_value(SecretId=secret_name) + + return response["SecretString"] + + +def extract_sns_messages_from_sqs_event(event): + queries = [] + + for record in event["Records"]: + query = json.loads(record["body"])["Message"] + queries.append(query) + + return queries + + +def lambda_handler(event: dict, context): + queries = extract_sns_messages_from_sqs_event(event) + neptune_client = NeptuneClient(_get_secret("NeptuneTest/InstanceEndpoint")) + + for query in queries: + neptune_client.run_open_cypher_query(query) + + +if __name__ == "__main__": + lambda_handler({}, None) diff --git a/extractors/__init__.py b/src/models/__init__.py similarity index 100% rename from extractors/__init__.py rename to src/models/__init__.py diff --git a/models/graph_edge.py b/src/models/graph_edge.py similarity index 100% rename from models/graph_edge.py rename to src/models/graph_edge.py diff --git a/src/models/graph_node.py b/src/models/graph_node.py new file mode 100644 index 0000000000..8069fd4421 --- /dev/null +++ b/src/models/graph_node.py @@ -0,0 +1,39 @@ +from pydantic import BaseModel +from typing import Literal, Optional + + +class BaseNode(BaseModel): + id: str + + +class SourceConcept(BaseNode): + # Unique identifier provided by the source vocabulary + id: str + # Label/preferred term from source vocabulary + label: str + source: Literal["nlm-mesh", "lc-subjects", "wikidata"] + # For example MeSH tree numbers or other identifiers from Wikidata + alternative_ids: list[str] + # LoC variants, MeSH concepts other than preferred term + alternative_labels: list[str] + # Concept description, such as MeSH scope note or Wikidata description + description: Optional[str] + + +class Concept(BaseNode): + # Unique Wellcome identifier + id: str + label: str + type: Literal[ + "Person", + "Concept", + "Organisation", + "Place", + "Agent", + "Meeting", + "Genre", + "Period", + ] + source: Literal[ + 
"label-derived", "nlm-mesh", "lc-subjects", "lc-names", "viaf", "fihrist" + ] diff --git a/models/__init__.py b/src/query_builders/__init__.py similarity index 100% rename from models/__init__.py rename to src/query_builders/__init__.py diff --git a/src/query_builders/cypher.py b/src/query_builders/cypher.py new file mode 100644 index 0000000000..498d0bb4eb --- /dev/null +++ b/src/query_builders/cypher.py @@ -0,0 +1,78 @@ +from pydantic import BaseModel + +from models.graph_edge import BaseEdge +from models.graph_node import BaseNode + + +def _value_to_cypher_value(raw_value: any): + if isinstance(raw_value, str): + escaped = raw_value.replace("'", "\\'") + value = f"'{escaped}'" + elif isinstance(raw_value, bool): + value = str(raw_value).lower() + elif isinstance(raw_value, list): + # Neptune does not support lists, so we convert them to a single string with a `||` separator + value = _value_to_cypher_value("||".join(raw_value)) + elif raw_value is None: + value = "null" + else: + raise TypeError( + f""" + Cannot convert type {type(raw_value)} (with value {repr(raw_value)}) into a Cypher data type. + Use a different type or add support for type {type(raw_value)} to CypherClient. + """ + ) + + return value + + +def _pydantic_object_to_cypher_map(pydantic_object: BaseModel): + properties = [] + + for key, raw_value in dict(pydantic_object).items(): + value = _value_to_cypher_value(raw_value) + properties.append(f"{key}: {value}") + + return "{" + ", ".join(properties) + "}" + + +def construct_upsert_nodes_query(source_concepts: list[BaseNode]): + model_name = type(source_concepts[0]).__name__ + all_fields = type(source_concepts[0]).model_fields.keys() + + field_set = [f"n.{f} = data.{f}" for f in all_fields] + field_set_statement = ", ".join(field_set) + + cypher_maps = [ + _pydantic_object_to_cypher_map(concept) for concept in source_concepts + ] + joined_cypher_maps = ",\n".join(cypher_maps) + + query = f""" + UNWIND [ + {joined_cypher_maps} + ] AS data + MERGE (n:{model_name} {{id: data.id}}) + ON CREATE SET {field_set_statement} + ON MATCH SET {field_set_statement} + """ + return query + + +def construct_upsert_edges_query(edges: list[BaseEdge]): + from_type = edges[0].from_type + to_type = edges[0].to_type + relationship = edges[0].relationship + + joined_cypher_maps = ",\n".join( + [_pydantic_object_to_cypher_map(edge) for edge in edges] + ) + query = f""" + UNWIND [ + {joined_cypher_maps} + ] AS data + MATCH (a:{from_type} {{id: data.from_id}}) + MATCH (b:{to_type} {{id: data.to_id}}) + MERGE (a)-[r:{relationship}]->(b) + """ + return query diff --git a/src/requirements.in b/src/requirements.in new file mode 100644 index 0000000000..4d604a314b --- /dev/null +++ b/src/requirements.in @@ -0,0 +1,3 @@ +boto3 +requests +pydantic diff --git a/src/requirements.txt b/src/requirements.txt new file mode 100644 index 0000000000..157235162d --- /dev/null +++ b/src/requirements.txt @@ -0,0 +1,44 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile requirements.in +# +annotated-types==0.7.0 + # via pydantic +boto3==1.35.77 + # via -r requirements.in +botocore==1.35.77 + # via + # boto3 + # s3transfer +certifi==2024.8.30 + # via requests +charset-normalizer==3.4.0 + # via requests +idna==3.10 + # via requests +jmespath==1.0.1 + # via + # boto3 + # botocore +pydantic==2.10.3 + # via -r requirements.in +pydantic-core==2.27.1 + # via pydantic +python-dateutil==2.9.0.post0 + # via botocore +requests==2.32.3 + # via -r requirements.in 
+s3transfer==0.10.4 + # via boto3 +six==1.17.0 + # via python-dateutil +typing-extensions==4.12.2 + # via + # pydantic + # pydantic-core +urllib3==2.2.3 + # via + # botocore + # requests diff --git a/src/sources/__init__.py b/src/sources/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/sources/gzip.py b/src/sources/gzip.py new file mode 100644 index 0000000000..48b0fdedcd --- /dev/null +++ b/src/sources/gzip.py @@ -0,0 +1,16 @@ +import requests +from collections.abc import Generator +import gzip +import json + + +class GZipSource: + def __init__(self, url: str): + self.url = url + + def stream_raw(self) -> Generator[dict]: + response = requests.get(self.url, stream=True) + + with gzip.GzipFile(fileobj=response.raw) as file: + for line_bytes in file: + yield json.loads(line_bytes.decode("utf8")) diff --git a/terraform/extractor_lambda.tf b/terraform/extractor_lambda.tf new file mode 100644 index 0000000000..2c58f4693e --- /dev/null +++ b/terraform/extractor_lambda.tf @@ -0,0 +1,33 @@ +module "extractor_lambda" { + source = "git@github.com:wellcomecollection/terraform-aws-lambda?ref=v1.2.0" + + name = "catalogue-graph-extractor" + description = "Extracts source concepts and turns them into Cypher queries." + runtime = "python3.10" + + filename = "../build.zip" + source_code_hash = filesha256("../build.zip") + + handler = "extractor.lambda_handler" + memory_size = 128 + timeout = 60 // 1 minute + + # error_alarm_topic_arn = data.terraform_remote_state.monitoring.outputs["platform_lambda_error_alerts_topic_arn"] +} + +data "aws_iam_policy_document" "publish_to_queries_topic" { + statement { + actions = [ + "sns:Publish", + ] + + resources = [ + module.catalogue_graph_queries_topic.arn + ] + } +} + +resource "aws_iam_role_policy" "reindex_jobs_policy" { + role = module.extractor_lambda.lambda_role.name + policy = data.aws_iam_policy_document.publish_to_queries_topic.json +} diff --git a/terraform/indexer_lambda.tf b/terraform/indexer_lambda.tf new file mode 100644 index 0000000000..27f8858478 --- /dev/null +++ b/terraform/indexer_lambda.tf @@ -0,0 +1,133 @@ +module "indexer_lambda" { + source = "git@github.com:wellcomecollection/terraform-aws-lambda?ref=v1.2.0" + + name = "catalogue-graph-indexer" + description = "Indexes nodes and edges into the Neptune catalogue graph cluster." + runtime = "python3.10" + + filename = "../build.zip" + source_code_hash = filesha256("../build.zip") + + handler = "indexer.lambda_handler" + memory_size = 128 + timeout = 60 // 1 minute + + vpc_config = { + subnet_ids = local.private_subnets + security_group_ids = [aws_security_group.graph_indexer_lambda_security_group.id] + } + + # error_alarm_topic_arn = data.terraform_remote_state.monitoring.outputs["platform_lambda_error_alerts_topic_arn"] +} + +data "aws_iam_policy_document" "allow_secret_read" { + statement { + actions = ["secretsmanager:GetSecretValue"] + resources = [ + "arn:aws:secretsmanager:eu-west-1:760097843905:secret:NeptuneTest/*" + ] + } +} + +resource "aws_iam_role_policy" "read_secrets_policy" { + role = module.indexer_lambda.lambda_role.name + policy = data.aws_iam_policy_document.allow_secret_read.json +} + +# Create a security group allowing all ingress traffic. Limit egress traffic to the developer VPC. 
+resource "aws_security_group" "graph_indexer_lambda_security_group" { + name = "graph-indexer-lambda" + vpc_id = data.aws_vpc.vpc.id +} + +resource "aws_vpc_security_group_ingress_rule" "neptune_lambda_ingress" { + security_group_id = aws_security_group.graph_indexer_lambda_security_group.id + cidr_ipv4 = "0.0.0.0/0" + ip_protocol = "-1" +} + +resource "aws_vpc_security_group_egress_rule" "neptune_lambda_egress" { + security_group_id = aws_security_group.graph_indexer_lambda_security_group.id + cidr_ipv4 = "0.0.0.0/0" + ip_protocol = "-1" +} + +module "catalogue_graph_queries_topic" { + source = "github.com/wellcomecollection/terraform-aws-sns-topic.git?ref=v1.0.0" + name = "catalogue_graph_queries" +} + +# Add an SQS queue which will collect messages from SNS +module "indexer_message_queue" { + source = "github.com/wellcomecollection/terraform-aws-sqs//queue?ref=v1.2.1" + + queue_name = "catalogue-graph-indexer-message-queue" + + topic_arns = [module.catalogue_graph_queries_topic.arn] + visibility_timeout_seconds = 90 + max_receive_count = 3 + alarm_topic_arn = "arn:aws:sns:eu-west-1:760097843905:platform_dlq_non_empty_alarm" +} + +data "aws_iam_policy_document" "neptune_write" { + statement { + actions = [ + "neptune-db:*" + ] + + resources = [ + "*" + ] + } +} + +data "aws_iam_policy_document" "allow_sqs_receive_message" { + statement { + actions = [ + "sqs:ReceiveMessage", + "sqs:DeleteMessage", + "sqs:GetQueueAttributes", + "sqs:GetQueueUrl", + "sqs:ChangeMessageVisibilityBatch", + "sqs:ChangeMessageVisibility" + ] + resources = [ + module.indexer_message_queue.arn + ] + } +} + +# Allow the Lambda function to read from the queue +resource "aws_iam_role_policy" "indexer_lambda_sqs_policy" { + role = module.indexer_lambda.lambda_role.name + policy = data.aws_iam_policy_document.allow_sqs_receive_message.json +} + +resource "aws_iam_role_policy" "indexer_lambda_neptune_policy" { + role = module.indexer_lambda.lambda_role.name + policy = data.aws_iam_policy_document.neptune_write.json +} + +# This configures an EventSourceMapping which automatically polls the SQS queue for new messages and triggers +# the indexer Lambda function. All messages received in a 60 second window (defined by `maximum_batching_window_in_seconds`) +# are collected and sent to the Lambda for processing in batches of at most 10 messages (defined by `batch_size`). +# Additionally, the `maximum_concurrency` parameter ensures that there are at most 10 active indexer Lambda functions +# running at a time. +resource "aws_lambda_event_source_mapping" "sqs_to_indexer_lambda" { + event_source_arn = module.indexer_message_queue.arn + function_name = module.indexer_lambda.lambda.function_name + batch_size = 1 + enabled = true + maximum_batching_window_in_seconds = 60 + scaling_config { + maximum_concurrency = 20 + } +} + +# Give the SQS queue permission to invoke the indexer lambda +resource "aws_lambda_permission" "allow_indexer_lambda_sqs_trigger" { + action = "lambda:InvokeFunction" + function_name = module.indexer_lambda.lambda.function_name + principal = "sqs.amazonaws.com" + source_arn = module.indexer_message_queue.arn +} diff --git a/terraform/load_balancer.tf b/terraform/load_balancer.tf index 0d1301fd07..ba3c1b8c8e 100644 --- a/terraform/load_balancer.tf +++ b/terraform/load_balancer.tf @@ -26,7 +26,7 @@ resource "aws_lb_target_group_attachment" "neptune_instance_attachment" { # this setup is still more convenient than only being able to connect from within the VPC. 
# If it starts bothering us, we can create a Lambda function for dynamically updating the target group IP, as outlined # here: https://aws-samples.github.io/aws-dbs-refarch-graph/src/connecting-using-a-load-balancer/ - target_id = "172.42.180.173" + target_id = "172.42.174.101" } diff --git a/terraform/neptune.tf b/terraform/neptune.tf index 70445cbe2f..52ba8cad7c 100644 --- a/terraform/neptune.tf +++ b/terraform/neptune.tf @@ -1,5 +1,5 @@ -resource "aws_neptune_cluster" "experimental_graph_cluster" { - cluster_identifier = "neptune-test" +resource "aws_neptune_cluster" "catalogue_graph_cluster" { + cluster_identifier = "catalogue-graph" engine = "neptune" engine_version = "1.3.2.1" neptune_cluster_parameter_group_name = "default.neptune1.3" @@ -7,6 +7,7 @@ resource "aws_neptune_cluster" "experimental_graph_cluster" { apply_immediately = true storage_encrypted = true vpc_security_group_ids = [aws_security_group.neptune_security_group.id] + neptune_subnet_group_name = aws_db_subnet_group.neptune_subnet_group.name # Set minimum capacity to 1 NCU, and maximum capacity to 16 NCUs. These are the minimum possible values. serverless_v2_scaling_configuration { @@ -15,8 +16,19 @@ resource "aws_neptune_cluster" "experimental_graph_cluster" { } } +resource "aws_neptune_cluster_instance" "catalogue_graph_instance" { + cluster_identifier = aws_neptune_cluster.catalogue_graph_cluster.cluster_identifier + instance_class = "db.serverless" + neptune_parameter_group_name = "default.neptune1.3" +} + +resource "aws_db_subnet_group" "neptune_subnet_group" { + name = "catalogue-graph" + subnet_ids = local.private_subnets +} + resource "aws_security_group" "neptune_security_group" { - name = "neptune-test" + name = "catalogue-graph-neptune" vpc_id = data.aws_vpc.vpc.id } @@ -33,5 +45,5 @@ resource "aws_secretsmanager_secret" "neptune_cluster_endpoint" { resource "aws_secretsmanager_secret_version" "neptune_cluster_endpoint_value" { secret_id = aws_secretsmanager_secret.neptune_cluster_endpoint.id - secret_string = aws_neptune_cluster.experimental_graph_cluster.endpoint + secret_string = aws_neptune_cluster.catalogue_graph_cluster.endpoint } diff --git a/test.py b/test.py deleted file mode 100644 index c21cd02a71..0000000000 --- a/test.py +++ /dev/null @@ -1,34 +0,0 @@ -import boto3 - -from extractors.loc_concepts_extractor import LibraryOfCongressConceptsExtractor -from clients.neptune_client import NeptuneClient -from clients.cypher_client import CypherClient - -import itertools - - -def _get_secret(secret_name: str): - session = boto3.Session(profile_name="platform-developer") - secrets_manager_client = session.client('secretsmanager', region_name='eu-west-1') - response = secrets_manager_client.get_secret_value(SecretId=secret_name) - - return response['SecretString'] - - -url = "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" -loc_extractor = LibraryOfCongressConceptsExtractor(url) - -neptune_client = NeptuneClient(_get_secret("NeptuneTest/LoadBalancerUrl"), _get_secret("NeptuneTest/InstanceEndpoint")) -cypher_client = CypherClient(neptune_client) - -sample_nodes = loc_extractor.extract_sample_nodes(3000) -sample_edges = loc_extractor.extract_sample_edges(300) - -while True: - chunk = list(itertools.islice(sample_edges, 1)) - if chunk: - print(cypher_client.upsert_edges(chunk)) - #print(cypher_client.create_source_concept_nodes(chunk)) - else: - break - From d8810e8d6f6def937cae82418e9731e4bd36422b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 17 
Dec 2024 11:54:38 +0000 Subject: [PATCH 008/310] Add support for LoC names and locations & refactoring --- src/clients/local_neptune_client.py | 2 +- src/extractor.py | 152 ++++++++++++++---- src/extractors/loc/concepts_extractor.py | 140 ---------------- src/models/graph_edge.py | 3 +- src/models/graph_node.py | 35 ++-- src/query_builders/cypher.py | 6 + src/sources/base_source.py | 7 + src/sources/{gzip.py => gzip_source.py} | 13 +- src/transformers/base_transformer.py | 85 ++++++++++ .../loc/__init__.py | 0 src/transformers/loc/concepts_transformer.py | 33 ++++ src/transformers/loc/locations_transformer.py | 33 ++++ src/transformers/loc/names_transformer.py | 29 ++++ src/transformers/loc/raw_concept.py | 113 +++++++++++++ 14 files changed, 463 insertions(+), 188 deletions(-) delete mode 100644 src/extractors/loc/concepts_extractor.py create mode 100644 src/sources/base_source.py rename src/sources/{gzip.py => gzip_source.py} (54%) create mode 100644 src/transformers/base_transformer.py rename src/{extractors => transformers}/loc/__init__.py (100%) create mode 100644 src/transformers/loc/concepts_transformer.py create mode 100644 src/transformers/loc/locations_transformer.py create mode 100644 src/transformers/loc/names_transformer.py create mode 100644 src/transformers/loc/raw_concept.py diff --git a/src/clients/local_neptune_client.py b/src/clients/local_neptune_client.py index c0057a4f99..ea192dcfb2 100644 --- a/src/clients/local_neptune_client.py +++ b/src/clients/local_neptune_client.py @@ -10,7 +10,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) -class NeptuneClient: +class LocalNeptuneClient: def __init__(self, load_balancer_url: str, neptune_endpoint: str): self.load_balancer_url = load_balancer_url self.neptune_endpoint = neptune_endpoint diff --git a/src/extractor.py b/src/extractor.py index 5e1dac635e..6a9014dbd6 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -1,62 +1,148 @@ -import itertools +import argparse import json -from collections.abc import Generator import enum from typing import Literal import boto3 -from extractors.loc.concepts_extractor import LibraryOfCongressConceptsExtractor +from transformers.loc.concepts_transformer import LibraryOfCongressConceptsTransformer +from transformers.loc.names_transformer import LibraryOfCongressNamesTransformer +from transformers.loc.locations_transformer import LibraryOfCongressLocationsTransformer +from transformers.base_transformer import BaseTransformer import query_builders.cypher as cypher +from clients.local_neptune_client import LocalNeptuneClient +from clients.neptune_client import NeptuneClient -CHUNK_SIZE = 100 -LOC_SH_URL = "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" +QUERY_CHUNK_SIZE = 200 +LOC_SUBJECT_HEADINGS_URL = ( + "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" +) +LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" -def publish_to_sns(query: str): - client = boto3.client("sns") - client.publish( - TopicArn="arn:aws:sns:eu-west-1:760097843905:catalogue_graph_queries", - Message=json.dumps({"default": query}), +GRAPH_QUERIES_SNS_TOPIC_ARN = ( + "arn:aws:sns:eu-west-1:760097843905:catalogue_graph_queries" +) + + +def _get_secret(secret_name: str): + secrets_manager_client = boto3.client("secretsmanager", region_name="eu-west-1") + response = secrets_manager_client.get_secret_value(SecretId=secret_name) + + return response["SecretString"] + + +def publish_to_sns(queries: list[str]): + messages = 
[json.dumps({"default": query}) for query in queries] + + boto3.client("sns").publish_batch( + TopicArn=GRAPH_QUERIES_SNS_TOPIC_ARN, + Message=messages, MessageStructure="json", ) -def _generator_to_chunks(items: Generator): - while True: - chunk = list(itertools.islice(items, CHUNK_SIZE)) - if chunk: - yield chunk - else: - return +class GraphTransformer(enum.Enum): + LOC_SUBJECT_HEADINGS = LibraryOfCongressConceptsTransformer( + LOC_SUBJECT_HEADINGS_URL + ) + LOC_NAMES = LibraryOfCongressNamesTransformer(LOC_NAMES_URL) + LOC_LOCATIONS = LibraryOfCongressLocationsTransformer( + LOC_SUBJECT_HEADINGS_URL, LOC_NAMES_URL + ) + def __str__(self): + return self.name.lower() -class GraphExtractorType(enum.Enum): - LOC_SH = LibraryOfCongressConceptsExtractor(LOC_SH_URL) - LOC_LOCATION = LibraryOfCongressConceptsExtractor(LOC_SH_URL) + @staticmethod + def argparse(s): + return GraphTransformer[s.upper()] -def extract_all( - extractor_type: GraphExtractorType, entity_type: Literal["nodes", "edges"] +def stream_to_sns( + transformer_type: GraphTransformer, + entity_type: Literal["nodes", "edges"], + sample_size: int = None, ): - extractor = GraphExtractorType[extractor_type.name].value + """Streams selected entities (nodes or edges) into SNS via the selected Transformer.""" + queries = [] - if entity_type == "nodes": - entities = extractor.extract_nodes() - elif entity_type == "edges": - entities = extractor.extract_edges() - else: - raise ValueError("Unsupported entity type.") + transformer: BaseTransformer = GraphTransformer[transformer_type.name].value + + for chunk in transformer.stream_chunks(entity_type, QUERY_CHUNK_SIZE, sample_size): + queries.append(cypher.construct_upsert_nodes_query(chunk)) - for chunk in _generator_to_chunks(entities): + # SNS supports a maximum batch size of 10 + if len(queries) >= 10: + publish_to_sns(queries) + queries = [] + + if len(queries) > 0: + publish_to_sns(queries) + + +def stream_to_graph( + neptune_client: NeptuneClient | LocalNeptuneClient, + transformer_type: GraphTransformer, + entity_type: Literal["nodes", "edges"], + sample_size: int = None, +): + """Streams selected entities (nodes or edges) directly into Neptune via the selected Transformer. + Only used when testing the `extractor` locally. 
+ """ + transformer: BaseTransformer = GraphTransformer[transformer_type.name].value + + queries = 0 + for chunk in transformer.stream_chunks(entity_type, QUERY_CHUNK_SIZE, sample_size): query = cypher.construct_upsert_nodes_query(chunk) - publish_to_sns(query) + queries += 1 + + if queries % 10 == 0: + print(queries) + # neptune_client.run_open_cypher_query(query) def lambda_handler(event: dict, context): - extract_all(GraphExtractorType.LOC_SH, "edges") + stream_to_sns(GraphTransformer.LOC_NAMES, "nodes") if __name__ == "__main__": - lambda_handler({}, None) + parser = argparse.ArgumentParser(description="") + parser.add_argument( + "--transformer-type", + type=GraphTransformer.argparse, + choices=list(GraphTransformer), + help="", + required=True, + ) + parser.add_argument( + "--entity-type", + type=str, + choices=["nodes", "edges"], + help="", + required=True, + ) + parser.add_argument( + "--stream-to", + type=str, + choices=["sns", "graph"], + help="", + required=True, + ) + parser.add_argument("--sample-size", type=int, help="") + args = parser.parse_args() + + if args.stream_to == "graph": + client = LocalNeptuneClient( + _get_secret("NeptuneTest/LoadBalancerUrl"), + _get_secret("NeptuneTest/InstanceEndpoint"), + ) + stream_to_graph( + client, + args.transformer_type, + args.entity_type, + args.sample_size, + ) + else: + stream_to_sns(args.transformer_type, args.entity_type, args.sample_size) diff --git a/src/extractors/loc/concepts_extractor.py b/src/extractors/loc/concepts_extractor.py deleted file mode 100644 index c421b1d8c9..0000000000 --- a/src/extractors/loc/concepts_extractor.py +++ /dev/null @@ -1,140 +0,0 @@ -from collections.abc import Generator -from models.graph_node import SourceConcept -from models.graph_edge import SourceConceptNarrowerThan, BaseEdge -from sources.gzip import GZipSource - - -class RawLibraryOfCongressConcept: - def __init__(self, raw_concept: dict): - self.raw_concept = raw_concept - self.source_id = self._extract_source_id() - self.raw_concept_node = self._extract_concept_node() - - @staticmethod - def remove_id_prefix(raw_id: str): - return raw_id.removeprefix("/authorities/subjects/").removeprefix( - "http://id.loc.gov/authorities/subjects/" - ) - - def _extract_source_id(self): - return self.remove_id_prefix(self.raw_concept["@id"]) - - def _extract_concept_node(self): - graph = self.raw_concept["@graph"] - concept_nodes = [ - node - for node in graph - if self.source_id in node.get("@id") and node["@type"] == "skos:Concept" - ] - - # Some LoC concepts (e.g. deprecated concepts) do not store a concept node in their graph. - # When this happens, return `None` because there is no concept for us to extract. - if len(concept_nodes) == 0: - return None - - return concept_nodes[0] - - @staticmethod - def _extract_label(raw_label: str | dict): - # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. 
- if isinstance(raw_label, str): - return raw_label - - return raw_label["@value"] - - def _extract_preferred_label(self): - raw_preferred_label = self.raw_concept_node["skos:prefLabel"] - return self._extract_label(raw_preferred_label) - - def _extract_alternative_labels(self): - raw_alternative_labels = self.raw_concept_node.get("skos:altLabel", []) - - # Raw alternative labels are either returned in a list of labels, or as a single label - # in the same format as `skos:prefLabel` - if isinstance(raw_alternative_labels, list): - return [self._extract_label(item) for item in raw_alternative_labels] - - return [self._extract_label(raw_alternative_labels)] - - def _extract_broader_concepts(self): - broader_concepts = self.raw_concept_node.get("skos:broader", []) - - # Sometimes broader concepts are returned as a list of concepts, and sometimes as just a single JSON - if isinstance(broader_concepts, dict): - broader_concepts = [broader_concepts] - - return broader_concepts - - def extract_edges(self) -> list[BaseEdge]: - if self.raw_concept_node is None: - return [] - - broader_concepts = self._extract_broader_concepts() - broader_ids = [ - self.remove_id_prefix(concept["@id"]) for concept in broader_concepts - ] - - edges = [] - for broader_id in broader_ids: - edges.append( - SourceConceptNarrowerThan(from_id=self.source_id, to_id=broader_id) - ) - - return edges - - def transform_to_source_concept(self): - """Transforms the raw LoC concept into a SourceConcept""" - if self.raw_concept_node is None: - return None - - label = self._extract_preferred_label() - alternative_labels = self._extract_alternative_labels() - - return SourceConcept( - id=self.source_id, - label=label, - source="lc-subjects", - alternative_ids=[], - alternative_labels=alternative_labels, - description=None, - ) - - -class LibraryOfCongressConceptsExtractor: - def __init__(self, url: str): - self.source = GZipSource(url) - - def extract_nodes(self, number: int = None) -> Generator[SourceConcept]: - """ - Extracts and returns SourceConcept nodes from LoC Subject Headings. - Takes an optional parameter to only extract the first `number` nodes. - """ - counter = 0 - - for raw_concept in self.source.stream_raw(): - source_concept = RawLibraryOfCongressConcept( - raw_concept - ).transform_to_source_concept() - - if source_concept: - yield source_concept - - counter += 1 - if counter == number: - return - - def extract_edges(self, number: int = None) -> Generator[BaseEdge]: - """ - Extracts and returns SourceConceptNarrowerThan edges from LoC Subject Headings. - Takes an optional parameter to only extract the first `number` edges. 
- """ - counter = 0 - - for raw_concept in self.source.stream_raw(): - edges = RawLibraryOfCongressConcept(raw_concept).extract_edges() - for edge in edges: - counter += 1 - yield edge - - if counter == number: - return diff --git a/src/models/graph_edge.py b/src/models/graph_edge.py index da0d5fddee..1c352fa48a 100644 --- a/src/models/graph_edge.py +++ b/src/models/graph_edge.py @@ -8,12 +8,11 @@ class BaseEdge(BaseModel): to_id: str relationship: str directed: bool + attributes: dict = {} class SourceConceptNarrowerThan(BaseEdge): from_type: str = "SourceConcept" to_type: str = "SourceConcept" - from_id: str - to_id: str relationship: str = "NARROWER_THAN" directed: bool = True diff --git a/src/models/graph_node.py b/src/models/graph_node.py index 8069fd4421..c02e53b76d 100644 --- a/src/models/graph_node.py +++ b/src/models/graph_node.py @@ -1,29 +1,42 @@ +import datetime + from pydantic import BaseModel from typing import Literal, Optional +# Each node must have a label and an id class BaseNode(BaseModel): id: str + label: str +# Represents a LoC, MeSH, or Wikidata concept. +# The `id` field stores a unique identifier provided by the source vocabulary/ontology class SourceConcept(BaseNode): - # Unique identifier provided by the source vocabulary - id: str - # Label/preferred term from source vocabulary - label: str - source: Literal["nlm-mesh", "lc-subjects", "wikidata"] + source: Literal["nlm-mesh", "lc-subjects", "lc-names", "wikidata"] # For example MeSH tree numbers or other identifiers from Wikidata - alternative_ids: list[str] + alternative_ids: list[str] = [] # LoC variants, MeSH concepts other than preferred term - alternative_labels: list[str] + alternative_labels: list[str] = [] # Concept description, such as MeSH scope note or Wikidata description - description: Optional[str] + description: Optional[str] = None + +# Represents a LoC or Wikidata location. Inherits all fields from SourceConcept, plus optional coordinates. +class SourceLocation(SourceConcept): + latitude: Optional[float] = None # Coordinates from Wikidata + longitude: Optional[float] = None # Coordinates from Wikidata + +# Represents a LoC or Wikidata name. Inherits all fields from SourceConcept, plus other optional fields. 
+class SourceName(SourceConcept): + date_of_birth: Optional[datetime.date] = None + date_of_death: Optional[datetime.date] = None + place_of_birth: Optional[str] = None + + +# The `id` field stores a canonical Wellcome identifier class Concept(BaseNode): - # Unique Wellcome identifier - id: str - label: str type: Literal[ "Person", "Concept", diff --git a/src/query_builders/cypher.py b/src/query_builders/cypher.py index 498d0bb4eb..fe5bb8379a 100644 --- a/src/query_builders/cypher.py +++ b/src/query_builders/cypher.py @@ -63,6 +63,10 @@ def construct_upsert_edges_query(edges: list[BaseEdge]): from_type = edges[0].from_type to_type = edges[0].to_type relationship = edges[0].relationship + attributes = edges[0].attributes or dict() + + field_set = [f"n.{f} = data.{f}" for f in attributes.keys()] + field_set_statement = ", ".join(field_set) joined_cypher_maps = ",\n".join( [_pydantic_object_to_cypher_map(edge) for edge in edges] @@ -74,5 +78,7 @@ def construct_upsert_edges_query(edges: list[BaseEdge]): MATCH (a:{from_type} {{id: data.from_id}}) MATCH (b:{to_type} {{id: data.to_id}}) MERGE (a)-[r:{relationship}]->(b) + ON CREATE SET r={field_set_statement} + ON MATCH SET r={field_set_statement} """ return query diff --git a/src/sources/base_source.py b/src/sources/base_source.py new file mode 100644 index 0000000000..84e59ce540 --- /dev/null +++ b/src/sources/base_source.py @@ -0,0 +1,7 @@ +from collections.abc import Generator + + +class BaseSource: + def stream_raw(self) -> Generator[dict]: + raise NotImplementedError("Each source must implement a `stream_raw` method.") + diff --git a/src/sources/gzip.py b/src/sources/gzip_source.py similarity index 54% rename from src/sources/gzip.py rename to src/sources/gzip_source.py index 48b0fdedcd..57ea91dfe0 100644 --- a/src/sources/gzip.py +++ b/src/sources/gzip_source.py @@ -2,9 +2,10 @@ from collections.abc import Generator import gzip import json +from .base_source import BaseSource -class GZipSource: +class GZipSource(BaseSource): def __init__(self, url: str): self.url = url @@ -14,3 +15,13 @@ def stream_raw(self) -> Generator[dict]: with gzip.GzipFile(fileobj=response.raw) as file: for line_bytes in file: yield json.loads(line_bytes.decode("utf8")) + + +class MultiGZipSource(BaseSource): + def __init__(self, urls: list[str]): + self.urls = urls + + def stream_raw(self) -> Generator[dict]: + for url in self.urls: + source = GZipSource(url) + yield from source.stream_raw() diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py new file mode 100644 index 0000000000..d62fd0232e --- /dev/null +++ b/src/transformers/base_transformer.py @@ -0,0 +1,85 @@ +from collections.abc import Generator +from models.graph_edge import BaseEdge +from models.graph_node import BaseNode +from sources.base_source import BaseSource + +from typing import Literal +from itertools import islice + + +def _generator_to_chunks(items: Generator, chunk_size: int) -> Generator: + while True: + chunk = list(islice(items, chunk_size)) + if chunk: + yield chunk + else: + return + + +class BaseTransformer: + def __init__(self): + self.source: BaseSource = BaseSource() + + def transform_node(self, raw_node: dict) -> BaseNode | None: + raise NotImplementedError( + "Each transformer must implement a `transform_node` method." + ) + + def extract_edges(self, raw_node: dict) -> Generator[BaseEdge]: + raise NotImplementedError( + "Each transformer must implement an `extract_edges` method." 
+ ) + + def stream_nodes(self, number: int = None) -> Generator[BaseNode]: + """ + Extracts nodes from the specified source and transforms them. The `source` must define a `stream_raw` method. + Takes an optional parameter to only extract the first `number` nodes. + """ + counter = 0 + + for raw_node in self.source.stream_raw(): + node = self.transform_node(raw_node) + + if node: + yield node + + counter += 1 + if counter == number: + return + + def stream_edges(self, number: int = None) -> Generator[BaseEdge]: + """ + Extracts edges from the specified source and transforms them. The `source` must define a `stream_raw` method. + Takes an optional parameter to only extract the first `number` edges. + """ + counter = 0 + + for raw_node in self.source.stream_raw(): + edges = self.extract_edges(raw_node) + + for edge in edges: + yield edge + + counter += 1 + if counter == number: + return + + def stream_chunks( + self, + entity_type: Literal["nodes", "edges"], + chunk_size: int, + sample_size: int = None, + ) -> Generator[list[BaseNode | BaseEdge]]: + """ + Extracts the specified entity type (nodes or edges) from its source, transforms them, + and returns the results stream in fixed-size chunks. + """ + if entity_type == "nodes": + entities = self.stream_nodes(sample_size) + elif entity_type == "edges": + entities = self.stream_edges(sample_size) + else: + raise ValueError("Unsupported entity type.") + + for chunk in _generator_to_chunks(entities, chunk_size): + yield chunk diff --git a/src/extractors/loc/__init__.py b/src/transformers/loc/__init__.py similarity index 100% rename from src/extractors/loc/__init__.py rename to src/transformers/loc/__init__.py diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py new file mode 100644 index 0000000000..d0e16b6bdc --- /dev/null +++ b/src/transformers/loc/concepts_transformer.py @@ -0,0 +1,33 @@ +from sources.gzip_source import GZipSource +from transformers.base_transformer import BaseTransformer +from collections.abc import Generator +from models.graph_node import SourceConcept +from models.graph_edge import SourceConceptNarrowerThan +from .raw_concept import RawLibraryOfCongressConcept + + +class LibraryOfCongressConceptsTransformer(BaseTransformer): + def __init__(self, url: str): + self.source = GZipSource(url) + + def transform_node(self, raw_node: dict) -> SourceConcept | None: + raw_concept = RawLibraryOfCongressConcept(raw_node) + + if raw_concept.exclude() or raw_concept.is_geographic: + return None + + return SourceConcept( + id=raw_concept.source_id, + label=raw_concept.label, + source=raw_concept.source, + alternative_labels=raw_concept.alternative_labels, + ) + + def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan]: + raw_concept = RawLibraryOfCongressConcept(raw_node) + + if raw_concept.exclude() or raw_concept.is_geographic: + return [] + + for broader_id in raw_concept.broader_concept_ids: + yield SourceConceptNarrowerThan(from_id=self.source_id, to_id=broader_id) diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py new file mode 100644 index 0000000000..393ef8e2b0 --- /dev/null +++ b/src/transformers/loc/locations_transformer.py @@ -0,0 +1,33 @@ +from sources.gzip_source import MultiGZipSource +from transformers.base_transformer import BaseTransformer +from collections.abc import Generator +from models.graph_node import SourceLocation +from models.graph_edge import SourceConceptNarrowerThan +from 
.raw_concept import RawLibraryOfCongressConcept + + +class LibraryOfCongressLocationsTransformer(BaseTransformer): + def __init__(self, subject_headings_url: str, names_url: str): + self.source = MultiGZipSource([subject_headings_url, names_url]) + + def transform_node(self, raw_node: dict) -> SourceLocation | None: + raw_concept = RawLibraryOfCongressConcept(raw_node) + + if raw_concept.exclude() or not raw_concept.is_geographic: + return None + + return SourceLocation( + id=raw_concept.source_id, + label=raw_concept.label, + source=raw_concept.source, + alternative_labels=raw_concept.alternative_labels, + ) + + def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan]: + raw_concept = RawLibraryOfCongressConcept(raw_node) + + if raw_concept.exclude() or not raw_concept.is_geographic: + return [] + + for broader_id in raw_concept.broader_concept_ids: + yield SourceConceptNarrowerThan(from_id=self.source_id, to_id=broader_id) diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py new file mode 100644 index 0000000000..a6985d30c3 --- /dev/null +++ b/src/transformers/loc/names_transformer.py @@ -0,0 +1,29 @@ +from sources.gzip_source import GZipSource +from transformers.base_transformer import BaseTransformer + +from collections.abc import Generator +from models.graph_node import SourceName +from models.graph_edge import BaseEdge +from .raw_concept import RawLibraryOfCongressConcept + + +class LibraryOfCongressNamesTransformer(BaseTransformer): + def __init__(self, url: str): + self.source = GZipSource(url) + + def transform_node(self, raw_node: dict) -> SourceName | None: + raw_concept = RawLibraryOfCongressConcept(raw_node) + + if raw_concept.exclude() or raw_concept.is_geographic: + return None + + return SourceName( + id=raw_concept.source_id, + label=raw_concept.label, + source=raw_concept.source, + alternative_labels=raw_concept.alternative_labels, + ) + + def extract_edges(self, raw_node: dict) -> Generator[BaseEdge]: + # At the moment there are no edges to extract. Return an empty generator. + yield from [] diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py new file mode 100644 index 0000000000..08c6fac039 --- /dev/null +++ b/src/transformers/loc/raw_concept.py @@ -0,0 +1,113 @@ +class RawLibraryOfCongressConcept: + def __init__(self, raw_concept: dict): + self.raw_concept = raw_concept + self._raw_concept_node = self._extract_concept_node() + + @staticmethod + def _remove_id_prefix(raw_id: str): + prefixes_to_remove = [ + "/authorities/subjects/", + "http://id.loc.gov/authorities/subjects/", + "/authorities/names/", + ] + + for prefix in prefixes_to_remove: + raw_id = raw_id.removeprefix(prefix) + + return raw_id + + def _extract_concept_node(self): + graph = self.raw_concept["@graph"] + concept_nodes = [ + node + for node in graph + if self.source_id in node.get("@id") and node["@type"] == "skos:Concept" + ] + + # Some LoC concepts (e.g. deprecated concepts) do not store a concept node in their graph. + # When this happens, return `None` because there is no concept for us to extract. + if len(concept_nodes) == 0: + return None + + return concept_nodes[0] + + @staticmethod + def _extract_label(raw_label: str | dict): + # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. + if isinstance(raw_label, str): + return raw_label + # Very rarely, labels are returned as a list of strings. 
When this happens, + # we only return the first item in the list. + if isinstance(raw_label, list): + return raw_label[0] + + return raw_label["@value"] + + def exclude(self): + if self._raw_concept_node is None: + return True + + # Remove concepts whose IDs have the "-781" suffix. They are duplicates of concepts with non-suffixed IDs. + # The suffix represents the fact that the concept in question is part of the LCSH - Geographic collection. + if self.source_id.endswith("-781"): + return True + + return False + + @property + def source_id(self): + return self._remove_id_prefix(self.raw_concept["@id"]) + + @property + def label(self): + raw_preferred_label = self._raw_concept_node["skos:prefLabel"] + return self._extract_label(raw_preferred_label) + + @property + def alternative_labels(self): + raw_alternative_labels = self._raw_concept_node.get("skos:altLabel", []) + + # Raw alternative labels are either returned in a list of labels, or as a single label + # in the same format as `skos:prefLabel` + if isinstance(raw_alternative_labels, list): + return [self._extract_label(item) for item in raw_alternative_labels] + + return [self._extract_label(raw_alternative_labels)] + + @property + def broader_concept_ids(self): + broader_concepts = self._raw_concept_node.get("skos:broader", []) + + # Sometimes broader concepts are returned as a list of concepts, and sometimes as just a single JSON + if isinstance(broader_concepts, dict): + broader_concepts = [broader_concepts] + + broader_ids = [ + self._remove_id_prefix(concept["@id"]) for concept in broader_concepts + ] + + return broader_ids + + @property + def is_geographic(self): + if self._raw_concept_node is None: + return False + + # Notations are sometimes returned as a single notation (with a `@type` property, and a `@value` property), + # and sometimes as a list of notations. 
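        # Illustrative (hypothetical) shapes of `skos:notation`:
        #   {"@type": "http://id.loc.gov/datatypes/codes/gac", "@value": "n-us"}
        #   [{"@type": "http://id.loc.gov/datatypes/codes/gac", "@value": "e-uk"}, ...]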
+ notation = self._raw_concept_node.get("skos:notation", []) + if isinstance(notation, dict): + notation = [notation] + + notation_types = {item.get("@type") for item in notation} + return "http://id.loc.gov/datatypes/codes/gac" in notation_types + + @property + def source(self): + if "subjects" in self.raw_concept["@id"]: + return "lc-subjects" + + if "names" in self.raw_concept["@id"]: + return "lc-names" + + raise ValueError("Unknown concept type.") From 03c7dbed0328048b549b17b06ae1618cb7606480 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 19 Dec 2024 09:56:18 +0000 Subject: [PATCH 009/310] Add support for bulk loading into Neptune --- src/bulk_loader.py | 59 ++++++ src/clients/base_neptune_client.py | 117 ++++++++++++ src/clients/lambda_neptune_client.py | 18 ++ src/clients/local_neptune_client.py | 54 ++---- src/clients/neptune_client.py | 15 -- src/converters/cypher/__init__.py | 0 src/converters/cypher/base_converter.py | 39 ++++ src/converters/cypher/bulk_load_converter.py | 46 +++++ src/converters/cypher/query_converter.py | 55 ++++++ src/extractor.py | 172 ++++++------------ src/indexer.py | 38 ++-- src/query_builders/cypher.py | 86 ++++----- src/requirements.in | 2 + src/requirements.txt | 8 +- src/transformers/__init__.py | 0 src/transformers/base_transformer.py | 132 ++++++++++++-- src/transformers/loc/concepts_transformer.py | 4 +- src/transformers/loc/locations_transformer.py | 4 +- src/transformers/loc/raw_concept.py | 25 ++- src/transformers/transformer_type.py | 30 +++ src/utils/__init__.py | 0 src/utils/aws.py | 39 ++++ terraform/bulk_loader.tf | 52 ++++++ .../{extractor_lambda.tf => extractor.tf} | 0 terraform/{indexer_lambda.tf => indexer.tf} | 11 +- terraform/neptune.tf | 26 +++ 26 files changed, 762 insertions(+), 270 deletions(-) create mode 100644 src/bulk_loader.py create mode 100644 src/clients/base_neptune_client.py create mode 100644 src/clients/lambda_neptune_client.py delete mode 100644 src/clients/neptune_client.py create mode 100644 src/converters/cypher/__init__.py create mode 100644 src/converters/cypher/base_converter.py create mode 100644 src/converters/cypher/bulk_load_converter.py create mode 100644 src/converters/cypher/query_converter.py create mode 100644 src/transformers/__init__.py create mode 100644 src/transformers/transformer_type.py create mode 100644 src/utils/__init__.py create mode 100644 src/utils/aws.py create mode 100644 terraform/bulk_loader.tf rename terraform/{extractor_lambda.tf => extractor.tf} (100%) rename terraform/{indexer_lambda.tf => indexer.tf} (88%) diff --git a/src/bulk_loader.py b/src/bulk_loader.py new file mode 100644 index 0000000000..673f13951a --- /dev/null +++ b/src/bulk_loader.py @@ -0,0 +1,59 @@ +from transformers.base_transformer import EntityType +from transformers.transformer_type import TransformerType +from utils.aws import get_neptune_client + +import typing +import argparse +import time + +S3_BULK_LOAD_BUCKET_NAME = "wellcomecollection-neptune-graph-loader" + + +def handler(transformer_type: str, entity_type: EntityType, is_local=False): + file_name = f"{transformer_type}__{entity_type}.csv" + s3_file_uri = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{file_name}" + + neptune_client = get_neptune_client(is_local) + load_id = neptune_client.initiate_bulk_load(s3_file_uri=s3_file_uri) + + while True: + final_status = neptune_client.get_bulk_load_status(load_id) + + if final_status is not None: + break + + time.sleep(20) + + if final_status != "LOAD_COMPLETED": + raise 
Exception("Load failed. See error log above.") + + +def lambda_handler(event: dict, context): + transformer_type = TransformerType.argparse(event["transformer_type"]) + entity_type = event["entity_type"] + handler(transformer_type, entity_type) + + +def local_handler(): + parser = argparse.ArgumentParser(description="") + parser.add_argument( + "--transformer-type", + type=TransformerType.argparse, + choices=list(TransformerType), + help="Which transformer's output to bulk load.", + required=True, + ) + parser.add_argument( + "--entity-type", + type=str, + choices=typing.get_args(EntityType), + help="Which entity type to bulk load (nodes or edges).", + required=True, + ) + args = parser.parse_args() + + handler(**args.__dict__, is_local=True) + + +if __name__ == "__main__": + local_handler() diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py new file mode 100644 index 0000000000..3784267a7a --- /dev/null +++ b/src/clients/base_neptune_client.py @@ -0,0 +1,117 @@ +import backoff +import json +import requests +import datetime +from botocore.auth import SigV4Auth +from botocore.awsrequest import AWSRequest + + +class BaseNeptuneClient: + def __init__(self): + self.session = None + self.neptune_endpoint = None + self.verify_requests = True + + def _get_client_url(self): + raise NotImplementedError() + + def _make_request(self, method: str, relative_url: str, payload: dict = None): + client_url = self._get_client_url() + + url = f"{client_url}{relative_url}" + headers = {"Host": self.neptune_endpoint, "Content-Type": "application/json"} + data = json.dumps(payload or {}) + + # We use IAM database authentication, which means we need to authenticate the request using AWS Signature + request = AWSRequest(method=method, url=url, data=data, headers=headers) + SigV4Auth(self.session.get_credentials(), "neptune-db", "eu-west-1").add_auth( + request + ) + + response = requests.request( + method, url, data=data, headers=request.headers, verify=self.verify_requests + ) + + if response.status_code != 200: + raise Exception(response.content) + + return response.json() + + @backoff.on_exception(backoff.constant, Exception, max_tries=5, interval=1) + def run_open_cypher_query(self, query: str): + payload = {"query": query} + response = self._make_request("POST", "/openCypher", payload) + return response["results"] + + def get_graph_summary(self): + response = self._make_request("GET", "/propertygraph/statistics/summary") + return response["payload"]["graphSummary"] + + def reset_database(self): + # TODO: Only keep this function for testing purposes. Remove before releasing. 
+ data = {"action": "initiateDatabaseReset"} + response = self._make_request("POST", "/system", data) + reset_token = response["payload"]["token"] + + data = {"action": "performDatabaseReset", "token": reset_token} + response = self._make_request("POST", "/system", data) + + return response + + def initiate_bulk_load(self, s3_file_uri: str) -> str: + response = self._make_request( + "POST", + "/loader", + { + "source": s3_file_uri, + "format": "opencypher", + "iamRoleArn": "arn:aws:iam::760097843905:role/catalogue-graph-cluster", + "region": "eu-west-1", + "failOnError": "FALSE", + "parallelism": "MEDIUM", + "queueRequest": "TRUE", + "userProvidedEdgeIds": "TRUE", + }, + ) + return response["payload"]["loadId"] + + def get_bulk_load_status(self, load_id: str): + # Response format: https://docs.aws.amazon.com/neptune/latest/userguide/load-api-reference-status-response.html + response = self._make_request( + "GET", f"/loader?loadId={load_id}&errors=TRUE&details=TRUE" + ) + payload = response["payload"] + + overall_status = payload["overallStatus"] + status = overall_status["status"] + full_uri = overall_status["fullUri"] + processed_count = overall_status["totalRecords"] + + print(f"Bulk load status: {status}") + print(f" Source file URI: {full_uri}") + print(f" Processed records: {processed_count:,}") + + # Statuses: https://docs.aws.amazon.com/neptune/latest/userguide/loader-message.html + if status in ("LOAD_NOT_STARTED", "LOAD_IN_QUEUE", "LOAD_IN_PROGRESS"): + return + + insert_error_count = overall_status["insertErrors"] + parsing_error_count = overall_status["parsingErrors"] + data_type_error_count = overall_status["datatypeMismatchErrors"] + formatted_time = datetime.timedelta(seconds=overall_status["totalTimeSpent"]) + + print(f" Insert errors: {insert_error_count:,}") + print(f" Parsing errors: {parsing_error_count:,}") + print(f" Data type mismatch errors: {data_type_error_count:,}") + print(f" Total time spent: {formatted_time}") + + error_logs = payload["errors"]["errorLogs"] + if error_logs: + print(" First 10 errors:") + + for error_log in error_logs: + code = error_log["errorCode"] + message = error_log["errorMessage"] + print(f" {code}: {message}") + + return status diff --git a/src/clients/lambda_neptune_client.py b/src/clients/lambda_neptune_client.py new file mode 100644 index 0000000000..e348f3567f --- /dev/null +++ b/src/clients/lambda_neptune_client.py @@ -0,0 +1,18 @@ +import boto3 +import os + +from .base_neptune_client import BaseNeptuneClient + + +class LambdaNeptuneClient(BaseNeptuneClient): + def __init__(self, neptune_endpoint: str): + super().__init__() + self.neptune_endpoint = neptune_endpoint + self.session = boto3.Session( + aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), + aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), + aws_session_token=os.getenv("AWS_SESSION_TOKEN"), + ) + + def _get_client_url(self): + return f"https://{self.neptune_endpoint}:8182" diff --git a/src/clients/local_neptune_client.py b/src/clients/local_neptune_client.py index ea192dcfb2..decc84fa6d 100644 --- a/src/clients/local_neptune_client.py +++ b/src/clients/local_neptune_client.py @@ -1,51 +1,21 @@ -import json - -import requests import boto3 -from botocore.auth import SigV4Auth -from botocore.awsrequest import AWSRequest - import urllib3 -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +from .base_neptune_client import BaseNeptuneClient -class LocalNeptuneClient: +class LocalNeptuneClient(BaseNeptuneClient): def __init__(self, load_balancer_url: 
str, neptune_endpoint: str): - self.load_balancer_url = load_balancer_url - self.neptune_endpoint = neptune_endpoint - self.session = boto3.Session(profile_name="platform-developer") - - def _make_request(self, method: str, relative_url: str, payload: dict): - url = f"{self.load_balancer_url}{relative_url}" - headers = { - "Host": self.neptune_endpoint, - "Content-Type": "application/json" - } - data = json.dumps(payload) - - # We use IAM database authentication, which means we need to authenticate the request using AWS Signature - request = AWSRequest(method=method, url=url, data=data, headers=headers) - SigV4Auth(self.session.get_credentials(), "neptune-db", "eu-west-1").add_auth(request) - - # We need to send a manual request rather than using boto3 since we are accessing the instance via a NLB - # We are using the default NLB DNS name, which does not support custom SSL certificates, so we need to - # disable SSL certificate verification. This increases the risks of a man-in-the-middle attack, - # which is acceptable for a testing database. In production, we will be connecting to the database + # We are using the default NLB DNS name, which does not support custom SSL certificates, so we need to + # disable SSL certificate verification. This increases the risks of a man-in-the-middle attack, + # which is acceptable for a testing database. In production, we will be connecting to the database # directly from within the VPC. - response = requests.request(method, url, data=data, headers=request.headers, verify=False) - - if response.status_code != 200: - raise Exception(response.content) - - return response.json() + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + self.verify_requests = False - def run_open_cypher_query(self, query: str): - """Run a Cypher query against an experimental serverless Neptune cluster""" - payload = {"query": query} - response = self._make_request("POST", "/openCypher", payload) - return response['results'] + self.load_balancer_url = load_balancer_url + self.neptune_endpoint = neptune_endpoint + self.session = boto3.Session() - def get_graph_summary(self): - response = self._make_request("GET", "/propertygraph/statistics/summary", {}) - return response["payload"]['graphSummary'] + def _get_client_url(self): + return self.load_balancer_url diff --git a/src/clients/neptune_client.py b/src/clients/neptune_client.py deleted file mode 100644 index 6a22c39e6b..0000000000 --- a/src/clients/neptune_client.py +++ /dev/null @@ -1,15 +0,0 @@ -import boto3 - - -class NeptuneClient: - def __init__(self, neptune_endpoint: str): - endpoint_url = f"https://{neptune_endpoint}:8182" - self.client = boto3.client("neptunedata", endpoint_url=endpoint_url) - - def run_open_cypher_query(self, query: str): - """Run a Cypher query against the Neptune cluster""" - response = self.client.execute_open_cypher_query(openCypherQuery=query) - return response["results"] - - def get_graph_summary(self): - return self.client.get_propertygraph_summary(mode="detailed")["payload"] diff --git a/src/converters/cypher/__init__.py b/src/converters/cypher/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/converters/cypher/base_converter.py b/src/converters/cypher/base_converter.py new file mode 100644 index 0000000000..be00e4dd15 --- /dev/null +++ b/src/converters/cypher/base_converter.py @@ -0,0 +1,39 @@ +from models.graph_node import BaseNode +from models.graph_edge import BaseEdge + + +class CypherBaseConverter: + def __init__(self, model_to_convert: 
BaseEdge | BaseNode): + self.model = model_to_convert + + def _convert_str(self, raw_value: str) -> str: + return raw_value + + def _convert_bool(self, raw_value: bool) -> str: + return str(raw_value).lower() + + def _convert_none(self) -> str: + return "null" + + def _convert_list(self, raw_value: list[any]) -> str: + # Neptune does not support lists, so we convert them to a single string with a `||` separator + return self._raw_value_to_cypher_value("||".join(raw_value)) + + def _raw_value_to_cypher_value(self, raw_value: any) -> str: + if isinstance(raw_value, str): + value = self._convert_str(raw_value) + elif isinstance(raw_value, bool): + value = self._convert_bool(raw_value) + elif isinstance(raw_value, list): + value = self._convert_list(raw_value) + elif raw_value is None: + value = self._convert_none() + else: + raise TypeError( + f""" + Cannot convert type {type(raw_value)} (with value {repr(raw_value)}) into an openCypher-compatible + data type. Use a different type or add support for type {type(raw_value)}. + """ + ) + + return value diff --git a/src/converters/cypher/bulk_load_converter.py b/src/converters/cypher/bulk_load_converter.py new file mode 100644 index 0000000000..9d6df57a58 --- /dev/null +++ b/src/converters/cypher/bulk_load_converter.py @@ -0,0 +1,46 @@ +from models.graph_node import BaseNode +from models.graph_edge import BaseEdge +from .base_converter import CypherBaseConverter + + +class CypherBulkLoadConverter(CypherBaseConverter): + def __init__(self, model_to_convert: BaseEdge | BaseNode): + self.model = model_to_convert + + def _node_to_bulk_cypher(self): + bulk_node = {":ID": self.model.id, ":LABEL": type(self.model).__name__} + + for key, raw_value in dict(self.model).items(): + value = self._raw_value_to_cypher_value(raw_value) + bulk_node[key] = value + + return bulk_node + + def _edge_to_bulk_cypher(self): + bulk_edge = { + ":ID": f"{self.model.from_id}-->{self.model.to_id}", + ":START_ID": self.model.from_id, + ":END_ID": self.model.to_id, + ":TYPE": self.model.relationship, + } + + for key, raw_value in self.model.attributes.items(): + value = self._raw_value_to_cypher_value(raw_value) + bulk_edge[key] = value + + return bulk_edge + + def convert_to_bulk_cypher(self): + """ + Returns a dictionary representing the entity (node or edge), converting all values into a format compatible + with openCypher, and adding all required values for bulk upload, such as `:ID` or `:LABEL`. + See https://docs.aws.amazon.com/neptune/latest/userguide/bulk-load-tutorial-format-opencypher.html. + """ + if isinstance(self.model, BaseNode): + return self._node_to_bulk_cypher() + elif isinstance(self.model, BaseEdge): + return self._edge_to_bulk_cypher() + else: + raise ValueError( + "Unsupported Pydantic model. Each model must subclass BaseEdge or BaseNode." + ) diff --git a/src/converters/cypher/query_converter.py b/src/converters/cypher/query_converter.py new file mode 100644 index 0000000000..7e490413fd --- /dev/null +++ b/src/converters/cypher/query_converter.py @@ -0,0 +1,55 @@ +from models.graph_node import BaseNode +from models.graph_edge import BaseEdge +from .base_converter import CypherBaseConverter + + +class CypherQueryConverter(CypherBaseConverter): + def __init__(self, model_to_convert: BaseEdge | BaseNode): + self.model = model_to_convert + + def _convert_str(self, raw_value: str) -> str: + # All strings need to be surrounded in single quotation marks, and all single quotation marks + # which were already present in the string need to be escaped. 
+ escaped = raw_value.replace("'", "\\'") + return f"'{escaped}'" + + def _node_to_cypher_map(self) -> str: + properties = [] + + for key, raw_value in dict(self.model).items(): + value = self._raw_value_to_cypher_value(raw_value) + properties.append(f"{key}: {value}") + + return "{" + ", ".join(properties) + "}" + + def _edge_to_cypher_map(self) -> str: + properties = [] + + for key, raw_value in self.model.attributes.items(): + value = self._raw_value_to_cypher_value(raw_value) + properties.append(f"{key}: {value}") + + for key, raw_value in self.model: + if key in ("from_id", "to_id"): + value = self._raw_value_to_cypher_value(raw_value) + properties.append(f"{key}: {value}") + + return "{" + ", ".join(properties) + "}" + + def convert_to_cypher_map(self): + """ + Returns a string representing an openCypher Map of the entity (node or edge) for use with an `UNWIND` query. + + For example, the Pydantic model `BaseNode(id="someId123", label="Some Label")` would be converted into + the string `{id: 'someId123', label: 'Some Label'}`. + + See https://neo4j.com/docs/cypher-manual/current/values-and-types/maps/. + """ + if isinstance(self.model, BaseNode): + return self._node_to_cypher_map() + elif isinstance(self.model, BaseEdge): + return self._edge_to_cypher_map() + else: + raise ValueError( + "Unsupported Pydantic model. Each model must subclass BaseEdge or BaseNode." + ) diff --git a/src/extractor.py b/src/extractor.py index 6a9014dbd6..32df78e676 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -1,148 +1,88 @@ import argparse -import json -import enum -from typing import Literal +import typing -import boto3 - -from transformers.loc.concepts_transformer import LibraryOfCongressConceptsTransformer -from transformers.loc.names_transformer import LibraryOfCongressNamesTransformer -from transformers.loc.locations_transformer import LibraryOfCongressLocationsTransformer -from transformers.base_transformer import BaseTransformer -import query_builders.cypher as cypher - -from clients.local_neptune_client import LocalNeptuneClient -from clients.neptune_client import NeptuneClient +from utils.aws import get_neptune_client +from transformers.base_transformer import BaseTransformer, EntityType, StreamDestination +from transformers.transformer_type import TransformerType QUERY_CHUNK_SIZE = 200 -LOC_SUBJECT_HEADINGS_URL = ( - "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" -) -LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" - -GRAPH_QUERIES_SNS_TOPIC_ARN = ( - "arn:aws:sns:eu-west-1:760097843905:catalogue_graph_queries" -) - - -def _get_secret(secret_name: str): - secrets_manager_client = boto3.client("secretsmanager", region_name="eu-west-1") - response = secrets_manager_client.get_secret_value(SecretId=secret_name) - - return response["SecretString"] - - -def publish_to_sns(queries: list[str]): - messages = [json.dumps({"default": query}) for query in queries] - - boto3.client("sns").publish_batch( - TopicArn=GRAPH_QUERIES_SNS_TOPIC_ARN, - Message=messages, - MessageStructure="json", - ) - - -class GraphTransformer(enum.Enum): - LOC_SUBJECT_HEADINGS = LibraryOfCongressConceptsTransformer( - LOC_SUBJECT_HEADINGS_URL - ) - LOC_NAMES = LibraryOfCongressNamesTransformer(LOC_NAMES_URL) - LOC_LOCATIONS = LibraryOfCongressLocationsTransformer( - LOC_SUBJECT_HEADINGS_URL, LOC_NAMES_URL - ) - - def __str__(self): - return self.name.lower() - - @staticmethod - def argparse(s): - return GraphTransformer[s.upper()] - - -def stream_to_sns( - 
transformer_type: GraphTransformer, - entity_type: Literal["nodes", "edges"], - sample_size: int = None, -): - """Streams selected entities (nodes or edges) into SNS via the selected Transformer.""" - queries = [] - - transformer: BaseTransformer = GraphTransformer[transformer_type.name].value - - for chunk in transformer.stream_chunks(entity_type, QUERY_CHUNK_SIZE, sample_size): - queries.append(cypher.construct_upsert_nodes_query(chunk)) - - # SNS supports a maximum batch size of 10 - if len(queries) >= 10: - publish_to_sns(queries) - queries = [] - - if len(queries) > 0: - publish_to_sns(queries) +S3_BULK_LOAD_BUCKET_NAME = "wellcomecollection-neptune-graph-loader" +SNS_QUERY_TOPIC_ARN = "arn:aws:sns:eu-west-1:760097843905:catalogue_graph_queries" -def stream_to_graph( - neptune_client: NeptuneClient | LocalNeptuneClient, - transformer_type: GraphTransformer, - entity_type: Literal["nodes", "edges"], +def handler( + stream_destination: StreamDestination, + transformer_type: TransformerType, + entity_type: EntityType, sample_size: int = None, + is_local=False, ): - """Streams selected entities (nodes or edges) directly into Neptune via the selected Transformer. - Only used when testing the `extractor` locally. - """ - transformer: BaseTransformer = GraphTransformer[transformer_type.name].value + print( + f"Transforming {sample_size or 'all'} {entity_type} using the {transformer_type} " + f"transformer and streaming them into {stream_destination}." + ) - queries = 0 - for chunk in transformer.stream_chunks(entity_type, QUERY_CHUNK_SIZE, sample_size): - query = cypher.construct_upsert_nodes_query(chunk) - queries += 1 + transformer: BaseTransformer = TransformerType[transformer_type.name].value - if queries % 10 == 0: - print(queries) - # neptune_client.run_open_cypher_query(query) + if stream_destination == "graph": + neptune_client = get_neptune_client(is_local) + transformer.stream_to_graph( + neptune_client, entity_type, QUERY_CHUNK_SIZE, sample_size + ) + elif stream_destination == "s3": + file_name = f"{transformer_type}__{entity_type}.csv" + s3_uri = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{file_name}" + transformer.stream_to_s3(s3_uri, entity_type, sample_size) + elif stream_destination == "sns": + transformer.stream_to_sns( + SNS_QUERY_TOPIC_ARN, entity_type, QUERY_CHUNK_SIZE, sample_size + ) + else: + raise ValueError("Unsupported stream destination.") def lambda_handler(event: dict, context): - stream_to_sns(GraphTransformer.LOC_NAMES, "nodes") + stream_destination = event["stream_destination"] + transformer_type = TransformerType.argparse(event["transformer_type"]) + entity_type = event["entity_type"] + sample_size = event.get("sample_size") + handler(stream_destination, transformer_type, entity_type, sample_size) -if __name__ == "__main__": + +def local_handler(): parser = argparse.ArgumentParser(description="") parser.add_argument( "--transformer-type", - type=GraphTransformer.argparse, - choices=list(GraphTransformer), - help="", + type=TransformerType.argparse, + choices=list(TransformerType), + help="Which transformer to use for streaming.", required=True, ) parser.add_argument( "--entity-type", type=str, - choices=["nodes", "edges"], - help="", + choices=typing.get_args(EntityType), + help="Which entity type to transform using the specified transformer (nodes or edges).", required=True, ) parser.add_argument( - "--stream-to", + "--stream-destination", type=str, - choices=["sns", "graph"], - help="", + choices=typing.get_args(StreamDestination), + help="Where to stream the 
transformed entities.", required=True, ) - parser.add_argument("--sample-size", type=int, help="") + parser.add_argument( + "--sample-size", + type=int, + help="How many entities to stream. If not specified, streaming will continue until the source is exhausted.", + ) args = parser.parse_args() - if args.stream_to == "graph": - client = LocalNeptuneClient( - _get_secret("NeptuneTest/LoadBalancerUrl"), - _get_secret("NeptuneTest/InstanceEndpoint"), - ) - stream_to_graph( - client, - args.transformer_type, - args.entity_type, - args.sample_size, - ) - else: - stream_to_sns(args.transformer_type, args.entity_type, args.sample_size) + handler(**args.__dict__, is_local=True) + + +if __name__ == "__main__": + local_handler() diff --git a/src/indexer.py b/src/indexer.py index fae07046aa..5648ae8161 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -1,14 +1,7 @@ -import boto3 import json +import argparse -from clients.neptune_client import NeptuneClient - - -def _get_secret(secret_name: str): - secrets_manager_client = boto3.client("secretsmanager", region_name="eu-west-1") - response = secrets_manager_client.get_secret_value(SecretId=secret_name) - - return response["SecretString"] +from utils.aws import get_neptune_client def extract_sns_messages_from_sqs_event(event): @@ -21,13 +14,32 @@ def extract_sns_messages_from_sqs_event(event): return queries -def lambda_handler(event: dict, context): - queries = extract_sns_messages_from_sqs_event(event) - neptune_client = NeptuneClient(_get_secret("NeptuneTest/InstanceEndpoint")) +def handler(queries: list[str], is_local=False): + neptune_client = get_neptune_client(is_local) + + print(f"Received number of queries: {len(queries)}") for query in queries: neptune_client.run_open_cypher_query(query) +def lambda_handler(event: dict, context): + queries = extract_sns_messages_from_sqs_event(event) + handler(queries) + + +def local_handler(): + parser = argparse.ArgumentParser(description="") + parser.add_argument( + "--cypher-query", + type=str, + help="An openCypher query to run against the Neptune cluster.", + required=True, + ) + args = parser.parse_args() + + handler([args.cypher_query], is_local=True) + + if __name__ == "__main__": - lambda_handler({}, None) + local_handler() diff --git a/src/query_builders/cypher.py b/src/query_builders/cypher.py index fe5bb8379a..71532008b5 100644 --- a/src/query_builders/cypher.py +++ b/src/query_builders/cypher.py @@ -1,56 +1,21 @@ -from pydantic import BaseModel - from models.graph_edge import BaseEdge from models.graph_node import BaseNode +from converters.cypher.query_converter import CypherQueryConverter -def _value_to_cypher_value(raw_value: any): - if isinstance(raw_value, str): - escaped = raw_value.replace("'", "\\'") - value = f"'{escaped}'" - elif isinstance(raw_value, bool): - value = str(raw_value).lower() - elif isinstance(raw_value, list): - # Neptune does not support lists, so we convert them to a single string with a `||` separator - value = _value_to_cypher_value("||".join(raw_value)) - elif raw_value is None: - value = "null" - else: - raise TypeError( - f""" - Cannot convert type {type(raw_value)} (with value {repr(raw_value)}) into a Cypher data type. - Use a different type or add support for type {type(raw_value)} to CypherClient. 
- """ - ) - - return value - - -def _pydantic_object_to_cypher_map(pydantic_object: BaseModel): - properties = [] - - for key, raw_value in dict(pydantic_object).items(): - value = _value_to_cypher_value(raw_value) - properties.append(f"{key}: {value}") - - return "{" + ", ".join(properties) + "}" - - -def construct_upsert_nodes_query(source_concepts: list[BaseNode]): - model_name = type(source_concepts[0]).__name__ - all_fields = type(source_concepts[0]).model_fields.keys() +def construct_upsert_nodes_query(nodes: list[BaseNode]) -> str: + model_name = type(nodes[0]).__name__ + all_fields = type(nodes[0]).model_fields.keys() field_set = [f"n.{f} = data.{f}" for f in all_fields] field_set_statement = ", ".join(field_set) - cypher_maps = [ - _pydantic_object_to_cypher_map(concept) for concept in source_concepts - ] - joined_cypher_maps = ",\n".join(cypher_maps) + unwind_maps = [CypherQueryConverter(node).convert_to_cypher_map() for node in nodes] + joined_unwind_maps = ",\n".join(unwind_maps) query = f""" UNWIND [ - {joined_cypher_maps} + {joined_unwind_maps} ] AS data MERGE (n:{model_name} {{id: data.id}}) ON CREATE SET {field_set_statement} @@ -59,26 +24,47 @@ def construct_upsert_nodes_query(source_concepts: list[BaseNode]): return query -def construct_upsert_edges_query(edges: list[BaseEdge]): +def construct_upsert_edges_query(edges: list[BaseEdge]) -> str: from_type = edges[0].from_type to_type = edges[0].to_type relationship = edges[0].relationship attributes = edges[0].attributes or dict() field_set = [f"n.{f} = data.{f}" for f in attributes.keys()] - field_set_statement = ", ".join(field_set) + field_set_statement = ", ".join(field_set) + + if len(field_set_statement) == 0: + field_set_statement = "r={}" + + unwind_maps = [CypherQueryConverter(edge).convert_to_cypher_map() for edge in edges] + joined_unwind_maps = ",\n".join(unwind_maps) - joined_cypher_maps = ",\n".join( - [_pydantic_object_to_cypher_map(edge) for edge in edges] - ) query = f""" UNWIND [ - {joined_cypher_maps} + {joined_unwind_maps} ] AS data MATCH (a:{from_type} {{id: data.from_id}}) MATCH (b:{to_type} {{id: data.to_id}}) MERGE (a)-[r:{relationship}]->(b) - ON CREATE SET r={field_set_statement} - ON MATCH SET r={field_set_statement} + ON CREATE SET {field_set_statement} + ON MATCH SET {field_set_statement} """ return query + + +def construct_upsert_cypher_query(entities: list[BaseNode | BaseEdge]): + """ + Returns an openCypher `UNWIND` query which creates a graph node or edge for each item specified in `entities`, + or updates an existing matching node or edge. + + All passed `entities` must be instances of the same Pydantic model because labels cannot be set dynamically + in openCypher. + """ + if isinstance(entities[0], BaseNode): + return construct_upsert_nodes_query(entities) + elif isinstance(entities[0], BaseEdge): + return construct_upsert_edges_query(entities) + else: + raise ValueError( + "Unsupported Pydantic model. Each model must subclass BaseEdge or BaseNode." 
+ ) diff --git a/src/requirements.in b/src/requirements.in index 4d604a314b..f56c57658c 100644 --- a/src/requirements.in +++ b/src/requirements.in @@ -1,3 +1,5 @@ boto3 requests pydantic +backoff +smart-open diff --git a/src/requirements.txt b/src/requirements.txt index 157235162d..82d084d929 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -2,10 +2,12 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile requirements.in +# pip-compile # annotated-types==0.7.0 # via pydantic +backoff==2.2.1 + # via -r requirements.in boto3==1.35.77 # via -r requirements.in botocore==1.35.77 @@ -34,6 +36,8 @@ s3transfer==0.10.4 # via boto3 six==1.17.0 # via python-dateutil +smart-open==7.1.0 + # via -r requirements.in typing-extensions==4.12.2 # via # pydantic @@ -42,3 +46,5 @@ urllib3==2.2.3 # via # botocore # requests +wrapt==1.17.0 + # via smart-open diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index d62fd0232e..0bc9caf0e6 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -2,10 +2,21 @@ from models.graph_edge import BaseEdge from models.graph_node import BaseNode from sources.base_source import BaseSource - +from clients.base_neptune_client import BaseNeptuneClient +from query_builders.cypher import construct_upsert_cypher_query +from utils.aws import publish_to_sns +from converters.cypher.bulk_load_converter import CypherBulkLoadConverter + +import smart_open +import concurrent.futures +import boto3 +import csv from typing import Literal from itertools import islice +EntityType = Literal["nodes", "edges"] +StreamDestination = Literal["graph", "s3", "sns"] + def _generator_to_chunks(items: Generator, chunk_size: int) -> Generator: while True: @@ -30,7 +41,7 @@ def extract_edges(self, raw_node: dict) -> Generator[BaseEdge]: "Each transformer must implement an `extract_edges` method." ) - def stream_nodes(self, number: int = None) -> Generator[BaseNode]: + def _stream_nodes(self, number: int = None) -> Generator[BaseNode]: """ Extracts nodes from the specified source and transforms them. The `source` must define a `stream_raw` method. Takes an optional parameter to only extract the first `number` nodes. @@ -42,12 +53,14 @@ def stream_nodes(self, number: int = None) -> Generator[BaseNode]: if node: yield node + counter += 1 - counter += 1 + if counter % 5000 == 0: + print(f"Streamed {counter} nodes...") if counter == number: return - def stream_edges(self, number: int = None) -> Generator[BaseEdge]: + def _stream_edges(self, number: int = None) -> Generator[BaseEdge]: """ Extracts edges from the specified source and transforms them. The `source` must define a `stream_raw` method. Takes an optional parameter to only extract the first `number` edges. 
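For reference, the streaming methods above hand their output to the `_generator_to_chunks` helper whose body is not shown in this hunk; a minimal standalone sketch of that islice-based pattern (an assumption about the implementation, with simplified names) behaves like this:

from itertools import islice
from typing import Generator, Iterable

def generator_to_chunks(items: Iterable, chunk_size: int) -> Generator[list, None, None]:
    iterator = iter(items)
    while True:
        # Pull up to `chunk_size` items; stop once the source is exhausted.
        chunk = list(islice(iterator, chunk_size))
        if chunk:
            yield chunk
        else:
            return

# Example: 7 items in chunks of 3 -> [[0, 1, 2], [3, 4, 5], [6]]
assert list(generator_to_chunks(range(7), 3)) == [[0, 1, 2], [3, 4, 5], [6]]
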
@@ -61,25 +74,116 @@ def stream_edges(self, number: int = None) -> Generator[BaseEdge]: yield edge counter += 1 + if counter % 5000 == 0: + print(f"Streamed {counter} edges...") if counter == number: return - def stream_chunks( + def _stream_entities( + self, entity_type: EntityType, sample_size: int = None + ) -> Generator[BaseNode | BaseEdge]: + if entity_type == "nodes": + entities = self._stream_nodes(sample_size) + elif entity_type == "edges": + entities = self._stream_edges(sample_size) + else: + raise ValueError("Unsupported entity type.") + + yield from entities + + def _stream_chunks( self, - entity_type: Literal["nodes", "edges"], + entity_type: EntityType, chunk_size: int, sample_size: int = None, ) -> Generator[list[BaseNode | BaseEdge]]: """ - Extracts the specified entity type (nodes or edges) from its source, transforms them, + Extracts the specified entity type (nodes or edges) from its source, transforms each entity, and returns the results stream in fixed-size chunks. """ - if entity_type == "nodes": - entities = self.stream_nodes(sample_size) - elif entity_type == "edges": - entities = self.stream_edges(sample_size) - else: - raise ValueError("Unsupported entity type.") - + entities = self._stream_entities(entity_type, sample_size) for chunk in _generator_to_chunks(entities, chunk_size): yield chunk + + def stream_to_s3( + self, s3_uri: str, entity_type: EntityType, sample_size: int = None + ): + """ + Streams transformed entities (nodes or edges) into an S3 bucket for bulk loading into the Neptune cluster. + Suitable for indexing large numbers of entities in production. Provides limited observability. + """ + transport_params = {"client": boto3.client("s3")} + with smart_open.open(s3_uri, "w", transport_params=transport_params) as f: + csv_writer = None + + for entity in self._stream_entities(entity_type, sample_size): + bulk_dict = CypherBulkLoadConverter(entity).convert_to_bulk_cypher() + if csv_writer is None: + csv_writer = csv.DictWriter(f, fieldnames=bulk_dict.keys()) + csv_writer.writeheader() + + csv_writer.writerow(bulk_dict) + + def stream_to_graph( + self, + neptune_client: BaseNeptuneClient, + entity_type: EntityType, + query_chunk_size: int, + sample_size: int = None, + ): + """ + Streams transformed entities (nodes or edges) directly into Neptune using multiple threads for parallel + processing. Suitable for local testing. Not recommended for indexing large numbers of entities. + """ + chunks = self._stream_chunks(entity_type, query_chunk_size, sample_size) + + def run_query(chunk): + query = construct_upsert_cypher_query(chunk) + neptune_client.run_open_cypher_query(query) + + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = { + executor.submit(run_query, chunk) + for i, chunk in enumerate(islice(chunks, 10)) + } + + while futures: + done, futures = concurrent.futures.wait( + futures, return_when=concurrent.futures.FIRST_COMPLETED + ) + + for fut in done: + fut.result() + + for chunk in islice(chunks, len(done)): + futures.add(executor.submit(run_query, chunk)) + + def stream_to_sns( + self, + topic_arn: str, + entity_type: EntityType, + query_chunk_size: int, + sample_size: int = None, + ): + """ + Streams transformed entities (nodes or edges) into an SNS topic as openCypher queries, where they will be + consumed by the `indexer` Lambda function. 
+ """ + queries = [] + counter = 0 + + for chunk in self._stream_chunks(entity_type, query_chunk_size, sample_size): + queries.append(construct_upsert_cypher_query(chunk)) + + # SNS supports a maximum batch size of 10 + if len(queries) >= 10: + publish_to_sns(topic_arn, queries) + queries = [] + + counter += 1 + if counter % 100 == 0: + print(f"Published {counter} messages to SNS.") + + # Publish remaining messages (if any) + if len(queries) > 0: + publish_to_sns(topic_arn, queries) diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index d0e16b6bdc..b10e0a5ddd 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -30,4 +30,6 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan]: return [] for broader_id in raw_concept.broader_concept_ids: - yield SourceConceptNarrowerThan(from_id=self.source_id, to_id=broader_id) + yield SourceConceptNarrowerThan( + from_id=raw_concept.source_id, to_id=broader_id + ) diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index 393ef8e2b0..31ab0fe9a6 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -30,4 +30,6 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan]: return [] for broader_id in raw_concept.broader_concept_ids: - yield SourceConceptNarrowerThan(from_id=self.source_id, to_id=broader_id) + yield SourceConceptNarrowerThan( + from_id=raw_concept.source_id, to_id=broader_id + ) diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index 08c6fac039..8958911f2f 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -42,7 +42,7 @@ def _extract_label(raw_label: str | dict): return raw_label[0] return raw_label["@value"] - + def exclude(self): if self._raw_concept_node is None: return True @@ -73,7 +73,7 @@ def alternative_labels(self): return [self._extract_label(item) for item in raw_alternative_labels] return [self._extract_label(raw_alternative_labels)] - + @property def broader_concept_ids(self): broader_concepts = self._raw_concept_node.get("skos:broader", []) @@ -82,9 +82,14 @@ def broader_concept_ids(self): if isinstance(broader_concepts, dict): broader_concepts = [broader_concepts] - broader_ids = [ - self._remove_id_prefix(concept["@id"]) for concept in broader_concepts - ] + broader_ids = [] + for concept in broader_concepts: + # Some broader concepts have IDs in the format `_:n`. + # These IDs do not exist in the LoC source files or the LoC website, so we filter them out. + if concept["@id"].startswith("_:n"): + continue + + broader_ids.append(self._remove_id_prefix(concept["@id"])) return broader_ids @@ -92,22 +97,22 @@ def broader_concept_ids(self): def is_geographic(self): if self._raw_concept_node is None: return False - + # Notations are sometimes returned as a single notation (with a `@type` property, and a `@value` property), - # and sometimes as a list of notations. + # and sometimes as a list of notations. 
notation = self._raw_concept_node.get("skos:notation", []) if isinstance(notation, dict): notation = [notation] notation_types = {item.get("@type") for item in notation} return "http://id.loc.gov/datatypes/codes/gac" in notation_types - + @property def source(self): if "subjects" in self.raw_concept["@id"]: return "lc-subjects" - + if "names" in self.raw_concept["@id"]: return "lc-names" - + raise ValueError("Unknown concept type.") diff --git a/src/transformers/transformer_type.py b/src/transformers/transformer_type.py new file mode 100644 index 0000000000..843c4d66a2 --- /dev/null +++ b/src/transformers/transformer_type.py @@ -0,0 +1,30 @@ +import enum + +from .loc.concepts_transformer import LibraryOfCongressConceptsTransformer +from .loc.names_transformer import LibraryOfCongressNamesTransformer +from .loc.locations_transformer import LibraryOfCongressLocationsTransformer + +LOC_SUBJECT_HEADINGS_URL = ( + "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" +) +LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" + +GRAPH_QUERIES_SNS_TOPIC_ARN = ( + "arn:aws:sns:eu-west-1:760097843905:catalogue_graph_queries" +) + + +class TransformerType(enum.Enum): + LOC_CONCEPTS = LibraryOfCongressConceptsTransformer(LOC_SUBJECT_HEADINGS_URL) + LOC_NAMES = LibraryOfCongressNamesTransformer(LOC_NAMES_URL) + LOC_LOCATIONS = LibraryOfCongressLocationsTransformer( + LOC_SUBJECT_HEADINGS_URL, LOC_NAMES_URL + ) + + def __str__(self): + return self.name.lower() + + # For parsing lowercase Lambda/command line arguments + @staticmethod + def argparse(s): + return TransformerType[s.upper()] diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/utils/aws.py b/src/utils/aws.py new file mode 100644 index 0000000000..5f5347378e --- /dev/null +++ b/src/utils/aws.py @@ -0,0 +1,39 @@ +import boto3 +import json + +from clients.lambda_neptune_client import LambdaNeptuneClient +from clients.local_neptune_client import LocalNeptuneClient + + +def get_secret(secret_name: str): + secrets_manager_client = boto3.client("secretsmanager", region_name="eu-west-1") + response = secrets_manager_client.get_secret_value(SecretId=secret_name) + + return response["SecretString"] + + +def publish_to_sns(topic_arn: str, queries: list[str]): + request_entries = [] + for i, query in enumerate(queries): + request_entries.append( + { + "Id": f"batch_message_{i}", + "Message": json.dumps({"default": query}), + "MessageStructure": "json", + } + ) + + boto3.client("sns").publish_batch( + TopicArn=topic_arn, + PublishBatchRequestEntries=request_entries, + ) + + +def get_neptune_client(is_local: bool): + if is_local: + return LocalNeptuneClient( + get_secret("NeptuneTest/LoadBalancerUrl"), + get_secret("NeptuneTest/InstanceEndpoint"), + ) + else: + return LambdaNeptuneClient(get_secret("NeptuneTest/InstanceEndpoint")) diff --git a/terraform/bulk_loader.tf b/terraform/bulk_loader.tf new file mode 100644 index 0000000000..c91adac65d --- /dev/null +++ b/terraform/bulk_loader.tf @@ -0,0 +1,52 @@ +module "bulk_loader_lambda" { + source = "git@github.com:wellcomecollection/terraform-aws-lambda?ref=v1.2.0" + + name = "catalogue-graph-bulk-loader" + description = "Bulk loads entities from an S3 bucket into the Neptune database." 
+ runtime = "python3.10" + + filename = "../build.zip" + source_code_hash = filesha256("../build.zip") + + handler = "bulk_loader.lambda_handler" + memory_size = 128 + timeout = 20 // 20 seconds + + vpc_config = { + subnet_ids = local.private_subnets + security_group_ids = [aws_security_group.graph_indexer_lambda_security_group.id] + } + + # error_alarm_topic_arn = data.terraform_remote_state.monitoring.outputs["platform_lambda_error_alerts_topic_arn"] +} + +resource "aws_s3_bucket" "neptune_bulk_upload_bucket" { + bucket = "wellcomecollection-neptune-graph-loader" +} + +resource "aws_iam_role_policy_attachment" "s3_readonly_attachment" { + role = aws_iam_role.catalogue_graph_cluster.name + policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" +} + +resource "aws_iam_role_policy" "bulk_loader_lambda_read_secrets_policy" { + role = module.bulk_loader_lambda.lambda_role.name + policy = data.aws_iam_policy_document.allow_secret_read.json +} + +data "aws_iam_policy_document" "neptune_load" { + statement { + actions = [ + "neptune-db:StartLoaderJob" + ] + + resources = [ + "*" + ] + } +} + +resource "aws_iam_role_policy" "bulk_loader_lambda_neptune_policy" { + role = module.bulk_loader_lambda.lambda_role.name + policy = data.aws_iam_policy_document.neptune_load.json +} diff --git a/terraform/extractor_lambda.tf b/terraform/extractor.tf similarity index 100% rename from terraform/extractor_lambda.tf rename to terraform/extractor.tf diff --git a/terraform/indexer_lambda.tf b/terraform/indexer.tf similarity index 88% rename from terraform/indexer_lambda.tf rename to terraform/indexer.tf index 27f8858478..abd70c3231 100644 --- a/terraform/indexer_lambda.tf +++ b/terraform/indexer.tf @@ -29,7 +29,7 @@ data "aws_iam_policy_document" "allow_secret_read" { } } -resource "aws_iam_role_policy" "read_secrets_policy" { +resource "aws_iam_role_policy" "indexer_lambda_read_secrets_policy" { role = module.indexer_lambda.lambda_role.name policy = data.aws_iam_policy_document.allow_secret_read.json } @@ -109,18 +109,15 @@ resource "aws_iam_role_policy" "indexer_lambda_neptune_policy" { } # This configures an EventSourceMapping which automatically polls the SQS queue for new messages and triggers -# the indexer Lambda function. All messages received in a 60 second window (defined by `maximum_batching_window_in_seconds`) -# are collected and sent to the Lambda for processing in batches of at most 10 messages (defined by `batch_size`). -# Additionally, the `maximum_concurrency` parameter ensures that there are at most 10 active indexer Lambda functions -# running at a time. +# the indexer Lambda function. resource "aws_lambda_event_source_mapping" "sqs_to_indexer_lambda" { event_source_arn = module.indexer_message_queue.arn function_name = module.indexer_lambda.lambda.function_name - batch_size = 1 + batch_size = 5 # Maximum number of messages processed in a single Lambda run. 
enabled = true maximum_batching_window_in_seconds = 60 scaling_config { - maximum_concurrency = 20 + maximum_concurrency = 20 # Maximum number of active indexer Lambda functions running at a time } } diff --git a/terraform/neptune.tf b/terraform/neptune.tf index 52ba8cad7c..6f89aea559 100644 --- a/terraform/neptune.tf +++ b/terraform/neptune.tf @@ -8,6 +8,7 @@ resource "aws_neptune_cluster" "catalogue_graph_cluster" { storage_encrypted = true vpc_security_group_ids = [aws_security_group.neptune_security_group.id] neptune_subnet_group_name = aws_db_subnet_group.neptune_subnet_group.name + iam_roles = [aws_iam_role.catalogue_graph_cluster.arn] # Set minimum capacity to 1 NCU, and maximum capacity to 16 NCUs. These are the minimum possible values. serverless_v2_scaling_configuration { @@ -16,6 +17,23 @@ resource "aws_neptune_cluster" "catalogue_graph_cluster" { } } +resource "aws_iam_role" "catalogue_graph_cluster" { + name = "catalogue-graph-cluster" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = "rds.amazonaws.com" # Neptune uses RDS for some operations + } + Action = "sts:AssumeRole" + } + ] + }) +} + resource "aws_neptune_cluster_instance" "catalogue_graph_instance" { cluster_identifier = aws_neptune_cluster.catalogue_graph_cluster.cluster_identifier instance_class = "db.serverless" @@ -39,6 +57,14 @@ resource "aws_vpc_security_group_ingress_rule" "neptune_ingress" { ip_protocol = "-1" } +# Allow any egress traffic. The Neptune cluster needs to be able to reach the `wellcomecollection-neptune-graph-loader` +# S3 bucket for bulk loading. +resource "aws_vpc_security_group_egress_rule" "neptune_egress" { + security_group_id = aws_security_group.neptune_security_group.id + cidr_ipv4 = "0.0.0.0/0" + ip_protocol = "-1" +} + resource "aws_secretsmanager_secret" "neptune_cluster_endpoint" { name = "NeptuneTest/InstanceEndpoint" } From ca6b1e1f3574b6d345325521fc76c0d56072c6cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 20 Dec 2024 10:09:50 +0000 Subject: [PATCH 010/310] Performance improvements --- create_zip.sh | 2 +- src/converters/cypher/base_converter.py | 7 ---- src/converters/cypher/bulk_load_converter.py | 33 ++++++++++--------- src/converters/cypher/query_converter.py | 25 +++++++------- src/extractor.py | 4 +-- src/query_builders/cypher.py | 6 ++-- src/requirements.txt | 2 +- src/transformers/base_transformer.py | 27 ++++++++++------ src/transformers/loc/raw_concept.py | 34 +++++++++++--------- terraform/bulk_loader.tf | 2 +- terraform/extractor.tf | 31 +++++++++++++++--- terraform/indexer.tf | 2 +- 12 files changed, 103 insertions(+), 72 deletions(-) diff --git a/create_zip.sh b/create_zip.sh index 49f578edbd..fe0019a3de 100644 --- a/create_zip.sh +++ b/create_zip.sh @@ -2,7 +2,7 @@ mkdir -p temp cp -r src/* temp/ -pip3 install -r src/requirements.txt --platform manylinux2014_x86_64 --target temp/ --only-binary=:all: --python-version 3.10 +pip3 install -r src/requirements.txt --platform manylinux2014_x86_64 --target temp/ --only-binary=:all: --python-version 3.13 cd temp zip -r ../build.zip . 
diff --git a/src/converters/cypher/base_converter.py b/src/converters/cypher/base_converter.py index be00e4dd15..c0d562a8ad 100644 --- a/src/converters/cypher/base_converter.py +++ b/src/converters/cypher/base_converter.py @@ -1,11 +1,4 @@ -from models.graph_node import BaseNode -from models.graph_edge import BaseEdge - - class CypherBaseConverter: - def __init__(self, model_to_convert: BaseEdge | BaseNode): - self.model = model_to_convert - def _convert_str(self, raw_value: str) -> str: return raw_value diff --git a/src/converters/cypher/bulk_load_converter.py b/src/converters/cypher/bulk_load_converter.py index 9d6df57a58..a7941f2d05 100644 --- a/src/converters/cypher/bulk_load_converter.py +++ b/src/converters/cypher/bulk_load_converter.py @@ -1,45 +1,46 @@ from models.graph_node import BaseNode from models.graph_edge import BaseEdge from .base_converter import CypherBaseConverter +from typing import Literal class CypherBulkLoadConverter(CypherBaseConverter): - def __init__(self, model_to_convert: BaseEdge | BaseNode): - self.model = model_to_convert + def __init__(self, entity_type: Literal["nodes", "edges"]): + self.entity_type = entity_type - def _node_to_bulk_cypher(self): - bulk_node = {":ID": self.model.id, ":LABEL": type(self.model).__name__} + def _node_to_bulk_cypher(self, model: BaseNode): + bulk_node = {":ID": model.id, ":LABEL": type(model).__name__} - for key, raw_value in dict(self.model).items(): + for key, raw_value in model.dict().items(): value = self._raw_value_to_cypher_value(raw_value) bulk_node[key] = value return bulk_node - def _edge_to_bulk_cypher(self): + def _edge_to_bulk_cypher(self, model: BaseEdge): bulk_edge = { - ":ID": f"{self.model.from_id}-->{self.model.to_id}", - ":START_ID": self.model.from_id, - ":END_ID": self.model.to_id, - ":TYPE": self.model.relationship, + ":ID": f"{model.from_id}-->{model.to_id}", + ":START_ID": model.from_id, + ":END_ID": model.to_id, + ":TYPE": model.relationship, } - for key, raw_value in self.model.attributes.items(): + for key, raw_value in model.attributes.items(): value = self._raw_value_to_cypher_value(raw_value) bulk_edge[key] = value return bulk_edge - def convert_to_bulk_cypher(self): + def convert_to_bulk_cypher(self, model: BaseNode | BaseEdge): """ Returns a dictionary representing the entity (node or edge), converting all values into a format compatible with openCypher, and adding all required values for bulk upload, such as `:ID` or `:LABEL`. See https://docs.aws.amazon.com/neptune/latest/userguide/bulk-load-tutorial-format-opencypher.html. """ - if isinstance(self.model, BaseNode): - return self._node_to_bulk_cypher() - elif isinstance(self.model, BaseEdge): - return self._edge_to_bulk_cypher() + if self.entity_type == "nodes": + return self._node_to_bulk_cypher(model) + elif self.entity_type == "edges": + return self._edge_to_bulk_cypher(model) else: raise ValueError( "Unsupported Pydantic model. Each model must subclass BaseEdge or BaseNode." 
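To make the bulk-load output concrete: under the Neptune openCypher bulk-load format referenced in the converter above, a converted node and edge end up as flat dictionaries along these lines. The identifiers, labels and relationship name here are invented for illustration and are not taken from the patch.

example_node_row = {
    ":ID": "sh85098685",            # hypothetical LoC identifier
    ":LABEL": "SourceConcept",      # assumed model class name
    "id": "sh85098685",
    "label": "Ornithology",
    "alternative_labels": "Bird science||Birds, study of",  # list flattened with the `||` separator
}

example_edge_row = {
    ":ID": "sh85098685-->sh85015531",
    ":START_ID": "sh85098685",
    ":END_ID": "sh85015531",
    ":TYPE": "NARROWER_THAN",       # assumed relationship name
}
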
diff --git a/src/converters/cypher/query_converter.py b/src/converters/cypher/query_converter.py index 7e490413fd..b90a9b3752 100644 --- a/src/converters/cypher/query_converter.py +++ b/src/converters/cypher/query_converter.py @@ -1,11 +1,12 @@ from models.graph_node import BaseNode from models.graph_edge import BaseEdge from .base_converter import CypherBaseConverter +from typing import Literal class CypherQueryConverter(CypherBaseConverter): - def __init__(self, model_to_convert: BaseEdge | BaseNode): - self.model = model_to_convert + def __init__(self, entity_type: Literal["nodes", "edges"]): + self.entity_type = entity_type def _convert_str(self, raw_value: str) -> str: # All strings need to be surrounded in single quotation marks, and all single quotation marks @@ -13,30 +14,30 @@ def _convert_str(self, raw_value: str) -> str: escaped = raw_value.replace("'", "\\'") return f"'{escaped}'" - def _node_to_cypher_map(self) -> str: + def _node_to_cypher_map(self, model: BaseNode) -> str: properties = [] - for key, raw_value in dict(self.model).items(): + for key, raw_value in model.dict().items(): value = self._raw_value_to_cypher_value(raw_value) properties.append(f"{key}: {value}") return "{" + ", ".join(properties) + "}" - def _edge_to_cypher_map(self) -> str: + def _edge_to_cypher_map(self, model: BaseEdge) -> str: properties = [] - for key, raw_value in self.model.attributes.items(): + for key, raw_value in model.attributes.items(): value = self._raw_value_to_cypher_value(raw_value) properties.append(f"{key}: {value}") - for key, raw_value in self.model: + for key, raw_value in model: if key in ("from_id", "to_id"): value = self._raw_value_to_cypher_value(raw_value) properties.append(f"{key}: {value}") return "{" + ", ".join(properties) + "}" - def convert_to_cypher_map(self): + def convert_to_cypher_map(self, model: BaseNode | BaseEdge): """ Returns a string representing an openCypher Map of the entity (node or edge) for use with an `UNWIND` query. @@ -45,10 +46,10 @@ def convert_to_cypher_map(self): See https://neo4j.com/docs/cypher-manual/current/values-and-types/maps/. """ - if isinstance(self.model, BaseNode): - return self._node_to_cypher_map() - elif isinstance(self.model, BaseEdge): - return self._edge_to_cypher_map() + if self.entity_type == "nodes": + return self._node_to_cypher_map(model) + elif self.entity_type == "edges": + return self._edge_to_cypher_map(model) else: raise ValueError( "Unsupported Pydantic model. Each model must subclass BaseEdge or BaseNode." 
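Assumed usage of the reworked converter, mirroring the example node given in the docstring above; the printed map is the string an UNWIND upsert query would be assembled from:

from models.graph_node import BaseNode
from converters.cypher.query_converter import CypherQueryConverter

# Build a converter for nodes, then convert a single model instance.
converter = CypherQueryConverter("nodes")
cypher_map = converter.convert_to_cypher_map(BaseNode(id="someId123", label="Some Label"))
print(cypher_map)  # {id: 'someId123', label: 'Some Label'}
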
diff --git a/src/extractor.py b/src/extractor.py index 32df78e676..7a573c6185 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -6,7 +6,7 @@ from transformers.base_transformer import BaseTransformer, EntityType, StreamDestination from transformers.transformer_type import TransformerType -QUERY_CHUNK_SIZE = 200 +QUERY_CHUNK_SIZE = 256 S3_BULK_LOAD_BUCKET_NAME = "wellcomecollection-neptune-graph-loader" SNS_QUERY_TOPIC_ARN = "arn:aws:sns:eu-west-1:760097843905:catalogue_graph_queries" @@ -33,7 +33,7 @@ def handler( elif stream_destination == "s3": file_name = f"{transformer_type}__{entity_type}.csv" s3_uri = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{file_name}" - transformer.stream_to_s3(s3_uri, entity_type, sample_size) + transformer.stream_to_s3(s3_uri, entity_type, QUERY_CHUNK_SIZE, sample_size) elif stream_destination == "sns": transformer.stream_to_sns( SNS_QUERY_TOPIC_ARN, entity_type, QUERY_CHUNK_SIZE, sample_size diff --git a/src/query_builders/cypher.py b/src/query_builders/cypher.py index 71532008b5..e9f62f6a8b 100644 --- a/src/query_builders/cypher.py +++ b/src/query_builders/cypher.py @@ -10,7 +10,8 @@ def construct_upsert_nodes_query(nodes: list[BaseNode]) -> str: field_set = [f"n.{f} = data.{f}" for f in all_fields] field_set_statement = ", ".join(field_set) - unwind_maps = [CypherQueryConverter(node).convert_to_cypher_map() for node in nodes] + converter = CypherQueryConverter("nodes") + unwind_maps = [converter.convert_to_cypher_map(node) for node in nodes] joined_unwind_maps = ",\n".join(unwind_maps) query = f""" @@ -36,7 +37,8 @@ def construct_upsert_edges_query(edges: list[BaseEdge]) -> str: if len(field_set_statement) == 0: field_set_statement = "r={}" - unwind_maps = [CypherQueryConverter(edge).convert_to_cypher_map() for edge in edges] + converter = CypherQueryConverter("edges") + unwind_maps = [converter.convert_to_cypher_map(edge) for edge in edges] joined_unwind_maps = ",\n".join(unwind_maps) query = f""" diff --git a/src/requirements.txt b/src/requirements.txt index 82d084d929..78556d0fd0 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.13 # by the following command: # # pip-compile diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 0bc9caf0e6..e2b3d76caf 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -55,7 +55,7 @@ def _stream_nodes(self, number: int = None) -> Generator[BaseNode]: yield node counter += 1 - if counter % 5000 == 0: + if counter % 10000 == 0: print(f"Streamed {counter} nodes...") if counter == number: return @@ -74,7 +74,7 @@ def _stream_edges(self, number: int = None) -> Generator[BaseEdge]: yield edge counter += 1 - if counter % 5000 == 0: + if counter % 10000 == 0: print(f"Streamed {counter} edges...") if counter == number: return @@ -106,7 +106,11 @@ def _stream_chunks( yield chunk def stream_to_s3( - self, s3_uri: str, entity_type: EntityType, sample_size: int = None + self, + s3_uri: str, + entity_type: EntityType, + chunk_size: int, + sample_size: int = None, ): """ Streams transformed entities (nodes or edges) into an S3 bucket for bulk loading into the Neptune cluster. 
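The hunk below reworks the CSV writing to batch rows per chunk; the underlying lazily-initialised DictWriter pattern, sketched here in a self-contained form with an in-memory buffer standing in for the smart_open/S3 stream, works as follows:

import csv
import io

def write_rows(f, rows):
    csv_writer = None
    for row in rows:
        if csv_writer is None:
            # Create the writer and header from the first row's keys.
            csv_writer = csv.DictWriter(f, fieldnames=row.keys())
            csv_writer.writeheader()
        csv_writer.writerow(row)

buffer = io.StringIO()
write_rows(buffer, [{":ID": "a1", ":LABEL": "SourceConcept"}, {":ID": "a2", ":LABEL": "SourceConcept"}])
# buffer.getvalue() now holds one header line followed by the two rows.
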
@@ -116,13 +120,18 @@ def stream_to_s3( with smart_open.open(s3_uri, "w", transport_params=transport_params) as f: csv_writer = None - for entity in self._stream_entities(entity_type, sample_size): - bulk_dict = CypherBulkLoadConverter(entity).convert_to_bulk_cypher() - if csv_writer is None: - csv_writer = csv.DictWriter(f, fieldnames=bulk_dict.keys()) - csv_writer.writeheader() + converter = CypherBulkLoadConverter(entity_type) + for chunk in self._stream_chunks(entity_type, chunk_size, sample_size): + bulk_dicts = [] + + for entity in chunk: + bulk_dict = converter.convert_to_bulk_cypher(entity) + bulk_dicts.append(bulk_dict) + if csv_writer is None: + csv_writer = csv.DictWriter(f, fieldnames=bulk_dict.keys()) + csv_writer.writeheader() - csv_writer.writerow(bulk_dict) + csv_writer.writerows(bulk_dicts) def stream_to_graph( self, diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index 8958911f2f..76254a9ef1 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -1,3 +1,10 @@ +ID_PREFIXES_TO_REMOVE = ( + "/authorities/subjects/", + "http://id.loc.gov/authorities/subjects/", + "/authorities/names/", +) + + class RawLibraryOfCongressConcept: def __init__(self, raw_concept: dict): self.raw_concept = raw_concept @@ -5,31 +12,26 @@ def __init__(self, raw_concept: dict): @staticmethod def _remove_id_prefix(raw_id: str): - prefixes_to_remove = [ - "/authorities/subjects/", - "http://id.loc.gov/authorities/subjects/", - "/authorities/names/", - ] - - for prefix in prefixes_to_remove: + for prefix in ID_PREFIXES_TO_REMOVE: raw_id = raw_id.removeprefix(prefix) return raw_id def _extract_concept_node(self): graph = self.raw_concept["@graph"] - concept_nodes = [ - node - for node in graph - if self.source_id in node.get("@id") and node["@type"] == "skos:Concept" - ] # Some LoC concepts (e.g. deprecated concepts) do not store a concept node in their graph. # When this happens, return `None` because there is no concept for us to extract. - if len(concept_nodes) == 0: - return None - - return concept_nodes[0] + concept_node = next( + ( + node + for node in graph + if self.source_id in node.get("@id") and node["@type"] == "skos:Concept" + ), + None, + ) + + return concept_node @staticmethod def _extract_label(raw_label: str | dict): diff --git a/terraform/bulk_loader.tf b/terraform/bulk_loader.tf index c91adac65d..dcdff36a54 100644 --- a/terraform/bulk_loader.tf +++ b/terraform/bulk_loader.tf @@ -3,7 +3,7 @@ module "bulk_loader_lambda" { name = "catalogue-graph-bulk-loader" description = "Bulk loads entities from an S3 bucket into the Neptune database." - runtime = "python3.10" + runtime = "python3.13" filename = "../build.zip" source_code_hash = filesha256("../build.zip") diff --git a/terraform/extractor.tf b/terraform/extractor.tf index 2c58f4693e..b86560ec7a 100644 --- a/terraform/extractor.tf +++ b/terraform/extractor.tf @@ -3,7 +3,7 @@ module "extractor_lambda" { name = "catalogue-graph-extractor" description = "Extracts source concepts and turns them into Cypher queries." 
- runtime = "python3.10" + runtime = "python3.13" filename = "../build.zip" source_code_hash = filesha256("../build.zip") @@ -12,10 +12,15 @@ module "extractor_lambda" { memory_size = 128 timeout = 60 // 1 minute + vpc_config = { + subnet_ids = local.private_subnets + security_group_ids = [aws_security_group.graph_indexer_lambda_security_group.id] + } + # error_alarm_topic_arn = data.terraform_remote_state.monitoring.outputs["platform_lambda_error_alerts_topic_arn"] } -data "aws_iam_policy_document" "publish_to_queries_topic" { +data "aws_iam_policy_document" "stream_to_sns" { statement { actions = [ "sns:Publish", @@ -27,7 +32,25 @@ data "aws_iam_policy_document" "publish_to_queries_topic" { } } -resource "aws_iam_role_policy" "reindex_jobs_policy" { +data "aws_iam_policy_document" "stream_to_s3" { + statement { + actions = [ + "s3:PutObject", + "s3:GetObject" + ] + + resources = [ + "${aws_s3_bucket.neptune_bulk_upload_bucket.arn}/*" + ] + } +} + +resource "aws_iam_role_policy" "stream_to_sns_policy" { + role = module.extractor_lambda.lambda_role.name + policy = data.aws_iam_policy_document.stream_to_sns.json +} + +resource "aws_iam_role_policy" "stream_to_s3_policy" { role = module.extractor_lambda.lambda_role.name - policy = data.aws_iam_policy_document.publish_to_queries_topic.json + policy = data.aws_iam_policy_document.stream_to_s3.json } diff --git a/terraform/indexer.tf b/terraform/indexer.tf index abd70c3231..fd57436e78 100644 --- a/terraform/indexer.tf +++ b/terraform/indexer.tf @@ -3,7 +3,7 @@ module "indexer_lambda" { name = "catalogue-graph-indexer" description = "Indexes nodes and edges into the Neptune catalogue graph cluster." - runtime = "python3.10" + runtime = "python3.13" filename = "../build.zip" source_code_hash = filesha256("../build.zip") From e74fc728164b27c0aff48a007763933dea341f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 20 Dec 2024 16:01:48 +0000 Subject: [PATCH 011/310] Add step function orchestration --- src/bulk_loader.py | 2 + src/clients/base_neptune_client.py | 12 ++-- terraform/bulk_loader.tf | 5 +- terraform/extractor.tf | 9 ++- terraform/state_machine.tf | 97 ++++++++++++++++++++++++++++++ terraform/variables.tf | 30 +++++++++ 6 files changed, 143 insertions(+), 12 deletions(-) create mode 100644 terraform/state_machine.tf create mode 100644 terraform/variables.tf diff --git a/src/bulk_loader.py b/src/bulk_loader.py index 673f13951a..26f71247b1 100644 --- a/src/bulk_loader.py +++ b/src/bulk_loader.py @@ -13,6 +13,8 @@ def handler(transformer_type: str, entity_type: EntityType, is_local=False): file_name = f"{transformer_type}__{entity_type}.csv" s3_file_uri = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{file_name}" + print(f"Initiating bulk load from {s3_file_uri}.") + neptune_client = get_neptune_client(is_local) load_id = neptune_client.initiate_bulk_load(s3_file_uri=s3_file_uri) diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 3784267a7a..588e4c6b26 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -80,18 +80,17 @@ def get_bulk_load_status(self, load_id: str): response = self._make_request( "GET", f"/loader?loadId={load_id}&errors=TRUE&details=TRUE" ) - payload = response["payload"] + payload = response["payload"] overall_status = payload["overallStatus"] + error_logs = payload["errors"]["errorLogs"] + + # Statuses: https://docs.aws.amazon.com/neptune/latest/userguide/loader-message.html status = overall_status["status"] - full_uri 
= overall_status["fullUri"] processed_count = overall_status["totalRecords"] - print(f"Bulk load status: {status}") - print(f" Source file URI: {full_uri}") - print(f" Processed records: {processed_count:,}") + print(f"Bulk load status: {status}. (Processed {processed_count:,} records.)") - # Statuses: https://docs.aws.amazon.com/neptune/latest/userguide/loader-message.html if status in ("LOAD_NOT_STARTED", "LOAD_IN_QUEUE", "LOAD_IN_PROGRESS"): return @@ -105,7 +104,6 @@ def get_bulk_load_status(self, load_id: str): print(f" Data type mismatch errors: {data_type_error_count:,}") print(f" Total time spent: {formatted_time}") - error_logs = payload["errors"]["errorLogs"] if error_logs: print(" First 10 errors:") diff --git a/terraform/bulk_loader.tf b/terraform/bulk_loader.tf index dcdff36a54..5c550e0a0b 100644 --- a/terraform/bulk_loader.tf +++ b/terraform/bulk_loader.tf @@ -10,7 +10,7 @@ module "bulk_loader_lambda" { handler = "bulk_loader.lambda_handler" memory_size = 128 - timeout = 20 // 20 seconds + timeout = 15*60 // 15 minutes vpc_config = { subnet_ids = local.private_subnets @@ -37,7 +37,8 @@ resource "aws_iam_role_policy" "bulk_loader_lambda_read_secrets_policy" { data "aws_iam_policy_document" "neptune_load" { statement { actions = [ - "neptune-db:StartLoaderJob" + "neptune-db:StartLoaderJob", + "neptune-db:GetLoaderJobStatus" ] resources = [ diff --git a/terraform/extractor.tf b/terraform/extractor.tf index b86560ec7a..d1aa07dadd 100644 --- a/terraform/extractor.tf +++ b/terraform/extractor.tf @@ -5,12 +5,15 @@ module "extractor_lambda" { description = "Extracts source concepts and turns them into Cypher queries." runtime = "python3.13" - filename = "../build.zip" + filename = "../build.zip" source_code_hash = filesha256("../build.zip") handler = "extractor.lambda_handler" - memory_size = 128 - timeout = 60 // 1 minute + + // This Lambda does not need a lot of memory, but it downloads and processes large datasets (with up to 10 million + // items) and therefore needs the additional compute and networking capacity which comes with increased memory. + memory_size = 4096 + timeout = 15*60 // 15 minutes vpc_config = { subnet_ids = local.private_subnets diff --git a/terraform/state_machine.tf b/terraform/state_machine.tf new file mode 100644 index 0000000000..d82e9456f9 --- /dev/null +++ b/terraform/state_machine.tf @@ -0,0 +1,97 @@ +resource "aws_iam_role" "state_machine_execution_role" { + name = "catalogue-graph-state-machine-execution-role" + assume_role_policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "states.amazonaws.com" + }, + Action = "sts:AssumeRole" + } + ] + }) +} + +resource "aws_iam_policy" "state_machine_policy" { + policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Action = ["logs:CreateLogStream", "logs:PutLogEvents"], + Resource = "*" + }, + { + Effect = "Allow", + Action = ["lambda:InvokeFunction"], + Resource = "*" + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "sfn_policy_attachment" { + role = aws_iam_role.state_machine_execution_role.name + policy_arn = aws_iam_policy.state_machine_policy.arn +} + +resource "aws_sfn_state_machine" "catalogue_graph_pipeline" { + name = "catalogue-graph-pipeline" + role_arn = aws_iam_role.state_machine_execution_role.arn + + definition = jsonencode({ + Comment = "Extract raw concepts, transform them into nodes and edges, and stream them into an S3 bucket." 
+ StartAt = "Extractors" + States = merge(local.extractor_states, local.bulk_loader_states, local.success_state) + }) +} + +locals { + extractor_states = { + # Run all extractor Lambda functions in parallel. + Extractors = { + Type = "Parallel" + Branches = flatten([ + for index, task_input in var.state_machine_inputs : { + StartAt = "Extract ${task_input.label}" + States = { + "Extract ${task_input.label}" = { + Type = "Task" + Resource = module.extractor_lambda.lambda.arn + Parameters = { + "transformer_type" = task_input.transformer_type, + "entity_type" = task_input.entity_type, + "stream_destination" = "s3", + "sample_size" = 1000 # Only stream a small sample while testing + } + End = true + } + } + } + ]) + Next = "Load ${var.state_machine_inputs[0].label}" + } + } + + bulk_loader_states = tomap({ + for index, task_input in var.state_machine_inputs : + "Load ${task_input.label}" => { + Type = "Task" + Resource = module.bulk_loader_lambda.lambda.arn, + Parameters = { + "transformer_type" = task_input.transformer_type, + "entity_type" = task_input.entity_type + } + Next = index == length(var.state_machine_inputs) - 1 ? "Success" : "Load ${var.state_machine_inputs[index + 1].label}" + } + }) + + success_state = tomap({ + Success = { + Type = "Succeed" + } + }) +} + diff --git a/terraform/variables.tf b/terraform/variables.tf new file mode 100644 index 0000000000..e1c6a9089b --- /dev/null +++ b/terraform/variables.tf @@ -0,0 +1,30 @@ +variable "state_machine_inputs" { + type = list(object({label: string, transformer_type: string, entity_type: string})) + default = [ + { + "label" : "LoC Concept Nodes", + "transformer_type" : "loc_concepts", + "entity_type" : "nodes" + }, + { + "label" : "LoC Location Nodes", + "transformer_type" : "loc_locations", + "entity_type" : "nodes" + }, + { + "label" : "LoC Name Nodes", + "transformer_type" : "loc_names", + "entity_type" : "nodes" + }, + { + "label" : "LoC Concept Edges", + "transformer_type" : "loc_concepts", + "entity_type" : "edges" + }, + { + "label" : "LoC Location Edges", + "transformer_type" : "loc_locations", + "entity_type" : "edges" + }, + ] +} From d19e51ed25c812a59e220ce2d7c3f98a8c414c1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 2 Jan 2025 13:37:24 +0000 Subject: [PATCH 012/310] State machine refactoring --- src/bulk_load_poller.py | 44 +++++++++ src/bulk_loader.py | 14 +-- src/clients/base_neptune_client.py | 7 +- terraform/lambda_bulk_load_poller.tf | 43 ++++++++ .../{bulk_loader.tf => lambda_bulk_loader.tf} | 2 +- .../{extractor.tf => lambda_extractor.tf} | 0 terraform/{indexer.tf => lambda_indexer.tf} | 0 terraform/state_machine.tf | 97 ------------------- terraform/state_machine_bulk_loader.tf | 53 ++++++++++ terraform/state_machine_bulk_loaders.tf | 30 ++++++ terraform/state_machine_extractors.tf | 37 +++++++ terraform/state_machine_iam.tf | 50 ++++++++++ terraform/state_machine_pipeline.tf | 30 ++++++ 13 files changed, 296 insertions(+), 111 deletions(-) create mode 100644 src/bulk_load_poller.py create mode 100644 terraform/lambda_bulk_load_poller.tf rename terraform/{bulk_loader.tf => lambda_bulk_loader.tf} (97%) rename terraform/{extractor.tf => lambda_extractor.tf} (100%) rename terraform/{indexer.tf => lambda_indexer.tf} (100%) delete mode 100644 terraform/state_machine.tf create mode 100644 terraform/state_machine_bulk_loader.tf create mode 100644 terraform/state_machine_bulk_loaders.tf create mode 100644 terraform/state_machine_extractors.tf create mode 100644 
terraform/state_machine_iam.tf create mode 100644 terraform/state_machine_pipeline.tf diff --git a/src/bulk_load_poller.py b/src/bulk_load_poller.py new file mode 100644 index 0000000000..1a4535edd8 --- /dev/null +++ b/src/bulk_load_poller.py @@ -0,0 +1,44 @@ +from utils.aws import get_neptune_client + +import argparse + + +def handler(load_id: str, is_local=False): + neptune_client = get_neptune_client(is_local) + status = neptune_client.get_bulk_load_status(load_id) + + if status in ("LOAD_NOT_STARTED", "LOAD_IN_QUEUE", "LOAD_IN_PROGRESS"): + return { + "loadId": load_id, + "status": "IN_PROGRESS", + } + + if status == "LOAD_COMPLETED": + return { + "loadId": load_id, + "status": "SUCCEEDED", + } + + raise Exception("Load failed. See error log above.") + + +def lambda_handler(event: dict, context): + load_id = event["loadId"] + return handler(load_id) + + +def local_handler(): + parser = argparse.ArgumentParser(description="") + parser.add_argument( + "--load-id", + type=str, + help="The ID of the bulk load job whose status to check.", + required=True, + ) + args = parser.parse_args() + + handler(**args.__dict__, is_local=True) + + +if __name__ == "__main__": + local_handler() diff --git a/src/bulk_loader.py b/src/bulk_loader.py index 26f71247b1..b99fd96803 100644 --- a/src/bulk_loader.py +++ b/src/bulk_loader.py @@ -4,7 +4,6 @@ import typing import argparse -import time S3_BULK_LOAD_BUCKET_NAME = "wellcomecollection-neptune-graph-loader" @@ -18,22 +17,13 @@ def handler(transformer_type: str, entity_type: EntityType, is_local=False): neptune_client = get_neptune_client(is_local) load_id = neptune_client.initiate_bulk_load(s3_file_uri=s3_file_uri) - while True: - final_status = neptune_client.get_bulk_load_status(load_id) - - if final_status is not None: - break - - time.sleep(20) - - if final_status != "LOAD_COMPLETED": - raise Exception("Load failed. See error log above.") + return {"loadId": load_id} def lambda_handler(event: dict, context): transformer_type = TransformerType.argparse(event["transformer_type"]) entity_type = event["entity_type"] - handler(transformer_type, entity_type) + return handler(transformer_type, entity_type) def local_handler(): diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 588e4c6b26..003fd3c3b0 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -92,7 +92,7 @@ def get_bulk_load_status(self, load_id: str): print(f"Bulk load status: {status}. (Processed {processed_count:,} records.)") if status in ("LOAD_NOT_STARTED", "LOAD_IN_QUEUE", "LOAD_IN_PROGRESS"): - return + return status insert_error_count = overall_status["insertErrors"] parsing_error_count = overall_status["parsingErrors"] @@ -113,3 +113,8 @@ def get_bulk_load_status(self, load_id: str): print(f" {code}: {message}") return status + + def get_bulk_load_statuses(self): + response = self._make_request("GET", "/loader") + payload = response["payload"] + return payload diff --git a/terraform/lambda_bulk_load_poller.tf b/terraform/lambda_bulk_load_poller.tf new file mode 100644 index 0000000000..6b949b5514 --- /dev/null +++ b/terraform/lambda_bulk_load_poller.tf @@ -0,0 +1,43 @@ +module "bulk_load_poller_lambda" { + source = "git@github.com:wellcomecollection/terraform-aws-lambda?ref=v1.2.0" + + name = "catalogue-graph-bulk-load-poller" + description = "Polls the status of a Neptune bulk load job." 
+ runtime = "python3.13" + + filename = "../build.zip" + source_code_hash = filesha256("../build.zip") + + handler = "bulk_load_poller.lambda_handler" + memory_size = 128 + timeout = 30 // 30 seconds + + vpc_config = { + subnet_ids = local.private_subnets + security_group_ids = [aws_security_group.graph_indexer_lambda_security_group.id] + } + + # error_alarm_topic_arn = data.terraform_remote_state.monitoring.outputs["platform_lambda_error_alerts_topic_arn"] +} + +resource "aws_iam_role_policy" "bulk_load_poller_lambda_read_secrets_policy" { + role = module.bulk_load_poller_lambda.lambda_role.name + policy = data.aws_iam_policy_document.allow_secret_read.json +} + +data "aws_iam_policy_document" "neptune_poll" { + statement { + actions = [ + "neptune-db:GetLoaderJobStatus" + ] + + resources = [ + "*" + ] + } +} + +resource "aws_iam_role_policy" "bulk_load_poller_lambda_neptune_policy" { + role = module.bulk_load_poller_lambda.lambda_role.name + policy = data.aws_iam_policy_document.neptune_poll.json +} diff --git a/terraform/bulk_loader.tf b/terraform/lambda_bulk_loader.tf similarity index 97% rename from terraform/bulk_loader.tf rename to terraform/lambda_bulk_loader.tf index 5c550e0a0b..101cc4d0e9 100644 --- a/terraform/bulk_loader.tf +++ b/terraform/lambda_bulk_loader.tf @@ -10,7 +10,7 @@ module "bulk_loader_lambda" { handler = "bulk_loader.lambda_handler" memory_size = 128 - timeout = 15*60 // 15 minutes + timeout = 30 // 30 seconds vpc_config = { subnet_ids = local.private_subnets diff --git a/terraform/extractor.tf b/terraform/lambda_extractor.tf similarity index 100% rename from terraform/extractor.tf rename to terraform/lambda_extractor.tf diff --git a/terraform/indexer.tf b/terraform/lambda_indexer.tf similarity index 100% rename from terraform/indexer.tf rename to terraform/lambda_indexer.tf diff --git a/terraform/state_machine.tf b/terraform/state_machine.tf deleted file mode 100644 index d82e9456f9..0000000000 --- a/terraform/state_machine.tf +++ /dev/null @@ -1,97 +0,0 @@ -resource "aws_iam_role" "state_machine_execution_role" { - name = "catalogue-graph-state-machine-execution-role" - assume_role_policy = jsonencode({ - Version = "2012-10-17", - Statement = [ - { - Effect = "Allow", - Principal = { - Service = "states.amazonaws.com" - }, - Action = "sts:AssumeRole" - } - ] - }) -} - -resource "aws_iam_policy" "state_machine_policy" { - policy = jsonencode({ - Version = "2012-10-17", - Statement = [ - { - Effect = "Allow", - Action = ["logs:CreateLogStream", "logs:PutLogEvents"], - Resource = "*" - }, - { - Effect = "Allow", - Action = ["lambda:InvokeFunction"], - Resource = "*" - } - ] - }) -} - -resource "aws_iam_role_policy_attachment" "sfn_policy_attachment" { - role = aws_iam_role.state_machine_execution_role.name - policy_arn = aws_iam_policy.state_machine_policy.arn -} - -resource "aws_sfn_state_machine" "catalogue_graph_pipeline" { - name = "catalogue-graph-pipeline" - role_arn = aws_iam_role.state_machine_execution_role.arn - - definition = jsonencode({ - Comment = "Extract raw concepts, transform them into nodes and edges, and stream them into an S3 bucket." - StartAt = "Extractors" - States = merge(local.extractor_states, local.bulk_loader_states, local.success_state) - }) -} - -locals { - extractor_states = { - # Run all extractor Lambda functions in parallel. 
- Extractors = { - Type = "Parallel" - Branches = flatten([ - for index, task_input in var.state_machine_inputs : { - StartAt = "Extract ${task_input.label}" - States = { - "Extract ${task_input.label}" = { - Type = "Task" - Resource = module.extractor_lambda.lambda.arn - Parameters = { - "transformer_type" = task_input.transformer_type, - "entity_type" = task_input.entity_type, - "stream_destination" = "s3", - "sample_size" = 1000 # Only stream a small sample while testing - } - End = true - } - } - } - ]) - Next = "Load ${var.state_machine_inputs[0].label}" - } - } - - bulk_loader_states = tomap({ - for index, task_input in var.state_machine_inputs : - "Load ${task_input.label}" => { - Type = "Task" - Resource = module.bulk_loader_lambda.lambda.arn, - Parameters = { - "transformer_type" = task_input.transformer_type, - "entity_type" = task_input.entity_type - } - Next = index == length(var.state_machine_inputs) - 1 ? "Success" : "Load ${var.state_machine_inputs[index + 1].label}" - } - }) - - success_state = tomap({ - Success = { - Type = "Succeed" - } - }) -} - diff --git a/terraform/state_machine_bulk_loader.tf b/terraform/state_machine_bulk_loader.tf new file mode 100644 index 0000000000..f7e0b21be8 --- /dev/null +++ b/terraform/state_machine_bulk_loader.tf @@ -0,0 +1,53 @@ +resource "aws_sfn_state_machine" "catalogue_graph_bulk_loader" { + name = "catalogue-graph-bulk-loader" + role_arn = aws_iam_role.state_machine_execution_role.arn + + definition = jsonencode({ + QueryLanguage = "JSONata" + Comment = "Extract raw concepts, transform them into nodes and edges, and stream them into an S3 bucket." + StartAt = "Trigger bulk load" + States = { + "Trigger bulk load" : { + "Type" : "Task", + "Resource" : "arn:aws:states:::lambda:invoke", + "Output" : "{% $states.result.Payload %}", + "Arguments" : { + "FunctionName" : module.bulk_loader_lambda.lambda.arn, + "Payload" : "{% $states.input %}" + }, + "Next" : "Wait 30 seconds" + }, + "Wait 30 seconds" : { + "Type" : "Wait", + "Next" : "Check load status", + "Seconds" : 30 + }, + "Check load status" : { + "Type" : "Task", + "Resource" : "arn:aws:states:::lambda:invoke", + "Output" : "{% $states.result.Payload %}", + "Arguments" : { + "FunctionName" : module.bulk_load_poller_lambda.lambda.arn, + "Payload" : "{% $states.input %}" + }, + "Next" : "Load complete?" + }, + "Load complete?" : { + "Type" : "Choice", + "Output" : "{% $states.input %}", + "Choices" : [ + { + "Condition" : "{% $states.input.status = 'SUCCEEDED' %}", + "Next" : "Success" + } + ], + "Default" : "Wait 30 seconds" + }, + "Success" : { + "Type" : "Succeed" + } + } + + }) +} + diff --git a/terraform/state_machine_bulk_loaders.tf b/terraform/state_machine_bulk_loaders.tf new file mode 100644 index 0000000000..b28dc4b2b2 --- /dev/null +++ b/terraform/state_machine_bulk_loaders.tf @@ -0,0 +1,30 @@ +resource "aws_sfn_state_machine" "catalogue_graph_bulk_loaders" { + name = "catalogue-graph-bulk-loaders" + role_arn = aws_iam_role.state_machine_execution_role.arn + + definition = jsonencode({ + Comment = "Extract raw concepts, transform them into nodes and edges, and stream them into an S3 bucket." 
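Outside of Step Functions, the trigger-wait-poll loop that the `catalogue-graph-bulk-loader` state machine above encodes can be reproduced locally with the two handlers added in this patch. A minimal sketch, assuming it is run from the `src` directory with suitable AWS credentials:

```python
# Local stand-in for the "Trigger bulk load" -> "Wait 30 seconds" -> "Check load status" loop.
import time

import bulk_load_poller
import bulk_loader

load_id = bulk_loader.handler("loc_concepts", "nodes", is_local=True)["loadId"]

while True:
    result = bulk_load_poller.handler(load_id, is_local=True)  # raises if the load failed
    if result["status"] == "SUCCEEDED":
        break
    time.sleep(30)  # mirrors the "Wait 30 seconds" state
```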
+ StartAt = "Load ${var.state_machine_inputs[0].label}" + States = merge(tomap({ + for index, task_input in var.state_machine_inputs : + "Load ${task_input.label}" => { + Type = "Task" + Resource = "arn:aws:states:::states:startExecution.sync:2", + Parameters = { + StateMachineArn = aws_sfn_state_machine.catalogue_graph_bulk_loader.arn + Input = { + "transformer_type" = task_input.transformer_type, + "entity_type" = task_input.entity_type + } + } + Next = index == length(var.state_machine_inputs) - 1 ? "Success" : "Load ${var.state_machine_inputs[index + 1].label}" + } + }), { + Success = { + Type = "Succeed" + } + }) + + }) +} + diff --git a/terraform/state_machine_extractors.tf b/terraform/state_machine_extractors.tf new file mode 100644 index 0000000000..c5d9dd81f4 --- /dev/null +++ b/terraform/state_machine_extractors.tf @@ -0,0 +1,37 @@ +resource "aws_sfn_state_machine" "catalogue_graph_extractors" { + name = "catalogue-graph-extractors" + role_arn = aws_iam_role.state_machine_execution_role.arn + + definition = jsonencode({ + Comment = "Extract raw concepts, transform them into nodes and edges, and stream them into an S3 bucket." + StartAt = "Trigger extractors" + States = { + "Trigger extractors" = { + Type = "Parallel" + Branches = flatten([ + for index, task_input in var.state_machine_inputs : { + StartAt = "Extract ${task_input.label}" + States = { + "Extract ${task_input.label}" = { + Type = "Task" + Resource = module.extractor_lambda.lambda.arn + Parameters = { + "transformer_type" = task_input.transformer_type, + "entity_type" = task_input.entity_type, + "stream_destination" = "s3", + "sample_size" = 1000 # Only stream a small sample while testing + } + End = true + } + } + } + ]) + Next = "Success" + }, + "Success" : { + "Type" : "Succeed" + } + } + }) +} + diff --git a/terraform/state_machine_iam.tf b/terraform/state_machine_iam.tf new file mode 100644 index 0000000000..5a75038e0a --- /dev/null +++ b/terraform/state_machine_iam.tf @@ -0,0 +1,50 @@ +data "aws_caller_identity" "current" {} + +resource "aws_iam_role" "state_machine_execution_role" { + name = "catalogue-graph-state-machine-execution-role" + assume_role_policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "states.amazonaws.com" + }, + Action = "sts:AssumeRole" + } + ] + }) +} + +resource "aws_iam_policy" "state_machine_policy" { + policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Action = ["logs:CreateLogStream", "logs:PutLogEvents"], + Resource = "*" + }, + { + Effect = "Allow", + Action = ["lambda:InvokeFunction"], + Resource = "*" + }, + { + Effect = "Allow", + Action = ["states:StartExecution", "states:DescribeExecution", "states:StopExecution"], + Resource = "*" + }, + { + Effect = "Allow", + Action = ["events:PutTargets", "events:PutRule", "events:DescribeRule"], + Resource = "arn:aws:events:eu-west-1:${data.aws_caller_identity.current.account_id}:rule/StepFunctions*" + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "sfn_policy_attachment" { + role = aws_iam_role.state_machine_execution_role.name + policy_arn = aws_iam_policy.state_machine_policy.arn +} diff --git a/terraform/state_machine_pipeline.tf b/terraform/state_machine_pipeline.tf new file mode 100644 index 0000000000..7aa583ec8c --- /dev/null +++ b/terraform/state_machine_pipeline.tf @@ -0,0 +1,30 @@ +resource "aws_sfn_state_machine" "catalogue_graph_pipeline" { + name = "catalogue-graph-pipeline" + role_arn = 
aws_iam_role.state_machine_execution_role.arn + + definition = jsonencode({ + Comment = "Extract raw concepts, transform them into nodes and edges, and stream them into an S3 bucket." + StartAt = "Extractors" + States = { + "Extractors" = { + Type = "Task" + Resource = "arn:aws:states:::states:startExecution.sync:2", + Parameters = { + StateMachineArn = aws_sfn_state_machine.catalogue_graph_extractors.arn + } + Next = "Bulk loaders" + }, + "Bulk loaders" = { + Type = "Task" + Resource = "arn:aws:states:::states:startExecution.sync:2", + Parameters = { + StateMachineArn = aws_sfn_state_machine.catalogue_graph_bulk_loaders.arn + } + Next = "Success" + }, + Success = { + Type = "Succeed" + } + }, + }) +} From 2517c5a2903df9adeded89dc0559ac7702151767 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 2 Jan 2025 15:19:14 +0000 Subject: [PATCH 013/310] Add README --- README.md | 85 +++++++++++ scripts/neptune_experimental_queries.ipynb | 169 --------------------- src/clients/base_neptune_client.py | 7 + src/converters/cypher/base_converter.py | 4 + src/extractor.py | 8 +- src/transformers/base_transformer.py | 6 +- src/utils/aws.py | 2 +- terraform/lambda_bulk_loader.tf | 1 + 8 files changed, 105 insertions(+), 177 deletions(-) create mode 100644 README.md delete mode 100644 scripts/neptune_experimental_queries.ipynb diff --git a/README.md b/README.md new file mode 100644 index 0000000000..3405bcc06d --- /dev/null +++ b/README.md @@ -0,0 +1,85 @@ +# Catalogue graph pipeline + +The catalogue graph pipeline extracts concepts from various sources (e.g. LoC, MeSH) and stores them into the catalogue +graph database (running in Amazon Neptune). It consists of several Lambda functions: + +* `extractor`: Extracts a single entity type (nodes or edges) from a single source (e.g. LoC Names) and streams the + transformed entities into the specified destination. Supported destinations include S3, SNS, and Neptune: + * S3 is used when loading many entities in bulk via the Neptune bulk loader. + * Neptune is used when loading a smaller number of entities directly into the cluster using openCypher queries. + * SNS is used when loading entities using openCypher queries via the `indexer` Lambda function. This method was + originally used for loading large numbers of entities into the cluster, but has since been superseded by the bulk + load method and might be removed in the future. +* `bulk_loader`: Triggers a Neptune bulk load of a single S3 file created by the `extractor` Lambda function. +* `bulk_load_poller`: Checks the status of a bulk load job. +* `indexer`: Consumes openCypher queries from the SNS topic populated by the `extractor` Lambda function and runs them + against the Neptune cluster. (There is an SQS queue between the SNS topic and the Lambda function and queries are + consumed via an event source mapping). + +Lambda function execution is orchestrated via AWS Step Functions (see `terraform` directory). Several state machines are +utilised for this purpose: + +* `catalogue-graph-pipeline`: Represents the full pipeline, extracting and loading all concepts into the cluster. + Triggers the `extractors` state machine, followed by the `bulk_loaders` state machine. +* `extractors`: Invokes `extractor` Lambda function instances in parallel, one for each combination of source type and + entity type (e.g. one for LoC Concept nodes, one for LoC Concept edges, etc.). 
+* `bulk_loaders`: Triggers `bulk_loader` state machine instances in sequence, one for each combination of source type + and entity type. +* `bulk_loader`: Invokes a single `bulk_loader` Lambda function to start a bulk load job. Repeatedly invokes + the `bulk_load_poller` Lambda function to check the status of the job until it completes. + +## Source code organisation + +The `src` directory contains all Python source code for the graph pipeline. (In production, we use Python 3.13.) + +The root of the directory contains a Python file for each Lambda function in the pipeline. Each file has +a `lambda_handler` function (used when running in production) and a `local_handler` function (used when running +locally). + +Subdirectories contain various modules and are shared by all Lambda functions. + +* The `clients` directory contains the `LambdaNeptuneClient` and `LocalNeptuneClient` classes, both subclassing from + `BaseNeptuneClient`. These classes are responsible for all communication with the Neptune client. This includes making + openCypher API calls and triggering bulk loads. +* The `converters` directory contains classes for converting Pydantic models into a format expected by Neptune. This + also includes converting various data types into a Neptune-compatible format. For example, lists are converted + into a `||`-separated string (since Neptune does not support storing lists/arrays). +* The `models` directory contains Pydantic models for representing all node and edge types stored in the graph. Every + entity extracted from a source must first be converted into one of these models before being loaded into the graph. +* The `query_builders` directory contains various utility functions for constructing openCypher queries (e.g. UNWIND + queries) from a list of Pydantic entities. +* The `sources` directory contains classes for extracting entities from their source and streaming them for further + processing. Each source class must implement the `stream_raw` method which must `yield` a single entity from the + source. +* The `transformers` directory contains classes for transforming raw entities returned from the relevant source class + into Pydantic models and streaming them to the desired destination. Each transformer class must subclass from the + `BaseTransformer` class and implement an `transform_node` method (which accepts a single raw entity dictionary, and + returns a single Pydantic model) and an `extract_edges` method (which also accepts a single raw entity dictionary, and + yields a single Pydantic model). The BaseTransformer class implements a `stream_to_` method for each + supported destination. + +## Local execution + +To run one of the Lambda functions locally, navigate to the `src` directory and then run the chosen function via the +command line. For example, to check the status of a bulk load job, run the following: + +```shell +AWS_PROFILE=platform-developer python3.13 bulk_load_poller.py --load-id= +``` + +## Local Neptune experimentation + +To run experimental Neptune queries locally, create a new Python file in the `src` directory, create a local Neptune +client, and then run your queries. 
For example: + +```python3 +from utils.aws import get_neptune_client + +neptune_client = get_neptune_client(True) + +query = """ +MATCH (n) RETURN count(*) +""" +result = neptune_client.run_open_cypher_query(query) +print(result) +``` diff --git a/scripts/neptune_experimental_queries.ipynb b/scripts/neptune_experimental_queries.ipynb deleted file mode 100644 index ae2af04ada..0000000000 --- a/scripts/neptune_experimental_queries.ipynb +++ /dev/null @@ -1,169 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 59, - "metadata": { - "ExecuteTime": { - "end_time": "2024-12-10T09:46:24.193547Z", - "start_time": "2024-12-10T09:46:22.627832Z" - } - }, - "outputs": [], - "source": [ - "import json\n", - "\n", - "import requests\n", - "import boto3\n", - "from botocore.auth import SigV4Auth\n", - "from botocore.awsrequest import AWSRequest\n", - "\n", - "import urllib3\n", - "urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)\n", - "\n", - "session = boto3.Session(profile_name=\"platform-developer\")\n", - "\n", - "\n", - "def _get_secret(secret_name: str):\n", - " secrets_manager_client = session.client('secretsmanager', region_name='eu-west-1')\n", - " response = secrets_manager_client.get_secret_value(SecretId=secret_name)\n", - "\n", - " return response['SecretString']\n", - "\n", - "# The experimental database is accessible from outside of the VPC via a Network Load Balancer (NLB) for testing purposes\n", - "LOAD_BALANCER_URL = _get_secret(\"NeptuneTest/LoadBalancerUrl\")\n", - "NEPTUNE_INSTANCE_ENDPOINT = _get_secret(\"NeptuneTest/InstanceEndpoint\")\n", - "\n", - "def run_open_cypher_query(query: str):\n", - " \"\"\"Run a Cypher query against an experimental serverless Neptune cluster\"\"\"\n", - " open_cypher_endpoint_url = f'{LOAD_BALANCER_URL}/openCypher'\n", - "\n", - " headers = {\n", - " \"Host\": NEPTUNE_INSTANCE_ENDPOINT,\n", - " \"Content-Type\": \"application/json\"\n", - " }\n", - " payload = {\"query\": query}\n", - " \n", - " # We use IAM database authentication, which means we need to authenticate the request using AWS Signature\n", - " request = AWSRequest(method=\"POST\", url=open_cypher_endpoint_url, data=json.dumps(payload), headers=headers)\n", - " SigV4Auth(session.get_credentials(), \"neptune-db\", \"eu-west-1\" ).add_auth(request)\n", - "\n", - " # We need to send a manual request rather than using boto3 since we are accessing the instance via a NLB\n", - " response = requests.post(\n", - " open_cypher_endpoint_url,\n", - " data=json.dumps(payload),\n", - " headers=dict(request.headers),\n", - " # We are using the default NLB DNS name, which does not support custom SSL certificates, so we need to disable SSL certificate verification.\n", - " # This increases the risks of a man-in-the-middle attack, which is acceptable for a testing database.\n", - " # In production, we will be connecting to the database directly from within the VPC.\n", - " verify=False\n", - " )\n", - "\n", - " if response.status_code != 200:\n", - " raise Exception(response.content)\n", - "\n", - " return response.json()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "query = \"\"\"\n", - "CREATE (n:Person {name: 'Alice', age: 30, city: 'New York'})\n", - "RETURN n\n", - "\"\"\"\n", - "\n", - "run_open_cypher_query(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": { - "ExecuteTime": { - "end_time": "2024-12-10T09:51:35.288073Z", - "start_time": 
"2024-12-10T09:51:34.570581Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": "{'results': []}" - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query = \"\"\"\n", - "MATCH (person:SourceConcept {id: \"sh00000011\"})-[:NARROWER_THAN]->(friend)\n", - "RETURN friend;\n", - "\"\"\"\n", - "\n", - "run_open_cypher_query(query)" - ] - }, - { - "cell_type": "code", - "outputs": [ - { - "data": { - "text/plain": "{'results': []}" - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query = \"\"\"\n", - "MATCH (person:SourceConcept {id: \"sh00000011\"})\n", - "RETURN person\n", - "\"\"\"\n", - "\n", - "run_open_cypher_query(query)" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-12-10T09:57:24.678422Z", - "start_time": "2024-12-10T09:57:24.401967Z" - } - }, - "execution_count": 72 - }, - { - "cell_type": "code", - "outputs": [], - "source": [], - "metadata": { - "collapsed": false - } - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 003fd3c3b0..60279b7d27 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -7,6 +7,13 @@ class BaseNeptuneClient: + """ + Communicates with the Neptune cluster. Makes openCypher queries, triggers bulk load operations, etc. + + Do not use this base class directly. Use either LambdaNeptuneClient (when running in the same VPC as the Neptune + cluster) or LocalNeptuneClient (when connecting to the cluster from outside the VPC). + """ + def __init__(self): self.session = None self.neptune_endpoint = None diff --git a/src/converters/cypher/base_converter.py b/src/converters/cypher/base_converter.py index c0d562a8ad..80a65aaa5d 100644 --- a/src/converters/cypher/base_converter.py +++ b/src/converters/cypher/base_converter.py @@ -1,4 +1,8 @@ class CypherBaseConverter: + """ + Converts various data types into a format compatible with openCypher. 
+ """ + def _convert_str(self, raw_value: str) -> str: return raw_value diff --git a/src/extractor.py b/src/extractor.py index 7a573c6185..e16b599d56 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -6,7 +6,7 @@ from transformers.base_transformer import BaseTransformer, EntityType, StreamDestination from transformers.transformer_type import TransformerType -QUERY_CHUNK_SIZE = 256 +CHUNK_SIZE = 256 S3_BULK_LOAD_BUCKET_NAME = "wellcomecollection-neptune-graph-loader" SNS_QUERY_TOPIC_ARN = "arn:aws:sns:eu-west-1:760097843905:catalogue_graph_queries" @@ -28,15 +28,15 @@ def handler( if stream_destination == "graph": neptune_client = get_neptune_client(is_local) transformer.stream_to_graph( - neptune_client, entity_type, QUERY_CHUNK_SIZE, sample_size + neptune_client, entity_type, CHUNK_SIZE, sample_size ) elif stream_destination == "s3": file_name = f"{transformer_type}__{entity_type}.csv" s3_uri = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{file_name}" - transformer.stream_to_s3(s3_uri, entity_type, QUERY_CHUNK_SIZE, sample_size) + transformer.stream_to_s3(s3_uri, entity_type, CHUNK_SIZE, sample_size) elif stream_destination == "sns": transformer.stream_to_sns( - SNS_QUERY_TOPIC_ARN, entity_type, QUERY_CHUNK_SIZE, sample_size + SNS_QUERY_TOPIC_ARN, entity_type, CHUNK_SIZE, sample_size ) else: raise ValueError("Unsupported stream destination.") diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index e2b3d76caf..1a183d1b17 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -4,7 +4,7 @@ from sources.base_source import BaseSource from clients.base_neptune_client import BaseNeptuneClient from query_builders.cypher import construct_upsert_cypher_query -from utils.aws import publish_to_sns +from utils.aws import publish_batch_to_sns from converters.cypher.bulk_load_converter import CypherBulkLoadConverter import smart_open @@ -186,7 +186,7 @@ def stream_to_sns( # SNS supports a maximum batch size of 10 if len(queries) >= 10: - publish_to_sns(topic_arn, queries) + publish_batch_to_sns(topic_arn, queries) queries = [] counter += 1 @@ -195,4 +195,4 @@ def stream_to_sns( # Publish remaining messages (if any) if len(queries) > 0: - publish_to_sns(topic_arn, queries) + publish_batch_to_sns(topic_arn, queries) diff --git a/src/utils/aws.py b/src/utils/aws.py index 5f5347378e..eef89ba3c9 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -12,7 +12,7 @@ def get_secret(secret_name: str): return response["SecretString"] -def publish_to_sns(topic_arn: str, queries: list[str]): +def publish_batch_to_sns(topic_arn: str, queries: list[str]): request_entries = [] for i, query in enumerate(queries): request_entries.append( diff --git a/terraform/lambda_bulk_loader.tf b/terraform/lambda_bulk_loader.tf index 101cc4d0e9..2d513ce847 100644 --- a/terraform/lambda_bulk_loader.tf +++ b/terraform/lambda_bulk_loader.tf @@ -20,6 +20,7 @@ module "bulk_loader_lambda" { # error_alarm_topic_arn = data.terraform_remote_state.monitoring.outputs["platform_lambda_error_alerts_topic_arn"] } +# S3 bucket for storing files to be bulk loaded into the Neptune cluster resource "aws_s3_bucket" "neptune_bulk_upload_bucket" { bucket = "wellcomecollection-neptune-graph-loader" } From bfeca036c794e5f1f2961d2da30c5ea324e6f42b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 2 Jan 2025 16:43:53 +0000 Subject: [PATCH 014/310] Add environment variables --- README.md | 2 +- src/bulk_loader.py | 8 +++++--- src/extractor.py | 9 
+++++---- src/transformers/transformer_type.py | 4 ---- terraform/lambda_bulk_loader.tf | 8 +++++++- terraform/lambda_extractor.tf | 9 ++++++++- 6 files changed, 26 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 3405bcc06d..7364900fb5 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ graph database (running in Amazon Neptune). It consists of several Lambda functi Lambda function execution is orchestrated via AWS Step Functions (see `terraform` directory). Several state machines are utilised for this purpose: -* `catalogue-graph-pipeline`: Represents the full pipeline, extracting and loading all concepts into the cluster. +* `catalogue-graph-pipeline`: Represents the full pipeline, extracting all concepts and loading them into the cluster. Triggers the `extractors` state machine, followed by the `bulk_loaders` state machine. * `extractors`: Invokes `extractor` Lambda function instances in parallel, one for each combination of source type and entity type (e.g. one for LoC Concept nodes, one for LoC Concept edges, etc.). diff --git a/src/bulk_loader.py b/src/bulk_loader.py index b99fd96803..40c34e3dc3 100644 --- a/src/bulk_loader.py +++ b/src/bulk_loader.py @@ -1,11 +1,13 @@ +import typing +import argparse +import os + from transformers.base_transformer import EntityType from transformers.transformer_type import TransformerType from utils.aws import get_neptune_client -import typing -import argparse -S3_BULK_LOAD_BUCKET_NAME = "wellcomecollection-neptune-graph-loader" +S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] def handler(transformer_type: str, entity_type: EntityType, is_local=False): diff --git a/src/extractor.py b/src/extractor.py index e16b599d56..07d65b2c75 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -1,14 +1,15 @@ import argparse import typing - +import os from utils.aws import get_neptune_client from transformers.base_transformer import BaseTransformer, EntityType, StreamDestination from transformers.transformer_type import TransformerType + CHUNK_SIZE = 256 -S3_BULK_LOAD_BUCKET_NAME = "wellcomecollection-neptune-graph-loader" -SNS_QUERY_TOPIC_ARN = "arn:aws:sns:eu-west-1:760097843905:catalogue_graph_queries" +S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] +GRAPH_QUERIES_SNS_TOPIC_ARN = os.environ["GRAPH_QUERIES_SNS_TOPIC_ARN"] def handler( @@ -36,7 +37,7 @@ def handler( transformer.stream_to_s3(s3_uri, entity_type, CHUNK_SIZE, sample_size) elif stream_destination == "sns": transformer.stream_to_sns( - SNS_QUERY_TOPIC_ARN, entity_type, CHUNK_SIZE, sample_size + GRAPH_QUERIES_SNS_TOPIC_ARN, entity_type, CHUNK_SIZE, sample_size ) else: raise ValueError("Unsupported stream destination.") diff --git a/src/transformers/transformer_type.py b/src/transformers/transformer_type.py index 843c4d66a2..7689079cad 100644 --- a/src/transformers/transformer_type.py +++ b/src/transformers/transformer_type.py @@ -9,10 +9,6 @@ ) LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" -GRAPH_QUERIES_SNS_TOPIC_ARN = ( - "arn:aws:sns:eu-west-1:760097843905:catalogue_graph_queries" -) - class TransformerType(enum.Enum): LOC_CONCEPTS = LibraryOfCongressConceptsTransformer(LOC_SUBJECT_HEADINGS_URL) diff --git a/terraform/lambda_bulk_loader.tf b/terraform/lambda_bulk_loader.tf index 2d513ce847..dd371b0259 100644 --- a/terraform/lambda_bulk_loader.tf +++ b/terraform/lambda_bulk_loader.tf @@ -5,7 +5,7 @@ module "bulk_loader_lambda" { description = "Bulk loads entities from an S3 bucket into the Neptune 
database." runtime = "python3.13" - filename = "../build.zip" + filename = "../build.zip" source_code_hash = filesha256("../build.zip") handler = "bulk_loader.lambda_handler" @@ -17,6 +17,12 @@ module "bulk_loader_lambda" { security_group_ids = [aws_security_group.graph_indexer_lambda_security_group.id] } + environment = { + variables = { + S3_BULK_LOAD_BUCKET_NAME = aws_s3_bucket.neptune_bulk_upload_bucket.bucket + } + } + # error_alarm_topic_arn = data.terraform_remote_state.monitoring.outputs["platform_lambda_error_alerts_topic_arn"] } diff --git a/terraform/lambda_extractor.tf b/terraform/lambda_extractor.tf index d1aa07dadd..63812201ab 100644 --- a/terraform/lambda_extractor.tf +++ b/terraform/lambda_extractor.tf @@ -8,7 +8,7 @@ module "extractor_lambda" { filename = "../build.zip" source_code_hash = filesha256("../build.zip") - handler = "extractor.lambda_handler" + handler = "extractor.lambda_handler" // This Lambda does not need a lot of memory, but it downloads and processes large datasets (with up to 10 million // items) and therefore needs the additional compute and networking capacity which comes with increased memory. @@ -20,6 +20,13 @@ module "extractor_lambda" { security_group_ids = [aws_security_group.graph_indexer_lambda_security_group.id] } + environment = { + variables = { + S3_BULK_LOAD_BUCKET_NAME = aws_s3_bucket.neptune_bulk_upload_bucket.bucket + GRAPH_QUERIES_SNS_TOPIC_ARN = module.catalogue_graph_queries_topic.arn + } + } + # error_alarm_topic_arn = data.terraform_remote_state.monitoring.outputs["platform_lambda_error_alerts_topic_arn"] } From 0c21efa85039300df60adf3c1bb7cbf4b83a84fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 3 Jan 2025 09:36:01 +0000 Subject: [PATCH 015/310] Terraform refactoring and comments --- ...e_machine_iam.tf => iam_state_machines.tf} | 2 ++ terraform/lambda_bulk_load_poller.tf | 14 +------------ terraform/lambda_bulk_loader.tf | 9 ++------- terraform/lambda_extractor.tf | 6 ++++++ terraform/lambda_indexer.tf | 5 ----- terraform/neptune.tf | 20 +++++++++++++++++++ terraform/state_machine_bulk_loader.tf | 2 +- terraform/state_machine_bulk_loaders.tf | 6 +++--- terraform/state_machine_extractors.tf | 6 +++--- terraform/state_machine_pipeline.tf | 2 +- terraform/variables.tf | 4 +++- 11 files changed, 42 insertions(+), 34 deletions(-) rename terraform/{state_machine_iam.tf => iam_state_machines.tf} (87%) diff --git a/terraform/state_machine_iam.tf b/terraform/iam_state_machines.tf similarity index 87% rename from terraform/state_machine_iam.tf rename to terraform/iam_state_machines.tf index 5a75038e0a..74df7a822f 100644 --- a/terraform/state_machine_iam.tf +++ b/terraform/iam_state_machines.tf @@ -35,6 +35,8 @@ resource "aws_iam_policy" "state_machine_policy" { Action = ["states:StartExecution", "states:DescribeExecution", "states:StopExecution"], Resource = "*" }, + # These EventBridge permissions are needed to allow state machines to perform the "startExecution.sync:2" action + # (i.e. 
trigger another state machine and wait for it to complete) { Effect = "Allow", Action = ["events:PutTargets", "events:PutRule", "events:DescribeRule"], diff --git a/terraform/lambda_bulk_load_poller.tf b/terraform/lambda_bulk_load_poller.tf index 6b949b5514..57690876c3 100644 --- a/terraform/lambda_bulk_load_poller.tf +++ b/terraform/lambda_bulk_load_poller.tf @@ -25,19 +25,7 @@ resource "aws_iam_role_policy" "bulk_load_poller_lambda_read_secrets_policy" { policy = data.aws_iam_policy_document.allow_secret_read.json } -data "aws_iam_policy_document" "neptune_poll" { - statement { - actions = [ - "neptune-db:GetLoaderJobStatus" - ] - - resources = [ - "*" - ] - } -} - resource "aws_iam_role_policy" "bulk_load_poller_lambda_neptune_policy" { role = module.bulk_load_poller_lambda.lambda_role.name - policy = data.aws_iam_policy_document.neptune_poll.json + policy = data.aws_iam_policy_document.neptune_load_poll.json } diff --git a/terraform/lambda_bulk_loader.tf b/terraform/lambda_bulk_loader.tf index dd371b0259..59a92cd281 100644 --- a/terraform/lambda_bulk_loader.tf +++ b/terraform/lambda_bulk_loader.tf @@ -31,17 +31,12 @@ resource "aws_s3_bucket" "neptune_bulk_upload_bucket" { bucket = "wellcomecollection-neptune-graph-loader" } -resource "aws_iam_role_policy_attachment" "s3_readonly_attachment" { - role = aws_iam_role.catalogue_graph_cluster.name - policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" -} - resource "aws_iam_role_policy" "bulk_loader_lambda_read_secrets_policy" { role = module.bulk_loader_lambda.lambda_role.name policy = data.aws_iam_policy_document.allow_secret_read.json } -data "aws_iam_policy_document" "neptune_load" { +data "aws_iam_policy_document" "neptune_load_poll" { statement { actions = [ "neptune-db:StartLoaderJob", @@ -56,5 +51,5 @@ data "aws_iam_policy_document" "neptune_load" { resource "aws_iam_role_policy" "bulk_loader_lambda_neptune_policy" { role = module.bulk_loader_lambda.lambda_role.name - policy = data.aws_iam_policy_document.neptune_load.json + policy = data.aws_iam_policy_document.neptune_load_poll.json } diff --git a/terraform/lambda_extractor.tf b/terraform/lambda_extractor.tf index 63812201ab..69a818bc57 100644 --- a/terraform/lambda_extractor.tf +++ b/terraform/lambda_extractor.tf @@ -30,6 +30,12 @@ module "extractor_lambda" { # error_alarm_topic_arn = data.terraform_remote_state.monitoring.outputs["platform_lambda_error_alerts_topic_arn"] } +# openCypher queries will be streamed to this SNS topic (when SNS is chosen as the streaming destination) +module "catalogue_graph_queries_topic" { + source = "github.com/wellcomecollection/terraform-aws-sns-topic.git?ref=v1.0.0" + name = "catalogue_graph_queries" +} + data "aws_iam_policy_document" "stream_to_sns" { statement { actions = [ diff --git a/terraform/lambda_indexer.tf b/terraform/lambda_indexer.tf index fd57436e78..0beb6e48a0 100644 --- a/terraform/lambda_indexer.tf +++ b/terraform/lambda_indexer.tf @@ -52,11 +52,6 @@ resource "aws_vpc_security_group_egress_rule" "neptune_lambda_egress" { ip_protocol = "-1" } -module "catalogue_graph_queries_topic" { - source = "github.com/wellcomecollection/terraform-aws-sns-topic.git?ref=v1.0.0" - name = "catalogue_graph_queries" -} - # Add an SQS queue which will collect messages from SNS module "indexer_message_queue" { source = "github.com/wellcomecollection/terraform-aws-sqs//queue?ref=v1.2.1" diff --git a/terraform/neptune.tf b/terraform/neptune.tf index 6f89aea559..3bbf17a920 100644 --- a/terraform/neptune.tf +++ b/terraform/neptune.tf @@ -34,6 
+34,26 @@ resource "aws_iam_role" "catalogue_graph_cluster" { }) } +# Read-only access to the bulk load S3 bucket +data "aws_iam_policy_document" "neptune_s3_read_only_policy" { + statement { + effect = "Allow" + actions = [ + "s3:GetObject", + "s3:ListBucket" + ] + resources = [ + aws_s3_bucket.neptune_bulk_upload_bucket.arn, + "${aws_s3_bucket.neptune_bulk_upload_bucket.arn}/*" + ] + } +} + +resource "aws_iam_role_policy" "s3_read_only_policy_attachment" { + role = aws_iam_role.catalogue_graph_cluster.name + policy = data.aws_iam_policy_document.neptune_s3_read_only_policy.json +} + resource "aws_neptune_cluster_instance" "catalogue_graph_instance" { cluster_identifier = aws_neptune_cluster.catalogue_graph_cluster.cluster_identifier instance_class = "db.serverless" diff --git a/terraform/state_machine_bulk_loader.tf b/terraform/state_machine_bulk_loader.tf index f7e0b21be8..f560c460ce 100644 --- a/terraform/state_machine_bulk_loader.tf +++ b/terraform/state_machine_bulk_loader.tf @@ -4,7 +4,7 @@ resource "aws_sfn_state_machine" "catalogue_graph_bulk_loader" { definition = jsonencode({ QueryLanguage = "JSONata" - Comment = "Extract raw concepts, transform them into nodes and edges, and stream them into an S3 bucket." + Comment = "Trigger a Neptune bulk load from a file stored in S3 and periodically check the status of the bulk load until complete." StartAt = "Trigger bulk load" States = { "Trigger bulk load" : { diff --git a/terraform/state_machine_bulk_loaders.tf b/terraform/state_machine_bulk_loaders.tf index b28dc4b2b2..0d4d3e661f 100644 --- a/terraform/state_machine_bulk_loaders.tf +++ b/terraform/state_machine_bulk_loaders.tf @@ -3,9 +3,9 @@ resource "aws_sfn_state_machine" "catalogue_graph_bulk_loaders" { role_arn = aws_iam_role.state_machine_execution_role.arn definition = jsonencode({ - Comment = "Extract raw concepts, transform them into nodes and edges, and stream them into an S3 bucket." - StartAt = "Load ${var.state_machine_inputs[0].label}" - States = merge(tomap({ + Comment = "Trigger the catalogue-graph-bulk-loader state machine in sequence for each combination of inputs." + StartAt = "Load ${var.state_machine_inputs[0].label}" + States = merge(tomap({ for index, task_input in var.state_machine_inputs : "Load ${task_input.label}" => { Type = "Task" diff --git a/terraform/state_machine_extractors.tf b/terraform/state_machine_extractors.tf index c5d9dd81f4..e8aee4c50e 100644 --- a/terraform/state_machine_extractors.tf +++ b/terraform/state_machine_extractors.tf @@ -3,9 +3,9 @@ resource "aws_sfn_state_machine" "catalogue_graph_extractors" { role_arn = aws_iam_role.state_machine_execution_role.arn definition = jsonencode({ - Comment = "Extract raw concepts, transform them into nodes and edges, and stream them into an S3 bucket." - StartAt = "Trigger extractors" - States = { + Comment = "Extract raw concepts from all sources, transform them into nodes and edges, and stream them into an S3 bucket." + StartAt = "Trigger extractors" + States = { "Trigger extractors" = { Type = "Parallel" Branches = flatten([ diff --git a/terraform/state_machine_pipeline.tf b/terraform/state_machine_pipeline.tf index 7aa583ec8c..e9513e2e99 100644 --- a/terraform/state_machine_pipeline.tf +++ b/terraform/state_machine_pipeline.tf @@ -3,7 +3,7 @@ resource "aws_sfn_state_machine" "catalogue_graph_pipeline" { role_arn = aws_iam_role.state_machine_execution_role.arn definition = jsonencode({ - Comment = "Extract raw concepts, transform them into nodes and edges, and stream them into an S3 bucket." 
+ Comment = "Extract all concepts and load them into the catalogue graph." StartAt = "Extractors" States = { "Extractors" = { diff --git a/terraform/variables.tf b/terraform/variables.tf index e1c6a9089b..3729976c46 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -1,5 +1,7 @@ +# Each entry corresponds to a single execution of the `extractor` and `bulk_loader` Lambda functions. The `extractor` +# Lambda function will output a single S3 file, which will be loaded into the database via the `bulk_loader` Lambda function. variable "state_machine_inputs" { - type = list(object({label: string, transformer_type: string, entity_type: string})) + type = list(object({ label : string, transformer_type : string, entity_type : string })) default = [ { "label" : "LoC Concept Nodes", From b92d0da2040818ec95cb0510792f74087aa98d18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 3 Jan 2025 14:22:20 +0000 Subject: [PATCH 016/310] Add single-extractor-loader state machine --- README.md | 33 ++++++++++++++---- src/clients/base_neptune_client.py | 1 + .../state_machine_single_extractor_loader.tf | 34 +++++++++++++++++++ 3 files changed, 61 insertions(+), 7 deletions(-) create mode 100644 terraform/state_machine_single_extractor_loader.tf diff --git a/README.md b/README.md index 7364900fb5..824d086a19 100644 --- a/README.md +++ b/README.md @@ -20,13 +20,32 @@ Lambda function execution is orchestrated via AWS Step Functions (see `terraform utilised for this purpose: * `catalogue-graph-pipeline`: Represents the full pipeline, extracting all concepts and loading them into the cluster. - Triggers the `extractors` state machine, followed by the `bulk_loaders` state machine. -* `extractors`: Invokes `extractor` Lambda function instances in parallel, one for each combination of source type and - entity type (e.g. one for LoC Concept nodes, one for LoC Concept edges, etc.). -* `bulk_loaders`: Triggers `bulk_loader` state machine instances in sequence, one for each combination of source type - and entity type. -* `bulk_loader`: Invokes a single `bulk_loader` Lambda function to start a bulk load job. Repeatedly invokes - the `bulk_load_poller` Lambda function to check the status of the job until it completes. + Triggers the `catalogue-graph-extractors` state machine, followed by the `catalogue-graph-bulk_loaders` state machine. +* `catalogue-graph-extractors`: Invokes `extractor` Lambda function instances in parallel, one for each combination of + source type and entity type (e.g. one for LoC Concept nodes, one for LoC Concept edges, etc.). +* `catalogue-graph-bulk-loaders`: Triggers `catalogue-graph-bulk-loader` state machine instances in sequence, one for + each combination of transformer type and entity type. +* `catalogue-graph-bulk-loader`: Invokes a single `bulk_loader` Lambda function to start a bulk load job. Then + repeatedly invokes the `bulk_load_poller` Lambda function to check the status of the job until it completes. +* `catalogue-graph-single-extract-load`: Not part of the full pipeline. Extracts and loads a single entity type by + invoking the `extractor` Lambda function, followed by the `catalogue-graph-bulk-loader` state machine. Useful for + updating the graph after a change in a single source/transformer without having to run the full pipeline. 
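The `catalogue-graph-single-extract-load` state machine described above can also be started from a script rather than the console. A boto3 sketch (the state machine ARN is inferred from the naming convention and account used elsewhere in this repository, so treat it as an assumption):

```python
import json

import boto3

# Assumed ARN, following the catalogue-graph-* naming convention.
STATE_MACHINE_ARN = (
    "arn:aws:states:eu-west-1:760097843905:stateMachine:catalogue-graph-single-extract-load"
)

session = boto3.Session(profile_name="platform-developer")
sfn = session.client("stepfunctions", region_name="eu-west-1")

execution = sfn.start_execution(
    stateMachineArn=STATE_MACHINE_ARN,
    input=json.dumps(
        {"transformer_type": "loc_concepts", "entity_type": "nodes", "sample_size": None}
    ),
)
print(execution["executionArn"])
```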
+ +## Running the pipeline + +The full pipeline can be triggered manually via +the [AWS console](https://eu-west-1.console.aws.amazon.com/states/home?region=eu-west-1#/statemachines/view/arn%3Aaws%3Astates%3Aeu-west-1%3A760097843905%3AstateMachine%3Acatalogue-graph-pipeline). + +The `catalogue-graph-single-extract-load` pipeline can also be triggered via the console, requiring input in the +following format: + +```json +{ + "transformer_type": "loc_concepts", + "entity_type": "nodes", + "sample_size": null +} +``` ## Source code organisation diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 60279b7d27..42f409b10a 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -78,6 +78,7 @@ def initiate_bulk_load(self, s3_file_uri: str) -> str: "parallelism": "MEDIUM", "queueRequest": "TRUE", "userProvidedEdgeIds": "TRUE", + "updateSingleCardinalityProperties": "TRUE", }, ) return response["payload"]["loadId"] diff --git a/terraform/state_machine_single_extractor_loader.tf b/terraform/state_machine_single_extractor_loader.tf new file mode 100644 index 0000000000..7614d03db4 --- /dev/null +++ b/terraform/state_machine_single_extractor_loader.tf @@ -0,0 +1,34 @@ +resource "aws_sfn_state_machine" "catalogue_graph_single_extract_load" { + name = "catalogue-graph-single-extract-load" + role_arn = aws_iam_role.state_machine_execution_role.arn + + definition = jsonencode({ + Comment = "Extract nodes/edges from a single source and load them into the catalogue graph." + StartAt = "Extract" + States = { + "Extract" = { + Type = "Task" + Resource = module.extractor_lambda.lambda.arn + Next = "Load" + "Parameters" : { + "stream_destination" : "s3", + "transformer_type.$" : "$$.Execution.Input.transformer_type", + "entity_type.$" : "$$.Execution.Input.entity_type", + "sample_size.$" : "$$.Execution.Input.sample_size" + } + } + "Load" = { + Type = "Task" + Resource = "arn:aws:states:::states:startExecution.sync:2", + Parameters = { + StateMachineArn = aws_sfn_state_machine.catalogue_graph_bulk_loader.arn + "Input.$" : "$$.Execution.Input", + } + Next = "Success" + }, + Success = { + Type = "Succeed" + } + }, + }) +} From ac8edbb26d0d6a66760046b5db7ba5b142ea7fd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 3 Jan 2025 14:33:55 +0000 Subject: [PATCH 017/310] Update README.md --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 2fd6d339b2..871b887e72 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,19 @@ Subdirectories contain various modules and are shared by all Lambda functions. yields a single Pydantic model). The BaseTransformer class implements a `stream_to_` method for each supported destination. +## Deployment + +The pipeline does not currently have an automated deployment process in place. 
To deploy a new version of the source +code to all Lambda functions, run the `create_zip.sh` script (to create a new `build.zip` file), followed by +a `terraform apply` command (to upload the new zip file to all Lambda functions): + +```sh +sh create_zip.sh +cd terraform +terraform apply +``` + + ## Local execution To run one of the Lambda functions locally, navigate to the `src` directory and then run the chosen function via the From fd7bdfa90fc3e4e8f3e35652a3e1f18eefb8ffae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 6 Jan 2025 10:45:26 +0000 Subject: [PATCH 018/310] Expand BaseNeptuneClient documentation --- src/clients/base_neptune_client.py | 18 +++++++++++++++++- src/transformers/base_transformer.py | 7 +++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 42f409b10a..9f7a0ed30b 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -46,15 +46,22 @@ def _make_request(self, method: str, relative_url: str, payload: dict = None): @backoff.on_exception(backoff.constant, Exception, max_tries=5, interval=1) def run_open_cypher_query(self, query: str): + """Runs an openCypher query against the Neptune cluster. Automatically retries up to 5 times + to mitigate transient errors.""" payload = {"query": query} response = self._make_request("POST", "/openCypher", payload) return response["results"] def get_graph_summary(self): + """ + Returns a Neptune summary report about the graph. + See https://docs.aws.amazon.com/neptune/latest/userguide/neptune-graph-summary.html for more info. + """ response = self._make_request("GET", "/propertygraph/statistics/summary") return response["payload"]["graphSummary"] - def reset_database(self): + def _reset_database(self): + """Irreversibly wipes all data from the database. This method only exists for development purposes.""" # TODO: Only keep this function for testing purposes. Remove before releasing. data = {"action": "initiateDatabaseReset"} response = self._make_request("POST", "/system", data) @@ -66,6 +73,10 @@ def reset_database(self): return response def initiate_bulk_load(self, s3_file_uri: str) -> str: + """ + Initiates a Neptune bulk load from an S3 file. + See https://docs.aws.amazon.com/neptune/latest/userguide/load-api-reference-load.html for more info. + """ response = self._make_request( "POST", "/loader", @@ -84,6 +95,10 @@ def initiate_bulk_load(self, s3_file_uri: str) -> str: return response["payload"]["loadId"] def get_bulk_load_status(self, load_id: str): + """ + Checks the status of a Neptune bulk load job and prints the results. Returns the overall status of the job. + See https://docs.aws.amazon.com/neptune/latest/userguide/load-api-reference-status-requests.html for more info. 
+ """ # Response format: https://docs.aws.amazon.com/neptune/latest/userguide/load-api-reference-status-response.html response = self._make_request( "GET", f"/loader?loadId={load_id}&errors=TRUE&details=TRUE" @@ -123,6 +138,7 @@ def get_bulk_load_status(self, load_id: str): return status def get_bulk_load_statuses(self): + """Returns the loadIDs of the last 5 Neptune bulk load jobs.""" response = self._make_request("GET", "/loader") payload = response["payload"] return payload diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 1a183d1b17..9ee6cf3638 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -151,19 +151,22 @@ def run_query(chunk): neptune_client.run_open_cypher_query(query) with concurrent.futures.ThreadPoolExecutor() as executor: + # Run the first 10 queries in parallel futures = { executor.submit(run_query, chunk) for i, chunk in enumerate(islice(chunks, 10)) } while futures: + # Wait for one or more queries to complete done, futures = concurrent.futures.wait( futures, return_when=concurrent.futures.FIRST_COMPLETED ) - for fut in done: - fut.result() + for future in done: + future.result() + # Top up with new queries to keep the total number of parallel queries at 10 for chunk in islice(chunks, len(done)): futures.add(executor.submit(run_query, chunk)) From 7070ccebd77f0a516f0d548267f4ade8b7021923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 6 Jan 2025 11:39:46 +0000 Subject: [PATCH 019/310] Add autoformating workflow Add autoformating workflow --- .github/workflows/autoformat.yml | 49 ++++++++++++++++++++++++++++++++ scripts/run_formatting.sh | 14 +++++++++ src/test.py | 6 ++++ 3 files changed, 69 insertions(+) create mode 100644 .github/workflows/autoformat.yml create mode 100755 scripts/run_formatting.sh create mode 100644 src/test.py diff --git a/.github/workflows/autoformat.yml b/.github/workflows/autoformat.yml new file mode 100644 index 0000000000..944e96a291 --- /dev/null +++ b/.github/workflows/autoformat.yml @@ -0,0 +1,49 @@ +# Runs auto-formatting script on push to any branch +name: "Python auto-formatting & code quality" + +on: push + +permissions: + id-token: write + contents: write + +jobs: + black: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v5 + with: + python-version: '3.13' + + - name: Install dependencies + run: | + pip install black isort mypy + + - name: Run Black + run: | + black src/ + + - name: Run isort + run: | + isort src/ + + - name: Run mypy + run: | + mypy src/ + + - name: Check for formatting changes + id: check_formatting_changes + run: | + if [[ -n $(git status --porcelain) ]]; then + echo "changes=true" >> "$GITHUB_OUTPUT"; + fi + + - name: Commit and push formatting changes + if: steps.check_formatting_changes.outputs.changes == 'true' + run: | + git config user.name "Github on behalf of Wellcome Collection" + git config user.email "wellcomedigitalplatform@wellcome.ac.uk" + git commit -am "Apply auto-formatting rules" + git push diff --git a/scripts/run_formatting.sh b/scripts/run_formatting.sh new file mode 100755 index 0000000000..444c8020c9 --- /dev/null +++ b/scripts/run_formatting.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset + +ECR_REGISTRY="760097843905.dkr.ecr.eu-west-1.amazonaws.com" + +ROOT=$(git rev-parse --show-toplevel) + +docker run --tty --rm \ + --volume "$ROOT:/repo" \ + --workdir /repo \ + "$ECR_REGISTRY/pyfound/black" 
\ + black --exclude ".lambda_zips/|.terraform/|target/" . diff --git a/src/test.py b/src/test.py new file mode 100644 index 0000000000..981a260cf8 --- /dev/null +++ b/src/test.py @@ -0,0 +1,6 @@ +import re +import math + + +math.sqrt(2) +re.match("awd", "awd") From deb9431c59d5c4f22f1f01e23f91180ea644edf4 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Mon, 6 Jan 2025 12:12:58 +0000 Subject: [PATCH 020/310] Apply auto-formatting rules --- src/test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/test.py b/src/test.py index 981a260cf8..b26e35164a 100644 --- a/src/test.py +++ b/src/test.py @@ -1,6 +1,5 @@ -import re import math - +import re math.sqrt(2) re.match("awd", "awd") From ee8add8e21c17b2bdcb112339d02930b1d8ae389 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 6 Jan 2025 13:25:46 +0000 Subject: [PATCH 021/310] Typing fixes --- src/clients/base_neptune_client.py | 4 +++- src/converters/__init__.py | 0 src/converters/cypher/base_converter.py | 7 ++++-- src/extractor.py | 2 +- src/sources/base_source.py | 1 - src/transformers/base_transformer.py | 22 +++++++++---------- src/transformers/loc/concepts_transformer.py | 2 +- src/transformers/loc/locations_transformer.py | 2 +- src/transformers/loc/names_transformer.py | 2 +- 9 files changed, 23 insertions(+), 19 deletions(-) create mode 100644 src/converters/__init__.py diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 9f7a0ed30b..2c9bf44ef3 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -22,7 +22,9 @@ def __init__(self): def _get_client_url(self): raise NotImplementedError() - def _make_request(self, method: str, relative_url: str, payload: dict = None): + def _make_request( + self, method: str, relative_url: str, payload: dict | None = None + ): client_url = self._get_client_url() url = f"{client_url}{relative_url}" diff --git a/src/converters/__init__.py b/src/converters/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/converters/cypher/base_converter.py b/src/converters/cypher/base_converter.py index 80a65aaa5d..43f0b5f4ea 100644 --- a/src/converters/cypher/base_converter.py +++ b/src/converters/cypher/base_converter.py @@ -1,3 +1,6 @@ +import typing + + class CypherBaseConverter: """ Converts various data types into a format compatible with openCypher. 
@@ -12,11 +15,11 @@ def _convert_bool(self, raw_value: bool) -> str: def _convert_none(self) -> str: return "null" - def _convert_list(self, raw_value: list[any]) -> str: + def _convert_list(self, raw_value: list[typing.Any]) -> str: # Neptune does not support lists, so we convert them to a single string with a `||` separator return self._raw_value_to_cypher_value("||".join(raw_value)) - def _raw_value_to_cypher_value(self, raw_value: any) -> str: + def _raw_value_to_cypher_value(self, raw_value: typing.Any) -> str: if isinstance(raw_value, str): value = self._convert_str(raw_value) elif isinstance(raw_value, bool): diff --git a/src/extractor.py b/src/extractor.py index 07d65b2c75..56fad106de 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -16,7 +16,7 @@ def handler( stream_destination: StreamDestination, transformer_type: TransformerType, entity_type: EntityType, - sample_size: int = None, + sample_size: int | None = None, is_local=False, ): print( diff --git a/src/sources/base_source.py b/src/sources/base_source.py index 84e59ce540..9f135c668d 100644 --- a/src/sources/base_source.py +++ b/src/sources/base_source.py @@ -4,4 +4,3 @@ class BaseSource: def stream_raw(self) -> Generator[dict]: raise NotImplementedError("Each source must implement a `stream_raw` method.") - diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 9ee6cf3638..17c6b27609 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -41,7 +41,7 @@ def extract_edges(self, raw_node: dict) -> Generator[BaseEdge]: "Each transformer must implement an `extract_edges` method." ) - def _stream_nodes(self, number: int = None) -> Generator[BaseNode]: + def _stream_nodes(self, number: int | None = None) -> Generator[BaseNode]: """ Extracts nodes from the specified source and transforms them. The `source` must define a `stream_raw` method. Takes an optional parameter to only extract the first `number` nodes. @@ -60,7 +60,7 @@ def _stream_nodes(self, number: int = None) -> Generator[BaseNode]: if counter == number: return - def _stream_edges(self, number: int = None) -> Generator[BaseEdge]: + def _stream_edges(self, number: int | None = None) -> Generator[BaseEdge]: """ Extracts edges from the specified source and transforms them. The `source` must define a `stream_raw` method. Takes an optional parameter to only extract the first `number` edges. @@ -80,7 +80,7 @@ def _stream_edges(self, number: int = None) -> Generator[BaseEdge]: return def _stream_entities( - self, entity_type: EntityType, sample_size: int = None + self, entity_type: EntityType, sample_size: int | None = None ) -> Generator[BaseNode | BaseEdge]: if entity_type == "nodes": entities = self._stream_nodes(sample_size) @@ -95,7 +95,7 @@ def _stream_chunks( self, entity_type: EntityType, chunk_size: int, - sample_size: int = None, + sample_size: int | None = None, ) -> Generator[list[BaseNode | BaseEdge]]: """ Extracts the specified entity type (nodes or edges) from its source, transforms each entity, @@ -110,7 +110,7 @@ def stream_to_s3( s3_uri: str, entity_type: EntityType, chunk_size: int, - sample_size: int = None, + sample_size: int | None = None, ): """ Streams transformed entities (nodes or edges) into an S3 bucket for bulk loading into the Neptune cluster. 
@@ -123,13 +123,13 @@ def stream_to_s3( converter = CypherBulkLoadConverter(entity_type) for chunk in self._stream_chunks(entity_type, chunk_size, sample_size): bulk_dicts = [] - for entity in chunk: bulk_dict = converter.convert_to_bulk_cypher(entity) bulk_dicts.append(bulk_dict) - if csv_writer is None: - csv_writer = csv.DictWriter(f, fieldnames=bulk_dict.keys()) - csv_writer.writeheader() + + if csv_writer is None: + csv_writer = csv.DictWriter(f, fieldnames=bulk_dicts[0].keys()) + csv_writer.writeheader() csv_writer.writerows(bulk_dicts) @@ -138,7 +138,7 @@ def stream_to_graph( neptune_client: BaseNeptuneClient, entity_type: EntityType, query_chunk_size: int, - sample_size: int = None, + sample_size: int | None = None, ): """ Streams transformed entities (nodes or edges) directly into Neptune using multiple threads for parallel @@ -175,7 +175,7 @@ def stream_to_sns( topic_arn: str, entity_type: EntityType, query_chunk_size: int, - sample_size: int = None, + sample_size: int | None = None, ): """ Streams transformed entities (nodes or edges) into an SNS topic as openCypher queries, where they will be diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index b10e0a5ddd..bc7c6cc47f 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -27,7 +27,7 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan]: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: - return [] + yield from () for broader_id in raw_concept.broader_concept_ids: yield SourceConceptNarrowerThan( diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index 31ab0fe9a6..fed284bbc6 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -27,7 +27,7 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan]: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: - return [] + yield from () for broader_id in raw_concept.broader_concept_ids: yield SourceConceptNarrowerThan( diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index a6985d30c3..0c6b8377d5 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -26,4 +26,4 @@ def transform_node(self, raw_node: dict) -> SourceName | None: def extract_edges(self, raw_node: dict) -> Generator[BaseEdge]: # At the moment there are no edges to extract. Return an empty generator. 
- yield from [] + yield from () From 3c17f28c982d6a42a0e9083d75e7e2fa947e87a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 6 Jan 2025 13:28:14 +0000 Subject: [PATCH 022/310] Sort imports --- src/bulk_load_poller.py | 4 ++-- src/bulk_loader.py | 3 +-- src/clients/base_neptune_client.py | 5 +++-- src/clients/lambda_neptune_client.py | 3 ++- src/converters/cypher/bulk_load_converter.py | 6 ++++-- src/converters/cypher/query_converter.py | 6 ++++-- src/extractor.py | 8 +++---- src/indexer.py | 2 +- src/models/graph_node.py | 2 +- src/query_builders/cypher.py | 2 +- src/sources/gzip_source.py | 6 ++++-- src/transformers/base_transformer.py | 21 ++++++++++--------- src/transformers/loc/concepts_transformer.py | 8 ++++--- src/transformers/loc/locations_transformer.py | 8 ++++--- src/transformers/loc/names_transformer.py | 7 ++++--- src/transformers/transformer_type.py | 2 +- src/utils/aws.py | 3 ++- 17 files changed, 55 insertions(+), 41 deletions(-) diff --git a/src/bulk_load_poller.py b/src/bulk_load_poller.py index 1a4535edd8..d135b8501d 100644 --- a/src/bulk_load_poller.py +++ b/src/bulk_load_poller.py @@ -1,7 +1,7 @@ -from utils.aws import get_neptune_client - import argparse +from utils.aws import get_neptune_client + def handler(load_id: str, is_local=False): neptune_client = get_neptune_client(is_local) diff --git a/src/bulk_loader.py b/src/bulk_loader.py index 40c34e3dc3..c71aa9732b 100644 --- a/src/bulk_loader.py +++ b/src/bulk_loader.py @@ -1,12 +1,11 @@ -import typing import argparse import os +import typing from transformers.base_transformer import EntityType from transformers.transformer_type import TransformerType from utils.aws import get_neptune_client - S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 2c9bf44ef3..6934e2f021 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -1,7 +1,8 @@ -import backoff +import datetime import json + +import backoff import requests -import datetime from botocore.auth import SigV4Auth from botocore.awsrequest import AWSRequest diff --git a/src/clients/lambda_neptune_client.py b/src/clients/lambda_neptune_client.py index e348f3567f..5da7e2353a 100644 --- a/src/clients/lambda_neptune_client.py +++ b/src/clients/lambda_neptune_client.py @@ -1,6 +1,7 @@ -import boto3 import os +import boto3 + from .base_neptune_client import BaseNeptuneClient diff --git a/src/converters/cypher/bulk_load_converter.py b/src/converters/cypher/bulk_load_converter.py index a7941f2d05..2f1ea70780 100644 --- a/src/converters/cypher/bulk_load_converter.py +++ b/src/converters/cypher/bulk_load_converter.py @@ -1,7 +1,9 @@ -from models.graph_node import BaseNode +from typing import Literal + from models.graph_edge import BaseEdge +from models.graph_node import BaseNode + from .base_converter import CypherBaseConverter -from typing import Literal class CypherBulkLoadConverter(CypherBaseConverter): diff --git a/src/converters/cypher/query_converter.py b/src/converters/cypher/query_converter.py index b90a9b3752..0f76516c77 100644 --- a/src/converters/cypher/query_converter.py +++ b/src/converters/cypher/query_converter.py @@ -1,7 +1,9 @@ -from models.graph_node import BaseNode +from typing import Literal + from models.graph_edge import BaseEdge +from models.graph_node import BaseNode + from .base_converter import CypherBaseConverter -from typing import Literal class 
CypherQueryConverter(CypherBaseConverter): diff --git a/src/extractor.py b/src/extractor.py index 56fad106de..8f17dff541 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -1,11 +1,11 @@ import argparse -import typing import os +import typing -from utils.aws import get_neptune_client -from transformers.base_transformer import BaseTransformer, EntityType, StreamDestination +from transformers.base_transformer import (BaseTransformer, EntityType, + StreamDestination) from transformers.transformer_type import TransformerType - +from utils.aws import get_neptune_client CHUNK_SIZE = 256 S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] diff --git a/src/indexer.py b/src/indexer.py index 5648ae8161..a08a88322b 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -1,5 +1,5 @@ -import json import argparse +import json from utils.aws import get_neptune_client diff --git a/src/models/graph_node.py b/src/models/graph_node.py index c02e53b76d..d1d7241a45 100644 --- a/src/models/graph_node.py +++ b/src/models/graph_node.py @@ -1,7 +1,7 @@ import datetime +from typing import Literal, Optional from pydantic import BaseModel -from typing import Literal, Optional # Each node must have a label and an id diff --git a/src/query_builders/cypher.py b/src/query_builders/cypher.py index e9f62f6a8b..2014fe82ed 100644 --- a/src/query_builders/cypher.py +++ b/src/query_builders/cypher.py @@ -1,6 +1,6 @@ +from converters.cypher.query_converter import CypherQueryConverter from models.graph_edge import BaseEdge from models.graph_node import BaseNode -from converters.cypher.query_converter import CypherQueryConverter def construct_upsert_nodes_query(nodes: list[BaseNode]) -> str: diff --git a/src/sources/gzip_source.py b/src/sources/gzip_source.py index 57ea91dfe0..59755a9366 100644 --- a/src/sources/gzip_source.py +++ b/src/sources/gzip_source.py @@ -1,7 +1,9 @@ -import requests -from collections.abc import Generator import gzip import json +from collections.abc import Generator + +import requests + from .base_source import BaseSource diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 17c6b27609..4cc8fb45a6 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -1,18 +1,19 @@ +import concurrent.futures +import csv from collections.abc import Generator +from itertools import islice +from typing import Literal + +import boto3 +import smart_open + +from clients.base_neptune_client import BaseNeptuneClient +from converters.cypher.bulk_load_converter import CypherBulkLoadConverter from models.graph_edge import BaseEdge from models.graph_node import BaseNode -from sources.base_source import BaseSource -from clients.base_neptune_client import BaseNeptuneClient from query_builders.cypher import construct_upsert_cypher_query +from sources.base_source import BaseSource from utils.aws import publish_batch_to_sns -from converters.cypher.bulk_load_converter import CypherBulkLoadConverter - -import smart_open -import concurrent.futures -import boto3 -import csv -from typing import Literal -from itertools import islice EntityType = Literal["nodes", "edges"] StreamDestination = Literal["graph", "s3", "sns"] diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index bc7c6cc47f..783d30f516 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -1,8 +1,10 @@ -from sources.gzip_source import GZipSource -from 
transformers.base_transformer import BaseTransformer from collections.abc import Generator -from models.graph_node import SourceConcept + from models.graph_edge import SourceConceptNarrowerThan +from models.graph_node import SourceConcept +from sources.gzip_source import GZipSource +from transformers.base_transformer import BaseTransformer + from .raw_concept import RawLibraryOfCongressConcept diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index fed284bbc6..5ff7f430e9 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -1,8 +1,10 @@ -from sources.gzip_source import MultiGZipSource -from transformers.base_transformer import BaseTransformer from collections.abc import Generator -from models.graph_node import SourceLocation + from models.graph_edge import SourceConceptNarrowerThan +from models.graph_node import SourceLocation +from sources.gzip_source import MultiGZipSource +from transformers.base_transformer import BaseTransformer + from .raw_concept import RawLibraryOfCongressConcept diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 0c6b8377d5..7206d30039 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -1,9 +1,10 @@ +from collections.abc import Generator + +from models.graph_edge import BaseEdge +from models.graph_node import SourceName from sources.gzip_source import GZipSource from transformers.base_transformer import BaseTransformer -from collections.abc import Generator -from models.graph_node import SourceName -from models.graph_edge import BaseEdge from .raw_concept import RawLibraryOfCongressConcept diff --git a/src/transformers/transformer_type.py b/src/transformers/transformer_type.py index 7689079cad..39fd20bcc4 100644 --- a/src/transformers/transformer_type.py +++ b/src/transformers/transformer_type.py @@ -1,8 +1,8 @@ import enum from .loc.concepts_transformer import LibraryOfCongressConceptsTransformer -from .loc.names_transformer import LibraryOfCongressNamesTransformer from .loc.locations_transformer import LibraryOfCongressLocationsTransformer +from .loc.names_transformer import LibraryOfCongressNamesTransformer LOC_SUBJECT_HEADINGS_URL = ( "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" diff --git a/src/utils/aws.py b/src/utils/aws.py index eef89ba3c9..555d04b757 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -1,6 +1,7 @@ -import boto3 import json +import boto3 + from clients.lambda_neptune_client import LambdaNeptuneClient from clients.local_neptune_client import LocalNeptuneClient From c3fb0e9a4629aafb0f9bc2b66f3263b4298c4186 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 6 Jan 2025 13:43:20 +0000 Subject: [PATCH 023/310] Create mypy.ini --- src/mypy.ini | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 src/mypy.ini diff --git a/src/mypy.ini b/src/mypy.ini new file mode 100644 index 0000000000..49965705e4 --- /dev/null +++ b/src/mypy.ini @@ -0,0 +1,2 @@ +[mypy-smart_open.*] +ignore_missing_imports = True From 6def21cc40a770ec7eb75237cbcab0b01e80ac3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 6 Jan 2025 13:46:22 +0000 Subject: [PATCH 024/310] Install stubs --- src/clients/base_neptune_client.py | 6 +++++- src/requirements.in | 3 +++ src/requirements.txt | 11 +++++++++++ src/transformers/base_transformer.py | 2 ++ 4 files 
changed, 21 insertions(+), 1 deletion(-) diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 6934e2f021..014800d537 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -39,7 +39,11 @@ def _make_request( ) response = requests.request( - method, url, data=data, headers=request.headers, verify=self.verify_requests + method, + url, + data=data, + headers=dict(request.headers), + verify=self.verify_requests, ) if response.status_code != 200: diff --git a/src/requirements.in b/src/requirements.in index f56c57658c..33f10299d1 100644 --- a/src/requirements.in +++ b/src/requirements.in @@ -3,3 +3,6 @@ requests pydantic backoff smart-open + +boto3-stubs +types-requests diff --git a/src/requirements.txt b/src/requirements.txt index 78556d0fd0..4fb198ddf7 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -10,10 +10,14 @@ backoff==2.2.1 # via -r requirements.in boto3==1.35.77 # via -r requirements.in +boto3-stubs==1.35.92 + # via -r requirements.in botocore==1.35.77 # via # boto3 # s3transfer +botocore-stubs==1.35.92 + # via boto3-stubs certifi==2024.8.30 # via requests charset-normalizer==3.4.0 @@ -38,6 +42,12 @@ six==1.17.0 # via python-dateutil smart-open==7.1.0 # via -r requirements.in +types-awscrt==0.23.6 + # via botocore-stubs +types-requests==2.32.0.20241016 + # via -r requirements.in +types-s3transfer==0.10.4 + # via boto3-stubs typing-extensions==4.12.2 # via # pydantic @@ -46,5 +56,6 @@ urllib3==2.2.3 # via # botocore # requests + # types-requests wrapt==1.17.0 # via smart-open diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 4cc8fb45a6..2f1930d9a3 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -83,6 +83,8 @@ def _stream_edges(self, number: int | None = None) -> Generator[BaseEdge]: def _stream_entities( self, entity_type: EntityType, sample_size: int | None = None ) -> Generator[BaseNode | BaseEdge]: + entities: Generator[BaseNode | BaseEdge] # Make mypy happy + if entity_type == "nodes": entities = self._stream_nodes(sample_size) elif entity_type == "edges": From c642833959e2b5e410bc25bb182a1bb375a83656 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 6 Jan 2025 13:47:53 +0000 Subject: [PATCH 025/310] Refactor autoformat GitHub action --- ...utoformat.yml => autoformat-typecheck.yml} | 33 ++++++++++++++----- scripts/run_formatting.sh | 14 -------- src/converters/cypher/bulk_load_converter.py | 8 ++--- src/converters/cypher/query_converter.py | 6 ++-- src/mypy.ini | 2 ++ src/query_builders/cypher.py | 14 +++++--- src/test.py | 5 --- src/transformers/base_transformer.py | 4 +-- 8 files changed, 44 insertions(+), 42 deletions(-) rename .github/workflows/{autoformat.yml => autoformat-typecheck.yml} (64%) delete mode 100755 scripts/run_formatting.sh delete mode 100644 src/test.py diff --git a/.github/workflows/autoformat.yml b/.github/workflows/autoformat-typecheck.yml similarity index 64% rename from .github/workflows/autoformat.yml rename to .github/workflows/autoformat-typecheck.yml index 944e96a291..ae6a1cff7f 100644 --- a/.github/workflows/autoformat.yml +++ b/.github/workflows/autoformat-typecheck.yml @@ -1,5 +1,5 @@ -# Runs auto-formatting script on push to any branch -name: "Python auto-formatting & code quality" +# Runs auto-formatting and type checking on push to any branch +name: "Python auto-formatting & type checking" on: push @@ -8,7 +8,7 @@ permissions: 
contents: write jobs: - black: + typecheck: runs-on: ubuntu-latest steps: @@ -19,7 +19,26 @@ jobs: - name: Install dependencies run: | - pip install black isort mypy + pip install mypy + pip install -r src/requirements.txt + + - name: Run mypy + run: | + mypy --config-file src/mypy.ini src/ + + autoformat: + runs-on: ubuntu-latest + needs: typecheck + + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v5 + with: + python-version: '3.13' + + - name: Install dependencies + run: | + pip install black isort - name: Run Black run: | @@ -27,11 +46,7 @@ jobs: - name: Run isort run: | - isort src/ - - - name: Run mypy - run: | - mypy src/ + isort src/ - name: Check for formatting changes id: check_formatting_changes diff --git a/scripts/run_formatting.sh b/scripts/run_formatting.sh deleted file mode 100755 index 444c8020c9..0000000000 --- a/scripts/run_formatting.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -set -o errexit -set -o nounset - -ECR_REGISTRY="760097843905.dkr.ecr.eu-west-1.amazonaws.com" - -ROOT=$(git rev-parse --show-toplevel) - -docker run --tty --rm \ - --volume "$ROOT:/repo" \ - --workdir /repo \ - "$ECR_REGISTRY/pyfound/black" \ - black --exclude ".lambda_zips/|.terraform/|target/" . diff --git a/src/converters/cypher/bulk_load_converter.py b/src/converters/cypher/bulk_load_converter.py index 2f1ea70780..5b356b304a 100644 --- a/src/converters/cypher/bulk_load_converter.py +++ b/src/converters/cypher/bulk_load_converter.py @@ -1,4 +1,4 @@ -from typing import Literal +from typing import Literal, cast from models.graph_edge import BaseEdge from models.graph_node import BaseNode @@ -33,16 +33,16 @@ def _edge_to_bulk_cypher(self, model: BaseEdge): return bulk_edge - def convert_to_bulk_cypher(self, model: BaseNode | BaseEdge): + def convert_to_bulk_cypher(self, model: BaseNode | BaseEdge) -> dict[str, str]: """ Returns a dictionary representing the entity (node or edge), converting all values into a format compatible with openCypher, and adding all required values for bulk upload, such as `:ID` or `:LABEL`. See https://docs.aws.amazon.com/neptune/latest/userguide/bulk-load-tutorial-format-opencypher.html. """ if self.entity_type == "nodes": - return self._node_to_bulk_cypher(model) + return self._node_to_bulk_cypher(cast(BaseNode, model)) elif self.entity_type == "edges": - return self._edge_to_bulk_cypher(model) + return self._edge_to_bulk_cypher(cast(BaseEdge, model)) else: raise ValueError( "Unsupported Pydantic model. Each model must subclass BaseEdge or BaseNode." diff --git a/src/converters/cypher/query_converter.py b/src/converters/cypher/query_converter.py index 0f76516c77..6ac5c43544 100644 --- a/src/converters/cypher/query_converter.py +++ b/src/converters/cypher/query_converter.py @@ -1,4 +1,4 @@ -from typing import Literal +from typing import Literal, cast from models.graph_edge import BaseEdge from models.graph_node import BaseNode @@ -49,9 +49,9 @@ def convert_to_cypher_map(self, model: BaseNode | BaseEdge): See https://neo4j.com/docs/cypher-manual/current/values-and-types/maps/. """ if self.entity_type == "nodes": - return self._node_to_cypher_map(model) + return self._node_to_cypher_map(cast(BaseNode, model)) elif self.entity_type == "edges": - return self._edge_to_cypher_map(model) + return self._edge_to_cypher_map(cast(BaseEdge, model)) else: raise ValueError( "Unsupported Pydantic model. Each model must subclass BaseEdge or BaseNode." 
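A brief note on the cast() calls introduced in the two converters above: typing.cast performs no conversion or check at runtime, it only tells mypy to treat the value as the narrower type, because the checker cannot infer from the entity_type string which half of the BaseNode | BaseEdge union is being handled. A minimal, repository-independent sketch of the pattern (the stub classes and describe() function below are illustrative, not the pipeline's models):

    from typing import Literal, cast

    class Node: ...
    class Edge: ...

    def describe(model: Node | Edge, entity_type: Literal["nodes", "edges"]) -> str:
        if entity_type == "nodes":
            # cast() is a runtime no-op; it only narrows the union for mypy, so the
            # caller remains responsible for passing a model that matches entity_type.
            return f"node: {type(cast(Node, model)).__name__}"
        return f"edge: {type(cast(Edge, model)).__name__}"

An isinstance() check would give the same narrowing with a runtime guarantee, at the cost of diverging from the entity_type-driven dispatch these converters already use.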
diff --git a/src/mypy.ini b/src/mypy.ini index 49965705e4..41349e5583 100644 --- a/src/mypy.ini +++ b/src/mypy.ini @@ -1,2 +1,4 @@ +[mypy] + [mypy-smart_open.*] ignore_missing_imports = True diff --git a/src/query_builders/cypher.py b/src/query_builders/cypher.py index 2014fe82ed..54f8479961 100644 --- a/src/query_builders/cypher.py +++ b/src/query_builders/cypher.py @@ -2,6 +2,8 @@ from models.graph_edge import BaseEdge from models.graph_node import BaseNode +from typing import Literal, cast + def construct_upsert_nodes_query(nodes: list[BaseNode]) -> str: model_name = type(nodes[0]).__name__ @@ -54,7 +56,9 @@ def construct_upsert_edges_query(edges: list[BaseEdge]) -> str: return query -def construct_upsert_cypher_query(entities: list[BaseNode | BaseEdge]): +def construct_upsert_cypher_query( + entities: list[BaseNode | BaseEdge], entity_type: Literal["nodes", "edges"] +): """ Returns an openCypher `UNWIND` query which creates a graph node or edge for each item specified in `entities`, or updates an existing matching node or edge. @@ -62,10 +66,10 @@ def construct_upsert_cypher_query(entities: list[BaseNode | BaseEdge]): All passed `entities` must be instances of the same Pydantic model because labels cannot be set dynamically in openCypher. """ - if isinstance(entities[0], BaseNode): - return construct_upsert_nodes_query(entities) - elif isinstance(entities[0], BaseEdge): - return construct_upsert_edges_query(entities) + if entity_type == "nodes": + return construct_upsert_nodes_query(cast(list[BaseNode], entities)) + elif entity_type == "edges": + return construct_upsert_edges_query(cast(list[BaseEdge], entities)) else: raise ValueError( "Unsupported Pydantic model. Each model must subclass BaseEdge or BaseNode." diff --git a/src/test.py b/src/test.py deleted file mode 100644 index b26e35164a..0000000000 --- a/src/test.py +++ /dev/null @@ -1,5 +0,0 @@ -import math -import re - -math.sqrt(2) -re.match("awd", "awd") diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 2f1930d9a3..77620aac60 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -150,7 +150,7 @@ def stream_to_graph( chunks = self._stream_chunks(entity_type, query_chunk_size, sample_size) def run_query(chunk): - query = construct_upsert_cypher_query(chunk) + query = construct_upsert_cypher_query(chunk, entity_type) neptune_client.run_open_cypher_query(query) with concurrent.futures.ThreadPoolExecutor() as executor: @@ -188,7 +188,7 @@ def stream_to_sns( counter = 0 for chunk in self._stream_chunks(entity_type, query_chunk_size, sample_size): - queries.append(construct_upsert_cypher_query(chunk)) + queries.append(construct_upsert_cypher_query(chunk, entity_type)) # SNS supports a maximum batch size of 10 if len(queries) >= 10: From 6d7f265019265c1724bb7a65807d2ad0dad86ee4 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Mon, 6 Jan 2025 14:32:24 +0000 Subject: [PATCH 026/310] Apply auto-formatting rules --- src/query_builders/cypher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/query_builders/cypher.py b/src/query_builders/cypher.py index 54f8479961..fdb1363fbd 100644 --- a/src/query_builders/cypher.py +++ b/src/query_builders/cypher.py @@ -1,9 +1,9 @@ +from typing import Literal, cast + from converters.cypher.query_converter import CypherQueryConverter from models.graph_edge import BaseEdge from models.graph_node import BaseNode -from typing import Literal, cast - def 
construct_upsert_nodes_query(nodes: list[BaseNode]) -> str: model_name = type(nodes[0]).__name__ From 04b3cab582c606a3db6bce18d13be2c8bb423a7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 6 Jan 2025 16:02:33 +0000 Subject: [PATCH 027/310] Add more comprehensive typing --- src/bulk_load_poller.py | 7 +++-- src/bulk_loader.py | 16 +++++----- src/clients/base_neptune_client.py | 31 +++++++++--------- src/clients/lambda_neptune_client.py | 2 +- src/clients/local_neptune_client.py | 2 +- src/converters/cypher/bulk_load_converter.py | 6 ++-- src/converters/cypher/query_converter.py | 2 +- src/extractor.py | 18 +++++------ src/indexer.py | 9 +++--- src/query_builders/cypher.py | 2 +- src/transformers/base_transformer.py | 10 +++--- src/transformers/create_transformer.py | 26 +++++++++++++++ src/transformers/loc/raw_concept.py | 33 +++++++++++++------- src/transformers/transformer_type.py | 26 --------------- src/utils/aws.py | 7 +++-- 15 files changed, 107 insertions(+), 90 deletions(-) create mode 100644 src/transformers/create_transformer.py delete mode 100644 src/transformers/transformer_type.py diff --git a/src/bulk_load_poller.py b/src/bulk_load_poller.py index d135b8501d..9aaafadb4c 100644 --- a/src/bulk_load_poller.py +++ b/src/bulk_load_poller.py @@ -1,9 +1,10 @@ import argparse +import typing from utils.aws import get_neptune_client -def handler(load_id: str, is_local=False): +def handler(load_id: str, is_local: bool = False) -> dict[str, str]: neptune_client = get_neptune_client(is_local) status = neptune_client.get_bulk_load_status(load_id) @@ -22,12 +23,12 @@ def handler(load_id: str, is_local=False): raise Exception("Load failed. See error log above.") -def lambda_handler(event: dict, context): +def lambda_handler(event: dict, context: typing.Any) -> dict[str, str]: load_id = event["loadId"] return handler(load_id) -def local_handler(): +def local_handler() -> None: parser = argparse.ArgumentParser(description="") parser.add_argument( "--load-id", diff --git a/src/bulk_loader.py b/src/bulk_loader.py index c71aa9732b..c8b67db4d5 100644 --- a/src/bulk_loader.py +++ b/src/bulk_loader.py @@ -3,13 +3,15 @@ import typing from transformers.base_transformer import EntityType -from transformers.transformer_type import TransformerType +from transformers.create_transformer import TransformerType from utils.aws import get_neptune_client S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] -def handler(transformer_type: str, entity_type: EntityType, is_local=False): +def handler( + transformer_type: TransformerType, entity_type: EntityType, is_local: bool = False +) -> dict[str, str]: file_name = f"{transformer_type}__{entity_type}.csv" s3_file_uri = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{file_name}" @@ -21,18 +23,18 @@ def handler(transformer_type: str, entity_type: EntityType, is_local=False): return {"loadId": load_id} -def lambda_handler(event: dict, context): - transformer_type = TransformerType.argparse(event["transformer_type"]) +def lambda_handler(event: dict, context: typing.Any) -> dict[str, str]: + transformer_type = event["transformer_type"] entity_type = event["entity_type"] return handler(transformer_type, entity_type) -def local_handler(): +def local_handler() -> None: parser = argparse.ArgumentParser(description="") parser.add_argument( "--transformer-type", - type=TransformerType.argparse, - choices=list(TransformerType), + type=str, + choices=typing.get_args(TransformerType), help="Which transformer's output to bulk load.", 
required=True, ) diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 6934e2f021..393c3ee29f 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -2,6 +2,7 @@ import json import backoff +import boto3 import requests from botocore.auth import SigV4Auth from botocore.awsrequest import AWSRequest @@ -15,17 +16,21 @@ class BaseNeptuneClient: cluster) or LocalNeptuneClient (when connecting to the cluster from outside the VPC). """ - def __init__(self): - self.session = None - self.neptune_endpoint = None - self.verify_requests = True + def __init__(self) -> None: + self.session: boto3.Session | None = None + self.neptune_endpoint: str | None = None + self.verify_requests: bool = True - def _get_client_url(self): + def _get_client_url(self) -> str: raise NotImplementedError() def _make_request( self, method: str, relative_url: str, payload: dict | None = None - ): + ) -> dict: + assert self.session + credentials = self.session.get_credentials() + assert credentials is not None + client_url = self._get_client_url() url = f"{client_url}{relative_url}" @@ -34,9 +39,7 @@ def _make_request( # We use IAM database authentication, which means we need to authenticate the request using AWS Signature request = AWSRequest(method=method, url=url, data=data, headers=headers) - SigV4Auth(self.session.get_credentials(), "neptune-db", "eu-west-1").add_auth( - request - ) + SigV4Auth(credentials, "neptune-db", "eu-west-1").add_auth(request) response = requests.request( method, url, data=data, headers=request.headers, verify=self.verify_requests @@ -48,14 +51,14 @@ def _make_request( return response.json() @backoff.on_exception(backoff.constant, Exception, max_tries=5, interval=1) - def run_open_cypher_query(self, query: str): + def run_open_cypher_query(self, query: str) -> dict: """Runs an openCypher query against the Neptune cluster. Automatically retries up to 5 times to mitigate transient errors.""" payload = {"query": query} response = self._make_request("POST", "/openCypher", payload) return response["results"] - def get_graph_summary(self): + def get_graph_summary(self) -> dict: """ Returns a Neptune summary report about the graph. See https://docs.aws.amazon.com/neptune/latest/userguide/neptune-graph-summary.html for more info. @@ -63,7 +66,7 @@ def get_graph_summary(self): response = self._make_request("GET", "/propertygraph/statistics/summary") return response["payload"]["graphSummary"] - def _reset_database(self): + def _reset_database(self) -> dict: """Irreversibly wipes all data from the database. This method only exists for development purposes.""" # TODO: Only keep this function for testing purposes. Remove before releasing. data = {"action": "initiateDatabaseReset"} @@ -97,7 +100,7 @@ def initiate_bulk_load(self, s3_file_uri: str) -> str: ) return response["payload"]["loadId"] - def get_bulk_load_status(self, load_id: str): + def get_bulk_load_status(self, load_id: str) -> str: """ Checks the status of a Neptune bulk load job and prints the results. Returns the overall status of the job. See https://docs.aws.amazon.com/neptune/latest/userguide/load-api-reference-status-requests.html for more info. 
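On the assert self.session and assert credentials is not None lines added to _make_request above: the session attribute is now declared as optional (boto3.Session | None), so mypy only permits get_credentials() and the SigV4 signing once the None case has been ruled out, and a bare assert is the lightest way to state that invariant. A small sketch of the same narrowing pattern, independent of the Neptune client (the Config class and endpoint attribute are illustrative only):

    class Config:
        def __init__(self) -> None:
            # Optional until a caller fills it in, like session/neptune_endpoint above.
            self.endpoint: str | None = None

        def url(self) -> str:
            # Without this assert, mypy reports that `endpoint` may be None inside the
            # f-string; with it, `str | None` is narrowed to `str` for the rest of the
            # method, and the runtime check fails loudly if the invariant is violated.
            assert self.endpoint is not None
            return f"https://{self.endpoint}:8182"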
@@ -140,7 +143,7 @@ def get_bulk_load_status(self, load_id: str): return status - def get_bulk_load_statuses(self): + def get_bulk_load_statuses(self) -> list[str]: """Returns the loadIDs of the last 5 Neptune bulk load jobs.""" response = self._make_request("GET", "/loader") payload = response["payload"] diff --git a/src/clients/lambda_neptune_client.py b/src/clients/lambda_neptune_client.py index 5da7e2353a..58746efef6 100644 --- a/src/clients/lambda_neptune_client.py +++ b/src/clients/lambda_neptune_client.py @@ -15,5 +15,5 @@ def __init__(self, neptune_endpoint: str): aws_session_token=os.getenv("AWS_SESSION_TOKEN"), ) - def _get_client_url(self): + def _get_client_url(self) -> str: return f"https://{self.neptune_endpoint}:8182" diff --git a/src/clients/local_neptune_client.py b/src/clients/local_neptune_client.py index decc84fa6d..8db3ddd13e 100644 --- a/src/clients/local_neptune_client.py +++ b/src/clients/local_neptune_client.py @@ -17,5 +17,5 @@ def __init__(self, load_balancer_url: str, neptune_endpoint: str): self.neptune_endpoint = neptune_endpoint self.session = boto3.Session() - def _get_client_url(self): + def _get_client_url(self) -> str: return self.load_balancer_url diff --git a/src/converters/cypher/bulk_load_converter.py b/src/converters/cypher/bulk_load_converter.py index 2f1ea70780..9a80f0365f 100644 --- a/src/converters/cypher/bulk_load_converter.py +++ b/src/converters/cypher/bulk_load_converter.py @@ -10,7 +10,7 @@ class CypherBulkLoadConverter(CypherBaseConverter): def __init__(self, entity_type: Literal["nodes", "edges"]): self.entity_type = entity_type - def _node_to_bulk_cypher(self, model: BaseNode): + def _node_to_bulk_cypher(self, model: BaseNode) -> dict: bulk_node = {":ID": model.id, ":LABEL": type(model).__name__} for key, raw_value in model.dict().items(): @@ -19,7 +19,7 @@ def _node_to_bulk_cypher(self, model: BaseNode): return bulk_node - def _edge_to_bulk_cypher(self, model: BaseEdge): + def _edge_to_bulk_cypher(self, model: BaseEdge) -> dict: bulk_edge = { ":ID": f"{model.from_id}-->{model.to_id}", ":START_ID": model.from_id, @@ -33,7 +33,7 @@ def _edge_to_bulk_cypher(self, model: BaseEdge): return bulk_edge - def convert_to_bulk_cypher(self, model: BaseNode | BaseEdge): + def convert_to_bulk_cypher(self, model: BaseNode | BaseEdge) -> dict: """ Returns a dictionary representing the entity (node or edge), converting all values into a format compatible with openCypher, and adding all required values for bulk upload, such as `:ID` or `:LABEL`. diff --git a/src/converters/cypher/query_converter.py b/src/converters/cypher/query_converter.py index 0f76516c77..2b5b6bca1e 100644 --- a/src/converters/cypher/query_converter.py +++ b/src/converters/cypher/query_converter.py @@ -39,7 +39,7 @@ def _edge_to_cypher_map(self, model: BaseEdge) -> str: return "{" + ", ".join(properties) + "}" - def convert_to_cypher_map(self, model: BaseNode | BaseEdge): + def convert_to_cypher_map(self, model: BaseNode | BaseEdge) -> str: """ Returns a string representing an openCypher Map of the entity (node or edge) for use with an `UNWIND` query. 
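For context on the dictionaries returned by convert_to_bulk_cypher() above: the :ID and :LABEL keys (plus :START_ID, :END_ID and :TYPE for edges) are the system columns Neptune's openCypher bulk loader expects in the CSV header, and stream_to_s3() writes them out with csv.DictWriter. A hypothetical node row might serialise as follows; the property names and values are made up for illustration, only the :ID/:LABEL columns and the `||` list separator come from the code:

    import csv
    import io

    # Shape of a single node dict as produced by the converter (values are illustrative).
    bulk_node = {
        ":ID": "example123",
        ":LABEL": "SourceConcept",
        "label": "Example concept",
        "alternative_labels": "First alias||Second alias",
    }

    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=bulk_node.keys())
    writer.writeheader()
    writer.writerow(bulk_node)
    print(buffer.getvalue())
    # :ID,:LABEL,label,alternative_labels
    # example123,SourceConcept,Example concept,First alias||Second alias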
diff --git a/src/extractor.py b/src/extractor.py index 8f17dff541..08aea8be20 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -4,7 +4,7 @@ from transformers.base_transformer import (BaseTransformer, EntityType, StreamDestination) -from transformers.transformer_type import TransformerType +from transformers.create_transformer import TransformerType, create_transformer from utils.aws import get_neptune_client CHUNK_SIZE = 256 @@ -17,14 +17,14 @@ def handler( transformer_type: TransformerType, entity_type: EntityType, sample_size: int | None = None, - is_local=False, -): + is_local: bool = False, +) -> None: print( f"Transforming {sample_size or 'all'} {entity_type} using the {transformer_type} " f"transformer and streaming them into {stream_destination}." ) - transformer: BaseTransformer = TransformerType[transformer_type.name].value + transformer: BaseTransformer = create_transformer(transformer_type) if stream_destination == "graph": neptune_client = get_neptune_client(is_local) @@ -43,21 +43,21 @@ def handler( raise ValueError("Unsupported stream destination.") -def lambda_handler(event: dict, context): +def lambda_handler(event: dict, context: typing.Any) -> None: stream_destination = event["stream_destination"] - transformer_type = TransformerType.argparse(event["transformer_type"]) + transformer_type = event["transformer_type"] entity_type = event["entity_type"] sample_size = event.get("sample_size") handler(stream_destination, transformer_type, entity_type, sample_size) -def local_handler(): +def local_handler() -> None: parser = argparse.ArgumentParser(description="") parser.add_argument( "--transformer-type", - type=TransformerType.argparse, - choices=list(TransformerType), + type=str, + choices=typing.get_args(TransformerType), help="Which transformer to use for streaming.", required=True, ) diff --git a/src/indexer.py b/src/indexer.py index a08a88322b..dad19ea5f9 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -1,10 +1,11 @@ import argparse import json +import typing from utils.aws import get_neptune_client -def extract_sns_messages_from_sqs_event(event): +def extract_sns_messages_from_sqs_event(event: dict) -> list[str]: queries = [] for record in event["Records"]: @@ -14,7 +15,7 @@ def extract_sns_messages_from_sqs_event(event): return queries -def handler(queries: list[str], is_local=False): +def handler(queries: list[str], is_local: bool = False) -> None: neptune_client = get_neptune_client(is_local) print(f"Received number of queries: {len(queries)}") @@ -23,12 +24,12 @@ def handler(queries: list[str], is_local=False): neptune_client.run_open_cypher_query(query) -def lambda_handler(event: dict, context): +def lambda_handler(event: dict, context: typing.Any) -> None: queries = extract_sns_messages_from_sqs_event(event) handler(queries) -def local_handler(): +def local_handler() -> None: parser = argparse.ArgumentParser(description="") parser.add_argument( "--cypher-query", diff --git a/src/query_builders/cypher.py b/src/query_builders/cypher.py index 2014fe82ed..82cf0f9811 100644 --- a/src/query_builders/cypher.py +++ b/src/query_builders/cypher.py @@ -54,7 +54,7 @@ def construct_upsert_edges_query(edges: list[BaseEdge]) -> str: return query -def construct_upsert_cypher_query(entities: list[BaseNode | BaseEdge]): +def construct_upsert_cypher_query(entities: list[BaseNode | BaseEdge]) -> str: """ Returns an openCypher `UNWIND` query which creates a graph node or edge for each item specified in `entities`, or updates an existing matching node or edge. 
diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 4cc8fb45a6..960b951b65 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -29,7 +29,7 @@ def _generator_to_chunks(items: Generator, chunk_size: int) -> Generator: class BaseTransformer: - def __init__(self): + def __init__(self) -> None: self.source: BaseSource = BaseSource() def transform_node(self, raw_node: dict) -> BaseNode | None: @@ -112,7 +112,7 @@ def stream_to_s3( entity_type: EntityType, chunk_size: int, sample_size: int | None = None, - ): + ) -> None: """ Streams transformed entities (nodes or edges) into an S3 bucket for bulk loading into the Neptune cluster. Suitable for indexing large numbers of entities in production. Provides limited observability. @@ -140,14 +140,14 @@ def stream_to_graph( entity_type: EntityType, query_chunk_size: int, sample_size: int | None = None, - ): + ) -> None: """ Streams transformed entities (nodes or edges) directly into Neptune using multiple threads for parallel processing. Suitable for local testing. Not recommended for indexing large numbers of entities. """ chunks = self._stream_chunks(entity_type, query_chunk_size, sample_size) - def run_query(chunk): + def run_query(chunk: list[BaseNode | BaseEdge]) -> None: query = construct_upsert_cypher_query(chunk) neptune_client.run_open_cypher_query(query) @@ -177,7 +177,7 @@ def stream_to_sns( entity_type: EntityType, query_chunk_size: int, sample_size: int | None = None, - ): + ) -> None: """ Streams transformed entities (nodes or edges) into an SNS topic as openCypher queries, where they will be consumed by the `indexer` Lambda function. diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py new file mode 100644 index 0000000000..aa2a139c6e --- /dev/null +++ b/src/transformers/create_transformer.py @@ -0,0 +1,26 @@ +from typing import Literal + +from .base_transformer import BaseTransformer +from .loc.concepts_transformer import LibraryOfCongressConceptsTransformer +from .loc.locations_transformer import LibraryOfCongressLocationsTransformer +from .loc.names_transformer import LibraryOfCongressNamesTransformer + +LOC_SUBJECT_HEADINGS_URL = ( + "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" +) +LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" + +TransformerType = Literal["loc_concepts", "loc_names", "loc_locations"] + + +def create_transformer(transformer_type: TransformerType) -> BaseTransformer: + if transformer_type == "loc_concepts": + return LibraryOfCongressConceptsTransformer(LOC_SUBJECT_HEADINGS_URL) + if transformer_type == "loc_names": + return LibraryOfCongressNamesTransformer(LOC_NAMES_URL) + if transformer_type == "loc_locations": + return LibraryOfCongressLocationsTransformer( + LOC_SUBJECT_HEADINGS_URL, LOC_NAMES_URL + ) + + raise ValueError(f"Unknown transformer type: {transformer_type}") diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index 76254a9ef1..f0508a0b81 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -1,3 +1,5 @@ +from typing import Literal + ID_PREFIXES_TO_REMOVE = ( "/authorities/subjects/", "http://id.loc.gov/authorities/subjects/", @@ -11,14 +13,14 @@ def __init__(self, raw_concept: dict): self._raw_concept_node = self._extract_concept_node() @staticmethod - def _remove_id_prefix(raw_id: str): + def _remove_id_prefix(raw_id: str) -> str: for prefix in 
ID_PREFIXES_TO_REMOVE: raw_id = raw_id.removeprefix(prefix) return raw_id - def _extract_concept_node(self): - graph = self.raw_concept["@graph"] + def _extract_concept_node(self) -> dict | None: + graph: list[dict] = self.raw_concept["@graph"] # Some LoC concepts (e.g. deprecated concepts) do not store a concept node in their graph. # When this happens, return `None` because there is no concept for us to extract. @@ -26,7 +28,8 @@ def _extract_concept_node(self): ( node for node in graph - if self.source_id in node.get("@id") and node["@type"] == "skos:Concept" + if self.source_id in node.get("@id", "") + and node["@type"] == "skos:Concept" ), None, ) @@ -34,7 +37,7 @@ def _extract_concept_node(self): return concept_node @staticmethod - def _extract_label(raw_label: str | dict): + def _extract_label(raw_label: str | dict[str, str] | list[str]) -> str: # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. if isinstance(raw_label, str): return raw_label @@ -45,7 +48,7 @@ def _extract_label(raw_label: str | dict): return raw_label["@value"] - def exclude(self): + def exclude(self) -> bool: if self._raw_concept_node is None: return True @@ -57,16 +60,20 @@ def exclude(self): return False @property - def source_id(self): + def source_id(self) -> str: return self._remove_id_prefix(self.raw_concept["@id"]) @property - def label(self): + def label(self) -> str: + assert self._raw_concept_node is not None + raw_preferred_label = self._raw_concept_node["skos:prefLabel"] return self._extract_label(raw_preferred_label) @property - def alternative_labels(self): + def alternative_labels(self) -> list[str]: + assert self._raw_concept_node is not None + raw_alternative_labels = self._raw_concept_node.get("skos:altLabel", []) # Raw alternative labels are either returned in a list of labels, or as a single label @@ -77,7 +84,9 @@ def alternative_labels(self): return [self._extract_label(raw_alternative_labels)] @property - def broader_concept_ids(self): + def broader_concept_ids(self) -> list[str]: + assert self._raw_concept_node is not None + broader_concepts = self._raw_concept_node.get("skos:broader", []) # Sometimes broader concepts are returned as a list of concepts, and sometimes as just a single JSON @@ -96,7 +105,7 @@ def broader_concept_ids(self): return broader_ids @property - def is_geographic(self): + def is_geographic(self) -> bool: if self._raw_concept_node is None: return False @@ -110,7 +119,7 @@ def is_geographic(self): return "http://id.loc.gov/datatypes/codes/gac" in notation_types @property - def source(self): + def source(self) -> Literal["lc-subjects", "lc-names"]: if "subjects" in self.raw_concept["@id"]: return "lc-subjects" diff --git a/src/transformers/transformer_type.py b/src/transformers/transformer_type.py deleted file mode 100644 index 39fd20bcc4..0000000000 --- a/src/transformers/transformer_type.py +++ /dev/null @@ -1,26 +0,0 @@ -import enum - -from .loc.concepts_transformer import LibraryOfCongressConceptsTransformer -from .loc.locations_transformer import LibraryOfCongressLocationsTransformer -from .loc.names_transformer import LibraryOfCongressNamesTransformer - -LOC_SUBJECT_HEADINGS_URL = ( - "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" -) -LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" - - -class TransformerType(enum.Enum): - LOC_CONCEPTS = LibraryOfCongressConceptsTransformer(LOC_SUBJECT_HEADINGS_URL) - LOC_NAMES = LibraryOfCongressNamesTransformer(LOC_NAMES_URL) - 
LOC_LOCATIONS = LibraryOfCongressLocationsTransformer( - LOC_SUBJECT_HEADINGS_URL, LOC_NAMES_URL - ) - - def __str__(self): - return self.name.lower() - - # For parsing lowercase Lambda/command line arguments - @staticmethod - def argparse(s): - return TransformerType[s.upper()] diff --git a/src/utils/aws.py b/src/utils/aws.py index 555d04b757..1b65713695 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -2,18 +2,19 @@ import boto3 +from clients.base_neptune_client import BaseNeptuneClient from clients.lambda_neptune_client import LambdaNeptuneClient from clients.local_neptune_client import LocalNeptuneClient -def get_secret(secret_name: str): +def get_secret(secret_name: str) -> str: secrets_manager_client = boto3.client("secretsmanager", region_name="eu-west-1") response = secrets_manager_client.get_secret_value(SecretId=secret_name) return response["SecretString"] -def publish_batch_to_sns(topic_arn: str, queries: list[str]): +def publish_batch_to_sns(topic_arn: str, queries: list[str]) -> None: request_entries = [] for i, query in enumerate(queries): request_entries.append( @@ -30,7 +31,7 @@ def publish_batch_to_sns(topic_arn: str, queries: list[str]): ) -def get_neptune_client(is_local: bool): +def get_neptune_client(is_local: bool) -> BaseNeptuneClient: if is_local: return LocalNeptuneClient( get_secret("NeptuneTest/LoadBalancerUrl"), From 5afbdab891887635dbc5a64defb2df15366953a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 6 Jan 2025 16:27:01 +0000 Subject: [PATCH 028/310] Update mypy config --- src/mypy.ini | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/mypy.ini b/src/mypy.ini index 41349e5583..7bb317dbbb 100644 --- a/src/mypy.ini +++ b/src/mypy.ini @@ -1,4 +1,15 @@ [mypy] +strict_optional = True +strict_equality = True +disallow_untyped_calls = True +disallow_untyped_defs = True + +warn_redundant_casts = True +warn_unused_ignores = True +warn_no_return = True +warn_return_any = True +warn_unreachable = True +warn_unused_configs = True [mypy-smart_open.*] ignore_missing_imports = True From 6d6b9afc6fb07bf22884f60d4d894c64346e7c97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 6 Jan 2025 16:02:33 +0000 Subject: [PATCH 029/310] Add more comprehensive typing --- src/bulk_load_poller.py | 7 ++- src/bulk_loader.py | 16 ++--- src/clients/base_neptune_client.py | 62 ++++++++++++-------- src/clients/lambda_neptune_client.py | 2 +- src/clients/local_neptune_client.py | 2 +- src/converters/cypher/bulk_load_converter.py | 6 +- src/converters/cypher/query_converter.py | 2 +- src/extractor.py | 18 +++--- src/indexer.py | 9 +-- src/query_builders/cypher.py | 2 +- src/transformers/base_transformer.py | 10 ++-- src/transformers/create_transformer.py | 26 ++++++++ src/transformers/loc/raw_concept.py | 33 +++++++---- src/transformers/transformer_type.py | 26 -------- src/utils/aws.py | 10 ++-- 15 files changed, 129 insertions(+), 102 deletions(-) create mode 100644 src/transformers/create_transformer.py delete mode 100644 src/transformers/transformer_type.py diff --git a/src/bulk_load_poller.py b/src/bulk_load_poller.py index d135b8501d..9aaafadb4c 100644 --- a/src/bulk_load_poller.py +++ b/src/bulk_load_poller.py @@ -1,9 +1,10 @@ import argparse +import typing from utils.aws import get_neptune_client -def handler(load_id: str, is_local=False): +def handler(load_id: str, is_local: bool = False) -> dict[str, str]: neptune_client = get_neptune_client(is_local) status = 
neptune_client.get_bulk_load_status(load_id) @@ -22,12 +23,12 @@ def handler(load_id: str, is_local=False): raise Exception("Load failed. See error log above.") -def lambda_handler(event: dict, context): +def lambda_handler(event: dict, context: typing.Any) -> dict[str, str]: load_id = event["loadId"] return handler(load_id) -def local_handler(): +def local_handler() -> None: parser = argparse.ArgumentParser(description="") parser.add_argument( "--load-id", diff --git a/src/bulk_loader.py b/src/bulk_loader.py index c71aa9732b..c8b67db4d5 100644 --- a/src/bulk_loader.py +++ b/src/bulk_loader.py @@ -3,13 +3,15 @@ import typing from transformers.base_transformer import EntityType -from transformers.transformer_type import TransformerType +from transformers.create_transformer import TransformerType from utils.aws import get_neptune_client S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] -def handler(transformer_type: str, entity_type: EntityType, is_local=False): +def handler( + transformer_type: TransformerType, entity_type: EntityType, is_local: bool = False +) -> dict[str, str]: file_name = f"{transformer_type}__{entity_type}.csv" s3_file_uri = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{file_name}" @@ -21,18 +23,18 @@ def handler(transformer_type: str, entity_type: EntityType, is_local=False): return {"loadId": load_id} -def lambda_handler(event: dict, context): - transformer_type = TransformerType.argparse(event["transformer_type"]) +def lambda_handler(event: dict, context: typing.Any) -> dict[str, str]: + transformer_type = event["transformer_type"] entity_type = event["entity_type"] return handler(transformer_type, entity_type) -def local_handler(): +def local_handler() -> None: parser = argparse.ArgumentParser(description="") parser.add_argument( "--transformer-type", - type=TransformerType.argparse, - choices=list(TransformerType), + type=str, + choices=typing.get_args(TransformerType), help="Which transformer's output to bulk load.", required=True, ) diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 6934e2f021..690261a494 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -2,6 +2,7 @@ import json import backoff +import boto3 import requests from botocore.auth import SigV4Auth from botocore.awsrequest import AWSRequest @@ -15,17 +16,21 @@ class BaseNeptuneClient: cluster) or LocalNeptuneClient (when connecting to the cluster from outside the VPC). 
""" - def __init__(self): - self.session = None - self.neptune_endpoint = None - self.verify_requests = True + def __init__(self) -> None: + self.session: boto3.Session | None = None + self.neptune_endpoint: str | None = None + self.verify_requests: bool = True - def _get_client_url(self): + def _get_client_url(self) -> str: raise NotImplementedError() def _make_request( self, method: str, relative_url: str, payload: dict | None = None - ): + ) -> dict: + assert self.session + credentials = self.session.get_credentials() + assert credentials is not None + client_url = self._get_client_url() url = f"{client_url}{relative_url}" @@ -34,36 +39,41 @@ def _make_request( # We use IAM database authentication, which means we need to authenticate the request using AWS Signature request = AWSRequest(method=method, url=url, data=data, headers=headers) - SigV4Auth(self.session.get_credentials(), "neptune-db", "eu-west-1").add_auth( - request - ) - - response = requests.request( - method, url, data=data, headers=request.headers, verify=self.verify_requests + SigV4Auth(credentials, "neptune-db", "eu-west-1").add_auth(request) + + raw_response = requests.request( + method, + url, + data=data, + headers=dict(request.headers), + verify=self.verify_requests, ) - if response.status_code != 200: - raise Exception(response.content) + if raw_response.status_code != 200: + raise Exception(raw_response.content) - return response.json() + response: dict = raw_response.json() + return response @backoff.on_exception(backoff.constant, Exception, max_tries=5, interval=1) - def run_open_cypher_query(self, query: str): + def run_open_cypher_query(self, query: str) -> dict: """Runs an openCypher query against the Neptune cluster. Automatically retries up to 5 times to mitigate transient errors.""" payload = {"query": query} response = self._make_request("POST", "/openCypher", payload) - return response["results"] + results: dict = response["results"] + return results - def get_graph_summary(self): + def get_graph_summary(self) -> dict: """ Returns a Neptune summary report about the graph. See https://docs.aws.amazon.com/neptune/latest/userguide/neptune-graph-summary.html for more info. """ response = self._make_request("GET", "/propertygraph/statistics/summary") - return response["payload"]["graphSummary"] + graph_summary: dict = response["payload"]["graphSummary"] + return graph_summary - def _reset_database(self): + def _reset_database(self) -> dict: """Irreversibly wipes all data from the database. This method only exists for development purposes.""" # TODO: Only keep this function for testing purposes. Remove before releasing. data = {"action": "initiateDatabaseReset"} @@ -95,9 +105,11 @@ def initiate_bulk_load(self, s3_file_uri: str) -> str: "updateSingleCardinalityProperties": "TRUE", }, ) - return response["payload"]["loadId"] - def get_bulk_load_status(self, load_id: str): + load_id: str = response["payload"]["loadId"] + return load_id + + def get_bulk_load_status(self, load_id: str) -> str: """ Checks the status of a Neptune bulk load job and prints the results. Returns the overall status of the job. See https://docs.aws.amazon.com/neptune/latest/userguide/load-api-reference-status-requests.html for more info. 
@@ -112,7 +124,7 @@ def get_bulk_load_status(self, load_id: str): error_logs = payload["errors"]["errorLogs"] # Statuses: https://docs.aws.amazon.com/neptune/latest/userguide/loader-message.html - status = overall_status["status"] + status: str = overall_status["status"] processed_count = overall_status["totalRecords"] print(f"Bulk load status: {status}. (Processed {processed_count:,} records.)") @@ -140,8 +152,8 @@ def get_bulk_load_status(self, load_id: str): return status - def get_bulk_load_statuses(self): + def get_bulk_load_statuses(self) -> list[str]: """Returns the loadIDs of the last 5 Neptune bulk load jobs.""" response = self._make_request("GET", "/loader") - payload = response["payload"] + payload: list[str] = response["payload"] return payload diff --git a/src/clients/lambda_neptune_client.py b/src/clients/lambda_neptune_client.py index 5da7e2353a..58746efef6 100644 --- a/src/clients/lambda_neptune_client.py +++ b/src/clients/lambda_neptune_client.py @@ -15,5 +15,5 @@ def __init__(self, neptune_endpoint: str): aws_session_token=os.getenv("AWS_SESSION_TOKEN"), ) - def _get_client_url(self): + def _get_client_url(self) -> str: return f"https://{self.neptune_endpoint}:8182" diff --git a/src/clients/local_neptune_client.py b/src/clients/local_neptune_client.py index decc84fa6d..8db3ddd13e 100644 --- a/src/clients/local_neptune_client.py +++ b/src/clients/local_neptune_client.py @@ -17,5 +17,5 @@ def __init__(self, load_balancer_url: str, neptune_endpoint: str): self.neptune_endpoint = neptune_endpoint self.session = boto3.Session() - def _get_client_url(self): + def _get_client_url(self) -> str: return self.load_balancer_url diff --git a/src/converters/cypher/bulk_load_converter.py b/src/converters/cypher/bulk_load_converter.py index 2f1ea70780..9a80f0365f 100644 --- a/src/converters/cypher/bulk_load_converter.py +++ b/src/converters/cypher/bulk_load_converter.py @@ -10,7 +10,7 @@ class CypherBulkLoadConverter(CypherBaseConverter): def __init__(self, entity_type: Literal["nodes", "edges"]): self.entity_type = entity_type - def _node_to_bulk_cypher(self, model: BaseNode): + def _node_to_bulk_cypher(self, model: BaseNode) -> dict: bulk_node = {":ID": model.id, ":LABEL": type(model).__name__} for key, raw_value in model.dict().items(): @@ -19,7 +19,7 @@ def _node_to_bulk_cypher(self, model: BaseNode): return bulk_node - def _edge_to_bulk_cypher(self, model: BaseEdge): + def _edge_to_bulk_cypher(self, model: BaseEdge) -> dict: bulk_edge = { ":ID": f"{model.from_id}-->{model.to_id}", ":START_ID": model.from_id, @@ -33,7 +33,7 @@ def _edge_to_bulk_cypher(self, model: BaseEdge): return bulk_edge - def convert_to_bulk_cypher(self, model: BaseNode | BaseEdge): + def convert_to_bulk_cypher(self, model: BaseNode | BaseEdge) -> dict: """ Returns a dictionary representing the entity (node or edge), converting all values into a format compatible with openCypher, and adding all required values for bulk upload, such as `:ID` or `:LABEL`. 
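To make the shape of that output concrete, here is an illustrative sketch. `ExampleConcept` is a hypothetical `BaseNode` subclass with `id` and `label` fields, not a model defined in this repository, and the exact property-value formatting is handled by `CypherBaseConverter`, which is outside this diff:

```
# Illustrative only: shows the bulk-load row produced for a hypothetical node.
converter = CypherBulkLoadConverter(entity_type="nodes")
bulk_row = converter.convert_to_bulk_cypher(ExampleConcept(id="n1", label="Example concept"))

# Expected shape of `bulk_row`:
# {
#     ":ID": "n1",                 # from model.id
#     ":LABEL": "ExampleConcept",  # from type(model).__name__
#     "label": "Example concept",  # remaining model fields, converted for openCypher
# }
```

Rows of this shape map onto the column headers the Neptune bulk loader expects (`:ID` and `:LABEL` for nodes, `:ID` and `:START_ID` and so on for edges).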
diff --git a/src/converters/cypher/query_converter.py b/src/converters/cypher/query_converter.py index 0f76516c77..2b5b6bca1e 100644 --- a/src/converters/cypher/query_converter.py +++ b/src/converters/cypher/query_converter.py @@ -39,7 +39,7 @@ def _edge_to_cypher_map(self, model: BaseEdge) -> str: return "{" + ", ".join(properties) + "}" - def convert_to_cypher_map(self, model: BaseNode | BaseEdge): + def convert_to_cypher_map(self, model: BaseNode | BaseEdge) -> str: """ Returns a string representing an openCypher Map of the entity (node or edge) for use with an `UNWIND` query. diff --git a/src/extractor.py b/src/extractor.py index 8f17dff541..08aea8be20 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -4,7 +4,7 @@ from transformers.base_transformer import (BaseTransformer, EntityType, StreamDestination) -from transformers.transformer_type import TransformerType +from transformers.create_transformer import TransformerType, create_transformer from utils.aws import get_neptune_client CHUNK_SIZE = 256 @@ -17,14 +17,14 @@ def handler( transformer_type: TransformerType, entity_type: EntityType, sample_size: int | None = None, - is_local=False, -): + is_local: bool = False, +) -> None: print( f"Transforming {sample_size or 'all'} {entity_type} using the {transformer_type} " f"transformer and streaming them into {stream_destination}." ) - transformer: BaseTransformer = TransformerType[transformer_type.name].value + transformer: BaseTransformer = create_transformer(transformer_type) if stream_destination == "graph": neptune_client = get_neptune_client(is_local) @@ -43,21 +43,21 @@ def handler( raise ValueError("Unsupported stream destination.") -def lambda_handler(event: dict, context): +def lambda_handler(event: dict, context: typing.Any) -> None: stream_destination = event["stream_destination"] - transformer_type = TransformerType.argparse(event["transformer_type"]) + transformer_type = event["transformer_type"] entity_type = event["entity_type"] sample_size = event.get("sample_size") handler(stream_destination, transformer_type, entity_type, sample_size) -def local_handler(): +def local_handler() -> None: parser = argparse.ArgumentParser(description="") parser.add_argument( "--transformer-type", - type=TransformerType.argparse, - choices=list(TransformerType), + type=str, + choices=typing.get_args(TransformerType), help="Which transformer to use for streaming.", required=True, ) diff --git a/src/indexer.py b/src/indexer.py index a08a88322b..dad19ea5f9 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -1,10 +1,11 @@ import argparse import json +import typing from utils.aws import get_neptune_client -def extract_sns_messages_from_sqs_event(event): +def extract_sns_messages_from_sqs_event(event: dict) -> list[str]: queries = [] for record in event["Records"]: @@ -14,7 +15,7 @@ def extract_sns_messages_from_sqs_event(event): return queries -def handler(queries: list[str], is_local=False): +def handler(queries: list[str], is_local: bool = False) -> None: neptune_client = get_neptune_client(is_local) print(f"Received number of queries: {len(queries)}") @@ -23,12 +24,12 @@ def handler(queries: list[str], is_local=False): neptune_client.run_open_cypher_query(query) -def lambda_handler(event: dict, context): +def lambda_handler(event: dict, context: typing.Any) -> None: queries = extract_sns_messages_from_sqs_event(event) handler(queries) -def local_handler(): +def local_handler() -> None: parser = argparse.ArgumentParser(description="") parser.add_argument( "--cypher-query", diff --git 
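As a rough end-to-end sketch of the streaming methods above: the `extractor` handler chooses one of these destinations at runtime, but the same path can be exercised directly in a local script. The snippet below assumes the `create_transformer` factory introduced in the next file, and the sample size is an arbitrary test value:

```
from transformers.create_transformer import create_transformer
from utils.aws import get_neptune_client

transformer = create_transformer("loc_concepts")
neptune_client = get_neptune_client(is_local=True)

# Equivalent to running the extractor with stream_destination="graph":
# transform a sample of LoC concept nodes and upsert them into Neptune,
# 256 entities per UNWIND query.
transformer.stream_to_graph(
    neptune_client,
    entity_type="nodes",
    query_chunk_size=256,
    sample_size=1000,
)
```

The `s3` and `sns` destinations follow the same pattern, swapping `stream_to_graph` for `stream_to_s3` or `stream_to_sns`.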
a/src/query_builders/cypher.py b/src/query_builders/cypher.py index 2014fe82ed..82cf0f9811 100644 --- a/src/query_builders/cypher.py +++ b/src/query_builders/cypher.py @@ -54,7 +54,7 @@ def construct_upsert_edges_query(edges: list[BaseEdge]) -> str: return query -def construct_upsert_cypher_query(entities: list[BaseNode | BaseEdge]): +def construct_upsert_cypher_query(entities: list[BaseNode | BaseEdge]) -> str: """ Returns an openCypher `UNWIND` query which creates a graph node or edge for each item specified in `entities`, or updates an existing matching node or edge. diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 4cc8fb45a6..960b951b65 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -29,7 +29,7 @@ def _generator_to_chunks(items: Generator, chunk_size: int) -> Generator: class BaseTransformer: - def __init__(self): + def __init__(self) -> None: self.source: BaseSource = BaseSource() def transform_node(self, raw_node: dict) -> BaseNode | None: @@ -112,7 +112,7 @@ def stream_to_s3( entity_type: EntityType, chunk_size: int, sample_size: int | None = None, - ): + ) -> None: """ Streams transformed entities (nodes or edges) into an S3 bucket for bulk loading into the Neptune cluster. Suitable for indexing large numbers of entities in production. Provides limited observability. @@ -140,14 +140,14 @@ def stream_to_graph( entity_type: EntityType, query_chunk_size: int, sample_size: int | None = None, - ): + ) -> None: """ Streams transformed entities (nodes or edges) directly into Neptune using multiple threads for parallel processing. Suitable for local testing. Not recommended for indexing large numbers of entities. """ chunks = self._stream_chunks(entity_type, query_chunk_size, sample_size) - def run_query(chunk): + def run_query(chunk: list[BaseNode | BaseEdge]) -> None: query = construct_upsert_cypher_query(chunk) neptune_client.run_open_cypher_query(query) @@ -177,7 +177,7 @@ def stream_to_sns( entity_type: EntityType, query_chunk_size: int, sample_size: int | None = None, - ): + ) -> None: """ Streams transformed entities (nodes or edges) into an SNS topic as openCypher queries, where they will be consumed by the `indexer` Lambda function. 
diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py new file mode 100644 index 0000000000..aa2a139c6e --- /dev/null +++ b/src/transformers/create_transformer.py @@ -0,0 +1,26 @@ +from typing import Literal + +from .base_transformer import BaseTransformer +from .loc.concepts_transformer import LibraryOfCongressConceptsTransformer +from .loc.locations_transformer import LibraryOfCongressLocationsTransformer +from .loc.names_transformer import LibraryOfCongressNamesTransformer + +LOC_SUBJECT_HEADINGS_URL = ( + "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" +) +LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" + +TransformerType = Literal["loc_concepts", "loc_names", "loc_locations"] + + +def create_transformer(transformer_type: TransformerType) -> BaseTransformer: + if transformer_type == "loc_concepts": + return LibraryOfCongressConceptsTransformer(LOC_SUBJECT_HEADINGS_URL) + if transformer_type == "loc_names": + return LibraryOfCongressNamesTransformer(LOC_NAMES_URL) + if transformer_type == "loc_locations": + return LibraryOfCongressLocationsTransformer( + LOC_SUBJECT_HEADINGS_URL, LOC_NAMES_URL + ) + + raise ValueError(f"Unknown transformer type: {transformer_type}") diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index 76254a9ef1..f0508a0b81 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -1,3 +1,5 @@ +from typing import Literal + ID_PREFIXES_TO_REMOVE = ( "/authorities/subjects/", "http://id.loc.gov/authorities/subjects/", @@ -11,14 +13,14 @@ def __init__(self, raw_concept: dict): self._raw_concept_node = self._extract_concept_node() @staticmethod - def _remove_id_prefix(raw_id: str): + def _remove_id_prefix(raw_id: str) -> str: for prefix in ID_PREFIXES_TO_REMOVE: raw_id = raw_id.removeprefix(prefix) return raw_id - def _extract_concept_node(self): - graph = self.raw_concept["@graph"] + def _extract_concept_node(self) -> dict | None: + graph: list[dict] = self.raw_concept["@graph"] # Some LoC concepts (e.g. deprecated concepts) do not store a concept node in their graph. # When this happens, return `None` because there is no concept for us to extract. @@ -26,7 +28,8 @@ def _extract_concept_node(self): ( node for node in graph - if self.source_id in node.get("@id") and node["@type"] == "skos:Concept" + if self.source_id in node.get("@id", "") + and node["@type"] == "skos:Concept" ), None, ) @@ -34,7 +37,7 @@ def _extract_concept_node(self): return concept_node @staticmethod - def _extract_label(raw_label: str | dict): + def _extract_label(raw_label: str | dict[str, str] | list[str]) -> str: # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. 
if isinstance(raw_label, str): return raw_label @@ -45,7 +48,7 @@ def _extract_label(raw_label: str | dict): return raw_label["@value"] - def exclude(self): + def exclude(self) -> bool: if self._raw_concept_node is None: return True @@ -57,16 +60,20 @@ def exclude(self): return False @property - def source_id(self): + def source_id(self) -> str: return self._remove_id_prefix(self.raw_concept["@id"]) @property - def label(self): + def label(self) -> str: + assert self._raw_concept_node is not None + raw_preferred_label = self._raw_concept_node["skos:prefLabel"] return self._extract_label(raw_preferred_label) @property - def alternative_labels(self): + def alternative_labels(self) -> list[str]: + assert self._raw_concept_node is not None + raw_alternative_labels = self._raw_concept_node.get("skos:altLabel", []) # Raw alternative labels are either returned in a list of labels, or as a single label @@ -77,7 +84,9 @@ def alternative_labels(self): return [self._extract_label(raw_alternative_labels)] @property - def broader_concept_ids(self): + def broader_concept_ids(self) -> list[str]: + assert self._raw_concept_node is not None + broader_concepts = self._raw_concept_node.get("skos:broader", []) # Sometimes broader concepts are returned as a list of concepts, and sometimes as just a single JSON @@ -96,7 +105,7 @@ def broader_concept_ids(self): return broader_ids @property - def is_geographic(self): + def is_geographic(self) -> bool: if self._raw_concept_node is None: return False @@ -110,7 +119,7 @@ def is_geographic(self): return "http://id.loc.gov/datatypes/codes/gac" in notation_types @property - def source(self): + def source(self) -> Literal["lc-subjects", "lc-names"]: if "subjects" in self.raw_concept["@id"]: return "lc-subjects" diff --git a/src/transformers/transformer_type.py b/src/transformers/transformer_type.py deleted file mode 100644 index 39fd20bcc4..0000000000 --- a/src/transformers/transformer_type.py +++ /dev/null @@ -1,26 +0,0 @@ -import enum - -from .loc.concepts_transformer import LibraryOfCongressConceptsTransformer -from .loc.locations_transformer import LibraryOfCongressLocationsTransformer -from .loc.names_transformer import LibraryOfCongressNamesTransformer - -LOC_SUBJECT_HEADINGS_URL = ( - "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" -) -LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" - - -class TransformerType(enum.Enum): - LOC_CONCEPTS = LibraryOfCongressConceptsTransformer(LOC_SUBJECT_HEADINGS_URL) - LOC_NAMES = LibraryOfCongressNamesTransformer(LOC_NAMES_URL) - LOC_LOCATIONS = LibraryOfCongressLocationsTransformer( - LOC_SUBJECT_HEADINGS_URL, LOC_NAMES_URL - ) - - def __str__(self): - return self.name.lower() - - # For parsing lowercase Lambda/command line arguments - @staticmethod - def argparse(s): - return TransformerType[s.upper()] diff --git a/src/utils/aws.py b/src/utils/aws.py index 555d04b757..575dd47f4d 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -2,18 +2,20 @@ import boto3 +from clients.base_neptune_client import BaseNeptuneClient from clients.lambda_neptune_client import LambdaNeptuneClient from clients.local_neptune_client import LocalNeptuneClient -def get_secret(secret_name: str): +def get_secret(secret_name: str) -> str: secrets_manager_client = boto3.client("secretsmanager", region_name="eu-west-1") response = secrets_manager_client.get_secret_value(SecretId=secret_name) - return response["SecretString"] + secret: str = response["SecretString"] + return secret -def 
publish_batch_to_sns(topic_arn: str, queries: list[str]): +def publish_batch_to_sns(topic_arn: str, queries: list[str]) -> None: request_entries = [] for i, query in enumerate(queries): request_entries.append( @@ -30,7 +32,7 @@ def publish_batch_to_sns(topic_arn: str, queries: list[str]): ) -def get_neptune_client(is_local: bool): +def get_neptune_client(is_local: bool) -> BaseNeptuneClient: if is_local: return LocalNeptuneClient( get_secret("NeptuneTest/LoadBalancerUrl"), From 2bc9550c9f27ae1b25f0e5bbeff5a8eab31b7ba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 6 Jan 2025 16:34:24 +0000 Subject: [PATCH 030/310] Update autoformat-typecheck.yml --- .github/workflows/autoformat-typecheck.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/autoformat-typecheck.yml b/.github/workflows/autoformat-typecheck.yml index ae6a1cff7f..23a95afc22 100644 --- a/.github/workflows/autoformat-typecheck.yml +++ b/.github/workflows/autoformat-typecheck.yml @@ -46,7 +46,8 @@ jobs: - name: Run isort run: | - isort src/ + # Set the profile to "black" to prevent conflicts between black and isort + isort --profile=black src/ - name: Check for formatting changes id: check_formatting_changes From 36105d4a37d93e10d021a8d4bef3b22d01b483a0 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Mon, 6 Jan 2025 16:35:27 +0000 Subject: [PATCH 031/310] Apply auto-formatting rules --- src/extractor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/extractor.py b/src/extractor.py index 08aea8be20..f8bc9896a7 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -2,8 +2,7 @@ import os import typing -from transformers.base_transformer import (BaseTransformer, EntityType, - StreamDestination) +from transformers.base_transformer import BaseTransformer, EntityType, StreamDestination from transformers.create_transformer import TransformerType, create_transformer from utils.aws import get_neptune_client From 3e9da5fabbbf6e2837dad372615fd99e0e55923d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 7 Jan 2025 11:27:02 +0000 Subject: [PATCH 032/310] Tweak LoC raw label handling and update comment --- src/extractor.py | 8 ++++++-- src/transformers/base_transformer.py | 14 +++++++++++++- src/transformers/loc/raw_concept.py | 7 +++++-- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/extractor.py b/src/extractor.py index f8bc9896a7..65735c2ce8 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -7,8 +7,8 @@ from utils.aws import get_neptune_client CHUNK_SIZE = 256 -S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] -GRAPH_QUERIES_SNS_TOPIC_ARN = os.environ["GRAPH_QUERIES_SNS_TOPIC_ARN"] +S3_BULK_LOAD_BUCKET_NAME = os.environ.get("S3_BULK_LOAD_BUCKET_NAME") +GRAPH_QUERIES_SNS_TOPIC_ARN = os.environ.get("GRAPH_QUERIES_SNS_TOPIC_ARN") def handler( @@ -31,13 +31,17 @@ def handler( neptune_client, entity_type, CHUNK_SIZE, sample_size ) elif stream_destination == "s3": + assert S3_BULK_LOAD_BUCKET_NAME is not None file_name = f"{transformer_type}__{entity_type}.csv" s3_uri = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{file_name}" transformer.stream_to_s3(s3_uri, entity_type, CHUNK_SIZE, sample_size) elif stream_destination == "sns": + assert GRAPH_QUERIES_SNS_TOPIC_ARN is not None transformer.stream_to_sns( GRAPH_QUERIES_SNS_TOPIC_ARN, entity_type, CHUNK_SIZE, sample_size ) + elif stream_destination == "void": + transformer.stream_to_nowhere(entity_type, 
CHUNK_SIZE, sample_size) else: raise ValueError("Unsupported stream destination.") diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 231586beb2..e7ba6815e9 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -16,7 +16,7 @@ from utils.aws import publish_batch_to_sns EntityType = Literal["nodes", "edges"] -StreamDestination = Literal["graph", "s3", "sns"] +StreamDestination = Literal["graph", "s3", "sns", "void"] def _generator_to_chunks(items: Generator, chunk_size: int) -> Generator: @@ -202,3 +202,15 @@ def stream_to_sns( # Publish remaining messages (if any) if len(queries) > 0: publish_batch_to_sns(topic_arn, queries) + + def stream_to_nowhere( + self, + entity_type: EntityType, + query_chunk_size: int, + sample_size: int | None = None, + ): + """ + Streams transformed entities (nodes or edges) into the void. Useful for development and testing purposes. + """ + for chunk in self._stream_chunks(entity_type, query_chunk_size, sample_size): + pass diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index f0508a0b81..4adebc8781 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -41,9 +41,12 @@ def _extract_label(raw_label: str | dict[str, str] | list[str]) -> str: # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. if isinstance(raw_label, str): return raw_label - # Very rarely, labels are returned as a list of strings. When this happens, - # we only return the first item in the list. + + # In cases where an LoC Name has multiple labels written using different writing systems, labels are returned + # as a list. When this happens, we extract the first item in the list, which always stores the Latin script + # version of the label as a string. 
if isinstance(raw_label, list): + assert isinstance(raw_label[0], str) return raw_label[0] return raw_label["@value"] From 162bb6dadaa5ecd53ae8348bcd983326273a0ed9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 7 Jan 2025 11:29:37 +0000 Subject: [PATCH 033/310] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 871b887e72..4a0ecca1c7 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Eventually the contents of this repository might be merged into another reposito See the following RFCs for more context: * [RFC 062: Wellcome Collection Graph overview and next steps](https://github.com/wellcomecollection/docs/tree/main/rfcs/062-knowledge-graph) -* [RFC 064: Graph data model](https://github.com/wellcomecollection/docs/blob/rfc-064-graph-model/rfcs/064-graph-data-model/README.md) +* [RFC 064: Graph data model](https://github.com/wellcomecollection/docs/tree/main/rfcs/064-graph-data-model/README.md) ## Architecture overview From a4bd040ff117a20f08addf3368f657b78f051368 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 7 Jan 2025 11:27:02 +0000 Subject: [PATCH 034/310] Tweak LoC raw label handling and update comment --- src/extractor.py | 8 ++++++-- src/transformers/base_transformer.py | 14 +++++++++++++- src/transformers/loc/raw_concept.py | 7 +++++-- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/extractor.py b/src/extractor.py index 08aea8be20..0d8272eec5 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -8,8 +8,8 @@ from utils.aws import get_neptune_client CHUNK_SIZE = 256 -S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] -GRAPH_QUERIES_SNS_TOPIC_ARN = os.environ["GRAPH_QUERIES_SNS_TOPIC_ARN"] +S3_BULK_LOAD_BUCKET_NAME = os.environ.get("S3_BULK_LOAD_BUCKET_NAME") +GRAPH_QUERIES_SNS_TOPIC_ARN = os.environ.get("GRAPH_QUERIES_SNS_TOPIC_ARN") def handler( @@ -32,13 +32,17 @@ def handler( neptune_client, entity_type, CHUNK_SIZE, sample_size ) elif stream_destination == "s3": + assert S3_BULK_LOAD_BUCKET_NAME is not None file_name = f"{transformer_type}__{entity_type}.csv" s3_uri = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{file_name}" transformer.stream_to_s3(s3_uri, entity_type, CHUNK_SIZE, sample_size) elif stream_destination == "sns": + assert GRAPH_QUERIES_SNS_TOPIC_ARN is not None transformer.stream_to_sns( GRAPH_QUERIES_SNS_TOPIC_ARN, entity_type, CHUNK_SIZE, sample_size ) + elif stream_destination == "void": + transformer.stream_to_nowhere(entity_type, CHUNK_SIZE, sample_size) else: raise ValueError("Unsupported stream destination.") diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 960b951b65..8791af12bf 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -16,7 +16,7 @@ from utils.aws import publish_batch_to_sns EntityType = Literal["nodes", "edges"] -StreamDestination = Literal["graph", "s3", "sns"] +StreamDestination = Literal["graph", "s3", "sns", "void"] def _generator_to_chunks(items: Generator, chunk_size: int) -> Generator: @@ -200,3 +200,15 @@ def stream_to_sns( # Publish remaining messages (if any) if len(queries) > 0: publish_batch_to_sns(topic_arn, queries) + + def stream_to_nowhere( + self, + entity_type: EntityType, + query_chunk_size: int, + sample_size: int | None = None, + ): + """ + Streams transformed entities (nodes or edges) into the void. Useful for development and testing purposes. 
+ """ + for chunk in self._stream_chunks(entity_type, query_chunk_size, sample_size): + pass diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index f0508a0b81..4adebc8781 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -41,9 +41,12 @@ def _extract_label(raw_label: str | dict[str, str] | list[str]) -> str: # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. if isinstance(raw_label, str): return raw_label - # Very rarely, labels are returned as a list of strings. When this happens, - # we only return the first item in the list. + + # In cases where an LoC Name has multiple labels written using different writing systems, labels are returned + # as a list. When this happens, we extract the first item in the list, which always stores the Latin script + # version of the label as a string. if isinstance(raw_label, list): + assert isinstance(raw_label[0], str) return raw_label[0] return raw_label["@value"] From 7f83b0332dfddd0fff63ba0a472136f3f4d99741 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 7 Jan 2025 11:29:37 +0000 Subject: [PATCH 035/310] Update README.md --- README.md | 2 +- src/transformers/base_transformer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 871b887e72..4a0ecca1c7 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Eventually the contents of this repository might be merged into another reposito See the following RFCs for more context: * [RFC 062: Wellcome Collection Graph overview and next steps](https://github.com/wellcomecollection/docs/tree/main/rfcs/062-knowledge-graph) -* [RFC 064: Graph data model](https://github.com/wellcomecollection/docs/blob/rfc-064-graph-model/rfcs/064-graph-data-model/README.md) +* [RFC 064: Graph data model](https://github.com/wellcomecollection/docs/tree/main/rfcs/064-graph-data-model/README.md) ## Architecture overview diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 8791af12bf..0ee6310918 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -206,7 +206,7 @@ def stream_to_nowhere( entity_type: EntityType, query_chunk_size: int, sample_size: int | None = None, - ): + ) -> None: """ Streams transformed entities (nodes or edges) into the void. Useful for development and testing purposes. 
""" From 452ab2f758d5267e3367c8d2f7d510880b3a8b2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Wed, 8 Jan 2025 11:10:40 +0000 Subject: [PATCH 036/310] Improve documentation --- src/sources/base_source.py | 1 + src/transformers/base_transformer.py | 2 ++ src/transformers/loc/raw_concept.py | 4 ++++ src/utils/aws.py | 15 ++++++++++++--- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/sources/base_source.py b/src/sources/base_source.py index 9f135c668d..1da81e53cf 100644 --- a/src/sources/base_source.py +++ b/src/sources/base_source.py @@ -3,4 +3,5 @@ class BaseSource: def stream_raw(self) -> Generator[dict]: + """Returns a generator of dictionaries, each corresponding to a raw entity extracted from the source.""" raise NotImplementedError("Each source must implement a `stream_raw` method.") diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 0ee6310918..3c120992d1 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -33,11 +33,13 @@ def __init__(self) -> None: self.source: BaseSource = BaseSource() def transform_node(self, raw_node: dict) -> BaseNode | None: + """Accepts a raw node from the source dataset and returns a transformed node as a Pydantic model.""" raise NotImplementedError( "Each transformer must implement a `transform_node` method." ) def extract_edges(self, raw_node: dict) -> Generator[BaseEdge]: + """Accepts a raw node from the source dataset and returns a generator of extracted edges as Pydantic models.""" raise NotImplementedError( "Each transformer must implement an `extract_edges` method." ) diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index 4adebc8781..1dd5b8ec28 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -52,6 +52,7 @@ def _extract_label(raw_label: str | dict[str, str] | list[str]) -> str: return raw_label["@value"] def exclude(self) -> bool: + """Returns True if the concept should be excluded from the graph.""" if self._raw_concept_node is None: return True @@ -75,6 +76,7 @@ def label(self) -> str: @property def alternative_labels(self) -> list[str]: + """Returns a list of alternative labels for the concept.""" assert self._raw_concept_node is not None raw_alternative_labels = self._raw_concept_node.get("skos:altLabel", []) @@ -88,6 +90,7 @@ def alternative_labels(self) -> list[str]: @property def broader_concept_ids(self) -> list[str]: + """Returns a list of IDs representing concepts which are broader than the current concept.""" assert self._raw_concept_node is not None broader_concepts = self._raw_concept_node.get("skos:broader", []) @@ -109,6 +112,7 @@ def broader_concept_ids(self) -> list[str]: @property def is_geographic(self) -> bool: + """Returns True if the node represents a geographic concept, as determined by `skos:notation`.""" if self._raw_concept_node is None: return False diff --git a/src/utils/aws.py b/src/utils/aws.py index 575dd47f4d..cb40ac178e 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -8,6 +8,7 @@ def get_secret(secret_name: str) -> str: + """Returns an AWS Secrets Manager secret string associated with a given secret name.""" secrets_manager_client = boto3.client("secretsmanager", region_name="eu-west-1") response = secrets_manager_client.get_secret_value(SecretId=secret_name) @@ -15,13 +16,17 @@ def get_secret(secret_name: str) -> str: return secret -def publish_batch_to_sns(topic_arn: str, queries: 
list[str]) -> None: +def publish_batch_to_sns(topic_arn: str, messages: list[str]) -> None: + """Publishes a batch of up to 10 messages to the specified SNS topic.""" + + assert len(messages) <= 10 + request_entries = [] - for i, query in enumerate(queries): + for i, message in enumerate(messages): request_entries.append( { "Id": f"batch_message_{i}", - "Message": json.dumps({"default": query}), + "Message": json.dumps({"default": message}), "MessageStructure": "json", } ) @@ -33,6 +38,10 @@ def publish_batch_to_sns(topic_arn: str, queries: list[str]) -> None: def get_neptune_client(is_local: bool) -> BaseNeptuneClient: + """ + Returns an instance of LambdaNeptuneClient or LocalNeptuneClient (if `is_local` is True). LocalNeptuneClient + should only be used when connecting to the cluster from outside the VPC. + """ if is_local: return LocalNeptuneClient( get_secret("NeptuneTest/LoadBalancerUrl"), From a1ad401331aed7bf446642f1f322396674c0825c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Wed, 8 Jan 2025 10:50:32 +0000 Subject: [PATCH 037/310] Add custom domain to NLB to enable TLS termination --- src/clients/base_neptune_client.py | 11 ++---- src/clients/lambda_neptune_client.py | 4 +- src/clients/local_neptune_client.py | 9 +---- terraform/load_balancer.tf | 56 +++++++++++++++++++++++----- terraform/terraform.tf | 9 +++++ 5 files changed, 61 insertions(+), 28 deletions(-) diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 690261a494..8839653bdb 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -16,10 +16,9 @@ class BaseNeptuneClient: cluster) or LocalNeptuneClient (when connecting to the cluster from outside the VPC). """ - def __init__(self) -> None: + def __init__(self, neptune_endpoint: str) -> None: self.session: boto3.Session | None = None - self.neptune_endpoint: str | None = None - self.verify_requests: bool = True + self.neptune_endpoint: str = neptune_endpoint def _get_client_url(self) -> str: raise NotImplementedError() @@ -42,11 +41,7 @@ def _make_request( SigV4Auth(credentials, "neptune-db", "eu-west-1").add_auth(request) raw_response = requests.request( - method, - url, - data=data, - headers=dict(request.headers), - verify=self.verify_requests, + method, url, data=data, headers=dict(request.headers) ) if raw_response.status_code != 200: diff --git a/src/clients/lambda_neptune_client.py b/src/clients/lambda_neptune_client.py index 58746efef6..028985c93d 100644 --- a/src/clients/lambda_neptune_client.py +++ b/src/clients/lambda_neptune_client.py @@ -7,8 +7,8 @@ class LambdaNeptuneClient(BaseNeptuneClient): def __init__(self, neptune_endpoint: str): - super().__init__() - self.neptune_endpoint = neptune_endpoint + super().__init__(neptune_endpoint) + self.session = boto3.Session( aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), diff --git a/src/clients/local_neptune_client.py b/src/clients/local_neptune_client.py index 8db3ddd13e..fbfa9bb43c 100644 --- a/src/clients/local_neptune_client.py +++ b/src/clients/local_neptune_client.py @@ -1,20 +1,13 @@ import boto3 -import urllib3 from .base_neptune_client import BaseNeptuneClient class LocalNeptuneClient(BaseNeptuneClient): def __init__(self, load_balancer_url: str, neptune_endpoint: str): - # We are using the default NLB DNS name, which does not support custom SSL certificates, so we need to - # disable SSL certificate verification. 
This increases the risks of a man-in-the-middle attack, - # which is acceptable for a testing database. In production, we will be connecting to the database - # directly from within the VPC. - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - self.verify_requests = False + super().__init__(neptune_endpoint) self.load_balancer_url = load_balancer_url - self.neptune_endpoint = neptune_endpoint self.session = boto3.Session() def _get_client_url(self) -> str: diff --git a/terraform/load_balancer.tf b/terraform/load_balancer.tf index ba3c1b8c8e..7b08bd7f20 100644 --- a/terraform/load_balancer.tf +++ b/terraform/load_balancer.tf @@ -1,9 +1,8 @@ # A Network Load Balancer for accessing the Neptune cluster from outside of the VPC. # See https://aws-samples.github.io/aws-dbs-refarch-graph/src/connecting-using-a-load-balancer/. -# TODO: This only exists for testing purposes and should be destroyed before we switch to production. -resource "aws_lb" "neptune_experimental_network_lb" { - name = "neptune-test" +resource "aws_lb" "neptune_network_load_balancer" { + name = "catalogue-graph-neptune-nlb" internal = false load_balancer_type = "network" security_groups = [aws_security_group.neptune_lb_security_group.id] @@ -12,9 +11,9 @@ resource "aws_lb" "neptune_experimental_network_lb" { # Create a new target group and attach the IP of the Neptune cluster resource "aws_lb_target_group" "neptune_instance" { - name = "neptune-test-cluster" + name = "neptune-catalogue-graph-cluster" port = 8182 - protocol = "TCP" + protocol = "TLS" vpc_id = data.aws_vpc.vpc.id target_type = "ip" } @@ -29,12 +28,30 @@ resource "aws_lb_target_group_attachment" "neptune_instance_attachment" { target_id = "172.42.174.101" } +locals { + catalogue_graph_nlb_url = "catalogue-graph.wellcomecollection.org" +} + +# A custom certificate which will be used for TLS termination +module "catalogue_graph_nlb_certificate" { + source = "github.com/wellcomecollection/terraform-aws-acm-certificate?ref=v1.0.0" + + domain_name = local.catalogue_graph_nlb_url + zone_id = data.aws_route53_zone.weco_zone.id + + providers = { + aws = aws + aws.dns = aws.dns + } +} -# Forward traffic to the Neptune target group +# Terminate TLS using the custom certificate and forward traffic to the Neptune target group resource "aws_lb_listener" "listener" { - load_balancer_arn = aws_lb.neptune_experimental_network_lb.arn - port = "8182" - protocol = "TCP" + load_balancer_arn = aws_lb.neptune_network_load_balancer.arn + port = "443" + protocol = "TLS" + + certificate_arn = module.catalogue_graph_nlb_certificate.arn default_action { type = "forward" @@ -66,6 +83,25 @@ resource "aws_secretsmanager_secret" "neptune_nlb_url" { resource "aws_secretsmanager_secret_version" "neptune_nlb_endpoint_url" { secret_id = aws_secretsmanager_secret.neptune_nlb_url.id - secret_string = "https://${aws_lb.neptune_experimental_network_lb.dns_name}:8182" + secret_string = "https://${local.catalogue_graph_nlb_url}" +} + +data "aws_route53_zone" "weco_zone" { + provider = aws.dns + name = "wellcomecollection.org." 
} +# Add an alias A record to the wellcomecollection.org hosted zone, which maps the catalogue graph domain name +# to the NLB +resource "aws_route53_record" "catalogue_graph_nlb_record" { + provider = aws.dns + zone_id = data.aws_route53_zone.weco_zone.id + name = local.catalogue_graph_nlb_url + type = "A" + + alias { + name = aws_lb.neptune_network_load_balancer.dns_name + zone_id = aws_lb.neptune_network_load_balancer.zone_id + evaluate_target_health = false + } +} diff --git a/terraform/terraform.tf b/terraform/terraform.tf index e6835e41c0..b7f6f11b4f 100644 --- a/terraform/terraform.tf +++ b/terraform/terraform.tf @@ -21,3 +21,12 @@ data "terraform_remote_state" "aws_account_infrastructure" { region = "eu-west-1" } } + +provider "aws" { + region = "eu-west-1" + alias = "dns" + + assume_role { + role_arn = "arn:aws:iam::267269328833:role/wellcomecollection-assume_role_hosted_zone_update" + } +} From 217866e3d7a936d4f50040f2d91e4aa220bc9f46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Wed, 8 Jan 2025 14:58:50 +0000 Subject: [PATCH 038/310] Update README.md --- README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index 4a0ecca1c7..e4af8792bd 100644 --- a/README.md +++ b/README.md @@ -128,3 +128,27 @@ MATCH (n) RETURN count(*) result = neptune_client.run_open_cypher_query(query) print(result) ``` + +Additionally, it is possible to connect to the cluster using [AWS graph notebook](https://github.com/aws/graph-notebook) +with the following configuration: +``` +%%graph_notebook_config +{ + "host": , + "neptune_service": "neptune-db", + "port": 8182, + "ssl": true, + "proxy_port": 443, + "proxy_host": "catalogue-graph.wellcomecollection.org", + "auth_mode": "IAM", + "aws_region": "eu-west-1", + "load_from_s3_arn": "" +} +``` + +To communicate with the cluster, the AWS_PROFILE environment variable first needs to be set like this in the same +Jupyter notebook: +``` +%env AWS_PROFILE=platform-developer +``` + From 24b604d03355568c1f6ed9c3fd129eb36b90f6f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 9 Jan 2025 16:52:30 +0000 Subject: [PATCH 039/310] Add Wikidata source & transformer --- src/extractor.py | 10 ++- src/sources/wikidata/__init__.py | 0 src/sources/wikidata/concepts_source.py | 48 +++++++++++++ src/sources/wikidata/sparql_client.py | 23 ++++++ src/sources/wikidata/sparql_query_builder.py | 63 +++++++++++++++++ src/transformers/create_transformer.py | 5 +- src/transformers/wikidata/__init__.py | 0 .../wikidata/concepts_transformer.py | 21 ++++++ src/transformers/wikidata/raw_concept.py | 70 +++++++++++++++++++ 9 files changed, 237 insertions(+), 3 deletions(-) create mode 100644 src/sources/wikidata/__init__.py create mode 100644 src/sources/wikidata/concepts_source.py create mode 100644 src/sources/wikidata/sparql_client.py create mode 100644 src/sources/wikidata/sparql_query_builder.py create mode 100644 src/transformers/wikidata/__init__.py create mode 100644 src/transformers/wikidata/concepts_transformer.py create mode 100644 src/transformers/wikidata/raw_concept.py diff --git a/src/extractor.py b/src/extractor.py index 65735c2ce8..c5b3a6c79a 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -31,12 +31,18 @@ def handler( neptune_client, entity_type, CHUNK_SIZE, sample_size ) elif stream_destination == "s3": - assert S3_BULK_LOAD_BUCKET_NAME is not None + assert ( + S3_BULK_LOAD_BUCKET_NAME is not None + ), "To stream to S3, the 
S3_BULK_LOAD_BUCKET_NAME environment variable must be defined." + file_name = f"{transformer_type}__{entity_type}.csv" s3_uri = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{file_name}" transformer.stream_to_s3(s3_uri, entity_type, CHUNK_SIZE, sample_size) elif stream_destination == "sns": - assert GRAPH_QUERIES_SNS_TOPIC_ARN is not None + assert ( + GRAPH_QUERIES_SNS_TOPIC_ARN is not None + ), "To stream to SNS, the GRAPH_QUERIES_SNS_TOPIC_ARN environment variable must be defined." + transformer.stream_to_sns( GRAPH_QUERIES_SNS_TOPIC_ARN, entity_type, CHUNK_SIZE, sample_size ) diff --git a/src/sources/wikidata/__init__.py b/src/sources/wikidata/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/sources/wikidata/concepts_source.py b/src/sources/wikidata/concepts_source.py new file mode 100644 index 0000000000..90832968ee --- /dev/null +++ b/src/sources/wikidata/concepts_source.py @@ -0,0 +1,48 @@ +from collections.abc import Generator + +from sources.base_source import BaseSource +from .sparql_client import WikidataSparqlClient +from .sparql_query_builder import SparqlQueryBuilder, NodeType, LinkedSource + +WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" + + +class WikidataConceptsSource(BaseSource): + def __init__(self, node_type: NodeType, linked_source: LinkedSource): + self.client = WikidataSparqlClient() + self.node_type = node_type + self.linked_source = linked_source + + def _get_all_wikidata_ids(self) -> list[str]: + """Returns the IDs of all Wikidata items which reference a Library of Congress ID. + There are currently about 1.6 million such items, and the query takes ~1 minute to run. + """ + + loc_ids_query = """ + SELECT DISTINCT ?item WHERE { + ?item p:P244 ?statement0. + ?statement0 ps:P244 _:anyValueP244. + } + """ + + items = self.client.run_query(loc_ids_query) + + raw_ids: list[str] = [item["item"]["value"] for item in items] + ids = [raw_id.removeprefix(WIKIDATA_ID_PREFIX) for raw_id in raw_ids] + + return ids + + def stream_raw(self) -> Generator[dict]: + all_ids = self._get_all_wikidata_ids() + + chunk_size = 300 + + for i in range(0, len(all_ids), chunk_size): + chunk = all_ids[i : i + chunk_size] + query = SparqlQueryBuilder.get_items_query( + chunk, self.node_type, self.linked_source + ) + items = self.client.run_query(query) + + for item in items: + yield item diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py new file mode 100644 index 0000000000..f74f63193e --- /dev/null +++ b/src/sources/wikidata/sparql_client.py @@ -0,0 +1,23 @@ +import requests + + +class WikidataSparqlClient: + @staticmethod + def _get_user_agent_header(): + # https://foundation.wikimedia.org/wiki/Policy:Wikimedia_Foundation_User-Agent_Policy + return ( + "WellcomeCollectionCatalogueGraphPipeline/0.1 (https://wellcomecollection.org/; " + "digital@wellcomecollection.org) wellcome-collection-catalogue-graph/0.1" + ) + + def run_query(self, query: str) -> list[dict]: + r = requests.get( + "https://query.wikidata.org/sparql", + params={"format": "json", "query": query}, + headers={"User-Agent": self._get_user_agent_header()}, + ) + + if r.status_code != 200: + raise Exception(r.content) + + return r.json()["results"]["bindings"] diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py new file mode 100644 index 0000000000..668cbeba18 --- /dev/null +++ b/src/sources/wikidata/sparql_query_builder.py @@ -0,0 +1,63 @@ +from typing import Literal + +NodeType = Literal["concepts", "names", 
"locations"] +LinkedSource = Literal["mesh", "loc"] + + +class SparqlQueryBuilder: + @staticmethod + def _get_formatted_fields(node_type: NodeType, linked_source: LinkedSource): + fields = ["?item", "?itemLabel", "?itemDescription", "?itemAltLabel"] + + if node_type == "names": + fields += ["?dateOfBirthLabel", "?dateOfDeathLabel", "?placeOfBirthLabel"] + elif node_type == "locations": + fields += ["?coordinateLocation"] + + if linked_source == "loc": + fields.append("?libraryOfCongressId") + elif linked_source == "mesh": + fields.append("?meshId") + + return " ".join(fields) + + @staticmethod + def _get_formatted_field_definitions( + node_type: NodeType, linked_source: Literal["mesh", "loc"] + ): + definitions = [] + + if node_type == "names": + definitions += [ + "OPTIONAL { ?item wdt:P569 ?dateOfBirth. }", + "OPTIONAL { ?item wdt:P570 ?dateOfDeath. }", + "OPTIONAL { ?item wdt:P19 ?placeOfBirth. }", + ] + elif node_type == "locations": + definitions += ["OPTIONAL {{ ?item wdt:P625 ?coordinateLocation. }}"] + + if linked_source == "loc": + definitions.append("?item wdt:P244 ?libraryOfCongressId.") + elif linked_source == "mesh": + definitions.append("?item wdt:P486 ?meshId.") + + return "\n".join(definitions) + + @classmethod + def get_items_query( + cls, item_ids: list[str], node_type: NodeType, linked_source: LinkedSource + ): + ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in item_ids]) + + query = f""" + SELECT DISTINCT {cls._get_formatted_fields(node_type, linked_source)} + WHERE {{ + SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} + + VALUES ?item {{ {ids_clause} }} + + {cls._get_formatted_field_definitions(node_type, linked_source)} + }} + """ + + return query diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py index aa2a139c6e..31e21b47ce 100644 --- a/src/transformers/create_transformer.py +++ b/src/transformers/create_transformer.py @@ -4,13 +4,14 @@ from .loc.concepts_transformer import LibraryOfCongressConceptsTransformer from .loc.locations_transformer import LibraryOfCongressLocationsTransformer from .loc.names_transformer import LibraryOfCongressNamesTransformer +from .wikidata.concepts_transformer import WikidataConceptsTransformer LOC_SUBJECT_HEADINGS_URL = ( "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" ) LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" -TransformerType = Literal["loc_concepts", "loc_names", "loc_locations"] +TransformerType = Literal["loc_concepts", "loc_names", "loc_locations", "wikidata"] def create_transformer(transformer_type: TransformerType) -> BaseTransformer: @@ -22,5 +23,7 @@ def create_transformer(transformer_type: TransformerType) -> BaseTransformer: return LibraryOfCongressLocationsTransformer( LOC_SUBJECT_HEADINGS_URL, LOC_NAMES_URL ) + if transformer_type == "wikidata": + return WikidataConceptsTransformer() raise ValueError(f"Unknown transformer type: {transformer_type}") diff --git a/src/transformers/wikidata/__init__.py b/src/transformers/wikidata/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py new file mode 100644 index 0000000000..22da380335 --- /dev/null +++ b/src/transformers/wikidata/concepts_transformer.py @@ -0,0 +1,21 @@ +from models.graph_node import SourceConcept +from sources.wikidata.concepts_source import WikidataConceptsSource +from transformers.base_transformer import 
BaseTransformer + +from .raw_concept import RawWikidataConcept + + +class WikidataConceptsTransformer(BaseTransformer): + def __init__(self): + self.source = WikidataConceptsSource("concepts", "loc") + + def transform_node(self, raw_node: dict) -> SourceConcept | None: + raw_concept = RawWikidataConcept(raw_node) + + return SourceConcept( + id=raw_concept.source_id, + label=raw_concept.label, + source=raw_concept.source, + alternative_labels=raw_concept.alternative_labels, + description=raw_concept.description, + ) diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py new file mode 100644 index 0000000000..0a6d7c4a0c --- /dev/null +++ b/src/transformers/wikidata/raw_concept.py @@ -0,0 +1,70 @@ +from typing import Literal + +# { +# "country": {"type": "uri", "value": "http://www.wikidata.org/entity/Q183"}, +# "countryLabel": {"xml:lang": "en", "type": "literal", "value": "Germany"}, +# "countryDescription": { +# "xml:lang": "en", +# "type": "literal", +# "value": "country in Central Europe", +# }, +# "countryAltLabel": { +# "xml:lang": "en", +# "type": "literal", +# "value": "BR Deutschland, Bundesrepublik Deutschland, Deutschland, Federal Republic of Germany", +# }, +# "coordinate_location": { +# "datatype": "http://www.opengis.net/ont/geosparql#wktLiteral", +# "type": "literal", +# "value": "Point(9.83 53.54)", +# }, +# } + +WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" + + +class RawWikidataConcept: + def __init__(self, raw_concept: dict): + self.raw_concept = raw_concept + + def _extract_english_field_value(self, field_name: str): + field = self.raw_concept[field_name] + + # assert field["xml:lang"] == "en" + assert field["type"] == "literal" + + return field["value"] + + @staticmethod + def _remove_id_prefix(raw_id: str) -> str: + return raw_id.removeprefix(WIKIDATA_ID_PREFIX) + + @property + def source_id(self) -> str: + item_field = self.raw_concept["item"] + assert item_field["type"] == "uri" + return self._remove_id_prefix(item_field["value"]) + + @property + def label(self) -> str: + return self._extract_english_field_value("itemLabel") + + @property + def alternative_labels(self) -> list[str]: + """Returns a list of alternative labels for the concept.""" + if "itemAltLabel" not in self.raw_concept: + return [] + + raw_alternative_labels = self._extract_english_field_value("itemAltLabel") + return raw_alternative_labels.split(", ") + + @property + def description(self) -> str | None: + if "itemDescription" not in self.raw_concept: + return None + + return self._extract_english_field_value("itemDescription") + + @property + def source(self) -> Literal["wikidata"]: + return "wikidata" From 65e148e189b2436dce7fd4f32ea966d13136c864 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Fri, 10 Jan 2025 14:51:33 +0000 Subject: [PATCH 040/310] Add xml source --- src/sources/gzip_source.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/sources/gzip_source.py b/src/sources/gzip_source.py index 59755a9366..63a3c9d5e4 100644 --- a/src/sources/gzip_source.py +++ b/src/sources/gzip_source.py @@ -1,22 +1,41 @@ import gzip import json from collections.abc import Generator +from typing import Literal, Union +import xml.etree.ElementTree as ET import requests from .base_source import BaseSource class GZipSource(BaseSource): - def __init__(self, url: str): + def __init__(self, url: str, ftype: Literal["json", "xml"] = "json"): self.url = url + self.ftype = ftype - def stream_raw(self) -> 
Generator[dict]:
-        response = requests.get(self.url, stream=True)
-
+    def _stream_raw_json(self, response: requests.Response) -> Generator[dict]:
         with gzip.GzipFile(fileobj=response.raw) as file:
             for line_bytes in file:
                 yield json.loads(line_bytes.decode("utf8"))
+
+    def _stream_raw_xml(self, response: requests.Response) -> Generator[ET.Element]:
+        response.raw.decode_content = True
+        events = ET.iterparse(response.raw)
+        for _, elem in events:
+            yield elem
+
+    def stream_raw(self) -> Generator[Union[dict, ET.Element]]:
+        response = requests.get(self.url, stream=True)
+
+        if self.ftype == "json":
+            return self._stream_raw_json(response)
+
+        elif self.ftype == "xml":
+            return self._stream_raw_xml(response)
+
+        else:
+            raise ValueError("Unknown file type.")
 
 
 class MultiGZipSource(BaseSource):

From eed169335fdfc45555c5ac1fac85459118403d8c Mon Sep 17 00:00:00 2001
From: Antonia Langfelder
Date: Fri, 10 Jan 2025 15:09:55 +0000
Subject: [PATCH 041/310] Add RELATED_TO edge

---
 src/models/graph_edge.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/models/graph_edge.py b/src/models/graph_edge.py
index 1c352fa48a..85b1211db4 100644
--- a/src/models/graph_edge.py
+++ b/src/models/graph_edge.py
@@ -16,3 +16,10 @@ class SourceConceptNarrowerThan(BaseEdge):
     to_type: str = "SourceConcept"
     relationship: str = "NARROWER_THAN"
     directed: bool = True
+
+
+class SourceConceptRelatedTo(BaseEdge):
+    from_type: str = "SourceConcept"
+    to_type: str = "SourceConcept"
+    relationship: str = "RELATED_TO"
+    directed: bool = False

From 9897907405975b02f15b2f6bf54e46d80692a719 Mon Sep 17 00:00:00 2001
From: Antonia Langfelder
Date: Fri, 10 Jan 2025 15:28:14 +0000
Subject: [PATCH 042/310] Remove import

---
 src/sources/gzip_source.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/sources/gzip_source.py b/src/sources/gzip_source.py
index 63a3c9d5e4..9f8b2fe1c3 100644
--- a/src/sources/gzip_source.py
+++ b/src/sources/gzip_source.py
@@ -1,7 +1,7 @@
 import gzip
 import json
 from collections.abc import Generator
-from typing import Literal, Union
+from typing import Literal
 import xml.etree.ElementTree as ET
 
 import requests
@@ -25,7 +25,7 @@ def _stream_raw_xml(self, response: requests.Response) -> Generator[ET.Element]:
         for _, elem in events:
             yield elem
 
-    def stream_raw(self) -> Generator[Union[dict, ET.Element]]:
+    def stream_raw(self) -> Generator[dict | ET.Element]:
         response = requests.get(self.url, stream=True)
 
         if self.ftype == "json":

From 835f40b0d24a3f09751b383d1bb522ea9dddbb98 Mon Sep 17 00:00:00 2001
From: Antonia Langfelder
Date: Fri, 10 Jan 2025 16:19:29 +0000
Subject: [PATCH 043/310] Add HAS_PARENT edge

---
 src/models/graph_edge.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/models/graph_edge.py b/src/models/graph_edge.py
index 85b1211db4..7024f9a2f0 100644
--- a/src/models/graph_edge.py
+++ b/src/models/graph_edge.py
@@ -23,3 +23,10 @@ class SourceConceptRelatedTo(BaseEdge):
     to_type: str = "SourceConcept"
     relationship: str = "RELATED_TO"
     directed: bool = False
+
+
+class SourceConceptHasParent(BaseEdge):
+    from_type: str = "SourceConcept"
+    to_type: str = "SourceConcept"
+    relationship: str = "HAS_PARENT"
+    directed: bool = True

From d53de6cbbe01bbf657e2988f856afe0074a31728 Mon Sep 17 00:00:00 2001
From: Antonia Langfelder
Date: Fri, 10 Jan 2025 20:49:26 +0000
Subject: [PATCH 044/310] Refactor MeSH source

---
 src/sources/gzip_source.py          | 27 ++++-----------------------
 src/sources/mesh/__init__.py        |  0
 src/sources/mesh/concepts_source.py | 18 ++++++++++++++++++
 3 files changed, 22 insertions(+), 23 deletions(-)
 create mode 100644 src/sources/mesh/__init__.py
 create mode 100644 src/sources/mesh/concepts_source.py

diff --git a/src/sources/gzip_source.py b/src/sources/gzip_source.py
index 9f8b2fe1c3..59755a9366 100644
--- a/src/sources/gzip_source.py
+++ b/src/sources/gzip_source.py
@@ -1,41 +1,22 @@
 import gzip
 import json
 from collections.abc import Generator
-from typing import Literal
-import xml.etree.ElementTree as ET
 
 import requests
 
 from .base_source import BaseSource
 
 
 class GZipSource(BaseSource):
-    def __init__(self, url: str, ftype: Literal["json", "xml"] = "json"):
+    def __init__(self, url: str):
         self.url = url
-        self.ftype = ftype
 
-    def _stream_raw_json(self, response: requests.Response) -> Generator[dict]:
+    def stream_raw(self) -> Generator[dict]:
+        response = requests.get(self.url, stream=True)
+
         with gzip.GzipFile(fileobj=response.raw) as file:
             for line_bytes in file:
                 yield json.loads(line_bytes.decode("utf8"))
-
-    def _stream_raw_xml(self, response: requests.Response) -> Generator[ET.Element]:
-        response.raw.decode_content = True
-        events = ET.iterparse(response.raw)
-        for _, elem in events:
-            yield elem
-
-    def stream_raw(self) -> Generator[dict | ET.Element]:
-        response = requests.get(self.url, stream=True)
-
-        if self.ftype == "json":
-            return self._stream_raw_json(response)
-
-        elif self.ftype == "xml":
-            return self._stream_raw_xml(response)
-
-        else:
-            raise ValueError("Unknown file type.")
 
 
 class MultiGZipSource(BaseSource):
diff --git a/src/sources/mesh/__init__.py b/src/sources/mesh/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/sources/mesh/concepts_source.py b/src/sources/mesh/concepts_source.py
new file mode 100644
index 0000000000..af89af980d
--- /dev/null
+++ b/src/sources/mesh/concepts_source.py
@@ -0,0 +1,18 @@
+from collections.abc import Generator
+import requests
+import xml.etree.ElementTree as ET
+
+from sources.base_source import BaseSource
+
+class MeSHConceptsSource(BaseSource):
+    def __init__(self, url: str):
+        self.url = url
+
+    def stream_raw(self) -> Generator[ET.Element]:
+        response = requests.get(self.url, stream=True)
+        response.raw.decode_content = True
+
+        events = ET.iterparse(response.raw)
+        return (
+            elem for _, elem in events if elem.tag == "DescriptorRecord"
+        )

From c9a8ec1b0422870ee8c8dcf9b703d72aa157dc43 Mon Sep 17 00:00:00 2001
From: Antonia Langfelder
Date: Mon, 13 Jan 2025 11:32:47 +0000
Subject: [PATCH 045/310] Add raw MeSH concept

---
 src/transformers/mesh/__init__.py    |  0
 src/transformers/mesh/raw_concept.py | 79 ++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 src/transformers/mesh/__init__.py
 create mode 100644 src/transformers/mesh/raw_concept.py

diff --git a/src/transformers/mesh/__init__.py b/src/transformers/mesh/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py
new file mode 100644
index 0000000000..4526d7cbbb
--- /dev/null
+++ b/src/transformers/mesh/raw_concept.py
@@ -0,0 +1,79 @@
+import requests
+import xml.etree.ElementTree as ET
+
+ID_PREFIX = "http://id.nlm.nih.gov/mesh/"
+
+
+class RawMeSHConcept:
+    def __init__(self, raw_concept: ET.Element):
+        self.raw_concept = raw_concept
+        self.source = "mesh"
+
+    @staticmethod
+    def _remove_id_prefix(raw_id: str) -> str:
+        """Removes prefix from MeSH descriptor (only present in JSON)."""
+
+        return raw_id.removeprefix(ID_PREFIX)
+
+    @property
+    def source_id(self) -> str:
+        """Returns MeSH descriptor (unique ID)."""
+
+        return self.raw_concept.find("DescriptorUI").text
+
+    @property
+    def label(self) -> str:
+        """Returns the concept label."""
+
+        return self.raw_concept.find('DescriptorName//String').text
+
+    @property
+    def alternative_labels(self) -> list[str] | None:
+        """Returns a list of alternative labels for the concept."""
+        altern_labels = []
+
+        for altern_concept in self.raw_concept.findall("ConceptList//Concept[@PreferredConceptYN='N']"):
+            altern_labels.append(altern_concept.find("ConceptName//String").text)
+
+        return altern_labels
+
+    @property
+    def alternative_ids(self) -> list[str]:
+        """Returns a list of MeSH tree numbers for the concept."""
+
+        return [treenum.text for treenum in self.raw_concept.findall("TreeNumberList//TreeNumber")]
+
+    @property
+    def description(self) -> str | None:
+        """Returns the preferred term's scope note (free-text narrative of its scope and meaning)."""
+
+        scope_note = self.raw_concept.find("ConceptList//Concept[@PreferredConceptYN='Y']//ScopeNote")
+        if scope_note is not None:
+            return scope_note.text
+
+        return scope_note
+
+    @staticmethod
+    def fetch_mesh(source_id: str) -> dict:
+        """Fetch JSON containing RDF data for a given MeSH concept."""
+
+        response = requests.get(f"https://id.nlm.nih.gov/mesh/{source_id}.json")
+        return response.json()
+
+    @property
+    def parent_concept_ids(self) -> list[str]:
+        """Extract parent MeSH descriptors from JSON."""
+
+        mesh_data = self.fetch_mesh(self.source_id)
+        broader_desc = mesh_data.get("broaderDescriptor", [])
+
+        if not isinstance(broader_desc, list):
+            broader_desc = [broader_desc]
+
+        return [self._remove_id_prefix(desc) for desc in broader_desc]
+
+    @property
+    def is_geographic(self) -> bool:
+        """Returns True if the node represents a geographic concept, as determined by `DescriptorClass`."""
+
+        return self.raw_concept.attrib["DescriptorClass"] == "4"

From 0cef1a2e1b39d6ea861c34cc7a88b43ca92e972d Mon Sep 17 00:00:00 2001
From: Antonia Langfelder
Date: Mon, 13 Jan 2025 11:52:04 +0000
Subject: [PATCH 046/310] Add related MeSH concepts

---
 src/transformers/mesh/raw_concept.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py
index 4526d7cbbb..e9ea0a5af6 100644
--- a/src/transformers/mesh/raw_concept.py
+++ b/src/transformers/mesh/raw_concept.py
@@ -71,6 +71,14 @@ def parent_concept_ids(self) -> list[str]:
             broader_desc = [broader_desc]
 
         return [self._remove_id_prefix(desc) for desc in broader_desc]
+
+    @property
+    def related_concept_ids(self) -> list[str]:
+        """Extract related MeSH descriptors."""
+
+        related_desc = self.raw_concept.findall("SeeRelatedDescriptor")
+
+        return [desc.find("DescriptorReferredTo//DescriptorUI").text for desc in related_desc]
 
     @property
     def is_geographic(self) -> bool:

From 10dbe9689aa78611e842ab4904156ee9b19233b9 Mon Sep 17 00:00:00 2001
From: Antonia Langfelder
Date: Mon, 13 Jan 2025 11:54:18 +0000
Subject: [PATCH 047/310] Add MeSH concepts transformer

---
 src/transformers/mesh/concepts_transformer.py | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 src/transformers/mesh/concepts_transformer.py

diff --git a/src/transformers/mesh/concepts_transformer.py b/src/transformers/mesh/concepts_transformer.py
new file mode 100644
index 0000000000..0f1f1a1ce3
--- /dev/null
+++ b/src/transformers/mesh/concepts_transformer.py
@@ -0,0 +1,44 @@
+from collections.abc import Generator
+
+from models.graph_edge import SourceConceptHasParent, SourceConceptRelatedTo
+from models.graph_node import SourceConcept
+from sources.mesh.concepts_source import MeSHConceptsSource
+from transformers.base_transformer import BaseTransformer
+
+from .raw_concept import RawMeSHConcept
+
+
+class MeSHConceptsTransformer(BaseTransformer):
+    def __init__(self, url: str):
+        self.source = MeSHConceptsSource(url)
+
+    def transform_node(self, raw_node: dict) -> SourceConcept | None:
+        raw_concept = RawMeSHConcept(raw_node)
+
+        if raw_concept.is_geographic:
+            return None
+
+        return SourceConcept(
+            id=raw_concept.source_id,
+            label=raw_concept.label,
+            source=raw_concept.source,
+            alternative_labels=raw_concept.alternative_labels,
+            alternative_ids=raw_concept.alternative_ids,
+            description=raw_concept.description
+        )
+
+    def extract_edges(self, raw_node: dict) -> Generator[SourceConceptHasParent]:
+        raw_concept = RawMeSHConcept(raw_node)
+
+        if raw_concept.is_geographic:
+            yield from ()
+
+        for parent_id in raw_concept.parent_concept_ids:
+            yield SourceConceptHasParent(
+                from_id=raw_concept.source_id, to_id=parent_id
+            )
+
+        for related_id in raw_concept.related_concept_ids:
+            yield SourceConceptRelatedTo(
+                from_id=raw_concept.source_id, to_id=related_id
+            )

From d75072dc9e170744587e42f1061bbe26b871960a Mon Sep 17 00:00:00 2001
From: Antonia Langfelder
Date: Mon, 13 Jan 2025 14:08:50 +0000
Subject: [PATCH 048/310] Fix typing issues

---
 src/transformers/base_transformer.py          |  5 ++--
 src/transformers/mesh/concepts_transformer.py |  5 ++--
 src/transformers/mesh/raw_concept.py          | 30 ++++++++++++++-----
 3 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py
index 96cd0b7490..9228e9df3e 100644
--- a/src/transformers/base_transformer.py
+++ b/src/transformers/base_transformer.py
@@ -3,6 +3,7 @@
 from collections.abc import Generator
 from itertools import islice
 from typing import Literal
+import xml.etree.ElementTree as ET
 
 import boto3
 import smart_open
@@ -32,13 +33,13 @@ class BaseTransformer:
     def __init__(self) -> None:
         self.source: BaseSource = BaseSource()
 
-    def transform_node(self, raw_node: dict) -> BaseNode | None:
+    def transform_node(self, raw_node: dict | ET.Element) -> BaseNode | None:
         """Accepts a raw node from the source dataset and returns a transformed node as a Pydantic model."""
         raise NotImplementedError(
             "Each transformer must implement a `transform_node` method."
         )
 
-    def extract_edges(self, raw_node: dict) -> Generator[BaseEdge]:
+    def extract_edges(self, raw_node: dict | ET.Element) -> Generator[BaseEdge]:
         """Accepts a raw node from the source dataset and returns a generator of extracted edges as Pydantic models."""
         raise NotImplementedError(
             "Each transformer must implement an `extract_edges` method."
diff --git a/src/transformers/mesh/concepts_transformer.py b/src/transformers/mesh/concepts_transformer.py index 0f1f1a1ce3..504ae541f3 100644 --- a/src/transformers/mesh/concepts_transformer.py +++ b/src/transformers/mesh/concepts_transformer.py @@ -1,4 +1,5 @@ from collections.abc import Generator +import xml.etree.ElementTree as ET from models.graph_edge import SourceConceptHasParent, SourceConceptRelatedTo from models.graph_node import SourceConcept @@ -12,7 +13,7 @@ class MeSHConceptsTransformer(BaseTransformer): def __init__(self, url: str): self.source = MeSHConceptsSource(url) - def transform_node(self, raw_node: dict) -> SourceConcept | None: + def transform_node(self, raw_node: ET.Element) -> SourceConcept | None: raw_concept = RawMeSHConcept(raw_node) if raw_concept.is_geographic: @@ -27,7 +28,7 @@ def transform_node(self, raw_node: dict) -> SourceConcept | None: description=raw_concept.description ) - def extract_edges(self, raw_node: dict) -> Generator[SourceConceptHasParent]: + def extract_edges(self, raw_node: dict) -> Generator[SourceConceptHasParent | SourceConceptRelatedTo]: raw_concept = RawMeSHConcept(raw_node) if raw_concept.is_geographic: diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index e9ea0a5af6..049dfd4612 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -7,23 +7,28 @@ class RawMeSHConcept: def __init__(self, raw_concept: ET.Element): self.raw_concept = raw_concept - self.source = "mesh" + self.source = "nlm-mesh" @staticmethod def _remove_id_prefix(raw_id: str) -> str: """Removes prefix from MeSH descriptor (only present in JSON).""" - return raw_id.removeprefix(ID_PREFIX) @property def source_id(self) -> str: """Returns MeSH descriptor (unique ID).""" + desc_elem = self.raw_concept.find("DescriptorUI") + + assert(isinstance(desc_elem, ET.Element)) return self.raw_concept.find("DescriptorUI").text @property def label(self) -> str: """Returns the concept label.""" + label_elem = self.raw_concept.find('DescriptorName//String') + + assert(isinstance(label_elem, ET.Element)) return self.raw_concept.find('DescriptorName//String').text @@ -33,15 +38,21 @@ def alternative_labels(self) -> list[str] | None: altern_labels = [] for altern_concept in self.raw_concept.findall("ConceptList//Concept[@PreferredConceptYN='N']"): - altern_labels.append(altern_concept.find("ConceptName//String").text) + altern_label = altern_concept.find("ConceptName//String") + assert(isinstance(altern_label, ET.Element)) + altern_labels.append(altern_label.text) return altern_labels @property def alternative_ids(self) -> list[str]: """Returns a list of MeSH tree numbers for the concept.""" + treenums = [] + for treenum_elem in self.raw_concept.findall("TreeNumberList//TreeNumber"): + assert(isinstance(treenum_elem, ET.Element)) + treenums.append(treenum_elem.text) - return [treenum.text for treenum in self.raw_concept.findall("TreeNumberList//TreeNumber")] + return treenums @property def description(self) -> str | None: @@ -54,7 +65,7 @@ def description(self) -> str | None: return scope_note @staticmethod - def fetch_mesh(source_id: str) -> dict: + def fetch_mesh(source_id: str) -> dict[str, str | list]: """Fetch JSON containing RDF data for a given MeSH concept.""" response = requests.get(f"https://id.nlm.nih.gov/mesh/{source_id}.json") @@ -76,12 +87,15 @@ def parent_concept_ids(self) -> list[str]: def related_concept_ids(self) -> list[str]: """Extract related MeSH descriptors.""" - related_desc = 
self.raw_concept.findall("SeeRelatedDescriptor") + related_descriptors = [] + for desc in self.raw_concept.findall("SeeRelatedDescriptor//DescriptorReferredTo//DescriptorUI"): + assert(isinstance(desc, ET.Element)) + related_descriptors.append(desc.text) - return [desc.find("DescriptorReferredTo//DescriptorUI").text for desc in related_desc] + return related_descriptors @property def is_geographic(self) -> bool: """Returns True if the node represents a geographic concept, as determined by `DescriptorClass`.""" - return self.raw_concept.attrib["DescriptorClass"] == "4" + return self.raw_concept.attrib.get("DescriptorClass") == "4" From f56802241a1a91f0e5dd783650a3a44deabf6b24 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 13 Jan 2025 14:19:00 +0000 Subject: [PATCH 049/310] Add xml type to transformers --- src/transformers/loc/concepts_transformer.py | 5 +++-- src/transformers/loc/locations_transformer.py | 5 +++-- src/transformers/loc/names_transformer.py | 5 +++-- src/transformers/mesh/concepts_transformer.py | 4 ++-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index 783d30f516..8b51dd303f 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -1,4 +1,5 @@ from collections.abc import Generator +import xml.etree.ElementTree as ET from models.graph_edge import SourceConceptNarrowerThan from models.graph_node import SourceConcept @@ -12,7 +13,7 @@ class LibraryOfCongressConceptsTransformer(BaseTransformer): def __init__(self, url: str): self.source = GZipSource(url) - def transform_node(self, raw_node: dict) -> SourceConcept | None: + def transform_node(self, raw_node: dict | ET.Element) -> SourceConcept | None: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: @@ -25,7 +26,7 @@ def transform_node(self, raw_node: dict) -> SourceConcept | None: alternative_labels=raw_concept.alternative_labels, ) - def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan]: + def extract_edges(self, raw_node: dict | ET.Element) -> Generator[SourceConceptNarrowerThan]: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index 5ff7f430e9..0ab72ea388 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -1,4 +1,5 @@ from collections.abc import Generator +import xml.etree.ElementTree as ET from models.graph_edge import SourceConceptNarrowerThan from models.graph_node import SourceLocation @@ -12,7 +13,7 @@ class LibraryOfCongressLocationsTransformer(BaseTransformer): def __init__(self, subject_headings_url: str, names_url: str): self.source = MultiGZipSource([subject_headings_url, names_url]) - def transform_node(self, raw_node: dict) -> SourceLocation | None: + def transform_node(self, raw_node: dict | ET.Element) -> SourceLocation | None: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: @@ -25,7 +26,7 @@ def transform_node(self, raw_node: dict) -> SourceLocation | None: alternative_labels=raw_concept.alternative_labels, ) - def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan]: + def extract_edges(self, raw_node: dict | ET.Element) -> 
Generator[SourceConceptNarrowerThan]: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 7206d30039..94616ef1b7 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -1,4 +1,5 @@ from collections.abc import Generator +import xml.etree.ElementTree as ET from models.graph_edge import BaseEdge from models.graph_node import SourceName @@ -12,7 +13,7 @@ class LibraryOfCongressNamesTransformer(BaseTransformer): def __init__(self, url: str): self.source = GZipSource(url) - def transform_node(self, raw_node: dict) -> SourceName | None: + def transform_node(self, raw_node: dict | ET.Element) -> SourceName | None: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: @@ -25,6 +26,6 @@ def transform_node(self, raw_node: dict) -> SourceName | None: alternative_labels=raw_concept.alternative_labels, ) - def extract_edges(self, raw_node: dict) -> Generator[BaseEdge]: + def extract_edges(self, raw_node: dict | ET.Element) -> Generator[BaseEdge]: # At the moment there are no edges to extract. Return an empty generator. yield from () diff --git a/src/transformers/mesh/concepts_transformer.py b/src/transformers/mesh/concepts_transformer.py index 504ae541f3..21f70c57d0 100644 --- a/src/transformers/mesh/concepts_transformer.py +++ b/src/transformers/mesh/concepts_transformer.py @@ -13,7 +13,7 @@ class MeSHConceptsTransformer(BaseTransformer): def __init__(self, url: str): self.source = MeSHConceptsSource(url) - def transform_node(self, raw_node: ET.Element) -> SourceConcept | None: + def transform_node(self, raw_node: dict | ET.Element) -> SourceConcept | None: raw_concept = RawMeSHConcept(raw_node) if raw_concept.is_geographic: @@ -28,7 +28,7 @@ def transform_node(self, raw_node: ET.Element) -> SourceConcept | None: description=raw_concept.description ) - def extract_edges(self, raw_node: dict) -> Generator[SourceConceptHasParent | SourceConceptRelatedTo]: + def extract_edges(self, raw_node: dict | ET.Element) -> Generator[SourceConceptHasParent | SourceConceptRelatedTo]: raw_concept = RawMeSHConcept(raw_node) if raw_concept.is_geographic: From 3d0ee143851ffe4ccda8dd08f1f72e4612e1b675 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 13 Jan 2025 14:27:07 +0000 Subject: [PATCH 050/310] Fix types in raw MeSH concept --- src/transformers/mesh/raw_concept.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index 049dfd4612..2e0126ad3e 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -21,7 +21,7 @@ def source_id(self) -> str: assert(isinstance(desc_elem, ET.Element)) - return self.raw_concept.find("DescriptorUI").text + return desc_elem.text @property def label(self) -> str: @@ -30,17 +30,17 @@ def label(self) -> str: assert(isinstance(label_elem, ET.Element)) - return self.raw_concept.find('DescriptorName//String').text + return label_elem.text @property - def alternative_labels(self) -> list[str] | None: + def alternative_labels(self) -> list[str]: """Returns a list of alternative labels for the concept.""" altern_labels = [] for altern_concept in self.raw_concept.findall("ConceptList//Concept[@PreferredConceptYN='N']"): altern_label = 
altern_concept.find("ConceptName//String") - assert(isinstance(altern_label, ET.Element)) - altern_labels.append(altern_label.text) + if isinstance(altern_label, ET.Element): + altern_labels.append(altern_label.text) return altern_labels @@ -49,8 +49,8 @@ def alternative_ids(self) -> list[str]: """Returns a list of MeSH tree numbers for the concept.""" treenums = [] for treenum_elem in self.raw_concept.findall("TreeNumberList//TreeNumber"): - assert(isinstance(treenum_elem, ET.Element)) - treenums.append(treenum_elem.text) + if isinstance(treenum_elem, ET.Element): + treenums.append(treenum_elem.text) return treenums @@ -59,13 +59,13 @@ def description(self) -> str | None: """Returns the preferred term's scope note (free-text narrative of its scope and meaning).""" scope_note = self.raw_concept.find("ConceptList//Concept[@PreferredConceptYN='Y']//ScopeNote") - if scope_note is not None: - return scope_note.text + if isinstance(scope_note, ET.Element): + scope_note = scope_note.text return scope_note @staticmethod - def fetch_mesh(source_id: str) -> dict[str, str | list]: + def fetch_mesh(source_id: str) -> dict[str, str | list[str]]: """Fetch JSON containing RDF data for a given MeSH concept.""" response = requests.get(f"https://id.nlm.nih.gov/mesh/{source_id}.json") @@ -89,8 +89,8 @@ def related_concept_ids(self) -> list[str]: related_descriptors = [] for desc in self.raw_concept.findall("SeeRelatedDescriptor//DescriptorReferredTo//DescriptorUI"): - assert(isinstance(desc, ET.Element)) - related_descriptors.append(desc.text) + if isinstance(desc, ET.Element): + related_descriptors.append(desc.text) return related_descriptors From 901a37578da5c529a9b7594256c0862aee2a094e Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 13 Jan 2025 14:28:52 +0000 Subject: [PATCH 051/310] Add xml type to base source --- src/sources/base_source.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sources/base_source.py b/src/sources/base_source.py index 1da81e53cf..7e41986ff9 100644 --- a/src/sources/base_source.py +++ b/src/sources/base_source.py @@ -1,7 +1,8 @@ from collections.abc import Generator +import xml.etree.ElementTree as ET class BaseSource: - def stream_raw(self) -> Generator[dict]: + def stream_raw(self) -> Generator[dict | ET.Element]: """Returns a generator of dictionaries, each corresponding to a raw entity extracted from the source.""" raise NotImplementedError("Each source must implement a `stream_raw` method.") From edd28e7423c355f67a7399bcc6ec2515cd4ccad0 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 13 Jan 2025 14:55:34 +0000 Subject: [PATCH 052/310] Narrow raw MeSH concept return types --- src/transformers/mesh/raw_concept.py | 35 ++++++++++++++++++---------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index 2e0126ad3e..50d8ebcad0 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -20,8 +20,10 @@ def source_id(self) -> str: desc_elem = self.raw_concept.find("DescriptorUI") assert(isinstance(desc_elem, ET.Element)) + descriptor = desc_elem.text + assert isinstance(descriptor, str) - return desc_elem.text + return descriptor @property def label(self) -> str: @@ -29,8 +31,10 @@ def label(self) -> str: label_elem = self.raw_concept.find('DescriptorName//String') assert(isinstance(label_elem, ET.Element)) + label = label_elem.text + assert isinstance(label, str) - return label_elem.text + return label 
@property def alternative_labels(self) -> list[str]: @@ -38,9 +42,11 @@ def alternative_labels(self) -> list[str]: altern_labels = [] for altern_concept in self.raw_concept.findall("ConceptList//Concept[@PreferredConceptYN='N']"): - altern_label = altern_concept.find("ConceptName//String") - if isinstance(altern_label, ET.Element): - altern_labels.append(altern_label.text) + altern_label_elem = altern_concept.find("ConceptName//String") + if isinstance(altern_label_elem, ET.Element): + altern_label = altern_label_elem.text + assert(isinstance(altern_label, str)) + altern_labels.append(altern_label) return altern_labels @@ -50,17 +56,20 @@ def alternative_ids(self) -> list[str]: treenums = [] for treenum_elem in self.raw_concept.findall("TreeNumberList//TreeNumber"): if isinstance(treenum_elem, ET.Element): - treenums.append(treenum_elem.text) + treenum = treenum_elem.text + assert(isinstance(treenum, str)) + treenums.append(treenum) return treenums @property def description(self) -> str | None: """Returns the preferred term's scope note (free-text narrative of its scope and meaning).""" + scope_note = None - scope_note = self.raw_concept.find("ConceptList//Concept[@PreferredConceptYN='Y']//ScopeNote") - if isinstance(scope_note, ET.Element): - scope_note = scope_note.text + scope_note_elem = self.raw_concept.find("ConceptList//Concept[@PreferredConceptYN='Y']//ScopeNote") + if isinstance(scope_note_elem, ET.Element): + scope_note = scope_note_elem.text return scope_note @@ -88,9 +97,11 @@ def related_concept_ids(self) -> list[str]: """Extract related MeSH descriptors.""" related_descriptors = [] - for desc in self.raw_concept.findall("SeeRelatedDescriptor//DescriptorReferredTo//DescriptorUI"): - if isinstance(desc, ET.Element): - related_descriptors.append(desc.text) + for desc_elem in self.raw_concept.findall("SeeRelatedDescriptor//DescriptorReferredTo//DescriptorUI"): + if isinstance(desc_elem, ET.Element): + desc = desc_elem.text + assert(isinstance(desc, str)) + related_descriptors.append(desc) return related_descriptors From 3867334e937558cebc2b00bd4fc094cafdf1a91d Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 13 Jan 2025 15:04:48 +0000 Subject: [PATCH 053/310] Fix literal type --- src/transformers/mesh/raw_concept.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index 50d8ebcad0..9f6313018b 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -1,13 +1,13 @@ import requests +from typing import Literal import xml.etree.ElementTree as ET ID_PREFIX = "http://id.nlm.nih.gov/mesh/" class RawMeSHConcept: - def __init__(self, raw_concept: ET.Element): + def __init__(self, raw_concept: dict | ET.Element): self.raw_concept = raw_concept - self.source = "nlm-mesh" @staticmethod def _remove_id_prefix(raw_id: str) -> str: @@ -66,7 +66,7 @@ def alternative_ids(self) -> list[str]: def description(self) -> str | None: """Returns the preferred term's scope note (free-text narrative of its scope and meaning).""" scope_note = None - + scope_note_elem = self.raw_concept.find("ConceptList//Concept[@PreferredConceptYN='Y']//ScopeNote") if isinstance(scope_note_elem, ET.Element): scope_note = scope_note_elem.text @@ -110,3 +110,7 @@ def is_geographic(self) -> bool: """Returns True if the node represents a geographic concept, as determined by `DescriptorClass`.""" return self.raw_concept.attrib.get("DescriptorClass") == "4" + + @property + def 
source(self) -> Literal["nlm-mesh"]: + return "nlm-mesh" From e198e4395b0e94f1eb018d928c1b328a12ce54da Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 13 Jan 2025 15:05:18 +0000 Subject: [PATCH 054/310] Add xml type --- src/transformers/loc/raw_concept.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index 1dd5b8ec28..b79c21c295 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -1,4 +1,5 @@ from typing import Literal +import xml.etree.ElementTree as ET ID_PREFIXES_TO_REMOVE = ( "/authorities/subjects/", @@ -8,7 +9,7 @@ class RawLibraryOfCongressConcept: - def __init__(self, raw_concept: dict): + def __init__(self, raw_concept: dict | ET.Element): self.raw_concept = raw_concept self._raw_concept_node = self._extract_concept_node() From a569672457b9fc479cf27f05f79cad9a4af89d74 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 13 Jan 2025 15:40:10 +0000 Subject: [PATCH 055/310] Create json and xml source/transformer --- src/sources/base_source.py | 13 +++++++++++-- src/sources/gzip_source.py | 6 +++--- src/sources/mesh/concepts_source.py | 4 ++-- src/transformers/base_transformer.py | 10 +++++++++- src/transformers/loc/concepts_transformer.py | 9 ++++----- src/transformers/loc/locations_transformer.py | 9 ++++----- src/transformers/loc/names_transformer.py | 9 ++++----- src/transformers/loc/raw_concept.py | 3 +-- src/transformers/mesh/concepts_transformer.py | 8 ++++---- src/transformers/mesh/raw_concept.py | 2 +- 10 files changed, 43 insertions(+), 30 deletions(-) diff --git a/src/sources/base_source.py b/src/sources/base_source.py index 7e41986ff9..afdf067a7e 100644 --- a/src/sources/base_source.py +++ b/src/sources/base_source.py @@ -1,8 +1,17 @@ from collections.abc import Generator import xml.etree.ElementTree as ET - class BaseSource: - def stream_raw(self) -> Generator[dict | ET.Element]: + def stream_raw(self) -> Generator[dict| ET.Element]: + """Returns a generator of dictionaries or XML Elements, each corresponding to a raw entity extracted from the source.""" + raise NotImplementedError("Each source must implement a `stream_raw` method.") + +class JSONSource: + def stream_raw(self) -> Generator[dict]: """Returns a generator of dictionaries, each corresponding to a raw entity extracted from the source.""" raise NotImplementedError("Each source must implement a `stream_raw` method.") + +class XMLSource: + def stream_raw(self) -> Generator[ET.Element]: + """Returns a generator of XML Elements, each corresponding to a raw entity extracted from the source.""" + raise NotImplementedError("Each source must implement a `stream_raw` method.") diff --git a/src/sources/gzip_source.py b/src/sources/gzip_source.py index 59755a9366..1b0c4f9f4f 100644 --- a/src/sources/gzip_source.py +++ b/src/sources/gzip_source.py @@ -4,10 +4,10 @@ import requests -from .base_source import BaseSource +from .base_source import JSONSource -class GZipSource(BaseSource): +class GZipSource(JSONSource): def __init__(self, url: str): self.url = url @@ -19,7 +19,7 @@ def stream_raw(self) -> Generator[dict]: yield json.loads(line_bytes.decode("utf8")) -class MultiGZipSource(BaseSource): +class MultiGZipSource(JSONSource): def __init__(self, urls: list[str]): self.urls = urls diff --git a/src/sources/mesh/concepts_source.py b/src/sources/mesh/concepts_source.py index af89af980d..96b9c2b6e1 100644 --- a/src/sources/mesh/concepts_source.py +++ 
b/src/sources/mesh/concepts_source.py @@ -2,9 +2,9 @@ import requests import xml.etree.ElementTree as ET -from sources.base_source import BaseSource +from sources.base_source import XMLSource -class MeSHConceptsSource(BaseSource): +class MeSHConceptsSource(XMLSource): def __init__(self, url: str): self.url = url diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 9228e9df3e..c476d28f97 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -13,7 +13,7 @@ from models.graph_edge import BaseEdge from models.graph_node import BaseNode from query_builders.cypher import construct_upsert_cypher_query -from sources.base_source import BaseSource +from sources.base_source import BaseSource, JSONSource, XMLSource from utils.aws import publish_batch_to_sns EntityType = Literal["nodes", "edges"] @@ -217,3 +217,11 @@ def stream_to_nowhere( """ for chunk in self._stream_chunks(entity_type, query_chunk_size, sample_size): pass + +class JSONTransformer(BaseTransformer): + def __init__(self) -> None: + self.source: JSONSource = JSONSource() + +class XMLTransformer(BaseTransformer): + def __init__(self) -> None: + self.source: XMLSource = XMLSource() diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index 8b51dd303f..9d49a261c0 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -1,19 +1,18 @@ from collections.abc import Generator -import xml.etree.ElementTree as ET from models.graph_edge import SourceConceptNarrowerThan from models.graph_node import SourceConcept from sources.gzip_source import GZipSource -from transformers.base_transformer import BaseTransformer +from transformers.base_transformer import JSONTransformer from .raw_concept import RawLibraryOfCongressConcept -class LibraryOfCongressConceptsTransformer(BaseTransformer): +class LibraryOfCongressConceptsTransformer(JSONTransformer): def __init__(self, url: str): self.source = GZipSource(url) - def transform_node(self, raw_node: dict | ET.Element) -> SourceConcept | None: + def transform_node(self, raw_node: dict) -> SourceConcept | None: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: @@ -26,7 +25,7 @@ def transform_node(self, raw_node: dict | ET.Element) -> SourceConcept | None: alternative_labels=raw_concept.alternative_labels, ) - def extract_edges(self, raw_node: dict | ET.Element) -> Generator[SourceConceptNarrowerThan]: + def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan]: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index 0ab72ea388..539631670a 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -1,19 +1,18 @@ from collections.abc import Generator -import xml.etree.ElementTree as ET from models.graph_edge import SourceConceptNarrowerThan from models.graph_node import SourceLocation from sources.gzip_source import MultiGZipSource -from transformers.base_transformer import BaseTransformer +from transformers.base_transformer import JSONTransformer from .raw_concept import RawLibraryOfCongressConcept -class LibraryOfCongressLocationsTransformer(BaseTransformer): +class LibraryOfCongressLocationsTransformer(JSONTransformer): def 
__init__(self, subject_headings_url: str, names_url: str): self.source = MultiGZipSource([subject_headings_url, names_url]) - def transform_node(self, raw_node: dict | ET.Element) -> SourceLocation | None: + def transform_node(self, raw_node: dict) -> SourceLocation | None: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: @@ -26,7 +25,7 @@ def transform_node(self, raw_node: dict | ET.Element) -> SourceLocation | None: alternative_labels=raw_concept.alternative_labels, ) - def extract_edges(self, raw_node: dict | ET.Element) -> Generator[SourceConceptNarrowerThan]: + def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan]: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 94616ef1b7..dc5d68e4bf 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -1,19 +1,18 @@ from collections.abc import Generator -import xml.etree.ElementTree as ET from models.graph_edge import BaseEdge from models.graph_node import SourceName from sources.gzip_source import GZipSource -from transformers.base_transformer import BaseTransformer +from transformers.base_transformer import JSONTransformer from .raw_concept import RawLibraryOfCongressConcept -class LibraryOfCongressNamesTransformer(BaseTransformer): +class LibraryOfCongressNamesTransformer(JSONTransformer): def __init__(self, url: str): self.source = GZipSource(url) - def transform_node(self, raw_node: dict | ET.Element) -> SourceName | None: + def transform_node(self, raw_node: dict) -> SourceName | None: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: @@ -26,6 +25,6 @@ def transform_node(self, raw_node: dict | ET.Element) -> SourceName | None: alternative_labels=raw_concept.alternative_labels, ) - def extract_edges(self, raw_node: dict | ET.Element) -> Generator[BaseEdge]: + def extract_edges(self, raw_node: dict) -> Generator[BaseEdge]: # At the moment there are no edges to extract. Return an empty generator. 
yield from () diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index b79c21c295..1dd5b8ec28 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -1,5 +1,4 @@ from typing import Literal -import xml.etree.ElementTree as ET ID_PREFIXES_TO_REMOVE = ( "/authorities/subjects/", @@ -9,7 +8,7 @@ class RawLibraryOfCongressConcept: - def __init__(self, raw_concept: dict | ET.Element): + def __init__(self, raw_concept: dict): self.raw_concept = raw_concept self._raw_concept_node = self._extract_concept_node() diff --git a/src/transformers/mesh/concepts_transformer.py b/src/transformers/mesh/concepts_transformer.py index 21f70c57d0..0a11c4b1e9 100644 --- a/src/transformers/mesh/concepts_transformer.py +++ b/src/transformers/mesh/concepts_transformer.py @@ -4,16 +4,16 @@ from models.graph_edge import SourceConceptHasParent, SourceConceptRelatedTo from models.graph_node import SourceConcept from sources.mesh.concepts_source import MeSHConceptsSource -from transformers.base_transformer import BaseTransformer +from transformers.base_transformer import XMLTransformer from .raw_concept import RawMeSHConcept -class MeSHConceptsTransformer(BaseTransformer): +class MeSHConceptsTransformer(XMLTransformer): def __init__(self, url: str): self.source = MeSHConceptsSource(url) - def transform_node(self, raw_node: dict | ET.Element) -> SourceConcept | None: + def transform_node(self, raw_node: ET.Element) -> SourceConcept | None: raw_concept = RawMeSHConcept(raw_node) if raw_concept.is_geographic: @@ -28,7 +28,7 @@ def transform_node(self, raw_node: dict | ET.Element) -> SourceConcept | None: description=raw_concept.description ) - def extract_edges(self, raw_node: dict | ET.Element) -> Generator[SourceConceptHasParent | SourceConceptRelatedTo]: + def extract_edges(self, raw_node: ET.Element) -> Generator[SourceConceptHasParent | SourceConceptRelatedTo]: raw_concept = RawMeSHConcept(raw_node) if raw_concept.is_geographic: diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index 9f6313018b..71a869cbd3 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -6,7 +6,7 @@ class RawMeSHConcept: - def __init__(self, raw_concept: dict | ET.Element): + def __init__(self, raw_concept: ET.Element): self.raw_concept = raw_concept @staticmethod From 9f32273a0e410a1731567eed7d4c55f95e27eaf8 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 13 Jan 2025 15:55:51 +0000 Subject: [PATCH 056/310] Fix source types --- src/transformers/base_transformer.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index c476d28f97..ae440be4d3 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -31,7 +31,7 @@ def _generator_to_chunks(items: Generator, chunk_size: int) -> Generator: class BaseTransformer: def __init__(self) -> None: - self.source: BaseSource = BaseSource() + self.source: BaseSource | JSONSource | XMLSource = BaseSource() def transform_node(self, raw_node: dict | ET.Element) -> BaseNode | None: """Accepts a raw node from the source dataset and returns a transformed node as a Pydantic model.""" @@ -221,7 +221,32 @@ def stream_to_nowhere( class JSONTransformer(BaseTransformer): def __init__(self) -> None: self.source: JSONSource = JSONSource() + + def transform_node(self, raw_node: dict) -> BaseNode | None: + 
"""Accepts a raw node from the source dataset and returns a transformed node as a Pydantic model.""" + raise NotImplementedError( + "Each transformer must implement a `transform_node` method." + ) + + def extract_edges(self, raw_node: dict) -> Generator[BaseEdge]: + """Accepts a raw node from the source dataset and returns a generator of extracted edges as Pydantic models.""" + raise NotImplementedError( + "Each transformer must implement an `extract_edges` method." + ) + class XMLTransformer(BaseTransformer): def __init__(self) -> None: self.source: XMLSource = XMLSource() + + def transform_node(self, raw_node: ET.Element) -> BaseNode | None: + """Accepts a raw node from the source dataset and returns a transformed node as a Pydantic model.""" + raise NotImplementedError( + "Each transformer must implement a `transform_node` method." + ) + + def extract_edges(self, raw_node: ET.Element) -> Generator[BaseEdge]: + """Accepts a raw node from the source dataset and returns a generator of extracted edges as Pydantic models.""" + raise NotImplementedError( + "Each transformer must implement an `extract_edges` method." + ) From e956aa004dbbb4b36f094c4dd44ea06c30a8e72f Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 13 Jan 2025 17:01:24 +0000 Subject: [PATCH 057/310] Make node types compatible --- src/sources/base_source.py | 4 +- src/transformers/base_transformer.py | 41 ++----------------- src/transformers/loc/concepts_transformer.py | 4 +- src/transformers/loc/locations_transformer.py | 4 +- src/transformers/loc/names_transformer.py | 4 +- src/transformers/mesh/concepts_transformer.py | 4 +- 6 files changed, 14 insertions(+), 47 deletions(-) diff --git a/src/sources/base_source.py b/src/sources/base_source.py index afdf067a7e..9dfb1efe11 100644 --- a/src/sources/base_source.py +++ b/src/sources/base_source.py @@ -6,12 +6,12 @@ def stream_raw(self) -> Generator[dict| ET.Element]: """Returns a generator of dictionaries or XML Elements, each corresponding to a raw entity extracted from the source.""" raise NotImplementedError("Each source must implement a `stream_raw` method.") -class JSONSource: +class JSONSource(BaseSource): def stream_raw(self) -> Generator[dict]: """Returns a generator of dictionaries, each corresponding to a raw entity extracted from the source.""" raise NotImplementedError("Each source must implement a `stream_raw` method.") -class XMLSource: +class XMLSource(BaseSource): def stream_raw(self) -> Generator[ET.Element]: """Returns a generator of XML Elements, each corresponding to a raw entity extracted from the source.""" raise NotImplementedError("Each source must implement a `stream_raw` method.") diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index ae440be4d3..2e1b3fbbd6 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -2,7 +2,7 @@ import csv from collections.abc import Generator from itertools import islice -from typing import Literal +from typing import Any, Literal import xml.etree.ElementTree as ET import boto3 @@ -31,15 +31,15 @@ def _generator_to_chunks(items: Generator, chunk_size: int) -> Generator: class BaseTransformer: def __init__(self) -> None: - self.source: BaseSource | JSONSource | XMLSource = BaseSource() + self.source: BaseSource = BaseSource() - def transform_node(self, raw_node: dict | ET.Element) -> BaseNode | None: + def transform_node(self, raw_node: Any) -> BaseNode | None: """Accepts a raw node from the source dataset and returns a transformed 
node as a Pydantic model.""" raise NotImplementedError( "Each transformer must implement a `transform_node` method." ) - def extract_edges(self, raw_node: dict | ET.Element) -> Generator[BaseEdge]: + def extract_edges(self, raw_node: Any) -> Generator[BaseEdge]: """Accepts a raw node from the source dataset and returns a generator of extracted edges as Pydantic models.""" raise NotImplementedError( "Each transformer must implement an `extract_edges` method." @@ -217,36 +217,3 @@ def stream_to_nowhere( """ for chunk in self._stream_chunks(entity_type, query_chunk_size, sample_size): pass - -class JSONTransformer(BaseTransformer): - def __init__(self) -> None: - self.source: JSONSource = JSONSource() - - def transform_node(self, raw_node: dict) -> BaseNode | None: - """Accepts a raw node from the source dataset and returns a transformed node as a Pydantic model.""" - raise NotImplementedError( - "Each transformer must implement a `transform_node` method." - ) - - def extract_edges(self, raw_node: dict) -> Generator[BaseEdge]: - """Accepts a raw node from the source dataset and returns a generator of extracted edges as Pydantic models.""" - raise NotImplementedError( - "Each transformer must implement an `extract_edges` method." - ) - - -class XMLTransformer(BaseTransformer): - def __init__(self) -> None: - self.source: XMLSource = XMLSource() - - def transform_node(self, raw_node: ET.Element) -> BaseNode | None: - """Accepts a raw node from the source dataset and returns a transformed node as a Pydantic model.""" - raise NotImplementedError( - "Each transformer must implement a `transform_node` method." - ) - - def extract_edges(self, raw_node: ET.Element) -> Generator[BaseEdge]: - """Accepts a raw node from the source dataset and returns a generator of extracted edges as Pydantic models.""" - raise NotImplementedError( - "Each transformer must implement an `extract_edges` method." 
- ) diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index 9d49a261c0..783d30f516 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -3,12 +3,12 @@ from models.graph_edge import SourceConceptNarrowerThan from models.graph_node import SourceConcept from sources.gzip_source import GZipSource -from transformers.base_transformer import JSONTransformer +from transformers.base_transformer import BaseTransformer from .raw_concept import RawLibraryOfCongressConcept -class LibraryOfCongressConceptsTransformer(JSONTransformer): +class LibraryOfCongressConceptsTransformer(BaseTransformer): def __init__(self, url: str): self.source = GZipSource(url) diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index 539631670a..5ff7f430e9 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -3,12 +3,12 @@ from models.graph_edge import SourceConceptNarrowerThan from models.graph_node import SourceLocation from sources.gzip_source import MultiGZipSource -from transformers.base_transformer import JSONTransformer +from transformers.base_transformer import BaseTransformer from .raw_concept import RawLibraryOfCongressConcept -class LibraryOfCongressLocationsTransformer(JSONTransformer): +class LibraryOfCongressLocationsTransformer(BaseTransformer): def __init__(self, subject_headings_url: str, names_url: str): self.source = MultiGZipSource([subject_headings_url, names_url]) diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index dc5d68e4bf..7206d30039 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -3,12 +3,12 @@ from models.graph_edge import BaseEdge from models.graph_node import SourceName from sources.gzip_source import GZipSource -from transformers.base_transformer import JSONTransformer +from transformers.base_transformer import BaseTransformer from .raw_concept import RawLibraryOfCongressConcept -class LibraryOfCongressNamesTransformer(JSONTransformer): +class LibraryOfCongressNamesTransformer(BaseTransformer): def __init__(self, url: str): self.source = GZipSource(url) diff --git a/src/transformers/mesh/concepts_transformer.py b/src/transformers/mesh/concepts_transformer.py index 0a11c4b1e9..22a57f0214 100644 --- a/src/transformers/mesh/concepts_transformer.py +++ b/src/transformers/mesh/concepts_transformer.py @@ -4,12 +4,12 @@ from models.graph_edge import SourceConceptHasParent, SourceConceptRelatedTo from models.graph_node import SourceConcept from sources.mesh.concepts_source import MeSHConceptsSource -from transformers.base_transformer import XMLTransformer +from transformers.base_transformer import BaseTransformer from .raw_concept import RawMeSHConcept -class MeSHConceptsTransformer(XMLTransformer): +class MeSHConceptsTransformer(BaseTransformer): def __init__(self, url: str): self.source = MeSHConceptsSource(url) From fdb62c44dddd248fa7d33e7c0f3d41852d9953bd Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 13 Jan 2025 20:10:51 +0000 Subject: [PATCH 058/310] Refactor data source --- src/sources/base_source.py | 15 +++------------ src/sources/gzip_source.py | 6 +++--- src/sources/mesh/concepts_source.py | 4 ++-- src/transformers/base_transformer.py | 2 +- 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/src/sources/base_source.py 
b/src/sources/base_source.py index 9dfb1efe11..841a37b60f 100644 --- a/src/sources/base_source.py +++ b/src/sources/base_source.py @@ -1,17 +1,8 @@ from collections.abc import Generator +from typing import Any import xml.etree.ElementTree as ET class BaseSource: - def stream_raw(self) -> Generator[dict| ET.Element]: - """Returns a generator of dictionaries or XML Elements, each corresponding to a raw entity extracted from the source.""" - raise NotImplementedError("Each source must implement a `stream_raw` method.") - -class JSONSource(BaseSource): - def stream_raw(self) -> Generator[dict]: - """Returns a generator of dictionaries, each corresponding to a raw entity extracted from the source.""" - raise NotImplementedError("Each source must implement a `stream_raw` method.") - -class XMLSource(BaseSource): - def stream_raw(self) -> Generator[ET.Element]: - """Returns a generator of XML Elements, each corresponding to a raw entity extracted from the source.""" + def stream_raw(self) -> Generator[Any]: + """Returns a generator of raw data corresponding to an entity extracted from the source.""" raise NotImplementedError("Each source must implement a `stream_raw` method.") diff --git a/src/sources/gzip_source.py b/src/sources/gzip_source.py index 1b0c4f9f4f..59755a9366 100644 --- a/src/sources/gzip_source.py +++ b/src/sources/gzip_source.py @@ -4,10 +4,10 @@ import requests -from .base_source import JSONSource +from .base_source import BaseSource -class GZipSource(JSONSource): +class GZipSource(BaseSource): def __init__(self, url: str): self.url = url @@ -19,7 +19,7 @@ def stream_raw(self) -> Generator[dict]: yield json.loads(line_bytes.decode("utf8")) -class MultiGZipSource(JSONSource): +class MultiGZipSource(BaseSource): def __init__(self, urls: list[str]): self.urls = urls diff --git a/src/sources/mesh/concepts_source.py b/src/sources/mesh/concepts_source.py index 96b9c2b6e1..af89af980d 100644 --- a/src/sources/mesh/concepts_source.py +++ b/src/sources/mesh/concepts_source.py @@ -2,9 +2,9 @@ import requests import xml.etree.ElementTree as ET -from sources.base_source import XMLSource +from sources.base_source import BaseSource -class MeSHConceptsSource(XMLSource): +class MeSHConceptsSource(BaseSource): def __init__(self, url: str): self.url = url diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 2e1b3fbbd6..7a5046dfdf 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -13,7 +13,7 @@ from models.graph_edge import BaseEdge from models.graph_node import BaseNode from query_builders.cypher import construct_upsert_cypher_query -from sources.base_source import BaseSource, JSONSource, XMLSource +from sources.base_source import BaseSource from utils.aws import publish_batch_to_sns EntityType = Literal["nodes", "edges"] From 9e1255d41c2e978dee7bc81dd6ca1dfb4617df89 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 13 Jan 2025 20:17:51 +0000 Subject: [PATCH 059/310] Add locations transformer --- .../mesh/locations_transformer.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 src/transformers/mesh/locations_transformer.py diff --git a/src/transformers/mesh/locations_transformer.py b/src/transformers/mesh/locations_transformer.py new file mode 100644 index 0000000000..be7172ce6f --- /dev/null +++ b/src/transformers/mesh/locations_transformer.py @@ -0,0 +1,33 @@ +from collections.abc import Generator +import xml.etree.ElementTree as ET + +from models.graph_edge import 
BaseEdge +from models.graph_node import SourceConcept +from sources.mesh.concepts_source import MeSHConceptsSource +from transformers.base_transformer import BaseTransformer + +from .raw_concept import RawMeSHConcept + + +class MeSHConceptsTransformer(BaseTransformer): + def __init__(self, url: str): + self.source = MeSHConceptsSource(url) + + def transform_node(self, raw_node: ET.Element) -> SourceConcept | None: + raw_concept = RawMeSHConcept(raw_node) + + if not raw_concept.is_geographic: + return None + + return SourceConcept( + id=raw_concept.source_id, + label=raw_concept.label, + source=raw_concept.source, + alternative_labels=raw_concept.alternative_labels, + alternative_ids=raw_concept.alternative_ids, + description=raw_concept.description + ) + + def extract_edges(self, raw_node: ET.Element) -> Generator[BaseEdge]: + """There are no edges to extract from MeSH Locations.""" + yield from () From decc79a835b926742505740887a33fbc779bfc3e Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 13 Jan 2025 20:21:21 +0000 Subject: [PATCH 060/310] Add mesh to create_transformer --- src/transformers/create_transformer.py | 10 +++++++++- src/transformers/mesh/locations_transformer.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py index aa2a139c6e..5d6e6572e3 100644 --- a/src/transformers/create_transformer.py +++ b/src/transformers/create_transformer.py @@ -4,13 +4,17 @@ from .loc.concepts_transformer import LibraryOfCongressConceptsTransformer from .loc.locations_transformer import LibraryOfCongressLocationsTransformer from .loc.names_transformer import LibraryOfCongressNamesTransformer +from .mesh.concepts_transformer import MeSHConceptsTransformer +from .mesh.locations_transformer import MeSHLocationsTransformer LOC_SUBJECT_HEADINGS_URL = ( "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" ) LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" -TransformerType = Literal["loc_concepts", "loc_names", "loc_locations"] +MESH_URL = "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.gz" + +TransformerType = Literal["loc_concepts", "loc_names", "loc_locations", "mesh_concepts", "mesh_locations"] def create_transformer(transformer_type: TransformerType) -> BaseTransformer: @@ -22,5 +26,9 @@ def create_transformer(transformer_type: TransformerType) -> BaseTransformer: return LibraryOfCongressLocationsTransformer( LOC_SUBJECT_HEADINGS_URL, LOC_NAMES_URL ) + if transformer_type == "mesh_concepts": + return MeSHConceptsTransformer(MESH_URL) + if transformer_type == "mesh_locations": + return MeSHLocationsTransformer(MESH_URL) raise ValueError(f"Unknown transformer type: {transformer_type}") diff --git a/src/transformers/mesh/locations_transformer.py b/src/transformers/mesh/locations_transformer.py index be7172ce6f..ed911ea498 100644 --- a/src/transformers/mesh/locations_transformer.py +++ b/src/transformers/mesh/locations_transformer.py @@ -9,7 +9,7 @@ from .raw_concept import RawMeSHConcept -class MeSHConceptsTransformer(BaseTransformer): +class MeSHLocationsTransformer(BaseTransformer): def __init__(self, url: str): self.source = MeSHConceptsSource(url) From 1cc674d250e1a448617107c7f289c2493774ff10 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 13 Jan 2025 20:43:49 +0000 Subject: [PATCH 061/310] Clean up docstrings --- src/transformers/mesh/raw_concept.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 
10 deletions(-) diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index 71a869cbd3..68ddde819a 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -1,5 +1,5 @@ import requests -from typing import Literal +from typing import Any, Literal import xml.etree.ElementTree as ET ID_PREFIX = "http://id.nlm.nih.gov/mesh/" @@ -8,10 +8,11 @@ class RawMeSHConcept: def __init__(self, raw_concept: ET.Element): self.raw_concept = raw_concept + self.source: Literal["nlm-mesh"] = "nlm-mesh" @staticmethod def _remove_id_prefix(raw_id: str) -> str: - """Removes prefix from MeSH descriptor (only present in JSON).""" + """Removes prefix from MeSH descriptor (only present in extra JSON).""" return raw_id.removeprefix(ID_PREFIX) @property @@ -74,15 +75,15 @@ def description(self) -> str | None: return scope_note @staticmethod - def fetch_mesh(source_id: str) -> dict[str, str | list[str]]: - """Fetch JSON containing RDF data for a given MeSH concept.""" + def fetch_mesh(source_id: str) -> Any: + """Fetches JSON containing RDF data for a given MeSH concept.""" response = requests.get(f"https://id.nlm.nih.gov/mesh/{source_id}.json") return response.json() @property def parent_concept_ids(self) -> list[str]: - """Extract parent MeSH descriptors from JSON.""" + """Extracts parent MeSH descriptors from JSON.""" mesh_data = self.fetch_mesh(self.source_id) broader_desc = mesh_data.get("broaderDescriptor", []) @@ -94,7 +95,7 @@ def parent_concept_ids(self) -> list[str]: @property def related_concept_ids(self) -> list[str]: - """Extract related MeSH descriptors.""" + """Extracts related MeSH descriptors.""" related_descriptors = [] for desc_elem in self.raw_concept.findall("SeeRelatedDescriptor//DescriptorReferredTo//DescriptorUI"): @@ -110,7 +111,3 @@ def is_geographic(self) -> bool: """Returns True if the node represents a geographic concept, as determined by `DescriptorClass`.""" return self.raw_concept.attrib.get("DescriptorClass") == "4" - - @property - def source(self) -> Literal["nlm-mesh"]: - return "nlm-mesh" From b60d42bf21aaa5d8c818d1c23ce5713736d29168 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Mon, 13 Jan 2025 20:45:06 +0000 Subject: [PATCH 062/310] Apply auto-formatting rules --- src/sources/base_source.py | 3 +- src/sources/mesh/concepts_source.py | 8 ++-- src/transformers/base_transformer.py | 2 +- src/transformers/create_transformer.py | 4 +- src/transformers/mesh/concepts_transformer.py | 12 +++--- .../mesh/locations_transformer.py | 4 +- src/transformers/mesh/raw_concept.py | 39 +++++++++++-------- 7 files changed, 41 insertions(+), 31 deletions(-) diff --git a/src/sources/base_source.py b/src/sources/base_source.py index 841a37b60f..9397e075aa 100644 --- a/src/sources/base_source.py +++ b/src/sources/base_source.py @@ -1,6 +1,7 @@ +import xml.etree.ElementTree as ET from collections.abc import Generator from typing import Any -import xml.etree.ElementTree as ET + class BaseSource: def stream_raw(self) -> Generator[Any]: diff --git a/src/sources/mesh/concepts_source.py b/src/sources/mesh/concepts_source.py index af89af980d..32581718a6 100644 --- a/src/sources/mesh/concepts_source.py +++ b/src/sources/mesh/concepts_source.py @@ -1,9 +1,11 @@ +import xml.etree.ElementTree as ET from collections.abc import Generator + import requests -import xml.etree.ElementTree as ET from sources.base_source import BaseSource + class MeSHConceptsSource(BaseSource): def __init__(self, url: str): self.url = 
url @@ -13,6 +15,4 @@ def stream_raw(self) -> Generator[ET.Element]: response.raw.decode_content = True events = ET.iterparse(response.raw) - return ( - elem for _, elem in events if elem.tag == "DescriptorRecord" - ) + return (elem for _, elem in events if elem.tag == "DescriptorRecord") diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 7a5046dfdf..f9b3cd63b6 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -1,9 +1,9 @@ import concurrent.futures import csv +import xml.etree.ElementTree as ET from collections.abc import Generator from itertools import islice from typing import Any, Literal -import xml.etree.ElementTree as ET import boto3 import smart_open diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py index 5d6e6572e3..d7d41ef697 100644 --- a/src/transformers/create_transformer.py +++ b/src/transformers/create_transformer.py @@ -14,7 +14,9 @@ MESH_URL = "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.gz" -TransformerType = Literal["loc_concepts", "loc_names", "loc_locations", "mesh_concepts", "mesh_locations"] +TransformerType = Literal[ + "loc_concepts", "loc_names", "loc_locations", "mesh_concepts", "mesh_locations" +] def create_transformer(transformer_type: TransformerType) -> BaseTransformer: diff --git a/src/transformers/mesh/concepts_transformer.py b/src/transformers/mesh/concepts_transformer.py index 22a57f0214..e5cb6a3f02 100644 --- a/src/transformers/mesh/concepts_transformer.py +++ b/src/transformers/mesh/concepts_transformer.py @@ -1,5 +1,5 @@ -from collections.abc import Generator import xml.etree.ElementTree as ET +from collections.abc import Generator from models.graph_edge import SourceConceptHasParent, SourceConceptRelatedTo from models.graph_node import SourceConcept @@ -25,19 +25,19 @@ def transform_node(self, raw_node: ET.Element) -> SourceConcept | None: source=raw_concept.source, alternative_labels=raw_concept.alternative_labels, alternative_ids=raw_concept.alternative_ids, - description=raw_concept.description + description=raw_concept.description, ) - def extract_edges(self, raw_node: ET.Element) -> Generator[SourceConceptHasParent | SourceConceptRelatedTo]: + def extract_edges( + self, raw_node: ET.Element + ) -> Generator[SourceConceptHasParent | SourceConceptRelatedTo]: raw_concept = RawMeSHConcept(raw_node) if raw_concept.is_geographic: yield from () for parent_id in raw_concept.parent_concept_ids: - yield SourceConceptHasParent( - from_id=raw_concept.source_id, to_id=parent_id - ) + yield SourceConceptHasParent(from_id=raw_concept.source_id, to_id=parent_id) for related_id in raw_concept.related_concept_ids: yield SourceConceptRelatedTo( diff --git a/src/transformers/mesh/locations_transformer.py b/src/transformers/mesh/locations_transformer.py index ed911ea498..02922775e1 100644 --- a/src/transformers/mesh/locations_transformer.py +++ b/src/transformers/mesh/locations_transformer.py @@ -1,5 +1,5 @@ -from collections.abc import Generator import xml.etree.ElementTree as ET +from collections.abc import Generator from models.graph_edge import BaseEdge from models.graph_node import SourceConcept @@ -25,7 +25,7 @@ def transform_node(self, raw_node: ET.Element) -> SourceConcept | None: source=raw_concept.source, alternative_labels=raw_concept.alternative_labels, alternative_ids=raw_concept.alternative_ids, - description=raw_concept.description + description=raw_concept.description, ) def extract_edges(self, 
raw_node: ET.Element) -> Generator[BaseEdge]: diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index 68ddde819a..de6a8d7b54 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -1,6 +1,7 @@ -import requests -from typing import Any, Literal import xml.etree.ElementTree as ET +from typing import Any, Literal + +import requests ID_PREFIX = "http://id.nlm.nih.gov/mesh/" @@ -20,7 +21,7 @@ def source_id(self) -> str: """Returns MeSH descriptor (unique ID).""" desc_elem = self.raw_concept.find("DescriptorUI") - assert(isinstance(desc_elem, ET.Element)) + assert isinstance(desc_elem, ET.Element) descriptor = desc_elem.text assert isinstance(descriptor, str) @@ -29,9 +30,9 @@ def source_id(self) -> str: @property def label(self) -> str: """Returns the concept label.""" - label_elem = self.raw_concept.find('DescriptorName//String') + label_elem = self.raw_concept.find("DescriptorName//String") - assert(isinstance(label_elem, ET.Element)) + assert isinstance(label_elem, ET.Element) label = label_elem.text assert isinstance(label, str) @@ -42,13 +43,15 @@ def alternative_labels(self) -> list[str]: """Returns a list of alternative labels for the concept.""" altern_labels = [] - for altern_concept in self.raw_concept.findall("ConceptList//Concept[@PreferredConceptYN='N']"): + for altern_concept in self.raw_concept.findall( + "ConceptList//Concept[@PreferredConceptYN='N']" + ): altern_label_elem = altern_concept.find("ConceptName//String") if isinstance(altern_label_elem, ET.Element): altern_label = altern_label_elem.text - assert(isinstance(altern_label, str)) + assert isinstance(altern_label, str) altern_labels.append(altern_label) - + return altern_labels @property @@ -58,7 +61,7 @@ def alternative_ids(self) -> list[str]: for treenum_elem in self.raw_concept.findall("TreeNumberList//TreeNumber"): if isinstance(treenum_elem, ET.Element): treenum = treenum_elem.text - assert(isinstance(treenum, str)) + assert isinstance(treenum, str) treenums.append(treenum) return treenums @@ -68,10 +71,12 @@ def description(self) -> str | None: """Returns the preferred term's scope note (free-text narrative of its scope and meaning).""" scope_note = None - scope_note_elem = self.raw_concept.find("ConceptList//Concept[@PreferredConceptYN='Y']//ScopeNote") + scope_note_elem = self.raw_concept.find( + "ConceptList//Concept[@PreferredConceptYN='Y']//ScopeNote" + ) if isinstance(scope_note_elem, ET.Element): scope_note = scope_note_elem.text - + return scope_note @staticmethod @@ -90,18 +95,20 @@ def parent_concept_ids(self) -> list[str]: if not isinstance(broader_desc, list): broader_desc = [broader_desc] - + return [self._remove_id_prefix(desc) for desc in broader_desc] - + @property def related_concept_ids(self) -> list[str]: """Extracts related MeSH descriptors.""" related_descriptors = [] - for desc_elem in self.raw_concept.findall("SeeRelatedDescriptor//DescriptorReferredTo//DescriptorUI"): + for desc_elem in self.raw_concept.findall( + "SeeRelatedDescriptor//DescriptorReferredTo//DescriptorUI" + ): if isinstance(desc_elem, ET.Element): desc = desc_elem.text - assert(isinstance(desc, str)) + assert isinstance(desc, str) related_descriptors.append(desc) return related_descriptors @@ -109,5 +116,5 @@ def related_concept_ids(self) -> list[str]: @property def is_geographic(self) -> bool: """Returns True if the node represents a geographic concept, as determined by `DescriptorClass`.""" - + return 
self.raw_concept.attrib.get("DescriptorClass") == "4" From 62a49ba86eea99007d543f73e034b4456a08fd67 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Tue, 14 Jan 2025 11:29:38 +0000 Subject: [PATCH 063/310] extract related_to edges for loc concepts locations names --- src/models/graph_edge.py | 6 ++++++ src/transformers/loc/concepts_transformer.py | 9 ++++++-- src/transformers/loc/locations_transformer.py | 12 +++++++++-- src/transformers/loc/names_transformer.py | 17 ++++++++++++--- src/transformers/loc/raw_concept.py | 21 +++++++++++++++++++ 5 files changed, 58 insertions(+), 7 deletions(-) diff --git a/src/models/graph_edge.py b/src/models/graph_edge.py index 1c352fa48a..a8bcb02f5e 100644 --- a/src/models/graph_edge.py +++ b/src/models/graph_edge.py @@ -16,3 +16,9 @@ class SourceConceptNarrowerThan(BaseEdge): to_type: str = "SourceConcept" relationship: str = "NARROWER_THAN" directed: bool = True + +class SourceConceptRelatedTo(BaseEdge): + from_type: str = "SourceConcept" + to_type: str = "SourceConcept" + relationship: str = "RELATED_TO" + directed: bool = True \ No newline at end of file diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index 783d30f516..981d08ddea 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -1,6 +1,6 @@ from collections.abc import Generator -from models.graph_edge import SourceConceptNarrowerThan +from models.graph_edge import SourceConceptNarrowerThan, SourceConceptRelatedTo from models.graph_node import SourceConcept from sources.gzip_source import GZipSource from transformers.base_transformer import BaseTransformer @@ -25,7 +25,7 @@ def transform_node(self, raw_node: dict) -> SourceConcept | None: alternative_labels=raw_concept.alternative_labels, ) - def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan]: + def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan, SourceConceptRelatedTo]: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: @@ -35,3 +35,8 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan]: yield SourceConceptNarrowerThan( from_id=raw_concept.source_id, to_id=broader_id ) + + for related_id in raw_concept.related_concept_ids: + yield SourceConceptRelatedTo( + from_id=raw_concept.source_id, to_id=related_id + ) diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index 5ff7f430e9..b13044a492 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -1,6 +1,6 @@ from collections.abc import Generator -from models.graph_edge import SourceConceptNarrowerThan +from models.graph_edge import SourceConceptNarrowerThan, SourceConceptRelatedTo from models.graph_node import SourceLocation from sources.gzip_source import MultiGZipSource from transformers.base_transformer import BaseTransformer @@ -25,7 +25,7 @@ def transform_node(self, raw_node: dict) -> SourceLocation | None: alternative_labels=raw_concept.alternative_labels, ) - def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan]: + def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan, SourceConceptRelatedTo]: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: @@ -35,3 +35,11 @@ def extract_edges(self, raw_node: dict) -> 
Generator[SourceConceptNarrowerThan]: yield SourceConceptNarrowerThan( from_id=raw_concept.source_id, to_id=broader_id ) + + for related_id in raw_concept.related_concept_ids: + print(SourceConceptRelatedTo( + from_id=raw_concept.source_id, to_id=related_id + )) + yield SourceConceptRelatedTo( + from_id=raw_concept.source_id, to_id=related_id + ) \ No newline at end of file diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 7206d30039..a440b5ae4d 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -1,5 +1,6 @@ from collections.abc import Generator +from models.graph_edge import SourceConceptRelatedTo from models.graph_edge import BaseEdge from models.graph_node import SourceName from sources.gzip_source import GZipSource @@ -25,6 +26,16 @@ def transform_node(self, raw_node: dict) -> SourceName | None: alternative_labels=raw_concept.alternative_labels, ) - def extract_edges(self, raw_node: dict) -> Generator[BaseEdge]: - # At the moment there are no edges to extract. Return an empty generator. - yield from () + def extract_edges(self, raw_node: dict) -> Generator[SourceConceptRelatedTo]: + raw_concept = RawLibraryOfCongressConcept(raw_node) + + if raw_concept.exclude() or not raw_concept.is_geographic: + yield from () + + for related_id in raw_concept.related_concept_ids: + print(SourceConceptRelatedTo( + from_id=raw_concept.source_id, to_id=related_id + )) + yield SourceConceptRelatedTo( + from_id=raw_concept.source_id, to_id=related_id + ) diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index 1dd5b8ec28..f23803cf02 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -109,6 +109,27 @@ def broader_concept_ids(self) -> list[str]: broader_ids.append(self._remove_id_prefix(concept["@id"])) return broader_ids + + @property + def related_concept_ids(self) -> list[str]: + """Returns a list of IDs representing concepts which are related to the current concept.""" + assert self._raw_concept_node is not None + + related_concepts = self._raw_concept_node.get("skos:related", []) + # Sometimes related concepts are returned as a list of concepts, and sometimes as just a single JSON + if isinstance(related_concepts, dict): + related_concepts = [related_concepts] + + related_ids = [] + for concept in related_concepts: + # Some related concepts have IDs in the format `_:n`. + # These IDs do not exist in the LoC source files or the LoC website, so we filter them out. 
+ if concept["@id"].startswith("_:n"): + print("concept has IDs in the format `_:n`") + continue + + related_ids.append(self._remove_id_prefix(concept["@id"])) + return related_ids @property def is_geographic(self) -> bool: From e58bf5b5f8fa258d0372b7d2a69fbd2f3b421ae9 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Tue, 14 Jan 2025 11:50:06 +0000 Subject: [PATCH 064/310] fix a few things --- src/transformers/loc/concepts_transformer.py | 2 +- src/transformers/loc/locations_transformer.py | 5 +---- src/transformers/loc/names_transformer.py | 3 --- src/transformers/loc/raw_concept.py | 1 + 4 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index 981d08ddea..d9bb6d3fd9 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -25,7 +25,7 @@ def transform_node(self, raw_node: dict) -> SourceConcept | None: alternative_labels=raw_concept.alternative_labels, ) - def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan, SourceConceptRelatedTo]: + def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | SourceConceptRelatedTo]: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index b13044a492..7b0464bab3 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -25,7 +25,7 @@ def transform_node(self, raw_node: dict) -> SourceLocation | None: alternative_labels=raw_concept.alternative_labels, ) - def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan, SourceConceptRelatedTo]: + def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | SourceConceptRelatedTo]: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: @@ -37,9 +37,6 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan, ) for related_id in raw_concept.related_concept_ids: - print(SourceConceptRelatedTo( - from_id=raw_concept.source_id, to_id=related_id - )) yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id ) \ No newline at end of file diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index a440b5ae4d..11349ee692 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -33,9 +33,6 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptRelatedTo]: yield from () for related_id in raw_concept.related_concept_ids: - print(SourceConceptRelatedTo( - from_id=raw_concept.source_id, to_id=related_id - )) yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id ) diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index f23803cf02..c8a8a2d82b 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -3,6 +3,7 @@ ID_PREFIXES_TO_REMOVE = ( "/authorities/subjects/", "http://id.loc.gov/authorities/subjects/", + "http://id.loc.gov/authorities/names/", "/authorities/names/", ) From fb19d57bcbd6be5f9b2d8310f10e027efec98d44 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Tue, 14 Jan 2025 12:03:20 +0000 Subject: [PATCH 065/310] unused import and print --- 
src/transformers/loc/names_transformer.py | 1 - src/transformers/loc/raw_concept.py | 1 - 2 files changed, 2 deletions(-) diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 11349ee692..6289ba75dd 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -1,7 +1,6 @@ from collections.abc import Generator from models.graph_edge import SourceConceptRelatedTo -from models.graph_edge import BaseEdge from models.graph_node import SourceName from sources.gzip_source import GZipSource from transformers.base_transformer import BaseTransformer diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index c8a8a2d82b..e67884c016 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -126,7 +126,6 @@ def related_concept_ids(self) -> list[str]: # Some related concepts have IDs in the format `_:n`. # These IDs do not exist in the LoC source files or the LoC website, so we filter them out. if concept["@id"].startswith("_:n"): - print("concept has IDs in the format `_:n`") continue related_ids.append(self._remove_id_prefix(concept["@id"])) From 56bf8dd4e866a27f3f91b0dd44f756eae8c7eb7a Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Tue, 14 Jan 2025 14:43:37 +0000 Subject: [PATCH 066/310] return rather than yield from --- src/models/graph_edge.py | 1 - src/transformers/loc/concepts_transformer.py | 2 +- src/transformers/loc/locations_transformer.py | 2 +- src/transformers/loc/names_transformer.py | 2 +- src/transformers/loc/raw_concept.py | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/models/graph_edge.py b/src/models/graph_edge.py index a8bcb02f5e..7d9ae0aaef 100644 --- a/src/models/graph_edge.py +++ b/src/models/graph_edge.py @@ -10,7 +10,6 @@ class BaseEdge(BaseModel): directed: bool attributes: dict = {} - class SourceConceptNarrowerThan(BaseEdge): from_type: str = "SourceConcept" to_type: str = "SourceConcept" diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index d9bb6d3fd9..0719a97a06 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -29,7 +29,7 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: - yield from () + return for broader_id in raw_concept.broader_concept_ids: yield SourceConceptNarrowerThan( diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index 7b0464bab3..644d7cb09d 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -29,7 +29,7 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: - yield from () + return for broader_id in raw_concept.broader_concept_ids: yield SourceConceptNarrowerThan( diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 6289ba75dd..4036bdeec5 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -29,7 +29,7 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptRelatedTo]: raw_concept = RawLibraryOfCongressConcept(raw_node) if 
raw_concept.exclude() or not raw_concept.is_geographic: - yield from () + return for related_id in raw_concept.related_concept_ids: yield SourceConceptRelatedTo( diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index e67884c016..dbc8326ea0 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -3,8 +3,8 @@ ID_PREFIXES_TO_REMOVE = ( "/authorities/subjects/", "http://id.loc.gov/authorities/subjects/", - "http://id.loc.gov/authorities/names/", "/authorities/names/", + "http://id.loc.gov/authorities/names/", ) From 2311360a0318b0475dce78882600e9d34d759c3d Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Tue, 14 Jan 2025 16:39:41 +0000 Subject: [PATCH 067/310] one method for linked concepts --- src/transformers/loc/concepts_transformer.py | 6 +-- src/transformers/loc/locations_transformer.py | 4 +- src/transformers/loc/names_transformer.py | 2 +- src/transformers/loc/raw_concept.py | 42 +++++-------------- 4 files changed, 16 insertions(+), 38 deletions(-) diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index 0719a97a06..ecfef1ddfa 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -30,13 +30,13 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | if raw_concept.exclude() or raw_concept.is_geographic: return - - for broader_id in raw_concept.broader_concept_ids: + + for broader_id in raw_concept.linked_concepts_ids("broader"): yield SourceConceptNarrowerThan( from_id=raw_concept.source_id, to_id=broader_id ) - for related_id in raw_concept.related_concept_ids: + for related_id in raw_concept.linked_concepts_ids("related"): yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id ) diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index 644d7cb09d..a4856e76ed 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -31,12 +31,12 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | if raw_concept.exclude() or not raw_concept.is_geographic: return - for broader_id in raw_concept.broader_concept_ids: + for broader_id in raw_concept.linked_concepts_ids("broader"): yield SourceConceptNarrowerThan( from_id=raw_concept.source_id, to_id=broader_id ) - for related_id in raw_concept.related_concept_ids: + for related_id in raw_concept.linked_concepts_ids("related"): yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id ) \ No newline at end of file diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 4036bdeec5..4bf7622c5d 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -31,7 +31,7 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptRelatedTo]: if raw_concept.exclude() or not raw_concept.is_geographic: return - for related_id in raw_concept.related_concept_ids: + for related_id in raw_concept.linked_concepts_ids("related"): yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id ) diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index dbc8326ea0..e47a3ad631 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -89,47 +89,25 @@ def alternative_labels(self) -> list[str]: 
return [self._extract_label(raw_alternative_labels)] - @property - def broader_concept_ids(self) -> list[str]: - """Returns a list of IDs representing concepts which are broader than the current concept.""" + def linked_concepts_ids(self, sko_link: str) -> list[str]: assert self._raw_concept_node is not None - broader_concepts = self._raw_concept_node.get("skos:broader", []) + linked_concepts = self._raw_concept_node.get(f"skos:{sko_link}", []) - # Sometimes broader concepts are returned as a list of concepts, and sometimes as just a single JSON - if isinstance(broader_concepts, dict): - broader_concepts = [broader_concepts] + # Sometimes linked concepts are returned as a list of concepts, and sometimes as just a single JSON + if isinstance(linked_concepts, dict): + linked_concepts = [linked_concepts] - broader_ids = [] - for concept in broader_concepts: - # Some broader concepts have IDs in the format `_:n`. + linked_ids = [] + for concept in linked_concepts: + # Some linked concepts have IDs in the format `_:n`. # These IDs do not exist in the LoC source files or the LoC website, so we filter them out. if concept["@id"].startswith("_:n"): continue - broader_ids.append(self._remove_id_prefix(concept["@id"])) - - return broader_ids - - @property - def related_concept_ids(self) -> list[str]: - """Returns a list of IDs representing concepts which are related to the current concept.""" - assert self._raw_concept_node is not None - - related_concepts = self._raw_concept_node.get("skos:related", []) - # Sometimes related concepts are returned as a list of concepts, and sometimes as just a single JSON - if isinstance(related_concepts, dict): - related_concepts = [related_concepts] - - related_ids = [] - for concept in related_concepts: - # Some related concepts have IDs in the format `_:n`. - # These IDs do not exist in the LoC source files or the LoC website, so we filter them out. 
- if concept["@id"].startswith("_:n"): - continue + linked_ids.append(self._remove_id_prefix(concept["@id"])) - related_ids.append(self._remove_id_prefix(concept["@id"])) - return related_ids + return linked_ids @property def is_geographic(self) -> bool: From aa51e56f683b9eb3d2888f35998fafe0f47b0ada Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 14 Jan 2025 14:17:29 +0000 Subject: [PATCH 068/310] Add Wikidata locations and names & refactoring --- src/converters/cypher/base_converter.py | 5 ++ src/extractor.py | 2 +- src/sources/wikidata/concepts_source.py | 48 ---------- src/sources/wikidata/edges_source.py | 78 ++++++++++++++++ .../wikidata/linked_ontology_source.py | 90 +++++++++++++++++++ src/sources/wikidata/sparql_client.py | 6 +- src/sources/wikidata/sparql_query_builder.py | 72 ++++++++++----- src/transformers/base_transformer.py | 12 +-- src/transformers/create_transformer.py | 25 ++++-- .../wikidata/concepts_transformer.py | 6 +- .../wikidata/locations_transformer.py | 23 +++++ .../wikidata/names_transformer.py | 24 +++++ src/transformers/wikidata/raw_concept.py | 86 ++++++++++++------ src/utils/streaming.py | 11 +++ 14 files changed, 371 insertions(+), 117 deletions(-) delete mode 100644 src/sources/wikidata/concepts_source.py create mode 100644 src/sources/wikidata/edges_source.py create mode 100644 src/sources/wikidata/linked_ontology_source.py create mode 100644 src/transformers/wikidata/locations_transformer.py create mode 100644 src/transformers/wikidata/names_transformer.py create mode 100644 src/utils/streaming.py diff --git a/src/converters/cypher/base_converter.py b/src/converters/cypher/base_converter.py index 43f0b5f4ea..3bfc6e2139 100644 --- a/src/converters/cypher/base_converter.py +++ b/src/converters/cypher/base_converter.py @@ -15,6 +15,9 @@ def _convert_bool(self, raw_value: bool) -> str: def _convert_none(self) -> str: return "null" + def _convert_float(self, raw_value: float) -> float: + return raw_value + def _convert_list(self, raw_value: list[typing.Any]) -> str: # Neptune does not support lists, so we convert them to a single string with a `||` separator return self._raw_value_to_cypher_value("||".join(raw_value)) @@ -26,6 +29,8 @@ def _raw_value_to_cypher_value(self, raw_value: typing.Any) -> str: value = self._convert_bool(raw_value) elif isinstance(raw_value, list): value = self._convert_list(raw_value) + elif isinstance(raw_value, float): + value = self._convert_float(raw_value) elif raw_value is None: value = self._convert_none() else: diff --git a/src/extractor.py b/src/extractor.py index c5b3a6c79a..5963de4b5a 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -23,7 +23,7 @@ def handler( f"transformer and streaming them into {stream_destination}." 
) - transformer: BaseTransformer = create_transformer(transformer_type) + transformer: BaseTransformer = create_transformer(transformer_type, entity_type) if stream_destination == "graph": neptune_client = get_neptune_client(is_local) diff --git a/src/sources/wikidata/concepts_source.py b/src/sources/wikidata/concepts_source.py deleted file mode 100644 index 90832968ee..0000000000 --- a/src/sources/wikidata/concepts_source.py +++ /dev/null @@ -1,48 +0,0 @@ -from collections.abc import Generator - -from sources.base_source import BaseSource -from .sparql_client import WikidataSparqlClient -from .sparql_query_builder import SparqlQueryBuilder, NodeType, LinkedSource - -WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" - - -class WikidataConceptsSource(BaseSource): - def __init__(self, node_type: NodeType, linked_source: LinkedSource): - self.client = WikidataSparqlClient() - self.node_type = node_type - self.linked_source = linked_source - - def _get_all_wikidata_ids(self) -> list[str]: - """Returns the IDs of all Wikidata items which reference a Library of Congress ID. - There are currently about 1.6 million such items, and the query takes ~1 minute to run. - """ - - loc_ids_query = """ - SELECT DISTINCT ?item WHERE { - ?item p:P244 ?statement0. - ?statement0 ps:P244 _:anyValueP244. - } - """ - - items = self.client.run_query(loc_ids_query) - - raw_ids: list[str] = [item["item"]["value"] for item in items] - ids = [raw_id.removeprefix(WIKIDATA_ID_PREFIX) for raw_id in raw_ids] - - return ids - - def stream_raw(self) -> Generator[dict]: - all_ids = self._get_all_wikidata_ids() - - chunk_size = 300 - - for i in range(0, len(all_ids), chunk_size): - chunk = all_ids[i : i + chunk_size] - query = SparqlQueryBuilder.get_items_query( - chunk, self.node_type, self.linked_source - ) - items = self.client.run_query(query) - - for item in items: - yield item diff --git a/src/sources/wikidata/edges_source.py b/src/sources/wikidata/edges_source.py new file mode 100644 index 0000000000..e594fe14fc --- /dev/null +++ b/src/sources/wikidata/edges_source.py @@ -0,0 +1,78 @@ +from collections.abc import Generator + +from sources.base_source import BaseSource +from .sparql_client import WikidataSparqlClient +from .sparql_query_builder import SparqlQueryBuilder, NodeType, OntologyType + +import smart_open +import boto3 +import os + +WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" +S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] + + +def extract_wikidata_id(item: dict): + return item["item"]["value"].removeprefix(WIKIDATA_ID_PREFIX) + + +class WikidataEdgesSource(BaseSource): + """ + A source streaming selected Wikidata edges based on the selected linked ontology (LoC or MeSH) + and node type (concepts, locations, or names). For more information, see the `WikidataLinkedOntologySource` class. 
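+
+    Each streamed item is a plain mapping between a Wikidata id and the referenced
+    linked-ontology id, for example (illustrative identifiers only, not a real mapping):
+
+        {"wikidata_id": "Q84", "linked_id": "sh85000001"}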
+ """ + + def __init__(self, node_type: NodeType, linked_ontology: OntologyType): + self.client = WikidataSparqlClient() + self.node_type = node_type + self.linked_ontology = linked_ontology + + @staticmethod + def _get_linked_ontology_ids(node_type: NodeType, linked_ontology: OntologyType): + linked_nodes_file_name = f"{linked_ontology}_{node_type}__nodes.csv" + s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" + + ids = set() + transport_params = {"client": boto3.client("s3")} + with smart_open.open(s3_url, "r", transport_params=transport_params) as f: + for line in f: + ids.add(line.split(",")[0]) + + return ids + + def _get_linked_ontology_id_mapping(self) -> list[dict]: + """ + Return a list of _all_ Wikidata items referencing an id from another ontology (LoC or MeSH). Each returned + item is a dictionary containing a Wikidata id and the referenced id. + """ + + # Get all Wikidata items referencing an id from the selected linked ontology + ids_query = SparqlQueryBuilder.get_all_ids_query(self.linked_ontology) + items = self.client.run_query(ids_query) + + if self.node_type in ["concepts", "locations"]: + loc_ids = self._get_linked_ontology_ids( + self.node_type, self.linked_ontology + ) + filtered_items = [i for i in items if i["linkedId"]["value"] in loc_ids] + else: + loc_ids = self._get_linked_ontology_ids( + "concepts", self.linked_ontology + ) | self._get_linked_ontology_ids("locations", self.linked_ontology) + filtered_items = [i for i in items if i["linkedId"]["value"] not in loc_ids] + + ids = [] + for item in filtered_items: + linked_id = item["linkedId"]["value"] + wikidata_id = extract_wikidata_id(item) + ids.append({"wikidata_id": wikidata_id, "linked_id": linked_id}) + + print( + f"Found {len(ids)} Wikidata items referencing a {self.linked_ontology} id of node type {self.node_type}." + ) + return list(ids) + + def stream_raw(self) -> Generator[dict]: + ids = self._get_linked_ontology_id_mapping() + for item in ids: + yield item diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py new file mode 100644 index 0000000000..ca17105bdd --- /dev/null +++ b/src/sources/wikidata/linked_ontology_source.py @@ -0,0 +1,90 @@ +from collections.abc import Generator + +from sources.base_source import BaseSource +from .sparql_client import WikidataSparqlClient +from .sparql_query_builder import SparqlQueryBuilder, NodeType, OntologyType + +import concurrent.futures +from itertools import islice + +from .edges_source import WikidataEdgesSource +from utils.streaming import generator_to_chunks + + +SPARQL_ITEMS_CHUNK_SIZE = 300 + + +class WikidataLinkedOntologySource(BaseSource): + """ + A source streaming selected Wikidata nodes or edges based on the selected linked ontology (LoC or MeSH) + and node type (concepts, locations, or names). For example, if a combination of "LoC" and "locations" is selected, + only Wikidata items referencing LoC geographic nodes are streamed. + + Wikidata puts strict limits on the resources which can be consumed by a single query, and queries which include + filters or do other expensive processing often time out or return a stack overflow error. This means we need + to use a somewhat convoluted way for extracting the Wikidata nodes/edges we need. + + To extract nodes: + 1. Run a SPARQL query which retrieves _all_ Wikidata ids referencing an id from the selected linked ontology. + (WikidataEdgesSource is utilised to run the query.) + 2. 
Split the returned ids into chunks and run a SPARQL query for each chunk. The query retrieves all the node + properties we are interested in for each id in the chunk. + 3. Stream the returned items as usual. + + To extract edges (via the `WikidataEdgesSource` class): + 1. Run a SPARQL query which retrieves _all_ Wikidata items referencing an id from the selected linked ontology, + and returns mappings between Wikidata ids and ids from the linked ontology. + 2. Filter the returned id pairs to only include Wikidata ids corresponding to the selected node type + (i.e. concepts, locations, or names). + 3. Stream the filtered items as usual. + """ + + def __init__(self, node_type: NodeType, linked_ontology: OntologyType, entity_type): + self.client = WikidataSparqlClient() + self.node_type = node_type + self.linked_ontology = linked_ontology + self.entity_type = entity_type + self.edges_source = WikidataEdgesSource(node_type, linked_ontology) + + def _stream_wikidata_ids(self) -> Generator[dict]: + seen = set() + for item in self.edges_source.stream_raw(): + wikidata_id = item["wikidata_id"] + if wikidata_id in seen: + yield + else: + seen.add(wikidata_id) + yield wikidata_id + + def _stream_raw_nodes(self) -> Generator[dict]: + all_ids = self._stream_wikidata_ids() + chunks = generator_to_chunks(all_ids, SPARQL_ITEMS_CHUNK_SIZE) + + def run_query(chunk) -> list: + query = SparqlQueryBuilder.get_items_query(chunk, self.node_type) + return self.client.run_query(query) + + with concurrent.futures.ThreadPoolExecutor() as executor: + # Run the first 3 queries in parallel + futures = {executor.submit(run_query, chunk) for chunk in islice(chunks, 3)} + + while futures: + # Wait for one or more queries to complete + done, futures = concurrent.futures.wait( + futures, return_when=concurrent.futures.FIRST_COMPLETED + ) + + # Top up with new queries to keep the total number of parallel queries at 3 + for chunk in islice(chunks, len(done)): + futures.add(executor.submit(run_query, chunk)) + + for future in done: + items = future.result() + for item in items: + yield item + + def stream_raw(self) -> Generator[dict]: + if self.entity_type == "edges": + return self.edges_source.stream_raw() + + return self._stream_raw_nodes() diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index f74f63193e..a3a50f2b0b 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -4,13 +4,17 @@ class WikidataSparqlClient: @staticmethod def _get_user_agent_header(): - # https://foundation.wikimedia.org/wiki/Policy:Wikimedia_Foundation_User-Agent_Policy + """ + Return a User-Agent header value complying with Wikimedia's User-Agent policy: + https://foundation.wikimedia.org/wiki/Policy:Wikimedia_Foundation_User-Agent_Policy + """ return ( "WellcomeCollectionCatalogueGraphPipeline/0.1 (https://wellcomecollection.org/; " "digital@wellcomecollection.org) wellcome-collection-catalogue-graph/0.1" ) def run_query(self, query: str) -> list[dict]: + """Runs a query against Wikidata's SPARQL endpoint and returns the results as a list""" r = requests.get( "https://query.wikidata.org/sparql", params={"format": "json", "query": query}, diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index 668cbeba18..794d94a229 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -1,30 +1,37 @@ from typing import Literal NodeType = Literal["concepts", "names", 
"locations"] -LinkedSource = Literal["mesh", "loc"] +OntologyType = Literal["mesh", "loc"] + +# https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization +# "https://query.wikidata.org/bigdata/namespace/wdq/sparql?explain", class SparqlQueryBuilder: + """ + Contains various methods for constructing reusable SPARQL queries to run against Wikidata's SPARQL endpoint. + """ + @staticmethod - def _get_formatted_fields(node_type: NodeType, linked_source: LinkedSource): + def _get_formatted_fields(node_type: NodeType) -> str: + """ + Return the names of all fields to be retrieved via a SPARQL query based on node type. + """ fields = ["?item", "?itemLabel", "?itemDescription", "?itemAltLabel"] if node_type == "names": fields += ["?dateOfBirthLabel", "?dateOfDeathLabel", "?placeOfBirthLabel"] elif node_type == "locations": - fields += ["?coordinateLocation"] - - if linked_source == "loc": - fields.append("?libraryOfCongressId") - elif linked_source == "mesh": - fields.append("?meshId") + fields += ["?coordinates"] return " ".join(fields) @staticmethod - def _get_formatted_field_definitions( - node_type: NodeType, linked_source: Literal["mesh", "loc"] - ): + def _get_formatted_field_mappings(node_type: NodeType) -> str: + """ + Returns SPARQL field definitions, mapping field names specified + in the `_get_formatted_fields` method to Wikidata property codes. + """ definitions = [] if node_type == "names": @@ -34,29 +41,46 @@ def _get_formatted_field_definitions( "OPTIONAL { ?item wdt:P19 ?placeOfBirth. }", ] elif node_type == "locations": - definitions += ["OPTIONAL {{ ?item wdt:P625 ?coordinateLocation. }}"] - - if linked_source == "loc": - definitions.append("?item wdt:P244 ?libraryOfCongressId.") - elif linked_source == "mesh": - definitions.append("?item wdt:P486 ?meshId.") + definitions += [ + """ + { + SELECT ?item (SAMPLE(?coordinates) AS ?coordinates) { + ?item p:P625/ps:P625 ?coordinates. + } + GROUP BY ?item + } + """ + ] return "\n".join(definitions) + @staticmethod + def get_all_ids_query(linked_ontology: OntologyType) -> str: + if linked_ontology == "loc": + field_filter = "?item p:P244/ps:P244 ?linkedId." + elif linked_ontology == "mesh": + field_filter = "?item p:P486/ps:P486 ?linkedId." + else: + raise ValueError(f"Invalid linked ontology type: {linked_ontology}") + + get_ids_query = f""" + SELECT ?item ?linkedId WHERE {{ + {field_filter} + }} + """ + + return get_ids_query + @classmethod - def get_items_query( - cls, item_ids: list[str], node_type: NodeType, linked_source: LinkedSource - ): + def get_items_query(cls, item_ids: list[str], node_type: NodeType): ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in item_ids]) query = f""" - SELECT DISTINCT {cls._get_formatted_fields(node_type, linked_source)} + SELECT DISTINCT {cls._get_formatted_fields(node_type)} WHERE {{ SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". 
}} - VALUES ?item {{ {ids_clause} }} - - {cls._get_formatted_field_definitions(node_type, linked_source)} + {cls._get_formatted_field_mappings(node_type)} }} """ diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 96cd0b7490..c4c9f6d142 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -14,20 +14,12 @@ from query_builders.cypher import construct_upsert_cypher_query from sources.base_source import BaseSource from utils.aws import publish_batch_to_sns +from utils.streaming import generator_to_chunks EntityType = Literal["nodes", "edges"] StreamDestination = Literal["graph", "s3", "sns", "void"] -def _generator_to_chunks(items: Generator, chunk_size: int) -> Generator: - while True: - chunk = list(islice(items, chunk_size)) - if chunk: - yield chunk - else: - return - - class BaseTransformer: def __init__(self) -> None: self.source: BaseSource = BaseSource() @@ -107,7 +99,7 @@ def _stream_chunks( and returns the results stream in fixed-size chunks. """ entities = self._stream_entities(entity_type, sample_size) - for chunk in _generator_to_chunks(entities, chunk_size): + for chunk in generator_to_chunks(entities, chunk_size): yield chunk def stream_to_s3( diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py index 31e21b47ce..e471de228a 100644 --- a/src/transformers/create_transformer.py +++ b/src/transformers/create_transformer.py @@ -1,20 +1,31 @@ from typing import Literal -from .base_transformer import BaseTransformer +from .base_transformer import BaseTransformer, EntityType from .loc.concepts_transformer import LibraryOfCongressConceptsTransformer from .loc.locations_transformer import LibraryOfCongressLocationsTransformer from .loc.names_transformer import LibraryOfCongressNamesTransformer from .wikidata.concepts_transformer import WikidataConceptsTransformer +from .wikidata.locations_transformer import WikidataLocationsTransformer +from .wikidata.names_transformer import WikidataNamesTransformer LOC_SUBJECT_HEADINGS_URL = ( "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" ) LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" -TransformerType = Literal["loc_concepts", "loc_names", "loc_locations", "wikidata"] +TransformerType = Literal[ + "loc_concepts", + "loc_names", + "loc_locations", + "wikidata_concepts", + "wikidata_locations", + "wikidata_names", +] -def create_transformer(transformer_type: TransformerType) -> BaseTransformer: +def create_transformer( + transformer_type: TransformerType, entity_type: EntityType +) -> BaseTransformer: if transformer_type == "loc_concepts": return LibraryOfCongressConceptsTransformer(LOC_SUBJECT_HEADINGS_URL) if transformer_type == "loc_names": @@ -23,7 +34,11 @@ def create_transformer(transformer_type: TransformerType) -> BaseTransformer: return LibraryOfCongressLocationsTransformer( LOC_SUBJECT_HEADINGS_URL, LOC_NAMES_URL ) - if transformer_type == "wikidata": - return WikidataConceptsTransformer() + if transformer_type == "wikidata_concepts": + return WikidataConceptsTransformer(entity_type) + if transformer_type == "wikidata_locations": + return WikidataLocationsTransformer(entity_type) + if transformer_type == "wikidata_names": + return WikidataNamesTransformer(entity_type) raise ValueError(f"Unknown transformer type: {transformer_type}") diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index 
22da380335..2ffee6df04 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -1,13 +1,13 @@ from models.graph_node import SourceConcept -from sources.wikidata.concepts_source import WikidataConceptsSource +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource from transformers.base_transformer import BaseTransformer from .raw_concept import RawWikidataConcept class WikidataConceptsTransformer(BaseTransformer): - def __init__(self): - self.source = WikidataConceptsSource("concepts", "loc") + def __init__(self, entity_type): + self.source = WikidataLinkedOntologySource("concepts", "loc", entity_type) def transform_node(self, raw_node: dict) -> SourceConcept | None: raw_concept = RawWikidataConcept(raw_node) diff --git a/src/transformers/wikidata/locations_transformer.py b/src/transformers/wikidata/locations_transformer.py new file mode 100644 index 0000000000..6f021462c5 --- /dev/null +++ b/src/transformers/wikidata/locations_transformer.py @@ -0,0 +1,23 @@ +from models.graph_node import SourceLocation +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from transformers.base_transformer import BaseTransformer + +from .raw_concept import RawWikidataLocation + + +class WikidataLocationsTransformer(BaseTransformer): + def __init__(self, entity_type): + self.source = WikidataLinkedOntologySource("locations", "loc", entity_type) + + def transform_node(self, raw_node: dict) -> SourceLocation | None: + raw_concept = RawWikidataLocation(raw_node) + + return SourceLocation( + id=raw_concept.source_id, + label=raw_concept.label, + source=raw_concept.source, + alternative_labels=raw_concept.alternative_labels, + description=raw_concept.description, + latitude=raw_concept.latitude, + longitude=raw_concept.longitude, + ) diff --git a/src/transformers/wikidata/names_transformer.py b/src/transformers/wikidata/names_transformer.py new file mode 100644 index 0000000000..ce20cd2da9 --- /dev/null +++ b/src/transformers/wikidata/names_transformer.py @@ -0,0 +1,24 @@ +from models.graph_node import SourceName +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from transformers.base_transformer import BaseTransformer + +from .raw_concept import RawWikidataName + + +class WikidataNamesTransformer(BaseTransformer): + def __init__(self, entity_type): + self.source = WikidataLinkedOntologySource("names", "loc", entity_type) + + def transform_node(self, raw_node: dict) -> SourceName | None: + raw_concept = RawWikidataName(raw_node) + + return SourceName( + id=raw_concept.source_id, + label=raw_concept.label, + source=raw_concept.source, + alternative_labels=raw_concept.alternative_labels, + description=raw_concept.description, + date_of_birth=raw_concept.date_of_birth, + date_of_death=raw_concept.date_of_death, + place_of_birth=raw_concept.place_of_birth, + ) diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index 0a6d7c4a0c..0f32068517 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -1,24 +1,6 @@ from typing import Literal - -# { -# "country": {"type": "uri", "value": "http://www.wikidata.org/entity/Q183"}, -# "countryLabel": {"xml:lang": "en", "type": "literal", "value": "Germany"}, -# "countryDescription": { -# "xml:lang": "en", -# "type": "literal", -# "value": "country in Central Europe", -# }, -# "countryAltLabel": { -# "xml:lang": "en", -# "type": 
"literal", -# "value": "BR Deutschland, Bundesrepublik Deutschland, Deutschland, Federal Republic of Germany", -# }, -# "coordinate_location": { -# "datatype": "http://www.opengis.net/ont/geosparql#wktLiteral", -# "type": "literal", -# "value": "Point(9.83 53.54)", -# }, -# } +import re +from functools import lru_cache WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" @@ -27,14 +9,22 @@ class RawWikidataConcept: def __init__(self, raw_concept: dict): self.raw_concept = raw_concept - def _extract_english_field_value(self, field_name: str): + def _extract_field_value(self, field_name: str) -> str: field = self.raw_concept[field_name] - - # assert field["xml:lang"] == "en" - assert field["type"] == "literal" + assert field["type"] == "literal", self.raw_concept return field["value"] + def _extract_optional_field_value(self, field_name: str) -> str | None: + if field_name not in self.raw_concept: + return None + + return self._extract_field_value(field_name) + + def _extract_english_field_value(self, field_name: str): + assert self.raw_concept[field_name]["xml:lang"] == "en" + return self._extract_field_value(field_name) + @staticmethod def _remove_id_prefix(raw_id: str) -> str: return raw_id.removeprefix(WIKIDATA_ID_PREFIX) @@ -47,7 +37,8 @@ def source_id(self) -> str: @property def label(self) -> str: - return self._extract_english_field_value("itemLabel") + # TODO: Handle non-English labels + return self._extract_field_value("itemLabel") @property def alternative_labels(self) -> list[str]: @@ -68,3 +59,48 @@ def description(self) -> str | None: @property def source(self) -> Literal["wikidata"]: return "wikidata" + + +class RawWikidataLocation(RawWikidataConcept): + @lru_cache + def _get_coordinates(self) -> dict: + """Extracts coordinates from a raw string in the format `Point( )` (e.g. `Point(9.83 53.54)`)""" + # Some items do not return valid coordinates (e.g. Q17064702, whose coordinates just say 'unknown value' on the + # Wikidata website). When this happens, the 'type' of the 'coordinates' property always appears to be 'uri'. + if self.raw_concept["coordinates"]["type"] == "uri": + return {"longitude": None, "latitude": None} + + raw_coordinates = self._extract_field_value("coordinates") + + pattern = r"Point\((.*)\s(.*)\)" + matched_coordinates = re.search(pattern, raw_coordinates) + + assert ( + matched_coordinates is not None + ), f"Could not extract coordinates from raw value '{raw_coordinates}'. 
Wikidata id: {self.source_id}" + + longitude = float(matched_coordinates.group(1)) + latitude = float(matched_coordinates.group(2)) + return {"longitude": longitude, "latitude": latitude} + + @property + def longitude(self) -> float | None: + return self._get_coordinates()["longitude"] + + @property + def latitude(self) -> float | None: + return self._get_coordinates()["latitude"] + + +class RawWikidataName(RawWikidataConcept): + @property + def date_of_birth(self) -> str | None: + return self._extract_optional_field_value("dateOfBirthLabel") + + @property + def date_of_death(self) -> str | None: + return self._extract_optional_field_value("dateOfDeathLabel") + + @property + def place_of_birth(self) -> str | None: + return self._extract_optional_field_value("placeOfBirthLabel") diff --git a/src/utils/streaming.py b/src/utils/streaming.py new file mode 100644 index 0000000000..05854d4b5a --- /dev/null +++ b/src/utils/streaming.py @@ -0,0 +1,11 @@ +from collections.abc import Generator +from itertools import islice + + +def generator_to_chunks(items: Generator, chunk_size: int) -> Generator: + while True: + chunk = list(islice(items, chunk_size)) + if chunk: + yield chunk + else: + return From 56bdfee34ed03af30d66e8182e98ed12d5d7f460 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Tue, 14 Jan 2025 21:26:30 +0000 Subject: [PATCH 069/310] Add tree lookup to source --- src/sources/mesh/concepts_source.py | 30 +++++++++++++++++++++++------ src/utils/xml.py | 11 +++++++++++ 2 files changed, 35 insertions(+), 6 deletions(-) create mode 100644 src/utils/xml.py diff --git a/src/sources/mesh/concepts_source.py b/src/sources/mesh/concepts_source.py index 32581718a6..ff96604f6c 100644 --- a/src/sources/mesh/concepts_source.py +++ b/src/sources/mesh/concepts_source.py @@ -4,15 +4,33 @@ import requests from sources.base_source import BaseSource +from utils.xml import get_text class MeSHConceptsSource(BaseSource): def __init__(self, url: str): - self.url = url + self.mesh_data = self._get_mesh_data(url) + self.treenum_lookup = self._treenum_lookup() - def stream_raw(self) -> Generator[ET.Element]: - response = requests.get(self.url, stream=True) - response.raw.decode_content = True + @staticmethod + def _get_mesh_data(url: str) -> ET.Element: + response = requests.get(url) + return ET.fromstring(response.content) - events = ET.iterparse(response.raw) - return (elem for _, elem in events if elem.tag == "DescriptorRecord") + def _treenum_lookup(self) -> dict[str, str]: + """ + Creates lookup for MeSH tree numbers. This is needed to extract parent MeSH IDs, + which are not directly available in the XML DescriptorRecord. 
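+        In MeSH, a descriptor's parent in the tree is the descriptor filed under the tree number
+        obtained by dropping the final dot-separated segment (illustrative example: a term with
+        tree number "C04.588.33" would have its parent filed under "C04.588"), so this mapping is
+        sufficient for resolving parent descriptor IDs.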
+ """ + treenum_lookup = {} + for descriptor in self.mesh_data.findall("DescriptorRecord"): + desc_ui = descriptor.find("DescriptorUI") + for treenum in descriptor.findall("TreeNumberList//TreeNumber"): + treenum_lookup[get_text(treenum)] = get_text(desc_ui) + return treenum_lookup + + def stream_raw(self) -> Generator[tuple[ET.Element, dict[str,str]]]: + + for elem in self.mesh_data.iter(): + if elem.tag == "DescriptorRecord": + yield elem, self.treenum_lookup diff --git a/src/utils/xml.py b/src/utils/xml.py new file mode 100644 index 0000000000..a9a4c7caf8 --- /dev/null +++ b/src/utils/xml.py @@ -0,0 +1,11 @@ +from typing import Any +import xml.etree.ElementTree as ET + +def get_text(xml_element: Any) -> str: + """Returns the text content of the given XML element.""" + assert(isinstance(xml_element, ET.Element)) + + elem_text = xml_element.text + assert(isinstance(elem_text, str)) + + return elem_text From 787f189b10f77ba1fe3c5f452836cb5e4bd3c4ce Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 15 Jan 2025 09:19:18 +0000 Subject: [PATCH 070/310] keep it within raw_concept --- src/transformers/loc/concepts_transformer.py | 4 ++-- src/transformers/loc/locations_transformer.py | 10 ++++++++-- src/transformers/loc/names_transformer.py | 5 ++++- src/transformers/loc/raw_concept.py | 17 +++++++++++++++-- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index ecfef1ddfa..102568a75c 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -31,12 +31,12 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | if raw_concept.exclude() or raw_concept.is_geographic: return - for broader_id in raw_concept.linked_concepts_ids("broader"): + for broader_id in raw_concept.broader_concept_ids: yield SourceConceptNarrowerThan( from_id=raw_concept.source_id, to_id=broader_id ) - for related_id in raw_concept.linked_concepts_ids("related"): + for related_id in raw_concept.related_concept_ids: yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id ) diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index a4856e76ed..0ffb9437d4 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -31,12 +31,18 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | if raw_concept.exclude() or not raw_concept.is_geographic: return - for broader_id in raw_concept.linked_concepts_ids("broader"): + for broader_id in raw_concept.broader_concept_ids: + print(SourceConceptNarrowerThan( + from_id=raw_concept.source_id, to_id=broader_id + )) yield SourceConceptNarrowerThan( from_id=raw_concept.source_id, to_id=broader_id ) - for related_id in raw_concept.linked_concepts_ids("related"): + for related_id in raw_concept.related_concept_ids: + print(SourceConceptRelatedTo( + from_id=raw_concept.source_id, to_id=related_id + )) yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id ) \ No newline at end of file diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 4bf7622c5d..8c83f114f1 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -31,7 +31,10 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptRelatedTo]: if raw_concept.exclude() or 
not raw_concept.is_geographic: return - for related_id in raw_concept.linked_concepts_ids("related"): + for related_id in raw_concept.related_concept_ids: + print(SourceConceptRelatedTo( + from_id=raw_concept.source_id, to_id=related_id + )) yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id ) diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index e47a3ad631..3d41d2af47 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -88,8 +88,9 @@ def alternative_labels(self) -> list[str]: return [self._extract_label(item) for item in raw_alternative_labels] return [self._extract_label(raw_alternative_labels)] - + def linked_concepts_ids(self, sko_link: str) -> list[str]: + """Returns a list of IDs representing concepts which are linked to the current concept""" assert self._raw_concept_node is not None linked_concepts = self._raw_concept_node.get(f"skos:{sko_link}", []) @@ -108,7 +109,19 @@ def linked_concepts_ids(self, sko_link: str) -> list[str]: linked_ids.append(self._remove_id_prefix(concept["@id"])) return linked_ids - + + @property + def broader_concept_ids(self) -> list[str]: + """Returns a list of IDs representing concepts which are broader than the current concept.""" + sko_link_type = "broader" + return self.linked_concepts_ids(sko_link_type) + + @property + def related_concept_ids(self) -> list[str]: + """Returns a list of IDs representing concepts which are related to the current concept.""" + sko_link_type = "related" + return self.linked_concepts_ids(sko_link_type) + @property def is_geographic(self) -> bool: """Returns True if the node represents a geographic concept, as determined by `skos:notation`.""" From 2a2a67fad24f815b2f4f117f4f992596ba05fb49 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 15 Jan 2025 09:26:07 +0000 Subject: [PATCH 071/310] print be gone --- src/transformers/loc/locations_transformer.py | 6 ------ src/transformers/loc/names_transformer.py | 3 --- 2 files changed, 9 deletions(-) diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index 0ffb9437d4..644d7cb09d 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -32,17 +32,11 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | return for broader_id in raw_concept.broader_concept_ids: - print(SourceConceptNarrowerThan( - from_id=raw_concept.source_id, to_id=broader_id - )) yield SourceConceptNarrowerThan( from_id=raw_concept.source_id, to_id=broader_id ) for related_id in raw_concept.related_concept_ids: - print(SourceConceptRelatedTo( - from_id=raw_concept.source_id, to_id=related_id - )) yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id ) \ No newline at end of file diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 8c83f114f1..4036bdeec5 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -32,9 +32,6 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptRelatedTo]: return for related_id in raw_concept.related_concept_ids: - print(SourceConceptRelatedTo( - from_id=raw_concept.source_id, to_id=related_id - )) yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id ) From 14dc3db96f4124e4d622415d28f31b283c091aab Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 15 Jan 2025 
10:00:07 +0000 Subject: [PATCH 072/310] Add docstrings --- src/sources/mesh/concepts_source.py | 11 ++++++++--- src/utils/xml.py | 4 ++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/sources/mesh/concepts_source.py b/src/sources/mesh/concepts_source.py index ff96604f6c..b5ffe0abf6 100644 --- a/src/sources/mesh/concepts_source.py +++ b/src/sources/mesh/concepts_source.py @@ -4,7 +4,7 @@ import requests from sources.base_source import BaseSource -from utils.xml import get_text +from utils.xml import assert_get_text class MeSHConceptsSource(BaseSource): @@ -14,6 +14,11 @@ def __init__(self, url: str): @staticmethod def _get_mesh_data(url: str) -> ET.Element: + """ + Downloads MeSH data in full. Individual descriptor records cannot be streamed + because parent terms are not available directly. Instead, these need to be extracted + via a tree number lookup. + """ response = requests.get(url) return ET.fromstring(response.content) @@ -26,11 +31,11 @@ def _treenum_lookup(self) -> dict[str, str]: for descriptor in self.mesh_data.findall("DescriptorRecord"): desc_ui = descriptor.find("DescriptorUI") for treenum in descriptor.findall("TreeNumberList//TreeNumber"): - treenum_lookup[get_text(treenum)] = get_text(desc_ui) + treenum_lookup[assert_get_text(treenum)] = assert_get_text(desc_ui) return treenum_lookup def stream_raw(self) -> Generator[tuple[ET.Element, dict[str,str]]]: - + """Streams raw nodes together with tree number lookup.""" for elem in self.mesh_data.iter(): if elem.tag == "DescriptorRecord": yield elem, self.treenum_lookup diff --git a/src/utils/xml.py b/src/utils/xml.py index a9a4c7caf8..e66fb481d9 100644 --- a/src/utils/xml.py +++ b/src/utils/xml.py @@ -1,8 +1,8 @@ from typing import Any import xml.etree.ElementTree as ET -def get_text(xml_element: Any) -> str: - """Returns the text content of the given XML element.""" +def assert_get_text(xml_element: Any) -> str: + """Asserts that the given element is XML contatining text and returns text.""" assert(isinstance(xml_element, ET.Element)) elem_text = xml_element.text From 792dcfbbb147eed5cb61756298f8189b64fa0980 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 15 Jan 2025 10:23:20 +0000 Subject: [PATCH 073/310] Extract parent terms via tree numbers --- src/transformers/mesh/concepts_transformer.py | 4 +- .../mesh/locations_transformer.py | 4 +- src/transformers/mesh/raw_concept.py | 70 +++++++++---------- 3 files changed, 37 insertions(+), 41 deletions(-) diff --git a/src/transformers/mesh/concepts_transformer.py b/src/transformers/mesh/concepts_transformer.py index e5cb6a3f02..cf61634e3e 100644 --- a/src/transformers/mesh/concepts_transformer.py +++ b/src/transformers/mesh/concepts_transformer.py @@ -13,7 +13,7 @@ class MeSHConceptsTransformer(BaseTransformer): def __init__(self, url: str): self.source = MeSHConceptsSource(url) - def transform_node(self, raw_node: ET.Element) -> SourceConcept | None: + def transform_node(self, raw_node: tuple[ET.Element, dict[str,str]]) -> SourceConcept | None: raw_concept = RawMeSHConcept(raw_node) if raw_concept.is_geographic: @@ -29,7 +29,7 @@ def transform_node(self, raw_node: ET.Element) -> SourceConcept | None: ) def extract_edges( - self, raw_node: ET.Element + self, raw_node: tuple[ET.Element, dict[str,str]] ) -> Generator[SourceConceptHasParent | SourceConceptRelatedTo]: raw_concept = RawMeSHConcept(raw_node) diff --git a/src/transformers/mesh/locations_transformer.py b/src/transformers/mesh/locations_transformer.py index 02922775e1..f5cb4e5dc4 100644 --- 
a/src/transformers/mesh/locations_transformer.py +++ b/src/transformers/mesh/locations_transformer.py @@ -13,7 +13,7 @@ class MeSHLocationsTransformer(BaseTransformer): def __init__(self, url: str): self.source = MeSHConceptsSource(url) - def transform_node(self, raw_node: ET.Element) -> SourceConcept | None: + def transform_node(self, raw_node: tuple[ET.Element, dict[str,str]]) -> SourceConcept | None: raw_concept = RawMeSHConcept(raw_node) if not raw_concept.is_geographic: @@ -28,6 +28,6 @@ def transform_node(self, raw_node: ET.Element) -> SourceConcept | None: description=raw_concept.description, ) - def extract_edges(self, raw_node: ET.Element) -> Generator[BaseEdge]: + def extract_edges(self, raw_node: tuple[ET.Element, dict[str,str]]) -> Generator[BaseEdge]: """There are no edges to extract from MeSH Locations.""" yield from () diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index de6a8d7b54..f81325699e 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -3,12 +3,15 @@ import requests +from utils.xml import assert_get_text + ID_PREFIX = "http://id.nlm.nih.gov/mesh/" class RawMeSHConcept: - def __init__(self, raw_concept: ET.Element): - self.raw_concept = raw_concept + def __init__(self, raw_concept: tuple[ET.Element, dict[str,str]]): + self.raw_concept = raw_concept[0] + self.treenum_lookup = raw_concept[1] self.source: Literal["nlm-mesh"] = "nlm-mesh" @staticmethod @@ -19,28 +22,22 @@ def _remove_id_prefix(raw_id: str) -> str: @property def source_id(self) -> str: """Returns MeSH descriptor (unique ID).""" - desc_elem = self.raw_concept.find("DescriptorUI") - - assert isinstance(desc_elem, ET.Element) - descriptor = desc_elem.text - assert isinstance(descriptor, str) + descriptor_elem = self.raw_concept.find("DescriptorUI") + source_id = assert_get_text(descriptor_elem) - return descriptor + return source_id @property def label(self) -> str: """Returns the concept label.""" label_elem = self.raw_concept.find("DescriptorName//String") - - assert isinstance(label_elem, ET.Element) - label = label_elem.text - assert isinstance(label, str) + label = assert_get_text(label_elem) return label @property def alternative_labels(self) -> list[str]: - """Returns a list of alternative labels for the concept.""" + """Returns a list of alternative labels for the concept, if available.""" altern_labels = [] for altern_concept in self.raw_concept.findall( @@ -49,8 +46,8 @@ def alternative_labels(self) -> list[str]: altern_label_elem = altern_concept.find("ConceptName//String") if isinstance(altern_label_elem, ET.Element): altern_label = altern_label_elem.text - assert isinstance(altern_label, str) - altern_labels.append(altern_label) + if isinstance(altern_label, str): + altern_labels.append(altern_label) return altern_labels @@ -58,11 +55,9 @@ def alternative_labels(self) -> list[str]: def alternative_ids(self) -> list[str]: """Returns a list of MeSH tree numbers for the concept.""" treenums = [] + for treenum_elem in self.raw_concept.findall("TreeNumberList//TreeNumber"): - if isinstance(treenum_elem, ET.Element): - treenum = treenum_elem.text - assert isinstance(treenum, str) - treenums.append(treenum) + treenums.append(assert_get_text(treenum_elem)) return treenums @@ -80,23 +75,27 @@ def description(self) -> str | None: return scope_note @staticmethod - def fetch_mesh(source_id: str) -> Any: - """Fetches JSON containing RDF data for a given MeSH concept.""" - - response = 
requests.get(f"https://id.nlm.nih.gov/mesh/{source_id}.json") - return response.json() + def _get_parent_treenum(treenum: str) -> str: + """ + Extracts the parent tree number by removing all digits after + the child tree number's last "." + """ + parent_treenum = treenum.split(".")[:-1] + return ".".join(parent_treenum) @property def parent_concept_ids(self) -> list[str]: - """Extracts parent MeSH descriptors from JSON.""" - - mesh_data = self.fetch_mesh(self.source_id) - broader_desc = mesh_data.get("broaderDescriptor", []) - - if not isinstance(broader_desc, list): - broader_desc = [broader_desc] - - return [self._remove_id_prefix(desc) for desc in broader_desc] + """Extracts parent MeSH descriptors from tree number lookup.""" + parent_source_ids = set() + + for treenum in self.alternative_ids: + # Make sure the child tree number is not at the top level of the hierarchy + if "." in treenum: + parent_treenum = self._get_parent_treenum(treenum) + parent_source_id = self.treenum_lookup[parent_treenum] + parent_source_ids.add(parent_source_id) + + return list(parent_source_ids) @property def related_concept_ids(self) -> list[str]: @@ -106,10 +105,7 @@ def related_concept_ids(self) -> list[str]: for desc_elem in self.raw_concept.findall( "SeeRelatedDescriptor//DescriptorReferredTo//DescriptorUI" ): - if isinstance(desc_elem, ET.Element): - desc = desc_elem.text - assert isinstance(desc, str) - related_descriptors.append(desc) + related_descriptors.append(assert_get_text(desc_elem)) return related_descriptors From 1170e5e8598d572ee23d9980d4394b84f5624271 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 15 Jan 2025 10:54:14 +0000 Subject: [PATCH 074/310] Expand docstring on parent ID extraction --- src/transformers/mesh/raw_concept.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index f81325699e..cda4015a88 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -85,7 +85,12 @@ def _get_parent_treenum(treenum: str) -> str: @property def parent_concept_ids(self) -> list[str]: - """Extracts parent MeSH descriptors from tree number lookup.""" + """ + Extracts parent MeSH descriptors from the tree number lookup. + This is possible because each concept's MeSH tree number encodes + its hierarchical relationships, e.g.: The parent tree number + of a MeSH term with tree number "A10.690.552.500" is "A10.690.552" + """ parent_source_ids = set() for treenum in self.alternative_ids: From 4d6a35ea85c14c5fc5ec1baa3c7d398979ecdc9c Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Wed, 15 Jan 2025 10:55:37 +0000 Subject: [PATCH 075/310] Apply auto-formatting rules --- src/sources/mesh/concepts_source.py | 4 ++-- src/transformers/mesh/concepts_transformer.py | 6 ++++-- src/transformers/mesh/locations_transformer.py | 8 ++++++-- src/transformers/mesh/raw_concept.py | 6 +++--- src/utils/xml.py | 11 ++++++----- 5 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/sources/mesh/concepts_source.py b/src/sources/mesh/concepts_source.py index b5ffe0abf6..6c0e6ac3ce 100644 --- a/src/sources/mesh/concepts_source.py +++ b/src/sources/mesh/concepts_source.py @@ -24,7 +24,7 @@ def _get_mesh_data(url: str) -> ET.Element: def _treenum_lookup(self) -> dict[str, str]: """ - Creates lookup for MeSH tree numbers. This is needed to extract parent MeSH IDs, + Creates lookup for MeSH tree numbers. 
This is needed to extract parent MeSH IDs, which are not directly available in the XML DescriptorRecord. """ treenum_lookup = {} @@ -34,7 +34,7 @@ def _treenum_lookup(self) -> dict[str, str]: treenum_lookup[assert_get_text(treenum)] = assert_get_text(desc_ui) return treenum_lookup - def stream_raw(self) -> Generator[tuple[ET.Element, dict[str,str]]]: + def stream_raw(self) -> Generator[tuple[ET.Element, dict[str, str]]]: """Streams raw nodes together with tree number lookup.""" for elem in self.mesh_data.iter(): if elem.tag == "DescriptorRecord": diff --git a/src/transformers/mesh/concepts_transformer.py b/src/transformers/mesh/concepts_transformer.py index cf61634e3e..417e87668c 100644 --- a/src/transformers/mesh/concepts_transformer.py +++ b/src/transformers/mesh/concepts_transformer.py @@ -13,7 +13,9 @@ class MeSHConceptsTransformer(BaseTransformer): def __init__(self, url: str): self.source = MeSHConceptsSource(url) - def transform_node(self, raw_node: tuple[ET.Element, dict[str,str]]) -> SourceConcept | None: + def transform_node( + self, raw_node: tuple[ET.Element, dict[str, str]] + ) -> SourceConcept | None: raw_concept = RawMeSHConcept(raw_node) if raw_concept.is_geographic: @@ -29,7 +31,7 @@ def transform_node(self, raw_node: tuple[ET.Element, dict[str,str]]) -> SourceCo ) def extract_edges( - self, raw_node: tuple[ET.Element, dict[str,str]] + self, raw_node: tuple[ET.Element, dict[str, str]] ) -> Generator[SourceConceptHasParent | SourceConceptRelatedTo]: raw_concept = RawMeSHConcept(raw_node) diff --git a/src/transformers/mesh/locations_transformer.py b/src/transformers/mesh/locations_transformer.py index f5cb4e5dc4..64ec78d130 100644 --- a/src/transformers/mesh/locations_transformer.py +++ b/src/transformers/mesh/locations_transformer.py @@ -13,7 +13,9 @@ class MeSHLocationsTransformer(BaseTransformer): def __init__(self, url: str): self.source = MeSHConceptsSource(url) - def transform_node(self, raw_node: tuple[ET.Element, dict[str,str]]) -> SourceConcept | None: + def transform_node( + self, raw_node: tuple[ET.Element, dict[str, str]] + ) -> SourceConcept | None: raw_concept = RawMeSHConcept(raw_node) if not raw_concept.is_geographic: @@ -28,6 +30,8 @@ def transform_node(self, raw_node: tuple[ET.Element, dict[str,str]]) -> SourceCo description=raw_concept.description, ) - def extract_edges(self, raw_node: tuple[ET.Element, dict[str,str]]) -> Generator[BaseEdge]: + def extract_edges( + self, raw_node: tuple[ET.Element, dict[str, str]] + ) -> Generator[BaseEdge]: """There are no edges to extract from MeSH Locations.""" yield from () diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index cda4015a88..754ef191f0 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -9,7 +9,7 @@ class RawMeSHConcept: - def __init__(self, raw_concept: tuple[ET.Element, dict[str,str]]): + def __init__(self, raw_concept: tuple[ET.Element, dict[str, str]]): self.raw_concept = raw_concept[0] self.treenum_lookup = raw_concept[1] self.source: Literal["nlm-mesh"] = "nlm-mesh" @@ -88,7 +88,7 @@ def parent_concept_ids(self) -> list[str]: """ Extracts parent MeSH descriptors from the tree number lookup. 
This is possible because each concept's MeSH tree number encodes - its hierarchical relationships, e.g.: The parent tree number + its hierarchical relationships, e.g.: The parent tree number of a MeSH term with tree number "A10.690.552.500" is "A10.690.552" """ parent_source_ids = set() @@ -99,7 +99,7 @@ def parent_concept_ids(self) -> list[str]: parent_treenum = self._get_parent_treenum(treenum) parent_source_id = self.treenum_lookup[parent_treenum] parent_source_ids.add(parent_source_id) - + return list(parent_source_ids) @property diff --git a/src/utils/xml.py b/src/utils/xml.py index e66fb481d9..eaf2399574 100644 --- a/src/utils/xml.py +++ b/src/utils/xml.py @@ -1,11 +1,12 @@ -from typing import Any import xml.etree.ElementTree as ET +from typing import Any + def assert_get_text(xml_element: Any) -> str: """Asserts that the given element is XML contatining text and returns text.""" - assert(isinstance(xml_element, ET.Element)) - + assert isinstance(xml_element, ET.Element) + elem_text = xml_element.text - assert(isinstance(elem_text, str)) - + assert isinstance(elem_text, str) + return elem_text From 6df00cf0845a4e1ec084e709616a941bad8b204e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Wed, 15 Jan 2025 11:27:23 +0000 Subject: [PATCH 076/310] Validate Wikidata dates --- src/models/graph_node.py | 10 ++++++---- src/transformers/wikidata/raw_concept.py | 14 ++++++++++++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/models/graph_node.py b/src/models/graph_node.py index d1d7241a45..364fcc140c 100644 --- a/src/models/graph_node.py +++ b/src/models/graph_node.py @@ -1,7 +1,9 @@ -import datetime from typing import Literal, Optional -from pydantic import BaseModel +from pydantic import BaseModel, constr + +# Matches a Wikidata date, such as 1976-01-01T00:00:00Z or -0005-12-12T00:00:00Z +WIKIDATA_DATE_PATTERN = r"-?\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ" # Each node must have a label and an id @@ -30,8 +32,8 @@ class SourceLocation(SourceConcept): # Represents a LoC or Wikidata name. Inherits all fields from SourceConcept, plus other optional fields. class SourceName(SourceConcept): - date_of_birth: Optional[datetime.date] = None - date_of_death: Optional[datetime.date] = None + date_of_birth: Optional[constr(pattern=WIKIDATA_DATE_PATTERN)] + date_of_death: Optional[constr(pattern=WIKIDATA_DATE_PATTERN)] place_of_birth: Optional[str] = None diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index 0f32068517..485ce4b0bc 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -93,13 +93,23 @@ def latitude(self) -> float | None: class RawWikidataName(RawWikidataConcept): + def _extract_date(self, field_name: str): + date_value = self._extract_optional_field_value(field_name) + + # When a date is unknown, sometimes Wikidata returns a URL instead of a valid date, such as + # 'http://www.wikidata.org/.well-known/genid/42feb541ed97156abba749622d33f2d9'. When this happens, return None. 
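+        # Anything returned here is validated downstream against WIKIDATA_DATE_PATTERN in
+        # graph_node.py (e.g. "1976-01-01T00:00:00Z").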
+ if date_value is None or date_value.startswith("http"): + return None + + return date_value + @property def date_of_birth(self) -> str | None: - return self._extract_optional_field_value("dateOfBirthLabel") + return self._extract_date("dateOfBirthLabel") @property def date_of_death(self) -> str | None: - return self._extract_optional_field_value("dateOfDeathLabel") + return self._extract_date("dateOfDeathLabel") @property def place_of_birth(self) -> str | None: From a6a07b4304f201ba387cb9bf877f4daaf1089e3f Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 15 Jan 2025 19:53:48 +0000 Subject: [PATCH 077/310] Remove obsolete ID prefix --- src/sources/mesh/concepts_source.py | 2 ++ src/transformers/mesh/raw_concept.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sources/mesh/concepts_source.py b/src/sources/mesh/concepts_source.py index 6c0e6ac3ce..e7fb85b7e9 100644 --- a/src/sources/mesh/concepts_source.py +++ b/src/sources/mesh/concepts_source.py @@ -28,10 +28,12 @@ def _treenum_lookup(self) -> dict[str, str]: which are not directly available in the XML DescriptorRecord. """ treenum_lookup = {} + for descriptor in self.mesh_data.findall("DescriptorRecord"): desc_ui = descriptor.find("DescriptorUI") for treenum in descriptor.findall("TreeNumberList//TreeNumber"): treenum_lookup[assert_get_text(treenum)] = assert_get_text(desc_ui) + return treenum_lookup def stream_raw(self) -> Generator[tuple[ET.Element, dict[str, str]]]: diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index 754ef191f0..0b322513a3 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -5,8 +5,6 @@ from utils.xml import assert_get_text -ID_PREFIX = "http://id.nlm.nih.gov/mesh/" - class RawMeSHConcept: def __init__(self, raw_concept: tuple[ET.Element, dict[str, str]]): From ebed3a388dc850dbebeb25dde36e449ae5394370 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 15 Jan 2025 19:57:46 +0000 Subject: [PATCH 078/310] Remove obsolete method and import --- src/transformers/mesh/raw_concept.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index 0b322513a3..95629ba239 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -1,8 +1,6 @@ import xml.etree.ElementTree as ET from typing import Any, Literal -import requests - from utils.xml import assert_get_text @@ -12,11 +10,6 @@ def __init__(self, raw_concept: tuple[ET.Element, dict[str, str]]): self.treenum_lookup = raw_concept[1] self.source: Literal["nlm-mesh"] = "nlm-mesh" - @staticmethod - def _remove_id_prefix(raw_id: str) -> str: - """Removes prefix from MeSH descriptor (only present in extra JSON).""" - return raw_id.removeprefix(ID_PREFIX) - @property def source_id(self) -> str: """Returns MeSH descriptor (unique ID).""" From 7dd6b6877ea9c90ae6d797782042163bc2b62973 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Wed, 15 Jan 2025 19:58:58 +0000 Subject: [PATCH 079/310] Apply auto-formatting rules --- src/sources/mesh/concepts_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sources/mesh/concepts_source.py b/src/sources/mesh/concepts_source.py index e7fb85b7e9..6d5387a9f9 100644 --- a/src/sources/mesh/concepts_source.py +++ b/src/sources/mesh/concepts_source.py @@ -33,7 +33,7 @@ def _treenum_lookup(self) -> dict[str, str]: desc_ui = descriptor.find("DescriptorUI") 
for treenum in descriptor.findall("TreeNumberList//TreeNumber"): treenum_lookup[assert_get_text(treenum)] = assert_get_text(desc_ui) - + return treenum_lookup def stream_raw(self) -> Generator[tuple[ET.Element, dict[str, str]]]: From 767bff48d7e01639b2e6f003d44dbf04055d4418 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 16 Jan 2025 10:44:02 +0000 Subject: [PATCH 080/310] Remove obsolete import --- src/sources/base_source.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sources/base_source.py b/src/sources/base_source.py index 9397e075aa..ebfaaf3f85 100644 --- a/src/sources/base_source.py +++ b/src/sources/base_source.py @@ -1,4 +1,3 @@ -import xml.etree.ElementTree as ET from collections.abc import Generator from typing import Any From f89d60f5b03deb9d902b3e46ee745d13ed725587 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 16 Jan 2025 10:47:30 +0000 Subject: [PATCH 081/310] Switch to return --- src/transformers/mesh/concepts_transformer.py | 2 +- src/transformers/mesh/locations_transformer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/mesh/concepts_transformer.py b/src/transformers/mesh/concepts_transformer.py index 417e87668c..236c1cbc95 100644 --- a/src/transformers/mesh/concepts_transformer.py +++ b/src/transformers/mesh/concepts_transformer.py @@ -36,7 +36,7 @@ def extract_edges( raw_concept = RawMeSHConcept(raw_node) if raw_concept.is_geographic: - yield from () + return for parent_id in raw_concept.parent_concept_ids: yield SourceConceptHasParent(from_id=raw_concept.source_id, to_id=parent_id) diff --git a/src/transformers/mesh/locations_transformer.py b/src/transformers/mesh/locations_transformer.py index 64ec78d130..0c062a8d4c 100644 --- a/src/transformers/mesh/locations_transformer.py +++ b/src/transformers/mesh/locations_transformer.py @@ -34,4 +34,4 @@ def extract_edges( self, raw_node: tuple[ET.Element, dict[str, str]] ) -> Generator[BaseEdge]: """There are no edges to extract from MeSH Locations.""" - yield from () + return From 944cbc536111e3d1ad5b55ee0dfc00c30d8d05ba Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 16 Jan 2025 11:11:49 +0000 Subject: [PATCH 082/310] Add reverse RELATED_TO edge --- src/transformers/mesh/concepts_transformer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/mesh/concepts_transformer.py b/src/transformers/mesh/concepts_transformer.py index 236c1cbc95..800415a836 100644 --- a/src/transformers/mesh/concepts_transformer.py +++ b/src/transformers/mesh/concepts_transformer.py @@ -45,3 +45,6 @@ def extract_edges( yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id ) + yield SourceConceptRelatedTo( + from_id=related_id, to_id=raw_concept.source_id + ) From 0458e59992e2f9f2b2391f8b8e395374ec12e2b3 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Thu, 16 Jan 2025 11:16:07 +0000 Subject: [PATCH 083/310] related_to goes both ways --- src/transformers/loc/concepts_transformer.py | 3 +++ src/transformers/loc/locations_transformer.py | 3 +++ src/transformers/loc/names_transformer.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index 102568a75c..cc52b5f3ca 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -40,3 +40,6 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | yield SourceConceptRelatedTo( from_id=raw_concept.source_id, 
to_id=related_id ) + yield SourceConceptRelatedTo( + from_id=related_id, to_id=raw_concept.source_id + ) diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index 644d7cb09d..7e18556897 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -39,4 +39,7 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | for related_id in raw_concept.related_concept_ids: yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id + ) + yield SourceConceptRelatedTo( + from_id=related_id, to_id=raw_concept.source_id ) \ No newline at end of file diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 4036bdeec5..fee3d66259 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -35,3 +35,6 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptRelatedTo]: yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id ) + yield SourceConceptRelatedTo( + from_id=related_id, to_id=raw_concept.source_id + ) From fd33407e31045a88aabb3f03fef6aaac55e7e3f1 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Thu, 16 Jan 2025 11:17:27 +0000 Subject: [PATCH 084/310] whoops update the model to match --- src/models/graph_edge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/graph_edge.py b/src/models/graph_edge.py index 7d9ae0aaef..f0f2e779db 100644 --- a/src/models/graph_edge.py +++ b/src/models/graph_edge.py @@ -20,4 +20,4 @@ class SourceConceptRelatedTo(BaseEdge): from_type: str = "SourceConcept" to_type: str = "SourceConcept" relationship: str = "RELATED_TO" - directed: bool = True \ No newline at end of file + directed: bool = False \ No newline at end of file From 0a9c4fc37cdf4ceab560032d15bd2b4b9b6bac15 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 16 Jan 2025 11:22:23 +0000 Subject: [PATCH 085/310] Add type alias for MeSH node --- src/sources/mesh/concepts_source.py | 5 ++++- src/transformers/mesh/concepts_transformer.py | 5 +++-- src/transformers/mesh/locations_transformer.py | 5 +++-- src/transformers/mesh/raw_concept.py | 3 ++- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/sources/mesh/concepts_source.py b/src/sources/mesh/concepts_source.py index 6d5387a9f9..5b909305f1 100644 --- a/src/sources/mesh/concepts_source.py +++ b/src/sources/mesh/concepts_source.py @@ -7,6 +7,9 @@ from utils.xml import assert_get_text +RawMeshNode = tuple[ET.Element, dict[str, str]] + + class MeSHConceptsSource(BaseSource): def __init__(self, url: str): self.mesh_data = self._get_mesh_data(url) @@ -36,7 +39,7 @@ def _treenum_lookup(self) -> dict[str, str]: return treenum_lookup - def stream_raw(self) -> Generator[tuple[ET.Element, dict[str, str]]]: + def stream_raw(self) -> Generator[RawMeshNode]: """Streams raw nodes together with tree number lookup.""" for elem in self.mesh_data.iter(): if elem.tag == "DescriptorRecord": diff --git a/src/transformers/mesh/concepts_transformer.py b/src/transformers/mesh/concepts_transformer.py index 800415a836..4e4d877655 100644 --- a/src/transformers/mesh/concepts_transformer.py +++ b/src/transformers/mesh/concepts_transformer.py @@ -7,6 +7,7 @@ from transformers.base_transformer import BaseTransformer from .raw_concept import RawMeSHConcept +from sources.mesh.concepts_source import RawMeshNode class MeSHConceptsTransformer(BaseTransformer): @@ 
-14,7 +15,7 @@ def __init__(self, url: str): self.source = MeSHConceptsSource(url) def transform_node( - self, raw_node: tuple[ET.Element, dict[str, str]] + self, raw_node: RawMeshNode ) -> SourceConcept | None: raw_concept = RawMeSHConcept(raw_node) @@ -31,7 +32,7 @@ def transform_node( ) def extract_edges( - self, raw_node: tuple[ET.Element, dict[str, str]] + self, raw_node: RawMeshNode ) -> Generator[SourceConceptHasParent | SourceConceptRelatedTo]: raw_concept = RawMeSHConcept(raw_node) diff --git a/src/transformers/mesh/locations_transformer.py b/src/transformers/mesh/locations_transformer.py index 0c062a8d4c..46a746ff8f 100644 --- a/src/transformers/mesh/locations_transformer.py +++ b/src/transformers/mesh/locations_transformer.py @@ -7,6 +7,7 @@ from transformers.base_transformer import BaseTransformer from .raw_concept import RawMeSHConcept +from sources.mesh.concepts_source import RawMeshNode class MeSHLocationsTransformer(BaseTransformer): @@ -14,7 +15,7 @@ def __init__(self, url: str): self.source = MeSHConceptsSource(url) def transform_node( - self, raw_node: tuple[ET.Element, dict[str, str]] + self, raw_node: RawMeshNode ) -> SourceConcept | None: raw_concept = RawMeSHConcept(raw_node) @@ -31,7 +32,7 @@ def transform_node( ) def extract_edges( - self, raw_node: tuple[ET.Element, dict[str, str]] + self, raw_node: RawMeshNode ) -> Generator[BaseEdge]: """There are no edges to extract from MeSH Locations.""" return diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index 95629ba239..59d30b2b15 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -1,11 +1,12 @@ import xml.etree.ElementTree as ET from typing import Any, Literal +from sources.mesh.concepts_source import RawMeshNode from utils.xml import assert_get_text class RawMeSHConcept: - def __init__(self, raw_concept: tuple[ET.Element, dict[str, str]]): + def __init__(self, raw_concept: RawMeshNode): self.raw_concept = raw_concept[0] self.treenum_lookup = raw_concept[1] self.source: Literal["nlm-mesh"] = "nlm-mesh" From 4a03502f168675d5d0cd56297944b87d1865ab3a Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Thu, 16 Jan 2025 11:22:44 +0000 Subject: [PATCH 086/310] changes to satisfy Terraform v1.10.1 --- terraform/terraform.tf | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/terraform/terraform.tf b/terraform/terraform.tf index b7f6f11b4f..481ad60153 100644 --- a/terraform/terraform.tf +++ b/terraform/terraform.tf @@ -2,8 +2,9 @@ terraform { required_version = ">= 0.11" backend "s3" { - role_arn = "arn:aws:iam::760097843905:role/platform-developer" - + assume_role = { + role_arn = "arn:aws:iam::760097843905:role/platform-developer" + } bucket = "wellcomecollection-platform-infra" key = "terraform/catalogue/graph.tfstate" dynamodb_table = "terraform-locktable" @@ -15,7 +16,9 @@ data "terraform_remote_state" "aws_account_infrastructure" { backend = "s3" config = { + assume_role = { role_arn = "arn:aws:iam::760097843905:role/platform-read_only" + } bucket = "wellcomecollection-platform-infra" key = "terraform/aws-account-infrastructure/platform.tfstate" region = "eu-west-1" From c09e1bbe06ad2a310470a6ce4ba759425f22b483 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Thu, 16 Jan 2025 11:24:49 +0000 Subject: [PATCH 087/310] indents --- src/transformers/loc/concepts_transformer.py | 2 +- terraform/terraform.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/loc/concepts_transformer.py 
b/src/transformers/loc/concepts_transformer.py index cc52b5f3ca..2c54794e65 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -30,7 +30,7 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | if raw_concept.exclude() or raw_concept.is_geographic: return - + for broader_id in raw_concept.broader_concept_ids: yield SourceConceptNarrowerThan( from_id=raw_concept.source_id, to_id=broader_id diff --git a/terraform/terraform.tf b/terraform/terraform.tf index 481ad60153..b9972b9a6d 100644 --- a/terraform/terraform.tf +++ b/terraform/terraform.tf @@ -17,7 +17,7 @@ data "terraform_remote_state" "aws_account_infrastructure" { config = { assume_role = { - role_arn = "arn:aws:iam::760097843905:role/platform-read_only" + role_arn = "arn:aws:iam::760097843905:role/platform-read_only" } bucket = "wellcomecollection-platform-infra" key = "terraform/aws-account-infrastructure/platform.tfstate" From 3f83eb18f99bd2fed23b51a02685d3e267e20f5c Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Thu, 16 Jan 2025 11:26:14 +0000 Subject: [PATCH 088/310] Apply auto-formatting rules --- src/models/graph_edge.py | 4 +++- src/transformers/loc/concepts_transformer.py | 4 +++- src/transformers/loc/locations_transformer.py | 8 +++++--- src/transformers/loc/names_transformer.py | 2 +- src/transformers/loc/raw_concept.py | 8 ++++---- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/models/graph_edge.py b/src/models/graph_edge.py index f0f2e779db..85b1211db4 100644 --- a/src/models/graph_edge.py +++ b/src/models/graph_edge.py @@ -10,14 +10,16 @@ class BaseEdge(BaseModel): directed: bool attributes: dict = {} + class SourceConceptNarrowerThan(BaseEdge): from_type: str = "SourceConcept" to_type: str = "SourceConcept" relationship: str = "NARROWER_THAN" directed: bool = True + class SourceConceptRelatedTo(BaseEdge): from_type: str = "SourceConcept" to_type: str = "SourceConcept" relationship: str = "RELATED_TO" - directed: bool = False \ No newline at end of file + directed: bool = False diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index 2c54794e65..c277e025fb 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -25,7 +25,9 @@ def transform_node(self, raw_node: dict) -> SourceConcept | None: alternative_labels=raw_concept.alternative_labels, ) - def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | SourceConceptRelatedTo]: + def extract_edges( + self, raw_node: dict + ) -> Generator[SourceConceptNarrowerThan | SourceConceptRelatedTo]: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index 7e18556897..f1053d242f 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -25,7 +25,9 @@ def transform_node(self, raw_node: dict) -> SourceLocation | None: alternative_labels=raw_concept.alternative_labels, ) - def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | SourceConceptRelatedTo]: + def extract_edges( + self, raw_node: dict + ) -> Generator[SourceConceptNarrowerThan | SourceConceptRelatedTo]: raw_concept = RawLibraryOfCongressConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: @@ 
-35,11 +37,11 @@ def extract_edges(self, raw_node: dict) -> Generator[SourceConceptNarrowerThan | yield SourceConceptNarrowerThan( from_id=raw_concept.source_id, to_id=broader_id ) - + for related_id in raw_concept.related_concept_ids: yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id ) yield SourceConceptRelatedTo( from_id=related_id, to_id=raw_concept.source_id - ) \ No newline at end of file + ) diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index fee3d66259..c6f01bc5cf 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -27,7 +27,7 @@ def transform_node(self, raw_node: dict) -> SourceName | None: def extract_edges(self, raw_node: dict) -> Generator[SourceConceptRelatedTo]: raw_concept = RawLibraryOfCongressConcept(raw_node) - + if raw_concept.exclude() or not raw_concept.is_geographic: return diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index 3d41d2af47..4f2f26769d 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -88,7 +88,7 @@ def alternative_labels(self) -> list[str]: return [self._extract_label(item) for item in raw_alternative_labels] return [self._extract_label(raw_alternative_labels)] - + def linked_concepts_ids(self, sko_link: str) -> list[str]: """Returns a list of IDs representing concepts which are linked to the current concept""" assert self._raw_concept_node is not None @@ -109,19 +109,19 @@ def linked_concepts_ids(self, sko_link: str) -> list[str]: linked_ids.append(self._remove_id_prefix(concept["@id"])) return linked_ids - + @property def broader_concept_ids(self) -> list[str]: """Returns a list of IDs representing concepts which are broader than the current concept.""" sko_link_type = "broader" return self.linked_concepts_ids(sko_link_type) - + @property def related_concept_ids(self) -> list[str]: """Returns a list of IDs representing concepts which are related to the current concept.""" sko_link_type = "related" return self.linked_concepts_ids(sko_link_type) - + @property def is_geographic(self) -> bool: """Returns True if the node represents a geographic concept, as determined by `skos:notation`.""" From 0574f0cf211286fefeed8a6f5fbb0972b76e0339 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 16 Jan 2025 11:27:08 +0000 Subject: [PATCH 089/310] Switch to return --- src/transformers/mesh/locations_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/mesh/locations_transformer.py b/src/transformers/mesh/locations_transformer.py index 46a746ff8f..7e532a34c3 100644 --- a/src/transformers/mesh/locations_transformer.py +++ b/src/transformers/mesh/locations_transformer.py @@ -35,4 +35,4 @@ def extract_edges( self, raw_node: RawMeshNode ) -> Generator[BaseEdge]: """There are no edges to extract from MeSH Locations.""" - return + yield from () From 055822195ae16abc17f4d7bcdf6bdc39cf3898cf Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Thu, 16 Jan 2025 11:35:21 +0000 Subject: [PATCH 090/310] Apply auto-formatting rules --- src/sources/mesh/concepts_source.py | 1 - src/transformers/mesh/concepts_transformer.py | 7 ++----- src/transformers/mesh/locations_transformer.py | 11 +++-------- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/src/sources/mesh/concepts_source.py b/src/sources/mesh/concepts_source.py index 5b909305f1..f8a17ac454 100644 --- 
a/src/sources/mesh/concepts_source.py +++ b/src/sources/mesh/concepts_source.py @@ -6,7 +6,6 @@ from sources.base_source import BaseSource from utils.xml import assert_get_text - RawMeshNode = tuple[ET.Element, dict[str, str]] diff --git a/src/transformers/mesh/concepts_transformer.py b/src/transformers/mesh/concepts_transformer.py index 4e4d877655..54c0849dd7 100644 --- a/src/transformers/mesh/concepts_transformer.py +++ b/src/transformers/mesh/concepts_transformer.py @@ -3,20 +3,17 @@ from models.graph_edge import SourceConceptHasParent, SourceConceptRelatedTo from models.graph_node import SourceConcept -from sources.mesh.concepts_source import MeSHConceptsSource +from sources.mesh.concepts_source import MeSHConceptsSource, RawMeshNode from transformers.base_transformer import BaseTransformer from .raw_concept import RawMeSHConcept -from sources.mesh.concepts_source import RawMeshNode class MeSHConceptsTransformer(BaseTransformer): def __init__(self, url: str): self.source = MeSHConceptsSource(url) - def transform_node( - self, raw_node: RawMeshNode - ) -> SourceConcept | None: + def transform_node(self, raw_node: RawMeshNode) -> SourceConcept | None: raw_concept = RawMeSHConcept(raw_node) if raw_concept.is_geographic: diff --git a/src/transformers/mesh/locations_transformer.py b/src/transformers/mesh/locations_transformer.py index 7e532a34c3..6d48ad4b82 100644 --- a/src/transformers/mesh/locations_transformer.py +++ b/src/transformers/mesh/locations_transformer.py @@ -3,20 +3,17 @@ from models.graph_edge import BaseEdge from models.graph_node import SourceConcept -from sources.mesh.concepts_source import MeSHConceptsSource +from sources.mesh.concepts_source import MeSHConceptsSource, RawMeshNode from transformers.base_transformer import BaseTransformer from .raw_concept import RawMeSHConcept -from sources.mesh.concepts_source import RawMeshNode class MeSHLocationsTransformer(BaseTransformer): def __init__(self, url: str): self.source = MeSHConceptsSource(url) - def transform_node( - self, raw_node: RawMeshNode - ) -> SourceConcept | None: + def transform_node(self, raw_node: RawMeshNode) -> SourceConcept | None: raw_concept = RawMeSHConcept(raw_node) if not raw_concept.is_geographic: @@ -31,8 +28,6 @@ def transform_node( description=raw_concept.description, ) - def extract_edges( - self, raw_node: RawMeshNode - ) -> Generator[BaseEdge]: + def extract_edges(self, raw_node: RawMeshNode) -> Generator[BaseEdge]: """There are no edges to extract from MeSH Locations.""" yield from () From 4659323dd5bb387088df782cc56c6fe14d5a0601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Wed, 15 Jan 2025 17:16:29 +0000 Subject: [PATCH 091/310] Refactoring --- src/models/graph_edge.py | 8 +- src/sources/wikidata/edges_source.py | 78 ----------- .../wikidata/linked_ontology_source.py | 131 +++++++++++++----- src/sources/wikidata/sparql_query_builder.py | 38 ++++- .../wikidata/concepts_transformer.py | 14 ++ .../wikidata/locations_transformer.py | 4 +- .../wikidata/names_transformer.py | 4 +- src/utils/streaming.py | 46 +++++- 8 files changed, 198 insertions(+), 125 deletions(-) delete mode 100644 src/sources/wikidata/edges_source.py diff --git a/src/models/graph_edge.py b/src/models/graph_edge.py index 1c352fa48a..82180353da 100644 --- a/src/models/graph_edge.py +++ b/src/models/graph_edge.py @@ -7,7 +7,6 @@ class BaseEdge(BaseModel): from_id: str to_id: str relationship: str - directed: bool attributes: dict = {} @@ -15,4 +14,9 @@ class SourceConceptNarrowerThan(BaseEdge): 
from_type: str = "SourceConcept" to_type: str = "SourceConcept" relationship: str = "NARROWER_THAN" - directed: bool = True + + +class SourceConceptSameAs(BaseEdge): + from_type: str = "SourceConcept" + to_type: str = "SourceConcept" + relationship: str = "SAME_AS" diff --git a/src/sources/wikidata/edges_source.py b/src/sources/wikidata/edges_source.py deleted file mode 100644 index e594fe14fc..0000000000 --- a/src/sources/wikidata/edges_source.py +++ /dev/null @@ -1,78 +0,0 @@ -from collections.abc import Generator - -from sources.base_source import BaseSource -from .sparql_client import WikidataSparqlClient -from .sparql_query_builder import SparqlQueryBuilder, NodeType, OntologyType - -import smart_open -import boto3 -import os - -WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" -S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] - - -def extract_wikidata_id(item: dict): - return item["item"]["value"].removeprefix(WIKIDATA_ID_PREFIX) - - -class WikidataEdgesSource(BaseSource): - """ - A source streaming selected Wikidata edges based on the selected linked ontology (LoC or MeSH) - and node type (concepts, locations, or names). For more information, see the `WikidataLinkedOntologySource` class. - """ - - def __init__(self, node_type: NodeType, linked_ontology: OntologyType): - self.client = WikidataSparqlClient() - self.node_type = node_type - self.linked_ontology = linked_ontology - - @staticmethod - def _get_linked_ontology_ids(node_type: NodeType, linked_ontology: OntologyType): - linked_nodes_file_name = f"{linked_ontology}_{node_type}__nodes.csv" - s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" - - ids = set() - transport_params = {"client": boto3.client("s3")} - with smart_open.open(s3_url, "r", transport_params=transport_params) as f: - for line in f: - ids.add(line.split(",")[0]) - - return ids - - def _get_linked_ontology_id_mapping(self) -> list[dict]: - """ - Return a list of _all_ Wikidata items referencing an id from another ontology (LoC or MeSH). Each returned - item is a dictionary containing a Wikidata id and the referenced id. - """ - - # Get all Wikidata items referencing an id from the selected linked ontology - ids_query = SparqlQueryBuilder.get_all_ids_query(self.linked_ontology) - items = self.client.run_query(ids_query) - - if self.node_type in ["concepts", "locations"]: - loc_ids = self._get_linked_ontology_ids( - self.node_type, self.linked_ontology - ) - filtered_items = [i for i in items if i["linkedId"]["value"] in loc_ids] - else: - loc_ids = self._get_linked_ontology_ids( - "concepts", self.linked_ontology - ) | self._get_linked_ontology_ids("locations", self.linked_ontology) - filtered_items = [i for i in items if i["linkedId"]["value"] not in loc_ids] - - ids = [] - for item in filtered_items: - linked_id = item["linkedId"]["value"] - wikidata_id = extract_wikidata_id(item) - ids.append({"wikidata_id": wikidata_id, "linked_id": linked_id}) - - print( - f"Found {len(ids)} Wikidata items referencing a {self.linked_ontology} id of node type {self.node_type}." 
- ) - return list(ids) - - def stream_raw(self) -> Generator[dict]: - ids = self._get_linked_ontology_id_mapping() - for item in ids: - yield item diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index ca17105bdd..70764c7e63 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -3,21 +3,30 @@ from sources.base_source import BaseSource from .sparql_client import WikidataSparqlClient from .sparql_query_builder import SparqlQueryBuilder, NodeType, OntologyType +import smart_open -import concurrent.futures -from itertools import islice +from utils.streaming import process_stream_in_parallel +import os +from functools import lru_cache +import boto3 -from .edges_source import WikidataEdgesSource -from utils.streaming import generator_to_chunks +SPARQL_ITEMS_CHUNK_SIZE = 400 +MAX_PARALLEL_SPARQL_QUERIES = 3 -SPARQL_ITEMS_CHUNK_SIZE = 300 +S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] +WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" + + +def extract_wikidata_id(item: dict): + return item["item"]["value"].removeprefix(WIKIDATA_ID_PREFIX) class WikidataLinkedOntologySource(BaseSource): """ - A source streaming selected Wikidata nodes or edges based on the selected linked ontology (LoC or MeSH) - and node type (concepts, locations, or names). For example, if a combination of "LoC" and "locations" is selected, + A source for streaming selected Wikidata nodes/edges. There are _many_ Wikidata items, so we cannot store all of + them in the graph. Instead, we only include items which reference an id from a selected linked ontology + (LoC or MeSH). For example, if a combination of "LoC" and "locations" is selected, only Wikidata items referencing LoC geographic nodes are streamed. Wikidata puts strict limits on the resources which can be consumed by a single query, and queries which include @@ -44,47 +53,95 @@ def __init__(self, node_type: NodeType, linked_ontology: OntologyType, entity_ty self.node_type = node_type self.linked_ontology = linked_ontology self.entity_type = entity_type - self.edges_source = WikidataEdgesSource(node_type, linked_ontology) + + @lru_cache + def _get_linked_ontology_ids(self, node_type: NodeType): + linked_nodes_file_name = f"{self.linked_ontology}_{node_type}__nodes.csv" + s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" + + ids = set() + transport_params = {"client": boto3.client("s3")} + with smart_open.open(s3_url, "r", transport_params=transport_params) as f: + for line in f: + ids.add(line.split(",")[0]) + + return ids + + def _linked_id_exists_in_selected_node_type(self, linked_id: str): + if self.linked_ontology == "mesh": + return True + elif self.linked_ontology == "loc": + if self.node_type in ["concepts", "locations"]: + return linked_id in self._get_linked_ontology_ids(self.node_type) + elif self.node_type == "names": + location_ids = self._get_linked_ontology_ids("locations") + + if linked_id not in location_ids and linked_id[0] == "n": + return True + else: + raise ValueError(f"Invalid node type: {self.linked_ontology}") + else: + raise ValueError(f"Invalid linked ontology {self.linked_ontology}") + + def _stream_raw_edges(self) -> Generator[dict]: + # First, get the ids of _all_ Wikidata items which reference an id from the selected linked ontology + ids_query = SparqlQueryBuilder.get_all_ids_query(self.linked_ontology) + id_items = self.client.run_query(ids_query) + + # Deduplicate. 
(We could deduplicate as part of the SPARQL query via the 'DISTINCT' keyword, + # but that would make the query significantly slower. It's faster to deduplicate here.) + all_ids = iter(set(extract_wikidata_id(item) for item in id_items)) + + def get_linked_ids(ids_chunk: list[str]) -> list: + query = SparqlQueryBuilder.get_linked_ids_query( + ids_chunk, self.linked_ontology + ) + return self.client.run_query(query) + + # Split ids into chunks. For each chunk, run a separate SPARQL query to retrieve a mapping between Wikidata ids + # and ids from the linked ontology. (We could run a SPARQL query to get _all_ mappings at once, but this query + # is not reliable - sometimes it times out or returns invalid JSON. Getting the mappings in chunks is much + # slower, but it works every time.) + for raw_mapping in process_stream_in_parallel( + all_ids, + get_linked_ids, + SPARQL_ITEMS_CHUNK_SIZE, + MAX_PARALLEL_SPARQL_QUERIES, + ): + linked_id = raw_mapping["linkedId"]["value"] + wikidata_id = extract_wikidata_id(raw_mapping) + mapping = {"wikidata_id": wikidata_id, "linked_id": linked_id} + + # Only yield the mapping if the linked id corresponds to the selected `node_type`. For example, if we + # want to stream Wikidata 'locations' edges but the referenced LoC id is a 'names' id, we skip it. + if self._linked_id_exists_in_selected_node_type(mapping["linked_id"]): + yield mapping def _stream_wikidata_ids(self) -> Generator[dict]: + """Streams filtered edges using the `_stream_raw_edges` method and extracts Wikidata ids from them.""" seen = set() - for item in self.edges_source.stream_raw(): + for item in self._stream_raw_edges(): wikidata_id = item["wikidata_id"] - if wikidata_id in seen: - yield - else: + if wikidata_id not in seen: seen.add(wikidata_id) yield wikidata_id def _stream_raw_nodes(self) -> Generator[dict]: - all_ids = self._stream_wikidata_ids() - chunks = generator_to_chunks(all_ids, SPARQL_ITEMS_CHUNK_SIZE) - - def run_query(chunk) -> list: + def get_linked_items(chunk) -> list: query = SparqlQueryBuilder.get_items_query(chunk, self.node_type) return self.client.run_query(query) - with concurrent.futures.ThreadPoolExecutor() as executor: - # Run the first 3 queries in parallel - futures = {executor.submit(run_query, chunk) for chunk in islice(chunks, 3)} - - while futures: - # Wait for one or more queries to complete - done, futures = concurrent.futures.wait( - futures, return_when=concurrent.futures.FIRST_COMPLETED - ) - - # Top up with new queries to keep the total number of parallel queries at 3 - for chunk in islice(chunks, len(done)): - futures.add(executor.submit(run_query, chunk)) + all_ids = self._stream_wikidata_ids() - for future in done: - items = future.result() - for item in items: - yield item + yield from process_stream_in_parallel( + all_ids, + get_linked_items, + SPARQL_ITEMS_CHUNK_SIZE, + MAX_PARALLEL_SPARQL_QUERIES, + ) def stream_raw(self) -> Generator[dict]: - if self.entity_type == "edges": - return self.edges_source.stream_raw() - - return self._stream_raw_nodes() + if self.entity_type == "nodes": + return self._stream_raw_nodes() + elif self.entity_type == "edges": + return self._stream_raw_edges() diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index 794d94a229..4a932b5746 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -56,15 +56,18 @@ def _get_formatted_field_mappings(node_type: NodeType) -> str: @staticmethod def 
get_all_ids_query(linked_ontology: OntologyType) -> str: + """ + Return a query to retrieve the ids of _all_ Wikidata items referencing an id from the `linked_ontology`. + """ if linked_ontology == "loc": - field_filter = "?item p:P244/ps:P244 ?linkedId." + field_filter = "?item wdt:P244 _:anyValueP244." elif linked_ontology == "mesh": - field_filter = "?item p:P486/ps:P486 ?linkedId." + field_filter = "?item wdt:P486 _:anyValueP486." else: raise ValueError(f"Invalid linked ontology type: {linked_ontology}") get_ids_query = f""" - SELECT ?item ?linkedId WHERE {{ + SELECT ?item WHERE {{ {field_filter} }} """ @@ -73,6 +76,10 @@ def get_all_ids_query(linked_ontology: OntologyType) -> str: @classmethod def get_items_query(cls, item_ids: list[str], node_type: NodeType): + """ + Given a list of Wikidata `item_ids`, return a query to retrieve all required Wikidata fields for each id + in the list. + """ ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in item_ids]) query = f""" @@ -85,3 +92,28 @@ def get_items_query(cls, item_ids: list[str], node_type: NodeType): """ return query + + @classmethod + def get_linked_ids_query(cls, item_ids: list[str], linked_ontology: OntologyType): + """ + Given a list of Wikidata `item_ids`, return a query to retrieve all linked ontology ids referenced by each + item in the list. + """ + if linked_ontology == "loc": + field_filter = "?item p:P244/ps:P244 ?linkedId." + elif linked_ontology == "mesh": + field_filter = "?item p:P486/ps:P486 ?linkedId." + else: + raise ValueError(f"Invalid linked ontology type: {linked_ontology}") + + ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in item_ids]) + + query = f""" + SELECT DISTINCT ?item ?linkedId + WHERE {{ + VALUES ?item {{ {ids_clause} }} + {field_filter} + }} + """ + + return query diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index 2ffee6df04..fc94d8b51a 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -1,6 +1,8 @@ from models.graph_node import SourceConcept +from models.graph_edge import SourceConceptSameAs from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource from transformers.base_transformer import BaseTransformer +from collections.abc import Generator from .raw_concept import RawWikidataConcept @@ -19,3 +21,15 @@ def transform_node(self, raw_node: dict) -> SourceConcept | None: alternative_labels=raw_concept.alternative_labels, description=raw_concept.description, ) + + def extract_edges(self, raw_edge: dict) -> Generator[SourceConceptSameAs]: + yield SourceConceptSameAs( + from_id=raw_edge["linked_id"], + to_id=raw_edge["wikidata_id"], + attributes={"source": "wikidata"}, + ) + yield SourceConceptSameAs( + from_id=raw_edge["wikidata_id"], + to_id=raw_edge["linked_id"], + attributes={"source": "wikidata"}, + ) diff --git a/src/transformers/wikidata/locations_transformer.py b/src/transformers/wikidata/locations_transformer.py index 6f021462c5..8d343ca266 100644 --- a/src/transformers/wikidata/locations_transformer.py +++ b/src/transformers/wikidata/locations_transformer.py @@ -1,11 +1,11 @@ from models.graph_node import SourceLocation from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource -from transformers.base_transformer import BaseTransformer +from .concepts_transformer import WikidataConceptsTransformer from .raw_concept import RawWikidataLocation -class 
WikidataLocationsTransformer(BaseTransformer): +class WikidataLocationsTransformer(WikidataConceptsTransformer): def __init__(self, entity_type): self.source = WikidataLinkedOntologySource("locations", "loc", entity_type) diff --git a/src/transformers/wikidata/names_transformer.py b/src/transformers/wikidata/names_transformer.py index ce20cd2da9..3d3dcdfc6a 100644 --- a/src/transformers/wikidata/names_transformer.py +++ b/src/transformers/wikidata/names_transformer.py @@ -1,11 +1,11 @@ from models.graph_node import SourceName from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource -from transformers.base_transformer import BaseTransformer +from .concepts_transformer import WikidataConceptsTransformer from .raw_concept import RawWikidataName -class WikidataNamesTransformer(BaseTransformer): +class WikidataNamesTransformer(WikidataConceptsTransformer): def __init__(self, entity_type): self.source = WikidataLinkedOntologySource("names", "loc", entity_type) diff --git a/src/utils/streaming.py b/src/utils/streaming.py index 05854d4b5a..c096e8ee26 100644 --- a/src/utils/streaming.py +++ b/src/utils/streaming.py @@ -1,11 +1,55 @@ from collections.abc import Generator from itertools import islice +from typing import Callable, TypeVar, Any +import concurrent.futures +T = TypeVar("T") +S = TypeVar("S") -def generator_to_chunks(items: Generator, chunk_size: int) -> Generator: + +def generator_to_chunks(items: Generator[Any], chunk_size: int) -> Generator[list]: + """ + Split items in a generator into chunks of size `chunk_size` and return another generator yielding the chunks + one by one. + """ while True: chunk = list(islice(items, chunk_size)) if chunk: yield chunk else: return + + +def process_stream_in_parallel( + stream: Generator[T], + process: Callable[[list[T]], list[S]], + chunk_size: int, + thread_count: int, +) -> Generator[S]: + """ + Consume items from `stream` in chunks of size `chunk_size`. Apply the `process` function to each chunk in a new + thread. Keep the number of parallel threads under `thread_count`. Return a single generator streaming the processed + items. 
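+    For example, with chunk_size=400 and thread_count=3, at most three chunks of up to 400 items
+    each are being processed at any one time.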
+ """ + chunks = generator_to_chunks(stream, chunk_size) + + with concurrent.futures.ThreadPoolExecutor() as executor: + # Run the first `thread_count` threads in parallel + futures = { + executor.submit(process, chunk) for chunk in islice(chunks, thread_count) + } + + while futures: + # Wait for one or more threads to complete + done, futures = concurrent.futures.wait( + futures, return_when=concurrent.futures.FIRST_COMPLETED + ) + + # Top up with new queries to keep the total number of parallel threads at `thread_count` + for chunk in islice(chunks, len(done)): + futures.add(executor.submit(process, chunk)) + + for future in done: + items = future.result() + for item in items: + yield item From 89d74fbb86e782dfdc56eb1ee0ab086534ade59e Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 16 Jan 2025 19:58:17 +0000 Subject: [PATCH 092/310] Add MeSH to state machine and update locations label --- src/transformers/mesh/locations_transformer.py | 6 +++--- terraform/variables.tf | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/transformers/mesh/locations_transformer.py b/src/transformers/mesh/locations_transformer.py index 6d48ad4b82..13786fb9ce 100644 --- a/src/transformers/mesh/locations_transformer.py +++ b/src/transformers/mesh/locations_transformer.py @@ -2,7 +2,7 @@ from collections.abc import Generator from models.graph_edge import BaseEdge -from models.graph_node import SourceConcept +from models.graph_node import SourceLocation from sources.mesh.concepts_source import MeSHConceptsSource, RawMeshNode from transformers.base_transformer import BaseTransformer @@ -13,13 +13,13 @@ class MeSHLocationsTransformer(BaseTransformer): def __init__(self, url: str): self.source = MeSHConceptsSource(url) - def transform_node(self, raw_node: RawMeshNode) -> SourceConcept | None: + def transform_node(self, raw_node: RawMeshNode) -> SourceLocation | None: raw_concept = RawMeSHConcept(raw_node) if not raw_concept.is_geographic: return None - return SourceConcept( + return SourceLocation( id=raw_concept.source_id, label=raw_concept.label, source=raw_concept.source, diff --git a/terraform/variables.tf b/terraform/variables.tf index 3729976c46..d36bce0b48 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -28,5 +28,20 @@ variable "state_machine_inputs" { "transformer_type" : "loc_locations", "entity_type" : "edges" }, + { + "label" : "MeSH Concept Nodes", + "transformer_type" : "mesh_concepts", + "entity_type" : "nodes" + }, + { + "label" : "MeSH Location Nodes", + "transformer_type" : "mesh_locations", + "entity_type" : "nodes" + }, + { + "label" : "MeSH Concept Edges", + "transformer_type" : "mesh_concepts", + "entity_type" : "edges" + } ] } From e8c1614c7cf7a81123879b4361daefb3828f236a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 16 Jan 2025 15:52:57 +0000 Subject: [PATCH 093/310] Fix type annotations --- src/converters/cypher/base_converter.py | 4 +-- src/models/graph_node.py | 9 ++--- .../wikidata/linked_ontology_source.py | 33 +++++++++++-------- src/sources/wikidata/sparql_client.py | 5 +-- src/sources/wikidata/sparql_query_builder.py | 6 ++-- .../wikidata/concepts_transformer.py | 4 +-- .../wikidata/locations_transformer.py | 3 +- .../wikidata/names_transformer.py | 3 +- src/transformers/wikidata/raw_concept.py | 7 ++-- src/utils/streaming.py | 6 ++-- 10 files changed, 47 insertions(+), 33 deletions(-) diff --git a/src/converters/cypher/base_converter.py 
b/src/converters/cypher/base_converter.py index 3bfc6e2139..bb18a7a6ec 100644 --- a/src/converters/cypher/base_converter.py +++ b/src/converters/cypher/base_converter.py @@ -15,8 +15,8 @@ def _convert_bool(self, raw_value: bool) -> str: def _convert_none(self) -> str: return "null" - def _convert_float(self, raw_value: float) -> float: - return raw_value + def _convert_float(self, raw_value: float) -> str: + return str(raw_value) def _convert_list(self, raw_value: list[typing.Any]) -> str: # Neptune does not support lists, so we convert them to a single string with a `||` separator diff --git a/src/models/graph_node.py b/src/models/graph_node.py index 364fcc140c..a6e2e33dfe 100644 --- a/src/models/graph_node.py +++ b/src/models/graph_node.py @@ -1,9 +1,10 @@ -from typing import Literal, Optional +from typing import Literal, Optional, Annotated -from pydantic import BaseModel, constr +from pydantic import BaseModel, StringConstraints # Matches a Wikidata date, such as 1976-01-01T00:00:00Z or -0005-12-12T00:00:00Z WIKIDATA_DATE_PATTERN = r"-?\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ" +FormattedDateString = Annotated[str, StringConstraints(pattern=WIKIDATA_DATE_PATTERN)] # Each node must have a label and an id @@ -32,8 +33,8 @@ class SourceLocation(SourceConcept): # Represents a LoC or Wikidata name. Inherits all fields from SourceConcept, plus other optional fields. class SourceName(SourceConcept): - date_of_birth: Optional[constr(pattern=WIKIDATA_DATE_PATTERN)] - date_of_death: Optional[constr(pattern=WIKIDATA_DATE_PATTERN)] + date_of_birth: Optional[FormattedDateString] = None + date_of_death: Optional[FormattedDateString] = None place_of_birth: Optional[str] = None diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 70764c7e63..922b17e01d 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -5,6 +5,8 @@ from .sparql_query_builder import SparqlQueryBuilder, NodeType, OntologyType import smart_open +from transformers.base_transformer import EntityType + from utils.streaming import process_stream_in_parallel import os from functools import lru_cache @@ -18,16 +20,16 @@ WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" -def extract_wikidata_id(item: dict): +def extract_wikidata_id(item: dict) -> str: + assert isinstance(item["item"]["value"], str) return item["item"]["value"].removeprefix(WIKIDATA_ID_PREFIX) class WikidataLinkedOntologySource(BaseSource): """ A source for streaming selected Wikidata nodes/edges. There are _many_ Wikidata items, so we cannot store all of - them in the graph. Instead, we only include items which reference an id from a selected linked ontology - (LoC or MeSH). For example, if a combination of "LoC" and "locations" is selected, - only Wikidata items referencing LoC geographic nodes are streamed. + them in the graph. Instead, we only include items which reference an id from a selected linked ontology, + (LoC or MeSH), as defined by the `linked_ontology` parameter. Wikidata puts strict limits on the resources which can be consumed by a single query, and queries which include filters or do other expensive processing often time out or return a stack overflow error. This means we need @@ -48,14 +50,19 @@ class WikidataLinkedOntologySource(BaseSource): 3. Stream the filtered items as usual. 
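As a rough illustration of the node-extraction steps described in the docstring above (a sketch only, not part of this commit; it assumes `src/` is on the Python path so the package imports resolve, and running it would issue live SPARQL requests):

from itertools import islice

from sources.wikidata.sparql_client import WikidataSparqlClient
from sources.wikidata.sparql_query_builder import SparqlQueryBuilder
from utils.streaming import generator_to_chunks

client = WikidataSparqlClient()

# 1. Retrieve the ids of all Wikidata items referencing a MeSH id, then deduplicate.
#    (The prefix is stripped inline here because importing linked_ontology_source
#    would require the S3_BULK_LOAD_BUCKET_NAME environment variable.)
id_items = client.run_query(SparqlQueryBuilder.get_all_ids_query("mesh"))
wikidata_ids = iter(
    {item["item"]["value"].removeprefix("http://www.wikidata.org/entity/") for item in id_items}
)

# 2. Query the required fields in chunks of a few hundred ids (limited to two chunks here).
for chunk in islice(generator_to_chunks(wikidata_ids, 400), 2):
    items = client.run_query(SparqlQueryBuilder.get_items_query(chunk, "concepts"))
    # 3. Each returned binding could now be streamed to a transformer as usual.
    print(len(items))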
""" - def __init__(self, node_type: NodeType, linked_ontology: OntologyType, entity_type): + def __init__( + self, + node_type: NodeType, + linked_ontology: OntologyType, + entity_type: EntityType, + ): self.client = WikidataSparqlClient() self.node_type = node_type self.linked_ontology = linked_ontology self.entity_type = entity_type @lru_cache - def _get_linked_ontology_ids(self, node_type: NodeType): + def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: linked_nodes_file_name = f"{self.linked_ontology}_{node_type}__nodes.csv" s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" @@ -67,7 +74,7 @@ def _get_linked_ontology_ids(self, node_type: NodeType): return ids - def _linked_id_exists_in_selected_node_type(self, linked_id: str): + def _linked_id_exists_in_selected_node_type(self, linked_id: str) -> bool: if self.linked_ontology == "mesh": return True elif self.linked_ontology == "loc": @@ -75,9 +82,7 @@ def _linked_id_exists_in_selected_node_type(self, linked_id: str): return linked_id in self._get_linked_ontology_ids(self.node_type) elif self.node_type == "names": location_ids = self._get_linked_ontology_ids("locations") - - if linked_id not in location_ids and linked_id[0] == "n": - return True + return linked_id not in location_ids and linked_id[0] == "n" else: raise ValueError(f"Invalid node type: {self.linked_ontology}") else: @@ -117,17 +122,17 @@ def get_linked_ids(ids_chunk: list[str]) -> list: if self._linked_id_exists_in_selected_node_type(mapping["linked_id"]): yield mapping - def _stream_wikidata_ids(self) -> Generator[dict]: + def _stream_wikidata_ids(self) -> Generator[str]: """Streams filtered edges using the `_stream_raw_edges` method and extracts Wikidata ids from them.""" seen = set() for item in self._stream_raw_edges(): - wikidata_id = item["wikidata_id"] + wikidata_id: str = item["wikidata_id"] if wikidata_id not in seen: seen.add(wikidata_id) yield wikidata_id def _stream_raw_nodes(self) -> Generator[dict]: - def get_linked_items(chunk) -> list: + def get_linked_items(chunk: list[str]) -> list: query = SparqlQueryBuilder.get_items_query(chunk, self.node_type) return self.client.run_query(query) @@ -145,3 +150,5 @@ def stream_raw(self) -> Generator[dict]: return self._stream_raw_nodes() elif self.entity_type == "edges": return self._stream_raw_edges() + else: + raise ValueError(f"Invalid entity type: {self.entity_type}") diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index a3a50f2b0b..6639aeedc3 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -3,7 +3,7 @@ class WikidataSparqlClient: @staticmethod - def _get_user_agent_header(): + def _get_user_agent_header() -> str: """ Return a User-Agent header value complying with Wikimedia's User-Agent policy: https://foundation.wikimedia.org/wiki/Policy:Wikimedia_Foundation_User-Agent_Policy @@ -24,4 +24,5 @@ def run_query(self, query: str) -> list[dict]: if r.status_code != 200: raise Exception(r.content) - return r.json()["results"]["bindings"] + results: list[dict] = r.json()["results"]["bindings"] + return results diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index 4a932b5746..dac920d48d 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -75,7 +75,7 @@ def get_all_ids_query(linked_ontology: OntologyType) -> str: return get_ids_query @classmethod - def get_items_query(cls, 
item_ids: list[str], node_type: NodeType): + def get_items_query(cls, item_ids: list[str], node_type: NodeType) -> str: """ Given a list of Wikidata `item_ids`, return a query to retrieve all required Wikidata fields for each id in the list. @@ -94,7 +94,9 @@ def get_items_query(cls, item_ids: list[str], node_type: NodeType): return query @classmethod - def get_linked_ids_query(cls, item_ids: list[str], linked_ontology: OntologyType): + def get_linked_ids_query( + cls, item_ids: list[str], linked_ontology: OntologyType + ) -> str: """ Given a list of Wikidata `item_ids`, return a query to retrieve all linked ontology ids referenced by each item in the list. diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index fc94d8b51a..6541e7d4d2 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -1,14 +1,14 @@ from models.graph_node import SourceConcept from models.graph_edge import SourceConceptSameAs from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource -from transformers.base_transformer import BaseTransformer +from transformers.base_transformer import BaseTransformer, EntityType from collections.abc import Generator from .raw_concept import RawWikidataConcept class WikidataConceptsTransformer(BaseTransformer): - def __init__(self, entity_type): + def __init__(self, entity_type: EntityType): self.source = WikidataLinkedOntologySource("concepts", "loc", entity_type) def transform_node(self, raw_node: dict) -> SourceConcept | None: diff --git a/src/transformers/wikidata/locations_transformer.py b/src/transformers/wikidata/locations_transformer.py index 8d343ca266..d51e91e719 100644 --- a/src/transformers/wikidata/locations_transformer.py +++ b/src/transformers/wikidata/locations_transformer.py @@ -1,12 +1,13 @@ from models.graph_node import SourceLocation from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource from .concepts_transformer import WikidataConceptsTransformer +from transformers.base_transformer import EntityType from .raw_concept import RawWikidataLocation class WikidataLocationsTransformer(WikidataConceptsTransformer): - def __init__(self, entity_type): + def __init__(self, entity_type: EntityType): self.source = WikidataLinkedOntologySource("locations", "loc", entity_type) def transform_node(self, raw_node: dict) -> SourceLocation | None: diff --git a/src/transformers/wikidata/names_transformer.py b/src/transformers/wikidata/names_transformer.py index 3d3dcdfc6a..dd78d5a183 100644 --- a/src/transformers/wikidata/names_transformer.py +++ b/src/transformers/wikidata/names_transformer.py @@ -1,12 +1,13 @@ from models.graph_node import SourceName from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource from .concepts_transformer import WikidataConceptsTransformer +from transformers.base_transformer import EntityType from .raw_concept import RawWikidataName class WikidataNamesTransformer(WikidataConceptsTransformer): - def __init__(self, entity_type): + def __init__(self, entity_type: EntityType): self.source = WikidataLinkedOntologySource("names", "loc", entity_type) def transform_node(self, raw_node: dict) -> SourceName | None: diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index 485ce4b0bc..40f3042552 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -12,6 +12,7 @@ def 
__init__(self, raw_concept: dict): def _extract_field_value(self, field_name: str) -> str: field = self.raw_concept[field_name] assert field["type"] == "literal", self.raw_concept + assert isinstance(field["value"], str) return field["value"] @@ -21,7 +22,7 @@ def _extract_optional_field_value(self, field_name: str) -> str | None: return self._extract_field_value(field_name) - def _extract_english_field_value(self, field_name: str): + def _extract_english_field_value(self, field_name: str) -> str: assert self.raw_concept[field_name]["xml:lang"] == "en" return self._extract_field_value(field_name) @@ -63,7 +64,7 @@ def source(self) -> Literal["wikidata"]: class RawWikidataLocation(RawWikidataConcept): @lru_cache - def _get_coordinates(self) -> dict: + def _get_coordinates(self) -> dict[str, float | None]: """Extracts coordinates from a raw string in the format `Point( )` (e.g. `Point(9.83 53.54)`)""" # Some items do not return valid coordinates (e.g. Q17064702, whose coordinates just say 'unknown value' on the # Wikidata website). When this happens, the 'type' of the 'coordinates' property always appears to be 'uri'. @@ -93,7 +94,7 @@ def latitude(self) -> float | None: class RawWikidataName(RawWikidataConcept): - def _extract_date(self, field_name: str): + def _extract_date(self, field_name: str) -> str | None: date_value = self._extract_optional_field_value(field_name) # When a date is unknown, sometimes Wikidata returns a URL instead of a valid date, such as diff --git a/src/utils/streaming.py b/src/utils/streaming.py index c096e8ee26..282362f14f 100644 --- a/src/utils/streaming.py +++ b/src/utils/streaming.py @@ -1,4 +1,4 @@ -from collections.abc import Generator +from collections.abc import Iterator, Generator from itertools import islice from typing import Callable, TypeVar, Any import concurrent.futures @@ -7,7 +7,7 @@ S = TypeVar("S") -def generator_to_chunks(items: Generator[Any], chunk_size: int) -> Generator[list]: +def generator_to_chunks(items: Iterator[Any], chunk_size: int) -> Generator[list]: """ Split items in a generator into chunks of size `chunk_size` and return another generator yielding the chunks one by one. 
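A minimal usage sketch for the two streaming helpers above (illustration only, not part of this commit; it assumes `src/` is on the Python path so that `utils.streaming` resolves as it does elsewhere in this codebase):

from utils.streaming import generator_to_chunks, process_stream_in_parallel

def double_chunk(chunk: list[int]) -> list[int]:
    # Stand-in for an expensive batch operation, such as one SPARQL request per chunk.
    return [n * 2 for n in chunk]

# Chunking: prints [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9]
for chunk in generator_to_chunks((n for n in range(10)), 4):
    print(chunk)

# Parallel processing: chunks of 3 items are processed on up to 2 threads, and results
# are yielded as each chunk completes, so output ordering is not guaranteed.
for doubled in process_stream_in_parallel(
    (n for n in range(10)), double_chunk, chunk_size=3, thread_count=2
):
    print(doubled)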
@@ -21,7 +21,7 @@ def generator_to_chunks(items: Generator[Any], chunk_size: int) -> Generator[lis def process_stream_in_parallel( - stream: Generator[T], + stream: Iterator[T], process: Callable[[list[T]], list[S]], chunk_size: int, thread_count: int, From 325ff5180a3e6e126bdf7d0b722d05e752c2ec76 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Fri, 17 Jan 2025 09:21:00 +0000 Subject: [PATCH 094/310] Apply auto-formatting rules --- src/models/graph_node.py | 2 +- src/sources/wikidata/linked_ontology_source.py | 13 ++++++------- src/transformers/create_transformer.py | 1 - src/transformers/wikidata/concepts_transformer.py | 5 +++-- src/transformers/wikidata/locations_transformer.py | 2 +- src/transformers/wikidata/names_transformer.py | 2 +- src/transformers/wikidata/raw_concept.py | 2 +- src/utils/streaming.py | 6 +++--- 8 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/models/graph_node.py b/src/models/graph_node.py index a6e2e33dfe..6706cc2729 100644 --- a/src/models/graph_node.py +++ b/src/models/graph_node.py @@ -1,4 +1,4 @@ -from typing import Literal, Optional, Annotated +from typing import Annotated, Literal, Optional from pydantic import BaseModel, StringConstraints diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 922b17e01d..4e773fc5fc 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -1,17 +1,16 @@ +import os from collections.abc import Generator +from functools import lru_cache -from sources.base_source import BaseSource -from .sparql_client import WikidataSparqlClient -from .sparql_query_builder import SparqlQueryBuilder, NodeType, OntologyType +import boto3 import smart_open +from sources.base_source import BaseSource from transformers.base_transformer import EntityType - from utils.streaming import process_stream_in_parallel -import os -from functools import lru_cache -import boto3 +from .sparql_client import WikidataSparqlClient +from .sparql_query_builder import NodeType, OntologyType, SparqlQueryBuilder SPARQL_ITEMS_CHUNK_SIZE = 400 MAX_PARALLEL_SPARQL_QUERIES = 3 diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py index a63a797fe3..e2d8fce031 100644 --- a/src/transformers/create_transformer.py +++ b/src/transformers/create_transformer.py @@ -10,7 +10,6 @@ from .wikidata.locations_transformer import WikidataLocationsTransformer from .wikidata.names_transformer import WikidataNamesTransformer - LOC_SUBJECT_HEADINGS_URL = ( "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" ) diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index 6541e7d4d2..866c640664 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -1,8 +1,9 @@ -from models.graph_node import SourceConcept +from collections.abc import Generator + from models.graph_edge import SourceConceptSameAs +from models.graph_node import SourceConcept from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource from transformers.base_transformer import BaseTransformer, EntityType -from collections.abc import Generator from .raw_concept import RawWikidataConcept diff --git a/src/transformers/wikidata/locations_transformer.py b/src/transformers/wikidata/locations_transformer.py index d51e91e719..ced1f74e50 100644 --- 
a/src/transformers/wikidata/locations_transformer.py +++ b/src/transformers/wikidata/locations_transformer.py @@ -1,8 +1,8 @@ from models.graph_node import SourceLocation from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource -from .concepts_transformer import WikidataConceptsTransformer from transformers.base_transformer import EntityType +from .concepts_transformer import WikidataConceptsTransformer from .raw_concept import RawWikidataLocation diff --git a/src/transformers/wikidata/names_transformer.py b/src/transformers/wikidata/names_transformer.py index dd78d5a183..32ef231a95 100644 --- a/src/transformers/wikidata/names_transformer.py +++ b/src/transformers/wikidata/names_transformer.py @@ -1,8 +1,8 @@ from models.graph_node import SourceName from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource -from .concepts_transformer import WikidataConceptsTransformer from transformers.base_transformer import EntityType +from .concepts_transformer import WikidataConceptsTransformer from .raw_concept import RawWikidataName diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index 40f3042552..d27f31f4ee 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -1,6 +1,6 @@ -from typing import Literal import re from functools import lru_cache +from typing import Literal WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" diff --git a/src/utils/streaming.py b/src/utils/streaming.py index 282362f14f..ab3a1ee084 100644 --- a/src/utils/streaming.py +++ b/src/utils/streaming.py @@ -1,7 +1,7 @@ -from collections.abc import Iterator, Generator -from itertools import islice -from typing import Callable, TypeVar, Any import concurrent.futures +from collections.abc import Generator, Iterator +from itertools import islice +from typing import Any, Callable, TypeVar T = TypeVar("T") S = TypeVar("S") From 896e2b89a9a0a086b8359a2a85d011150b4499bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 16 Jan 2025 15:52:57 +0000 Subject: [PATCH 095/310] Fix type annotations --- src/converters/cypher/base_converter.py | 4 +- src/models/graph_edge.py | 3 ++ src/models/graph_node.py | 9 +++-- .../wikidata/linked_ontology_source.py | 38 +++++++++++-------- src/sources/wikidata/sparql_client.py | 5 ++- src/sources/wikidata/sparql_query_builder.py | 6 ++- .../wikidata/concepts_transformer.py | 4 +- .../wikidata/locations_transformer.py | 3 +- .../wikidata/names_transformer.py | 3 +- src/transformers/wikidata/raw_concept.py | 7 ++-- src/utils/streaming.py | 6 +-- 11 files changed, 53 insertions(+), 35 deletions(-) diff --git a/src/converters/cypher/base_converter.py b/src/converters/cypher/base_converter.py index 3bfc6e2139..bb18a7a6ec 100644 --- a/src/converters/cypher/base_converter.py +++ b/src/converters/cypher/base_converter.py @@ -15,8 +15,8 @@ def _convert_bool(self, raw_value: bool) -> str: def _convert_none(self) -> str: return "null" - def _convert_float(self, raw_value: float) -> float: - return raw_value + def _convert_float(self, raw_value: float) -> str: + return str(raw_value) def _convert_list(self, raw_value: list[typing.Any]) -> str: # Neptune does not support lists, so we convert them to a single string with a `||` separator diff --git a/src/models/graph_edge.py b/src/models/graph_edge.py index 4d3e4c020f..2d9afd125e 100644 --- a/src/models/graph_edge.py +++ b/src/models/graph_edge.py @@ -7,6 +7,7 @@ class BaseEdge(BaseModel): 
from_id: str to_id: str relationship: str + directed: bool attributes: dict = {} @@ -14,12 +15,14 @@ class SourceConceptNarrowerThan(BaseEdge): from_type: str = "SourceConcept" to_type: str = "SourceConcept" relationship: str = "NARROWER_THAN" + directed: bool = True class SourceConceptSameAs(BaseEdge): from_type: str = "SourceConcept" to_type: str = "SourceConcept" relationship: str = "SAME_AS" + directed: bool = False class SourceConceptRelatedTo(BaseEdge): diff --git a/src/models/graph_node.py b/src/models/graph_node.py index 364fcc140c..a6e2e33dfe 100644 --- a/src/models/graph_node.py +++ b/src/models/graph_node.py @@ -1,9 +1,10 @@ -from typing import Literal, Optional +from typing import Literal, Optional, Annotated -from pydantic import BaseModel, constr +from pydantic import BaseModel, StringConstraints # Matches a Wikidata date, such as 1976-01-01T00:00:00Z or -0005-12-12T00:00:00Z WIKIDATA_DATE_PATTERN = r"-?\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ" +FormattedDateString = Annotated[str, StringConstraints(pattern=WIKIDATA_DATE_PATTERN)] # Each node must have a label and an id @@ -32,8 +33,8 @@ class SourceLocation(SourceConcept): # Represents a LoC or Wikidata name. Inherits all fields from SourceConcept, plus other optional fields. class SourceName(SourceConcept): - date_of_birth: Optional[constr(pattern=WIKIDATA_DATE_PATTERN)] - date_of_death: Optional[constr(pattern=WIKIDATA_DATE_PATTERN)] + date_of_birth: Optional[FormattedDateString] = None + date_of_death: Optional[FormattedDateString] = None place_of_birth: Optional[str] = None diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 70764c7e63..313ba2abc7 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -5,6 +5,8 @@ from .sparql_query_builder import SparqlQueryBuilder, NodeType, OntologyType import smart_open +from transformers.base_transformer import EntityType + from utils.streaming import process_stream_in_parallel import os from functools import lru_cache @@ -18,16 +20,16 @@ WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" -def extract_wikidata_id(item: dict): +def extract_wikidata_id(item: dict) -> str: + assert isinstance(item["item"]["value"], str) return item["item"]["value"].removeprefix(WIKIDATA_ID_PREFIX) class WikidataLinkedOntologySource(BaseSource): """ A source for streaming selected Wikidata nodes/edges. There are _many_ Wikidata items, so we cannot store all of - them in the graph. Instead, we only include items which reference an id from a selected linked ontology - (LoC or MeSH). For example, if a combination of "LoC" and "locations" is selected, - only Wikidata items referencing LoC geographic nodes are streamed. + them in the graph. Instead, we only include items which reference an id from a selected linked ontology, + (LoC or MeSH), as defined by the `linked_ontology` parameter. Wikidata puts strict limits on the resources which can be consumed by a single query, and queries which include filters or do other expensive processing often time out or return a stack overflow error. This means we need @@ -48,14 +50,19 @@ class WikidataLinkedOntologySource(BaseSource): 3. Stream the filtered items as usual. 
""" - def __init__(self, node_type: NodeType, linked_ontology: OntologyType, entity_type): + def __init__( + self, + node_type: NodeType, + linked_ontology: OntologyType, + entity_type: EntityType, + ): self.client = WikidataSparqlClient() self.node_type = node_type self.linked_ontology = linked_ontology self.entity_type = entity_type @lru_cache - def _get_linked_ontology_ids(self, node_type: NodeType): + def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: linked_nodes_file_name = f"{self.linked_ontology}_{node_type}__nodes.csv" s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" @@ -67,7 +74,7 @@ def _get_linked_ontology_ids(self, node_type: NodeType): return ids - def _linked_id_exists_in_selected_node_type(self, linked_id: str): + def _linked_id_exists_in_selected_node_type(self, linked_id: str) -> bool: if self.linked_ontology == "mesh": return True elif self.linked_ontology == "loc": @@ -75,9 +82,7 @@ def _linked_id_exists_in_selected_node_type(self, linked_id: str): return linked_id in self._get_linked_ontology_ids(self.node_type) elif self.node_type == "names": location_ids = self._get_linked_ontology_ids("locations") - - if linked_id not in location_ids and linked_id[0] == "n": - return True + return linked_id not in location_ids and linked_id[0] == "n" else: raise ValueError(f"Invalid node type: {self.linked_ontology}") else: @@ -112,22 +117,23 @@ def get_linked_ids(ids_chunk: list[str]) -> list: wikidata_id = extract_wikidata_id(raw_mapping) mapping = {"wikidata_id": wikidata_id, "linked_id": linked_id} - # Only yield the mapping if the linked id corresponds to the selected `node_type`. For example, if we - # want to stream Wikidata 'locations' edges but the referenced LoC id is a 'names' id, we skip it. + # Only yield the mapping if the linked id corresponds to the selected `node_type`, as determined by the + # linked ontology. For example, if we want to stream Wikidata 'names' edges, but we classify the referenced + # LoC id is a 'locations' id, we skip it. 
if self._linked_id_exists_in_selected_node_type(mapping["linked_id"]): yield mapping - def _stream_wikidata_ids(self) -> Generator[dict]: + def _stream_wikidata_ids(self) -> Generator[str]: """Streams filtered edges using the `_stream_raw_edges` method and extracts Wikidata ids from them.""" seen = set() for item in self._stream_raw_edges(): - wikidata_id = item["wikidata_id"] + wikidata_id: str = item["wikidata_id"] if wikidata_id not in seen: seen.add(wikidata_id) yield wikidata_id def _stream_raw_nodes(self) -> Generator[dict]: - def get_linked_items(chunk) -> list: + def get_linked_items(chunk: list[str]) -> list: query = SparqlQueryBuilder.get_items_query(chunk, self.node_type) return self.client.run_query(query) @@ -145,3 +151,5 @@ def stream_raw(self) -> Generator[dict]: return self._stream_raw_nodes() elif self.entity_type == "edges": return self._stream_raw_edges() + else: + raise ValueError(f"Invalid entity type: {self.entity_type}") diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index a3a50f2b0b..6639aeedc3 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -3,7 +3,7 @@ class WikidataSparqlClient: @staticmethod - def _get_user_agent_header(): + def _get_user_agent_header() -> str: """ Return a User-Agent header value complying with Wikimedia's User-Agent policy: https://foundation.wikimedia.org/wiki/Policy:Wikimedia_Foundation_User-Agent_Policy @@ -24,4 +24,5 @@ def run_query(self, query: str) -> list[dict]: if r.status_code != 200: raise Exception(r.content) - return r.json()["results"]["bindings"] + results: list[dict] = r.json()["results"]["bindings"] + return results diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index 4a932b5746..dac920d48d 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -75,7 +75,7 @@ def get_all_ids_query(linked_ontology: OntologyType) -> str: return get_ids_query @classmethod - def get_items_query(cls, item_ids: list[str], node_type: NodeType): + def get_items_query(cls, item_ids: list[str], node_type: NodeType) -> str: """ Given a list of Wikidata `item_ids`, return a query to retrieve all required Wikidata fields for each id in the list. @@ -94,7 +94,9 @@ def get_items_query(cls, item_ids: list[str], node_type: NodeType): return query @classmethod - def get_linked_ids_query(cls, item_ids: list[str], linked_ontology: OntologyType): + def get_linked_ids_query( + cls, item_ids: list[str], linked_ontology: OntologyType + ) -> str: """ Given a list of Wikidata `item_ids`, return a query to retrieve all linked ontology ids referenced by each item in the list. 
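For a concrete sense of what the builder method above produces, a small illustration (the ids are example values and the whitespace of the generated query is not significant):

from sources.wikidata.sparql_query_builder import SparqlQueryBuilder

print(SparqlQueryBuilder.get_linked_ids_query(["Q84", "Q5776"], "loc"))
# Produces a query of the form:
#   SELECT DISTINCT ?item ?linkedId
#   WHERE {
#     VALUES ?item { wd:Q84 wd:Q5776 }
#     ?item p:P244/ps:P244 ?linkedId.
#   }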
diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index fc94d8b51a..6541e7d4d2 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -1,14 +1,14 @@ from models.graph_node import SourceConcept from models.graph_edge import SourceConceptSameAs from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource -from transformers.base_transformer import BaseTransformer +from transformers.base_transformer import BaseTransformer, EntityType from collections.abc import Generator from .raw_concept import RawWikidataConcept class WikidataConceptsTransformer(BaseTransformer): - def __init__(self, entity_type): + def __init__(self, entity_type: EntityType): self.source = WikidataLinkedOntologySource("concepts", "loc", entity_type) def transform_node(self, raw_node: dict) -> SourceConcept | None: diff --git a/src/transformers/wikidata/locations_transformer.py b/src/transformers/wikidata/locations_transformer.py index 8d343ca266..d51e91e719 100644 --- a/src/transformers/wikidata/locations_transformer.py +++ b/src/transformers/wikidata/locations_transformer.py @@ -1,12 +1,13 @@ from models.graph_node import SourceLocation from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource from .concepts_transformer import WikidataConceptsTransformer +from transformers.base_transformer import EntityType from .raw_concept import RawWikidataLocation class WikidataLocationsTransformer(WikidataConceptsTransformer): - def __init__(self, entity_type): + def __init__(self, entity_type: EntityType): self.source = WikidataLinkedOntologySource("locations", "loc", entity_type) def transform_node(self, raw_node: dict) -> SourceLocation | None: diff --git a/src/transformers/wikidata/names_transformer.py b/src/transformers/wikidata/names_transformer.py index 3d3dcdfc6a..dd78d5a183 100644 --- a/src/transformers/wikidata/names_transformer.py +++ b/src/transformers/wikidata/names_transformer.py @@ -1,12 +1,13 @@ from models.graph_node import SourceName from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource from .concepts_transformer import WikidataConceptsTransformer +from transformers.base_transformer import EntityType from .raw_concept import RawWikidataName class WikidataNamesTransformer(WikidataConceptsTransformer): - def __init__(self, entity_type): + def __init__(self, entity_type: EntityType): self.source = WikidataLinkedOntologySource("names", "loc", entity_type) def transform_node(self, raw_node: dict) -> SourceName | None: diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index 485ce4b0bc..40f3042552 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -12,6 +12,7 @@ def __init__(self, raw_concept: dict): def _extract_field_value(self, field_name: str) -> str: field = self.raw_concept[field_name] assert field["type"] == "literal", self.raw_concept + assert isinstance(field["value"], str) return field["value"] @@ -21,7 +22,7 @@ def _extract_optional_field_value(self, field_name: str) -> str | None: return self._extract_field_value(field_name) - def _extract_english_field_value(self, field_name: str): + def _extract_english_field_value(self, field_name: str) -> str: assert self.raw_concept[field_name]["xml:lang"] == "en" return self._extract_field_value(field_name) @@ -63,7 +64,7 @@ def source(self) -> Literal["wikidata"]: class 
RawWikidataLocation(RawWikidataConcept): @lru_cache - def _get_coordinates(self) -> dict: + def _get_coordinates(self) -> dict[str, float | None]: """Extracts coordinates from a raw string in the format `Point( )` (e.g. `Point(9.83 53.54)`)""" # Some items do not return valid coordinates (e.g. Q17064702, whose coordinates just say 'unknown value' on the # Wikidata website). When this happens, the 'type' of the 'coordinates' property always appears to be 'uri'. @@ -93,7 +94,7 @@ def latitude(self) -> float | None: class RawWikidataName(RawWikidataConcept): - def _extract_date(self, field_name: str): + def _extract_date(self, field_name: str) -> str | None: date_value = self._extract_optional_field_value(field_name) # When a date is unknown, sometimes Wikidata returns a URL instead of a valid date, such as diff --git a/src/utils/streaming.py b/src/utils/streaming.py index c096e8ee26..282362f14f 100644 --- a/src/utils/streaming.py +++ b/src/utils/streaming.py @@ -1,4 +1,4 @@ -from collections.abc import Generator +from collections.abc import Iterator, Generator from itertools import islice from typing import Callable, TypeVar, Any import concurrent.futures @@ -7,7 +7,7 @@ S = TypeVar("S") -def generator_to_chunks(items: Generator[Any], chunk_size: int) -> Generator[list]: +def generator_to_chunks(items: Iterator[Any], chunk_size: int) -> Generator[list]: """ Split items in a generator into chunks of size `chunk_size` and return another generator yielding the chunks one by one. @@ -21,7 +21,7 @@ def generator_to_chunks(items: Generator[Any], chunk_size: int) -> Generator[lis def process_stream_in_parallel( - stream: Generator[T], + stream: Iterator[T], process: Callable[[list[T]], list[S]], chunk_size: int, thread_count: int, From f540c8d0ba59134e70f32cf8e257488d3d964749 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Fri, 17 Jan 2025 09:30:49 +0000 Subject: [PATCH 096/310] Apply auto-formatting rules --- src/sources/wikidata/linked_ontology_source.py | 1 - src/transformers/wikidata/concepts_transformer.py | 7 +++---- src/transformers/wikidata/locations_transformer.py | 2 +- src/transformers/wikidata/names_transformer.py | 2 +- src/utils/streaming.py | 7 +++---- 5 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 2210866497..a2eb5924d6 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -7,7 +7,6 @@ from sources.base_source import BaseSource from transformers.base_transformer import EntityType - from utils.streaming import process_stream_in_parallel from .sparql_client import WikidataSparqlClient diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index d54442dd51..866c640664 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -1,11 +1,10 @@ -from models.graph_node import SourceConcept +from collections.abc import Generator + from models.graph_edge import SourceConceptSameAs +from models.graph_node import SourceConcept from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource from transformers.base_transformer import BaseTransformer, EntityType -from collections.abc import Generator - - from .raw_concept import RawWikidataConcept diff --git a/src/transformers/wikidata/locations_transformer.py 
b/src/transformers/wikidata/locations_transformer.py index d51e91e719..ced1f74e50 100644 --- a/src/transformers/wikidata/locations_transformer.py +++ b/src/transformers/wikidata/locations_transformer.py @@ -1,8 +1,8 @@ from models.graph_node import SourceLocation from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource -from .concepts_transformer import WikidataConceptsTransformer from transformers.base_transformer import EntityType +from .concepts_transformer import WikidataConceptsTransformer from .raw_concept import RawWikidataLocation diff --git a/src/transformers/wikidata/names_transformer.py b/src/transformers/wikidata/names_transformer.py index dd78d5a183..32ef231a95 100644 --- a/src/transformers/wikidata/names_transformer.py +++ b/src/transformers/wikidata/names_transformer.py @@ -1,8 +1,8 @@ from models.graph_node import SourceName from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource -from .concepts_transformer import WikidataConceptsTransformer from transformers.base_transformer import EntityType +from .concepts_transformer import WikidataConceptsTransformer from .raw_concept import RawWikidataName diff --git a/src/utils/streaming.py b/src/utils/streaming.py index 3d62649f7f..ab3a1ee084 100644 --- a/src/utils/streaming.py +++ b/src/utils/streaming.py @@ -1,8 +1,7 @@ -from collections.abc import Iterator, Generator -from itertools import islice -from typing import Callable, TypeVar, Any - import concurrent.futures +from collections.abc import Generator, Iterator +from itertools import islice +from typing import Any, Callable, TypeVar T = TypeVar("T") S = TypeVar("S") From 44067e2d538f17ce46b1f5c9943677f99d4e5ec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 17 Jan 2025 09:56:05 +0000 Subject: [PATCH 097/310] Expand comments --- src/sources/wikidata/linked_ontology_source.py | 6 ++++++ src/sources/wikidata/sparql_query_builder.py | 3 --- src/utils/streaming.py | 8 +++++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 2210866497..5c2135e505 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -14,6 +14,10 @@ from .sparql_query_builder import NodeType, OntologyType, SparqlQueryBuilder SPARQL_ITEMS_CHUNK_SIZE = 400 + +# Wikidata limits the number of parallel queries from a single IP address to 5. +# See: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits +# However, experimentally, running more than 3 queries in parallel eventually results a '429 Too Many Requests' error. MAX_PARALLEL_SPARQL_QUERIES = 3 S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] @@ -34,6 +38,8 @@ class WikidataLinkedOntologySource(BaseSource): Wikidata puts strict limits on the resources which can be consumed by a single query, and queries which include filters or do other expensive processing often time out or return a stack overflow error. This means we need to use a somewhat convoluted way for extracting the Wikidata nodes/edges we need. + See https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization for more information on how + to optimise SPARQL queries. To extract nodes: 1. Run a SPARQL query which retrieves _all_ Wikidata ids referencing an id from the selected linked ontology. 
diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index dac920d48d..1e0535173e 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -3,9 +3,6 @@ NodeType = Literal["concepts", "names", "locations"] OntologyType = Literal["mesh", "loc"] -# https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization -# "https://query.wikidata.org/bigdata/namespace/wdq/sparql?explain", - class SparqlQueryBuilder: """ diff --git a/src/utils/streaming.py b/src/utils/streaming.py index 3d62649f7f..aae5c9aadf 100644 --- a/src/utils/streaming.py +++ b/src/utils/streaming.py @@ -28,9 +28,11 @@ def process_stream_in_parallel( thread_count: int, ) -> Generator[S]: """ - Consume items from `stream` in chunks of size `chunk_size`. Apply the `process` function to each chunk in a new - thread. Keep the number of parallel threads under `thread_count`. Return a single generator streaming the processed - items. + Process items from a stream in parallel using multiple threads. Return a single generator streaming + the processed items. + + Items are consumed from `stream` in chunks of size `chunk_size`. The `process` function is applied to each chunk in + a separate thread. The number of parallel threads is kept under `thread_count`. """ chunks = generator_to_chunks(stream, chunk_size) From 2bb7b406de3fb1cdab1a8c526051d9d331345b80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 17 Jan 2025 10:02:22 +0000 Subject: [PATCH 098/310] Refactoring --- src/sources/wikidata/linked_ontology_source.py | 4 ++++ src/transformers/wikidata/raw_concept.py | 9 ++------- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 94c5a1230c..869e7f7387 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -24,7 +24,11 @@ def extract_wikidata_id(item: dict) -> str: + """ + Accepts a raw `item` dictionary returned by the Wikidata SPARQL endpoint and returns the Wikidata id of the item. 
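A quick illustration of the input this helper expects and what it returns (example values only, not part of this commit; the binding shape follows the SPARQL JSON results format used throughout these sources):

item = {"item": {"type": "uri", "value": "http://www.wikidata.org/entity/Q84"}}
# extract_wikidata_id(item) would return "Q84"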
+ """ assert isinstance(item["item"]["value"], str) + assert item["item"]["type"] == "uri" return item["item"]["value"].removeprefix(WIKIDATA_ID_PREFIX) diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index d27f31f4ee..23fb338a1e 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -1,6 +1,7 @@ import re from functools import lru_cache from typing import Literal +from sources.wikidata.linked_ontology_source import extract_wikidata_id WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" @@ -26,15 +27,9 @@ def _extract_english_field_value(self, field_name: str) -> str: assert self.raw_concept[field_name]["xml:lang"] == "en" return self._extract_field_value(field_name) - @staticmethod - def _remove_id_prefix(raw_id: str) -> str: - return raw_id.removeprefix(WIKIDATA_ID_PREFIX) - @property def source_id(self) -> str: - item_field = self.raw_concept["item"] - assert item_field["type"] == "uri" - return self._remove_id_prefix(item_field["value"]) + return extract_wikidata_id(self.raw_concept) @property def label(self) -> str: From 895c4db08123f4e0804369b3c52c21fa145a2b4e Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Fri, 17 Jan 2025 10:03:25 +0000 Subject: [PATCH 099/310] Apply auto-formatting rules --- src/transformers/wikidata/raw_concept.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index 23fb338a1e..eec789f0a2 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -1,6 +1,7 @@ import re from functools import lru_cache from typing import Literal + from sources.wikidata.linked_ontology_source import extract_wikidata_id WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" From e3079c9825acebd0d66e75f318e0cb23a114ae86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 17 Jan 2025 10:02:22 +0000 Subject: [PATCH 100/310] Refactoring --- src/sources/wikidata/linked_ontology_source.py | 4 ++++ src/transformers/wikidata/raw_concept.py | 11 ++--------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 94c5a1230c..869e7f7387 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -24,7 +24,11 @@ def extract_wikidata_id(item: dict) -> str: + """ + Accepts a raw `item` dictionary returned by the Wikidata SPARQL endpoint and returns the Wikidata id of the item. 
+ """ assert isinstance(item["item"]["value"], str) + assert item["item"]["type"] == "uri" return item["item"]["value"].removeprefix(WIKIDATA_ID_PREFIX) diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index d27f31f4ee..27837723bf 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -1,8 +1,7 @@ import re from functools import lru_cache from typing import Literal - -WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" +from sources.wikidata.linked_ontology_source import extract_wikidata_id class RawWikidataConcept: @@ -26,15 +25,9 @@ def _extract_english_field_value(self, field_name: str) -> str: assert self.raw_concept[field_name]["xml:lang"] == "en" return self._extract_field_value(field_name) - @staticmethod - def _remove_id_prefix(raw_id: str) -> str: - return raw_id.removeprefix(WIKIDATA_ID_PREFIX) - @property def source_id(self) -> str: - item_field = self.raw_concept["item"] - assert item_field["type"] == "uri" - return self._remove_id_prefix(item_field["value"]) + return extract_wikidata_id(self.raw_concept) @property def label(self) -> str: From 2bd733fcf1f5676c6bf623de5981343f425794d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 17 Jan 2025 10:08:49 +0000 Subject: [PATCH 101/310] Refactoring --- src/transformers/wikidata/concepts_transformer.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index 866c640664..ccbbbc86f8 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -24,13 +24,12 @@ def transform_node(self, raw_node: dict) -> SourceConcept | None: ) def extract_edges(self, raw_edge: dict) -> Generator[SourceConceptSameAs]: + linked_id, wikidata_id = raw_edge["linked_id"], raw_edge["wikidata_id"] + edge_attributes = {"source": "wikidata"} + yield SourceConceptSameAs( - from_id=raw_edge["linked_id"], - to_id=raw_edge["wikidata_id"], - attributes={"source": "wikidata"}, + from_id=linked_id, to_id=wikidata_id, attributes=edge_attributes ) yield SourceConceptSameAs( - from_id=raw_edge["wikidata_id"], - to_id=raw_edge["linked_id"], - attributes={"source": "wikidata"}, + from_id=wikidata_id, to_id=linked_id, attributes=edge_attributes ) From dba9a5bf93d06f783d31ea0b5e579d6f720d7ce6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 17 Jan 2025 11:54:59 +0000 Subject: [PATCH 102/310] Add support for non-string fields when bulk loading --- src/converters/cypher/base_converter.py | 2 +- src/converters/cypher/bulk_load_converter.py | 29 ++++++++++++++++---- src/converters/cypher/query_converter.py | 3 ++ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/converters/cypher/base_converter.py b/src/converters/cypher/base_converter.py index bb18a7a6ec..9d563b2428 100644 --- a/src/converters/cypher/base_converter.py +++ b/src/converters/cypher/base_converter.py @@ -13,7 +13,7 @@ def _convert_bool(self, raw_value: bool) -> str: return str(raw_value).lower() def _convert_none(self) -> str: - return "null" + return "" def _convert_float(self, raw_value: float) -> str: return str(raw_value) diff --git a/src/converters/cypher/bulk_load_converter.py b/src/converters/cypher/bulk_load_converter.py index b8780fd0e4..dc525be81a 100644 --- a/src/converters/cypher/bulk_load_converter.py +++ 
b/src/converters/cypher/bulk_load_converter.py @@ -1,7 +1,8 @@ from typing import Literal, cast from models.graph_edge import BaseEdge -from models.graph_node import BaseNode +from models.graph_node import BaseNode, SourceLocation, SourceName +from pydantic import BaseModel from .base_converter import CypherBaseConverter @@ -10,12 +11,29 @@ class CypherBulkLoadConverter(CypherBaseConverter): def __init__(self, entity_type: Literal["nodes", "edges"]): self.entity_type = entity_type + def _get_bulk_loader_column_header(self, model: BaseModel, field_name: str) -> str: + """ + Return a Neptune bulk loader column header, defining the name and type of the column. See here for more info: + https://docs.aws.amazon.com/neptune/latest/userguide/bulk-load-tutorial-format-opencypher.html#bulk-load-tutorial-format-opencypher-data-types + """ + # Most fields are stored as strings + field_type = "String" + if isinstance(model, SourceLocation): + if field_name in {"longitude", "latitude"}: + field_type = "Float" + if isinstance(model, SourceName): + if field_name in {"date_of_birth", "date_of_death"}: + field_type = "DateTime" + + return f"{field_name}:{field_type}" + def _node_to_bulk_cypher(self, model: BaseNode) -> dict: bulk_node = {":ID": model.id, ":LABEL": type(model).__name__} - for key, raw_value in model.dict().items(): + for field_name, raw_value in model.dict().items(): + column_header = self._get_bulk_loader_column_header(model, field_name) value = self._raw_value_to_cypher_value(raw_value) - bulk_node[key] = value + bulk_node[column_header] = value return bulk_node @@ -27,9 +45,10 @@ def _edge_to_bulk_cypher(self, model: BaseEdge) -> dict: ":TYPE": model.relationship, } - for key, raw_value in model.attributes.items(): + for field_name, raw_value in model.attributes.items(): + column_header = self._get_bulk_loader_column_header(model, field_name) value = self._raw_value_to_cypher_value(raw_value) - bulk_edge[key] = value + bulk_edge[column_header] = value return bulk_edge diff --git a/src/converters/cypher/query_converter.py b/src/converters/cypher/query_converter.py index 417f87c262..90aaab55a0 100644 --- a/src/converters/cypher/query_converter.py +++ b/src/converters/cypher/query_converter.py @@ -16,6 +16,9 @@ def _convert_str(self, raw_value: str) -> str: escaped = raw_value.replace("'", "\\'") return f"'{escaped}'" + def _convert_none(self) -> str: + return "null" + def _node_to_cypher_map(self, model: BaseNode) -> str: properties = [] From 0e7b1f882101373de51adb5ebfe6b712cb8352a8 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Fri, 17 Jan 2025 11:56:01 +0000 Subject: [PATCH 103/310] Apply auto-formatting rules --- src/converters/cypher/bulk_load_converter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/converters/cypher/bulk_load_converter.py b/src/converters/cypher/bulk_load_converter.py index dc525be81a..bc9223ed8d 100644 --- a/src/converters/cypher/bulk_load_converter.py +++ b/src/converters/cypher/bulk_load_converter.py @@ -1,8 +1,9 @@ from typing import Literal, cast +from pydantic import BaseModel + from models.graph_edge import BaseEdge from models.graph_node import BaseNode, SourceLocation, SourceName -from pydantic import BaseModel from .base_converter import CypherBaseConverter From 2864cc980c39a534952cbe90b7e43d8bf586700d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 17 Jan 2025 16:13:09 +0000 Subject: [PATCH 104/310] Refactoring --- src/bulk_loader.py | 2 +- 
src/clients/base_neptune_client.py | 9 +- .../wikidata/linked_ontology_id_extractor.py | 63 ++++++++ .../wikidata/linked_ontology_source.py | 136 +++++++----------- src/sources/wikidata/sparql_client.py | 33 ++++- 5 files changed, 158 insertions(+), 85 deletions(-) create mode 100644 src/sources/wikidata/linked_ontology_id_extractor.py diff --git a/src/bulk_loader.py b/src/bulk_loader.py index c8b67db4d5..e032fea535 100644 --- a/src/bulk_loader.py +++ b/src/bulk_loader.py @@ -47,7 +47,7 @@ def local_handler() -> None: ) args = parser.parse_args() - handler(**args.__dict__, is_local=True) + print(handler(**args.__dict__, is_local=True)) if __name__ == "__main__": diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 8839653bdb..5603b48500 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -143,7 +143,14 @@ def get_bulk_load_status(self, load_id: str) -> str: for error_log in error_logs: code = error_log["errorCode"] message = error_log["errorMessage"] - print(f" {code}: {message}") + record_num = error_log["recordNum"] + print(f" {code}: {message}. (Row number: {record_num})") + + failed_feeds = payload.get("failedFeeds") + if failed_feeds: + print(" Failed feed statuses:") + for failed_feed in failed_feeds: + print(f" {failed_feed['status']}") return status diff --git a/src/sources/wikidata/linked_ontology_id_extractor.py b/src/sources/wikidata/linked_ontology_id_extractor.py new file mode 100644 index 0000000000..67cd929982 --- /dev/null +++ b/src/sources/wikidata/linked_ontology_id_extractor.py @@ -0,0 +1,63 @@ +import os +from functools import lru_cache + +import boto3 +import smart_open + + +from .sparql_query_builder import NodeType, OntologyType + + +S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] + + +class LinkedOntologyIdTypeChecker: + """ + A class for checking whether ids from a given linked ontology (LoC or MeSH) are classified under + a selected node type (concepts, locations, or names). + """ + + def __init__(self, node_type: NodeType, linked_ontology: OntologyType): + self.node_type = node_type + self.linked_ontology = linked_ontology + + # MeSH only has concepts and locations, so make sure we don't attempt to extract names. + if node_type == "names": + assert ( + linked_ontology != "mesh" + ), "Invalid node_type for ontology type MeSH." + + @lru_cache + def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: + """Return all ids classified under a given `node_type` for the selected ontology.""" + # Retrieve the bulk load file outputted by the relevant transformer so that we can extract ids from it. + linked_nodes_file_name = f"{self.linked_ontology}_{node_type}__nodes.csv" + s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" + + print(f"Retrieving {linked_nodes_file_name} from S3.") + + ids = set() + transport_params = {"client": boto3.client("s3")} + with smart_open.open(s3_url, "r", transport_params=transport_params) as f: + # Loop through all items in the file and extract the id from each item + for line in f: + ids.add(line.split(",")[0]) + + print( + f"Retrieved {len(ids)} ids of type '{node_type}' from ontology '{self.linked_ontology}'." 
+ ) + + return ids + + def id_included_in_selected_type(self, linked_id: str) -> bool: + """Return `True` if a given linked ontology id is classified under the selected node type (concepts, + locations, or names).""" + + # To check whether a Library of Congress id is classified under 'names', we could examine all the 'names' ids, + # but the corresponding file is large and it would take too long. Instead, it's better to check that the + # LoC id starts with an 'n' and that it is not classified under 'locations'. + if self.linked_ontology == "loc" and self.node_type == "names": + location_ids = self._get_linked_ontology_ids("locations") + return linked_id not in location_ids and linked_id[0] == "n" + + return linked_id in self._get_linked_ontology_ids(self.node_type) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 869e7f7387..cd5dd09dde 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -1,25 +1,17 @@ -import os from collections.abc import Generator -from functools import lru_cache -import boto3 -import smart_open from sources.base_source import BaseSource from transformers.base_transformer import EntityType from utils.streaming import process_stream_in_parallel -from .sparql_client import WikidataSparqlClient +from .sparql_client import WikidataSparqlClient, MAX_PARALLEL_SPARQL_QUERIES from .sparql_query_builder import NodeType, OntologyType, SparqlQueryBuilder -SPARQL_ITEMS_CHUNK_SIZE = 400 +from .linked_ontology_id_extractor import LinkedOntologyIdTypeChecker -# Wikidata limits the number of parallel queries from a single IP address to 5. -# See: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits -# However, experimentally, running more than 3 queries in parallel eventually results a '429 Too Many Requests' error. -MAX_PARALLEL_SPARQL_QUERIES = 3 +SPARQL_ITEMS_CHUNK_SIZE = 400 -S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" @@ -43,20 +35,6 @@ class WikidataLinkedOntologySource(BaseSource): to use a somewhat convoluted way for extracting the Wikidata nodes/edges we need. See https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization for more information on how to optimise SPARQL queries. - - To extract nodes: - 1. Run a SPARQL query which retrieves _all_ Wikidata ids referencing an id from the selected linked ontology. - (WikidataEdgesSource is utilised to run the query.) - 2. Split the returned ids into chunks and run a SPARQL query for each chunk. The query retrieves all the node - properties we are interested in for each id in the chunk. - 3. Stream the returned items as usual. - - To extract edges (via the `WikidataEdgesSource` class): - 1. Run a SPARQL query which retrieves _all_ Wikidata items referencing an id from the selected linked ontology, - and returns mappings between Wikidata ids and ids from the linked ontology. - 2. Filter the returned id pairs to only include Wikidata ids corresponding to the selected node type - (i.e. concepts, locations, or names). - 3. Stream the filtered items as usual. 
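The 'names' shortcut in LinkedOntologyIdTypeChecker above avoids loading the very large LoC names file: an id is treated as a name if it starts with 'n' and is not already classified as a location. A standalone sketch of that rule, with made-up ids purely for illustration:

    # Hypothetical ids; only the rule itself mirrors id_included_in_selected_type.
    location_ids = {"n79007751"}  # pretend this came from the locations bulk-load file

    def looks_like_loc_name(linked_id: str) -> bool:
        return linked_id[0] == "n" and linked_id not in location_ids

    assert looks_like_loc_name("n50000001")       # name-style id, not a known location
    assert not looks_like_loc_name("n79007751")   # excluded: classified as a location
    assert not looks_like_loc_name("sh85057243")  # subject-heading id, never a name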
""" def __init__( @@ -69,56 +47,57 @@ def __init__( self.node_type = node_type self.linked_ontology = linked_ontology self.entity_type = entity_type + self.id_type_checker = LinkedOntologyIdTypeChecker(node_type, linked_ontology) - @lru_cache - def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: - linked_nodes_file_name = f"{self.linked_ontology}_{node_type}__nodes.csv" - s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" - - ids = set() - transport_params = {"client": boto3.client("s3")} - with smart_open.open(s3_url, "r", transport_params=transport_params) as f: - for line in f: - ids.add(line.split(",")[0]) - - return ids - - def _linked_id_exists_in_selected_node_type(self, linked_id: str) -> bool: - if self.linked_ontology == "mesh": - return True - elif self.linked_ontology == "loc": - if self.node_type in ["concepts", "locations"]: - return linked_id in self._get_linked_ontology_ids(self.node_type) - elif self.node_type == "names": - location_ids = self._get_linked_ontology_ids("locations") - return linked_id not in location_ids and linked_id[0] == "n" - else: - raise ValueError(f"Invalid node type: {self.linked_ontology}") - else: - raise ValueError(f"Invalid linked ontology {self.linked_ontology}") - - def _stream_raw_edges(self) -> Generator[dict]: - # First, get the ids of _all_ Wikidata items which reference an id from the selected linked ontology + def _get_all_ids(self) -> Generator[str]: + """Return all Wikidata ids corresponding to Wikidata items referencing the selected linked ontology.""" + print(f"Retrieving Wikidata ids linked to {self.linked_ontology} items.") ids_query = SparqlQueryBuilder.get_all_ids_query(self.linked_ontology) id_items = self.client.run_query(ids_query) # Deduplicate. (We could deduplicate as part of the SPARQL query via the 'DISTINCT' keyword, # but that would make the query significantly slower. It's faster to deduplicate here.) - all_ids = iter(set(extract_wikidata_id(item) for item in id_items)) - - def get_linked_ids(ids_chunk: list[str]) -> list: - query = SparqlQueryBuilder.get_linked_ids_query( - ids_chunk, self.linked_ontology - ) - return self.client.run_query(query) - - # Split ids into chunks. For each chunk, run a separate SPARQL query to retrieve a mapping between Wikidata ids - # and ids from the linked ontology. (We could run a SPARQL query to get _all_ mappings at once, but this query - # is not reliable - sometimes it times out or returns invalid JSON. Getting the mappings in chunks is much - # slower, but it works every time.) + all_ids = set(extract_wikidata_id(item) for item in id_items) + + print(f"Retrieved a total of {len(all_ids)} Wikidata ids.") + yield from all_ids + + def _get_linked_id_mappings(self, wikidata_ids: list[str]) -> list[dict]: + query = SparqlQueryBuilder.get_linked_ids_query( + wikidata_ids, self.linked_ontology + ) + return self.client.run_query(query) + + def _get_linked_items(self, wikidata_ids: list[str]) -> list: + query = SparqlQueryBuilder.get_items_query(wikidata_ids, self.node_type) + return self.client.run_query(query) + + def _stream_wikidata_ids(self) -> Generator[str]: + """Streams filtered edges using the `_stream_raw_edges` method and extracts Wikidata ids from them.""" + seen = set() + for item in self._stream_raw_edges(): + wikidata_id: str = item["wikidata_id"] + if wikidata_id not in seen: + seen.add(wikidata_id) + yield wikidata_id + + def _stream_raw_edges(self) -> Generator[dict]: + """ + Extract edges via the following steps: + 1. 
Run a SPARQL query which retrieves _all_ Wikidata items referencing an id from the linked ontology. + 2. Split the returned ids into chunks. For each chunk, run a second SPARQL query to retrieve a mapping + between Wikidata ids and ids from the linked ontology. (It is possible to modify the query in step 1 to + return all the mappings at once, but this makes the query unreliable - sometimes it times out or returns + invalid JSON. Getting the mappings in chunks is much slower, but it works every time.) + 3. Filter the returned id pairs to only include Wikidata ids corresponding to the selected node type + (i.e. concepts, locations, or names). + """ + all_ids = self._get_all_ids() + + # Parallelise the second query to retrieve the mappings faster. for raw_mapping in process_stream_in_parallel( all_ids, - get_linked_ids, + self._get_linked_id_mappings, SPARQL_ITEMS_CHUNK_SIZE, MAX_PARALLEL_SPARQL_QUERIES, ): @@ -129,28 +108,21 @@ def get_linked_ids(ids_chunk: list[str]) -> list: # Only yield the mapping if the linked id corresponds to the selected `node_type`, as determined by the # linked ontology. For example, if we want to stream Wikidata 'names' edges, but we classify the referenced # LoC id is a 'locations' id, we skip it. - if self._linked_id_exists_in_selected_node_type(mapping["linked_id"]): + if self.id_type_checker.id_included_in_selected_type(mapping["linked_id"]): yield mapping - def _stream_wikidata_ids(self) -> Generator[str]: - """Streams filtered edges using the `_stream_raw_edges` method and extracts Wikidata ids from them.""" - seen = set() - for item in self._stream_raw_edges(): - wikidata_id: str = item["wikidata_id"] - if wikidata_id not in seen: - seen.add(wikidata_id) - yield wikidata_id - def _stream_raw_nodes(self) -> Generator[dict]: - def get_linked_items(chunk: list[str]) -> list: - query = SparqlQueryBuilder.get_items_query(chunk, self.node_type) - return self.client.run_query(query) - + """ + Extract nodes via the following steps: + 1. Stream edges via the `_stream_raw_edges` method and extract Wikidata ids from the streamed edges. + 2. Split the extracted ids into chunks. For each chunk, run a SPARQL query to retrieve all the corresponding + Wikidata fields required to create a node. + """ all_ids = self._stream_wikidata_ids() yield from process_stream_in_parallel( all_ids, - get_linked_items, + self._get_linked_items, SPARQL_ITEMS_CHUNK_SIZE, MAX_PARALLEL_SPARQL_QUERIES, ) diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index 6639aeedc3..b47a62965d 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -1,7 +1,21 @@ import requests +import time + +# Wikidata limits the number of parallel queries from a single IP address to 5. +# See: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits +# However, experimentally, running more than 4 queries in parallel consistently results '429 Too Many Requests' errors. +MAX_PARALLEL_SPARQL_QUERIES = 4 class WikidataSparqlClient: + """ + A client class for querying Wikidata via SPARQL queries. Automatically throttles requests so that we do not exceed + Wikidata rate limits. 
+ """ + + parallel_query_count = 0 + too_many_requests = False + @staticmethod def _get_user_agent_header() -> str: """ @@ -15,13 +29,30 @@ def _get_user_agent_header() -> str: def run_query(self, query: str) -> list[dict]: """Runs a query against Wikidata's SPARQL endpoint and returns the results as a list""" + + # Make sure we don't exceed the rate limit. + while ( + self.parallel_query_count >= MAX_PARALLEL_SPARQL_QUERIES + or self.too_many_requests + ): + time.sleep(2) + + self.parallel_query_count += 1 r = requests.get( "https://query.wikidata.org/sparql", params={"format": "json", "query": query}, headers={"User-Agent": self._get_user_agent_header()}, ) + self.parallel_query_count -= 1 + + if r.status_code == 429: + self.too_many_requests = True + retry_after = int(r.headers["Retry-After"]) + time.sleep(max(60, retry_after)) + self.too_many_requests = False - if r.status_code != 200: + return self.run_query(query) + elif r.status_code != 200: raise Exception(r.content) results: list[dict] = r.json()["results"]["bindings"] From 29f6be6e29b7eccd4072897a9c72edf02fbf9ac3 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Fri, 17 Jan 2025 16:14:14 +0000 Subject: [PATCH 105/310] Apply auto-formatting rules --- src/sources/wikidata/linked_ontology_id_extractor.py | 2 -- src/sources/wikidata/linked_ontology_source.py | 6 ++---- src/sources/wikidata/sparql_client.py | 3 ++- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_id_extractor.py b/src/sources/wikidata/linked_ontology_id_extractor.py index 67cd929982..fc0ec23914 100644 --- a/src/sources/wikidata/linked_ontology_id_extractor.py +++ b/src/sources/wikidata/linked_ontology_id_extractor.py @@ -4,10 +4,8 @@ import boto3 import smart_open - from .sparql_query_builder import NodeType, OntologyType - S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index cd5dd09dde..e7ad53a639 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -1,14 +1,12 @@ from collections.abc import Generator - from sources.base_source import BaseSource from transformers.base_transformer import EntityType from utils.streaming import process_stream_in_parallel -from .sparql_client import WikidataSparqlClient, MAX_PARALLEL_SPARQL_QUERIES -from .sparql_query_builder import NodeType, OntologyType, SparqlQueryBuilder - from .linked_ontology_id_extractor import LinkedOntologyIdTypeChecker +from .sparql_client import MAX_PARALLEL_SPARQL_QUERIES, WikidataSparqlClient +from .sparql_query_builder import NodeType, OntologyType, SparqlQueryBuilder SPARQL_ITEMS_CHUNK_SIZE = 400 diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index b47a62965d..cbcf6a71e3 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -1,6 +1,7 @@ -import requests import time +import requests + # Wikidata limits the number of parallel queries from a single IP address to 5. # See: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits # However, experimentally, running more than 4 queries in parallel consistently results '429 Too Many Requests' errors. 
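process_stream_in_parallel is imported from utils.streaming in the source above, but its body does not appear in these diffs. A minimal sketch of how such a helper could work, chunking a stream with islice and fanning the chunks out to a bounded thread pool, is given below; the real implementation may well differ:

    # Illustrative only: chunk a stream and process chunks on a bounded thread pool.
    # Signature guessed from the call sites above; utils/streaming.py may differ.
    import concurrent.futures
    from collections.abc import Callable, Generator, Iterable, Iterator
    from itertools import islice

    def process_stream_in_parallel(
        stream: Iterable[str],
        process_chunk: Callable[[list[str]], list],
        chunk_size: int,
        max_workers: int,
    ) -> Generator[dict]:
        iterator: Iterator[str] = iter(stream)
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = []
            while chunk := list(islice(iterator, chunk_size)):
                futures.append(pool.submit(process_chunk, chunk))
            for future in concurrent.futures.as_completed(futures):
                yield from future.result()

With SPARQL_ITEMS_CHUNK_SIZE = 400 and the worker count capped at MAX_PARALLEL_SPARQL_QUERIES, this keeps each SPARQL query small enough to be reliable while bounding how many run at once.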
From 5eddc59f3fd7db4bf9562ab38581edcf762c2789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 17 Jan 2025 16:13:09 +0000 Subject: [PATCH 106/310] Refactoring --- src/bulk_loader.py | 2 +- src/clients/base_neptune_client.py | 9 +- .../wikidata/linked_ontology_id_extractor.py | 63 ++++++++ .../wikidata/linked_ontology_source.py | 136 +++++++----------- src/sources/wikidata/sparql_client.py | 33 ++++- src/transformers/wikidata/raw_concept.py | 6 + 6 files changed, 164 insertions(+), 85 deletions(-) create mode 100644 src/sources/wikidata/linked_ontology_id_extractor.py diff --git a/src/bulk_loader.py b/src/bulk_loader.py index c8b67db4d5..e032fea535 100644 --- a/src/bulk_loader.py +++ b/src/bulk_loader.py @@ -47,7 +47,7 @@ def local_handler() -> None: ) args = parser.parse_args() - handler(**args.__dict__, is_local=True) + print(handler(**args.__dict__, is_local=True)) if __name__ == "__main__": diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 8839653bdb..5603b48500 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -143,7 +143,14 @@ def get_bulk_load_status(self, load_id: str) -> str: for error_log in error_logs: code = error_log["errorCode"] message = error_log["errorMessage"] - print(f" {code}: {message}") + record_num = error_log["recordNum"] + print(f" {code}: {message}. (Row number: {record_num})") + + failed_feeds = payload.get("failedFeeds") + if failed_feeds: + print(" Failed feed statuses:") + for failed_feed in failed_feeds: + print(f" {failed_feed['status']}") return status diff --git a/src/sources/wikidata/linked_ontology_id_extractor.py b/src/sources/wikidata/linked_ontology_id_extractor.py new file mode 100644 index 0000000000..67cd929982 --- /dev/null +++ b/src/sources/wikidata/linked_ontology_id_extractor.py @@ -0,0 +1,63 @@ +import os +from functools import lru_cache + +import boto3 +import smart_open + + +from .sparql_query_builder import NodeType, OntologyType + + +S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] + + +class LinkedOntologyIdTypeChecker: + """ + A class for checking whether ids from a given linked ontology (LoC or MeSH) are classified under + a selected node type (concepts, locations, or names). + """ + + def __init__(self, node_type: NodeType, linked_ontology: OntologyType): + self.node_type = node_type + self.linked_ontology = linked_ontology + + # MeSH only has concepts and locations, so make sure we don't attempt to extract names. + if node_type == "names": + assert ( + linked_ontology != "mesh" + ), "Invalid node_type for ontology type MeSH." + + @lru_cache + def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: + """Return all ids classified under a given `node_type` for the selected ontology.""" + # Retrieve the bulk load file outputted by the relevant transformer so that we can extract ids from it. + linked_nodes_file_name = f"{self.linked_ontology}_{node_type}__nodes.csv" + s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" + + print(f"Retrieving {linked_nodes_file_name} from S3.") + + ids = set() + transport_params = {"client": boto3.client("s3")} + with smart_open.open(s3_url, "r", transport_params=transport_params) as f: + # Loop through all items in the file and extract the id from each item + for line in f: + ids.add(line.split(",")[0]) + + print( + f"Retrieved {len(ids)} ids of type '{node_type}' from ontology '{self.linked_ontology}'." 
+ ) + + return ids + + def id_included_in_selected_type(self, linked_id: str) -> bool: + """Return `True` if a given linked ontology id is classified under the selected node type (concepts, + locations, or names).""" + + # To check whether a Library of Congress id is classified under 'names', we could examine all the 'names' ids, + # but the corresponding file is large and it would take too long. Instead, it's better to check that the + # LoC id starts with an 'n' and that it is not classified under 'locations'. + if self.linked_ontology == "loc" and self.node_type == "names": + location_ids = self._get_linked_ontology_ids("locations") + return linked_id not in location_ids and linked_id[0] == "n" + + return linked_id in self._get_linked_ontology_ids(self.node_type) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 869e7f7387..cd5dd09dde 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -1,25 +1,17 @@ -import os from collections.abc import Generator -from functools import lru_cache -import boto3 -import smart_open from sources.base_source import BaseSource from transformers.base_transformer import EntityType from utils.streaming import process_stream_in_parallel -from .sparql_client import WikidataSparqlClient +from .sparql_client import WikidataSparqlClient, MAX_PARALLEL_SPARQL_QUERIES from .sparql_query_builder import NodeType, OntologyType, SparqlQueryBuilder -SPARQL_ITEMS_CHUNK_SIZE = 400 +from .linked_ontology_id_extractor import LinkedOntologyIdTypeChecker -# Wikidata limits the number of parallel queries from a single IP address to 5. -# See: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits -# However, experimentally, running more than 3 queries in parallel eventually results a '429 Too Many Requests' error. -MAX_PARALLEL_SPARQL_QUERIES = 3 +SPARQL_ITEMS_CHUNK_SIZE = 400 -S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" @@ -43,20 +35,6 @@ class WikidataLinkedOntologySource(BaseSource): to use a somewhat convoluted way for extracting the Wikidata nodes/edges we need. See https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization for more information on how to optimise SPARQL queries. - - To extract nodes: - 1. Run a SPARQL query which retrieves _all_ Wikidata ids referencing an id from the selected linked ontology. - (WikidataEdgesSource is utilised to run the query.) - 2. Split the returned ids into chunks and run a SPARQL query for each chunk. The query retrieves all the node - properties we are interested in for each id in the chunk. - 3. Stream the returned items as usual. - - To extract edges (via the `WikidataEdgesSource` class): - 1. Run a SPARQL query which retrieves _all_ Wikidata items referencing an id from the selected linked ontology, - and returns mappings between Wikidata ids and ids from the linked ontology. - 2. Filter the returned id pairs to only include Wikidata ids corresponding to the selected node type - (i.e. concepts, locations, or names). - 3. Stream the filtered items as usual. 
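A side note on the lru_cache decorator used on _get_linked_ontology_ids above: because it memoises per (instance, node_type) pair, each bulk-load id file is fetched from S3 at most once per checker instance. A tiny, self-contained illustration of that behaviour (not the real S3 code):

    # Illustrative only: lru_cache on a method caches per (self, argument) pair.
    from functools import lru_cache

    class CachedLoader:
        @lru_cache
        def load(self, node_type: str) -> set[str]:
            print(f"expensive load for {node_type}")  # stands in for the S3 read above
            return {f"{node_type}-id"}

    loader = CachedLoader()
    loader.load("locations")  # prints: expensive load for locations
    loader.load("locations")  # cache hit, nothing is loaded again
    loader.load("concepts")   # prints: expensive load for concepts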
""" def __init__( @@ -69,56 +47,57 @@ def __init__( self.node_type = node_type self.linked_ontology = linked_ontology self.entity_type = entity_type + self.id_type_checker = LinkedOntologyIdTypeChecker(node_type, linked_ontology) - @lru_cache - def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: - linked_nodes_file_name = f"{self.linked_ontology}_{node_type}__nodes.csv" - s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" - - ids = set() - transport_params = {"client": boto3.client("s3")} - with smart_open.open(s3_url, "r", transport_params=transport_params) as f: - for line in f: - ids.add(line.split(",")[0]) - - return ids - - def _linked_id_exists_in_selected_node_type(self, linked_id: str) -> bool: - if self.linked_ontology == "mesh": - return True - elif self.linked_ontology == "loc": - if self.node_type in ["concepts", "locations"]: - return linked_id in self._get_linked_ontology_ids(self.node_type) - elif self.node_type == "names": - location_ids = self._get_linked_ontology_ids("locations") - return linked_id not in location_ids and linked_id[0] == "n" - else: - raise ValueError(f"Invalid node type: {self.linked_ontology}") - else: - raise ValueError(f"Invalid linked ontology {self.linked_ontology}") - - def _stream_raw_edges(self) -> Generator[dict]: - # First, get the ids of _all_ Wikidata items which reference an id from the selected linked ontology + def _get_all_ids(self) -> Generator[str]: + """Return all Wikidata ids corresponding to Wikidata items referencing the selected linked ontology.""" + print(f"Retrieving Wikidata ids linked to {self.linked_ontology} items.") ids_query = SparqlQueryBuilder.get_all_ids_query(self.linked_ontology) id_items = self.client.run_query(ids_query) # Deduplicate. (We could deduplicate as part of the SPARQL query via the 'DISTINCT' keyword, # but that would make the query significantly slower. It's faster to deduplicate here.) - all_ids = iter(set(extract_wikidata_id(item) for item in id_items)) - - def get_linked_ids(ids_chunk: list[str]) -> list: - query = SparqlQueryBuilder.get_linked_ids_query( - ids_chunk, self.linked_ontology - ) - return self.client.run_query(query) - - # Split ids into chunks. For each chunk, run a separate SPARQL query to retrieve a mapping between Wikidata ids - # and ids from the linked ontology. (We could run a SPARQL query to get _all_ mappings at once, but this query - # is not reliable - sometimes it times out or returns invalid JSON. Getting the mappings in chunks is much - # slower, but it works every time.) + all_ids = set(extract_wikidata_id(item) for item in id_items) + + print(f"Retrieved a total of {len(all_ids)} Wikidata ids.") + yield from all_ids + + def _get_linked_id_mappings(self, wikidata_ids: list[str]) -> list[dict]: + query = SparqlQueryBuilder.get_linked_ids_query( + wikidata_ids, self.linked_ontology + ) + return self.client.run_query(query) + + def _get_linked_items(self, wikidata_ids: list[str]) -> list: + query = SparqlQueryBuilder.get_items_query(wikidata_ids, self.node_type) + return self.client.run_query(query) + + def _stream_wikidata_ids(self) -> Generator[str]: + """Streams filtered edges using the `_stream_raw_edges` method and extracts Wikidata ids from them.""" + seen = set() + for item in self._stream_raw_edges(): + wikidata_id: str = item["wikidata_id"] + if wikidata_id not in seen: + seen.add(wikidata_id) + yield wikidata_id + + def _stream_raw_edges(self) -> Generator[dict]: + """ + Extract edges via the following steps: + 1. 
Run a SPARQL query which retrieves _all_ Wikidata items referencing an id from the linked ontology. + 2. Split the returned ids into chunks. For each chunk, run a second SPARQL query to retrieve a mapping + between Wikidata ids and ids from the linked ontology. (It is possible to modify the query in step 1 to + return all the mappings at once, but this makes the query unreliable - sometimes it times out or returns + invalid JSON. Getting the mappings in chunks is much slower, but it works every time.) + 3. Filter the returned id pairs to only include Wikidata ids corresponding to the selected node type + (i.e. concepts, locations, or names). + """ + all_ids = self._get_all_ids() + + # Parallelise the second query to retrieve the mappings faster. for raw_mapping in process_stream_in_parallel( all_ids, - get_linked_ids, + self._get_linked_id_mappings, SPARQL_ITEMS_CHUNK_SIZE, MAX_PARALLEL_SPARQL_QUERIES, ): @@ -129,28 +108,21 @@ def get_linked_ids(ids_chunk: list[str]) -> list: # Only yield the mapping if the linked id corresponds to the selected `node_type`, as determined by the # linked ontology. For example, if we want to stream Wikidata 'names' edges, but we classify the referenced # LoC id is a 'locations' id, we skip it. - if self._linked_id_exists_in_selected_node_type(mapping["linked_id"]): + if self.id_type_checker.id_included_in_selected_type(mapping["linked_id"]): yield mapping - def _stream_wikidata_ids(self) -> Generator[str]: - """Streams filtered edges using the `_stream_raw_edges` method and extracts Wikidata ids from them.""" - seen = set() - for item in self._stream_raw_edges(): - wikidata_id: str = item["wikidata_id"] - if wikidata_id not in seen: - seen.add(wikidata_id) - yield wikidata_id - def _stream_raw_nodes(self) -> Generator[dict]: - def get_linked_items(chunk: list[str]) -> list: - query = SparqlQueryBuilder.get_items_query(chunk, self.node_type) - return self.client.run_query(query) - + """ + Extract nodes via the following steps: + 1. Stream edges via the `_stream_raw_edges` method and extract Wikidata ids from the streamed edges. + 2. Split the extracted ids into chunks. For each chunk, run a SPARQL query to retrieve all the corresponding + Wikidata fields required to create a node. + """ all_ids = self._stream_wikidata_ids() yield from process_stream_in_parallel( all_ids, - get_linked_items, + self._get_linked_items, SPARQL_ITEMS_CHUNK_SIZE, MAX_PARALLEL_SPARQL_QUERIES, ) diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index 6639aeedc3..b47a62965d 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -1,7 +1,21 @@ import requests +import time + +# Wikidata limits the number of parallel queries from a single IP address to 5. +# See: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits +# However, experimentally, running more than 4 queries in parallel consistently results '429 Too Many Requests' errors. +MAX_PARALLEL_SPARQL_QUERIES = 4 class WikidataSparqlClient: + """ + A client class for querying Wikidata via SPARQL queries. Automatically throttles requests so that we do not exceed + Wikidata rate limits. 
+ """ + + parallel_query_count = 0 + too_many_requests = False + @staticmethod def _get_user_agent_header() -> str: """ @@ -15,13 +29,30 @@ def _get_user_agent_header() -> str: def run_query(self, query: str) -> list[dict]: """Runs a query against Wikidata's SPARQL endpoint and returns the results as a list""" + + # Make sure we don't exceed the rate limit. + while ( + self.parallel_query_count >= MAX_PARALLEL_SPARQL_QUERIES + or self.too_many_requests + ): + time.sleep(2) + + self.parallel_query_count += 1 r = requests.get( "https://query.wikidata.org/sparql", params={"format": "json", "query": query}, headers={"User-Agent": self._get_user_agent_header()}, ) + self.parallel_query_count -= 1 + + if r.status_code == 429: + self.too_many_requests = True + retry_after = int(r.headers["Retry-After"]) + time.sleep(max(60, retry_after)) + self.too_many_requests = False - if r.status_code != 200: + return self.run_query(query) + elif r.status_code != 200: raise Exception(r.content) results: list[dict] = r.json()["results"]["bindings"] diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index fbfc0863af..7402aa9690 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -96,6 +96,12 @@ def _extract_date(self, field_name: str) -> str | None: if date_value is None or date_value.startswith("http"): return None + # There is currently one case where a Wikidata item stores an invalid date (Wikidata id: Q10904907). The date + # is correctly formatted but references year 0, which does not exist. Neptune would throw an error if we tried + # to load it in, so we filter it out. + if date_value == "+0000-00-00T00:00:00Z": + return None + return date_value @property From de364c883c0d0ca7f5068b9b9df4b84e8cb23059 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Fri, 17 Jan 2025 17:18:03 +0000 Subject: [PATCH 107/310] Apply auto-formatting rules --- src/sources/wikidata/linked_ontology_id_type_checker.py | 2 -- src/sources/wikidata/linked_ontology_source.py | 3 +-- src/sources/wikidata/sparql_client.py | 3 ++- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index 53f0c752d0..8cce4e134a 100644 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -4,10 +4,8 @@ import boto3 import smart_open - from .sparql_query_builder import NodeType, OntologyType - S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index ce637fef94..62b306c326 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -4,11 +4,10 @@ from transformers.base_transformer import EntityType from utils.streaming import process_stream_in_parallel +from .linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker from .sparql_client import MAX_PARALLEL_SPARQL_QUERIES, WikidataSparqlClient from .sparql_query_builder import NodeType, OntologyType, SparqlQueryBuilder -from .linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker - SPARQL_ITEMS_CHUNK_SIZE = 400 WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index b47a62965d..cbcf6a71e3 100644 --- 
a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -1,6 +1,7 @@ -import requests import time +import requests + # Wikidata limits the number of parallel queries from a single IP address to 5. # See: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits # However, experimentally, running more than 4 queries in parallel consistently results '429 Too Many Requests' errors. From ffbda51846b2d6f0873791166162642cf18e4dab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 20 Jan 2025 09:10:53 +0000 Subject: [PATCH 108/310] Add Wikidata concepts linking to MeSH --- src/transformers/base_transformer.py | 5 +++- src/transformers/create_transformer.py | 24 ++++++++++++------- .../wikidata/concepts_transformer.py | 11 ++++++--- .../wikidata/locations_transformer.py | 11 ++++++--- .../wikidata/names_transformer.py | 9 ++++--- 5 files changed, 41 insertions(+), 19 deletions(-) diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index dc289234c3..e575729446 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -1,6 +1,5 @@ import concurrent.futures import csv -import xml.etree.ElementTree as ET from collections.abc import Generator from itertools import islice from typing import Any, Literal @@ -56,6 +55,8 @@ def _stream_nodes(self, number: int | None = None) -> Generator[BaseNode]: if counter == number: return + print(f"Streamed all {counter} nodes...") + def _stream_edges(self, number: int | None = None) -> Generator[BaseEdge]: """ Extracts edges from the specified source and transforms them. The `source` must define a `stream_raw` method. @@ -75,6 +76,8 @@ def _stream_edges(self, number: int | None = None) -> Generator[BaseEdge]: if counter == number: return + print(f"Streamed all {counter} edges...") + def _stream_entities( self, entity_type: EntityType, sample_size: int | None = None ) -> Generator[BaseNode | BaseEdge]: diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py index e2d8fce031..9fc3315d2c 100644 --- a/src/transformers/create_transformer.py +++ b/src/transformers/create_transformer.py @@ -22,9 +22,11 @@ "loc_locations", "mesh_concepts", "mesh_locations", - "wikidata_concepts", - "wikidata_locations", - "wikidata_names", + "wikidata_linked_loc_concepts", + "wikidata_linked_loc_locations", + "wikidata_linked_loc_names", + "wikidata_linked_mesh_concepts", + "wikidata_linked_mesh_locations", ] @@ -43,11 +45,15 @@ def create_transformer( return MeSHConceptsTransformer(MESH_URL) if transformer_type == "mesh_locations": return MeSHLocationsTransformer(MESH_URL) - if transformer_type == "wikidata_concepts": - return WikidataConceptsTransformer(entity_type) - if transformer_type == "wikidata_locations": - return WikidataLocationsTransformer(entity_type) - if transformer_type == "wikidata_names": - return WikidataNamesTransformer(entity_type) + if transformer_type == "wikidata_linked_loc_concepts": + return WikidataConceptsTransformer(entity_type, "loc") + if transformer_type == "wikidata_linked_loc_locations": + return WikidataLocationsTransformer(entity_type, "loc") + if transformer_type == "wikidata_linked_loc_names": + return WikidataNamesTransformer(entity_type, "loc") + if transformer_type == "wikidata_linked_mesh_concepts": + return WikidataConceptsTransformer(entity_type, "mesh") + if transformer_type == "wikidata_linked_mesh_locations": + return 
WikidataLocationsTransformer(entity_type, "mesh") raise ValueError(f"Unknown transformer type: {transformer_type}") diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index ccbbbc86f8..e751637bdb 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -2,15 +2,20 @@ from models.graph_edge import SourceConceptSameAs from models.graph_node import SourceConcept -from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from sources.wikidata.linked_ontology_source import ( + WikidataLinkedOntologySource, + OntologyType, +) from transformers.base_transformer import BaseTransformer, EntityType from .raw_concept import RawWikidataConcept class WikidataConceptsTransformer(BaseTransformer): - def __init__(self, entity_type: EntityType): - self.source = WikidataLinkedOntologySource("concepts", "loc", entity_type) + def __init__(self, entity_type: EntityType, linked_ontology: OntologyType): + self.source = WikidataLinkedOntologySource( + "concepts", linked_ontology, entity_type + ) def transform_node(self, raw_node: dict) -> SourceConcept | None: raw_concept = RawWikidataConcept(raw_node) diff --git a/src/transformers/wikidata/locations_transformer.py b/src/transformers/wikidata/locations_transformer.py index ced1f74e50..44c212ed6e 100644 --- a/src/transformers/wikidata/locations_transformer.py +++ b/src/transformers/wikidata/locations_transformer.py @@ -1,5 +1,8 @@ from models.graph_node import SourceLocation -from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from sources.wikidata.linked_ontology_source import ( + WikidataLinkedOntologySource, + OntologyType, +) from transformers.base_transformer import EntityType from .concepts_transformer import WikidataConceptsTransformer @@ -7,8 +10,10 @@ class WikidataLocationsTransformer(WikidataConceptsTransformer): - def __init__(self, entity_type: EntityType): - self.source = WikidataLinkedOntologySource("locations", "loc", entity_type) + def __init__(self, entity_type: EntityType, ontology_type: OntologyType): + self.source = WikidataLinkedOntologySource( + "locations", ontology_type, entity_type + ) def transform_node(self, raw_node: dict) -> SourceLocation | None: raw_concept = RawWikidataLocation(raw_node) diff --git a/src/transformers/wikidata/names_transformer.py b/src/transformers/wikidata/names_transformer.py index 32ef231a95..2e14d9ed46 100644 --- a/src/transformers/wikidata/names_transformer.py +++ b/src/transformers/wikidata/names_transformer.py @@ -1,5 +1,8 @@ from models.graph_node import SourceName -from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from sources.wikidata.linked_ontology_source import ( + WikidataLinkedOntologySource, + OntologyType, +) from transformers.base_transformer import EntityType from .concepts_transformer import WikidataConceptsTransformer @@ -7,8 +10,8 @@ class WikidataNamesTransformer(WikidataConceptsTransformer): - def __init__(self, entity_type: EntityType): - self.source = WikidataLinkedOntologySource("names", "loc", entity_type) + def __init__(self, entity_type: EntityType, ontology_type: OntologyType): + self.source = WikidataLinkedOntologySource("names", ontology_type, entity_type) def transform_node(self, raw_node: dict) -> SourceName | None: raw_concept = RawWikidataName(raw_node) From 974eb841d76f2eeccdc0f4d61137e5cdbd91a43f Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection 
Date: Mon, 20 Jan 2025 09:11:57 +0000 Subject: [PATCH 109/310] Apply auto-formatting rules --- src/transformers/wikidata/concepts_transformer.py | 2 +- src/transformers/wikidata/locations_transformer.py | 2 +- src/transformers/wikidata/names_transformer.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index e751637bdb..8a83b6bcf2 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -3,8 +3,8 @@ from models.graph_edge import SourceConceptSameAs from models.graph_node import SourceConcept from sources.wikidata.linked_ontology_source import ( - WikidataLinkedOntologySource, OntologyType, + WikidataLinkedOntologySource, ) from transformers.base_transformer import BaseTransformer, EntityType diff --git a/src/transformers/wikidata/locations_transformer.py b/src/transformers/wikidata/locations_transformer.py index 44c212ed6e..8b7be0e636 100644 --- a/src/transformers/wikidata/locations_transformer.py +++ b/src/transformers/wikidata/locations_transformer.py @@ -1,7 +1,7 @@ from models.graph_node import SourceLocation from sources.wikidata.linked_ontology_source import ( - WikidataLinkedOntologySource, OntologyType, + WikidataLinkedOntologySource, ) from transformers.base_transformer import EntityType diff --git a/src/transformers/wikidata/names_transformer.py b/src/transformers/wikidata/names_transformer.py index 2e14d9ed46..5c8b8318cb 100644 --- a/src/transformers/wikidata/names_transformer.py +++ b/src/transformers/wikidata/names_transformer.py @@ -1,7 +1,7 @@ from models.graph_node import SourceName from sources.wikidata.linked_ontology_source import ( - WikidataLinkedOntologySource, OntologyType, + WikidataLinkedOntologySource, ) from transformers.base_transformer import EntityType From aa9685892f81a6d1e061beb7ac1f22ab5d641f02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 20 Jan 2025 09:10:53 +0000 Subject: [PATCH 110/310] Add Wikidata concepts linking to MeSH --- src/sources/wikidata/sparql_client.py | 2 +- src/transformers/base_transformer.py | 5 +++- src/transformers/create_transformer.py | 24 ++++++++++++------- .../wikidata/concepts_transformer.py | 11 ++++++--- .../wikidata/locations_transformer.py | 11 ++++++--- .../wikidata/names_transformer.py | 9 ++++--- 6 files changed, 42 insertions(+), 20 deletions(-) diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index cbcf6a71e3..885d7a018c 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -4,7 +4,7 @@ # Wikidata limits the number of parallel queries from a single IP address to 5. # See: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits -# However, experimentally, running more than 4 queries in parallel consistently results '429 Too Many Requests' errors. +# However, experimentally, running more than 4 queries in parallel consistently results in '429 Too Many Requests' errors. 
MAX_PARALLEL_SPARQL_QUERIES = 4 diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index dc289234c3..e575729446 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -1,6 +1,5 @@ import concurrent.futures import csv -import xml.etree.ElementTree as ET from collections.abc import Generator from itertools import islice from typing import Any, Literal @@ -56,6 +55,8 @@ def _stream_nodes(self, number: int | None = None) -> Generator[BaseNode]: if counter == number: return + print(f"Streamed all {counter} nodes...") + def _stream_edges(self, number: int | None = None) -> Generator[BaseEdge]: """ Extracts edges from the specified source and transforms them. The `source` must define a `stream_raw` method. @@ -75,6 +76,8 @@ def _stream_edges(self, number: int | None = None) -> Generator[BaseEdge]: if counter == number: return + print(f"Streamed all {counter} edges...") + def _stream_entities( self, entity_type: EntityType, sample_size: int | None = None ) -> Generator[BaseNode | BaseEdge]: diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py index e2d8fce031..9fc3315d2c 100644 --- a/src/transformers/create_transformer.py +++ b/src/transformers/create_transformer.py @@ -22,9 +22,11 @@ "loc_locations", "mesh_concepts", "mesh_locations", - "wikidata_concepts", - "wikidata_locations", - "wikidata_names", + "wikidata_linked_loc_concepts", + "wikidata_linked_loc_locations", + "wikidata_linked_loc_names", + "wikidata_linked_mesh_concepts", + "wikidata_linked_mesh_locations", ] @@ -43,11 +45,15 @@ def create_transformer( return MeSHConceptsTransformer(MESH_URL) if transformer_type == "mesh_locations": return MeSHLocationsTransformer(MESH_URL) - if transformer_type == "wikidata_concepts": - return WikidataConceptsTransformer(entity_type) - if transformer_type == "wikidata_locations": - return WikidataLocationsTransformer(entity_type) - if transformer_type == "wikidata_names": - return WikidataNamesTransformer(entity_type) + if transformer_type == "wikidata_linked_loc_concepts": + return WikidataConceptsTransformer(entity_type, "loc") + if transformer_type == "wikidata_linked_loc_locations": + return WikidataLocationsTransformer(entity_type, "loc") + if transformer_type == "wikidata_linked_loc_names": + return WikidataNamesTransformer(entity_type, "loc") + if transformer_type == "wikidata_linked_mesh_concepts": + return WikidataConceptsTransformer(entity_type, "mesh") + if transformer_type == "wikidata_linked_mesh_locations": + return WikidataLocationsTransformer(entity_type, "mesh") raise ValueError(f"Unknown transformer type: {transformer_type}") diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index ccbbbc86f8..e751637bdb 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -2,15 +2,20 @@ from models.graph_edge import SourceConceptSameAs from models.graph_node import SourceConcept -from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from sources.wikidata.linked_ontology_source import ( + WikidataLinkedOntologySource, + OntologyType, +) from transformers.base_transformer import BaseTransformer, EntityType from .raw_concept import RawWikidataConcept class WikidataConceptsTransformer(BaseTransformer): - def __init__(self, entity_type: EntityType): - self.source = WikidataLinkedOntologySource("concepts", "loc", entity_type) + 
def __init__(self, entity_type: EntityType, linked_ontology: OntologyType): + self.source = WikidataLinkedOntologySource( + "concepts", linked_ontology, entity_type + ) def transform_node(self, raw_node: dict) -> SourceConcept | None: raw_concept = RawWikidataConcept(raw_node) diff --git a/src/transformers/wikidata/locations_transformer.py b/src/transformers/wikidata/locations_transformer.py index ced1f74e50..44c212ed6e 100644 --- a/src/transformers/wikidata/locations_transformer.py +++ b/src/transformers/wikidata/locations_transformer.py @@ -1,5 +1,8 @@ from models.graph_node import SourceLocation -from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from sources.wikidata.linked_ontology_source import ( + WikidataLinkedOntologySource, + OntologyType, +) from transformers.base_transformer import EntityType from .concepts_transformer import WikidataConceptsTransformer @@ -7,8 +10,10 @@ class WikidataLocationsTransformer(WikidataConceptsTransformer): - def __init__(self, entity_type: EntityType): - self.source = WikidataLinkedOntologySource("locations", "loc", entity_type) + def __init__(self, entity_type: EntityType, ontology_type: OntologyType): + self.source = WikidataLinkedOntologySource( + "locations", ontology_type, entity_type + ) def transform_node(self, raw_node: dict) -> SourceLocation | None: raw_concept = RawWikidataLocation(raw_node) diff --git a/src/transformers/wikidata/names_transformer.py b/src/transformers/wikidata/names_transformer.py index 32ef231a95..2e14d9ed46 100644 --- a/src/transformers/wikidata/names_transformer.py +++ b/src/transformers/wikidata/names_transformer.py @@ -1,5 +1,8 @@ from models.graph_node import SourceName -from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from sources.wikidata.linked_ontology_source import ( + WikidataLinkedOntologySource, + OntologyType, +) from transformers.base_transformer import EntityType from .concepts_transformer import WikidataConceptsTransformer @@ -7,8 +10,8 @@ class WikidataNamesTransformer(WikidataConceptsTransformer): - def __init__(self, entity_type: EntityType): - self.source = WikidataLinkedOntologySource("names", "loc", entity_type) + def __init__(self, entity_type: EntityType, ontology_type: OntologyType): + self.source = WikidataLinkedOntologySource("names", ontology_type, entity_type) def transform_node(self, raw_node: dict) -> SourceName | None: raw_concept = RawWikidataName(raw_node) From 3272c2156d3ce43e4ed7d668673b9cfa2421ff70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 20 Jan 2025 12:30:22 +0000 Subject: [PATCH 111/310] Make WikidataSparqlClient thread-safe --- src/sources/wikidata/sparql_client.py | 45 ++++++++++++++++----------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index 885d7a018c..49cf46fbb4 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -1,4 +1,5 @@ import time +import threading import requests @@ -10,12 +11,14 @@ class WikidataSparqlClient: """ - A client class for querying Wikidata via SPARQL queries. Automatically throttles requests so that we do not exceed - Wikidata rate limits. + A client class for querying Wikidata via SPARQL queries. Automatically throttles requests (in a thread-safe way) + so that we do not exceed Wikidata rate limits. 
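The thread-safety rework above combines a semaphore, which caps how many requests are in flight, with a lock-guarded flag that pauses every thread while one of them waits out a 429 back-off. A compressed sketch of the same idea, here using threading.Event in place of the patch's boolean-plus-Lock (illustrative only, not the client's exact code):

    # Illustrative pattern only: bounded concurrency plus a shared back-off signal.
    import threading
    import time

    semaphore = threading.Semaphore(4)   # mirrors MAX_PARALLEL_SPARQL_QUERIES
    backing_off = threading.Event()      # set while any thread is waiting out a 429

    def fetch_with_throttle(do_request):
        while backing_off.is_set():
            time.sleep(2)
        with semaphore:
            response = do_request()
        if response.status_code == 429:
            backing_off.set()
            time.sleep(max(60, int(response.headers.get("Retry-After", 60))))
            backing_off.clear()
            return fetch_with_throttle(do_request)
        return response

An Event gives the same pause-everyone semantics with slightly less code, at the cost of diverging from the explicit Lock used in the patch.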
""" - parallel_query_count = 0 - too_many_requests = False + def __init__(self): + self.parallel_query_semaphore = threading.Semaphore(MAX_PARALLEL_SPARQL_QUERIES) + self.too_many_requests = False + self.too_many_requests_lock = threading.Lock() @staticmethod def _get_user_agent_header() -> str: @@ -31,26 +34,32 @@ def _get_user_agent_header() -> str: def run_query(self, query: str) -> list[dict]: """Runs a query against Wikidata's SPARQL endpoint and returns the results as a list""" - # Make sure we don't exceed the rate limit. - while ( - self.parallel_query_count >= MAX_PARALLEL_SPARQL_QUERIES - or self.too_many_requests - ): + while True: + with self.too_many_requests_lock: + if not self.too_many_requests: + break time.sleep(2) - self.parallel_query_count += 1 - r = requests.get( - "https://query.wikidata.org/sparql", - params={"format": "json", "query": query}, - headers={"User-Agent": self._get_user_agent_header()}, - ) - self.parallel_query_count -= 1 + # Use a semaphore to throttle the number of parallel requests + with self.parallel_query_semaphore: + r = requests.get( + "https://query.wikidata.org/sparql", + params={"format": "json", "query": query}, + headers={"User-Agent": self._get_user_agent_header()}, + ) + # Even though we limit the number of requests, we might still occasionally get a 429 error. + # When this happens, set the `too_many_requests` flag to prevent other threads from making new requests + # and sleep for at least a minute. if r.status_code == 429: - self.too_many_requests = True + with self.too_many_requests_lock: + self.too_many_requests = True + retry_after = int(r.headers["Retry-After"]) time.sleep(max(60, retry_after)) - self.too_many_requests = False + + with self.too_many_requests_lock: + self.too_many_requests = False return self.run_query(query) elif r.status_code != 200: From a79eba63630d13d1ebe2bd76ddf16d10d04e4936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 20 Jan 2025 12:30:50 +0000 Subject: [PATCH 112/310] Update Neptune to the latest version --- terraform/neptune.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/terraform/neptune.tf b/terraform/neptune.tf index 3bbf17a920..62a6749f9c 100644 --- a/terraform/neptune.tf +++ b/terraform/neptune.tf @@ -1,8 +1,8 @@ resource "aws_neptune_cluster" "catalogue_graph_cluster" { cluster_identifier = "catalogue-graph" engine = "neptune" - engine_version = "1.3.2.1" - neptune_cluster_parameter_group_name = "default.neptune1.3" + engine_version = "1.4.2.0" + neptune_cluster_parameter_group_name = "default.neptune1.4" iam_database_authentication_enabled = true apply_immediately = true storage_encrypted = true @@ -57,7 +57,7 @@ resource "aws_iam_role_policy" "s3_read_only_policy_attachment" { resource "aws_neptune_cluster_instance" "catalogue_graph_instance" { cluster_identifier = aws_neptune_cluster.catalogue_graph_cluster.cluster_identifier instance_class = "db.serverless" - neptune_parameter_group_name = "default.neptune1.3" + neptune_parameter_group_name = "default.neptune1.4" } resource "aws_db_subnet_group" "neptune_subnet_group" { From 1712e7fdea7bd360008de74616033028fe700b7e Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Mon, 20 Jan 2025 12:28:51 +0000 Subject: [PATCH 113/310] Script setup and build process --- .github/workflows/autoformat-typecheck.yml | 22 ++++------- .github/workflows/build.yml | 22 +++++++++++ .python-version | 1 + README.md | 5 ++- create_zip.sh | 12 ------ scripts/autoformat.sh | 13 +++++++ scripts/build.sh | 26 
+++++++++++++ scripts/ci-setup.sh | 17 ++++++++ scripts/setup.sh | 45 ++++++++++++++++++++++ scripts/typecheck.sh | 12 ++++++ src/dev_requirements.in | 4 ++ src/dev_requirements.txt | 34 ++++++++++++++++ 12 files changed, 186 insertions(+), 27 deletions(-) create mode 100644 .github/workflows/build.yml create mode 100644 .python-version delete mode 100644 create_zip.sh create mode 100755 scripts/autoformat.sh create mode 100755 scripts/build.sh create mode 100755 scripts/ci-setup.sh create mode 100755 scripts/setup.sh create mode 100755 scripts/typecheck.sh create mode 100644 src/dev_requirements.in create mode 100644 src/dev_requirements.txt diff --git a/.github/workflows/autoformat-typecheck.yml b/.github/workflows/autoformat-typecheck.yml index 23a95afc22..f297804002 100644 --- a/.github/workflows/autoformat-typecheck.yml +++ b/.github/workflows/autoformat-typecheck.yml @@ -17,14 +17,13 @@ jobs: with: python-version: '3.13' - - name: Install dependencies + - name: Setup CI run: | - pip install mypy - pip install -r src/requirements.txt + ./scripts/ci-setup.sh - - name: Run mypy + - name: Run typecheck run: | - mypy --config-file src/mypy.ini src/ + ./scripts/typecheck.sh autoformat: runs-on: ubuntu-latest @@ -36,18 +35,13 @@ jobs: with: python-version: '3.13' - - name: Install dependencies + - name: Setup CI run: | - pip install black isort + ./scripts/ci-setup.sh - - name: Run Black + - name: Run autoformat run: | - black src/ - - - name: Run isort - run: | - # Set the profile to "black" to prevent conflicts between black and isort - isort --profile=black src/ + ./scripts/autoformat.sh - name: Check for formatting changes id: check_formatting_changes diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000000..358076b736 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,22 @@ +# Build deployment artifact +name: "Build deployment zip" + +on: push + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v5 + with: + python-version: '3.13' + + - name: Setup CI + run: | + ./scripts/ci-setup.sh + + - name: Build zip + run: | + ./scripts/build.sh \ No newline at end of file diff --git a/.python-version b/.python-version new file mode 100644 index 0000000000..3a4f41ef34 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.13 \ No newline at end of file diff --git a/README.md b/README.md index e4af8792bd..ed3cbf9d18 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,10 @@ Subdirectories contain various modules and are shared by all Lambda functions. yields a single Pydantic model). The BaseTransformer class implements a `stream_to_` method for each supported destination. +## Setting up the project + +Run `./scripts/setup.sh` to install the project dependencies. + ## Deployment The pipeline does not currently have an automated deployment process in place. 
To deploy a new version of the source @@ -102,7 +106,6 @@ cd terraform terraform apply ``` - ## Local execution To run one of the Lambda functions locally, navigate to the `src` directory and then run the chosen function via the diff --git a/create_zip.sh b/create_zip.sh deleted file mode 100644 index fe0019a3de..0000000000 --- a/create_zip.sh +++ /dev/null @@ -1,12 +0,0 @@ -# Create a temporary directory to hold source code and packages -mkdir -p temp - -cp -r src/* temp/ -pip3 install -r src/requirements.txt --platform manylinux2014_x86_64 --target temp/ --only-binary=:all: --python-version 3.13 - -cd temp -zip -r ../build.zip . -cd .. - -# Clean up the temporary build directory -rm -rf temp diff --git a/scripts/autoformat.sh b/scripts/autoformat.sh new file mode 100755 index 0000000000..bfaf2b4886 --- /dev/null +++ b/scripts/autoformat.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +ROOT+="$(dirname "$DIR")" + +# change working directory to the root of the project +cd "$ROOT" + +black src/ +isort --profile=black src/ \ No newline at end of file diff --git a/scripts/build.sh b/scripts/build.sh new file mode 100755 index 0000000000..74c02ae4e0 --- /dev/null +++ b/scripts/build.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +ROOT+="$(dirname "$DIR")" + +# change working directory to the root of the project +cd "$ROOT" + +# Create a temporary directory to hold source code and packages +mkdir -p target/tmp + +# get python version from .python-version +PY_VERSION=$(cat .python-version) + +cp -r src/* target/tmp +pip install -r src/requirements.txt --platform manylinux2014_x86_64 --target target/tmp --only-binary=:all: --python-version $PY_VERSION + +cd target/tmp +zip -r ../build.zip . +cd ../.. + +# Clean up the temporary build directory +rm -rf target/tmp diff --git a/scripts/ci-setup.sh b/scripts/ci-setup.sh new file mode 100755 index 0000000000..dba7e432c3 --- /dev/null +++ b/scripts/ci-setup.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +ROOT+="$(dirname "$DIR")" + +# check output of python version matches .python-version +if ! python -V | grep -q "$(cat $ROOT/.python-version)"; then + echo "Python version does not match .python-version" + exit 1 +fi + +# install the requirements +pip install -r "$ROOT/src/requirements.txt" +pip install -r "$ROOT/src/dev_requirements.txt" \ No newline at end of file diff --git a/scripts/setup.sh b/scripts/setup.sh new file mode 100755 index 0000000000..23249247f9 --- /dev/null +++ b/scripts/setup.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +ROOT+="$(dirname "$DIR")" + +echo "Setting up the project ..." + +# check if pyenv is installed, otherwise exit and echo install from message +if ! command -v pyenv &> /dev/null; then + echo "pyenv is not installed. Please install it using \"brew install pyenv\"" + exit 1 +fi + +# get the version specified using pyenv local +PY_VERSION=$(pyenv local) +# check if the version is installed, otherwise install it +if ! pyenv versions | grep -q "$PY_VERSION"; then + echo "Installing Python version $PY_VERSION ..." + pyenv install "$PY_VERSION" +fi + +# set the shell to use the version specified +pyenv shell "$PY_VERSION" + +# check if venv exists otherwise create it +if [ ! 
-d "$ROOT/venv" ]; then + echo "Creating virtual environment ..." + python -m venv "$ROOT/venv" +fi + +# activate the virtual environment +source "$ROOT/venv/bin/activate" + +# install the requirements +pip install -r "$ROOT/src/requirements.txt" +pip install -r "$ROOT/src/dev_requirements.txt" + +# install pip-tools +pip install pip-tools + +# echo the setup is complete, and to run "source ./venv/bin/activate" +echo "Setup complete, please run \"source ./venv/bin/activate\" to activate the virtual environment." diff --git a/scripts/typecheck.sh b/scripts/typecheck.sh new file mode 100755 index 0000000000..ae9dddca7a --- /dev/null +++ b/scripts/typecheck.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +ROOT+="$(dirname "$DIR")" + +# change working directory to the root of the project +cd "$ROOT" + +mypy --config-file src/mypy.ini src/ \ No newline at end of file diff --git a/src/dev_requirements.in b/src/dev_requirements.in new file mode 100644 index 0000000000..5427fe912c --- /dev/null +++ b/src/dev_requirements.in @@ -0,0 +1,4 @@ +pytest +black +isort +mypy \ No newline at end of file diff --git a/src/dev_requirements.txt b/src/dev_requirements.txt new file mode 100644 index 0000000000..9867a5587c --- /dev/null +++ b/src/dev_requirements.txt @@ -0,0 +1,34 @@ +# +# This file is autogenerated by pip-compile with Python 3.13 +# by the following command: +# +# pip-compile dev_requirements.in +# +black==24.10.0 + # via -r dev_requirements.in +click==8.1.8 + # via black +iniconfig==2.0.0 + # via pytest +isort==5.13.2 + # via -r dev_requirements.in +mypy==1.14.1 + # via -r dev_requirements.in +mypy-extensions==1.0.0 + # via + # black + # mypy +packaging==24.2 + # via + # black + # pytest +pathspec==0.12.1 + # via black +platformdirs==4.3.6 + # via black +pluggy==1.5.0 + # via pytest +pytest==8.3.4 + # via -r dev_requirements.in +typing-extensions==4.12.2 + # via mypy From 490573ab289761477f5d7be0f8729f63f402b832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 20 Jan 2025 12:59:50 +0000 Subject: [PATCH 114/310] Remove sample size from 'extractors' state machine --- terraform/state_machine_extractors.tf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/terraform/state_machine_extractors.tf b/terraform/state_machine_extractors.tf index e8aee4c50e..e86b989864 100644 --- a/terraform/state_machine_extractors.tf +++ b/terraform/state_machine_extractors.tf @@ -18,8 +18,7 @@ resource "aws_sfn_state_machine" "catalogue_graph_extractors" { Parameters = { "transformer_type" = task_input.transformer_type, "entity_type" = task_input.entity_type, - "stream_destination" = "s3", - "sample_size" = 1000 # Only stream a small sample while testing + "stream_destination" = "s3" } End = true } From a9ee917b92e6b91bf6f733b5b5b788343bd5d7b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 20 Jan 2025 13:11:45 +0000 Subject: [PATCH 115/310] Fix edge id bug --- src/converters/cypher/bulk_load_converter.py | 3 ++- src/sources/wikidata/sparql_client.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/converters/cypher/bulk_load_converter.py b/src/converters/cypher/bulk_load_converter.py index bc9223ed8d..59ea03fa8c 100644 --- a/src/converters/cypher/bulk_load_converter.py +++ b/src/converters/cypher/bulk_load_converter.py @@ -40,7 +40,8 @@ def _node_to_bulk_cypher(self, model: BaseNode) -> dict: def 
_edge_to_bulk_cypher(self, model: BaseEdge) -> dict: bulk_edge = { - ":ID": f"{model.from_id}-->{model.to_id}", + # We need to give the edge a unique ID so that the Neptune bulk loader recognises duplicates + ":ID": f"{model.relationship}:{model.from_id}-->{model.to_id}", ":START_ID": model.from_id, ":END_ID": model.to_id, ":TYPE": model.relationship, diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index 49cf46fbb4..d89407d942 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -15,7 +15,7 @@ class WikidataSparqlClient: so that we do not exceed Wikidata rate limits. """ - def __init__(self): + def __init__(self) -> None: self.parallel_query_semaphore = threading.Semaphore(MAX_PARALLEL_SPARQL_QUERIES) self.too_many_requests = False self.too_many_requests_lock = threading.Lock() From c6bd9f9c59054bc8259495d586f258f292ac1425 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Mon, 20 Jan 2025 13:21:59 +0000 Subject: [PATCH 116/310] Apply auto-formatting rules --- src/sources/wikidata/sparql_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index d89407d942..ffbde5c1ce 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -1,5 +1,5 @@ -import time import threading +import time import requests From 01bd94cfef1f6b997d52874ec4660df7993930af Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Tue, 21 Jan 2025 14:42:39 +0000 Subject: [PATCH 117/310] Set env var rather than use pyenv shell MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Štěpán Brychta --- scripts/setup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/setup.sh b/scripts/setup.sh index 23249247f9..6157a56b38 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -22,8 +22,8 @@ if ! pyenv versions | grep -q "$PY_VERSION"; then pyenv install "$PY_VERSION" fi -# set the shell to use the version specified -pyenv shell "$PY_VERSION" +# specify the python version to use for pyenv +export PYENV_VERSION="$PY_VERSION" # check if venv exists otherwise create it if [ ! 
-d "$ROOT/venv" ]; then From 608f496e3d127133e5cb3853bdd0bc9ec91b6d85 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 22 Jan 2025 15:19:16 +0000 Subject: [PATCH 118/310] Add first test and run in CI --- .github/workflows/test.yml | 22 ++++++++++++++++++++++ scripts/test.sh | 9 +++++++++ src/dev_requirements.in | 6 ++++-- src/dev_requirements.txt | 8 ++++++++ src/utils/test_xml.py | 17 +++++++++++++++++ 5 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/test.yml create mode 100755 scripts/test.sh create mode 100644 src/utils/test_xml.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000000..d913f3975f --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,22 @@ +# Build deployment artifact +name: "Run tests" + +on: push + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v5 + with: + python-version: '3.13' + + - name: Setup CI + run: | + ./scripts/ci-setup.sh + + - name: Setup CI + run: | + ./scripts/test.sh \ No newline at end of file diff --git a/scripts/test.sh b/scripts/test.sh new file mode 100755 index 0000000000..ca4867521a --- /dev/null +++ b/scripts/test.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +ROOT+="$(dirname "$DIR")" + +pytest --cov src ./src --cov-report term \ No newline at end of file diff --git a/src/dev_requirements.in b/src/dev_requirements.in index 5427fe912c..c14ceff04b 100644 --- a/src/dev_requirements.in +++ b/src/dev_requirements.in @@ -1,4 +1,6 @@ -pytest black +coverage isort -mypy \ No newline at end of file +mypy +pytest +pytest-cov diff --git a/src/dev_requirements.txt b/src/dev_requirements.txt index 9867a5587c..a6513dc561 100644 --- a/src/dev_requirements.txt +++ b/src/dev_requirements.txt @@ -8,6 +8,10 @@ black==24.10.0 # via -r dev_requirements.in click==8.1.8 # via black +coverage[toml]==7.6.10 + # via + # -r dev_requirements.in + # pytest-cov iniconfig==2.0.0 # via pytest isort==5.13.2 @@ -29,6 +33,10 @@ platformdirs==4.3.6 pluggy==1.5.0 # via pytest pytest==8.3.4 + # via + # -r dev_requirements.in + # pytest-cov +pytest-cov==6.0.0 # via -r dev_requirements.in typing-extensions==4.12.2 # via mypy diff --git a/src/utils/test_xml.py b/src/utils/test_xml.py new file mode 100644 index 0000000000..f106303e60 --- /dev/null +++ b/src/utils/test_xml.py @@ -0,0 +1,17 @@ +from utils.xml import assert_get_text +import xml.etree.ElementTree as ET +import pytest + +def test_assert_get_text(): + elem = ET.Element("root") + elem.text = "text" + + assert assert_get_text(elem) == "text" + + elem.text = None + with pytest.raises(AssertionError): + assert_get_text(elem) + + elem.text = 1 + with pytest.raises(AssertionError): + assert_get_text(elem) \ No newline at end of file From fd4507c3afcbd70d98d6565c733925840e5db062 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Wed, 22 Jan 2025 15:23:58 +0000 Subject: [PATCH 119/310] Update pipeline state machine --- terraform/variables.tf | 52 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/terraform/variables.tf b/terraform/variables.tf index d36bce0b48..c881c092d2 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -42,6 +42,56 @@ variable "state_machine_inputs" { "label" : "MeSH Concept Edges", "transformer_type" : "mesh_concepts", "entity_type" : "edges" - } + }, + { + "label" 
: "Wikidata Linked LoC Concept Nodes", + "transformer_type" : "wikidata_linked_loc_concepts", + "entity_type" : "nodes" + }, + { + "label" : "Wikidata Linked LoC Location Nodes", + "transformer_type" : "wikidata_linked_loc_locations", + "entity_type" : "nodes" + }, + { + "label" : "Wikidata Linked LoC Name Nodes", + "transformer_type" : "wikidata_linked_loc_names", + "entity_type" : "nodes" + }, + { + "label" : "Wikidata Linked LoC Concept Edges", + "transformer_type" : "wikidata_linked_loc_concepts", + "entity_type" : "edges" + }, + { + "label" : "Wikidata Linked LoC Location Edges", + "transformer_type" : "wikidata_linked_loc_locations", + "entity_type" : "edges" + }, + { + "label" : "Wikidata Linked LoC Name Edges", + "transformer_type" : "wikidata_linked_loc_names", + "entity_type" : "edges" + }, + { + "label" : "Wikidata Linked MeSH Concept Nodes", + "transformer_type" : "wikidata_linked_mesh_concepts", + "entity_type" : "nodes" + }, + { + "label" : "Wikidata Linked MeSH Location Nodes", + "transformer_type" : "wikidata_linked_mesh_locations", + "entity_type" : "nodes" + }, + { + "label" : "Wikidata Linked MeSH Concept Edges", + "transformer_type" : "wikidata_linked_mesh_concepts", + "entity_type" : "edges" + }, + { + "label" : "Wikidata Linked MeSH Location Edges", + "transformer_type" : "wikidata_linked_mesh_locations", + "entity_type" : "edges" + }, ] } From e747c8fa6273fc611c7d5e180fa65495b03b4958 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 22 Jan 2025 15:24:55 +0000 Subject: [PATCH 120/310] please mypy --- src/utils/test_xml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils/test_xml.py b/src/utils/test_xml.py index f106303e60..855b06b4aa 100644 --- a/src/utils/test_xml.py +++ b/src/utils/test_xml.py @@ -2,7 +2,7 @@ import xml.etree.ElementTree as ET import pytest -def test_assert_get_text(): +def test_assert_get_text() -> None: elem = ET.Element("root") elem.text = "text" @@ -12,6 +12,6 @@ def test_assert_get_text(): with pytest.raises(AssertionError): assert_get_text(elem) - elem.text = 1 + elem.text = 1 # type: ignore # Deliberate type error to test assertion with pytest.raises(AssertionError): assert_get_text(elem) \ No newline at end of file From 1acf3b1afade413f4095477bd74f35954e8f2fff Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Wed, 22 Jan 2025 15:26:39 +0000 Subject: [PATCH 121/310] Apply auto-formatting rules --- src/utils/test_xml.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/utils/test_xml.py b/src/utils/test_xml.py index 855b06b4aa..76df45d33e 100644 --- a/src/utils/test_xml.py +++ b/src/utils/test_xml.py @@ -1,7 +1,10 @@ -from utils.xml import assert_get_text import xml.etree.ElementTree as ET + import pytest +from utils.xml import assert_get_text + + def test_assert_get_text() -> None: elem = ET.Element("root") elem.text = "text" @@ -12,6 +15,6 @@ def test_assert_get_text() -> None: with pytest.raises(AssertionError): assert_get_text(elem) - elem.text = 1 # type: ignore # Deliberate type error to test assertion + elem.text = 1 # type: ignore # Deliberate type error to test assertion with pytest.raises(AssertionError): - assert_get_text(elem) \ No newline at end of file + assert_get_text(elem) From 4e0070dee11a31df6bd0db5f4c91aea4e46a893a Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 22 Jan 2025 15:32:58 +0000 Subject: [PATCH 122/310] add coverage report --- .github/workflows/test.yml | 20 ++++++++++++++++---- scripts/test.sh | 4 +++- 2 files 
changed, 19 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d913f3975f..7b1cc6fb4b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,10 +1,15 @@ # Build deployment artifact name: "Run tests" -on: push +on: + push: + branches: + - main + pull_request: + workflow_dispatch: jobs: - build: + test: runs-on: ubuntu-latest steps: @@ -17,6 +22,13 @@ jobs: run: | ./scripts/ci-setup.sh - - name: Setup CI + - name: Run tests & generate coverage report run: | - ./scripts/test.sh \ No newline at end of file + ./scripts/test.sh + + - name: Get Cover + if : ${{ github.event_name == 'pull_request' }} + uses: orgoro/coverage@v3.2 + with: + coverageFile: coverage.xml + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/scripts/test.sh b/scripts/test.sh index ca4867521a..52bdff7cf8 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -6,4 +6,6 @@ set -o nounset DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" ROOT+="$(dirname "$DIR")" -pytest --cov src ./src --cov-report term \ No newline at end of file +pytest --cov src ./src \ + --cov-report term \ + --cov-report xml:coverage.xml \ No newline at end of file From a820f838c2cce823ad5b02178850f12d3f26845f Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 22 Jan 2025 16:52:25 +0000 Subject: [PATCH 123/310] Add example query notebook --- .gitignore | 4 + notebooks/graph_exploration.ipynb | 395 ++++++++++++++++++++++++++++++ 2 files changed, 399 insertions(+) create mode 100644 notebooks/graph_exploration.ipynb diff --git a/.gitignore b/.gitignore index 90a6243bee..1adc4e1d59 100644 --- a/.gitignore +++ b/.gitignore @@ -378,3 +378,7 @@ tags # End of https://www.toptal.com/developers/gitignore/api/osx,vim,java,linux,python,intellij .terraform/ + +# notebooks +notebooks/* +!notebooks/graph_exploration.ipynb diff --git a/notebooks/graph_exploration.ipynb b/notebooks/graph_exploration.ipynb new file mode 100644 index 0000000000..743c6778f6 --- /dev/null +++ b/notebooks/graph_exploration.ipynb @@ -0,0 +1,395 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "3559b90d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%env AWS_PROFILE=platform-developer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cdd025f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%graph_notebook_config\n", + "{\n", + " \"host\": ,\n", + " \"neptune_service\": \"neptune-db\",\n", + " \"port\": 8182,\n", + " \"ssl\": true,\n", + " \"proxy_port\": 443,\n", + " \"proxy_host\": \"catalogue-graph.wellcomecollection.org\",\n", + " \"auth_mode\": \"IAM\",\n", + " \"aws_region\": \"eu-west-1\",\n", + " \"load_from_s3_arn\": \"\"\n", + "} " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0b6ae09", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%status" + ] + }, + { + "cell_type": "markdown", + "id": "5a008888-9f98-41c0-9185-858c80e2464a", + "metadata": {}, + "source": [ + "## Sample openCypher queries" + ] + }, + { + "cell_type": "markdown", + "id": "363bcdc3-062d-4ee8-be27-893c72ef701c", + "metadata": {}, + "source": [ + "Count the number of all `SourceConcept` nodes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "009b8a46-4fd0-4bfd-9eb3-77dac689bc2d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc\n", + "MATCH (c:SourceConcept)\n", + "RETURN count(c)" + ] + }, + { + "cell_type": "markdown", + "id": 
"47f1610a-2219-4554-9c25-11f725a53dec", + "metadata": {}, + "source": [ + "Count the number of `SourceConcept` nodes grouped by their source (LCSH, MeSH, Wikidata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a2aa8da-ba69-4af7-bd42-60f8624b61fd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc\n", + "MATCH (c:SourceConcept)\n", + "RETURN count(c), c.source" + ] + }, + { + "cell_type": "markdown", + "id": "925cc5aa-31ac-45f5-bfcd-c43a9e2f0c52", + "metadata": {}, + "source": [ + "We can do the same for `SourceLocation` and `SourceName` nodes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed03d76b-a7e2-4272-9eee-10e439869eb4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc\n", + "MATCH (l:SourceLocation)\n", + "RETURN count(l), l.source" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1159cda1-ec08-4d7c-a76a-5ef49db3453d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc\n", + "MATCH (n:SourceName)\n", + "RETURN count(n), n.source" + ] + }, + { + "cell_type": "markdown", + "id": "da0a41e3-ba77-44e3-91a0-9dabf0ec8a6f", + "metadata": {}, + "source": [ + "Using openCypher queries, we can easily traverse the edges in the graph. For example, we can use this query to look up the labels of `SourceConcept` parents:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74bdf7dd-be15-4f5f-a267-3c5321a808a8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc\n", + "MATCH (c:SourceConcept)-[:HAS_PARENT]->(p)\n", + "WHERE c.source='nlm-mesh'\n", + "RETURN c.label, p.label\n", + "LIMIT 10" + ] + }, + { + "cell_type": "markdown", + "id": "d9014952-3930-4d69-88d2-2d1b56f42d78", + "metadata": {}, + "source": [ + "We can also traverse multiple edges using the `*` operator. For example, the query below retrieves grandparent labels of `SourceConcept` nodes (i.e. `2` levels of `HAS_PARENT` edges)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "745fab86-375b-4462-8f1d-80b3380ee830", + "metadata": {}, + "outputs": [], + "source": [ + "%%oc\n", + "MATCH (c:SourceConcept)-[:HAS_PARENT*2]->(p)\n", + "WHERE c.source='nlm-mesh'\n", + "RETURN c.label, p.label\n", + "LIMIT 10" + ] + }, + { + "cell_type": "markdown", + "id": "84c96a06-2b1c-4741-9a2c-2fdde7a15a48", + "metadata": {}, + "source": [ + "We can count the number of links between sources via `SAME_AS` edges. This reveals a high level of Wikidata coverage for both LoC and MeSH `SourceConcepts`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5164734a-4673-47fb-8d4a-1eaa52f423b2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc\n", + "MATCH (sc1:SourceConcept)-[:SAME_AS]->(sc2:SourceConcept)\n", + "RETURN count(sc1), sc1.source" + ] + }, + { + "cell_type": "markdown", + "id": "bc5fa969-19af-46ad-80f1-fe1dd6bd1216", + "metadata": {}, + "source": [ + "It is also possible to view an interactive visualisation of query results when returning everyting (`*`), which can be accessed via the `Graph` tab. This can be customised with visualization hints using `-d`, `-de`, `-l` and `-g` after the `%%oc` magic command. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bd63ec2-3942-438f-a91e-2f1aab581013", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc -d label -l 20\n", + "MATCH(c:SourceConcept)-[r:NARROWER_THAN*]->(p)\n", + "WHERE c.id = 'sh00002633'\n", + "RETURN *\n", + "LIMIT 20" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c878c57-7571-49a9-909b-4ca50fbce349", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc -d label -l 20\n", + "MATCH (m:SourceConcept)-[p:HAS_PARENT*]->(c:SourceConcept)\n", + "WHERE m.id = 'D012499'\n", + "RETURN *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80b5ac50-2337-4615-bd58-f42fddb616ea", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc -d label -l 20\n", + "MATCH (m:SourceConcept)<-[p:HAS_PARENT*]-(c:SourceConcept)\n", + "WHERE m.id = 'D012499'\n", + "RETURN *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b3b2582-06f0-4989-93ee-7987f1b19047", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc -d label -l 20 -g source\n", + "MATCH (sc1:SourceConcept)-[r:SAME_AS*]->(sc2:SourceConcept)\n", + "WHERE sc1.id = 'D012499'\n", + "RETURN *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20a14a71-4870-4397-97d9-8e6efad98d71", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc -d label -l 25 -g source\n", + "MATCH (sc1:SourceConcept)-[r:RELATED_TO*..2]->(sc2:SourceConcept)\n", + "WHERE sc1.id = 'sh85117296'\n", + "RETURN *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3266937f-4ca6-4d46-a9b7-3d0496a78b8d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc -d label -l 25 -g source\n", + "MATCH (sc1:SourceConcept)-[r:NARROWER_THAN*..2]->(sc2:SourceConcept)\n", + "WHERE sc1.id = 'sh85117296'\n", + "RETURN *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8eab1235-c42d-44f0-bebf-376add4637b8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc -d label -l 25 -g source\n", + "MATCH (sc1:SourceConcept)<-[r:NARROWER_THAN*..2]-(sc2:SourceConcept)\n", + "WHERE sc1.id = 'sh85117296'\n", + "RETURN *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8a58fe4-556d-4b8e-bb4b-da61bd5f531b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc -d label -l 20 -g source\n", + "MATCH (sc1:SourceConcept)-[r:SAME_AS]->(sc2:SourceConcept)-[p:HAS_PARENT]->(sc3:SourceConcept)\n", + "RETURN *\n", + "LIMIT 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "807f813b-6469-4ad8-85df-ed09abed2f0a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc -d label -l 20\n", + "MATCH (sn1:SourceName)-[r:SAME_AS]->(sn2:SourceName)\n", + "WHERE sn1.id='n84804337'\n", + "RETURN *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "196c43b7-a9e1-441b-bbde-dc284ea069bc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%oc -d label -l 20\n", + "MATCH (sn1:SourceName)-[r:SAME_AS]->(sn2:SourceName)\n", + "WHERE sn1.id='Q542019'\n", + "RETURN *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab3103fd-9b7f-4fb6-80a6-efbbe56693f1", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "catgraph-exploration", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 3b4de72241ad1f2c6fd818a550c98cdc9614fd60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Wed, 22 Jan 2025 17:33:47 +0000 Subject: [PATCH 124/310] Fix coordinates bug --- src/sources/wikidata/sparql_client.py | 1 + src/sources/wikidata/sparql_query_builder.py | 34 +++++++++++++------- src/transformers/wikidata/raw_concept.py | 5 ++- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index ffbde5c1ce..5ebc2b059c 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -55,6 +55,7 @@ def run_query(self, query: str) -> list[dict]: with self.too_many_requests_lock: self.too_many_requests = True + print("Too many SPARQL requests. Sleeping...") retry_after = int(r.headers["Retry-After"]) time.sleep(max(60, retry_after)) diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index 1e0535173e..238a158be2 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -21,7 +21,18 @@ def _get_formatted_fields(node_type: NodeType) -> str: elif node_type == "locations": fields += ["?coordinates"] - return " ".join(fields) + # The Wikidata id (stored under '?item') is the only field in the 'GROUP BY' clause. SPARQL requires that all + # other fields are wrapped in an aggregate function. We use the 'SAMPLE' function, which chooses a value at + # random where multiple values are available for some field. (Currently only the '?coordinates' field sometimes + # stores multiple values, and we don't mind only extracting one of those values.) + fields_with_aggregation = [] + for field in fields: + if field == "?item": + fields_with_aggregation.append(field) + else: + fields_with_aggregation.append(f"(SAMPLE({field}) as {field})") + + return " ".join(fields_with_aggregation) @staticmethod def _get_formatted_field_mappings(node_type: NodeType) -> str: @@ -38,16 +49,7 @@ def _get_formatted_field_mappings(node_type: NodeType) -> str: "OPTIONAL { ?item wdt:P19 ?placeOfBirth. }", ] elif node_type == "locations": - definitions += [ - """ - { - SELECT ?item (SAMPLE(?coordinates) AS ?coordinates) { - ?item p:P625/ps:P625 ?coordinates. - } - GROUP BY ?item - } - """ - ] + definitions += ["OPTIONAL { ?item p:P625/ps:P625 ?coordinates. }"] return "\n".join(definitions) @@ -82,10 +84,18 @@ def get_items_query(cls, item_ids: list[str], node_type: NodeType) -> str: query = f""" SELECT DISTINCT {cls._get_formatted_fields(node_type)} WHERE {{ - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} VALUES ?item {{ {ids_clause} }} + {cls._get_formatted_field_mappings(node_type)} + + SERVICE wikibase:label {{ + bd:serviceParam wikibase:language "en". + ?item rdfs:label ?itemLabel. + ?item schema:description ?itemDescription. + ?item skos:altLabel ?itemAltLabel. 
+ }} }} + GROUP BY ?item """ return query diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index 7402aa9690..4e57e7982f 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -62,7 +62,10 @@ def _get_coordinates(self) -> dict[str, float | None]: """Extracts coordinates from a raw string in the format `Point( )` (e.g. `Point(9.83 53.54)`)""" # Some items do not return valid coordinates (e.g. Q17064702, whose coordinates just say 'unknown value' on the # Wikidata website). When this happens, the 'type' of the 'coordinates' property always appears to be 'uri'. - if self.raw_concept["coordinates"]["type"] == "uri": + if ( + "coordinates" not in self.raw_concept + or self.raw_concept["coordinates"]["type"] == "uri" + ): return {"longitude": None, "latitude": None} raw_coordinates = self._extract_field_value("coordinates") From 1e2b896ef9386604336bcd1ea10b3f0468737531 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 23 Jan 2025 12:24:35 +0000 Subject: [PATCH 125/310] Fix invalid LoC names bug --- .../wikidata/linked_ontology_id_type_checker.py | 14 ++++---------- src/sources/wikidata/linked_ontology_source.py | 1 + 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index 8cce4e134a..4aab9bc3f5 100644 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -51,14 +51,8 @@ def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: return ids def id_included_in_selected_type(self, linked_id: str) -> bool: - """Return `True` if a given linked ontology id is classified under the selected node type (concepts, - locations, or names).""" - - # To check whether a Library of Congress id is classified under 'names', we could examine all the 'names' ids, - # but the corresponding file is large and it would take too long. Instead, it's better to check that the - # LoC id starts with an 'n' and that it is not classified under 'locations'. - if self.linked_ontology == "loc" and self.node_type == "names": - location_ids = self._get_linked_ontology_ids("locations") - return linked_id not in location_ids and linked_id[0] == "n" - + """ + Return `True` if a given linked ontology id is classified under the selected node type (concepts, + locations, or names). + """ return linked_id in self._get_linked_ontology_ids(self.node_type) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 62b306c326..e5f7f2ca2c 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -106,6 +106,7 @@ def _stream_raw_edges(self) -> Generator[dict]: # Only yield the mapping if the linked id corresponds to the selected `node_type`, as determined by the # linked ontology. For example, if we want to stream Wikidata 'names' edges, but we classify the referenced # LoC id is a 'locations' id, we skip it. + # This also removes mappings which include invalid LoC ids (of which there are several thousand). 
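+            # Illustrative sketch with hypothetical values (not taken from real data):
+            #   mapping = {"wikidata_id": "Q123", "linked_id": "sh00000000"}
+            # would be yielded only if "sh00000000" is classified under the node type
+            # this source was instantiated with (e.g. "concepts"), and skipped otherwise.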
if self.id_type_checker.id_included_in_selected_type(mapping["linked_id"]): yield mapping From 949c9883cf0d73ad22fb15fbd9580140fed2b8c1 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Thu, 23 Jan 2025 13:45:16 +0000 Subject: [PATCH 126/310] Adds CODEOWNERS --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) create mode 100644 CODEOWNERS diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000000..524c6ae0e9 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1 @@ +* @wellcomecollection/digital-platform From 1b9f228ef5f6c0ec2038b7c61bd82dd757e29113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 23 Jan 2025 13:48:45 +0000 Subject: [PATCH 127/310] Retry failed Wikidata requests --- src/sources/wikidata/sparql_client.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index 5ebc2b059c..52af21cdb3 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -1,5 +1,6 @@ import threading import time +import backoff import requests @@ -9,6 +10,11 @@ MAX_PARALLEL_SPARQL_QUERIES = 4 +def on_request_backoff(backoff_details: dict) -> None: + exception_name = type(backoff_details["exception"]).__name__ + print(f"SPARQL request failed due to '{exception_name}'. Retrying...") + + class WikidataSparqlClient: """ A client class for querying Wikidata via SPARQL queries. Automatically throttles requests (in a thread-safe way) @@ -31,6 +37,13 @@ def _get_user_agent_header() -> str: "digital@wellcomecollection.org) wellcome-collection-catalogue-graph/0.1" ) + @backoff.on_exception( + backoff.constant, + Exception, + max_tries=3, + interval=10, + on_backoff=on_request_backoff, + ) def run_query(self, query: str) -> list[dict]: """Runs a query against Wikidata's SPARQL endpoint and returns the results as a list""" From d0093c1515a1d42fed395f3dac3dd18c60550cba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 23 Jan 2025 13:48:45 +0000 Subject: [PATCH 128/310] Retry failed Wikidata requests --- src/sources/wikidata/sparql_client.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index 5ebc2b059c..1e3cee8f4a 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -1,5 +1,7 @@ import threading import time +import backoff +import typing import requests @@ -9,6 +11,11 @@ MAX_PARALLEL_SPARQL_QUERIES = 4 +def on_request_backoff(backoff_details: typing.Any) -> None: + exception_name = type(backoff_details["exception"]).__name__ + print(f"SPARQL request failed due to '{exception_name}'. Retrying...") + + class WikidataSparqlClient: """ A client class for querying Wikidata via SPARQL queries. 
Automatically throttles requests (in a thread-safe way) @@ -31,6 +38,13 @@ def _get_user_agent_header() -> str: "digital@wellcomecollection.org) wellcome-collection-catalogue-graph/0.1" ) + @backoff.on_exception( + backoff.constant, + Exception, + max_tries=3, + interval=10, + on_backoff=on_request_backoff, + ) def run_query(self, query: str) -> list[dict]: """Runs a query against Wikidata's SPARQL endpoint and returns the results as a list""" From 6492f5b9272bea7d6f47748e213154c7ad656f0e Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Thu, 23 Jan 2025 14:06:00 +0000 Subject: [PATCH 129/310] Apply auto-formatting rules --- src/sources/wikidata/sparql_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index 1e3cee8f4a..975a6b5d66 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -1,8 +1,8 @@ import threading import time -import backoff import typing +import backoff import requests # Wikidata limits the number of parallel queries from a single IP address to 5. From 838c7b4cb36e2e943b0783299f7bcb87ea6cfed8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 23 Jan 2025 14:19:50 +0000 Subject: [PATCH 130/310] Increase Neptune client resilience --- src/clients/base_neptune_client.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 8839653bdb..745e1c2bee 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -6,6 +6,12 @@ import requests from botocore.auth import SigV4Auth from botocore.awsrequest import AWSRequest +import typing + + +def on_request_backoff(backoff_details: typing.Any) -> None: + exception_name = type(backoff_details["exception"]).__name__ + print(f"Neptune request failed due to '{exception_name}'. Retrying...") class BaseNeptuneClient: @@ -23,6 +29,13 @@ def __init__(self, neptune_endpoint: str) -> None: def _get_client_url(self) -> str: raise NotImplementedError() + @backoff.on_exception( + backoff.constant, + Exception, + max_tries=3, + interval=10, + on_backoff=on_request_backoff, + ) def _make_request( self, method: str, relative_url: str, payload: dict | None = None ) -> dict: @@ -50,7 +63,6 @@ def _make_request( response: dict = raw_response.json() return response - @backoff.on_exception(backoff.constant, Exception, max_tries=5, interval=1) def run_open_cypher_query(self, query: str) -> dict: """Runs an openCypher query against the Neptune cluster. 
Automatically retries up to 5 times to mitigate transient errors.""" From 462c42a908b81bdb18abe951ec91fe562b00a0e6 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Thu, 23 Jan 2025 14:21:06 +0000 Subject: [PATCH 131/310] Apply auto-formatting rules --- src/clients/base_neptune_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 745e1c2bee..0cf2ec006f 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -1,12 +1,12 @@ import datetime import json +import typing import backoff import boto3 import requests from botocore.auth import SigV4Auth from botocore.awsrequest import AWSRequest -import typing def on_request_backoff(backoff_details: typing.Any) -> None: From dc1f543ca1481a1802d7da32f11d733e02901d1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 23 Jan 2025 17:07:26 +0000 Subject: [PATCH 132/310] Build fix --- terraform/lambda_bulk_load_poller.tf | 6 +++--- terraform/lambda_bulk_loader.tf | 4 ++-- terraform/lambda_extractor.tf | 4 ++-- terraform/lambda_indexer.tf | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/terraform/lambda_bulk_load_poller.tf b/terraform/lambda_bulk_load_poller.tf index 57690876c3..9c1839343d 100644 --- a/terraform/lambda_bulk_load_poller.tf +++ b/terraform/lambda_bulk_load_poller.tf @@ -5,12 +5,12 @@ module "bulk_load_poller_lambda" { description = "Polls the status of a Neptune bulk load job." runtime = "python3.13" - filename = "../build.zip" - source_code_hash = filesha256("../build.zip") + filename = "../target/build.zip" + source_code_hash = filesha256("../target/build.zip") handler = "bulk_load_poller.lambda_handler" memory_size = 128 - timeout = 30 // 30 seconds + timeout = 120 // 120 seconds vpc_config = { subnet_ids = local.private_subnets diff --git a/terraform/lambda_bulk_loader.tf b/terraform/lambda_bulk_loader.tf index 59a92cd281..4801d75adf 100644 --- a/terraform/lambda_bulk_loader.tf +++ b/terraform/lambda_bulk_loader.tf @@ -5,8 +5,8 @@ module "bulk_loader_lambda" { description = "Bulk loads entities from an S3 bucket into the Neptune database." runtime = "python3.13" - filename = "../build.zip" - source_code_hash = filesha256("../build.zip") + filename = "../target/build.zip" + source_code_hash = filesha256("../target/build.zip") handler = "bulk_loader.lambda_handler" memory_size = 128 diff --git a/terraform/lambda_extractor.tf b/terraform/lambda_extractor.tf index 69a818bc57..d4dd5829aa 100644 --- a/terraform/lambda_extractor.tf +++ b/terraform/lambda_extractor.tf @@ -5,8 +5,8 @@ module "extractor_lambda" { description = "Extracts source concepts and turns them into Cypher queries." runtime = "python3.13" - filename = "../build.zip" - source_code_hash = filesha256("../build.zip") + filename = "../target/build.zip" + source_code_hash = filesha256("../target/build.zip") handler = "extractor.lambda_handler" diff --git a/terraform/lambda_indexer.tf b/terraform/lambda_indexer.tf index 0beb6e48a0..53c37af1a5 100644 --- a/terraform/lambda_indexer.tf +++ b/terraform/lambda_indexer.tf @@ -5,8 +5,8 @@ module "indexer_lambda" { description = "Indexes nodes and edges into the Neptune catalogue graph cluster." 
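  # scripts/build.sh stages the deployment zip under target/, so the filename below
  # points at ../target/build.zip rather than the old ../build.zip at the repo root.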
runtime = "python3.13" - filename = "../build.zip" - source_code_hash = filesha256("../build.zip") + filename = "../target/build.zip" + source_code_hash = filesha256("../target/build.zip") handler = "indexer.lambda_handler" memory_size = 128 From 39604d88fd7849857b21868136e80c3af1df51fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 23 Jan 2025 17:25:26 +0000 Subject: [PATCH 133/310] Add Wikidata parent concepts (draft) --- .../wikidata/linked_ontology_source.py | 60 +++++++++++++------ src/sources/wikidata/sparql_client.py | 3 +- src/sources/wikidata/sparql_query_builder.py | 42 ++++++++++--- 3 files changed, 77 insertions(+), 28 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index e5f7f2ca2c..c52cac25d8 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -3,6 +3,7 @@ from sources.base_source import BaseSource from transformers.base_transformer import EntityType from utils.streaming import process_stream_in_parallel +from typing import Callable from .linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker from .sparql_client import MAX_PARALLEL_SPARQL_QUERIES, WikidataSparqlClient @@ -13,20 +14,20 @@ WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" -def extract_wikidata_id(item: dict) -> str: +def extract_wikidata_id(item: dict, key: str = "item") -> str: """ Accepts a raw `item` dictionary returned by the Wikidata SPARQL endpoint and returns the Wikidata id of the item. """ - assert isinstance(item["item"]["value"], str) - assert item["item"]["type"] == "uri" - return item["item"]["value"].removeprefix(WIKIDATA_ID_PREFIX) + assert isinstance(item[key]["value"], str) + assert item[key]["type"] == "uri" + return item[key]["value"].removeprefix(WIKIDATA_ID_PREFIX) class WikidataLinkedOntologySource(BaseSource): """ A source for streaming selected Wikidata nodes/edges. There are _many_ Wikidata items, so we cannot store all of - them in the graph. Instead, we only include items which reference an id from a selected linked ontology, - (LoC or MeSH), as defined by the `linked_ontology` parameter. + them in the graph. Instead, we only include items which reference an id from a selected linked ontology + (LoC or MeSH) and their parents. Wikidata puts strict limits on the resources which can be consumed by a single query, and queries which include filters or do other expensive processing often time out or return a stack overflow error. This means we need @@ -66,10 +67,30 @@ def _get_linked_id_mappings(self, wikidata_ids: list[str]) -> list[dict]: ) return self.client.run_query(query) - def _get_linked_items(self, wikidata_ids: list[str]) -> list: + def _get_wikidata_items(self, wikidata_ids: list[str]) -> list: query = SparqlQueryBuilder.get_items_query(wikidata_ids, self.node_type) return self.client.run_query(query) + def _get_parent_id_mappings(self, child_wikidata_ids: list[str]) -> list[dict]: + query = SparqlQueryBuilder.get_filtered_parents_query( + child_wikidata_ids, self.linked_ontology + ) + return self.client.run_query(query) + + @staticmethod + def _parallelise_requests( + items: Generator, run_sparql_query: Callable[[list], list] + ) -> Generator: + """Accept an `items` generator and a `run_sparql_query` method. Split `items` chunks and apply + `run_sparql_query` to each chunk. 
Return a single generator of results.""" + for raw_response_item in process_stream_in_parallel( + items, + run_sparql_query, + SPARQL_ITEMS_CHUNK_SIZE, + MAX_PARALLEL_SPARQL_QUERIES, + ): + yield raw_response_item + def _stream_wikidata_ids(self) -> Generator[str]: """Streams filtered edges using the `_stream_raw_edges` method and extracts Wikidata ids from them.""" seen = set() @@ -93,11 +114,8 @@ def _stream_raw_edges(self) -> Generator[dict]: all_ids = self._get_all_ids() # Parallelise the second query to retrieve the mappings faster. - for raw_mapping in process_stream_in_parallel( - all_ids, - self._get_linked_id_mappings, - SPARQL_ITEMS_CHUNK_SIZE, - MAX_PARALLEL_SPARQL_QUERIES, + for raw_mapping in self._parallelise_requests( + all_ids, self._get_linked_id_mappings ): linked_id = raw_mapping["linkedId"]["value"] wikidata_id = extract_wikidata_id(raw_mapping) @@ -110,6 +128,16 @@ def _stream_raw_edges(self) -> Generator[dict]: if self.id_type_checker.id_included_in_selected_type(mapping["linked_id"]): yield mapping + for raw_mapping in self._parallelise_requests( + all_ids, self._get_parent_id_mappings + ): + mapping = { + "child_id": extract_wikidata_id(raw_mapping, "child"), + "parent_id": extract_wikidata_id(raw_mapping), + } + + yield mapping + def _stream_raw_nodes(self) -> Generator[dict]: """ Extract nodes via the following steps: @@ -118,13 +146,7 @@ def _stream_raw_nodes(self) -> Generator[dict]: Wikidata fields required to create a node. """ all_ids = self._stream_wikidata_ids() - - yield from process_stream_in_parallel( - all_ids, - self._get_linked_items, - SPARQL_ITEMS_CHUNK_SIZE, - MAX_PARALLEL_SPARQL_QUERIES, - ) + yield from self._parallelise_requests(all_ids, self._get_wikidata_items) def stream_raw(self) -> Generator[dict]: if self.entity_type == "nodes": diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index 52af21cdb3..b454a384a6 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -1,6 +1,7 @@ import threading import time import backoff +from backoff._typing import Details import requests @@ -10,7 +11,7 @@ MAX_PARALLEL_SPARQL_QUERIES = 4 -def on_request_backoff(backoff_details: dict) -> None: +def on_request_backoff(backoff_details: Details) -> None: exception_name = type(backoff_details["exception"]).__name__ print(f"SPARQL request failed due to '{exception_name}'. Retrying...") diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index 238a158be2..ef6fde379c 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -34,6 +34,15 @@ def _get_formatted_fields(node_type: NodeType) -> str: return " ".join(fields_with_aggregation) + @staticmethod + def _get_linked_ontology_filter(linked_ontology: OntologyType): + if linked_ontology == "loc": + return "?item p:P244/ps:P244 ?linkedId." + elif linked_ontology == "mesh": + return "?item p:P486/ps:P486 ?linkedId." + + raise ValueError(f"Invalid linked ontology type: {linked_ontology}") + @staticmethod def _get_formatted_field_mappings(node_type: NodeType) -> str: """ @@ -108,20 +117,37 @@ def get_linked_ids_query( Given a list of Wikidata `item_ids`, return a query to retrieve all linked ontology ids referenced by each item in the list. """ - if linked_ontology == "loc": - field_filter = "?item p:P244/ps:P244 ?linkedId." - elif linked_ontology == "mesh": - field_filter = "?item p:P486/ps:P486 ?linkedId." 
- else: - raise ValueError(f"Invalid linked ontology type: {linked_ontology}") - ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in item_ids]) query = f""" SELECT DISTINCT ?item ?linkedId WHERE {{ VALUES ?item {{ {ids_clause} }} - {field_filter} + {cls._get_linked_ontology_filter(linked_ontology)} + }} + """ + + return query + + @classmethod + def get_filtered_parents_query( + cls, item_ids: list[str], linked_ontology: OntologyType + ) -> str: + """ + Given a list of Wikidata `item_ids`, return a query to retrieve all parents of each item in the list, + filtering out items which reference ids from `linked_ontology`. + Parents are determined based on the 'subclass of' (P279) and the 'instance of' (P31) fields. + """ + ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in item_ids]) + + # We need to filter out items which link to the selected ontology as those items are added + # to the graph separately. + query = f""" + SELECT DISTINCT ?child ?item + WHERE {{ + VALUES ?child {{ {ids_clause} }} + ?child wdt:P31 ?item. + FILTER NOT EXISTS {{ {cls._get_linked_ontology_filter(linked_ontology)} }} }} """ From 3afeeafb01b2b3f4cfe5dc68964160c8d5af685c Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 23 Jan 2025 20:44:29 +0000 Subject: [PATCH 134/310] Add source for catalogue concepts --- src/sources/catalogue/__init__.py | 0 src/sources/catalogue/concepts_source.py | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 src/sources/catalogue/__init__.py create mode 100644 src/sources/catalogue/concepts_source.py diff --git a/src/sources/catalogue/__init__.py b/src/sources/catalogue/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/sources/catalogue/concepts_source.py b/src/sources/catalogue/concepts_source.py new file mode 100644 index 0000000000..50f78e1c88 --- /dev/null +++ b/src/sources/catalogue/concepts_source.py @@ -0,0 +1,23 @@ +import gzip +import json +from collections.abc import Generator + +import requests + +from sources.base_source import BaseSource + +CONCEPT_KEYS = ["subjects", "genres", "contributors"] + +class CatalogueConceptSource(BaseSource): + def __init__(self, url: str): + self.url = url + + def stream_raw(self) -> Generator[dict]: + response = requests.get(self.url, stream=True) + + with gzip.GzipFile(fileobj=response.raw) as file: + for line_bytes in file: + work = json.loads(line_bytes.decode("utf8")) + for conecpt_key in CONCEPT_KEYS: + for raw_concept in work.get(conecpt_key, []): + yield raw_concept From beca6e49acf484545c819aeeb65e6f8b8dacf21e Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Thu, 23 Jan 2025 13:22:08 +0000 Subject: [PATCH 135/310] add top level test for indexer --- pytest.ini | 2 + src/clients/lambda_neptune_client.py | 6 +- src/conftest.py | 10 +++ src/test_indexer.py | 33 ++++++++++ src/test_mocks.py | 99 ++++++++++++++++++++++++++++ src/utils/aws.py | 13 ++-- 6 files changed, 153 insertions(+), 10 deletions(-) create mode 100644 pytest.ini create mode 100644 src/conftest.py create mode 100644 src/test_indexer.py create mode 100644 src/test_mocks.py diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000000..c1fa878547 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = -p no:warnings \ No newline at end of file diff --git a/src/clients/lambda_neptune_client.py b/src/clients/lambda_neptune_client.py index 028985c93d..544d49809b 100644 --- a/src/clients/lambda_neptune_client.py +++ b/src/clients/lambda_neptune_client.py @@ -9,11 +9,7 
@@ class LambdaNeptuneClient(BaseNeptuneClient): def __init__(self, neptune_endpoint: str): super().__init__(neptune_endpoint) - self.session = boto3.Session( - aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), - aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), - aws_session_token=os.getenv("AWS_SESSION_TOKEN"), - ) + self.session = boto3.Session() def _get_client_url(self) -> str: return f"https://{self.neptune_endpoint}:8182" diff --git a/src/conftest.py b/src/conftest.py new file mode 100644 index 0000000000..d2c4ad0562 --- /dev/null +++ b/src/conftest.py @@ -0,0 +1,10 @@ +import pytest + +from test_mocks import MockBoto3Session, MockRequest + + +@pytest.fixture(autouse=True) +def test(monkeypatch): + # Replaces boto3 and Elasticsearch with fake clients + monkeypatch.setattr("boto3.Session", MockBoto3Session) + monkeypatch.setattr("requests.request", MockRequest.request) diff --git a/src/test_indexer.py b/src/test_indexer.py new file mode 100644 index 0000000000..8b03ec4016 --- /dev/null +++ b/src/test_indexer.py @@ -0,0 +1,33 @@ +import json + +from indexer import lambda_handler +from test_mocks import MockRequest + + +def test_lambda_handler(): + MockRequest.responses = [ + { + "method": "POST", + "url": "https://test-host.com:8182/openCypher", + "status_code": 200, + "json_data": {"results": {"foo": "bar"}}, + } + ] + + event = {"Records": [{"body": json.dumps({"Message": "SOME_QUERY"})}]} + + lambda_handler(event, None) + + assert len(MockRequest.calls) == 1 + request = MockRequest.calls[0] + + # Check we are sending a POST request to the correct endpoint + assert request["method"] == "POST" + assert request["url"] == "https://test-host.com:8182/openCypher" + + # Check we are sending the correct data + assert request["data"] == json.dumps({"query": "SOME_QUERY"}) + + # Check we are building a SigV4Auth header + assert "Authorization" in request["headers"] + assert "AWS4-HMAC-SHA256" in request["headers"]["Authorization"] diff --git a/src/test_mocks.py b/src/test_mocks.py new file mode 100644 index 0000000000..b4310834a3 --- /dev/null +++ b/src/test_mocks.py @@ -0,0 +1,99 @@ +from botocore.credentials import Credentials + +from utils.aws import INSTANCE_ENDPOINT_SECRET_NAME, LOAD_BALANCER_SECRET_NAME + +MOCK_API_KEY = "TEST_SECRET_API_KEY_123" +MOCK_INSTANCE_ENDPOINT = "test-host.com" +MOCK_CREDENTIALS = Credentials( + access_key="test_access_key", + secret_key="test", + token="test_token", +) + + +class MockSecretsManagerClient: + def get_secret_value(self, SecretId: str): + if SecretId == LOAD_BALANCER_SECRET_NAME: + secret_value = MOCK_API_KEY + elif SecretId == INSTANCE_ENDPOINT_SECRET_NAME: + secret_value = MOCK_INSTANCE_ENDPOINT + else: + raise KeyError("Secret value does not exist.") + + return {"SecretString": secret_value} + + +class MockBoto3Session: + def __init__(self): + self.clients = { + "secretsmanager": MockSecretsManagerClient(), + } + + def client(self, client_name: str): + if client_name not in self.clients: + raise KeyError("There is no mock client for the specified client_name.") + + return self.clients[client_name] + + def get_credentials(self): + return MOCK_CREDENTIALS + + +class MockResponse: + def __init__(self, json_data, status_code): + self.json_data = json_data + self.status_code = status_code + + def json(self): + return self.json_data + + +class MockRequest: + responses = {} + calls = [] + + @staticmethod + def clear_mock_responses(): + MockRequest.responses = {} + + @staticmethod + def clear_mock_calls(): + MockRequest.calls = [] + + 
@staticmethod + def reset_mocks(): + MockRequest.clear_mock_responses() + MockRequest.clear_mock_calls() + + @staticmethod + def mock_response(method, url, status_code, json_data): + MockRequest.responses.append( + { + "method": method, + "url": url, + "status_code": status_code, + "json_data": json_data, + } + ) + + @staticmethod + def mock_responses(method, url, responses): + MockRequest.clear_mock_responses() + for response in responses: + MockRequest.mock_response( + method, url, response["status_code"], response["json_data"] + ) + + @staticmethod + def request(method, url, **kwargs): + data = kwargs.get("data") + headers = kwargs.get("headers") + + MockRequest.calls.append( + {"method": method, "url": url, "data": data, "headers": headers} + ) + for response in MockRequest.responses: + if response["method"] == method and response["url"] == url: + return MockResponse(response["json_data"], response["status_code"]) + + raise Exception(f"Unexpected request: {method} {url}") diff --git a/src/utils/aws.py b/src/utils/aws.py index cb40ac178e..f29a867c4a 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -6,10 +6,13 @@ from clients.lambda_neptune_client import LambdaNeptuneClient from clients.local_neptune_client import LocalNeptuneClient +LOAD_BALANCER_SECRET_NAME = "NeptuneTest/LoadBalancerUrl" +INSTANCE_ENDPOINT_SECRET_NAME = "NeptuneTest/InstanceEndpoint" + def get_secret(secret_name: str) -> str: """Returns an AWS Secrets Manager secret string associated with a given secret name.""" - secrets_manager_client = boto3.client("secretsmanager", region_name="eu-west-1") + secrets_manager_client = boto3.Session().client("secretsmanager") response = secrets_manager_client.get_secret_value(SecretId=secret_name) secret: str = response["SecretString"] @@ -31,7 +34,7 @@ def publish_batch_to_sns(topic_arn: str, messages: list[str]) -> None: } ) - boto3.client("sns").publish_batch( + boto3.Session().client("sns").publish_batch( TopicArn=topic_arn, PublishBatchRequestEntries=request_entries, ) @@ -44,8 +47,8 @@ def get_neptune_client(is_local: bool) -> BaseNeptuneClient: """ if is_local: return LocalNeptuneClient( - get_secret("NeptuneTest/LoadBalancerUrl"), - get_secret("NeptuneTest/InstanceEndpoint"), + get_secret(LOAD_BALANCER_SECRET_NAME), + get_secret(INSTANCE_ENDPOINT_SECRET_NAME), ) else: - return LambdaNeptuneClient(get_secret("NeptuneTest/InstanceEndpoint")) + return LambdaNeptuneClient(get_secret(INSTANCE_ENDPOINT_SECRET_NAME)) From 06feeddb9b3972a0c3791e3d136e0364eae9aac2 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Thu, 23 Jan 2025 13:42:58 +0000 Subject: [PATCH 136/310] type hints --- src/conftest.py | 3 ++- src/test_indexer.py | 2 +- src/test_mocks.py | 40 +++++++++++++++++++++------------------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/conftest.py b/src/conftest.py index d2c4ad0562..7292ac18f2 100644 --- a/src/conftest.py +++ b/src/conftest.py @@ -1,10 +1,11 @@ import pytest +from _pytest.monkeypatch import MonkeyPatch from test_mocks import MockBoto3Session, MockRequest @pytest.fixture(autouse=True) -def test(monkeypatch): +def test(monkeypatch: MonkeyPatch) -> None: # Replaces boto3 and Elasticsearch with fake clients monkeypatch.setattr("boto3.Session", MockBoto3Session) monkeypatch.setattr("requests.request", MockRequest.request) diff --git a/src/test_indexer.py b/src/test_indexer.py index 8b03ec4016..17598068b7 100644 --- a/src/test_indexer.py +++ b/src/test_indexer.py @@ -4,7 +4,7 @@ from test_mocks import MockRequest -def 
test_lambda_handler(): +def test_lambda_handler() -> None: MockRequest.responses = [ { "method": "POST", diff --git a/src/test_mocks.py b/src/test_mocks.py index b4310834a3..eeb658a5c1 100644 --- a/src/test_mocks.py +++ b/src/test_mocks.py @@ -11,8 +11,13 @@ ) -class MockSecretsManagerClient: - def get_secret_value(self, SecretId: str): +class MockAwsService: + def __init__(self) -> None: + return None + + +class MockSecretsManagerClient(MockAwsService): + def get_secret_value(self, SecretId: str) -> dict: if SecretId == LOAD_BALANCER_SECRET_NAME: secret_value = MOCK_API_KEY elif SecretId == INSTANCE_ENDPOINT_SECRET_NAME: @@ -24,49 +29,49 @@ def get_secret_value(self, SecretId: str): class MockBoto3Session: - def __init__(self): + def __init__(self) -> None: self.clients = { "secretsmanager": MockSecretsManagerClient(), } - def client(self, client_name: str): + def client(self, client_name: str) -> MockAwsService: if client_name not in self.clients: raise KeyError("There is no mock client for the specified client_name.") return self.clients[client_name] - def get_credentials(self): + def get_credentials(self) -> Credentials: return MOCK_CREDENTIALS class MockResponse: - def __init__(self, json_data, status_code): + def __init__(self, json_data: dict, status_code: int) -> None: self.json_data = json_data self.status_code = status_code - def json(self): + def json(self) -> dict: return self.json_data class MockRequest: - responses = {} - calls = [] + responses: list[dict] = [] + calls: list[dict] = [] @staticmethod - def clear_mock_responses(): - MockRequest.responses = {} + def clear_mock_responses() -> None: + MockRequest.responses = [] @staticmethod - def clear_mock_calls(): + def clear_mock_calls() -> None: MockRequest.calls = [] @staticmethod - def reset_mocks(): + def reset_mocks() -> None: MockRequest.clear_mock_responses() MockRequest.clear_mock_calls() @staticmethod - def mock_response(method, url, status_code, json_data): + def mock_response(method: str, url: str, status_code: int, json_data: dict) -> None: MockRequest.responses.append( { "method": method, @@ -77,7 +82,7 @@ def mock_response(method, url, status_code, json_data): ) @staticmethod - def mock_responses(method, url, responses): + def mock_responses(method: str, url: str, responses: list[dict]) -> None: MockRequest.clear_mock_responses() for response in responses: MockRequest.mock_response( @@ -85,10 +90,7 @@ def mock_responses(method, url, responses): ) @staticmethod - def request(method, url, **kwargs): - data = kwargs.get("data") - headers = kwargs.get("headers") - + def request(method: str, url: str, data: dict, headers: dict) -> MockResponse: MockRequest.calls.append( {"method": method, "url": url, "data": data, "headers": headers} ) From b9473dc998234c0aba7c7b656cafe10cc86dbace Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Fri, 24 Jan 2025 10:33:10 +0000 Subject: [PATCH 137/310] Refactor catalogue concepts source --- src/sources/catalogue/concepts_source.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/sources/catalogue/concepts_source.py b/src/sources/catalogue/concepts_source.py index 50f78e1c88..28900be3ac 100644 --- a/src/sources/catalogue/concepts_source.py +++ b/src/sources/catalogue/concepts_source.py @@ -1,10 +1,7 @@ -import gzip -import json from collections.abc import Generator -import requests - from sources.base_source import BaseSource +from sources.gzip_source import GZipSource CONCEPT_KEYS = ["subjects", "genres", "contributors"] @@ -13,11 +10,9 
@@ def __init__(self, url: str): self.url = url def stream_raw(self) -> Generator[dict]: - response = requests.get(self.url, stream=True) - - with gzip.GzipFile(fileobj=response.raw) as file: - for line_bytes in file: - work = json.loads(line_bytes.decode("utf8")) - for conecpt_key in CONCEPT_KEYS: - for raw_concept in work.get(conecpt_key, []): - yield raw_concept + """Streams raw concept nodes from a work's subjects, genres, and contributors.""" + catalogue_source = GZipSource(self.url) + for work in catalogue_source.stream_raw(): + for conecpt_key in CONCEPT_KEYS: + for raw_concept in work.get(conecpt_key, []): + yield raw_concept From 3632cd7fb038ef79eb0ff6f91b0e0548c472e545 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Fri, 24 Jan 2025 10:34:16 +0000 Subject: [PATCH 138/310] Consistent source naming --- src/sources/catalogue/concepts_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sources/catalogue/concepts_source.py b/src/sources/catalogue/concepts_source.py index 28900be3ac..ebc985e30d 100644 --- a/src/sources/catalogue/concepts_source.py +++ b/src/sources/catalogue/concepts_source.py @@ -5,7 +5,7 @@ CONCEPT_KEYS = ["subjects", "genres", "contributors"] -class CatalogueConceptSource(BaseSource): +class CatalogueConceptsSource(BaseSource): def __init__(self, url: str): self.url = url From 6e713b123338c0c2d6de250fa50fa4444d23f83a Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Fri, 24 Jan 2025 11:34:37 +0000 Subject: [PATCH 139/310] Add raw catalogue concept --- src/transformers/catalogue/raw_concept.py | 56 +++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 src/transformers/catalogue/raw_concept.py diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py new file mode 100644 index 0000000000..3b9bda100e --- /dev/null +++ b/src/transformers/catalogue/raw_concept.py @@ -0,0 +1,56 @@ +from typing import Literal, get_args + +ConceptType = Literal["Concept", + "Person", + "Organisation", + "Meeting", + "Period", + "Subject", + "Place", + "Agent", + "Genre"] + + +class RawCatalogueConcept: + def __init__(self, raw_concept: dict): + self.raw_concept = self._extract_concept_node(raw_concept) + + @staticmethod + def _extract_concept_node(raw_concept): + # There should be either one concept inside a list, or one agent + if len(raw_concept.get("concepts", [])) > 0: + return raw_concept["concepts"][0] + else: + return raw_concept.get("agent") + + @property + def is_concept(self) -> bool: + if isinstance(self.raw_concept.get("type"), str): + if self.raw_concept["type"] in get_args(ConceptType) and self.raw_concept.get("identifiers"): + return True + return False + + @property + def wellcome_id(self) -> str: + return self.raw_concept["id"] + + @property + def label(self) -> str: + return self.raw_concept["label"] + + @property + def type(self) -> ConceptType: + return self.raw_concept["type"] + + def _get_identifier(self) -> dict: + identifier = self.raw_concept.get("identifiers", []) + # There should be exactly one source identifier for each concept + assert(len(identifier) == 1) + + return identifier[0] + + @property + def source(self) -> str: + + identifier = self._get_identifier() + return identifier["identifierType"]["id"] From af33d39021446d3996bffbda92743c25f23f0d08 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Fri, 24 Jan 2025 16:09:05 +0000 Subject: [PATCH 140/310] Add literal types --- src/models/graph_node.py | 13 +++--- src/transformers/catalogue/raw_concept.py 
| 51 +++++++++++++---------- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/src/models/graph_node.py b/src/models/graph_node.py index d1d7241a45..b1209a34cc 100644 --- a/src/models/graph_node.py +++ b/src/models/graph_node.py @@ -34,10 +34,7 @@ class SourceName(SourceConcept): date_of_death: Optional[datetime.date] = None place_of_birth: Optional[str] = None - -# The `id` field stores a canonical Wellcome identifier -class Concept(BaseNode): - type: Literal[ +ConceptType = Literal[ "Person", "Concept", "Organisation", @@ -47,6 +44,12 @@ class Concept(BaseNode): "Genre", "Period", ] - source: Literal[ + +ConceptSource = Literal[ "label-derived", "nlm-mesh", "lc-subjects", "lc-names", "viaf", "fihrist" ] + +# The `id` field stores a canonical Wellcome identifier +class Concept(BaseNode): + type: ConceptType + source: ConceptSource \ No newline at end of file diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index 3b9bda100e..e006c20a58 100644 --- a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -1,14 +1,6 @@ -from typing import Literal, get_args +from typing import cast, get_args -ConceptType = Literal["Concept", - "Person", - "Organisation", - "Meeting", - "Period", - "Subject", - "Place", - "Agent", - "Genre"] +from models.graph_node import ConceptType, ConceptSource class RawCatalogueConcept: @@ -16,12 +8,14 @@ def __init__(self, raw_concept: dict): self.raw_concept = self._extract_concept_node(raw_concept) @staticmethod - def _extract_concept_node(raw_concept): + def _extract_concept_node(raw_concept: dict) -> dict: # There should be either one concept inside a list, or one agent if len(raw_concept.get("concepts", [])) > 0: - return raw_concept["concepts"][0] + raw_concept_node = raw_concept["concepts"][0] else: - return raw_concept.get("agent") + raw_concept_node = raw_concept.get("agent") + assert isinstance(raw_concept_node, dict) + return raw_concept_node @property def is_concept(self) -> bool: @@ -32,25 +26,38 @@ def is_concept(self) -> bool: @property def wellcome_id(self) -> str: - return self.raw_concept["id"] + wellcome_id = self.raw_concept.get("id") + assert isinstance(wellcome_id, str) + return wellcome_id @property def label(self) -> str: - return self.raw_concept["label"] + label = self.raw_concept.get("label") + assert isinstance(label, str) + return label @property def type(self) -> ConceptType: - return self.raw_concept["type"] + concept_type = self.raw_concept.get("type") + if concept_type in get_args(ConceptType): + return cast(ConceptType, concept_type) + raise ValueError("Concept type not recognised.") def _get_identifier(self) -> dict: - identifier = self.raw_concept.get("identifiers", []) + raw_identifier = self.raw_concept.get("identifiers", []) # There should be exactly one source identifier for each concept - assert(len(identifier) == 1) - - return identifier[0] + assert len(raw_identifier) == 1 + identifier = raw_identifier[0] + + assert isinstance(identifier, dict) + return identifier @property - def source(self) -> str: + def source(self) -> ConceptSource: identifier = self._get_identifier() - return identifier["identifierType"]["id"] + + source = identifier["identifierType"]["id"] + if source in get_args(ConceptSource): + return cast(ConceptSource, source) + raise ValueError("Concept source not recognised.") From ac76c2551faaf6431ead1b3d0cf7396e3cf16d0a Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Fri, 24 Jan 2025 16:23:31 +0000 Subject: 
[PATCH 141/310] Add docstrings --- src/models/graph_node.py | 2 ++ src/transformers/catalogue/raw_concept.py | 21 ++++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/models/graph_node.py b/src/models/graph_node.py index b1209a34cc..04de7b7743 100644 --- a/src/models/graph_node.py +++ b/src/models/graph_node.py @@ -34,6 +34,8 @@ class SourceName(SourceConcept): date_of_death: Optional[datetime.date] = None place_of_birth: Optional[str] = None + +# Catalogue concepts have a specific type and source ConceptType = Literal[ "Person", "Concept", diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index e006c20a58..96158217f2 100644 --- a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -9,41 +9,58 @@ def __init__(self, raw_concept: dict): @staticmethod def _extract_concept_node(raw_concept: dict) -> dict: - # There should be either one concept inside a list, or one agent + """ + Extracts raw concepts data from one of two formats: + Either as a dicitonary inside a list under "concepts", or as a dictionary under "agent". + """ if len(raw_concept.get("concepts", [])) > 0: raw_concept_node = raw_concept["concepts"][0] else: raw_concept_node = raw_concept.get("agent") + assert isinstance(raw_concept_node, dict) return raw_concept_node @property def is_concept(self) -> bool: + """ + Determines whether a given block of JSON represents a Concept as returned from the Catalogue API. + A Concept is a block of JSON with a type property and a list of identifiers. + """ if isinstance(self.raw_concept.get("type"), str): if self.raw_concept["type"] in get_args(ConceptType) and self.raw_concept.get("identifiers"): return True + return False @property def wellcome_id(self) -> str: + """Returns the canonical Wellcome identifier.""" wellcome_id = self.raw_concept.get("id") + assert isinstance(wellcome_id, str) return wellcome_id @property def label(self) -> str: + """Returns the concept label.""" label = self.raw_concept.get("label") + assert isinstance(label, str) return label @property def type(self) -> ConceptType: + """Returns the concept type (one of "Person", "Concept", "Genre", etc.).""" concept_type = self.raw_concept.get("type") + if concept_type in get_args(ConceptType): return cast(ConceptType, concept_type) + raise ValueError("Concept type not recognised.") def _get_identifier(self) -> dict: + """Returns metadata about the source identifier.""" raw_identifier = self.raw_concept.get("identifiers", []) # There should be exactly one source identifier for each concept assert len(raw_identifier) == 1 @@ -54,10 +71,12 @@ def _get_identifier(self) -> dict: @property def source(self) -> ConceptSource: + """Returns the concept source (one of "lc-names", "label-derived", etc.).""" identifier = self._get_identifier() source = identifier["identifierType"]["id"] if source in get_args(ConceptSource): return cast(ConceptSource, source) + raise ValueError("Concept source not recognised.") From 79dfbde4402f8cbd4074e26bbab0ffd4d617691a Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Fri, 24 Jan 2025 16:24:23 +0000 Subject: [PATCH 142/310] Add catalogue concepts transformer --- src/transformers/catalogue/__init__.py | 0 .../catalogue/concepts_transformer.py | 30 +++++++++++++++++++ src/transformers/create_transformer.py | 7 ++++- 3 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 src/transformers/catalogue/__init__.py create mode 100644 
src/transformers/catalogue/concepts_transformer.py diff --git a/src/transformers/catalogue/__init__.py b/src/transformers/catalogue/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/transformers/catalogue/concepts_transformer.py b/src/transformers/catalogue/concepts_transformer.py new file mode 100644 index 0000000000..daa133d12f --- /dev/null +++ b/src/transformers/catalogue/concepts_transformer.py @@ -0,0 +1,30 @@ +from collections.abc import Generator + +from models.graph_edge import BaseEdge +from models.graph_node import Concept +from sources.catalogue.concepts_source import CatalogueConceptsSource +from transformers.base_transformer import BaseTransformer + +from .raw_concept import RawCatalogueConcept + + +class CatalogueConceptsTransformer(BaseTransformer): + def __init__(self, url: str): + self.source = CatalogueConceptsSource(url) + + def transform_node(self, raw_node: dict) -> Concept | None: + raw_concept = RawCatalogueConcept(raw_node) + + if not raw_concept.is_concept: + return None + + return Concept( + id=raw_concept.wellcome_id, + label=raw_concept.label, + source=raw_concept.source, + type=raw_concept.type, + ) + + def extract_edges(self, raw_node: RawCatalogueConcept) -> Generator[BaseEdge]: + # TODO: Extract `HAS_SOURCE_CONCEPT` edges + yield from () diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py index d7d41ef697..e5fa3f4784 100644 --- a/src/transformers/create_transformer.py +++ b/src/transformers/create_transformer.py @@ -6,6 +6,7 @@ from .loc.names_transformer import LibraryOfCongressNamesTransformer from .mesh.concepts_transformer import MeSHConceptsTransformer from .mesh.locations_transformer import MeSHLocationsTransformer +from .catalogue.concepts_transformer import CatalogueConceptsTransformer LOC_SUBJECT_HEADINGS_URL = ( "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" @@ -14,8 +15,10 @@ MESH_URL = "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.gz" +CATALOGUE_SNAPSHOT_URL = "https://data.wellcomecollection.org/catalogue/v2/works.json.gz" + TransformerType = Literal[ - "loc_concepts", "loc_names", "loc_locations", "mesh_concepts", "mesh_locations" + "loc_concepts", "loc_names", "loc_locations", "mesh_concepts", "mesh_locations", "catalogue_concepts" ] @@ -32,5 +35,7 @@ def create_transformer(transformer_type: TransformerType) -> BaseTransformer: return MeSHConceptsTransformer(MESH_URL) if transformer_type == "mesh_locations": return MeSHLocationsTransformer(MESH_URL) + if transformer_type == "catalogue_concepts": + return CatalogueConceptsTransformer(CATALOGUE_SNAPSHOT_URL) raise ValueError(f"Unknown transformer type: {transformer_type}") From 6f64e2cf2901e80f1ead9120d232c427d692a139 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Fri, 24 Jan 2025 16:32:41 +0000 Subject: [PATCH 143/310] Add catalogue concepts to state machine --- terraform/variables.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/terraform/variables.tf b/terraform/variables.tf index c881c092d2..c945f53bd1 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -93,5 +93,10 @@ variable "state_machine_inputs" { "transformer_type" : "wikidata_linked_mesh_locations", "entity_type" : "edges" }, + { + "label" : "Catalogue Concept Nodes", + "transformer_type" : "catalogue_concepts", + "entity_type" : "nodes" + }, ] } From d3f942396bccf22bc0c20f2c90ee860a43839419 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Fri, 24 Jan 2025 
16:34:22 +0000 Subject: [PATCH 144/310] Apply auto-formatting rules --- src/models/graph_node.py | 25 +++++++++++----------- src/sources/catalogue/concepts_source.py | 1 + src/transformers/catalogue/raw_concept.py | 26 ++++++++++++----------- src/transformers/create_transformer.py | 8 ++++--- 4 files changed, 33 insertions(+), 27 deletions(-) diff --git a/src/models/graph_node.py b/src/models/graph_node.py index 14b85104f0..a58cfb5fe0 100644 --- a/src/models/graph_node.py +++ b/src/models/graph_node.py @@ -40,21 +40,22 @@ class SourceName(SourceConcept): # Catalogue concepts have a specific type and source ConceptType = Literal[ - "Person", - "Concept", - "Organisation", - "Place", - "Agent", - "Meeting", - "Genre", - "Period", - ] + "Person", + "Concept", + "Organisation", + "Place", + "Agent", + "Meeting", + "Genre", + "Period", +] ConceptSource = Literal[ - "label-derived", "nlm-mesh", "lc-subjects", "lc-names", "viaf", "fihrist" - ] + "label-derived", "nlm-mesh", "lc-subjects", "lc-names", "viaf", "fihrist" +] + # The `id` field stores a canonical Wellcome identifier class Concept(BaseNode): type: ConceptType - source: ConceptSource \ No newline at end of file + source: ConceptSource diff --git a/src/sources/catalogue/concepts_source.py b/src/sources/catalogue/concepts_source.py index ebc985e30d..538df806eb 100644 --- a/src/sources/catalogue/concepts_source.py +++ b/src/sources/catalogue/concepts_source.py @@ -5,6 +5,7 @@ CONCEPT_KEYS = ["subjects", "genres", "contributors"] + class CatalogueConceptsSource(BaseSource): def __init__(self, url: str): self.url = url diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index 96158217f2..7ba3d909e0 100644 --- a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -1,6 +1,6 @@ from typing import cast, get_args -from models.graph_node import ConceptType, ConceptSource +from models.graph_node import ConceptSource, ConceptType class RawCatalogueConcept: @@ -24,13 +24,15 @@ def _extract_concept_node(raw_concept: dict) -> dict: @property def is_concept(self) -> bool: """ - Determines whether a given block of JSON represents a Concept as returned from the Catalogue API. + Determines whether a given block of JSON represents a Concept as returned from the Catalogue API. A Concept is a block of JSON with a type property and a list of identifiers. 
""" if isinstance(self.raw_concept.get("type"), str): - if self.raw_concept["type"] in get_args(ConceptType) and self.raw_concept.get("identifiers"): + if self.raw_concept["type"] in get_args( + ConceptType + ) and self.raw_concept.get("identifiers"): return True - + return False @property @@ -40,7 +42,7 @@ def wellcome_id(self) -> str: assert isinstance(wellcome_id, str) return wellcome_id - + @property def label(self) -> str: """Returns the concept label.""" @@ -48,33 +50,33 @@ def label(self) -> str: assert isinstance(label, str) return label - + @property def type(self) -> ConceptType: """Returns the concept type (one of "Person", "Concept", "Genre", etc.).""" concept_type = self.raw_concept.get("type") - + if concept_type in get_args(ConceptType): return cast(ConceptType, concept_type) - + raise ValueError("Concept type not recognised.") - + def _get_identifier(self) -> dict: """Returns metadata about the source identifier.""" raw_identifier = self.raw_concept.get("identifiers", []) # There should be exactly one source identifier for each concept assert len(raw_identifier) == 1 identifier = raw_identifier[0] - + assert isinstance(identifier, dict) return identifier - + @property def source(self) -> ConceptSource: """Returns the concept source (one of "lc-names", "label-derived", etc.).""" identifier = self._get_identifier() - + source = identifier["identifierType"]["id"] if source in get_args(ConceptSource): return cast(ConceptSource, source) diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py index a02ab984c0..7d1c7607e7 100644 --- a/src/transformers/create_transformer.py +++ b/src/transformers/create_transformer.py @@ -1,12 +1,12 @@ from typing import Literal from .base_transformer import BaseTransformer, EntityType +from .catalogue.concepts_transformer import CatalogueConceptsTransformer from .loc.concepts_transformer import LibraryOfCongressConceptsTransformer from .loc.locations_transformer import LibraryOfCongressLocationsTransformer from .loc.names_transformer import LibraryOfCongressNamesTransformer from .mesh.concepts_transformer import MeSHConceptsTransformer from .mesh.locations_transformer import MeSHLocationsTransformer -from .catalogue.concepts_transformer import CatalogueConceptsTransformer from .wikidata.concepts_transformer import WikidataConceptsTransformer from .wikidata.locations_transformer import WikidataLocationsTransformer from .wikidata.names_transformer import WikidataNamesTransformer @@ -17,7 +17,9 @@ LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" MESH_URL = "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.gz" -CATALOGUE_SNAPSHOT_URL = "https://data.wellcomecollection.org/catalogue/v2/works.json.gz" +CATALOGUE_SNAPSHOT_URL = ( + "https://data.wellcomecollection.org/catalogue/v2/works.json.gz" +) TransformerType = Literal[ "loc_concepts", @@ -30,7 +32,7 @@ "wikidata_linked_loc_names", "wikidata_linked_mesh_concepts", "wikidata_linked_mesh_locations", - "catalogue_concepts" + "catalogue_concepts", ] From cd5b8061206a0c40d58798c26e008ae648ea6186 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Mon, 27 Jan 2025 13:57:48 +0000 Subject: [PATCH 145/310] scope ignoring warnings to DeprecationWarning only --- pytest.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index c1fa878547..4d6e9787d6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,2 @@ [pytest] -addopts = -p no:warnings \ No newline at end of file +filterwarnings 
= ignore::DeprecationWarning From 7aca4e8117a03c0b16e085f7c50b69051e8a2ef0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 27 Jan 2025 12:25:48 +0000 Subject: [PATCH 146/310] Add parent Wikidata concepts and corresponding edges --- .../linked_ontology_id_type_checker.py | 10 ++-- .../wikidata/linked_ontology_source.py | 59 +++++++++++++------ src/sources/wikidata/sparql_query_builder.py | 25 +++++--- .../wikidata/concepts_transformer.py | 30 ++++++---- src/utils/streaming.py | 7 ++- 5 files changed, 88 insertions(+), 43 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index 4aab9bc3f5..eae63ea7ce 100644 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -32,7 +32,11 @@ def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: linked_nodes_file_name = f"{self.linked_ontology}_{node_type}__nodes.csv" s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" - print(f"Retrieving {linked_nodes_file_name} from S3.") + print( + f"Retrieving ids of type '{node_type}' from ontology '{self.linked_ontology}' from S3.", + end=" ", + flush=True, + ) ids = set() transport_params = {"client": boto3.client("s3")} @@ -44,9 +48,7 @@ def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: continue ids.add(line.split(",")[0]) - print( - f"Retrieved {len(ids)} ids of type '{node_type}' from ontology '{self.linked_ontology}'." - ) + print(f"({len(ids)} ids retrieved.)") return ids diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index c52cac25d8..a44f966fdb 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -1,4 +1,4 @@ -from collections.abc import Generator +from collections.abc import Generator, Iterable from sources.base_source import BaseSource from transformers.base_transformer import EntityType @@ -20,7 +20,7 @@ def extract_wikidata_id(item: dict, key: str = "item") -> str: """ assert isinstance(item[key]["value"], str) assert item[key]["type"] == "uri" - return item[key]["value"].removeprefix(WIKIDATA_ID_PREFIX) + return str(item[key]["value"].removeprefix(WIKIDATA_ID_PREFIX)) class WikidataLinkedOntologySource(BaseSource): @@ -48,9 +48,13 @@ def __init__( self.entity_type = entity_type self.id_type_checker = LinkedOntologyIdTypeChecker(node_type, linked_ontology) - def _get_all_ids(self) -> Generator[str]: + def _get_all_ids(self) -> list[str]: """Return all Wikidata ids corresponding to Wikidata items referencing the selected linked ontology.""" - print(f"Retrieving Wikidata ids linked to {self.linked_ontology} items.") + print( + f"Retrieving Wikidata ids linked to {self.linked_ontology} items.", + end=" ", + flush=True, + ) ids_query = SparqlQueryBuilder.get_all_ids_query(self.linked_ontology) id_items = self.client.run_query(ids_query) @@ -58,8 +62,8 @@ def _get_all_ids(self) -> Generator[str]: # but that would make the query significantly slower. It's faster to deduplicate here.) 
all_ids = set(extract_wikidata_id(item) for item in id_items) - print(f"Retrieved a total of {len(all_ids)} Wikidata ids.") - yield from all_ids + print(f"({len(all_ids)} ids retrieved.)") + return list(all_ids) def _get_linked_id_mappings(self, wikidata_ids: list[str]) -> list[dict]: query = SparqlQueryBuilder.get_linked_ids_query( @@ -72,14 +76,21 @@ def _get_wikidata_items(self, wikidata_ids: list[str]) -> list: return self.client.run_query(query) def _get_parent_id_mappings(self, child_wikidata_ids: list[str]) -> list[dict]: - query = SparqlQueryBuilder.get_filtered_parents_query( - child_wikidata_ids, self.linked_ontology + subclass_of_query = SparqlQueryBuilder.get_parents_query( + child_wikidata_ids, "subclass_of" ) - return self.client.run_query(query) + subclass_of_results = self.client.run_query(subclass_of_query) + + instance_of_query = SparqlQueryBuilder.get_parents_query( + child_wikidata_ids, "instance_of" + ) + instance_of_results = self.client.run_query(instance_of_query) + + return subclass_of_results + instance_of_results @staticmethod def _parallelise_requests( - items: Generator, run_sparql_query: Callable[[list], list] + items: Iterable, run_sparql_query: Callable[[list], list] ) -> Generator: """Accept an `items` generator and a `run_sparql_query` method. Split `items` chunks and apply `run_sparql_query` to each chunk. Return a single generator of results.""" @@ -95,7 +106,14 @@ def _stream_wikidata_ids(self) -> Generator[str]: """Streams filtered edges using the `_stream_raw_edges` method and extracts Wikidata ids from them.""" seen = set() for item in self._stream_raw_edges(): - wikidata_id: str = item["wikidata_id"] + wikidata_id: str + if item["type"] == "SAME_AS": + wikidata_id = item["wikidata_id"] + elif item["type"] == "HAS_PARENT": + wikidata_id = item["parent_id"] + else: + raise ValueError(f"Unknown raw edge type {item['type']}.") + if wikidata_id not in seen: seen.add(wikidata_id) yield wikidata_id @@ -111,15 +129,19 @@ def _stream_raw_edges(self) -> Generator[dict]: 3. Filter the returned id pairs to only include Wikidata ids corresponding to the selected node type (i.e. concepts, locations, or names). """ - all_ids = self._get_all_ids() + all_linked_ids = self._get_all_ids() - # Parallelise the second query to retrieve the mappings faster. + print("Streaming linked Wikidata ids...") for raw_mapping in self._parallelise_requests( - all_ids, self._get_linked_id_mappings + all_linked_ids, self._get_linked_id_mappings ): linked_id = raw_mapping["linkedId"]["value"] wikidata_id = extract_wikidata_id(raw_mapping) - mapping = {"wikidata_id": wikidata_id, "linked_id": linked_id} + mapping = { + "wikidata_id": wikidata_id, + "linked_id": linked_id, + "type": "SAME_AS", + } # Only yield the mapping if the linked id corresponds to the selected `node_type`, as determined by the # linked ontology. 
For example, if we want to stream Wikidata 'names' edges, but we classify the referenced @@ -128,12 +150,15 @@ def _stream_raw_edges(self) -> Generator[dict]: if self.id_type_checker.id_included_in_selected_type(mapping["linked_id"]): yield mapping + print("Streaming parent Wikidata ids...") for raw_mapping in self._parallelise_requests( - all_ids, self._get_parent_id_mappings + all_linked_ids, self._get_parent_id_mappings ): + parent_id = extract_wikidata_id(raw_mapping) mapping = { "child_id": extract_wikidata_id(raw_mapping, "child"), - "parent_id": extract_wikidata_id(raw_mapping), + "parent_id": parent_id, + "type": "HAS_PARENT", } yield mapping diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index ef6fde379c..7b8e01c573 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -35,7 +35,7 @@ def _get_formatted_fields(node_type: NodeType) -> str: return " ".join(fields_with_aggregation) @staticmethod - def _get_linked_ontology_filter(linked_ontology: OntologyType): + def _get_linked_ontology_filter(linked_ontology: OntologyType) -> str: if linked_ontology == "loc": return "?item p:P244/ps:P244 ?linkedId." elif linked_ontology == "mesh": @@ -130,24 +130,33 @@ def get_linked_ids_query( return query @classmethod - def get_filtered_parents_query( - cls, item_ids: list[str], linked_ontology: OntologyType + def get_parents_query( + cls, + item_ids: list[str], + relationship_type: Literal["instance_of", "subclass_of"], ) -> str: """ - Given a list of Wikidata `item_ids`, return a query to retrieve all parents of each item in the list, - filtering out items which reference ids from `linked_ontology`. - Parents are determined based on the 'subclass of' (P279) and the 'instance of' (P31) fields. + Given a list of Wikidata `item_ids`, return a query to retrieve all parents of each item in the list. + Parents are determined based on the 'subclass of' (P279) or the 'instance of' (P31) fields. """ ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in item_ids]) + if relationship_type == "instance_of": + relationship = "?child wdt:P31 ?item." + elif relationship_type == "subclass_of": + relationship = "?child wdt:P279 ?item." + else: + raise ValueError( + f"Unknown parent/child relationship type: {relationship_type}" + ) + # We need to filter out items which link to the selected ontology as those items are added # to the graph separately. query = f""" SELECT DISTINCT ?child ?item WHERE {{ VALUES ?child {{ {ids_clause} }} - ?child wdt:P31 ?item. 
- FILTER NOT EXISTS {{ {cls._get_linked_ontology_filter(linked_ontology)} }} + {relationship} }} """ diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index 8a83b6bcf2..1f7a4360d2 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -1,6 +1,6 @@ from collections.abc import Generator -from models.graph_edge import SourceConceptSameAs +from models.graph_edge import SourceConceptSameAs, SourceConceptHasParent from models.graph_node import SourceConcept from sources.wikidata.linked_ontology_source import ( OntologyType, @@ -28,13 +28,21 @@ def transform_node(self, raw_node: dict) -> SourceConcept | None: description=raw_concept.description, ) - def extract_edges(self, raw_edge: dict) -> Generator[SourceConceptSameAs]: - linked_id, wikidata_id = raw_edge["linked_id"], raw_edge["wikidata_id"] - edge_attributes = {"source": "wikidata"} - - yield SourceConceptSameAs( - from_id=linked_id, to_id=wikidata_id, attributes=edge_attributes - ) - yield SourceConceptSameAs( - from_id=wikidata_id, to_id=linked_id, attributes=edge_attributes - ) + def extract_edges( + self, raw_edge: dict + ) -> Generator[SourceConceptSameAs | SourceConceptHasParent]: + if raw_edge["type"] == "SAME_AS": + linked_id, wikidata_id = raw_edge["linked_id"], raw_edge["wikidata_id"] + edge_attributes = {"source": "wikidata"} + yield SourceConceptSameAs( + from_id=linked_id, to_id=wikidata_id, attributes=edge_attributes + ) + yield SourceConceptSameAs( + from_id=wikidata_id, to_id=linked_id, attributes=edge_attributes + ) + elif raw_edge["type"] == "HAS_PARENT": + yield SourceConceptHasParent( + from_id=raw_edge["child_id"], to_id=raw_edge["parent_id"] + ) + else: + raise ValueError(f"Unknown edge type f{raw_edge['type']}") diff --git a/src/utils/streaming.py b/src/utils/streaming.py index a7505e9237..444bca9a56 100644 --- a/src/utils/streaming.py +++ b/src/utils/streaming.py @@ -1,5 +1,5 @@ import concurrent.futures -from collections.abc import Generator, Iterator +from collections.abc import Generator, Iterable from itertools import islice from typing import Any, Callable, TypeVar @@ -7,13 +7,14 @@ S = TypeVar("S") -def generator_to_chunks(items: Iterator[Any], chunk_size: int) -> Generator[list]: +def generator_to_chunks(items: Iterable[Any], chunk_size: int) -> Generator[list]: """ Split items in a generator into chunks of size `chunk_size` and return another generator yielding the chunks one by one. 
""" while True: chunk = list(islice(items, chunk_size)) + print(len(chunk)) if chunk: yield chunk else: @@ -21,7 +22,7 @@ def generator_to_chunks(items: Iterator[Any], chunk_size: int) -> Generator[list def process_stream_in_parallel( - stream: Iterator[T], + stream: Iterable[T], process: Callable[[list[T]], list[S]], chunk_size: int, thread_count: int, From 0c2b4ec59f00dde1ba9cd6643e9787f94f838227 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Mon, 27 Jan 2025 14:09:14 +0000 Subject: [PATCH 147/310] Apply auto-formatting rules --- src/sources/wikidata/linked_ontology_source.py | 2 +- src/transformers/wikidata/concepts_transformer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index a44f966fdb..00b82735ef 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -1,9 +1,9 @@ from collections.abc import Generator, Iterable +from typing import Callable from sources.base_source import BaseSource from transformers.base_transformer import EntityType from utils.streaming import process_stream_in_parallel -from typing import Callable from .linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker from .sparql_client import MAX_PARALLEL_SPARQL_QUERIES, WikidataSparqlClient diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index 1f7a4360d2..4704f6ff06 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -1,6 +1,6 @@ from collections.abc import Generator -from models.graph_edge import SourceConceptSameAs, SourceConceptHasParent +from models.graph_edge import SourceConceptHasParent, SourceConceptSameAs from models.graph_node import SourceConcept from sources.wikidata.linked_ontology_source import ( OntologyType, From 3860570224d32e66cd90a456b0807ab014401075 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Thu, 23 Jan 2025 14:03:24 +0000 Subject: [PATCH 148/310] update setup, add git-hook for pre-commit --- scripts/pre-push | 7 ++++ scripts/setup.sh | 92 ++++++++++++++++++++++++++++++------------------ 2 files changed, 65 insertions(+), 34 deletions(-) create mode 100755 scripts/pre-push diff --git a/scripts/pre-push b/scripts/pre-push new file mode 100755 index 0000000000..d0cfe68869 --- /dev/null +++ b/scripts/pre-push @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset + +./scripts/autoformat.sh +./scripts/typecheck.sh \ No newline at end of file diff --git a/scripts/setup.sh b/scripts/setup.sh index 6157a56b38..5d72449751 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -6,40 +6,64 @@ set -o nounset DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" ROOT+="$(dirname "$DIR")" -echo "Setting up the project ..." - -# check if pyenv is installed, otherwise exit and echo install from message -if ! command -v pyenv &> /dev/null; then - echo "pyenv is not installed. Please install it using \"brew install pyenv\"" - exit 1 -fi - -# get the version specified using pyenv local -PY_VERSION=$(pyenv local) -# check if the version is installed, otherwise install it -if ! pyenv versions | grep -q "$PY_VERSION"; then - echo "Installing Python version $PY_VERSION ..." - pyenv install "$PY_VERSION" -fi - -# specify the python version to use for pyenv -export PYENV_VERSION="$PY_VERSION" - -# check if venv exists otherwise create it -if [ ! 
-d "$ROOT/venv" ]; then - echo "Creating virtual environment ..." - python -m venv "$ROOT/venv" -fi - -# activate the virtual environment -source "$ROOT/venv/bin/activate" +setup_git_hook() { + echo "Setting up pre-push hook ..." + cp "${ROOT}/scripts/pre-push" "${ROOT}/.git/hooks/pre-push" + chmod +x "${ROOT}/.git/hooks/pre-push" +} + +check_install_pyenv() { + echo "Checking if pyenv is installed ..." + # check if pyenv is installed, otherwise exit and echo install from message + if ! command -v pyenv &> /dev/null; then + echo "pyenv is not installed. Please install it using \"brew install pyenv\"" + exit 1 + fi +} + +install_python() { + echo "Installing Python ..." + # get the version specified using pyenv local + PY_VERSION=$(pyenv local) + # check if the version is installed, otherwise install it + if ! pyenv versions | grep -q "$PY_VERSION"; then + echo "Installing Python version $PY_VERSION ..." + pyenv install "$PY_VERSION" + else + echo "Python version $PY_VERSION is already installed." + fi + + echo $PY_VERSION +} + +setup_venv() { + echo "Setting up virtual environment ..." + PY_VERSION=$(pyenv local) + # check if venv exists otherwise create it + if [ ! -d "$ROOT/venv" ]; then + echo "Creating virtual environment ..." + python -m venv "$ROOT/venv" + fi +} -# install the requirements -pip install -r "$ROOT/src/requirements.txt" -pip install -r "$ROOT/src/dev_requirements.txt" +install_python_deps() { + echo "Installing Python dependencies ..." + + # install the requirements + pip install -r "$ROOT/src/requirements.txt" + pip install -r "$ROOT/src/dev_requirements.txt" + + # install pip-tools + pip install pip-tools +} + +setup_git_hook +check_install_pyenv + +export PYENV_VERSION=$(install_python) +source "$ROOT/venv/bin/activate" -# install pip-tools -pip install pip-tools +install_python_deps -# echo the setup is complete, and to run "source ./venv/bin/activate" -echo "Setup complete, please run \"source ./venv/bin/activate\" to activate the virtual environment." +echo "Setup complete :)" +echo "Run \"source ./venv/bin/activate\" to activate the virtual environment." 
From 96938b1e43567408503b81855c0229f5ea0e216b Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Thu, 23 Jan 2025 14:04:20 +0000 Subject: [PATCH 149/310] test hook --- src/bulk_loader.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/bulk_loader.py b/src/bulk_loader.py index e032fea535..61336ada08 100644 --- a/src/bulk_loader.py +++ b/src/bulk_loader.py @@ -1,14 +1,10 @@ import argparse import os import typing - from transformers.base_transformer import EntityType from transformers.create_transformer import TransformerType from utils.aws import get_neptune_client - S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] - - def handler( transformer_type: TransformerType, entity_type: EntityType, is_local: bool = False ) -> dict[str, str]: From 9ba721504e9491b5ef8f6e79168f880d8ec457de Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Thu, 23 Jan 2025 14:14:34 +0000 Subject: [PATCH 150/310] implement check to ensure failure if not formatted --- .github/workflows/autoformat-typecheck.yml | 2 +- scripts/autoformat.sh | 13 +++++++++++-- scripts/pre-push | 2 +- scripts/typecheck.sh | 1 + src/bulk_loader.py | 4 ++++ 5 files changed, 18 insertions(+), 4 deletions(-) diff --git a/.github/workflows/autoformat-typecheck.yml b/.github/workflows/autoformat-typecheck.yml index f297804002..2e5e738923 100644 --- a/.github/workflows/autoformat-typecheck.yml +++ b/.github/workflows/autoformat-typecheck.yml @@ -23,7 +23,7 @@ jobs: - name: Run typecheck run: | - ./scripts/typecheck.sh + ./scripts/typecheck.sh --check autoformat: runs-on: ubuntu-latest diff --git a/scripts/autoformat.sh b/scripts/autoformat.sh index bfaf2b4886..b022ef4032 100755 --- a/scripts/autoformat.sh +++ b/scripts/autoformat.sh @@ -9,5 +9,14 @@ ROOT+="$(dirname "$DIR")" # change working directory to the root of the project cd "$ROOT" -black src/ -isort --profile=black src/ \ No newline at end of file +CHECK=${1:-} + +if [ "$CHECK" == "--check" ]; then + echo "Checking code formatting (run ./scripts/autoformat.sh to fix any issues!)..." + black --check src/ + isort --profile=black --check src/ +else + echo "Formatting code ..." + black src/ + isort --profile=black src/ +fi diff --git a/scripts/pre-push b/scripts/pre-push index d0cfe68869..e806c87bdc 100755 --- a/scripts/pre-push +++ b/scripts/pre-push @@ -3,5 +3,5 @@ set -o errexit set -o nounset -./scripts/autoformat.sh +./scripts/autoformat.sh --check ./scripts/typecheck.sh \ No newline at end of file diff --git a/scripts/typecheck.sh b/scripts/typecheck.sh index ae9dddca7a..92f27f6f2b 100755 --- a/scripts/typecheck.sh +++ b/scripts/typecheck.sh @@ -9,4 +9,5 @@ ROOT+="$(dirname "$DIR")" # change working directory to the root of the project cd "$ROOT" +echo "Type checking code ..." 
mypy --config-file src/mypy.ini src/ \ No newline at end of file diff --git a/src/bulk_loader.py b/src/bulk_loader.py index 61336ada08..e032fea535 100644 --- a/src/bulk_loader.py +++ b/src/bulk_loader.py @@ -1,10 +1,14 @@ import argparse import os import typing + from transformers.base_transformer import EntityType from transformers.create_transformer import TransformerType from utils.aws import get_neptune_client + S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] + + def handler( transformer_type: TransformerType, entity_type: EntityType, is_local: bool = False ) -> dict[str, str]: From ce9a37f541def9cc7767ad4560cf03fa70d20bca Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Mon, 27 Jan 2025 14:11:12 +0000 Subject: [PATCH 151/310] Move --check param to autoformat where it is used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Štěpán Brychta --- .github/workflows/autoformat-typecheck.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/autoformat-typecheck.yml b/.github/workflows/autoformat-typecheck.yml index 2e5e738923..d35166e7c8 100644 --- a/.github/workflows/autoformat-typecheck.yml +++ b/.github/workflows/autoformat-typecheck.yml @@ -23,7 +23,7 @@ jobs: - name: Run typecheck run: | - ./scripts/typecheck.sh --check + ./scripts/typecheck.sh autoformat: runs-on: ubuntu-latest @@ -41,7 +41,7 @@ jobs: - name: Run autoformat run: | - ./scripts/autoformat.sh + ./scripts/autoformat.sh --check - name: Check for formatting changes id: check_formatting_changes From e62b3f10743be36af3c83dd441e1cd32b09b987b Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Mon, 27 Jan 2025 15:08:17 +0000 Subject: [PATCH 152/310] Remove extra check in CI, we want to commit changes. Co-Authored-By: agnesgaroux <135110571+agnesgaroux@users.noreply.github.com> --- .github/workflows/autoformat-typecheck.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/autoformat-typecheck.yml b/.github/workflows/autoformat-typecheck.yml index d35166e7c8..f297804002 100644 --- a/.github/workflows/autoformat-typecheck.yml +++ b/.github/workflows/autoformat-typecheck.yml @@ -41,7 +41,7 @@ jobs: - name: Run autoformat run: | - ./scripts/autoformat.sh --check + ./scripts/autoformat.sh - name: Check for formatting changes id: check_formatting_changes From 152eb84a86ef370ca9c3a800e429f4cb3ff3db13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 27 Jan 2025 14:40:20 +0000 Subject: [PATCH 153/310] Iterator bug fix --- src/sources/wikidata/linked_ontology_source.py | 8 ++++---- src/utils/streaming.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 00b82735ef..866920d329 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -1,4 +1,4 @@ -from collections.abc import Generator, Iterable +from collections.abc import Generator, Iterator from typing import Callable from sources.base_source import BaseSource @@ -90,7 +90,7 @@ def _get_parent_id_mappings(self, child_wikidata_ids: list[str]) -> list[dict]: @staticmethod def _parallelise_requests( - items: Iterable, run_sparql_query: Callable[[list], list] + items: Iterator, run_sparql_query: Callable[[list], list] ) -> Generator: """Accept an `items` generator and a `run_sparql_query` method. 
Split `items` chunks and apply `run_sparql_query` to each chunk. Return a single generator of results.""" @@ -133,7 +133,7 @@ def _stream_raw_edges(self) -> Generator[dict]: print("Streaming linked Wikidata ids...") for raw_mapping in self._parallelise_requests( - all_linked_ids, self._get_linked_id_mappings + iter(all_linked_ids), self._get_linked_id_mappings ): linked_id = raw_mapping["linkedId"]["value"] wikidata_id = extract_wikidata_id(raw_mapping) @@ -152,7 +152,7 @@ def _stream_raw_edges(self) -> Generator[dict]: print("Streaming parent Wikidata ids...") for raw_mapping in self._parallelise_requests( - all_linked_ids, self._get_parent_id_mappings + iter(all_linked_ids), self._get_parent_id_mappings ): parent_id = extract_wikidata_id(raw_mapping) mapping = { diff --git a/src/utils/streaming.py b/src/utils/streaming.py index da1d1e141b..a7505e9237 100644 --- a/src/utils/streaming.py +++ b/src/utils/streaming.py @@ -1,5 +1,5 @@ import concurrent.futures -from collections.abc import Generator, Iterable +from collections.abc import Generator, Iterator from itertools import islice from typing import Any, Callable, TypeVar @@ -7,7 +7,7 @@ S = TypeVar("S") -def generator_to_chunks(items: Iterable[Any], chunk_size: int) -> Generator[list]: +def generator_to_chunks(items: Iterator[Any], chunk_size: int) -> Generator[list]: """ Split items in a generator into chunks of size `chunk_size` and return another generator yielding the chunks one by one. @@ -21,7 +21,7 @@ def generator_to_chunks(items: Iterable[Any], chunk_size: int) -> Generator[list def process_stream_in_parallel( - stream: Iterable[T], + stream: Iterator[T], process: Callable[[list[T]], list[S]], chunk_size: int, thread_count: int, From 6e1b08e6a25d5dbeb7bbbcbe87903cbaef53050e Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Mon, 27 Jan 2025 15:13:08 +0000 Subject: [PATCH 154/310] version lambdas and used $LATEST in state machines --- terraform/lambda_bulk_load_poller.tf | 1 + terraform/lambda_bulk_loader.tf | 1 + terraform/lambda_extractor.tf | 3 ++- terraform/lambda_indexer.tf | 1 + terraform/state_machine_bulk_loader.tf | 8 ++++++-- terraform/state_machine_extractors.tf | 5 ++++- 6 files changed, 15 insertions(+), 4 deletions(-) diff --git a/terraform/lambda_bulk_load_poller.tf b/terraform/lambda_bulk_load_poller.tf index 57690876c3..e1159f621b 100644 --- a/terraform/lambda_bulk_load_poller.tf +++ b/terraform/lambda_bulk_load_poller.tf @@ -4,6 +4,7 @@ module "bulk_load_poller_lambda" { name = "catalogue-graph-bulk-load-poller" description = "Polls the status of a Neptune bulk load job." runtime = "python3.13" + publish = true filename = "../build.zip" source_code_hash = filesha256("../build.zip") diff --git a/terraform/lambda_bulk_loader.tf b/terraform/lambda_bulk_loader.tf index 59a92cd281..49ec4d76bc 100644 --- a/terraform/lambda_bulk_loader.tf +++ b/terraform/lambda_bulk_loader.tf @@ -4,6 +4,7 @@ module "bulk_loader_lambda" { name = "catalogue-graph-bulk-loader" description = "Bulk loads entities from an S3 bucket into the Neptune database." runtime = "python3.13" + publish = true filename = "../build.zip" source_code_hash = filesha256("../build.zip") diff --git a/terraform/lambda_extractor.tf b/terraform/lambda_extractor.tf index 69a818bc57..170d1484a7 100644 --- a/terraform/lambda_extractor.tf +++ b/terraform/lambda_extractor.tf @@ -4,7 +4,8 @@ module "extractor_lambda" { name = "catalogue-graph-extractor" description = "Extracts source concepts and turns them into Cypher queries." 
runtime = "python3.13" - + publish = true + filename = "../build.zip" source_code_hash = filesha256("../build.zip") diff --git a/terraform/lambda_indexer.tf b/terraform/lambda_indexer.tf index 0beb6e48a0..9089dfd5c7 100644 --- a/terraform/lambda_indexer.tf +++ b/terraform/lambda_indexer.tf @@ -4,6 +4,7 @@ module "indexer_lambda" { name = "catalogue-graph-indexer" description = "Indexes nodes and edges into the Neptune catalogue graph cluster." runtime = "python3.13" + publish = true filename = "../build.zip" source_code_hash = filesha256("../build.zip") diff --git a/terraform/state_machine_bulk_loader.tf b/terraform/state_machine_bulk_loader.tf index f560c460ce..51742f9121 100644 --- a/terraform/state_machine_bulk_loader.tf +++ b/terraform/state_machine_bulk_loader.tf @@ -1,3 +1,7 @@ +locals { + bulk_loader_lambda = "${module.bulk_loader_lambda.lambda.arn}:${module.bulk_loader_lambda.lambda.version}" + bulk_load_poller_lambda = "${module.bulk_load_poller_lambda.lambda.arn}:${module.bulk_load_poller_lambda.lambda.version}" +} resource "aws_sfn_state_machine" "catalogue_graph_bulk_loader" { name = "catalogue-graph-bulk-loader" role_arn = aws_iam_role.state_machine_execution_role.arn @@ -12,7 +16,7 @@ resource "aws_sfn_state_machine" "catalogue_graph_bulk_loader" { "Resource" : "arn:aws:states:::lambda:invoke", "Output" : "{% $states.result.Payload %}", "Arguments" : { - "FunctionName" : module.bulk_loader_lambda.lambda.arn, + "FunctionName" : local.bulk_loader_lambda, "Payload" : "{% $states.input %}" }, "Next" : "Wait 30 seconds" @@ -27,7 +31,7 @@ resource "aws_sfn_state_machine" "catalogue_graph_bulk_loader" { "Resource" : "arn:aws:states:::lambda:invoke", "Output" : "{% $states.result.Payload %}", "Arguments" : { - "FunctionName" : module.bulk_load_poller_lambda.lambda.arn, + "FunctionName" : local.bulk_load_poller_lambda, "Payload" : "{% $states.input %}" }, "Next" : "Load complete?" 
diff --git a/terraform/state_machine_extractors.tf b/terraform/state_machine_extractors.tf index e8aee4c50e..725749d90a 100644 --- a/terraform/state_machine_extractors.tf +++ b/terraform/state_machine_extractors.tf @@ -1,3 +1,6 @@ +locals { + extractor_lambda = "${module.extractor_lambda.lambda.arn}:${module.extractor_lambda.lambda.version}" +} resource "aws_sfn_state_machine" "catalogue_graph_extractors" { name = "catalogue-graph-extractors" role_arn = aws_iam_role.state_machine_execution_role.arn @@ -14,7 +17,7 @@ resource "aws_sfn_state_machine" "catalogue_graph_extractors" { States = { "Extract ${task_input.label}" = { Type = "Task" - Resource = module.extractor_lambda.lambda.arn + Resource = local.extractor_lambda Parameters = { "transformer_type" = task_input.transformer_type, "entity_type" = task_input.entity_type, From 9fc7c7bcfa687e8c9fe234133fef8c5c58d17f99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 27 Jan 2025 15:23:00 +0000 Subject: [PATCH 155/310] Handle invalid Wikidata ids --- .../wikidata/linked_ontology_source.py | 52 ++++++++++++------- src/transformers/wikidata/raw_concept.py | 4 +- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 866920d329..ea6b1cce82 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -14,13 +14,23 @@ WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" -def extract_wikidata_id(item: dict, key: str = "item") -> str: +def extract_wikidata_id(item: dict, key: str = "item") -> str | None: """ Accepts a raw `item` dictionary returned by the Wikidata SPARQL endpoint and returns the Wikidata id of the item. + Returns `None` if the stored id is not valid. """ - assert isinstance(item[key]["value"], str) + wikidata_id = item[key]["value"] + assert isinstance(wikidata_id, str) assert item[key]["type"] == "uri" - return str(item[key]["value"].removeprefix(WIKIDATA_ID_PREFIX)) + + if wikidata_id.startswith(WIKIDATA_ID_PREFIX): + return wikidata_id.removeprefix(WIKIDATA_ID_PREFIX) + + # Very rarely, Wikidata returns an invalid ID in the format + # http://www.wikidata.org/.well-known/genid/. + # Log when this happens and return 'None'. + print(f"Encountered an invalid Wikidata id: {wikidata_id}") + return None class WikidataLinkedOntologySource(BaseSource): @@ -61,9 +71,10 @@ def _get_all_ids(self) -> list[str]: # Deduplicate. (We could deduplicate as part of the SPARQL query via the 'DISTINCT' keyword, # but that would make the query significantly slower. It's faster to deduplicate here.) all_ids = set(extract_wikidata_id(item) for item in id_items) + all_valid_ids = [i for i in all_ids if i is not None] - print(f"({len(all_ids)} ids retrieved.)") - return list(all_ids) + print(f"({len(all_valid_ids)} ids retrieved.)") + return list(all_valid_ids) def _get_linked_id_mappings(self, wikidata_ids: list[str]) -> list[dict]: query = SparqlQueryBuilder.get_linked_ids_query( @@ -130,6 +141,7 @@ def _stream_raw_edges(self) -> Generator[dict]: (i.e. concepts, locations, or names). 
""" all_linked_ids = self._get_all_ids() + selected_type_ids = set() print("Streaming linked Wikidata ids...") for raw_mapping in self._parallelise_requests( @@ -137,31 +149,33 @@ def _stream_raw_edges(self) -> Generator[dict]: ): linked_id = raw_mapping["linkedId"]["value"] wikidata_id = extract_wikidata_id(raw_mapping) - mapping = { - "wikidata_id": wikidata_id, - "linked_id": linked_id, - "type": "SAME_AS", - } # Only yield the mapping if the linked id corresponds to the selected `node_type`, as determined by the # linked ontology. For example, if we want to stream Wikidata 'names' edges, but we classify the referenced # LoC id is a 'locations' id, we skip it. # This also removes mappings which include invalid LoC ids (of which there are several thousand). - if self.id_type_checker.id_included_in_selected_type(mapping["linked_id"]): - yield mapping + if self.id_type_checker.id_included_in_selected_type(linked_id): + selected_type_ids.add(wikidata_id) + yield { + "wikidata_id": wikidata_id, + "linked_id": linked_id, + "type": "SAME_AS", + } print("Streaming parent Wikidata ids...") for raw_mapping in self._parallelise_requests( iter(all_linked_ids), self._get_parent_id_mappings ): parent_id = extract_wikidata_id(raw_mapping) - mapping = { - "child_id": extract_wikidata_id(raw_mapping, "child"), - "parent_id": parent_id, - "type": "HAS_PARENT", - } - - yield mapping + child_id = extract_wikidata_id(raw_mapping, "child") + + if parent_id is not None and child_id is not None: + if child_id in selected_type_ids: + yield { + "child_id": child_id, + "parent_id": parent_id, + "type": "HAS_PARENT", + } def _stream_raw_nodes(self) -> Generator[dict]: """ diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index 4e57e7982f..6621a1996b 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -28,7 +28,9 @@ def _extract_english_field_value(self, field_name: str) -> str: @property def source_id(self) -> str: - return extract_wikidata_id(self.raw_concept) + wikidata_id = extract_wikidata_id(self.raw_concept) + assert wikidata_id is not None + return wikidata_id @property def label(self) -> str: From c8f870d10111aff3d66917ffa40d09a9f0df7568 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 28 Jan 2025 13:06:00 +0000 Subject: [PATCH 156/310] Extract backoff parameters into constants --- src/clients/base_neptune_client.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 7c57efd9c6..058e7da2a6 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -8,6 +8,9 @@ from botocore.auth import SigV4Auth from botocore.awsrequest import AWSRequest +NEPTUNE_BACKOFF_DEFAULT_RETRIES = 3 +NEPTUNE_BACKOFF_DEFAULT_INTERVAL = 10 + def on_request_backoff(backoff_details: typing.Any) -> None: exception_name = type(backoff_details["exception"]).__name__ @@ -32,8 +35,8 @@ def _get_client_url(self) -> str: @backoff.on_exception( backoff.constant, Exception, - max_tries=3, - interval=10, + max_tries=NEPTUNE_BACKOFF_DEFAULT_RETRIES, + interval=NEPTUNE_BACKOFF_DEFAULT_INTERVAL, on_backoff=on_request_backoff, ) def _make_request( From b943d44a15c9bb47137621895934162cca0bbb39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 28 Jan 2025 13:22:39 +0000 Subject: [PATCH 157/310] Add comments --- 
src/sources/wikidata/linked_ontology_source.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index ea6b1cce82..5c5a8f0176 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -87,11 +87,18 @@ def _get_wikidata_items(self, wikidata_ids: list[str]) -> list: return self.client.run_query(query) def _get_parent_id_mappings(self, child_wikidata_ids: list[str]) -> list[dict]: + """ + Given a list of child wikidata ids, checks for all parents of each item in the list and returns a list + of mappings between child and parent ids. + """ + + # Get all parent ids referenced via the Wikidata 'subclass of' field subclass_of_query = SparqlQueryBuilder.get_parents_query( child_wikidata_ids, "subclass_of" ) subclass_of_results = self.client.run_query(subclass_of_query) + # Get all parent ids referenced via the Wikidata 'instance of' field instance_of_query = SparqlQueryBuilder.get_parents_query( child_wikidata_ids, "instance_of" ) From ba14682214f4964978f29d04644e8e82e805869c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 28 Jan 2025 15:14:33 +0000 Subject: [PATCH 158/310] Wikidata parent items bug fix --- .../linked_ontology_id_type_checker.py | 10 ++ .../wikidata/linked_ontology_source.py | 115 +++++++++++------- 2 files changed, 81 insertions(+), 44 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index eae63ea7ce..6a0c7fb379 100644 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -58,3 +58,13 @@ def id_included_in_selected_type(self, linked_id: str) -> bool: locations, or names). """ return linked_id in self._get_linked_ontology_ids(self.node_type) + + def id_is_valid(self, linked_id: str) -> bool: + is_valid = False + is_valid |= linked_id in self._get_linked_ontology_ids("concepts") + is_valid |= linked_id in self._get_linked_ontology_ids("locations") + + if self.linked_ontology == "loc": + is_valid |= linked_id in self._get_linked_ontology_ids("names") + + return is_valid diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 5c5a8f0176..9a60a9d943 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -4,6 +4,7 @@ from sources.base_source import BaseSource from transformers.base_transformer import EntityType from utils.streaming import process_stream_in_parallel +from functools import lru_cache from .linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker from .sparql_client import MAX_PARALLEL_SPARQL_QUERIES, WikidataSparqlClient @@ -58,8 +59,12 @@ def __init__( self.entity_type = entity_type self.id_type_checker = LinkedOntologyIdTypeChecker(node_type, linked_ontology) + @lru_cache def _get_all_ids(self) -> list[str]: - """Return all Wikidata ids corresponding to Wikidata items referencing the selected linked ontology.""" + """ + Return all Wikidata ids corresponding to Wikidata items referencing the selected linked ontology. + All ids are returned, no matter whether we categorise them as concepts, names, or locations. 
+ """ print( f"Retrieving Wikidata ids linked to {self.linked_ontology} items.", end=" ", @@ -91,7 +96,6 @@ def _get_parent_id_mappings(self, child_wikidata_ids: list[str]) -> list[dict]: Given a list of child wikidata ids, checks for all parents of each item in the list and returns a list of mappings between child and parent ids. """ - # Get all parent ids referenced via the Wikidata 'subclass of' field subclass_of_query = SparqlQueryBuilder.get_parents_query( child_wikidata_ids, "subclass_of" @@ -120,56 +124,58 @@ def _parallelise_requests( ): yield raw_response_item - def _stream_wikidata_ids(self) -> Generator[str]: - """Streams filtered edges using the `_stream_raw_edges` method and extracts Wikidata ids from them.""" + def _stream_filtered_wikidata_ids(self) -> Generator[str]: + """Streams all wikidata ids to be processed as nodes given the selected `node_type`.""" seen = set() - for item in self._stream_raw_edges(): - wikidata_id: str - if item["type"] == "SAME_AS": - wikidata_id = item["wikidata_id"] - elif item["type"] == "HAS_PARENT": - wikidata_id = item["parent_id"] - else: - raise ValueError(f"Unknown raw edge type {item['type']}.") - if wikidata_id not in seen: + # Stream all SAME_AS edges and extract Wikidata ids from them, making sure to deduplicate + # (a given Wikidata id can appear in more than one edge). + for item in self._stream_all_same_as_edges(): + wikidata_id = item["wikidata_id"] + linked_id = item["linked_id"] + if self.id_type_checker.id_is_valid(linked_id) and wikidata_id not in seen: seen.add(wikidata_id) - yield wikidata_id - def _stream_raw_edges(self) -> Generator[dict]: + if self.id_type_checker.id_included_in_selected_type(linked_id): + yield wikidata_id + + # Stream HAS_PARENT edges and extract Wikidata ids of all parents (children are streamed above). Filter out + # all parent ids which reference a linked ontology ids. All remaining ids belong to items which do not + # reference a MeSH/LoC id. We categorise all of them as _concepts_, no matter whether the children are + # categorised as concepts, names, or locations. + if self.node_type == "concepts": + for item in self._stream_all_has_parent_edges(): + wikidata_id = item["parent_id"] + if wikidata_id not in seen: + seen.add(wikidata_id) + yield wikidata_id + + def _stream_all_same_as_edges(self) -> Generator[dict]: """ - Extract edges via the following steps: + Stream raw 'SAME_AS' edges, mapping Wikidata ids to ids from the selected linked ontology. + + Edges are extracted via the following steps: 1. Run a SPARQL query which retrieves _all_ Wikidata items referencing an id from the linked ontology. 2. Split the returned ids into chunks. For each chunk, run a second SPARQL query to retrieve a mapping between Wikidata ids and ids from the linked ontology. (It is possible to modify the query in step 1 to return all the mappings at once, but this makes the query unreliable - sometimes it times out or returns invalid JSON. Getting the mappings in chunks is much slower, but it works every time.) - 3. Filter the returned id pairs to only include Wikidata ids corresponding to the selected node type - (i.e. concepts, locations, or names). 
""" all_linked_ids = self._get_all_ids() - selected_type_ids = set() - - print("Streaming linked Wikidata ids...") for raw_mapping in self._parallelise_requests( iter(all_linked_ids), self._get_linked_id_mappings ): - linked_id = raw_mapping["linkedId"]["value"] - wikidata_id = extract_wikidata_id(raw_mapping) - - # Only yield the mapping if the linked id corresponds to the selected `node_type`, as determined by the - # linked ontology. For example, if we want to stream Wikidata 'names' edges, but we classify the referenced - # LoC id is a 'locations' id, we skip it. - # This also removes mappings which include invalid LoC ids (of which there are several thousand). - if self.id_type_checker.id_included_in_selected_type(linked_id): - selected_type_ids.add(wikidata_id) - yield { - "wikidata_id": wikidata_id, - "linked_id": linked_id, - "type": "SAME_AS", - } + yield { + "wikidata_id": extract_wikidata_id(raw_mapping), + "linked_id": raw_mapping["linkedId"]["value"], + "type": "SAME_AS", + } - print("Streaming parent Wikidata ids...") + def _stream_all_has_parent_edges(self) -> Generator[dict]: + """ + Stream raw 'HAS_PARENT' Wikidata edges, mapping child items to parent items. + """ + all_linked_ids = self._get_all_ids() for raw_mapping in self._parallelise_requests( iter(all_linked_ids), self._get_parent_id_mappings ): @@ -177,21 +183,42 @@ def _stream_raw_edges(self) -> Generator[dict]: child_id = extract_wikidata_id(raw_mapping, "child") if parent_id is not None and child_id is not None: - if child_id in selected_type_ids: - yield { - "child_id": child_id, - "parent_id": parent_id, - "type": "HAS_PARENT", - } + yield { + "child_id": child_id, + "parent_id": parent_id, + "type": "HAS_PARENT", + } + + def _stream_raw_edges(self) -> Generator[dict]: + """ + Stream SAME_AS edges followed by HAS_PARENT edges for the selected `linked_ontology` and `node_type`. + """ + print("Streaming SAME_AS edges...") + streamed_wikidata_ids = set() + for edge in self._stream_all_same_as_edges(): + # Filter for mappings which are part of the selected `node_type`, as determined by the linked ontology. + # For example, if we are streaming Wikidata 'names' edges linked to LoC ids but the LoC id linked to some + # Wikidata id is classified as a 'location', we skip it. This filtering process also removes mappings which + # include invalid LoC ids (of which there are several thousand). + if self.id_type_checker.id_included_in_selected_type(edge["linked_id"]): + streamed_wikidata_ids.add(edge["wikidata_id"]) + yield edge + + print("Streaming HAS_PARENT edges...") + for edge in self._stream_all_has_parent_edges(): + # Only include an edge if its `child_id` was already streamed in a SAME_AS edge, indicating that + # the child item belongs under the selected `node_type`. + if edge["child_id"] in streamed_wikidata_ids: + yield edge def _stream_raw_nodes(self) -> Generator[dict]: """ Extract nodes via the following steps: - 1. Stream edges via the `_stream_raw_edges` method and extract Wikidata ids from the streamed edges. + 1. Stream raw edges and extract Wikidata ids from them. 2. Split the extracted ids into chunks. For each chunk, run a SPARQL query to retrieve all the corresponding Wikidata fields required to create a node. 
""" - all_ids = self._stream_wikidata_ids() + all_ids = self._stream_filtered_wikidata_ids() yield from self._parallelise_requests(all_ids, self._get_wikidata_items) def stream_raw(self) -> Generator[dict]: From 0ca1959d89022f31a9fbabd94a042be3efec02bc Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Tue, 28 Jan 2025 16:49:27 +0000 Subject: [PATCH 159/310] Apply auto-formatting rules --- src/sources/wikidata/linked_ontology_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 9a60a9d943..caa3f51a05 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -1,10 +1,10 @@ from collections.abc import Generator, Iterator +from functools import lru_cache from typing import Callable from sources.base_source import BaseSource from transformers.base_transformer import EntityType from utils.streaming import process_stream_in_parallel -from functools import lru_cache from .linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker from .sparql_client import MAX_PARALLEL_SPARQL_QUERIES, WikidataSparqlClient From 35e289a0075173f755f10e9247a902425142843b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 28 Jan 2025 15:14:33 +0000 Subject: [PATCH 160/310] Wikidata parent items bug fix --- .../linked_ontology_id_type_checker.py | 11 ++ .../wikidata/linked_ontology_source.py | 115 +++++++++++------- 2 files changed, 82 insertions(+), 44 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index eae63ea7ce..3da0099cca 100644 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -58,3 +58,14 @@ def id_included_in_selected_type(self, linked_id: str) -> bool: locations, or names). """ return linked_id in self._get_linked_ontology_ids(self.node_type) + + def id_is_valid(self, linked_id: str) -> bool: + """Returns 'True' if the given id from the selected linked ontology is valid.""" + is_valid = False + is_valid |= linked_id in self._get_linked_ontology_ids("concepts") + is_valid |= linked_id in self._get_linked_ontology_ids("locations") + + if self.linked_ontology == "loc": + is_valid |= linked_id in self._get_linked_ontology_ids("names") + + return is_valid diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 5c5a8f0176..9a60a9d943 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -4,6 +4,7 @@ from sources.base_source import BaseSource from transformers.base_transformer import EntityType from utils.streaming import process_stream_in_parallel +from functools import lru_cache from .linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker from .sparql_client import MAX_PARALLEL_SPARQL_QUERIES, WikidataSparqlClient @@ -58,8 +59,12 @@ def __init__( self.entity_type = entity_type self.id_type_checker = LinkedOntologyIdTypeChecker(node_type, linked_ontology) + @lru_cache def _get_all_ids(self) -> list[str]: - """Return all Wikidata ids corresponding to Wikidata items referencing the selected linked ontology.""" + """ + Return all Wikidata ids corresponding to Wikidata items referencing the selected linked ontology. 
+ All ids are returned, no matter whether we categorise them as concepts, names, or locations. + """ print( f"Retrieving Wikidata ids linked to {self.linked_ontology} items.", end=" ", @@ -91,7 +96,6 @@ def _get_parent_id_mappings(self, child_wikidata_ids: list[str]) -> list[dict]: Given a list of child wikidata ids, checks for all parents of each item in the list and returns a list of mappings between child and parent ids. """ - # Get all parent ids referenced via the Wikidata 'subclass of' field subclass_of_query = SparqlQueryBuilder.get_parents_query( child_wikidata_ids, "subclass_of" @@ -120,56 +124,58 @@ def _parallelise_requests( ): yield raw_response_item - def _stream_wikidata_ids(self) -> Generator[str]: - """Streams filtered edges using the `_stream_raw_edges` method and extracts Wikidata ids from them.""" + def _stream_filtered_wikidata_ids(self) -> Generator[str]: + """Streams all wikidata ids to be processed as nodes given the selected `node_type`.""" seen = set() - for item in self._stream_raw_edges(): - wikidata_id: str - if item["type"] == "SAME_AS": - wikidata_id = item["wikidata_id"] - elif item["type"] == "HAS_PARENT": - wikidata_id = item["parent_id"] - else: - raise ValueError(f"Unknown raw edge type {item['type']}.") - if wikidata_id not in seen: + # Stream all SAME_AS edges and extract Wikidata ids from them, making sure to deduplicate + # (a given Wikidata id can appear in more than one edge). + for item in self._stream_all_same_as_edges(): + wikidata_id = item["wikidata_id"] + linked_id = item["linked_id"] + if self.id_type_checker.id_is_valid(linked_id) and wikidata_id not in seen: seen.add(wikidata_id) - yield wikidata_id - def _stream_raw_edges(self) -> Generator[dict]: + if self.id_type_checker.id_included_in_selected_type(linked_id): + yield wikidata_id + + # Stream HAS_PARENT edges and extract Wikidata ids of all parents (children are streamed above). Filter out + # all parent ids which reference a linked ontology ids. All remaining ids belong to items which do not + # reference a MeSH/LoC id. We categorise all of them as _concepts_, no matter whether the children are + # categorised as concepts, names, or locations. + if self.node_type == "concepts": + for item in self._stream_all_has_parent_edges(): + wikidata_id = item["parent_id"] + if wikidata_id not in seen: + seen.add(wikidata_id) + yield wikidata_id + + def _stream_all_same_as_edges(self) -> Generator[dict]: """ - Extract edges via the following steps: + Stream raw 'SAME_AS' edges, mapping Wikidata ids to ids from the selected linked ontology. + + Edges are extracted via the following steps: 1. Run a SPARQL query which retrieves _all_ Wikidata items referencing an id from the linked ontology. 2. Split the returned ids into chunks. For each chunk, run a second SPARQL query to retrieve a mapping between Wikidata ids and ids from the linked ontology. (It is possible to modify the query in step 1 to return all the mappings at once, but this makes the query unreliable - sometimes it times out or returns invalid JSON. Getting the mappings in chunks is much slower, but it works every time.) - 3. Filter the returned id pairs to only include Wikidata ids corresponding to the selected node type - (i.e. concepts, locations, or names). 
""" all_linked_ids = self._get_all_ids() - selected_type_ids = set() - - print("Streaming linked Wikidata ids...") for raw_mapping in self._parallelise_requests( iter(all_linked_ids), self._get_linked_id_mappings ): - linked_id = raw_mapping["linkedId"]["value"] - wikidata_id = extract_wikidata_id(raw_mapping) - - # Only yield the mapping if the linked id corresponds to the selected `node_type`, as determined by the - # linked ontology. For example, if we want to stream Wikidata 'names' edges, but we classify the referenced - # LoC id is a 'locations' id, we skip it. - # This also removes mappings which include invalid LoC ids (of which there are several thousand). - if self.id_type_checker.id_included_in_selected_type(linked_id): - selected_type_ids.add(wikidata_id) - yield { - "wikidata_id": wikidata_id, - "linked_id": linked_id, - "type": "SAME_AS", - } + yield { + "wikidata_id": extract_wikidata_id(raw_mapping), + "linked_id": raw_mapping["linkedId"]["value"], + "type": "SAME_AS", + } - print("Streaming parent Wikidata ids...") + def _stream_all_has_parent_edges(self) -> Generator[dict]: + """ + Stream raw 'HAS_PARENT' Wikidata edges, mapping child items to parent items. + """ + all_linked_ids = self._get_all_ids() for raw_mapping in self._parallelise_requests( iter(all_linked_ids), self._get_parent_id_mappings ): @@ -177,21 +183,42 @@ def _stream_raw_edges(self) -> Generator[dict]: child_id = extract_wikidata_id(raw_mapping, "child") if parent_id is not None and child_id is not None: - if child_id in selected_type_ids: - yield { - "child_id": child_id, - "parent_id": parent_id, - "type": "HAS_PARENT", - } + yield { + "child_id": child_id, + "parent_id": parent_id, + "type": "HAS_PARENT", + } + + def _stream_raw_edges(self) -> Generator[dict]: + """ + Stream SAME_AS edges followed by HAS_PARENT edges for the selected `linked_ontology` and `node_type`. + """ + print("Streaming SAME_AS edges...") + streamed_wikidata_ids = set() + for edge in self._stream_all_same_as_edges(): + # Filter for mappings which are part of the selected `node_type`, as determined by the linked ontology. + # For example, if we are streaming Wikidata 'names' edges linked to LoC ids but the LoC id linked to some + # Wikidata id is classified as a 'location', we skip it. This filtering process also removes mappings which + # include invalid LoC ids (of which there are several thousand). + if self.id_type_checker.id_included_in_selected_type(edge["linked_id"]): + streamed_wikidata_ids.add(edge["wikidata_id"]) + yield edge + + print("Streaming HAS_PARENT edges...") + for edge in self._stream_all_has_parent_edges(): + # Only include an edge if its `child_id` was already streamed in a SAME_AS edge, indicating that + # the child item belongs under the selected `node_type`. + if edge["child_id"] in streamed_wikidata_ids: + yield edge def _stream_raw_nodes(self) -> Generator[dict]: """ Extract nodes via the following steps: - 1. Stream edges via the `_stream_raw_edges` method and extract Wikidata ids from the streamed edges. + 1. Stream raw edges and extract Wikidata ids from them. 2. Split the extracted ids into chunks. For each chunk, run a SPARQL query to retrieve all the corresponding Wikidata fields required to create a node. 
""" - all_ids = self._stream_wikidata_ids() + all_ids = self._stream_filtered_wikidata_ids() yield from self._parallelise_requests(all_ids, self._get_wikidata_items) def stream_raw(self) -> Generator[dict]: From 5c04ce3a30665e7a83bb99d1ce256731148302c1 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Tue, 28 Jan 2025 17:01:11 +0000 Subject: [PATCH 161/310] Apply auto-formatting rules --- src/sources/wikidata/linked_ontology_source.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index da314d17a1..caa3f51a05 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -5,7 +5,6 @@ from sources.base_source import BaseSource from transformers.base_transformer import EntityType from utils.streaming import process_stream_in_parallel -from functools import lru_cache from .linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker from .sparql_client import MAX_PARALLEL_SPARQL_QUERIES, WikidataSparqlClient From bccf2433d7879b1b9a2e7ca789dc7f1770d18bbd Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Fri, 24 Jan 2025 10:58:58 +0000 Subject: [PATCH 162/310] add test for extractor --- src/bulk_loader.py | 3 +- src/config.py | 10 + src/conftest.py | 20 + src/extractor.py | 6 +- src/fixtures/mesh_example.xml | 406 ++++++++++++++++++ .../linked_ontology_id_type_checker.py | 4 +- .../wikidata/linked_ontology_source.py | 1 - src/test_extractor.py | 29 ++ src/test_indexer.py | 18 +- src/test_mocks.py | 43 +- src/transformers/create_transformer.py | 8 +- 11 files changed, 517 insertions(+), 31 deletions(-) create mode 100644 src/config.py create mode 100644 src/fixtures/mesh_example.xml create mode 100644 src/test_extractor.py diff --git a/src/bulk_loader.py b/src/bulk_loader.py index e032fea535..856f54b809 100644 --- a/src/bulk_loader.py +++ b/src/bulk_loader.py @@ -2,12 +2,11 @@ import os import typing +from config import S3_BULK_LOAD_BUCKET_NAME from transformers.base_transformer import EntityType from transformers.create_transformer import TransformerType from utils.aws import get_neptune_client -S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] - def handler( transformer_type: TransformerType, entity_type: EntityType, is_local: bool = False diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000000..4761017706 --- /dev/null +++ b/src/config.py @@ -0,0 +1,10 @@ +import os + +S3_BULK_LOAD_BUCKET_NAME = os.environ.get("S3_BULK_LOAD_BUCKET_NAME") +GRAPH_QUERIES_SNS_TOPIC_ARN = os.environ.get("GRAPH_QUERIES_SNS_TOPIC_ARN") + +LOC_SUBJECT_HEADINGS_URL = ( + "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" +) +LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" +MESH_URL = "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.gz" diff --git a/src/conftest.py b/src/conftest.py index 7292ac18f2..7e3fa3e2d8 100644 --- a/src/conftest.py +++ b/src/conftest.py @@ -1,3 +1,5 @@ +import os + import pytest from _pytest.monkeypatch import MonkeyPatch @@ -9,3 +11,21 @@ def test(monkeypatch: MonkeyPatch) -> None: # Replaces boto3 and Elasticsearch with fake clients monkeypatch.setattr("boto3.Session", MockBoto3Session) monkeypatch.setattr("requests.request", MockRequest.request) + monkeypatch.setattr("requests.get", MockRequest.get) + + monkeypatch.setattr("config.S3_BULK_LOAD_BUCKET_NAME", "bulk_load_test_bucket") + 
monkeypatch.setattr( + "config.GRAPH_QUERIES_SNS_TOPIC_ARN", + "arn:aws:sns:us-east-1:123456789012:graph_queries_test_topic", + ) + + +@pytest.fixture(autouse=True) +def run_around_tests(): + MockRequest.reset_mocks() + yield + + +def load_fixture(file_name: str) -> bytes: + with open(f"{os.path.dirname(__file__)}/fixtures/{file_name}", "rb") as f: + return f.read() diff --git a/src/extractor.py b/src/extractor.py index 5963de4b5a..d8d6105775 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -1,14 +1,12 @@ import argparse -import os import typing +from config import GRAPH_QUERIES_SNS_TOPIC_ARN, S3_BULK_LOAD_BUCKET_NAME from transformers.base_transformer import BaseTransformer, EntityType, StreamDestination from transformers.create_transformer import TransformerType, create_transformer from utils.aws import get_neptune_client CHUNK_SIZE = 256 -S3_BULK_LOAD_BUCKET_NAME = os.environ.get("S3_BULK_LOAD_BUCKET_NAME") -GRAPH_QUERIES_SNS_TOPIC_ARN = os.environ.get("GRAPH_QUERIES_SNS_TOPIC_ARN") def handler( @@ -25,6 +23,8 @@ def handler( transformer: BaseTransformer = create_transformer(transformer_type, entity_type) + print(f"Transformer: {transformer}") + if stream_destination == "graph": neptune_client = get_neptune_client(is_local) transformer.stream_to_graph( diff --git a/src/fixtures/mesh_example.xml b/src/fixtures/mesh_example.xml new file mode 100644 index 0000000000..0332e6d689 --- /dev/null +++ b/src/fixtures/mesh_example.xml @@ -0,0 +1,406 @@ + + + + + D000001 + + Calcimycin + + + 1974 + 11 + 19 + + + 2023 + 02 + 26 + + + 1984 + 01 + 01 + + + + + Q000008 + + administration & dosage + + + AD + + + + Q000009 + + adverse effects + + + AE + + + + Q000031 + + analogs & derivatives + + + AA + + + + Q000032 + + analysis + + + AN + + + + Q000037 + + antagonists & inhibitors + + + AI + + + + Q000096 + + biosynthesis + + + BI + + + + Q000097 + + blood + + + BL + + + + Q000134 + + cerebrospinal fluid + + + CF + + + + Q000138 + + chemical synthesis + + + CS + + + + Q000145 + + classification + + + CL + + + + Q000191 + + economics + + + EC + + + + Q000266 + + history + + + HI + + + + Q000276 + + immunology + + + IM + + + + Q000302 + + isolation & purification + + + IP + + + + Q000378 + + metabolism + + + ME + + + + Q000493 + + pharmacokinetics + + + PK + + + + Q000494 + + pharmacology + + + PD + + + + Q000506 + + poisoning + + + PO + + + + Q000528 + + radiation effects + + + RE + + + + Q000592 + + standards + + + ST + + + + Q000600 + + supply & distribution + + + SD + + + + Q000627 + + therapeutic use + + + TU + + + + Q000633 + + toxicity + + + TO + + + + Q000652 + + urine + + + UR + + + + Q000737 + + chemistry + + + CH + + + + Q000819 + + agonists + + + AG + + + 91(75); was A 23187 1975-90 (see under ANTIBIOTICS 1975-83) + + use CALCIMYCIN to search A 23187 1975-90 + + 91; was A 23187 1975-90 (see under ANTIBIOTICS 1975-83) + + + Antibiotics (1973-1974) + Carboxylic Acids (1973-1974) + + + + + D000900 + + Anti-Bacterial Agents + + + + + + D061207 + + Calcium Ionophores + + + + + + D02.355.291.933.125 + D02.540.576.625.125 + D03.633.100.221.173 + D04.345.241.654.125 + D04.345.674.625.125 + + + + M0000001 + + Calcimycin + + + 37H9VM9WZL + + An ionophorous, polyether antibiotic from Streptomyces chartreusensis. It binds and transports CALCIUM and other divalent cations across membranes and uncouples oxidative phosphorylation while inhibiting ATPase of rat liver mitochondria. The substance is used mostly as a biochemical tool to study the role of divalent cations in various biological systems. 
+ + + 52665-69-7 (Calcimycin) + + + + M0000001 + M0353609 + + + + + T000002 + Calcimycin + + 1999 + 01 + 01 + + + FDA SRS (2014) + NLM (1975) + + + + T001124965 + 4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl)-1,7-dioxaspiro(5.5)undec-2-yl)methyl)-, (6S-(6alpha(2S*,3S*),8beta(R*),9beta,11alpha))- + + 2022 + 09 + 22 + + + NLM (2024) + + + + + + M0353609 + + A-23187 + + + 0 + + + + M0000001 + M0353609 + + + + + T000001 + A-23187 + + 1990 + 03 + 08 + + + NLM (1991) + + + + T000001 + A 23187 + + + T000003 + Antibiotic A23187 + + 1990 + 03 + 08 + + + NLM (1991) + + + + T000003 + A23187, Antibiotic + + + T000004 + A23187 + + 1974 + 11 + 11 + + + UNK (19XX) + + + + + + + \ No newline at end of file diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index 4aab9bc3f5..42847700ad 100644 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -4,9 +4,9 @@ import boto3 import smart_open -from .sparql_query_builder import NodeType, OntologyType +from config import S3_BULK_LOAD_BUCKET_NAME -S3_BULK_LOAD_BUCKET_NAME = os.environ["S3_BULK_LOAD_BUCKET_NAME"] +from .sparql_query_builder import NodeType, OntologyType class LinkedOntologyIdTypeChecker: diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index e5f7f2ca2c..06d779a5a0 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -9,7 +9,6 @@ from .sparql_query_builder import NodeType, OntologyType, SparqlQueryBuilder SPARQL_ITEMS_CHUNK_SIZE = 400 - WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" diff --git a/src/test_extractor.py b/src/test_extractor.py new file mode 100644 index 0000000000..b660e7cc5a --- /dev/null +++ b/src/test_extractor.py @@ -0,0 +1,29 @@ +import os + +from config import MESH_URL +from conftest import load_fixture +from extractor import lambda_handler +from test_mocks import MockRequest + + +def test_lambda_handler() -> None: + content = load_fixture("mesh_example.xml") + + MockRequest.mock_responses( + [{"method": "GET", "url": MESH_URL, "status_code": 200, "content": content}] + ) + + event = { + "transformer_type": "mesh_concepts", + "entity_type": "nodes", + "stream_destination": "void", + } + + lambda_handler(event, None) + + assert len(MockRequest.calls) == 1 + request = MockRequest.calls[0] + + # Check we are sending a GET request to the correct endpoint + assert request["method"] == "GET" + assert request["url"] == MESH_URL diff --git a/src/test_indexer.py b/src/test_indexer.py index 17598068b7..3d26231775 100644 --- a/src/test_indexer.py +++ b/src/test_indexer.py @@ -5,14 +5,16 @@ def test_lambda_handler() -> None: - MockRequest.responses = [ - { - "method": "POST", - "url": "https://test-host.com:8182/openCypher", - "status_code": 200, - "json_data": {"results": {"foo": "bar"}}, - } - ] + MockRequest.mock_responses( + [ + { + "method": "POST", + "url": "https://test-host.com:8182/openCypher", + "status_code": 200, + "json_data": {"results": {"foo": "bar"}}, + } + ] + ) event = {"Records": [{"body": json.dumps({"Message": "SOME_QUERY"})}]} diff --git a/src/test_mocks.py b/src/test_mocks.py index eeb658a5c1..937cd21f9b 100644 --- a/src/test_mocks.py +++ b/src/test_mocks.py @@ -45,13 +45,19 @@ def get_credentials(self) -> Credentials: class MockResponse: - def __init__(self, json_data: dict, 
status_code: int) -> None: + def __init__( + self, status_code: int, json_data: dict = None, content: bytes = None + ) -> None: self.json_data = json_data self.status_code = status_code + self.content = content - def json(self) -> dict: + def json(self) -> dict | None: return self.json_data + def content(self) -> bytes | None: + return self.content + class MockRequest: responses: list[dict] = [] @@ -71,22 +77,30 @@ def reset_mocks() -> None: MockRequest.clear_mock_calls() @staticmethod - def mock_response(method: str, url: str, status_code: int, json_data: dict) -> None: + def mock_response( + method: str, + url: str, + status_code: int, + json_data: dict = None, + content: bytes = None, + ) -> None: MockRequest.responses.append( { "method": method, "url": url, - "status_code": status_code, - "json_data": json_data, + "response": MockResponse(status_code, json_data, content), } ) @staticmethod - def mock_responses(method: str, url: str, responses: list[dict]) -> None: - MockRequest.clear_mock_responses() + def mock_responses(responses: list[dict]) -> None: for response in responses: MockRequest.mock_response( - method, url, response["status_code"], response["json_data"] + response["method"], + response["url"], + response["status_code"], + response.get("json_data"), + response.get("content"), ) @staticmethod @@ -96,6 +110,17 @@ def request(method: str, url: str, data: dict, headers: dict) -> MockResponse: ) for response in MockRequest.responses: if response["method"] == method and response["url"] == url: - return MockResponse(response["json_data"], response["status_code"]) + return response["response"] raise Exception(f"Unexpected request: {method} {url}") + + @staticmethod + def get(url: str, data: dict = {}, headers: dict = {}) -> MockResponse: + MockRequest.calls.append( + {"method": "GET", "url": url, "data": data, "headers": headers} + ) + for response in MockRequest.responses: + if response["method"] == "GET" and response["url"] == url: + return response["response"] + + raise Exception(f"Unexpected request: GET {url}") diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py index 9fc3315d2c..e04478940b 100644 --- a/src/transformers/create_transformer.py +++ b/src/transformers/create_transformer.py @@ -1,5 +1,7 @@ from typing import Literal +from config import LOC_NAMES_URL, LOC_SUBJECT_HEADINGS_URL, MESH_URL + from .base_transformer import BaseTransformer, EntityType from .loc.concepts_transformer import LibraryOfCongressConceptsTransformer from .loc.locations_transformer import LibraryOfCongressLocationsTransformer @@ -10,12 +12,6 @@ from .wikidata.locations_transformer import WikidataLocationsTransformer from .wikidata.names_transformer import WikidataNamesTransformer -LOC_SUBJECT_HEADINGS_URL = ( - "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" -) -LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" -MESH_URL = "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.gz" - TransformerType = Literal[ "loc_concepts", "loc_names", From 3b02d81a74e43043164e4c9a73f538cb9f3c752e Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Fri, 24 Jan 2025 14:58:05 +0000 Subject: [PATCH 163/310] testing for all of mesh_concepts extractor space --- src/conftest.py | 18 +- src/extractor.py | 21 +- src/fixtures/mesh_example.xml | 1846 +++++++++++++++++++++++++++ src/sources/mesh/concepts_source.py | 1 + src/test_extractor.py | 132 +- src/test_indexer.py | 1 + src/test_mocks.py | 87 +- 7 files changed, 
2059 insertions(+), 47 deletions(-) diff --git a/src/conftest.py b/src/conftest.py index 7e3fa3e2d8..96c15aeb1f 100644 --- a/src/conftest.py +++ b/src/conftest.py @@ -1,17 +1,18 @@ -import os +from typing import Any, Generator import pytest from _pytest.monkeypatch import MonkeyPatch -from test_mocks import MockBoto3Session, MockRequest +from test_mocks import MockBoto3Session, MockRequest, MockSmartOpen, MockSNSClient @pytest.fixture(autouse=True) -def test(monkeypatch: MonkeyPatch) -> None: +def test(monkeypatch: MonkeyPatch) -> Generator[Any, Any, Any]: # Replaces boto3 and Elasticsearch with fake clients monkeypatch.setattr("boto3.Session", MockBoto3Session) monkeypatch.setattr("requests.request", MockRequest.request) monkeypatch.setattr("requests.get", MockRequest.get) + monkeypatch.setattr("smart_open.open", MockSmartOpen.open) monkeypatch.setattr("config.S3_BULK_LOAD_BUCKET_NAME", "bulk_load_test_bucket") monkeypatch.setattr( @@ -19,13 +20,8 @@ def test(monkeypatch: MonkeyPatch) -> None: "arn:aws:sns:us-east-1:123456789012:graph_queries_test_topic", ) - -@pytest.fixture(autouse=True) -def run_around_tests(): MockRequest.reset_mocks() + MockSmartOpen.reset_mocks() + MockSNSClient.reset_mocks() yield - - -def load_fixture(file_name: str) -> bytes: - with open(f"{os.path.dirname(__file__)}/fixtures/{file_name}", "rb") as f: - return f.read() + # Run any cleanup code here diff --git a/src/extractor.py b/src/extractor.py index d8d6105775..de5c2e7e52 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -1,7 +1,7 @@ import argparse import typing -from config import GRAPH_QUERIES_SNS_TOPIC_ARN, S3_BULK_LOAD_BUCKET_NAME +import config from transformers.base_transformer import BaseTransformer, EntityType, StreamDestination from transformers.create_transformer import TransformerType, create_transformer from utils.aws import get_neptune_client @@ -9,6 +9,13 @@ CHUNK_SIZE = 256 +class LambdaEvent(typing.TypedDict): + transformer_type: TransformerType + entity_type: EntityType + stream_destination: StreamDestination + sample_size: int | None + + def handler( stream_destination: StreamDestination, transformer_type: TransformerType, @@ -23,8 +30,6 @@ def handler( transformer: BaseTransformer = create_transformer(transformer_type, entity_type) - print(f"Transformer: {transformer}") - if stream_destination == "graph": neptune_client = get_neptune_client(is_local) transformer.stream_to_graph( @@ -32,19 +37,19 @@ def handler( ) elif stream_destination == "s3": assert ( - S3_BULK_LOAD_BUCKET_NAME is not None + config.S3_BULK_LOAD_BUCKET_NAME is not None ), "To stream to S3, the S3_BULK_LOAD_BUCKET_NAME environment variable must be defined." file_name = f"{transformer_type}__{entity_type}.csv" - s3_uri = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{file_name}" + s3_uri = f"s3://{config.S3_BULK_LOAD_BUCKET_NAME}/{file_name}" transformer.stream_to_s3(s3_uri, entity_type, CHUNK_SIZE, sample_size) elif stream_destination == "sns": assert ( - GRAPH_QUERIES_SNS_TOPIC_ARN is not None + config.GRAPH_QUERIES_SNS_TOPIC_ARN is not None ), "To stream to SNS, the GRAPH_QUERIES_SNS_TOPIC_ARN environment variable must be defined." 
transformer.stream_to_sns( - GRAPH_QUERIES_SNS_TOPIC_ARN, entity_type, CHUNK_SIZE, sample_size + config.GRAPH_QUERIES_SNS_TOPIC_ARN, entity_type, CHUNK_SIZE, sample_size ) elif stream_destination == "void": transformer.stream_to_nowhere(entity_type, CHUNK_SIZE, sample_size) @@ -52,7 +57,7 @@ def handler( raise ValueError("Unsupported stream destination.") -def lambda_handler(event: dict, context: typing.Any) -> None: +def lambda_handler(event: LambdaEvent, context: typing.Any) -> None: stream_destination = event["stream_destination"] transformer_type = event["transformer_type"] entity_type = event["entity_type"] diff --git a/src/fixtures/mesh_example.xml b/src/fixtures/mesh_example.xml index 0332e6d689..5d714d9b83 100644 --- a/src/fixtures/mesh_example.xml +++ b/src/fixtures/mesh_example.xml @@ -1,6 +1,1852 @@ + + D009930 + + Organic Chemicals + + + 1999 + 01 + 01 + + + 2020 + 05 + 27 + + + 1998 + 01 + 01 + + + + + Q000008 + + administration & dosage + + + AD + + + + Q000009 + + adverse effects + + + AE + + + + Q000032 + + analysis + + + AN + + + + Q000037 + + antagonists & inhibitors + + + AI + + + + Q000097 + + blood + + + BL + + + + Q000134 + + cerebrospinal fluid + + + CF + + + + Q000138 + + chemical synthesis + + + CS + + + + Q000145 + + classification + + + CL + + + + Q000191 + + economics + + + EC + + + + Q000266 + + history + + + HI + + + + Q000276 + + immunology + + + IM + + + + Q000302 + + isolation & purification + + + IP + + + + Q000378 + + metabolism + + + ME + + + + Q000493 + + pharmacokinetics + + + PK + + + + Q000494 + + pharmacology + + + PD + + + + Q000506 + + poisoning + + + PO + + + + Q000528 + + radiation effects + + + RE + + + + Q000592 + + standards + + + ST + + + + Q000600 + + supply & distribution + + + SD + + + + Q000627 + + therapeutic use + + + TU + + + + Q000633 + + toxicity + + + TO + + + + Q000652 + + urine + + + UR + + + + Q000737 + + chemistry + + + CH + + + + Q000819 + + agonists + + + AG + + + general or unspecified; prefer specific groups or specific chemicals + + 98; use explode 1967-97 + + 98 + + + D02 + + + + M0015401 + + Organic Chemicals + + + 0 + + A broad class of substances containing carbon and its derivatives. Many of these chemicals will frequently contain hydrogen with or without oxygen, nitrogen, sulfur, phosphorus, and other elements. They exist in either carbon chain or carbon ring form. 
+ + + + T029381 + Organic Chemicals + + 1999 + 01 + 01 + + + NLM (1966) + + + + T029381 + Chemicals, Organic + + + T000998325 + Organic Chemical + + 2019 + 11 + 20 + + + NLM (2021) + + + + T000998325 + Chemical, Organic + + + + + + + D004987 + + Ethers + + + 1999 + 01 + 01 + + + 2015 + 11 + 23 + + + 1966 + 01 + 01 + + + + + Q000008 + + administration & dosage + + + AD + + + + Q000009 + + adverse effects + + + AE + + + + Q000032 + + analysis + + + AN + + + + Q000037 + + antagonists & inhibitors + + + AI + + + + Q000097 + + blood + + + BL + + + + Q000134 + + cerebrospinal fluid + + + CF + + + + Q000138 + + chemical synthesis + + + CS + + + + Q000145 + + classification + + + CL + + + + Q000191 + + economics + + + EC + + + + Q000266 + + history + + + HI + + + + Q000276 + + immunology + + + IM + + + + Q000302 + + isolation & purification + + + IP + + + + Q000378 + + metabolism + + + ME + + + + Q000493 + + pharmacokinetics + + + PK + + + + Q000494 + + pharmacology + + + PD + + + + Q000506 + + poisoning + + + PO + + + + Q000528 + + radiation effects + + + RE + + + + Q000592 + + standards + + + ST + + + + Q000600 + + supply & distribution + + + SD + + + + Q000627 + + therapeutic use + + + TU + + + + Q000633 + + toxicity + + + TO + + + + Q000652 + + urine + + + UR + + + + Q000737 + + chemistry + + + CH + + + + Q000819 + + agonists + + + AG + + + do not confuse with ETHER + + ETHOGLUCID was see under ALKYLATING AGENTS 1967-74 + + + D02.355 + + + + M0007828 + + Ethers + + + 0 + + Organic compounds having two alkyl or aryl groups bonded to an oxygen atom, as in the formula R1–O–R2. + + + + T015221 + Ethers + + 1999 + 01 + 01 + + + NLM (1966) + + + + + + + + D004988 + + Ethers, Cyclic + + + 1999 + 01 + 01 + + + 2020 + 05 + 27 + + + 1966 + 01 + 01 + + + + + Q000008 + + administration & dosage + + + AD + + + + Q000009 + + adverse effects + + + AE + + + + Q000032 + + analysis + + + AN + + + + Q000037 + + antagonists & inhibitors + + + AI + + + + Q000097 + + blood + + + BL + + + + Q000134 + + cerebrospinal fluid + + + CF + + + + Q000138 + + chemical synthesis + + + CS + + + + Q000145 + + classification + + + CL + + + + Q000191 + + economics + + + EC + + + + Q000266 + + history + + + HI + + + + Q000276 + + immunology + + + IM + + + + Q000302 + + isolation & purification + + + IP + + + + Q000378 + + metabolism + + + ME + + + + Q000493 + + pharmacokinetics + + + PK + + + + Q000494 + + pharmacology + + + PD + + + + Q000506 + + poisoning + + + PO + + + + Q000528 + + radiation effects + + + RE + + + + Q000592 + + standards + + + ST + + + + Q000600 + + supply & distribution + + + SD + + + + Q000627 + + therapeutic use + + + TU + + + + Q000633 + + toxicity + + + TO + + + + Q000652 + + urine + + + UR + + + + Q000737 + + chemistry + + + CH + + + + Q000819 + + agonists + + + AG + + + + D02.355.291 + D04.345.241 + + + + M0007829 + + Ethers, Cyclic + + + 0 + + Compounds of the general formula R-O-R arranged in a ring or crown formation. 
+ + + + T015222 + Ethers, Cyclic + + 1999 + 01 + 01 + + + NLM (1966) + + + + T015222 + Cyclic Ethers + + + T000998163 + Cyclic Ether + + 2019 + 11 + 20 + + + NLM (2021) + + + + T000998163 + Ether, Cyclic + + + + + + + D000095662 + + Polyether Polyketides + + + 2023 + 07 + 26 + + + 2023 + 02 + 26 + + + 2024 + 01 + 01 + + + + + Q000008 + + administration & dosage + + + AD + + + + Q000009 + + adverse effects + + + AE + + + + Q000031 + + analogs & derivatives + + + AA + + + + Q000032 + + analysis + + + AN + + + + Q000037 + + antagonists & inhibitors + + + AI + + + + Q000097 + + blood + + + BL + + + + Q000134 + + cerebrospinal fluid + + + CF + + + + Q000138 + + chemical synthesis + + + CS + + + + Q000145 + + classification + + + CL + + + + Q000191 + + economics + + + EC + + + + Q000266 + + history + + + HI + + + + Q000276 + + immunology + + + IM + + + + Q000302 + + isolation & purification + + + IP + + + + Q000378 + + metabolism + + + ME + + + + Q000493 + + pharmacokinetics + + + PK + + + + Q000494 + + pharmacology + + + PD + + + + Q000506 + + poisoning + + + PO + + + + Q000528 + + radiation effects + + + RE + + + + Q000592 + + standards + + + ST + + + + Q000600 + + supply & distribution + + + SD + + + + Q000627 + + therapeutic use + + + TU + + + + Q000633 + + toxicity + + + TO + + + + Q000652 + + urine + + + UR + + + + Q000737 + + chemistry + + + CH + + + + Q000819 + + agonists + + + AG + + + 2024 + + 2024 + + + D02.355.291.933 + D02.540.576.625 + D04.345.241.654 + D04.345.674.625 + + + + M000758735 + + Polyether Polyketides + + + 0 + + Polycyclic polyketides with distinctive ladder shaped multiple polyether motifs. In STREPTOMYCES polycyclic polyketide backbones are assembled by type I modular polyketide synthases which then are modified by epoxidase and epoxide hydrolase to achieve their stereochemistry. Polyether ionophore antibiotics coordinate alkali metal cations with their multiple oxygen atoms which disturbs sodium/potassium concentration gradient. They are widely used to treat and control COCCIDIOSIS. Structurally related MARINE LADDER POLYETHERS are neurotoxins. 
+ + + + M000758735 + M000758734 + + + M000758735 + M000758795 + + + + + T001124911 + Polyether Polyketides + + 2022 + 09 + 21 + + + NLM (2024) + + + + + + M000758734 + + Polyether Antibiotics + + + 0 + + + + M000758735 + M000758734 + + + + + T001124909 + Polyether Antibiotics + + 2022 + 09 + 21 + + + NLM (2024) + + + + T001124907 + Polyether Ionophore Antibiotics + + 2022 + 09 + 21 + + + NLM (2024) + + + + T001124908 + Polyether Ionophores + + 2022 + 09 + 21 + + + NLM (2024) + + + + + + M000758795 + + Marine Ladder Polyethers + + + 0 + + + + M000758735 + M000758795 + + + + + T001125008 + Marine Ladder Polyethers + + 2022 + 09 + 23 + + + NLM (2024) + + + + + + + + D000095702 + + Polyether Toxins + + + 2023 + 07 + 26 + + + 2023 + 06 + 07 + + + 2024 + 01 + 01 + + + + + Q000008 + + administration & dosage + + + AD + + + + Q000009 + + adverse effects + + + AE + + + + Q000031 + + analogs & derivatives + + + AA + + + + Q000032 + + analysis + + + AN + + + + Q000037 + + antagonists & inhibitors + + + AI + + + + Q000096 + + biosynthesis + + + BI + + + + Q000097 + + blood + + + BL + + + + Q000134 + + cerebrospinal fluid + + + CF + + + + Q000138 + + chemical synthesis + + + CS + + + + Q000145 + + classification + + + CL + + + + Q000191 + + economics + + + EC + + + + Q000266 + + history + + + HI + + + + Q000276 + + immunology + + + IM + + + + Q000302 + + isolation & purification + + + IP + + + + Q000378 + + metabolism + + + ME + + + + Q000493 + + pharmacokinetics + + + PK + + + + Q000494 + + pharmacology + + + PD + + + + Q000506 + + poisoning + + + PO + + + + Q000528 + + radiation effects + + + RE + + + + Q000592 + + standards + + + ST + + + + Q000600 + + supply & distribution + + + SD + + + + Q000627 + + therapeutic use + + + TU + + + + Q000633 + + toxicity + + + TO + + + + Q000652 + + urine + + + UR + + + + Q000737 + + chemistry + + + CH + + + + Q000819 + + agonists + + + AG + + + 2024 + + 2024 + + + Marine Toxins (1992-2023) + + + D02.355.291.933.875 + D02.540.576.625.937 + D04.345.241.654.937 + D04.345.674.625.937 + D23.946.580.800 + + + + M000758736 + + Polyether Toxins + + + 0 + + Polycyclic polyether polyketides most often produced by marine organisms, e.g., DINOFLAGELLATES. Many polyether marine toxins are ionophores and neurotoxins and responsible for seafood poisoning. 
+ + + + T001124915 + Polyether Toxins + + 2022 + 09 + 21 + + + NLM (2024) + + + + T001124914 + Marine Polyether Toxins + + 2022 + 09 + 21 + + + NLM (2024) + + + + + + + + D000095722 + + Amphidinols + + + 2023 + 07 + 26 + + + 2023 + 04 + 14 + + + 2024 + 01 + 01 + + + + + Q000008 + + administration & dosage + + + AD + + + + Q000009 + + adverse effects + + + AE + + + + Q000031 + + analogs & derivatives + + + AA + + + + Q000032 + + analysis + + + AN + + + + Q000037 + + antagonists & inhibitors + + + AI + + + + Q000097 + + blood + + + BL + + + + Q000134 + + cerebrospinal fluid + + + CF + + + + Q000138 + + chemical synthesis + + + CS + + + + Q000145 + + classification + + + CL + + + + Q000191 + + economics + + + EC + + + + Q000266 + + history + + + HI + + + + Q000276 + + immunology + + + IM + + + + Q000302 + + isolation & purification + + + IP + + + + Q000378 + + metabolism + + + ME + + + + Q000493 + + pharmacokinetics + + + PK + + + + Q000494 + + pharmacology + + + PD + + + + Q000506 + + poisoning + + + PO + + + + Q000528 + + radiation effects + + + RE + + + + Q000592 + + standards + + + ST + + + + Q000600 + + supply & distribution + + + SD + + + + Q000627 + + therapeutic use + + + TU + + + + Q000633 + + toxicity + + + TO + + + + Q000652 + + urine + + + UR + + + + Q000737 + + chemistry + + + CH + + + + Q000819 + + agonists + + + AG + + + 2024 + + 2024 + + + Macrolides (1991-2023) + + + D02.355.291.933.063 + D02.540.576.625.063 + D04.345.241.654.063 + D04.345.674.625.063 + D23.946.580.080 + + + + M000758793 + + Amphidinols + + + 0 + + Long linear polyketides with polyol groups and polyolefins and core motifs of two tetrahydropyran rings produced by marine dinoflagellates such as AMPHIDINIUM. Many amphidinols are hemolytic toxins with antifungal and other cytotoxic activities. + + + + T001125006 + Amphidinols + + 2022 + 09 + 23 + + + NLM (2024) + + + + T001135668 + Amphidinol + + 2023 + 03 + 30 + + + NLM (2024) + + + + + + D000001 diff --git a/src/sources/mesh/concepts_source.py b/src/sources/mesh/concepts_source.py index f8a17ac454..0a20d891dd 100644 --- a/src/sources/mesh/concepts_source.py +++ b/src/sources/mesh/concepts_source.py @@ -22,6 +22,7 @@ def _get_mesh_data(url: str) -> ET.Element: via a tree number lookup. 
""" response = requests.get(url) + return ET.fromstring(response.content) def _treenum_lookup(self) -> dict[str, str]: diff --git a/src/test_extractor.py b/src/test_extractor.py index b660e7cc5a..201de4af34 100644 --- a/src/test_extractor.py +++ b/src/test_extractor.py @@ -1,29 +1,123 @@ -import os +from typing import Any, Generator + +import pytest +from typing_extensions import get_args from config import MESH_URL -from conftest import load_fixture -from extractor import lambda_handler -from test_mocks import MockRequest +from extractor import LambdaEvent, lambda_handler +from test_mocks import MockRequest, MockResponseInput +from test_utils import load_fixture +from transformers.base_transformer import EntityType, StreamDestination +from transformers.create_transformer import TransformerType + +def mock_requests_lookup_table( + entity_type: EntityType, + destination: StreamDestination, + transformer_type: TransformerType, +): + mock_mesh_retrieval = { + "method": "GET", + "url": MESH_URL, + "status_code": 200, + "content_bytes": load_fixture("mesh_example.xml"), + "json_data": None, + } + + mock_graph_post = { + "method": "POST", + "url": "https://test-host.com:8182/openCypher", + "status_code": 200, + "content_bytes": None, + "json_data": {"results": {}}, + } -def test_lambda_handler() -> None: - content = load_fixture("mesh_example.xml") + if transformer_type == "mesh_concepts": + if destination == "s3" or destination == "void" or destination == "sns": + return [mock_mesh_retrieval] + elif destination == "graph": + return [mock_mesh_retrieval, mock_graph_post] - MockRequest.mock_responses( - [{"method": "GET", "url": MESH_URL, "status_code": 200, "content": content}] + raise ValueError( + f"Unsupported entity_type: {entity_type}, destination: {destination}, transformer_type: {transformer_type}" ) - event = { - "transformer_type": "mesh_concepts", - "entity_type": "nodes", - "stream_destination": "void", - } - lambda_handler(event, None) +def build_test_matrix() -> Generator[tuple[LambdaEvent, list[MockResponseInput]], Any]: + transformer_types = get_args(TransformerType) + entity_types = get_args(EntityType) + stream_destinations = get_args(StreamDestination) + + transformer_types_to_test = [ + transformer_type + for transformer_type in transformer_types + if transformer_type in ["mesh_concepts"] + ] + + stream_destination_to_test = [ + stream_destination + for stream_destination in stream_destinations + if stream_destination in ["graph", "s3", "void", "sns"] + ] + + entity_type_to_test = [ + entity_type for entity_type in entity_types if entity_type in ["nodes", "edges"] + ] + + for transformer_type in transformer_types_to_test: + for entity_type in entity_type_to_test: + for stream_destination in stream_destination_to_test: + yield ( + { + "transformer_type": transformer_type, + "entity_type": entity_type, + "stream_destination": stream_destination, + "sample_size": 1, + }, + mock_requests_lookup_table( + entity_type, stream_destination, transformer_type + ), + ) + + +@pytest.mark.parametrize( + "lambda_event, mock_responses", + build_test_matrix(), +) +def test_lambda_handler( + lambda_event: LambdaEvent, + mock_responses: list[MockResponseInput], +) -> None: + + MockRequest.mock_responses(mock_responses) + lambda_handler(lambda_event, None) + + transformer_type = lambda_event["transformer_type"] + entity_type = lambda_event["entity_type"] + destination = lambda_event["stream_destination"] + + if transformer_type == "mesh_concepts": + if destination == "void" or destination == "s3" 
or destination == "sns": + assert ( + len(MockRequest.calls) == 1 + ), f"Expected 1 request, got {len(MockRequest.calls)}, with config {entity_type}, {destination}: {MockRequest.calls}" + request = MockRequest.calls[0] + + assert request["method"] == "GET" + assert request["url"] == MESH_URL + + elif destination == "graph": + assert len(MockRequest.calls) == 2 + mesh_request = MockRequest.calls[0] + + assert mesh_request["method"] == "GET" + assert mesh_request["url"] == MESH_URL - assert len(MockRequest.calls) == 1 - request = MockRequest.calls[0] + graph_request = MockRequest.calls[1] - # Check we are sending a GET request to the correct endpoint - assert request["method"] == "GET" - assert request["url"] == MESH_URL + assert graph_request["method"] == "POST" + assert graph_request["url"] == "https://test-host.com:8182/openCypher" + else: + raise ValueError( + f"Unsupported entity_type: {entity_type}, destination: {destination}, transformer_type: {transformer_type}" + ) diff --git a/src/test_indexer.py b/src/test_indexer.py index 3d26231775..8ed9ba7bc1 100644 --- a/src/test_indexer.py +++ b/src/test_indexer.py @@ -12,6 +12,7 @@ def test_lambda_handler() -> None: "url": "https://test-host.com:8182/openCypher", "status_code": 200, "json_data": {"results": {"foo": "bar"}}, + "content_bytes": None, } ] ) diff --git a/src/test_mocks.py b/src/test_mocks.py index 937cd21f9b..4364decbf2 100644 --- a/src/test_mocks.py +++ b/src/test_mocks.py @@ -1,5 +1,10 @@ +import io +import tempfile +from typing import Any, TypedDict + from botocore.credentials import Credentials +from test_utils import load_fixture from utils.aws import INSTANCE_ENDPOINT_SECRET_NAME, LOAD_BALANCER_SECRET_NAME MOCK_API_KEY = "TEST_SECRET_API_KEY_123" @@ -11,6 +16,29 @@ ) +class MockSmartOpen: + file_lookup: dict = {} + + @staticmethod + def reset_mocks() -> None: + MockSmartOpen.file_lookup = {} + + @staticmethod + def get_mock_file(uri: str) -> io.StringIO: + return MockSmartOpen.file_lookup[uri] + + @staticmethod + def open(uri, mode, **kwargs: Any) -> Any: + # Create an in-memory text stream + mock_file = io.StringIO() + + # Save the file object in the file lookup + MockSmartOpen.file_lookup[uri] = mock_file + + # create temp file and open it with given mode + return mock_file + + class MockAwsService: def __init__(self) -> None: return None @@ -28,10 +56,36 @@ def get_secret_value(self, SecretId: str) -> dict: return {"SecretString": secret_value} +class MockS3Client(MockAwsService): + def __init__(self) -> None: + return + + +class MockSNSClient(MockAwsService): + publish_batch_request_entries: list[dict] = [] + + @staticmethod + def reset_mocks() -> None: + MockSNSClient.publish_batch_request_entries = [] + + def __init__(self) -> None: + return + + def publish_batch(self, TopicArn: str, PublishBatchRequestEntries: list): + MockSNSClient.publish_batch_request_entries.append( + { + "TopicArn": TopicArn, + "PublishBatchRequestEntries": PublishBatchRequestEntries, + } + ) + + class MockBoto3Session: def __init__(self) -> None: self.clients = { "secretsmanager": MockSecretsManagerClient(), + "s3": MockS3Client(), + "sns": MockSNSClient(), } def client(self, client_name: str) -> MockAwsService: @@ -46,7 +100,10 @@ def get_credentials(self) -> Credentials: class MockResponse: def __init__( - self, status_code: int, json_data: dict = None, content: bytes = None + self, + status_code: int, + json_data: dict | None = None, + content: bytes | None = None, ) -> None: self.json_data = json_data self.status_code = status_code @@ 
-55,12 +112,23 @@ def __init__( def json(self) -> dict | None: return self.json_data - def content(self) -> bytes | None: - return self.content + +class MockRequestExpectation(TypedDict): + method: str + url: str + response: MockResponse + + +class MockResponseInput(TypedDict): + method: str + url: str + status_code: int + content_bytes: bytes | None + json_data: dict | None class MockRequest: - responses: list[dict] = [] + responses: list[MockRequestExpectation] = [] calls: list[dict] = [] @staticmethod @@ -69,6 +137,7 @@ def clear_mock_responses() -> None: @staticmethod def clear_mock_calls() -> None: + print("Clearing mock calls") MockRequest.calls = [] @staticmethod @@ -81,26 +150,26 @@ def mock_response( method: str, url: str, status_code: int, - json_data: dict = None, - content: bytes = None, + json_data: dict | None = None, + content_bytes: bytes | None = None, ) -> None: MockRequest.responses.append( { "method": method, "url": url, - "response": MockResponse(status_code, json_data, content), + "response": MockResponse(status_code, json_data, content_bytes), } ) @staticmethod - def mock_responses(responses: list[dict]) -> None: + def mock_responses(responses: list[MockResponseInput]) -> None: for response in responses: MockRequest.mock_response( response["method"], response["url"], response["status_code"], response.get("json_data"), - response.get("content"), + response.get("content_bytes"), ) @staticmethod From 3dbdb53fbbe65a114c98b9c9498cf7560508eedd Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Fri, 24 Jan 2025 16:12:38 +0000 Subject: [PATCH 164/310] loc subject concepts working --- src/fixtures/loc_subjects_example.jsonld | 1 + src/test_extractor.py | 50 ++++++++++++++++++------ src/test_mocks.py | 22 ++++++++--- src/test_utils.py | 6 +++ 4 files changed, 63 insertions(+), 16 deletions(-) create mode 100644 src/fixtures/loc_subjects_example.jsonld create mode 100644 src/test_utils.py diff --git a/src/fixtures/loc_subjects_example.jsonld b/src/fixtures/loc_subjects_example.jsonld new file mode 100644 index 0000000000..4629d0309e --- /dev/null +++ b/src/fixtures/loc_subjects_example.jsonld @@ -0,0 +1 @@ +{ "@context": "http://v3/authorities/subjects/context.json", "@graph": [ { "@id": "http://id.loc.gov/authorities/subjects/sh2006006405", "@type": "skos:Concept", "skos:prefLabel": { "@language": "en", "@value": "Object-oriented programming languages" } }, { "@id": "http://id.loc.gov/authorities/subjects/sh2006007256", "@type": "skos:Concept", "skos:prefLabel": { "@language": "en", "@value": "Scripting languages (Computer science)" } }, { "@id": "http://id.loc.gov/authorities/subjects/sh00000011", "@type": "skos:Concept", "skos:broader": [ { "@id": "http://id.loc.gov/authorities/subjects/sh2006006405" }, { "@id": "http://id.loc.gov/authorities/subjects/sh2006007256" }, { "@id": "http://id.loc.gov/authorities/subjects/sh2007005223" } ], "skos:changeNote": [ { "@id": "_:n6bdc44981812457dbabe82d08ead45aab1" }, { "@id": "_:n6bdc44981812457dbabe82d08ead45aab2" } ], "skos:inScheme": { "@id": "http://id.loc.gov/authorities/subjects" }, "skos:prefLabel": { "@language": "en", "@value": "ActionScript (Computer program language)" } }, { "@id": "_:n6bdc44981812457dbabe82d08ead45aab1", "@type": "cs:ChangeSet", "cs:changeReason": "new", "cs:createdDate": { "@type": "xsd:dateTime", "@value": "2000-09-27T00:00:00" }, "cs:creatorName": { "@id": "http://id.loc.gov/vocabulary/organizations/dlc" }, "cs:subjectOfChange": { "@id": "http://id.loc.gov/authorities/subjects/sh00000011" } }, { "@id": 
"_:n6bdc44981812457dbabe82d08ead45aab2", "@type": "cs:ChangeSet", "cs:changeReason": "revised", "cs:createdDate": { "@type": "xsd:dateTime", "@value": "2007-10-12T07:53:10" }, "cs:creatorName": { "@id": "http://id.loc.gov/vocabulary/organizations/abau" }, "cs:subjectOfChange": { "@id": "http://id.loc.gov/authorities/subjects/sh00000011" } }, { "@id": "http://id.loc.gov/authorities/subjects/sh2007005223", "@type": "skos:Concept", "skos:prefLabel": { "@language": "en", "@value": "Domain-specific programming languages" } } ], "@id": "/authorities/subjects/sh00000011" } \ No newline at end of file diff --git a/src/test_extractor.py b/src/test_extractor.py index 201de4af34..f5cfaf49ef 100644 --- a/src/test_extractor.py +++ b/src/test_extractor.py @@ -3,7 +3,7 @@ import pytest from typing_extensions import get_args -from config import MESH_URL +from config import LOC_SUBJECT_HEADINGS_URL, MESH_URL from extractor import LambdaEvent, lambda_handler from test_mocks import MockRequest, MockResponseInput from test_utils import load_fixture @@ -12,10 +12,9 @@ def mock_requests_lookup_table( - entity_type: EntityType, destination: StreamDestination, transformer_type: TransformerType, -): +) -> Any: mock_mesh_retrieval = { "method": "GET", "url": MESH_URL, @@ -24,6 +23,14 @@ def mock_requests_lookup_table( "json_data": None, } + mock_loc_subjects_retrieval = { + "method": "GET", + "url": LOC_SUBJECT_HEADINGS_URL, + "status_code": 200, + "content_bytes": load_fixture("loc_subjects_example.jsonld"), + "json_data": None, + } + mock_graph_post = { "method": "POST", "url": "https://test-host.com:8182/openCypher", @@ -37,9 +44,14 @@ def mock_requests_lookup_table( return [mock_mesh_retrieval] elif destination == "graph": return [mock_mesh_retrieval, mock_graph_post] + elif transformer_type == "loc_concepts": + if destination == "s3" or destination == "void" or destination == "sns": + return [mock_loc_subjects_retrieval] + elif destination == "graph": + return [mock_loc_subjects_retrieval, mock_graph_post] raise ValueError( - f"Unsupported entity_type: {entity_type}, destination: {destination}, transformer_type: {transformer_type}" + f"Unsupported destination: {destination}, transformer_type: {transformer_type}" ) @@ -51,7 +63,7 @@ def build_test_matrix() -> Generator[tuple[LambdaEvent, list[MockResponseInput]] transformer_types_to_test = [ transformer_type for transformer_type in transformer_types - if transformer_type in ["mesh_concepts"] + if transformer_type in ["mesh_concepts", "loc_concepts"] ] stream_destination_to_test = [ @@ -74,9 +86,7 @@ def build_test_matrix() -> Generator[tuple[LambdaEvent, list[MockResponseInput]] "stream_destination": stream_destination, "sample_size": 1, }, - mock_requests_lookup_table( - entity_type, stream_destination, transformer_type - ), + mock_requests_lookup_table(stream_destination, transformer_type), ) @@ -98,9 +108,7 @@ def test_lambda_handler( if transformer_type == "mesh_concepts": if destination == "void" or destination == "s3" or destination == "sns": - assert ( - len(MockRequest.calls) == 1 - ), f"Expected 1 request, got {len(MockRequest.calls)}, with config {entity_type}, {destination}: {MockRequest.calls}" + assert len(MockRequest.calls) == 1 request = MockRequest.calls[0] assert request["method"] == "GET" @@ -117,6 +125,26 @@ def test_lambda_handler( assert graph_request["method"] == "POST" assert graph_request["url"] == "https://test-host.com:8182/openCypher" + elif transformer_type == "loc_concepts": + if destination == "void" or destination == "s3" or 
destination == "sns": + assert len(MockRequest.calls) == 1 + request = MockRequest.calls[0] + + assert request["method"] == "GET" + assert request["url"] == LOC_SUBJECT_HEADINGS_URL + + elif destination == "graph": + assert len(MockRequest.calls) == 2 + loc_request = MockRequest.calls[0] + + assert loc_request["method"] == "GET" + assert loc_request["url"] == LOC_SUBJECT_HEADINGS_URL + + graph_request = MockRequest.calls[1] + + assert graph_request["method"] == "POST" + assert graph_request["url"] == "https://test-host.com:8182/openCypher" + else: raise ValueError( f"Unsupported entity_type: {entity_type}, destination: {destination}, transformer_type: {transformer_type}" diff --git a/src/test_mocks.py b/src/test_mocks.py index 4364decbf2..3623013d03 100644 --- a/src/test_mocks.py +++ b/src/test_mocks.py @@ -24,11 +24,11 @@ def reset_mocks() -> None: MockSmartOpen.file_lookup = {} @staticmethod - def get_mock_file(uri: str) -> io.StringIO: + def get_mock_file(uri: str) -> Any: return MockSmartOpen.file_lookup[uri] @staticmethod - def open(uri, mode, **kwargs: Any) -> Any: + def open(uri: str, mode: str, **kwargs: Any) -> Any: # Create an in-memory text stream mock_file = io.StringIO() @@ -71,7 +71,7 @@ def reset_mocks() -> None: def __init__(self) -> None: return - def publish_batch(self, TopicArn: str, PublishBatchRequestEntries: list): + def publish_batch(self, TopicArn: str, PublishBatchRequestEntries: list) -> Any: MockSNSClient.publish_batch_request_entries.append( { "TopicArn": TopicArn, @@ -98,6 +98,10 @@ def get_credentials(self) -> Credentials: return MOCK_CREDENTIALS +import gzip +from io import BufferedRandom + + class MockResponse: def __init__( self, @@ -108,6 +112,13 @@ def __init__( self.json_data = json_data self.status_code = status_code self.content = content + self.raw: Any = None + + # Assume raw content is gzipped + if content is not None: + self.raw = io.BytesIO(gzip.compress(content)) + else: + self.raw = None def json(self) -> dict | None: return self.json_data @@ -137,7 +148,6 @@ def clear_mock_responses() -> None: @staticmethod def clear_mock_calls() -> None: - print("Clearing mock calls") MockRequest.calls = [] @staticmethod @@ -184,7 +194,9 @@ def request(method: str, url: str, data: dict, headers: dict) -> MockResponse: raise Exception(f"Unexpected request: {method} {url}") @staticmethod - def get(url: str, data: dict = {}, headers: dict = {}) -> MockResponse: + def get( + url: str, stream: bool = False, data: dict = {}, headers: dict = {} + ) -> MockResponse: MockRequest.calls.append( {"method": "GET", "url": url, "data": data, "headers": headers} ) diff --git a/src/test_utils.py b/src/test_utils.py new file mode 100644 index 0000000000..7b34986fe9 --- /dev/null +++ b/src/test_utils.py @@ -0,0 +1,6 @@ +import os + + +def load_fixture(file_name: str) -> bytes: + with open(f"{os.path.dirname(__file__)}/fixtures/{file_name}", "rb") as f: + return f.read() From 4fd10e483f8c8aab7f8595128cbe0523dd0daca1 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Mon, 27 Jan 2025 13:36:37 +0000 Subject: [PATCH 165/310] cover all transformers --- src/config.py | 1 + src/fixtures/loc_names_example.jsonld | 2 + .../wikidata/linked_ontology_source.py | 7 +- src/sources/wikidata/sparql_client.py | 14 +- src/sources/wikidata/sparql_query_builder.py | 3 + src/test_extractor.py | 207 +++++++++--------- src/test_indexer.py | 1 + src/test_mocks.py | 25 ++- 8 files changed, 146 insertions(+), 114 deletions(-) create mode 100644 src/fixtures/loc_names_example.jsonld diff --git 
a/src/config.py b/src/config.py index 4761017706..486556df30 100644 --- a/src/config.py +++ b/src/config.py @@ -8,3 +8,4 @@ ) LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" MESH_URL = "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.gz" +WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql" diff --git a/src/fixtures/loc_names_example.jsonld b/src/fixtures/loc_names_example.jsonld new file mode 100644 index 0000000000..82e8f0f913 --- /dev/null +++ b/src/fixtures/loc_names_example.jsonld @@ -0,0 +1,2 @@ +{"@context": "http://v3/authorities/names/context.json", "@graph": [{"@id": "http://id.loc.gov/authorities/names/n79091094", "@type": "skos:Concept", "skos:changeNote": [{"@id": "_:ne7d4c5c8b1dc488887eee394c527b118b1"}, {"@id": "_:ne7d4c5c8b1dc488887eee394c527b118b2"}], "skos:exactMatch": {"@id": "http://viaf.org/viaf/sourceID/LC%7Cn++79091094#skos:Concept"}, "skos:inScheme": {"@id": "http://id.loc.gov/authorities/names"}, "skos:prefLabel": "Luterman, David, 1934-"}, {"@id": "_:ne7d4c5c8b1dc488887eee394c527b118b1", "@type": "cs:ChangeSet", "cs:changeReason": "revised", "cs:createdDate": {"@type": "xsd:dateTime", "@value": "2024-12-06T06:45:44"}, "cs:creatorName": {"@id": "http://id.loc.gov/vocabulary/organizations/nbisu"}, "cs:subjectOfChange": {"@id": "http://id.loc.gov/authorities/names/n79091094"}}, {"@id": "_:ne7d4c5c8b1dc488887eee394c527b118b2", "@type": "cs:ChangeSet", "cs:changeReason": "new", "cs:createdDate": {"@type": "xsd:dateTime", "@value": "1979-08-17T00:00:00"}, "cs:creatorName": {"@id": "http://id.loc.gov/vocabulary/organizations/dlc"}, "cs:subjectOfChange": {"@id": "http://id.loc.gov/authorities/names/n79091094"}}], "@id": "/authorities/names/n79091094"} +{"@context": "http://v3/authorities/names/context.json", "@graph": [{"@id": "http://id.loc.gov/authorities/names/n79091691", "@type": "skos:Concept", "skos:altLabel": [{"@language": "zxx-Latn", "@value": "Budapesht (Hungary)"}, {"@language": "zxx-Latn", "@value": "Budimpe\u0161ta (Hungary)"}, {"@language": "zxx-Latn", "@value": "Voudapest\u0113 (Hungary)"}], "skos:changeNote": [{"@id": "_:nf4e9b2ae104643a786e2362755a602f9b1"}, {"@id": "_:nf4e9b2ae104643a786e2362755a602f9b2"}], "skos:exactMatch": [{"@id": "http://id.worldcat.org/fast/1206109"}, {"@id": "http://viaf.org/viaf/sourceID/LC%7Cn++79091691#skos:Concept"}, {"@id": "http://vocab.getty.edu/tgn/7006280"}], "skos:inScheme": {"@id": "http://id.loc.gov/authorities/names"}, "skos:notation": {"@type": "http://id.loc.gov/datatypes/codes/gac", "@value": "e-hu---"}, "skos:prefLabel": "Budapest (Hungary)", "skos:semanticRelation": [{"@id": "http://id.loc.gov/authorities/names/n79091692"}, {"@id": "http://id.loc.gov/authorities/names/n79091693"}, {"@id": "http://id.loc.gov/authorities/names/n79091694"}], "skosxl:altLabel": [{"@id": "_:nf4e9b2ae104643a786e2362755a602f9b3"}, {"@id": "_:nf4e9b2ae104643a786e2362755a602f9b4"}, {"@id": "_:nf4e9b2ae104643a786e2362755a602f9b5"}]}, {"@id": "_:nf4e9b2ae104643a786e2362755a602f9b1", "@type": "cs:ChangeSet", "cs:changeReason": "new", "cs:createdDate": {"@type": "xsd:dateTime", "@value": "1980-02-12T00:00:00"}, "cs:creatorName": {"@id": "http://id.loc.gov/vocabulary/organizations/dlc"}, "cs:subjectOfChange": {"@id": "http://id.loc.gov/authorities/names/n79091691"}}, {"@id": "_:nf4e9b2ae104643a786e2362755a602f9b2", "@type": "cs:ChangeSet", "cs:changeReason": "revised", "cs:createdDate": {"@type": "xsd:dateTime", "@value": "2024-12-06T06:45:44"}, "cs:creatorName": {"@id": 
"http://id.loc.gov/vocabulary/organizations/wau"}, "cs:subjectOfChange": {"@id": "http://id.loc.gov/authorities/names/n79091691"}}, {"@id": "_:nf4e9b2ae104643a786e2362755a602f9b3", "@type": "skosxl:Label", "skosxl:literalForm": {"@language": "zxx-Latn", "@value": "Budapesht (Hungary)"}}, {"@id": "_:nf4e9b2ae104643a786e2362755a602f9b4", "@type": "skosxl:Label", "skosxl:literalForm": {"@language": "zxx-Latn", "@value": "Budimpe\u0161ta (Hungary)"}}, {"@id": "_:nf4e9b2ae104643a786e2362755a602f9b5", "@type": "skosxl:Label", "skosxl:literalForm": {"@language": "zxx-Latn", "@value": "Voudapest\u0113 (Hungary)"}}, {"@id": "http://id.loc.gov/authorities/names/n79091692", "@type": "skos:Concept", "skos:prefLabel": "\u00d3buda (Hungary)"}, {"@id": "http://id.loc.gov/authorities/names/n79091694", "@type": "skos:Concept", "skos:prefLabel": "Pest (Hungary)"}, {"@id": "http://id.loc.gov/authorities/names/n79091693", "@type": "skos:Concept", "skos:prefLabel": "Buda (Hungary)"}], "@id": "/authorities/names/n79091691"} \ No newline at end of file diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 06d779a5a0..b0780e5942 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -5,10 +5,11 @@ from utils.streaming import process_stream_in_parallel from .linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker -from .sparql_client import MAX_PARALLEL_SPARQL_QUERIES, WikidataSparqlClient +from .sparql_client import SPARQL_MAX_PARALLEL_QUERIES, WikidataSparqlClient from .sparql_query_builder import NodeType, OntologyType, SparqlQueryBuilder SPARQL_ITEMS_CHUNK_SIZE = 400 + WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" @@ -96,7 +97,7 @@ def _stream_raw_edges(self) -> Generator[dict]: all_ids, self._get_linked_id_mappings, SPARQL_ITEMS_CHUNK_SIZE, - MAX_PARALLEL_SPARQL_QUERIES, + SPARQL_MAX_PARALLEL_QUERIES, ): linked_id = raw_mapping["linkedId"]["value"] wikidata_id = extract_wikidata_id(raw_mapping) @@ -122,7 +123,7 @@ def _stream_raw_nodes(self) -> Generator[dict]: all_ids, self._get_linked_items, SPARQL_ITEMS_CHUNK_SIZE, - MAX_PARALLEL_SPARQL_QUERIES, + SPARQL_MAX_PARALLEL_QUERIES, ) def stream_raw(self) -> Generator[dict]: diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index 975a6b5d66..d4b9b14b22 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -5,10 +5,14 @@ import backoff import requests +from config import WIKIDATA_SPARQL_URL + # Wikidata limits the number of parallel queries from a single IP address to 5. # See: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits # However, experimentally, running more than 4 queries in parallel consistently results in '429 Too Many Requests' errors. 
-MAX_PARALLEL_SPARQL_QUERIES = 4 +SPARQL_MAX_PARALLEL_QUERIES = 4 +SPARQL_BACKOFF_DEFAULT_RETRIES = 3 +SPARQL_BACKOFF_DEFAULT_INTERVAL = 10 def on_request_backoff(backoff_details: typing.Any) -> None: @@ -23,7 +27,7 @@ class WikidataSparqlClient: """ def __init__(self) -> None: - self.parallel_query_semaphore = threading.Semaphore(MAX_PARALLEL_SPARQL_QUERIES) + self.parallel_query_semaphore = threading.Semaphore(SPARQL_MAX_PARALLEL_QUERIES) self.too_many_requests = False self.too_many_requests_lock = threading.Lock() @@ -41,8 +45,8 @@ def _get_user_agent_header() -> str: @backoff.on_exception( backoff.constant, Exception, - max_tries=3, - interval=10, + max_tries=SPARQL_BACKOFF_DEFAULT_RETRIES, + interval=SPARQL_BACKOFF_DEFAULT_INTERVAL, on_backoff=on_request_backoff, ) def run_query(self, query: str) -> list[dict]: @@ -57,7 +61,7 @@ def run_query(self, query: str) -> list[dict]: # Use a semaphore to throttle the number of parallel requests with self.parallel_query_semaphore: r = requests.get( - "https://query.wikidata.org/sparql", + WIKIDATA_SPARQL_URL, params={"format": "json", "query": query}, headers={"User-Agent": self._get_user_agent_header()}, ) diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index 238a158be2..f886472913 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -71,6 +71,9 @@ def get_all_ids_query(linked_ontology: OntologyType) -> str: }} """ + # Remove all line breaks and extra spaces + get_ids_query = " ".join(get_ids_query.split()) + return get_ids_query @classmethod diff --git a/src/test_extractor.py b/src/test_extractor.py index f5cfaf49ef..11d145cee8 100644 --- a/src/test_extractor.py +++ b/src/test_extractor.py @@ -3,82 +3,105 @@ import pytest from typing_extensions import get_args -from config import LOC_SUBJECT_HEADINGS_URL, MESH_URL +from config import ( + LOC_NAMES_URL, + LOC_SUBJECT_HEADINGS_URL, + MESH_URL, + WIKIDATA_SPARQL_URL, +) from extractor import LambdaEvent, lambda_handler -from test_mocks import MockRequest, MockResponseInput +from test_mocks import MOCK_INSTANCE_ENDPOINT, MockRequest, MockResponseInput from test_utils import load_fixture from transformers.base_transformer import EntityType, StreamDestination from transformers.create_transformer import TransformerType +transformer_types = get_args(TransformerType) +entity_types = get_args(EntityType) +stream_destinations = get_args(StreamDestination) + def mock_requests_lookup_table( destination: StreamDestination, transformer_type: TransformerType, ) -> Any: - mock_mesh_retrieval = { - "method": "GET", - "url": MESH_URL, - "status_code": 200, - "content_bytes": load_fixture("mesh_example.xml"), - "json_data": None, - } - mock_loc_subjects_retrieval = { - "method": "GET", - "url": LOC_SUBJECT_HEADINGS_URL, - "status_code": 200, - "content_bytes": load_fixture("loc_subjects_example.jsonld"), - "json_data": None, - } + mocked_responses: list[dict] = [] - mock_graph_post = { - "method": "POST", - "url": "https://test-host.com:8182/openCypher", - "status_code": 200, - "content_bytes": None, - "json_data": {"results": {}}, - } + if destination == "graph": + mocked_responses.append( + { + "method": "POST", + "url": f"https://{MOCK_INSTANCE_ENDPOINT}:8182/openCypher", + "content_bytes": None, + "json_data": {"results": {}}, + } + ) - if transformer_type == "mesh_concepts": - if destination == "s3" or destination == "void" or destination == "sns": - return [mock_mesh_retrieval] - elif 
destination == "graph": - return [mock_mesh_retrieval, mock_graph_post] - elif transformer_type == "loc_concepts": - if destination == "s3" or destination == "void" or destination == "sns": - return [mock_loc_subjects_retrieval] - elif destination == "graph": - return [mock_loc_subjects_retrieval, mock_graph_post] - - raise ValueError( - f"Unsupported destination: {destination}, transformer_type: {transformer_type}" - ) + if transformer_type in ["mesh_concepts", "mesh_locations"]: + mocked_responses.append( + { + "method": "GET", + "url": MESH_URL, + "content_bytes": load_fixture("mesh_example.xml"), + } + ) + elif transformer_type in ["loc_concepts", "loc_locations", "loc_names"]: + mocked_responses.append( + { + "method": "GET", + "url": LOC_SUBJECT_HEADINGS_URL, + "content_bytes": load_fixture("loc_subjects_example.jsonld"), + } + ) + mocked_responses.append( + { + "method": "GET", + "url": LOC_NAMES_URL, + "content_bytes": load_fixture("loc_names_example.jsonld"), + } + ) + elif transformer_type in [ + "wikidata_linked_loc_names", + "wikidata_linked_loc_concepts", + "wikidata_linked_loc_locations", + "wikidata_linked_loc_names", + ]: + mocked_responses.append( + { + "method": "GET", + "url": WIKIDATA_SPARQL_URL, + "params": { + "format": "json", + "query": "SELECT ?item WHERE { ?item wdt:P244 _:anyValueP244. }", + }, + "content_bytes": None, + "json_data": {"results": {"bindings": []}}, + } + ) + elif transformer_type in [ + "wikidata_linked_mesh_concepts", + "wikidata_linked_mesh_locations", + ]: + mocked_responses.append( + { + "method": "GET", + "url": WIKIDATA_SPARQL_URL, + "params": { + "format": "json", + "query": "SELECT ?item WHERE { ?item wdt:P486 _:anyValueP486. }", + }, + "content_bytes": None, + "json_data": {"results": {"bindings": []}}, + } + ) + + return mocked_responses def build_test_matrix() -> Generator[tuple[LambdaEvent, list[MockResponseInput]], Any]: - transformer_types = get_args(TransformerType) - entity_types = get_args(EntityType) - stream_destinations = get_args(StreamDestination) - - transformer_types_to_test = [ - transformer_type - for transformer_type in transformer_types - if transformer_type in ["mesh_concepts", "loc_concepts"] - ] - - stream_destination_to_test = [ - stream_destination - for stream_destination in stream_destinations - if stream_destination in ["graph", "s3", "void", "sns"] - ] - - entity_type_to_test = [ - entity_type for entity_type in entity_types if entity_type in ["nodes", "edges"] - ] - - for transformer_type in transformer_types_to_test: - for entity_type in entity_type_to_test: - for stream_destination in stream_destination_to_test: + for transformer_type in transformer_types: + for entity_type in entity_types: + for stream_destination in stream_destinations: yield ( { "transformer_type": transformer_type, @@ -106,46 +129,30 @@ def test_lambda_handler( entity_type = lambda_event["entity_type"] destination = lambda_event["stream_destination"] - if transformer_type == "mesh_concepts": - if destination == "void" or destination == "s3" or destination == "sns": - assert len(MockRequest.calls) == 1 - request = MockRequest.calls[0] - - assert request["method"] == "GET" - assert request["url"] == MESH_URL - - elif destination == "graph": - assert len(MockRequest.calls) == 2 - mesh_request = MockRequest.calls[0] - - assert mesh_request["method"] == "GET" - assert mesh_request["url"] == MESH_URL - - graph_request = MockRequest.calls[1] - - assert graph_request["method"] == "POST" - assert graph_request["url"] == 
"https://test-host.com:8182/openCypher" - elif transformer_type == "loc_concepts": - if destination == "void" or destination == "s3" or destination == "sns": - assert len(MockRequest.calls) == 1 - request = MockRequest.calls[0] - - assert request["method"] == "GET" - assert request["url"] == LOC_SUBJECT_HEADINGS_URL - - elif destination == "graph": - assert len(MockRequest.calls) == 2 - loc_request = MockRequest.calls[0] - - assert loc_request["method"] == "GET" - assert loc_request["url"] == LOC_SUBJECT_HEADINGS_URL + concept_retrieval_url_lookup = { + "mesh_concepts": [MESH_URL], + "mesh_locations": [MESH_URL], + "loc_concepts": [LOC_SUBJECT_HEADINGS_URL], + "loc_locations": [LOC_NAMES_URL, LOC_SUBJECT_HEADINGS_URL], + "loc_names": [LOC_NAMES_URL], + "wikidata_linked_loc_names": [WIKIDATA_SPARQL_URL], + "wikidata_linked_loc_concepts": [WIKIDATA_SPARQL_URL], + "wikidata_linked_loc_locations": [WIKIDATA_SPARQL_URL], + "wikidata_linked_mesh_concepts": [WIKIDATA_SPARQL_URL], + "wikidata_linked_mesh_locations": [WIKIDATA_SPARQL_URL], + } - graph_request = MockRequest.calls[1] + assert transformer_type in transformer_types + assert destination in stream_destinations + assert entity_type in entity_types - assert graph_request["method"] == "POST" - assert graph_request["url"] == "https://test-host.com:8182/openCypher" + concept_retrieval_urls = concept_retrieval_url_lookup[transformer_type] + called_urls = [call["url"] for call in MockRequest.calls] - else: - raise ValueError( - f"Unsupported entity_type: {entity_type}, destination: {destination}, transformer_type: {transformer_type}" - ) + assert all( + concept_retrieval_url in called_urls + for concept_retrieval_url in concept_retrieval_urls + ), ( + f"Unexpected requests found for ({transformer_type}, {entity_type}, {destination}): " + + f"Expected concept retrieval URLs: {concept_retrieval_urls}, got: {called_urls}" + ) diff --git a/src/test_indexer.py b/src/test_indexer.py index 8ed9ba7bc1..830f0ced87 100644 --- a/src/test_indexer.py +++ b/src/test_indexer.py @@ -13,6 +13,7 @@ def test_lambda_handler() -> None: "status_code": 200, "json_data": {"results": {"foo": "bar"}}, "content_bytes": None, + "params": None, } ] ) diff --git a/src/test_mocks.py b/src/test_mocks.py index 3623013d03..cf418f2fc0 100644 --- a/src/test_mocks.py +++ b/src/test_mocks.py @@ -105,7 +105,7 @@ def get_credentials(self) -> Credentials: class MockResponse: def __init__( self, - status_code: int, + status_code: int | None = None, json_data: dict | None = None, content: bytes | None = None, ) -> None: @@ -127,6 +127,7 @@ def json(self) -> dict | None: class MockRequestExpectation(TypedDict): method: str url: str + params: dict | None response: MockResponse @@ -134,6 +135,7 @@ class MockResponseInput(TypedDict): method: str url: str status_code: int + params: dict | None content_bytes: bytes | None json_data: dict | None @@ -159,7 +161,8 @@ def reset_mocks() -> None: def mock_response( method: str, url: str, - status_code: int, + status_code: int | None = None, + params: dict | None = None, json_data: dict | None = None, content_bytes: bytes | None = None, ) -> None: @@ -167,6 +170,7 @@ def mock_response( { "method": method, "url": url, + "params": params, "response": MockResponse(status_code, json_data, content_bytes), } ) @@ -177,7 +181,8 @@ def mock_responses(responses: list[MockResponseInput]) -> None: MockRequest.mock_response( response["method"], response["url"], - response["status_code"], + response.get("status_code", 200), + response.get("params"), 
response.get("json_data"), response.get("content_bytes"), ) @@ -195,13 +200,21 @@ def request(method: str, url: str, data: dict, headers: dict) -> MockResponse: @staticmethod def get( - url: str, stream: bool = False, data: dict = {}, headers: dict = {} + url: str, + stream: bool = False, + data: dict = {}, + headers: dict = {}, + params: dict | None = None, ) -> MockResponse: MockRequest.calls.append( {"method": "GET", "url": url, "data": data, "headers": headers} ) for response in MockRequest.responses: - if response["method"] == "GET" and response["url"] == url: + if ( + response["method"] == "GET" + and response["url"] == url + and response["params"] == params + ): return response["response"] - raise Exception(f"Unexpected request: GET {url}") + raise Exception(f"Unexpected request: GET {url}, params: {params}") From 3f21ef16610e361cc5b9c3b05e62582b0b07de0f Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 29 Jan 2025 11:31:13 +0000 Subject: [PATCH 166/310] Remove mutable default args MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://docs.quantifiedcode.com/python-anti-patterns/correctness/mutable_default_value_as_argument.html Co-Authored-By: Štěpán Brychta --- src/test_mocks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test_mocks.py b/src/test_mocks.py index cf418f2fc0..8e8b6f4c87 100644 --- a/src/test_mocks.py +++ b/src/test_mocks.py @@ -202,8 +202,8 @@ def request(method: str, url: str, data: dict, headers: dict) -> MockResponse: def get( url: str, stream: bool = False, - data: dict = {}, - headers: dict = {}, + data: dict | None = None, + headers: dict | None = None, params: dict | None = None, ) -> MockResponse: MockRequest.calls.append( From e01c9f1d0dd75254f33476b4141d24a3de774df0 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 29 Jan 2025 11:45:33 +0000 Subject: [PATCH 167/310] Re-use request function from GET MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Štěpán Brychta --- src/test_mocks.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/test_mocks.py b/src/test_mocks.py index 8e8b6f4c87..e32a42a99c 100644 --- a/src/test_mocks.py +++ b/src/test_mocks.py @@ -188,18 +188,8 @@ def mock_responses(responses: list[MockResponseInput]) -> None: ) @staticmethod - def request(method: str, url: str, data: dict, headers: dict) -> MockResponse: - MockRequest.calls.append( - {"method": method, "url": url, "data": data, "headers": headers} - ) - for response in MockRequest.responses: - if response["method"] == method and response["url"] == url: - return response["response"] - - raise Exception(f"Unexpected request: {method} {url}") - - @staticmethod - def get( + def request( + method: str, url: str, stream: bool = False, data: dict | None = None, @@ -207,14 +197,24 @@ def get( params: dict | None = None, ) -> MockResponse: MockRequest.calls.append( - {"method": "GET", "url": url, "data": data, "headers": headers} + {"method": method, "url": url, "data": data, "headers": headers} ) for response in MockRequest.responses: if ( - response["method"] == "GET" + response["method"] == method and response["url"] == url and response["params"] == params ): return response["response"] - raise Exception(f"Unexpected request: GET {url}, params: {params}") + raise Exception(f"Unexpected request: {method} {url}") + + @staticmethod + def get( + url: str, + stream: bool = False, + data: dict = {}, + 
headers: dict = {}, + params: dict | None = None, + ) -> MockResponse: + return MockRequest.request("GET", url, stream, data, headers, params) From 64de1dbe7353121c725c5a1ce7de4ae573b0244c Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 29 Jan 2025 11:49:12 +0000 Subject: [PATCH 168/310] always use named params --- src/test_mocks.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/test_mocks.py b/src/test_mocks.py index e32a42a99c..d5d7021fba 100644 --- a/src/test_mocks.py +++ b/src/test_mocks.py @@ -179,12 +179,12 @@ def mock_responses(responses: list[MockResponseInput]) -> None: for response in responses: MockRequest.mock_response( - response["method"], - response["url"], - response.get("status_code", 200), - response.get("params"), - response.get("json_data"), - response.get("content_bytes"), + method = response["method"], + url = response["url"], + status_code = response.get("status_code", 200), + params = response.get("params"), + json_data = response.get("json_data"), + content_bytes = response.get("content_bytes"), ) @staticmethod From b8147b1cc437daa587ba0bee1c9e260a11ba4d21 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 29 Jan 2025 11:52:05 +0000 Subject: [PATCH 169/310] Expand readme --- README.md | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e4af8792bd..af3fc4988b 100644 --- a/README.md +++ b/README.md @@ -129,8 +129,27 @@ result = neptune_client.run_open_cypher_query(query) print(result) ``` -Additionally, it is possible to connect to the cluster using [AWS graph notebook](https://github.com/aws/graph-notebook) -with the following configuration: +### AWS Graph Notebook + +Additionally, it is possible to connect to the cluster using [AWS graph notebook](https://github.com/aws/graph-notebook). The most straightforward option to do this locally is using [JupyterLab](https://jupyter.org/). To install `graph-notebook` and `jupyterlab` (note that this requires Python 3.9.x-3.10.14): + +``` +# install graph-notebook +pip install graph-notebook + +# install jupyterlab +pip install "jupyterlab>=3,<4" + +# aws graph-notebook comes with some example notebooks; this creates copies in the notebooks directory +python -m graph_notebook.notebooks.install --destination notebooks +``` + +Run the following command to open JupyterLab in your browser: + +`python -m graph_notebook.start_jupyterlab --jupyter-dir notebooks` + + +To connect to the catalogue graph, add the following configuration into your Jupyter notebook: ``` %%graph_notebook_config { @@ -152,3 +171,4 @@ Jupyter notebook: %env AWS_PROFILE=platform-developer ``` +You can find an [example notebook](notebooks/graph_exploration.ipynb) in the notebooks folder with openCypher queries to explore the catalogue graph. 
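+
+For example, once the configuration cell has run, you can execute an openCypher query from any notebook cell using the `%%oc` cell magic provided by graph-notebook. The query below is a minimal sketch for checking the connection; the `SourceConcept` label and the `id`/`label` properties are assumptions based on the extractor tests, so adjust them to match whatever has been loaded into the graph:
+
+```
+%%oc
+// SourceConcept, id and label are assumed names; change them to match the loaded data
+MATCH (c:SourceConcept)
+RETURN c.id, c.label
+LIMIT 10
+```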
From 8f0ca08ad4dd31e3aa54128b6a055bdb72a3e37c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Wed, 29 Jan 2025 11:53:23 +0000 Subject: [PATCH 170/310] Add comment --- src/sources/wikidata/linked_ontology_source.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index caa3f51a05..9a3d19cf62 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -134,6 +134,8 @@ def _stream_filtered_wikidata_ids(self) -> Generator[str]: wikidata_id = item["wikidata_id"] linked_id = item["linked_id"] if self.id_type_checker.id_is_valid(linked_id) and wikidata_id not in seen: + # Add Wikidata id to `seen` no matter if it's part of the selected node type + # to make sure it is not processed again as a parent below. seen.add(wikidata_id) if self.id_type_checker.id_included_in_selected_type(linked_id): From 073a00631acccd119c017e1e9e7e745ae6079825 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 29 Jan 2025 11:53:39 +0000 Subject: [PATCH 171/310] compact format all queries --- src/sources/wikidata/sparql_query_builder.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index f886472913..265e57e045 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -9,6 +9,13 @@ class SparqlQueryBuilder: Contains various methods for constructing reusable SPARQL queries to run against Wikidata's SPARQL endpoint. """ + @staticmethod + def _compact_format_query(query: str) -> str: + """ + Remove all line breaks and extra spaces from the query. 
+ """ + return " ".join(query.split()) + @staticmethod def _get_formatted_fields(node_type: NodeType) -> str: """ @@ -71,10 +78,7 @@ def get_all_ids_query(linked_ontology: OntologyType) -> str: }} """ - # Remove all line breaks and extra spaces - get_ids_query = " ".join(get_ids_query.split()) - - return get_ids_query + return SparqlQueryBuilder._compact_format_query(get_ids_query) @classmethod def get_items_query(cls, item_ids: list[str], node_type: NodeType) -> str: @@ -101,7 +105,7 @@ def get_items_query(cls, item_ids: list[str], node_type: NodeType) -> str: GROUP BY ?item """ - return query + return SparqlQueryBuilder._compact_format_query(query) @classmethod def get_linked_ids_query( @@ -128,4 +132,4 @@ def get_linked_ids_query( }} """ - return query + return SparqlQueryBuilder._compact_format_query(query) From 9dc9fcd97faf3082227a52fb873b9fa2667a0510 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 29 Jan 2025 11:54:48 +0000 Subject: [PATCH 172/310] Move imports to top MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Štěpán Brychta --- src/test_mocks.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/test_mocks.py b/src/test_mocks.py index d5d7021fba..fbc956ced6 100644 --- a/src/test_mocks.py +++ b/src/test_mocks.py @@ -1,5 +1,5 @@ +import gzip import io -import tempfile from typing import Any, TypedDict from botocore.credentials import Credentials @@ -98,10 +98,6 @@ def get_credentials(self) -> Credentials: return MOCK_CREDENTIALS -import gzip -from io import BufferedRandom - - class MockResponse: def __init__( self, From b7dd062e7b6a1c7fcf7380118eaec4b1c660ae30 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 29 Jan 2025 11:55:04 +0000 Subject: [PATCH 173/310] autoformat --- src/test_mocks.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/test_mocks.py b/src/test_mocks.py index fbc956ced6..8de7642de3 100644 --- a/src/test_mocks.py +++ b/src/test_mocks.py @@ -175,12 +175,12 @@ def mock_response( def mock_responses(responses: list[MockResponseInput]) -> None: for response in responses: MockRequest.mock_response( - method = response["method"], - url = response["url"], - status_code = response.get("status_code", 200), - params = response.get("params"), - json_data = response.get("json_data"), - content_bytes = response.get("content_bytes"), + method=response["method"], + url=response["url"], + status_code=response.get("status_code", 200), + params=response.get("params"), + json_data=response.get("json_data"), + content_bytes=response.get("content_bytes"), ) @staticmethod From 6bfa17354685c97b68ad378ac458b0954c503ca3 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 29 Jan 2025 11:57:27 +0000 Subject: [PATCH 174/310] Make test names more meaningful Co-authored-by: Paul Butcher --- src/test_extractor.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/test_extractor.py b/src/test_extractor.py index 11d145cee8..a9e72b4573 100644 --- a/src/test_extractor.py +++ b/src/test_extractor.py @@ -113,9 +113,16 @@ def build_test_matrix() -> Generator[tuple[LambdaEvent, list[MockResponseInput]] ) +def get_test_id(argvalue: Any) -> str: + if isinstance(argvalue, list): + return "" + return f"{argvalue['transformer_type']}-{argvalue['entity_type']}-{argvalue['stream_destination']}" + + @pytest.mark.parametrize( "lambda_event, mock_responses", build_test_matrix(), + ids=get_test_id, ) def test_lambda_handler( lambda_event: LambdaEvent, From 
652cd4da1323b14f6e8e113807a57b048daa923a Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 29 Jan 2025 12:28:06 +0000 Subject: [PATCH 175/310] unpack dict for params MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Štěpán Brychta --- src/test_mocks.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/test_mocks.py b/src/test_mocks.py index 8de7642de3..4c95cb652e 100644 --- a/src/test_mocks.py +++ b/src/test_mocks.py @@ -157,7 +157,7 @@ def reset_mocks() -> None: def mock_response( method: str, url: str, - status_code: int | None = None, + status_code: int = 200, params: dict | None = None, json_data: dict | None = None, content_bytes: bytes | None = None, @@ -174,14 +174,7 @@ def mock_response( @staticmethod def mock_responses(responses: list[MockResponseInput]) -> None: for response in responses: - MockRequest.mock_response( - method=response["method"], - url=response["url"], - status_code=response.get("status_code", 200), - params=response.get("params"), - json_data=response.get("json_data"), - content_bytes=response.get("content_bytes"), - ) + MockRequest.mock_response(**response) @staticmethod def request( From c67770cf446ae7e8c1722e3b745369cbb9db58c7 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 29 Jan 2025 12:28:36 +0000 Subject: [PATCH 176/310] oopsie forget to remove mutable params MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Štěpán Brychta --- src/test_mocks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test_mocks.py b/src/test_mocks.py index 4c95cb652e..f2bfc4e8a2 100644 --- a/src/test_mocks.py +++ b/src/test_mocks.py @@ -202,8 +202,8 @@ def request( def get( url: str, stream: bool = False, - data: dict = {}, - headers: dict = {}, + data: dict | None = None, + headers: dict | None = None, params: dict | None = None, ) -> MockResponse: return MockRequest.request("GET", url, stream, data, headers, params) From fc302d5986651d4d25fa7aca5889bcce4045d893 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Mon, 27 Jan 2025 15:01:03 +0000 Subject: [PATCH 177/310] fix formatting and typechecking --- src/mypy.ini => mypy.ini | 1 + pytest.ini | 1 + scripts/autoformat.sh | 4 ++-- scripts/typecheck.sh | 4 ++-- {src => tests}/conftest.py | 1 - {src => tests}/fixtures/loc_names_example.jsonld | 0 {src => tests}/fixtures/loc_subjects_example.jsonld | 0 {src => tests}/fixtures/mesh_example.xml | 0 {src => tests}/test_extractor.py | 4 ++-- {src => tests}/test_indexer.py | 3 ++- {src => tests}/test_mocks.py | 2 +- {src => tests}/test_utils.py | 0 {src/utils => tests}/test_xml.py | 0 13 files changed, 11 insertions(+), 9 deletions(-) rename src/mypy.ini => mypy.ini (95%) rename {src => tests}/conftest.py (99%) rename {src => tests}/fixtures/loc_names_example.jsonld (100%) rename {src => tests}/fixtures/loc_subjects_example.jsonld (100%) rename {src => tests}/fixtures/mesh_example.xml (100%) rename {src => tests}/test_extractor.py (100%) rename {src => tests}/test_indexer.py (99%) rename {src => tests}/test_mocks.py (100%) rename {src => tests}/test_utils.py (100%) rename {src/utils => tests}/test_xml.py (100%) diff --git a/src/mypy.ini b/mypy.ini similarity index 95% rename from src/mypy.ini rename to mypy.ini index 7bb317dbbb..d1a7822597 100644 --- a/src/mypy.ini +++ b/mypy.ini @@ -10,6 +10,7 @@ warn_no_return = True warn_return_any = True warn_unreachable = True warn_unused_configs = True +mypy_path = src 
[mypy-smart_open.*] ignore_missing_imports = True diff --git a/pytest.ini b/pytest.ini index 4d6e9787d6..81863f4d48 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,3 @@ [pytest] filterwarnings = ignore::DeprecationWarning +pythonpath = src \ No newline at end of file diff --git a/scripts/autoformat.sh b/scripts/autoformat.sh index b022ef4032..d4e8b0e564 100755 --- a/scripts/autoformat.sh +++ b/scripts/autoformat.sh @@ -13,8 +13,8 @@ CHECK=${1:-} if [ "$CHECK" == "--check" ]; then echo "Checking code formatting (run ./scripts/autoformat.sh to fix any issues!)..." - black --check src/ - isort --profile=black --check src/ + black --check src/ tests/ + isort --profile=black --check src/ tests/ else echo "Formatting code ..." black src/ diff --git a/scripts/typecheck.sh b/scripts/typecheck.sh index 92f27f6f2b..8087ce7500 100755 --- a/scripts/typecheck.sh +++ b/scripts/typecheck.sh @@ -9,5 +9,5 @@ ROOT+="$(dirname "$DIR")" # change working directory to the root of the project cd "$ROOT" -echo "Type checking code ..." -mypy --config-file src/mypy.ini src/ \ No newline at end of file +mypy --config-file ./mypy.ini src/ +mypy --config-file ./mypy.ini tests/ diff --git a/src/conftest.py b/tests/conftest.py similarity index 99% rename from src/conftest.py rename to tests/conftest.py index 96c15aeb1f..ec534c6a0f 100644 --- a/src/conftest.py +++ b/tests/conftest.py @@ -2,7 +2,6 @@ import pytest from _pytest.monkeypatch import MonkeyPatch - from test_mocks import MockBoto3Session, MockRequest, MockSmartOpen, MockSNSClient diff --git a/src/fixtures/loc_names_example.jsonld b/tests/fixtures/loc_names_example.jsonld similarity index 100% rename from src/fixtures/loc_names_example.jsonld rename to tests/fixtures/loc_names_example.jsonld diff --git a/src/fixtures/loc_subjects_example.jsonld b/tests/fixtures/loc_subjects_example.jsonld similarity index 100% rename from src/fixtures/loc_subjects_example.jsonld rename to tests/fixtures/loc_subjects_example.jsonld diff --git a/src/fixtures/mesh_example.xml b/tests/fixtures/mesh_example.xml similarity index 100% rename from src/fixtures/mesh_example.xml rename to tests/fixtures/mesh_example.xml diff --git a/src/test_extractor.py b/tests/test_extractor.py similarity index 100% rename from src/test_extractor.py rename to tests/test_extractor.py index a9e72b4573..7e1dba6c8f 100644 --- a/src/test_extractor.py +++ b/tests/test_extractor.py @@ -1,6 +1,8 @@ from typing import Any, Generator import pytest +from test_mocks import MOCK_INSTANCE_ENDPOINT, MockRequest, MockResponseInput +from test_utils import load_fixture from typing_extensions import get_args from config import ( @@ -10,8 +12,6 @@ WIKIDATA_SPARQL_URL, ) from extractor import LambdaEvent, lambda_handler -from test_mocks import MOCK_INSTANCE_ENDPOINT, MockRequest, MockResponseInput -from test_utils import load_fixture from transformers.base_transformer import EntityType, StreamDestination from transformers.create_transformer import TransformerType diff --git a/src/test_indexer.py b/tests/test_indexer.py similarity index 99% rename from src/test_indexer.py rename to tests/test_indexer.py index 830f0ced87..8c26a129c6 100644 --- a/src/test_indexer.py +++ b/tests/test_indexer.py @@ -1,8 +1,9 @@ import json -from indexer import lambda_handler from test_mocks import MockRequest +from indexer import lambda_handler + def test_lambda_handler() -> None: MockRequest.mock_responses( diff --git a/src/test_mocks.py b/tests/test_mocks.py similarity index 100% rename from src/test_mocks.py rename to tests/test_mocks.py 
index f2bfc4e8a2..95064edaa1 100644 --- a/src/test_mocks.py +++ b/tests/test_mocks.py @@ -3,8 +3,8 @@ from typing import Any, TypedDict from botocore.credentials import Credentials - from test_utils import load_fixture + from utils.aws import INSTANCE_ENDPOINT_SECRET_NAME, LOAD_BALANCER_SECRET_NAME MOCK_API_KEY = "TEST_SECRET_API_KEY_123" diff --git a/src/test_utils.py b/tests/test_utils.py similarity index 100% rename from src/test_utils.py rename to tests/test_utils.py diff --git a/src/utils/test_xml.py b/tests/test_xml.py similarity index 100% rename from src/utils/test_xml.py rename to tests/test_xml.py From 24e40b2964b7b8566aad18ab678be95f69e16d68 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Mon, 27 Jan 2025 15:14:00 +0000 Subject: [PATCH 178/310] don't double specify src --- scripts/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/test.sh b/scripts/test.sh index 52bdff7cf8..f798153749 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -6,6 +6,6 @@ set -o nounset DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" ROOT+="$(dirname "$DIR")" -pytest --cov src ./src \ +pytest --cov src \ --cov-report term \ --cov-report xml:coverage.xml \ No newline at end of file From 64ee8e55b9f06d576f69f8b3844d275f0e2f3422 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Mon, 27 Jan 2025 15:01:03 +0000 Subject: [PATCH 179/310] fix formatting and typechecking --- scripts/typecheck.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/typecheck.sh b/scripts/typecheck.sh index 8087ce7500..9157c7d7cc 100755 --- a/scripts/typecheck.sh +++ b/scripts/typecheck.sh @@ -10,4 +10,8 @@ ROOT+="$(dirname "$DIR")" cd "$ROOT" mypy --config-file ./mypy.ini src/ +<<<<<<< HEAD mypy --config-file ./mypy.ini tests/ +======= +mypy --config-file ./mypy.ini tests/ +>>>>>>> aa95658 (fix formatting and typechecking) From db1d04b1c4be17119e25cb53b1dd5f92532617a8 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Mon, 27 Jan 2025 16:11:19 +0000 Subject: [PATCH 180/310] Add simple mesh source and transformer tests --- scripts/typecheck.sh | 4 --- src/extractor.py | 3 +- src/transformers/base_transformer.py | 8 ++--- tests/test_mesh_concepts_source.py | 30 ++++++++++++++++++ tests/test_mesh_concepts_transformer.py | 41 +++++++++++++++++++++++++ 5 files changed, 77 insertions(+), 9 deletions(-) create mode 100644 tests/test_mesh_concepts_source.py create mode 100644 tests/test_mesh_concepts_transformer.py diff --git a/scripts/typecheck.sh b/scripts/typecheck.sh index 9157c7d7cc..8087ce7500 100755 --- a/scripts/typecheck.sh +++ b/scripts/typecheck.sh @@ -10,8 +10,4 @@ ROOT+="$(dirname "$DIR")" cd "$ROOT" mypy --config-file ./mypy.ini src/ -<<<<<<< HEAD mypy --config-file ./mypy.ini tests/ -======= -mypy --config-file ./mypy.ini tests/ ->>>>>>> aa95658 (fix formatting and typechecking) diff --git a/src/extractor.py b/src/extractor.py index de5c2e7e52..293044c920 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -52,7 +52,8 @@ def handler( config.GRAPH_QUERIES_SNS_TOPIC_ARN, entity_type, CHUNK_SIZE, sample_size ) elif stream_destination == "void": - transformer.stream_to_nowhere(entity_type, CHUNK_SIZE, sample_size) + for _ in transformer.stream(entity_type, CHUNK_SIZE, sample_size): + pass else: raise ValueError("Unsupported stream destination.") diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 3839e7ec38..8a40a5c719 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -201,14 +201,14 
@@ def stream_to_sns( if len(queries) > 0: publish_batch_to_sns(topic_arn, queries) - def stream_to_nowhere( + def stream( self, entity_type: EntityType, query_chunk_size: int, sample_size: int | None = None, - ) -> None: + ) -> Generator[Any, Any, Any]: """ - Streams transformed entities (nodes or edges) into the void. Useful for development and testing purposes. + Streams transformed entities (nodes or edges) as a generator. Useful for development and testing purposes. """ for chunk in self._stream_chunks(entity_type, query_chunk_size, sample_size): - pass + yield chunk diff --git a/tests/test_mesh_concepts_source.py b/tests/test_mesh_concepts_source.py new file mode 100644 index 0000000000..072f7c9ed3 --- /dev/null +++ b/tests/test_mesh_concepts_source.py @@ -0,0 +1,30 @@ +from test_mocks import MockRequest +from test_utils import load_fixture + +from sources.mesh.concepts_source import MeSHConceptsSource + + +def test_mesh_concepts_source() -> None: + test_url = "https://example.com" + MockRequest.mock_responses( + [ + { + "method": "GET", + "url": test_url, + "status_code": 200, + "json_data": None, + "content_bytes": load_fixture("mesh_example.xml"), + "params": None, + } + ] + ) + + mesh_concepts_source = MeSHConceptsSource(test_url) + stream_result = list(mesh_concepts_source.stream_raw()) + + # Do some simple checks on mesh source decoding based on known data + assert len(stream_result) == 7 + xml_elem, treenum_lookup = stream_result[0] + + assert xml_elem.tag == "DescriptorRecord" + assert treenum_lookup["D02"] == "D009930" diff --git a/tests/test_mesh_concepts_transformer.py b/tests/test_mesh_concepts_transformer.py new file mode 100644 index 0000000000..a81671260e --- /dev/null +++ b/tests/test_mesh_concepts_transformer.py @@ -0,0 +1,41 @@ +from test_mocks import MockRequest +from test_utils import load_fixture + +from transformers.mesh.concepts_transformer import MeSHConceptsTransformer + + +def test_mesh_concepts_transformer() -> None: + test_url = "https://example.com" + + MockRequest.mock_responses( + [ + { + "method": "GET", + "url": test_url, + "status_code": 200, + "json_data": None, + "content_bytes": load_fixture("mesh_example.xml"), + "params": None, + } + ] + ) + mesh_concepts_transformer = MeSHConceptsTransformer(test_url) + + # test transform_node + nodes = list( + mesh_concepts_transformer.stream(entity_type="nodes", query_chunk_size=1) + ) + assert len(list(nodes)) == 7 + assert nodes[0][0].id == "D009930" + assert nodes[0][0].label == "Organic Chemicals" + + stream = mesh_concepts_transformer.stream(entity_type="edges", query_chunk_size=1) + # get first element, trying to get all of them will get edges we don't have in the test data + first_chunk = stream.__next__() + first_element = first_chunk[0] + + assert first_element.from_type == "SourceConcept" + assert first_element.to_type == "SourceConcept" + assert first_element.from_id == "D004987" + assert first_element.to_id == "D009930" + assert first_element.relationship == "HAS_PARENT" From e4c9c5b5f150fb46683ad8ca4b79fe41dc24bb36 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Tue, 28 Jan 2025 15:55:29 +0000 Subject: [PATCH 181/310] containerise extractor --- Dockerfile | 18 ++++ docker-compose.yml | 11 +++ scripts/build.sh | 87 ++++++++++++++++--- src/extractor.py | 2 + src/indexer.py | 2 + terraform/ecr.tf | 23 +++++ terraform/extractor_ecs_task.tf | 29 +++++++ terraform/iam_state_machines.tf | 9 +- terraform/locals.tf | 6 ++ terraform/modules/ecs_task/output.tf | 19 ++++ 
terraform/modules/ecs_task/task_definition.tf | 37 ++++++++ terraform/modules/ecs_task/variables.tf | 24 +++++ 12 files changed, 254 insertions(+), 13 deletions(-) create mode 100644 Dockerfile create mode 100644 docker-compose.yml mode change 100644 => 100755 src/extractor.py mode change 100644 => 100755 src/indexer.py create mode 100644 terraform/ecr.tf create mode 100644 terraform/extractor_ecs_task.tf create mode 100644 terraform/modules/ecs_task/output.tf create mode 100644 terraform/modules/ecs_task/task_definition.tf create mode 100644 terraform/modules/ecs_task/variables.tf diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..14bfa18992 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,18 @@ +ARG PYTHON_IMAGE_VERSION=latest +FROM python:${PYTHON_IMAGE_VERSION} AS base + +LABEL maintainer="Wellcome Collection " + +ADD .python-version /app/.python_version +ADD src /app/src +ADD scripts /app/scripts + +WORKDIR /app + +RUN scripts/ci-setup.sh + +FROM base AS extractor +ENTRYPOINT [ "/app/src/extractor.py" ] + +FROM base AS indexer +ENTRYPOINT [ "/app/src/indexer.py" ] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000000..ee0c63d639 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,11 @@ +services: + extractor: + image: ${REPOSITORY_PREFIX:-}catalogue_graph_extractor:${TAG:-dev} + platform: linux/amd64 + build: + target: extractor + context: . + args: + - PYTHON_IMAGE_VERSION=${PYTHON_IMAGE_VERSION:-latest} + dockerfile: Dockerfile + \ No newline at end of file diff --git a/scripts/build.sh b/scripts/build.sh index 74c02ae4e0..811be3e745 100755 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -3,24 +3,89 @@ set -o errexit set -o nounset +# set ROOT to the root of the project DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" ROOT+="$(dirname "$DIR")" +# get python version from .python-version +PY_VERSION=$(cat .python-version) + +# set default values +ECR_REGISTRY="760097843905.dkr.ecr.eu-west-1.amazonaws.com/uk.ac.wellcome" +S3_BUCKET="wellcomecollection-platform-infra" +S3_PREFIX="lambdas/catalogue_graph" + +ZIP_TARGET="${ROOT}/target/build.zip" +TAG_DEFAULT="dev" +PUSH=false + +# parse command line arguments +while [[ $# -gt 0 ]]; do + case "$1" in + -t|--tag) + TAG=${2:-$TAG_DEFAULT} + echo "Using tag: $TAG" + shift + ;; + -p|--push) + PUSH=true + echo "Will push build artifacts to AWS" + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac + # dont fail if at end of arguments + shift || true +done + # change working directory to the root of the project cd "$ROOT" -# Create a temporary directory to hold source code and packages -mkdir -p target/tmp +function build_zip() {( set -e + local ZIP_TARGET=$1 + mkdir -p target/tmp -# get python version from .python-version -PY_VERSION=$(cat .python-version) + cp -r src/* target/tmp + pip install \ + -r src/requirements.txt \ + --platform manylinux2014_x86_64 \ + --target target/tmp \ + --only-binary=:all: \ + --python-version $PY_VERSION + + pushd target/tmp + zip -r $ZIP_TARGET . 
+ popd + + rm -rf target/tmp +)} + +function upload_zip {( set -e + local FILE=$1 + local TAG=${TAG:-$TAG_DEFAULT} + + DESTINATION="s3://$S3_BUCKET/$S3_PREFIX/lambda-$TAG.zip" + + aws s3 cp $FILE $DESTINATION +)} + +function docker_compose {( set -e + local CMD=$1 + local SERVICE_NAME=$2 -cp -r src/* target/tmp -pip install -r src/requirements.txt --platform manylinux2014_x86_64 --target target/tmp --only-binary=:all: --python-version $PY_VERSION + TAG=${TAG:-$TAG_DEFAULT} \ + REPOSITORY_PREFIX=${ECR_REGISTRY}/ \ + PYTHON_IMAGE_VERSION=${PY_VERSION}-slim \ + docker compose $CMD $SERVICE_NAME +)} -cd target/tmp -zip -r ../build.zip . -cd ../.. +build_zip "$ZIP_TARGET" +docker_compose "build" "extractor" -# Clean up the temporary build directory -rm -rf target/tmp +if [ "$PUSH" == true ]; then + upload_zip "$ZIP_TARGET" + docker_compose "push" "extractor" +fi diff --git a/src/extractor.py b/src/extractor.py old mode 100644 new mode 100755 index 293044c920..977c3b5636 --- a/src/extractor.py +++ b/src/extractor.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + import argparse import typing diff --git a/src/indexer.py b/src/indexer.py old mode 100644 new mode 100755 index dad19ea5f9..ded35f12f1 --- a/src/indexer.py +++ b/src/indexer.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + import argparse import json import typing diff --git a/terraform/ecr.tf b/terraform/ecr.tf new file mode 100644 index 0000000000..9cbe8e7660 --- /dev/null +++ b/terraform/ecr.tf @@ -0,0 +1,23 @@ +resource "aws_ecr_repository" "catalogue_graph_extractor" { + name = "uk.ac.wellcome/catalogue_graph_extractor" +} + +resource "aws_ecr_lifecycle_policy" "expire_old_images" { + repository = aws_ecr_repository.catalogue_graph_extractor.name + policy = jsonencode({ + rules = [ + { + rulePriority = 1 + description = "Only keep the last 25 images in a repo" + selection = { + tagStatus = "any" + countType = "imageCountMoreThan" + countNumber = 25 + } + action = { + type = "expire" + } + } + ] + }) +} \ No newline at end of file diff --git a/terraform/extractor_ecs_task.tf b/terraform/extractor_ecs_task.tf new file mode 100644 index 0000000000..adce9d2e3e --- /dev/null +++ b/terraform/extractor_ecs_task.tf @@ -0,0 +1,29 @@ +resource "aws_ecs_cluster" "cluster" { + name = local.namespace +} + +module "extractor_ecs_task" { + source = "./modules/ecs_task" + + task_name = "${local.namespace}_extractor" + + image = "${aws_ecr_repository.catalogue_graph_extractor.repository_url}:dev" + + environment = { + S3_BULK_LOAD_BUCKET_NAME = aws_s3_bucket.neptune_bulk_upload_bucket.bucket + GRAPH_QUERIES_SNS_TOPIC_ARN = module.catalogue_graph_queries_topic.arn + } + + cpu = 1024 + memory = 2048 +} + +resource "aws_iam_role_policy" "ecs_stream_to_sns_policy" { + role = module.extractor_ecs_task.task_role_name + policy = data.aws_iam_policy_document.stream_to_sns.json +} + +resource "aws_iam_role_policy" "ecs_stream_to_s3_policy" { + role = module.extractor_ecs_task.task_role_name + policy = data.aws_iam_policy_document.stream_to_s3.json +} diff --git a/terraform/iam_state_machines.tf b/terraform/iam_state_machines.tf index 74df7a822f..e19cc7feb8 100644 --- a/terraform/iam_state_machines.tf +++ b/terraform/iam_state_machines.tf @@ -32,8 +32,13 @@ resource "aws_iam_policy" "state_machine_policy" { }, { Effect = "Allow", - Action = ["states:StartExecution", "states:DescribeExecution", "states:StopExecution"], - Resource = "*" + Action = ["ecs:RunTask"], + Resource = ["${local.extractor_task_definition_arn_latest}:*"] + }, + { + Effect = "Allow", + Action = 
["iam:PassRole"], + Resource = [module.extractor_ecs_task.task_role_arn] }, # These EventBridge permissions are needed to allow state machines to perform the "startExecution.sync:2" action # (i.e. trigger another state machine and wait for it to complete) diff --git a/terraform/locals.tf b/terraform/locals.tf index 6909632e7c..2a8d48a16b 100644 --- a/terraform/locals.tf +++ b/terraform/locals.tf @@ -1,4 +1,10 @@ locals { + namespace = "catalogue-graph" + + _extractor_task_definition_split = split(":", module.extractor_ecs_task.task_definition_arn) + extractor_task_definition_version = element(local._extractor_task_definition_split, length(local._extractor_task_definition_split) - 1) + extractor_task_definition_arn_latest = trimsuffix(module.extractor_ecs_task.task_definition_arn, ":${local.extractor_task_definition_version}") + vpc_id = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_id private_subnets = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_private_subnets public_subnets = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_public_subnets diff --git a/terraform/modules/ecs_task/output.tf b/terraform/modules/ecs_task/output.tf new file mode 100644 index 0000000000..2b47430f9d --- /dev/null +++ b/terraform/modules/ecs_task/output.tf @@ -0,0 +1,19 @@ +output "task_role_arn" { + value = module.task_definition.task_role_arn +} + +output "task_execution_role_arn" { + value = module.task_definition.task_execution_role_arn +} + +output "task_role_name" { + value = module.task_definition.task_role_name +} + +output "task_execution_role_name" { + value = module.task_definition.task_execution_role_name +} + +output "task_definition_arn" { + value = module.task_definition.arn +} diff --git a/terraform/modules/ecs_task/task_definition.tf b/terraform/modules/ecs_task/task_definition.tf new file mode 100644 index 0000000000..3a80b96acb --- /dev/null +++ b/terraform/modules/ecs_task/task_definition.tf @@ -0,0 +1,37 @@ +module "app_container_definition" { + source = "git::github.com/wellcomecollection/terraform-aws-ecs-service.git//modules/container_definition?ref=v3.13.1" + name = var.task_name + image = var.image + + environment = var.environment + + log_configuration = module.log_router_container.container_log_configuration +} + +module "log_router_container" { + source = "git::github.com/wellcomecollection/terraform-aws-ecs-service.git//modules/firelens?ref=v3.13.1" + namespace = var.task_name + + use_privatelink_endpoint = true +} + +module "log_router_container_secrets_permissions" { + source = "git::github.com/wellcomecollection/terraform-aws-ecs-service.git//modules/secrets?ref=v3.13.1" + secrets = module.log_router_container.shared_secrets_logging + role_name = module.task_definition.task_execution_role_name +} + +module "task_definition" { + source = "git::github.com/wellcomecollection/terraform-aws-ecs-service.git//modules/task_definition?ref=v3.13.1" + + cpu = var.cpu + memory = var.memory + + container_definitions = [ + module.log_router_container.container_definition, + module.app_container_definition.container_definition + ] + + launch_types = ["FARGATE"] + task_name = var.task_name +} diff --git a/terraform/modules/ecs_task/variables.tf b/terraform/modules/ecs_task/variables.tf new file mode 100644 index 0000000000..5ed77d7a39 --- /dev/null +++ b/terraform/modules/ecs_task/variables.tf @@ -0,0 +1,24 @@ +variable "environment" { + type = map(string) + description = "A map of environment variables to 
pass to the container" +} + +variable "image" { + type = string + description = "The container image to use for the container" +} + +variable "cpu" { + type = number + description = "The number of CPU units to reserve for the container" +} + +variable "memory" { + type = number + description = "The amount of memory to reserve for the container" +} + +variable "task_name" { + type = string + description = "The name of the task" +} From db27cc70e5bcf7d945f68450063d09226f0acd5d Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 29 Jan 2025 16:26:29 +0000 Subject: [PATCH 182/310] remove indexer target --- Dockerfile | 6 +----- docker-compose.yml | 1 - 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 14bfa18992..2102d8f080 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,8 +11,4 @@ WORKDIR /app RUN scripts/ci-setup.sh -FROM base AS extractor -ENTRYPOINT [ "/app/src/extractor.py" ] - -FROM base AS indexer -ENTRYPOINT [ "/app/src/indexer.py" ] \ No newline at end of file +ENTRYPOINT [ "/app/src/extractor.py" ] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index ee0c63d639..9464d2e246 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,7 +3,6 @@ services: image: ${REPOSITORY_PREFIX:-}catalogue_graph_extractor:${TAG:-dev} platform: linux/amd64 build: - target: extractor context: . args: - PYTHON_IMAGE_VERSION=${PYTHON_IMAGE_VERSION:-latest} From 2ef7b33dc4a895fefa37e049feba8965e1601694 Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Wed, 29 Jan 2025 16:27:19 +0000 Subject: [PATCH 183/310] create new state machine for ECS single task --- terraform/ecr.tf | 3 + terraform/extractor_ecs_task.tf | 4 +- terraform/iam_state_machines.tf | 5 +- terraform/locals.tf | 4 ++ terraform/state_machine_extractors.tf | 1 - .../state_machine_single_extractor_loader.tf | 59 +++++++++++++++++++ terraform/terraform.tf | 11 ++++ 7 files changed, 83 insertions(+), 4 deletions(-) diff --git a/terraform/ecr.tf b/terraform/ecr.tf index 9cbe8e7660..3b01accd87 100644 --- a/terraform/ecr.tf +++ b/terraform/ecr.tf @@ -2,6 +2,9 @@ resource "aws_ecr_repository" "catalogue_graph_extractor" { name = "uk.ac.wellcome/catalogue_graph_extractor" } +// This policy will expire old images in the repository, when we decide +// deployment strategy we can update this policy to match the desired tags in use +// and the number of images to keep. 
resource "aws_ecr_lifecycle_policy" "expire_old_images" { repository = aws_ecr_repository.catalogue_graph_extractor.name policy = jsonencode({ diff --git a/terraform/extractor_ecs_task.tf b/terraform/extractor_ecs_task.tf index adce9d2e3e..0de5947257 100644 --- a/terraform/extractor_ecs_task.tf +++ b/terraform/extractor_ecs_task.tf @@ -14,8 +14,8 @@ module "extractor_ecs_task" { GRAPH_QUERIES_SNS_TOPIC_ARN = module.catalogue_graph_queries_topic.arn } - cpu = 1024 - memory = 2048 + cpu = 2048 + memory = 4096 } resource "aws_iam_role_policy" "ecs_stream_to_sns_policy" { diff --git a/terraform/iam_state_machines.tf b/terraform/iam_state_machines.tf index e19cc7feb8..e8a1628cbb 100644 --- a/terraform/iam_state_machines.tf +++ b/terraform/iam_state_machines.tf @@ -38,7 +38,10 @@ resource "aws_iam_policy" "state_machine_policy" { { Effect = "Allow", Action = ["iam:PassRole"], - Resource = [module.extractor_ecs_task.task_role_arn] + Resource = [ + module.extractor_ecs_task.task_execution_role_arn, + module.extractor_ecs_task.task_role_arn + ] }, # These EventBridge permissions are needed to allow state machines to perform the "startExecution.sync:2" action # (i.e. trigger another state machine and wait for it to complete) diff --git a/terraform/locals.tf b/terraform/locals.tf index 2a8d48a16b..807dcf8f02 100644 --- a/terraform/locals.tf +++ b/terraform/locals.tf @@ -5,9 +5,13 @@ locals { extractor_task_definition_version = element(local._extractor_task_definition_split, length(local._extractor_task_definition_split) - 1) extractor_task_definition_arn_latest = trimsuffix(module.extractor_ecs_task.task_definition_arn, ":${local.extractor_task_definition_version}") + shared_infra = data.terraform_remote_state.shared_infra.outputs + vpc_id = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_id private_subnets = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_private_subnets public_subnets = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_public_subnets + + ec_privatelink_security_group_id = local.shared_infra["ec_developer_privatelink_sg_id"] } data "aws_vpc" "vpc" { diff --git a/terraform/state_machine_extractors.tf b/terraform/state_machine_extractors.tf index e86b989864..c5e668c3eb 100644 --- a/terraform/state_machine_extractors.tf +++ b/terraform/state_machine_extractors.tf @@ -33,4 +33,3 @@ resource "aws_sfn_state_machine" "catalogue_graph_extractors" { } }) } - diff --git a/terraform/state_machine_single_extractor_loader.tf b/terraform/state_machine_single_extractor_loader.tf index 7614d03db4..85c0708f4f 100644 --- a/terraform/state_machine_single_extractor_loader.tf +++ b/terraform/state_machine_single_extractor_loader.tf @@ -32,3 +32,62 @@ resource "aws_sfn_state_machine" "catalogue_graph_single_extract_load" { }, }) } +resource "aws_sfn_state_machine" "catalogue_graph_single_extract_load_ecs" { + name = "catalogue-graph-single-extract-load-ecs" + role_arn = aws_iam_role.state_machine_execution_role.arn + + definition = jsonencode({ + Comment = "Extract nodes/edges from a single source and load them into the catalogue graph using an ECS task." 
+ StartAt = "Extract" + States = { + "Extract" = { + "QueryLanguage" = "JSONata" + Type = "Task" + Resource = "arn:aws:states:::ecs:runTask.sync" + Next = "Load" + "Arguments" : { + "Cluster" : aws_ecs_cluster.cluster.arn, + "TaskDefinition" : module.extractor_ecs_task.task_definition_arn, + "LaunchType" : "FARGATE", + "NetworkConfiguration" : { + "AwsvpcConfiguration" : { + "AssignPublicIp" : "DISABLED", + "Subnets" : local.private_subnets, + "SecurityGroups" : [ + local.ec_privatelink_security_group_id, + aws_security_group.egress.id + ] + } + }, + "Overrides": { + "ContainerOverrides": [ + { + "Name": "catalogue-graph_extractor", + "Command": [ + "--transformer-type", + "{% $states.input.transformer_type %}", + "--entity-type", + "{% $states.input.entity_type %}", + "--stream-destination", + "{% $states.input.stream_destination %}" + ] + } + ] + } + } + } + "Load" = { + Type = "Task" + Resource = "arn:aws:states:::states:startExecution.sync:2", + Parameters = { + StateMachineArn = aws_sfn_state_machine.catalogue_graph_bulk_loader.arn + "Input.$" : "$$.Execution.Input", + } + Next = "Success" + }, + Success = { + Type = "Succeed" + } + }, + }) +} diff --git a/terraform/terraform.tf b/terraform/terraform.tf index b9972b9a6d..b629b166fd 100644 --- a/terraform/terraform.tf +++ b/terraform/terraform.tf @@ -25,6 +25,17 @@ data "terraform_remote_state" "aws_account_infrastructure" { } } +data "terraform_remote_state" "shared_infra" { + backend = "s3" + + config = { + role_arn = "arn:aws:iam::760097843905:role/platform-read_only" + bucket = "wellcomecollection-platform-infra" + key = "terraform/platform-infrastructure/shared.tfstate" + region = "eu-west-1" + } +} + provider "aws" { region = "eu-west-1" alias = "dns" From f209765126b08a69f8ecbf1af89a51f35004df4a Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Wed, 29 Jan 2025 11:22:21 +0000 Subject: [PATCH 184/310] start moving things about for MADS --- src/transformers/loc/common.py | 13 ++ src/transformers/loc/mads/__init__.py | 0 .../loc/mads/examples/sh2010105253.json | 128 ++++++++++++++++++ src/transformers/loc/mads/raw_mads_concept.py | 0 src/transformers/loc/mads/test_mads.py | 0 src/transformers/loc/raw_concept.py | 20 +-- src/transformers/loc/test_common.py | 30 ++++ 7 files changed, 174 insertions(+), 17 deletions(-) create mode 100644 src/transformers/loc/common.py create mode 100644 src/transformers/loc/mads/__init__.py create mode 100644 src/transformers/loc/mads/examples/sh2010105253.json create mode 100644 src/transformers/loc/mads/raw_mads_concept.py create mode 100644 src/transformers/loc/mads/test_mads.py create mode 100644 src/transformers/loc/test_common.py diff --git a/src/transformers/loc/common.py b/src/transformers/loc/common.py new file mode 100644 index 0000000000..2b2d1d0c34 --- /dev/null +++ b/src/transformers/loc/common.py @@ -0,0 +1,13 @@ + +ID_PREFIXES_TO_REMOVE = ( + "/authorities/subjects/", + "http://id.loc.gov/authorities/subjects/", + "/authorities/names/", + "http://id.loc.gov/authorities/names/", +) + +def remove_id_prefix(raw_id: str) -> str: + for prefix in ID_PREFIXES_TO_REMOVE: + raw_id = raw_id.removeprefix(prefix) + return raw_id + diff --git a/src/transformers/loc/mads/__init__.py b/src/transformers/loc/mads/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/transformers/loc/mads/examples/sh2010105253.json b/src/transformers/loc/mads/examples/sh2010105253.json new file mode 100644 index 0000000000..f94b926d09 --- /dev/null +++ 
b/src/transformers/loc/mads/examples/sh2010105253.json @@ -0,0 +1,128 @@ +{ + "@context": "http://v3/authorities/subjects/context.json", + "@graph": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh99001366", + "@type": [ + "madsrdf:Authority", + "madsrdf:GenreForm" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Early works to 1800" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n0d2f33cb1b0145e4a8a506c9472fe881b7" + } + ] + } + }, + { + "@id": "_:n0d2f33cb1b0145e4a8a506c9472fe881b7", + "@type": "madsrdf:GenreFormElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Early works to 1800" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh85098685", + "@type": [ + "madsrdf:Authority", + "madsrdf:Topic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Pathology" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n0d2f33cb1b0145e4a8a506c9472fe881b5" + } + ] + } + }, + { + "@id": "_:n0d2f33cb1b0145e4a8a506c9472fe881b5", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Pathology" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh2010105253", + "@type": [ + "madsrdf:Authority", + "madsrdf:ComplexSubject" + ], + "bflc:marcKey": "150 $aPathology$vEarly works to 1800", + "identifiers:lccn": "sh2010105253", + "madsrdf:adminMetadata": { + "@id": "_:n0d2f33cb1b0145e4a8a506c9472fe881b1" + }, + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Pathology--Early works to 1800" + }, + "madsrdf:componentList": { + "@list": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh85098685" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh99001366" + } + ] + }, + "madsrdf:editorialNote": "[Record generated for validation purposes.]", + "madsrdf:hasSource": { + "@id": "_:n0d2f33cb1b0145e4a8a506c9472fe881b4" + }, + "madsrdf:isMemberOfMADSCollection": [ + { + "@id": "http://id.loc.gov/authorities/subjects/collection_LCSHAuthorizedHeadings" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/collection_LCSH_General" + } + ], + "madsrdf:isMemberOfMADSScheme": { + "@id": "http://id.loc.gov/authorities/subjects" + }, + "owl:sameAs": [ + { + "@id": "http://id.loc.gov/authorities/sh2010105253#concept" + }, + { + "@id": "info:lc/authorities/sh2010105253" + } + ] + }, + { + "@id": "_:n0d2f33cb1b0145e4a8a506c9472fe881b1", + "@type": "ri:RecordInfo", + "ri:languageOfCataloging": { + "@id": "http://id.loc.gov/vocabulary/iso639-2/eng" + }, + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "2010-03-30T00:00:00" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "ri:recordStatus": "new" + }, + { + "@id": "_:n0d2f33cb1b0145e4a8a506c9472fe881b4", + "@type": "madsrdf:Source", + "madsrdf:citationSource": "Work cat.: Concept of \u0100ma in \u0100yurveda, 2005", + "madsrdf:citationStatus": "found" + } + ], + "@id": "/authorities/subjects/sh2010105253" +} diff --git a/src/transformers/loc/mads/raw_mads_concept.py b/src/transformers/loc/mads/raw_mads_concept.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/transformers/loc/mads/test_mads.py b/src/transformers/loc/mads/test_mads.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index 4f2f26769d..ab710f7877 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -1,25 
+1,11 @@ from typing import Literal - -ID_PREFIXES_TO_REMOVE = ( - "/authorities/subjects/", - "http://id.loc.gov/authorities/subjects/", - "/authorities/names/", - "http://id.loc.gov/authorities/names/", -) - +from common import remove_id_prefix class RawLibraryOfCongressConcept: def __init__(self, raw_concept: dict): self.raw_concept = raw_concept self._raw_concept_node = self._extract_concept_node() - @staticmethod - def _remove_id_prefix(raw_id: str) -> str: - for prefix in ID_PREFIXES_TO_REMOVE: - raw_id = raw_id.removeprefix(prefix) - - return raw_id - def _extract_concept_node(self) -> dict | None: graph: list[dict] = self.raw_concept["@graph"] @@ -66,7 +52,7 @@ def exclude(self) -> bool: @property def source_id(self) -> str: - return self._remove_id_prefix(self.raw_concept["@id"]) + return remove_id_prefix(self.raw_concept["@id"]) @property def label(self) -> str: @@ -106,7 +92,7 @@ def linked_concepts_ids(self, sko_link: str) -> list[str]: if concept["@id"].startswith("_:n"): continue - linked_ids.append(self._remove_id_prefix(concept["@id"])) + linked_ids.append(self.remove_id_prefix(concept["@id"])) return linked_ids diff --git a/src/transformers/loc/test_common.py b/src/transformers/loc/test_common.py new file mode 100644 index 0000000000..ec25790f9e --- /dev/null +++ b/src/transformers/loc/test_common.py @@ -0,0 +1,30 @@ +from .common import remove_id_prefix + +def test_remove_prefix_noop(): + """ + If there is no prefix to remove, remove_id_prefix will do nothing + """ + assert remove_id_prefix("sh1234567890") == "sh1234567890" + + +def test_remove_prefix_fully_qualified(): + """ + remove_id_prefix removes fully-qualified URL-style prefixes + """ + assert remove_id_prefix("http://id.loc.gov/authorities/subjects/sh1234567890") == "sh1234567890" + assert remove_id_prefix("http://id.loc.gov/authorities/names/sh0987654321") == "sh0987654321" + +def test_remove_prefix_relative(): + """ + remove_id_prefix removes relative/local prefixes + """ + assert remove_id_prefix("/authorities/subjects/sh1234567890") == "sh1234567890" + assert remove_id_prefix("/authorities/names/sh0987654321") == "sh0987654321" + +def test_remove_prefix_lookalikes(): + """ + remove_id_prefix only removes specific known prefixes, + not just things that look a bit like them + """ + assert remove_id_prefix("/authorities/banana/sh1234567890") == "/authorities/banana/sh1234567890" + assert remove_id_prefix("https://id.loc.gov.uk/authorities/subjects/sh1234567890") == "https://id.loc.gov.uk/authorities/subjects/sh1234567890" From a7da96502eeeaae7123115db578fba041120d54a Mon Sep 17 00:00:00 2001 From: Robert Kenny Date: Thu, 30 Jan 2025 09:58:02 +0000 Subject: [PATCH 185/310] format --- terraform/ecr.tf | 2 +- terraform/extractor_ecs_task.tf | 4 +- terraform/iam_state_machines.tf | 19 +++-- terraform/lambda_bulk_load_poller.tf | 2 +- terraform/lambda_bulk_loader.tf | 2 +- terraform/lambda_extractor.tf | 6 +- terraform/lambda_indexer.tf | 4 +- terraform/load_balancer.tf | 2 +- terraform/locals.tf | 4 +- terraform/neptune.tf | 6 +- terraform/state_machine_bulk_loader.tf | 2 +- terraform/state_machine_bulk_loaders.tf | 10 +-- terraform/state_machine_extractors.tf | 10 +-- terraform/state_machine_pipeline.tf | 10 +-- .../state_machine_single_extractor_loader.tf | 70 +++++++++++-------- terraform/terraform.tf | 6 +- terraform/variables.tf | 2 +- 17 files changed, 90 insertions(+), 71 deletions(-) diff --git a/terraform/ecr.tf b/terraform/ecr.tf index 3b01accd87..054e46a866 100644 --- a/terraform/ecr.tf +++ 
b/terraform/ecr.tf @@ -7,7 +7,7 @@ resource "aws_ecr_repository" "catalogue_graph_extractor" { // and the number of images to keep. resource "aws_ecr_lifecycle_policy" "expire_old_images" { repository = aws_ecr_repository.catalogue_graph_extractor.name - policy = jsonencode({ + policy = jsonencode({ rules = [ { rulePriority = 1 diff --git a/terraform/extractor_ecs_task.tf b/terraform/extractor_ecs_task.tf index 0de5947257..b89a3fe94e 100644 --- a/terraform/extractor_ecs_task.tf +++ b/terraform/extractor_ecs_task.tf @@ -10,8 +10,8 @@ module "extractor_ecs_task" { image = "${aws_ecr_repository.catalogue_graph_extractor.repository_url}:dev" environment = { - S3_BULK_LOAD_BUCKET_NAME = aws_s3_bucket.neptune_bulk_upload_bucket.bucket - GRAPH_QUERIES_SNS_TOPIC_ARN = module.catalogue_graph_queries_topic.arn + S3_BULK_LOAD_BUCKET_NAME = aws_s3_bucket.neptune_bulk_upload_bucket.bucket + GRAPH_QUERIES_SNS_TOPIC_ARN = module.catalogue_graph_queries_topic.arn } cpu = 2048 diff --git a/terraform/iam_state_machines.tf b/terraform/iam_state_machines.tf index e8a1628cbb..6afd3f4e06 100644 --- a/terraform/iam_state_machines.tf +++ b/terraform/iam_state_machines.tf @@ -1,12 +1,12 @@ data "aws_caller_identity" "current" {} resource "aws_iam_role" "state_machine_execution_role" { - name = "catalogue-graph-state-machine-execution-role" + name = "catalogue-graph-state-machine-execution-role" assume_role_policy = jsonencode({ - Version = "2012-10-17", + Version = "2012-10-17", Statement = [ { - Effect = "Allow", + Effect = "Allow", Principal = { Service = "states.amazonaws.com" }, @@ -18,13 +18,18 @@ resource "aws_iam_role" "state_machine_execution_role" { resource "aws_iam_policy" "state_machine_policy" { policy = jsonencode({ - Version = "2012-10-17", + Version = "2012-10-17", Statement = [ { Effect = "Allow", Action = ["logs:CreateLogStream", "logs:PutLogEvents"], Resource = "*" }, + { + Effect = "Allow", + Action = ["states:StartExecution"], + Resource = [aws_sfn_state_machine.catalogue_graph_bulk_loader.arn] + }, { Effect = "Allow", Action = ["lambda:InvokeFunction"], @@ -36,12 +41,12 @@ resource "aws_iam_policy" "state_machine_policy" { Resource = ["${local.extractor_task_definition_arn_latest}:*"] }, { - Effect = "Allow", - Action = ["iam:PassRole"], + Effect = "Allow", + Action = ["iam:PassRole"], Resource = [ module.extractor_ecs_task.task_execution_role_arn, module.extractor_ecs_task.task_role_arn - ] + ] }, # These EventBridge permissions are needed to allow state machines to perform the "startExecution.sync:2" action # (i.e. trigger another state machine and wait for it to complete) diff --git a/terraform/lambda_bulk_load_poller.tf b/terraform/lambda_bulk_load_poller.tf index 9c1839343d..5fe47a218e 100644 --- a/terraform/lambda_bulk_load_poller.tf +++ b/terraform/lambda_bulk_load_poller.tf @@ -5,7 +5,7 @@ module "bulk_load_poller_lambda" { description = "Polls the status of a Neptune bulk load job." runtime = "python3.13" - filename = "../target/build.zip" + filename = "../target/build.zip" source_code_hash = filesha256("../target/build.zip") handler = "bulk_load_poller.lambda_handler" diff --git a/terraform/lambda_bulk_loader.tf b/terraform/lambda_bulk_loader.tf index 4801d75adf..3642ef9ac9 100644 --- a/terraform/lambda_bulk_loader.tf +++ b/terraform/lambda_bulk_loader.tf @@ -5,7 +5,7 @@ module "bulk_loader_lambda" { description = "Bulk loads entities from an S3 bucket into the Neptune database." 
runtime = "python3.13" - filename = "../target/build.zip" + filename = "../target/build.zip" source_code_hash = filesha256("../target/build.zip") handler = "bulk_loader.lambda_handler" diff --git a/terraform/lambda_extractor.tf b/terraform/lambda_extractor.tf index d4dd5829aa..a09f9c1366 100644 --- a/terraform/lambda_extractor.tf +++ b/terraform/lambda_extractor.tf @@ -5,7 +5,7 @@ module "extractor_lambda" { description = "Extracts source concepts and turns them into Cypher queries." runtime = "python3.13" - filename = "../target/build.zip" + filename = "../target/build.zip" source_code_hash = filesha256("../target/build.zip") handler = "extractor.lambda_handler" @@ -13,7 +13,7 @@ module "extractor_lambda" { // This Lambda does not need a lot of memory, but it downloads and processes large datasets (with up to 10 million // items) and therefore needs the additional compute and networking capacity which comes with increased memory. memory_size = 4096 - timeout = 15*60 // 15 minutes + timeout = 15 * 60 // 15 minutes vpc_config = { subnet_ids = local.private_subnets @@ -22,7 +22,7 @@ module "extractor_lambda" { environment = { variables = { - S3_BULK_LOAD_BUCKET_NAME = aws_s3_bucket.neptune_bulk_upload_bucket.bucket + S3_BULK_LOAD_BUCKET_NAME = aws_s3_bucket.neptune_bulk_upload_bucket.bucket GRAPH_QUERIES_SNS_TOPIC_ARN = module.catalogue_graph_queries_topic.arn } } diff --git a/terraform/lambda_indexer.tf b/terraform/lambda_indexer.tf index 53c37af1a5..c1a38f9def 100644 --- a/terraform/lambda_indexer.tf +++ b/terraform/lambda_indexer.tf @@ -5,7 +5,7 @@ module "indexer_lambda" { description = "Indexes nodes and edges into the Neptune catalogue graph cluster." runtime = "python3.13" - filename = "../target/build.zip" + filename = "../target/build.zip" source_code_hash = filesha256("../target/build.zip") handler = "indexer.lambda_handler" @@ -22,7 +22,7 @@ module "indexer_lambda" { data "aws_iam_policy_document" "allow_secret_read" { statement { - actions = ["secretsmanager:GetSecretValue"] + actions = ["secretsmanager:GetSecretValue"] resources = [ "arn:aws:secretsmanager:eu-west-1:760097843905:secret:NeptuneTest/*" ] diff --git a/terraform/load_balancer.tf b/terraform/load_balancer.tf index 7b08bd7f20..367c27484c 100644 --- a/terraform/load_balancer.tf +++ b/terraform/load_balancer.tf @@ -25,7 +25,7 @@ resource "aws_lb_target_group_attachment" "neptune_instance_attachment" { # this setup is still more convenient than only being able to connect from within the VPC. 
# If it starts bothering us, we can create a Lambda function for dynamically updating the target group IP, as outlined # here: https://aws-samples.github.io/aws-dbs-refarch-graph/src/connecting-using-a-load-balancer/ - target_id = "172.42.174.101" + target_id = "172.42.174.101" } locals { diff --git a/terraform/locals.tf b/terraform/locals.tf index 807dcf8f02..5d478cfc1f 100644 --- a/terraform/locals.tf +++ b/terraform/locals.tf @@ -1,11 +1,11 @@ locals { - namespace = "catalogue-graph" + namespace = "catalogue-graph" _extractor_task_definition_split = split(":", module.extractor_ecs_task.task_definition_arn) extractor_task_definition_version = element(local._extractor_task_definition_split, length(local._extractor_task_definition_split) - 1) extractor_task_definition_arn_latest = trimsuffix(module.extractor_ecs_task.task_definition_arn, ":${local.extractor_task_definition_version}") - shared_infra = data.terraform_remote_state.shared_infra.outputs + shared_infra = data.terraform_remote_state.shared_infra.outputs vpc_id = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_id private_subnets = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_private_subnets diff --git a/terraform/neptune.tf b/terraform/neptune.tf index 62a6749f9c..eebafe2c73 100644 --- a/terraform/neptune.tf +++ b/terraform/neptune.tf @@ -21,10 +21,10 @@ resource "aws_iam_role" "catalogue_graph_cluster" { name = "catalogue-graph-cluster" assume_role_policy = jsonencode({ - Version = "2012-10-17" + Version = "2012-10-17" Statement = [ { - Effect = "Allow" + Effect = "Allow" Principal = { Service = "rds.amazonaws.com" # Neptune uses RDS for some operations } @@ -37,7 +37,7 @@ resource "aws_iam_role" "catalogue_graph_cluster" { # Read-only access to the bulk load S3 bucket data "aws_iam_policy_document" "neptune_s3_read_only_policy" { statement { - effect = "Allow" + effect = "Allow" actions = [ "s3:GetObject", "s3:ListBucket" diff --git a/terraform/state_machine_bulk_loader.tf b/terraform/state_machine_bulk_loader.tf index f560c460ce..c9f62079af 100644 --- a/terraform/state_machine_bulk_loader.tf +++ b/terraform/state_machine_bulk_loader.tf @@ -6,7 +6,7 @@ resource "aws_sfn_state_machine" "catalogue_graph_bulk_loader" { QueryLanguage = "JSONata" Comment = "Trigger a Neptune bulk load from a file stored in S3 and periodically check the status of the bulk load until complete." StartAt = "Trigger bulk load" - States = { + States = { "Trigger bulk load" : { "Type" : "Task", "Resource" : "arn:aws:states:::lambda:invoke", diff --git a/terraform/state_machine_bulk_loaders.tf b/terraform/state_machine_bulk_loaders.tf index 0d4d3e661f..f483b7cbd1 100644 --- a/terraform/state_machine_bulk_loaders.tf +++ b/terraform/state_machine_bulk_loaders.tf @@ -5,21 +5,21 @@ resource "aws_sfn_state_machine" "catalogue_graph_bulk_loaders" { definition = jsonencode({ Comment = "Trigger the catalogue-graph-bulk-loader state machine in sequence for each combination of inputs." 
StartAt = "Load ${var.state_machine_inputs[0].label}" - States = merge(tomap({ + States = merge(tomap({ for index, task_input in var.state_machine_inputs : "Load ${task_input.label}" => { - Type = "Task" - Resource = "arn:aws:states:::states:startExecution.sync:2", + Type = "Task" + Resource = "arn:aws:states:::states:startExecution.sync:2", Parameters = { StateMachineArn = aws_sfn_state_machine.catalogue_graph_bulk_loader.arn - Input = { + Input = { "transformer_type" = task_input.transformer_type, "entity_type" = task_input.entity_type } } Next = index == length(var.state_machine_inputs) - 1 ? "Success" : "Load ${var.state_machine_inputs[index + 1].label}" } - }), { + }), { Success = { Type = "Succeed" } diff --git a/terraform/state_machine_extractors.tf b/terraform/state_machine_extractors.tf index c5e668c3eb..178669eadd 100644 --- a/terraform/state_machine_extractors.tf +++ b/terraform/state_machine_extractors.tf @@ -5,16 +5,16 @@ resource "aws_sfn_state_machine" "catalogue_graph_extractors" { definition = jsonencode({ Comment = "Extract raw concepts from all sources, transform them into nodes and edges, and stream them into an S3 bucket." StartAt = "Trigger extractors" - States = { + States = { "Trigger extractors" = { - Type = "Parallel" + Type = "Parallel" Branches = flatten([ for index, task_input in var.state_machine_inputs : { StartAt = "Extract ${task_input.label}" - States = { + States = { "Extract ${task_input.label}" = { - Type = "Task" - Resource = module.extractor_lambda.lambda.arn + Type = "Task" + Resource = module.extractor_lambda.lambda.arn Parameters = { "transformer_type" = task_input.transformer_type, "entity_type" = task_input.entity_type, diff --git a/terraform/state_machine_pipeline.tf b/terraform/state_machine_pipeline.tf index e9513e2e99..bfc0e8a354 100644 --- a/terraform/state_machine_pipeline.tf +++ b/terraform/state_machine_pipeline.tf @@ -5,18 +5,18 @@ resource "aws_sfn_state_machine" "catalogue_graph_pipeline" { definition = jsonencode({ Comment = "Extract all concepts and load them into the catalogue graph." StartAt = "Extractors" - States = { + States = { "Extractors" = { - Type = "Task" - Resource = "arn:aws:states:::states:startExecution.sync:2", + Type = "Task" + Resource = "arn:aws:states:::states:startExecution.sync:2", Parameters = { StateMachineArn = aws_sfn_state_machine.catalogue_graph_extractors.arn } Next = "Bulk loaders" }, "Bulk loaders" = { - Type = "Task" - Resource = "arn:aws:states:::states:startExecution.sync:2", + Type = "Task" + Resource = "arn:aws:states:::states:startExecution.sync:2", Parameters = { StateMachineArn = aws_sfn_state_machine.catalogue_graph_bulk_loaders.arn } diff --git a/terraform/state_machine_single_extractor_loader.tf b/terraform/state_machine_single_extractor_loader.tf index 85c0708f4f..06ea9c8120 100644 --- a/terraform/state_machine_single_extractor_loader.tf +++ b/terraform/state_machine_single_extractor_loader.tf @@ -5,7 +5,7 @@ resource "aws_sfn_state_machine" "catalogue_graph_single_extract_load" { definition = jsonencode({ Comment = "Extract nodes/edges from a single source and load them into the catalogue graph." 
StartAt = "Extract" - States = { + States = { "Extract" = { Type = "Task" Resource = module.extractor_lambda.lambda.arn @@ -18,8 +18,8 @@ resource "aws_sfn_state_machine" "catalogue_graph_single_extract_load" { } } "Load" = { - Type = "Task" - Resource = "arn:aws:states:::states:startExecution.sync:2", + Type = "Task" + Resource = "arn:aws:states:::states:startExecution.sync:2", Parameters = { StateMachineArn = aws_sfn_state_machine.catalogue_graph_bulk_loader.arn "Input.$" : "$$.Execution.Input", @@ -37,33 +37,34 @@ resource "aws_sfn_state_machine" "catalogue_graph_single_extract_load_ecs" { role_arn = aws_iam_role.state_machine_execution_role.arn definition = jsonencode({ - Comment = "Extract nodes/edges from a single source and load them into the catalogue graph using an ECS task." - StartAt = "Extract" - States = { - "Extract" = { - "QueryLanguage" = "JSONata" + Comment = "Extract nodes/edges from a single source and load them into the catalogue graph using an ECS task." + QueryLanguage = "JSONata" + StartAt = "Extract" + States = { + Extract = { Type = "Task" Resource = "arn:aws:states:::ecs:runTask.sync" - Next = "Load" - "Arguments" : { - "Cluster" : aws_ecs_cluster.cluster.arn, - "TaskDefinition" : module.extractor_ecs_task.task_definition_arn, - "LaunchType" : "FARGATE", - "NetworkConfiguration" : { - "AwsvpcConfiguration" : { - "AssignPublicIp" : "DISABLED", - "Subnets" : local.private_subnets, - "SecurityGroups" : [ + Output = "{% $states.input %}" + Next = "ShouldRunLoad" + Arguments = { + Cluster = aws_ecs_cluster.cluster.arn + TaskDefinition = module.extractor_ecs_task.task_definition_arn + LaunchType = "FARGATE" + NetworkConfiguration = { + AwsvpcConfiguration = { + AssignPublicIp = "DISABLED" + Subnets = local.private_subnets + SecurityGroups = [ local.ec_privatelink_security_group_id, aws_security_group.egress.id ] } }, - "Overrides": { - "ContainerOverrides": [ + Overrides = { + ContainerOverrides = [ { - "Name": "catalogue-graph_extractor", - "Command": [ + Name = "catalogue-graph_extractor" + Command = [ "--transformer-type", "{% $states.input.transformer_type %}", "--entity-type", @@ -75,13 +76,26 @@ resource "aws_sfn_state_machine" "catalogue_graph_single_extract_load_ecs" { ] } } - } - "Load" = { - Type = "Task" - Resource = "arn:aws:states:::states:startExecution.sync:2", - Parameters = { + }, + ShouldRunLoad = { + Type = "Choice" + Output = "{% $states.input %}" + Choices = [ + { + # This is how you do null coalescing in JSONata + # https://github.com/jsonata-js/jsonata/issues/370#issuecomment-556995173 + Condition = "{% [$states.input.run_bulk_load, false][0] %}", + Next = "Load" + } + ] + Default = "Success" + }, + Load = { + Type = "Task" + Resource = "arn:aws:states:::states:startExecution.sync:2" + Arguments = { StateMachineArn = aws_sfn_state_machine.catalogue_graph_bulk_loader.arn - "Input.$" : "$$.Execution.Input", + Input = "{% $states.input %}" } Next = "Success" }, diff --git a/terraform/terraform.tf b/terraform/terraform.tf index b629b166fd..0f83de8b8a 100644 --- a/terraform/terraform.tf +++ b/terraform/terraform.tf @@ -19,9 +19,9 @@ data "terraform_remote_state" "aws_account_infrastructure" { assume_role = { role_arn = "arn:aws:iam::760097843905:role/platform-read_only" } - bucket = "wellcomecollection-platform-infra" - key = "terraform/aws-account-infrastructure/platform.tfstate" - region = "eu-west-1" + bucket = "wellcomecollection-platform-infra" + key = "terraform/aws-account-infrastructure/platform.tfstate" + region = "eu-west-1" } } diff --git 
a/terraform/variables.tf b/terraform/variables.tf index c881c092d2..318e984d07 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -1,7 +1,7 @@ # Each entry corresponds to a single execution of the `extractor` and `bulk_loader` Lambda functions. The `extractor` # Lambda function will output a single S3 file, which will be loaded into the database via the `bulk_loader` Lambda function. variable "state_machine_inputs" { - type = list(object({ label : string, transformer_type : string, entity_type : string })) + type = list(object({ label : string, transformer_type : string, entity_type : string })) default = [ { "label" : "LoC Concept Nodes", From 646e6168bd886e1b71682c39e9d4c406f6902509 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 30 Jan 2025 09:59:11 +0000 Subject: [PATCH 186/310] Annotate cypher queries --- notebooks/graph_exploration.ipynb | 204 ++++++++++++++++++++---------- 1 file changed, 136 insertions(+), 68 deletions(-) diff --git a/notebooks/graph_exploration.ipynb b/notebooks/graph_exploration.ipynb index 743c6778f6..eb2ce5c808 100644 --- a/notebooks/graph_exploration.ipynb +++ b/notebooks/graph_exploration.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "04c35d87", + "metadata": {}, + "source": [ + "Set the profile to `platform-developer`" + ] + }, { "cell_type": "code", "execution_count": null, @@ -12,6 +20,14 @@ "%env AWS_PROFILE=platform-developer" ] }, + { + "cell_type": "markdown", + "id": "edde200f", + "metadata": {}, + "source": [ + "To connect to the catalogue graph, the config needs to be set using the `%%graph_notebook_config` magic command (replace the value for \"host\" with our Neptune endpoint)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -35,6 +51,14 @@ "} " ] }, + { + "cell_type": "markdown", + "id": "db21057c", + "metadata": {}, + "source": [ + "The `%status` command is useful to check whether the connection has been established successfully. In case of issues later on while querying the graph, it is worth checking the status again." + ] + }, { "cell_type": "code", "execution_count": null, @@ -60,7 +84,7 @@ "id": "363bcdc3-062d-4ee8-be27-893c72ef701c", "metadata": {}, "source": [ - "Count the number of all `SourceConcept` nodes" + "Count the number of all `SourceConcept` nodes:" ] }, { @@ -82,7 +106,7 @@ "id": "47f1610a-2219-4554-9c25-11f725a53dec", "metadata": {}, "source": [ - "Count the number of `SourceConcept` nodes grouped by their source (LCSH, MeSH, Wikidata)" + "Count the number of `SourceConcept` nodes grouped by their source (LCSH, MeSH, Wikidata):" ] }, { @@ -104,7 +128,7 @@ "id": "925cc5aa-31ac-45f5-bfcd-c43a9e2f0c52", "metadata": {}, "source": [ - "We can do the same for `SourceLocation` and `SourceName` nodes" + "We can do the same for `SourceLocation` and `SourceName` nodes:" ] }, { @@ -154,7 +178,6 @@ "source": [ "%%oc\n", "MATCH (c:SourceConcept)-[:HAS_PARENT]->(p)\n", - "WHERE c.source='nlm-mesh'\n", "RETURN c.label, p.label\n", "LIMIT 10" ] @@ -164,7 +187,7 @@ "id": "d9014952-3930-4d69-88d2-2d1b56f42d78", "metadata": {}, "source": [ - "We can also traverse multiple edges using the `*` operator. For example, the query below retrieves grandparent labels of `SourceConcept` nodes (i.e. `2` levels of `HAS_PARENT` edges)" + "We can also traverse multiple edges using the `*` operator. For example, the query below retrieves grandparent labels of `SourceConcept` nodes, i.e. 
`2` levels of `HAS_PARENT` edges away from the `SourceConcept` on the left side of the query (labelled c1):" ] }, { @@ -175,9 +198,51 @@ "outputs": [], "source": [ "%%oc\n", - "MATCH (c:SourceConcept)-[:HAS_PARENT*2]->(p)\n", - "WHERE c.source='nlm-mesh'\n", - "RETURN c.label, p.label\n", + "MATCH (c1:SourceConcept)-[:HAS_PARENT*2]->(c2)\n", + "RETURN c1.label, c2.label\n", + "LIMIT 10" + ] + }, + { + "cell_type": "markdown", + "id": "2f05316a", + "metadata": {}, + "source": [ + "Similar to SQL, we can filter results using a `WHERE` clause. For example, the query below retrieves grantparents of MeSH concepts:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85244375", + "metadata": {}, + "outputs": [], + "source": [ + "%%oc\n", + "MATCH (c1:SourceConcept)-[:HAS_PARENT*2]->(c2)\n", + "WHERE c1.source = 'nlm-mesh'\n", + "RETURN c1.label, c2.label\n", + "LIMIT 10" + ] + }, + { + "cell_type": "markdown", + "id": "1720ac79", + "metadata": {}, + "source": [ + "Alternatively, nodes (or edges) can be filtered by passing the property value within `{}` inside the `MATCH` query. The following query returns the same results as above:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a43e8809", + "metadata": {}, + "outputs": [], + "source": [ + "%%oc\n", + "MATCH (c1:SourceConcept {source: 'nlm-mesh'})-[:HAS_PARENT*2]->(c2)\n", + "RETURN c1.label, c2.label\n", "LIMIT 10" ] }, @@ -186,7 +251,7 @@ "id": "84c96a06-2b1c-4741-9a2c-2fdde7a15a48", "metadata": {}, "source": [ - "We can count the number of links between sources via `SAME_AS` edges. This reveals a high level of Wikidata coverage for both LoC and MeSH `SourceConcepts`" + "We can count the number of links between different sources via `SAME_AS` edges. This reveals a high level of Wikidata coverage for both LoC and MeSH `SourceConcepts`:" ] }, { @@ -208,7 +273,29 @@ "id": "bc5fa969-19af-46ad-80f1-fe1dd6bd1216", "metadata": {}, "source": [ - "It is also possible to view an interactive visualisation of query results when returning everyting (`*`), which can be accessed via the `Graph` tab. This can be customised with visualization hints using `-d`, `-de`, `-l` and `-g` after the `%%oc` magic command. " + "It is also possible to view an interactive visualisation of query results when returning everyting (`*`), which can be accessed via the `Graph` tab. By default, the node type (e.g. `SourceConcept`) is displayed on each node and used as a category to group nodes by colour. Additional metadata can be accessed in a table when clicking on a node." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf54d00b", + "metadata": {}, + "outputs": [], + "source": [ + "%%oc\n", + "MATCH(c:SourceConcept)-[r:NARROWER_THAN*]->(p)\n", + "WHERE c.id = 'sh00002633'\n", + "RETURN *\n", + "LIMIT 20" + ] + }, + { + "cell_type": "markdown", + "id": "a9a83b6b", + "metadata": {}, + "source": [ + "The visualisation can be customised with visualization hints using `-d`, `-de`, `-l` and `-g` after the `%%oc` magic command. For example, passing a particular node property after `d` will use this to label the node in the graph display (with `-l` to specify the maximum text length). In our case, it can be more informative to display the `label` or `id` property." ] }, { @@ -227,6 +314,14 @@ "LIMIT 20" ] }, + { + "cell_type": "markdown", + "id": "30ae7125", + "metadata": {}, + "source": [ + "The query below can be used to visualise all parents of the MeSH concept `Sanitation`. 
Be careful when using a query to traverse all edges with the `*` operator as it can lead to very large query results." + ] + }, { "cell_type": "code", "execution_count": null, @@ -242,6 +337,14 @@ "RETURN *" ] }, + { + "cell_type": "markdown", + "id": "e1889297", + "metadata": {}, + "source": [ + "Simply switching the directionality of the edge in the previous query gives you all children of the MeSH concept `Sanitation`:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -257,6 +360,14 @@ "RETURN *" ] }, + { + "cell_type": "markdown", + "id": "e9847c54", + "metadata": {}, + "source": [ + "We have `SAME_AS` edges in the graph which link concepts from the three different data sources (LoC, Wikidata, and MeSH). For example, the MeSH concept `Sanitation` is connected to its equivalent in Wikidata, which in turn is linked to LoC. In this case, the visualisation is cusomised with the `-g` flag to specify that the source property should be used to colour the nodes." + ] + }, { "cell_type": "code", "execution_count": null, @@ -273,48 +384,34 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "20a14a71-4870-4397-97d9-8e6efad98d71", - "metadata": { - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "d1de0566", + "metadata": {}, "source": [ - "%%oc -d label -l 25 -g source\n", - "MATCH (sc1:SourceConcept)-[r:RELATED_TO*..2]->(sc2:SourceConcept)\n", - "WHERE sc1.id = 'sh85117296'\n", - "RETURN *" + "We also have `RELATED_TO` edges in the graph (these mainly come from LoC):" ] }, { "cell_type": "code", "execution_count": null, - "id": "3266937f-4ca6-4d46-a9b7-3d0496a78b8d", + "id": "20a14a71-4870-4397-97d9-8e6efad98d71", "metadata": { "tags": [] }, "outputs": [], "source": [ - "%%oc -d label -l 25 -g source\n", - "MATCH (sc1:SourceConcept)-[r:NARROWER_THAN*..2]->(sc2:SourceConcept)\n", + "%%oc -d label -l 25\n", + "MATCH (sc1:SourceConcept)-[r:RELATED_TO*..2]->(sc2:SourceConcept)\n", "WHERE sc1.id = 'sh85117296'\n", "RETURN *" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "8eab1235-c42d-44f0-bebf-376add4637b8", - "metadata": { - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "c517ea08", + "metadata": {}, "source": [ - "%%oc -d label -l 25 -g source\n", - "MATCH (sc1:SourceConcept)<-[r:NARROWER_THAN*..2]-(sc2:SourceConcept)\n", - "WHERE sc1.id = 'sh85117296'\n", - "RETURN *" + "Putting all of this together, we can retrieve all nodes linked via `SAME_AS` edges to a given concept ID, as well as any parent/broader/related concepts linked to any of these (if present). Note that an `OPTIONAL MATCH` clause is used below so we can retrieve all `SAME_AS` concepts regardless of whether they have any further connections or not." 
] }, { @@ -326,39 +423,10 @@ }, "outputs": [], "source": [ - "%%oc -d label -l 20 -g source\n", - "MATCH (sc1:SourceConcept)-[r:SAME_AS]->(sc2:SourceConcept)-[p:HAS_PARENT]->(sc3:SourceConcept)\n", - "RETURN *\n", - "LIMIT 10" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "807f813b-6469-4ad8-85df-ed09abed2f0a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%%oc -d label -l 20\n", - "MATCH (sn1:SourceName)-[r:SAME_AS]->(sn2:SourceName)\n", - "WHERE sn1.id='n84804337'\n", - "RETURN *" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "196c43b7-a9e1-441b-bbde-dc284ea069bc", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%%oc -d label -l 20\n", - "MATCH (sn1:SourceName)-[r:SAME_AS]->(sn2:SourceName)\n", - "WHERE sn1.id='Q542019'\n", + "%%oc -d label -l 25 -g source\n", + "MATCH (sc1:SourceConcept {id: 'sh85117296'})-[r:SAME_AS*]->(sc2:SourceConcept)\n", + "OPTIONAL MATCH (sc2)-[p:HAS_PARENT|NARROWER_THAN|RELATED_TO]->(sc3)\n", + "OPTIONAL MATCH (sc3)-[s:SAME_AS]->(sc4:SourceConcept)\n", "RETURN *" ] }, From bfb2817a8b56ffdc5563b3e65c61836a2c2a716d Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Thu, 30 Jan 2025 10:11:07 +0000 Subject: [PATCH 187/310] start mads separation --- src/transformers/loc/concepts_transformer.py | 2 +- src/transformers/loc/locations_transformer.py | 2 +- .../loc/mads/{raw_mads_concept.py => raw_concept.py} | 0 src/transformers/loc/names_transformer.py | 2 +- src/transformers/loc/{mads/test_mads.py => skos/__init__.py} | 0 src/transformers/loc/{ => skos}/raw_concept.py | 4 ++-- tests/transformers/loc/mads/test_mads.py | 0 {src => tests}/transformers/loc/test_common.py | 5 ++++- 8 files changed, 9 insertions(+), 6 deletions(-) rename src/transformers/loc/mads/{raw_mads_concept.py => raw_concept.py} (100%) rename src/transformers/loc/{mads/test_mads.py => skos/__init__.py} (100%) rename src/transformers/loc/{ => skos}/raw_concept.py (97%) create mode 100644 tests/transformers/loc/mads/test_mads.py rename {src => tests}/transformers/loc/test_common.py (95%) diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index c277e025fb..95d1fd9d00 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -5,7 +5,7 @@ from sources.gzip_source import GZipSource from transformers.base_transformer import BaseTransformer -from .raw_concept import RawLibraryOfCongressConcept +from transformers.loc.skos.raw_concept import RawLibraryOfCongressConcept class LibraryOfCongressConceptsTransformer(BaseTransformer): diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index f1053d242f..7a365f9158 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -5,7 +5,7 @@ from sources.gzip_source import MultiGZipSource from transformers.base_transformer import BaseTransformer -from .raw_concept import RawLibraryOfCongressConcept +from transformers.loc.skos.raw_concept import RawLibraryOfCongressConcept class LibraryOfCongressLocationsTransformer(BaseTransformer): diff --git a/src/transformers/loc/mads/raw_mads_concept.py b/src/transformers/loc/mads/raw_concept.py similarity index 100% rename from src/transformers/loc/mads/raw_mads_concept.py rename to src/transformers/loc/mads/raw_concept.py diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 
c6f01bc5cf..8f0e9c4d27 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -5,7 +5,7 @@ from sources.gzip_source import GZipSource from transformers.base_transformer import BaseTransformer -from .raw_concept import RawLibraryOfCongressConcept +from transformers.loc.skos.raw_concept import RawLibraryOfCongressConcept class LibraryOfCongressNamesTransformer(BaseTransformer): diff --git a/src/transformers/loc/mads/test_mads.py b/src/transformers/loc/skos/__init__.py similarity index 100% rename from src/transformers/loc/mads/test_mads.py rename to src/transformers/loc/skos/__init__.py diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/skos/raw_concept.py similarity index 97% rename from src/transformers/loc/raw_concept.py rename to src/transformers/loc/skos/raw_concept.py index ab710f7877..e0cf4a2db0 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/skos/raw_concept.py @@ -1,5 +1,5 @@ from typing import Literal -from common import remove_id_prefix +from transformers.loc.common import remove_id_prefix class RawLibraryOfCongressConcept: def __init__(self, raw_concept: dict): @@ -92,7 +92,7 @@ def linked_concepts_ids(self, sko_link: str) -> list[str]: if concept["@id"].startswith("_:n"): continue - linked_ids.append(self.remove_id_prefix(concept["@id"])) + linked_ids.append(remove_id_prefix(concept["@id"])) return linked_ids diff --git a/tests/transformers/loc/mads/test_mads.py b/tests/transformers/loc/mads/test_mads.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/transformers/loc/test_common.py b/tests/transformers/loc/test_common.py similarity index 95% rename from src/transformers/loc/test_common.py rename to tests/transformers/loc/test_common.py index ec25790f9e..02a5da8c5c 100644 --- a/src/transformers/loc/test_common.py +++ b/tests/transformers/loc/test_common.py @@ -1,4 +1,5 @@ -from .common import remove_id_prefix +from src.transformers.loc.common import remove_id_prefix + def test_remove_prefix_noop(): """ @@ -14,6 +15,7 @@ def test_remove_prefix_fully_qualified(): assert remove_id_prefix("http://id.loc.gov/authorities/subjects/sh1234567890") == "sh1234567890" assert remove_id_prefix("http://id.loc.gov/authorities/names/sh0987654321") == "sh0987654321" + def test_remove_prefix_relative(): """ remove_id_prefix removes relative/local prefixes @@ -21,6 +23,7 @@ def test_remove_prefix_relative(): assert remove_id_prefix("/authorities/subjects/sh1234567890") == "sh1234567890" assert remove_id_prefix("/authorities/names/sh0987654321") == "sh0987654321" + def test_remove_prefix_lookalikes(): """ remove_id_prefix only removes specific known prefixes, From 883154351f9f195cb84c1b09237bd9853ec0a650 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Thu, 30 Jan 2025 10:13:29 +0000 Subject: [PATCH 188/310] reformat --- src/transformers/loc/common.py | 3 +-- src/transformers/loc/skos/raw_concept.py | 1 + tests/transformers/loc/test_common.py | 20 ++++++++++++++++---- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/transformers/loc/common.py b/src/transformers/loc/common.py index 2b2d1d0c34..6d91ab0fa2 100644 --- a/src/transformers/loc/common.py +++ b/src/transformers/loc/common.py @@ -1,4 +1,3 @@ - ID_PREFIXES_TO_REMOVE = ( "/authorities/subjects/", "http://id.loc.gov/authorities/subjects/", @@ -6,8 +5,8 @@ "http://id.loc.gov/authorities/names/", ) + def remove_id_prefix(raw_id: str) -> str: for prefix in ID_PREFIXES_TO_REMOVE: raw_id = 
raw_id.removeprefix(prefix) return raw_id - diff --git a/src/transformers/loc/skos/raw_concept.py b/src/transformers/loc/skos/raw_concept.py index e0cf4a2db0..e3d2eacd59 100644 --- a/src/transformers/loc/skos/raw_concept.py +++ b/src/transformers/loc/skos/raw_concept.py @@ -1,6 +1,7 @@ from typing import Literal from transformers.loc.common import remove_id_prefix + class RawLibraryOfCongressConcept: def __init__(self, raw_concept: dict): self.raw_concept = raw_concept diff --git a/tests/transformers/loc/test_common.py b/tests/transformers/loc/test_common.py index 02a5da8c5c..402dfccd73 100644 --- a/tests/transformers/loc/test_common.py +++ b/tests/transformers/loc/test_common.py @@ -12,8 +12,14 @@ def test_remove_prefix_fully_qualified(): """ remove_id_prefix removes fully-qualified URL-style prefixes """ - assert remove_id_prefix("http://id.loc.gov/authorities/subjects/sh1234567890") == "sh1234567890" - assert remove_id_prefix("http://id.loc.gov/authorities/names/sh0987654321") == "sh0987654321" + assert ( + remove_id_prefix("http://id.loc.gov/authorities/subjects/sh1234567890") + == "sh1234567890" + ) + assert ( + remove_id_prefix("http://id.loc.gov/authorities/names/sh0987654321") + == "sh0987654321" + ) def test_remove_prefix_relative(): @@ -29,5 +35,11 @@ def test_remove_prefix_lookalikes(): remove_id_prefix only removes specific known prefixes, not just things that look a bit like them """ - assert remove_id_prefix("/authorities/banana/sh1234567890") == "/authorities/banana/sh1234567890" - assert remove_id_prefix("https://id.loc.gov.uk/authorities/subjects/sh1234567890") == "https://id.loc.gov.uk/authorities/subjects/sh1234567890" + assert ( + remove_id_prefix("/authorities/banana/sh1234567890") + == "/authorities/banana/sh1234567890" + ) + assert ( + remove_id_prefix("https://id.loc.gov.uk/authorities/subjects/sh1234567890") + == "https://id.loc.gov.uk/authorities/subjects/sh1234567890" + ) From 81bfe77c91ec1d67e4db529e2ecee1f22d7b7556 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Thu, 30 Jan 2025 10:16:26 +0000 Subject: [PATCH 189/310] reformat --- src/transformers/loc/concepts_transformer.py | 1 - src/transformers/loc/locations_transformer.py | 1 - src/transformers/loc/names_transformer.py | 1 - src/transformers/loc/skos/raw_concept.py | 1 + 4 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index 95d1fd9d00..084c49eab8 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -4,7 +4,6 @@ from models.graph_node import SourceConcept from sources.gzip_source import GZipSource from transformers.base_transformer import BaseTransformer - from transformers.loc.skos.raw_concept import RawLibraryOfCongressConcept diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index 7a365f9158..6a74f2572e 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -4,7 +4,6 @@ from models.graph_node import SourceLocation from sources.gzip_source import MultiGZipSource from transformers.base_transformer import BaseTransformer - from transformers.loc.skos.raw_concept import RawLibraryOfCongressConcept diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 8f0e9c4d27..6e9e466a6a 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -4,7 
+4,6 @@ from models.graph_node import SourceName from sources.gzip_source import GZipSource from transformers.base_transformer import BaseTransformer - from transformers.loc.skos.raw_concept import RawLibraryOfCongressConcept diff --git a/src/transformers/loc/skos/raw_concept.py b/src/transformers/loc/skos/raw_concept.py index e3d2eacd59..a7e7d11ee8 100644 --- a/src/transformers/loc/skos/raw_concept.py +++ b/src/transformers/loc/skos/raw_concept.py @@ -1,4 +1,5 @@ from typing import Literal + from transformers.loc.common import remove_id_prefix From 0e3869bb585633cc7ba6d76e7eac1e9a4def099e Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Thu, 30 Jan 2025 10:23:39 +0000 Subject: [PATCH 190/310] reformat --- tests/transformers/loc/test_common.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/transformers/loc/test_common.py b/tests/transformers/loc/test_common.py index 402dfccd73..7ab4f7dd71 100644 --- a/tests/transformers/loc/test_common.py +++ b/tests/transformers/loc/test_common.py @@ -1,14 +1,14 @@ -from src.transformers.loc.common import remove_id_prefix +from transformers.loc.common import remove_id_prefix -def test_remove_prefix_noop(): +def test_remove_prefix_noop() -> None: """ If there is no prefix to remove, remove_id_prefix will do nothing """ assert remove_id_prefix("sh1234567890") == "sh1234567890" -def test_remove_prefix_fully_qualified(): +def test_remove_prefix_fully_qualified() -> None: """ remove_id_prefix removes fully-qualified URL-style prefixes """ @@ -22,7 +22,7 @@ def test_remove_prefix_fully_qualified(): ) -def test_remove_prefix_relative(): +def test_remove_prefix_relative() -> None: """ remove_id_prefix removes relative/local prefixes """ @@ -30,7 +30,7 @@ def test_remove_prefix_relative(): assert remove_id_prefix("/authorities/names/sh0987654321") == "sh0987654321" -def test_remove_prefix_lookalikes(): +def test_remove_prefix_lookalikes() -> None: """ remove_id_prefix only removes specific known prefixes, not just things that look a bit like them From 7862fdb3f2f5cb0b2ce62ea77c29033231fe3844 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 30 Jan 2025 10:49:02 +0000 Subject: [PATCH 191/310] Add catalogue snapshot to config --- src/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/config.py b/src/config.py index 486556df30..ab680cca4f 100644 --- a/src/config.py +++ b/src/config.py @@ -9,3 +9,4 @@ LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" MESH_URL = "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.gz" WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql" +CATALOGUE_SNAPSHOT_URL = "https://data.wellcomecollection.org/catalogue/v2/works.json.gz" From 78fddd41f1f1873d26f830a9adfb72b50a8e46d1 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Thu, 30 Jan 2025 11:58:58 +0000 Subject: [PATCH 192/310] start refactoring skos/mads commonality --- src/transformers/loc/common.py | 9 +++++++++ src/transformers/loc/concepts_transformer.py | 6 +++--- src/transformers/loc/locations_transformer.py | 6 +++--- src/transformers/loc/mads/raw_concept.py | 9 +++++++++ src/transformers/loc/names_transformer.py | 6 +++--- src/transformers/loc/skos/raw_concept.py | 10 +++------- .../fixtures}/sh2010105253.json | 0 tests/transformers/loc/mads/test_mads.py | 0 .../loc/mads/test_raw_mads_concept.py | 16 ++++++++++++++++ 9 files changed, 46 insertions(+), 16 deletions(-) rename {src/transformers/loc/mads/examples => tests/fixtures}/sh2010105253.json (100%) delete mode 
100644 tests/transformers/loc/mads/test_mads.py create mode 100644 tests/transformers/loc/mads/test_raw_mads_concept.py diff --git a/src/transformers/loc/common.py b/src/transformers/loc/common.py index 6d91ab0fa2..75351128ba 100644 --- a/src/transformers/loc/common.py +++ b/src/transformers/loc/common.py @@ -10,3 +10,12 @@ def remove_id_prefix(raw_id: str) -> str: for prefix in ID_PREFIXES_TO_REMOVE: raw_id = raw_id.removeprefix(prefix) return raw_id + + +class RawLibraryOfCongressConcept: + def __init__(self, raw_concept: dict): + self.raw_concept = raw_concept + + @property + def source_id(self) -> str: + return remove_id_prefix(self.raw_concept["@id"]) diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index 084c49eab8..a647e342b0 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -4,7 +4,7 @@ from models.graph_node import SourceConcept from sources.gzip_source import GZipSource from transformers.base_transformer import BaseTransformer -from transformers.loc.skos.raw_concept import RawLibraryOfCongressConcept +from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept class LibraryOfCongressConceptsTransformer(BaseTransformer): @@ -12,7 +12,7 @@ def __init__(self, url: str): self.source = GZipSource(url) def transform_node(self, raw_node: dict) -> SourceConcept | None: - raw_concept = RawLibraryOfCongressConcept(raw_node) + raw_concept = RawLibraryOfCongressSKOSConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: return None @@ -27,7 +27,7 @@ def transform_node(self, raw_node: dict) -> SourceConcept | None: def extract_edges( self, raw_node: dict ) -> Generator[SourceConceptNarrowerThan | SourceConceptRelatedTo]: - raw_concept = RawLibraryOfCongressConcept(raw_node) + raw_concept = RawLibraryOfCongressSKOSConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: return diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index 6a74f2572e..afbc2c0a95 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -4,7 +4,7 @@ from models.graph_node import SourceLocation from sources.gzip_source import MultiGZipSource from transformers.base_transformer import BaseTransformer -from transformers.loc.skos.raw_concept import RawLibraryOfCongressConcept +from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept class LibraryOfCongressLocationsTransformer(BaseTransformer): @@ -12,7 +12,7 @@ def __init__(self, subject_headings_url: str, names_url: str): self.source = MultiGZipSource([subject_headings_url, names_url]) def transform_node(self, raw_node: dict) -> SourceLocation | None: - raw_concept = RawLibraryOfCongressConcept(raw_node) + raw_concept = RawLibraryOfCongressSKOSConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: return None @@ -27,7 +27,7 @@ def transform_node(self, raw_node: dict) -> SourceLocation | None: def extract_edges( self, raw_node: dict ) -> Generator[SourceConceptNarrowerThan | SourceConceptRelatedTo]: - raw_concept = RawLibraryOfCongressConcept(raw_node) + raw_concept = RawLibraryOfCongressSKOSConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: return diff --git a/src/transformers/loc/mads/raw_concept.py b/src/transformers/loc/mads/raw_concept.py index e69de29bb2..c7ad658729 100644 --- a/src/transformers/loc/mads/raw_concept.py +++ 
b/src/transformers/loc/mads/raw_concept.py @@ -0,0 +1,9 @@ +from typing import Literal + +from transformers.loc.common import RawLibraryOfCongressConcept + + +class RawLibraryOfCongressMADSConcept(RawLibraryOfCongressConcept): + def __init__(self, raw_concept: dict): + super().__init__(raw_concept) + diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 6e9e466a6a..903a69f419 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -4,7 +4,7 @@ from models.graph_node import SourceName from sources.gzip_source import GZipSource from transformers.base_transformer import BaseTransformer -from transformers.loc.skos.raw_concept import RawLibraryOfCongressConcept +from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept class LibraryOfCongressNamesTransformer(BaseTransformer): @@ -12,7 +12,7 @@ def __init__(self, url: str): self.source = GZipSource(url) def transform_node(self, raw_node: dict) -> SourceName | None: - raw_concept = RawLibraryOfCongressConcept(raw_node) + raw_concept = RawLibraryOfCongressSKOSConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: return None @@ -25,7 +25,7 @@ def transform_node(self, raw_node: dict) -> SourceName | None: ) def extract_edges(self, raw_node: dict) -> Generator[SourceConceptRelatedTo]: - raw_concept = RawLibraryOfCongressConcept(raw_node) + raw_concept = RawLibraryOfCongressSKOSConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: return diff --git a/src/transformers/loc/skos/raw_concept.py b/src/transformers/loc/skos/raw_concept.py index a7e7d11ee8..49fb56e160 100644 --- a/src/transformers/loc/skos/raw_concept.py +++ b/src/transformers/loc/skos/raw_concept.py @@ -1,11 +1,11 @@ from typing import Literal -from transformers.loc.common import remove_id_prefix +from transformers.loc.common import remove_id_prefix, RawLibraryOfCongressConcept -class RawLibraryOfCongressConcept: +class RawLibraryOfCongressSKOSConcept(RawLibraryOfCongressConcept): def __init__(self, raw_concept: dict): - self.raw_concept = raw_concept + super().__init__(raw_concept) self._raw_concept_node = self._extract_concept_node() def _extract_concept_node(self) -> dict | None: @@ -52,10 +52,6 @@ def exclude(self) -> bool: return False - @property - def source_id(self) -> str: - return remove_id_prefix(self.raw_concept["@id"]) - @property def label(self) -> str: assert self._raw_concept_node is not None diff --git a/src/transformers/loc/mads/examples/sh2010105253.json b/tests/fixtures/sh2010105253.json similarity index 100% rename from src/transformers/loc/mads/examples/sh2010105253.json rename to tests/fixtures/sh2010105253.json diff --git a/tests/transformers/loc/mads/test_mads.py b/tests/transformers/loc/mads/test_mads.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/transformers/loc/mads/test_raw_mads_concept.py b/tests/transformers/loc/mads/test_raw_mads_concept.py new file mode 100644 index 0000000000..d1a71a525e --- /dev/null +++ b/tests/transformers/loc/mads/test_raw_mads_concept.py @@ -0,0 +1,16 @@ +import json + +from test_utils import load_fixture +from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept + +sh2010105253 = json.loads(load_fixture("sh2010105253.json")) + +def test_source_id() -> None: + """ + source_id is derived from the @id property in the source data. 
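+    For example, "/authorities/subjects/sh2010105253" becomes "sh2010105253".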
+ It is the unqualified version of the full id + """ + concept = RawLibraryOfCongressMADSConcept({ + "@id": "/authorities/subjects/sh2010105253" + }) + assert concept.source_id == "sh2010105253" \ No newline at end of file From ca362d2e7e5dae9ea6c3a3ea9088ec218f4dbce2 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 30 Jan 2025 12:08:22 +0000 Subject: [PATCH 193/310] Small adjustments to final query and comment --- notebooks/graph_exploration.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/notebooks/graph_exploration.ipynb b/notebooks/graph_exploration.ipynb index eb2ce5c808..edbdbcfc59 100644 --- a/notebooks/graph_exploration.ipynb +++ b/notebooks/graph_exploration.ipynb @@ -411,7 +411,7 @@ "id": "c517ea08", "metadata": {}, "source": [ - "Putting all of this together, we can retrieve all nodes linked via `SAME_AS` edges to a given concept ID, as well as any parent/broader/related concepts linked to any of these (if present). Note that an `OPTIONAL MATCH` clause is used below so we can retrieve all `SAME_AS` concepts regardless of whether they have any further connections or not." + "Putting all of this together, we can retrieve all nodes linked via `SAME_AS` edges to a given concept ID, as well as any parent/broader/related concepts linked to any of these (if present). Note that we can use multiple `MATCH` or `OPTIONAL MATCH` statements and choose more than one type of edge with the `|` operator." ] }, { @@ -425,8 +425,8 @@ "source": [ "%%oc -d label -l 25 -g source\n", "MATCH (sc1:SourceConcept {id: 'sh85117296'})-[r:SAME_AS*]->(sc2:SourceConcept)\n", - "OPTIONAL MATCH (sc2)-[p:HAS_PARENT|NARROWER_THAN|RELATED_TO]->(sc3)\n", - "OPTIONAL MATCH (sc3)-[s:SAME_AS]->(sc4:SourceConcept)\n", + "MATCH (sc2)-[p:HAS_PARENT|NARROWER_THAN|RELATED_TO]->(sc3:SourceConcept)\n", + "MATCH (sc3)-[s:SAME_AS]->(sc4:SourceConcept)\n", "RETURN *" ] }, @@ -441,7 +441,7 @@ ], "metadata": { "kernelspec": { - "display_name": "catgraph-exploration", + "display_name": "Python 3", "language": "python", "name": "python3" }, From 087ca7ab30f9bd211744c7b505450482b6dfe29f Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Thu, 30 Jan 2025 12:21:09 +0000 Subject: [PATCH 194/310] move source property to common\ --- src/transformers/loc/common.py | 12 +++++++ src/transformers/loc/mads/raw_concept.py | 3 -- src/transformers/loc/skos/raw_concept.py | 14 +-------- .../loc/mads/test_raw_mads_concept.py | 19 +++++++++--- tests/transformers/loc/test_common.py | 31 ++++++++++++++++++- 5 files changed, 58 insertions(+), 21 deletions(-) diff --git a/src/transformers/loc/common.py b/src/transformers/loc/common.py index 75351128ba..da10772bcd 100644 --- a/src/transformers/loc/common.py +++ b/src/transformers/loc/common.py @@ -1,3 +1,5 @@ +from typing import Literal + ID_PREFIXES_TO_REMOVE = ( "/authorities/subjects/", "http://id.loc.gov/authorities/subjects/", @@ -19,3 +21,13 @@ def __init__(self, raw_concept: dict): @property def source_id(self) -> str: return remove_id_prefix(self.raw_concept["@id"]) + + @property + def source(self) -> Literal["lc-subjects", "lc-names"]: + if "subjects" in self.raw_concept["@id"]: + return "lc-subjects" + + if "names" in self.raw_concept["@id"]: + return "lc-names" + + raise ValueError("Unknown concept type.") diff --git a/src/transformers/loc/mads/raw_concept.py b/src/transformers/loc/mads/raw_concept.py index c7ad658729..ba85778d91 100644 --- a/src/transformers/loc/mads/raw_concept.py +++ b/src/transformers/loc/mads/raw_concept.py @@ -1,9 +1,6 @@ -from 
typing import Literal - from transformers.loc.common import RawLibraryOfCongressConcept class RawLibraryOfCongressMADSConcept(RawLibraryOfCongressConcept): def __init__(self, raw_concept: dict): super().__init__(raw_concept) - diff --git a/src/transformers/loc/skos/raw_concept.py b/src/transformers/loc/skos/raw_concept.py index 49fb56e160..69aa02e6cb 100644 --- a/src/transformers/loc/skos/raw_concept.py +++ b/src/transformers/loc/skos/raw_concept.py @@ -1,6 +1,4 @@ -from typing import Literal - -from transformers.loc.common import remove_id_prefix, RawLibraryOfCongressConcept +from transformers.loc.common import RawLibraryOfCongressConcept, remove_id_prefix class RawLibraryOfCongressSKOSConcept(RawLibraryOfCongressConcept): @@ -120,13 +118,3 @@ def is_geographic(self) -> bool: notation_types = {item.get("@type") for item in notation} return "http://id.loc.gov/datatypes/codes/gac" in notation_types - - @property - def source(self) -> Literal["lc-subjects", "lc-names"]: - if "subjects" in self.raw_concept["@id"]: - return "lc-subjects" - - if "names" in self.raw_concept["@id"]: - return "lc-names" - - raise ValueError("Unknown concept type.") diff --git a/tests/transformers/loc/mads/test_raw_mads_concept.py b/tests/transformers/loc/mads/test_raw_mads_concept.py index d1a71a525e..b89475a45d 100644 --- a/tests/transformers/loc/mads/test_raw_mads_concept.py +++ b/tests/transformers/loc/mads/test_raw_mads_concept.py @@ -5,12 +5,23 @@ sh2010105253 = json.loads(load_fixture("sh2010105253.json")) + def test_source_id() -> None: """ source_id is derived from the @id property in the source data. It is the unqualified version of the full id """ - concept = RawLibraryOfCongressMADSConcept({ - "@id": "/authorities/subjects/sh2010105253" - }) - assert concept.source_id == "sh2010105253" \ No newline at end of file + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253"} + ) + assert concept.source_id == "sh2010105253" + + +def test_source() -> None: + """ + source is discovered by examining the prefix to the id. 
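+    For example, an id under "/authorities/subjects/" maps to "lc-subjects".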
+ """ + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253"} + ) + assert concept.source == "lc-subjects" diff --git a/tests/transformers/loc/test_common.py b/tests/transformers/loc/test_common.py index 7ab4f7dd71..be7aa87af4 100644 --- a/tests/transformers/loc/test_common.py +++ b/tests/transformers/loc/test_common.py @@ -1,5 +1,6 @@ -from transformers.loc.common import remove_id_prefix +import pytest +from transformers.loc.common import remove_id_prefix, RawLibraryOfCongressConcept def test_remove_prefix_noop() -> None: """ @@ -43,3 +44,31 @@ def test_remove_prefix_lookalikes() -> None: remove_id_prefix("https://id.loc.gov.uk/authorities/subjects/sh1234567890") == "https://id.loc.gov.uk/authorities/subjects/sh1234567890" ) + + +def test_source_subjects() -> None: + """ + Given an id with the prefix /authorities/subjects/, the source will be lc-subjects + """ + concept = RawLibraryOfCongressConcept( + {"@id": "/authorities/subjects/sh2010105253"} + ) + assert concept.source == "lc-subjects" + + +def test_source_names() -> None: + """ + Given an id with the prefix /authorities/subjects/, the source will be lc-subjects + """ + concept = RawLibraryOfCongressConcept( + {"@id": "/authorities/names/sh2010105253"} + ) + assert concept.source == "lc-names" + + +def test_source_invalid() -> None: + with (pytest.raises(ValueError)): + concept = RawLibraryOfCongressConcept( + {"@id": "authorities/childrensSubjects/sj2021051581"} + ) + concept.source From 419078c14a05be03e5dcfcd3e53ff32674536d57 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 30 Jan 2025 14:17:36 +0000 Subject: [PATCH 195/310] Add catalogue concepts transformer --- src/transformers/create_transformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py index f46e7ccc2c..2d79e4c0e3 100644 --- a/src/transformers/create_transformer.py +++ b/src/transformers/create_transformer.py @@ -1,6 +1,6 @@ from typing import Literal -from config import LOC_NAMES_URL, LOC_SUBJECT_HEADINGS_URL, MESH_URL +from config import LOC_NAMES_URL, LOC_SUBJECT_HEADINGS_URL, MESH_URL, CATALOGUE_SNAPSHOT_URL from .base_transformer import BaseTransformer, EntityType from .loc.concepts_transformer import LibraryOfCongressConceptsTransformer @@ -53,5 +53,7 @@ def create_transformer( return WikidataConceptsTransformer(entity_type, "mesh") if transformer_type == "wikidata_linked_mesh_locations": return WikidataLocationsTransformer(entity_type, "mesh") + if transformer_type == "catalogue_concepts": + return CatalogueConceptsTransformer(CATALOGUE_SNAPSHOT_URL) raise ValueError(f"Unknown transformer type: {transformer_type}") From f1787acc4ce3518746e982823a456ca67000a668 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 30 Jan 2025 15:59:27 +0000 Subject: [PATCH 196/310] Add tests --- tests/fixtures/catalogue_example.json | 3 ++ tests/test_catalogue_concepts_source.py | 27 ++++++++++++++++++ tests/test_catalogue_concepts_transformer.py | 30 ++++++++++++++++++++ tests/test_extractor.py | 10 +++++++ 4 files changed, 70 insertions(+) create mode 100644 tests/fixtures/catalogue_example.json create mode 100644 tests/test_catalogue_concepts_source.py create mode 100644 tests/test_catalogue_concepts_transformer.py diff --git a/tests/fixtures/catalogue_example.json b/tests/fixtures/catalogue_example.json new file mode 100644 index 0000000000..da0cd4b4d2 --- /dev/null +++ b/tests/fixtures/catalogue_example.json @@ -0,0 
+1,3 @@ +{"succeededBy": [], "production": [{"label": "[Toulouse (pr\u00e8s La. Place Royale)] : [De l'imprimerie de J.F. Desclassan], [1779]", "agents": [{"label": "[De l'imprimerie de J.F. Desclassan]", "type": "Agent"}], "dates": [{"label": "[1779]", "type": "Period"}], "type": "ProductionEvent", "places": [{"label": "[Toulouse (pr\u00e8s La. Place Royale)]", "type": "Place"}]}], "physicalDescription": "1 print (album leaf 31) : etching ; platemark 34.7 x 24 cm", "subjects": [{"label": "Human anatomy", "concepts": [{"id": "s6s24vd7", "identifiers": [{"identifierType": {"id": "lc-subjects", "label": "Library of Congress Subject Headings (LCSH)", "type": "IdentifierType"}, "value": "sh85004839", "type": "Identifier"}], "label": "Human anatomy", "type": "Concept"}], "identifiers": [{"identifierType": {"id": "lc-subjects", "label": "Library of Congress Subject Headings (LCSH)", "type": "IdentifierType"}, "value": "sh85004839", "type": "Identifier"}], "id": "s6s24vd7", "type": "Subject"}, {"label": "Skull", "concepts": [{"id": "wpusacr9", "identifiers": [{"identifierType": {"id": "lc-subjects", "label": "Library of Congress Subject Headings (LCSH)", "type": "IdentifierType"}, "value": "sh85123237", "type": "Identifier"}], "label": "Skull", "type": "Concept"}], "identifiers": [{"identifierType": {"id": "lc-subjects", "label": "Library of Congress Subject Headings (LCSH)", "type": "IdentifierType"}, "value": "sh85123237", "type": "Identifier"}], "id": "wpusacr9", "type": "Subject"}], "items": [{"id": "p93jh8gy", "identifiers": [{"identifierType": {"id": "sierra-system-number", "label": "Sierra system number", "type": "IdentifierType"}, "value": "i11464057", "type": "Identifier"}, {"identifierType": {"id": "sierra-identifier", "label": "Sierra identifier", "type": "IdentifierType"}, "value": "1146405", "type": "Identifier"}], "locations": [{"locationType": {"id": "closed-stores", "label": "Closed stores", "type": "LocationType"}, "label": "Closed stores", "accessConditions": [{"method": {"id": "online-request", "label": "Online request", "type": "AccessMethod"}, "status": {"id": "open", "label": "Open", "type": "AccessStatus"}, "type": "AccessCondition"}], "type": "PhysicalLocation"}, {"url": "https://iiif.wellcomecollection.org/image/V0008815/info.json", "credit": "Wellcome Collection", "license": {"id": "pdm", "label": "Public Domain Mark", "url": "https://creativecommons.org/share-your-work/public-domain/pdm/", "type": "License"}, "accessConditions": [{"method": {"id": "view-online", "label": "View online", "type": "AccessMethod"}, "status": {"id": "open", "label": "Open", "type": "AccessStatus"}, "type": "AccessCondition"}], "locationType": {"id": "iiif-image", "label": "IIIF Image API", "type": "LocationType"}, "type": "DigitalLocation"}], "type": "Item"}], "designation": [], "workType": {"id": "k", "label": "Pictures", "type": "Format"}, "identifiers": [{"identifierType": {"id": "sierra-system-number", "label": "Sierra system number", "type": "IdentifierType"}, "value": "b15697551", "type": "Identifier"}, {"identifierType": {"id": "sierra-identifier", "label": "Sierra identifier", "type": "IdentifierType"}, "value": "1569755", "type": "Identifier"}, {"identifierType": {"id": "iconographic-number", "label": "Iconographic number", "type": "IdentifierType"}, "value": "569755i", "type": "Identifier"}, {"identifierType": {"id": "miro-image-number", "label": "Miro image number", "type": "IdentifierType"}, "value": "V0008815", "type": "Identifier"}], "thumbnail": {"url": 
"https://iiif.wellcomecollection.org/image/V0008815/full/300,/0/default.jpg", "license": {"id": "pdm", "label": "Public Domain Mark", "url": "https://creativecommons.org/share-your-work/public-domain/pdm/", "type": "License"}, "accessConditions": [], "locationType": {"id": "thumbnail-image", "label": "Thumbnail image", "type": "LocationType"}, "type": "DigitalLocation"}, "formerFrequency": [], "alternativeTitles": [], "id": "m4u8drnu", "languages": [{"id": "fre", "label": "French", "type": "Language"}], "precededBy": [], "partOf": [{"title": "Nouveau recueil d'ost\u00e9ologie et de myologie.", "totalParts": 0, "totalDescendentParts": 0, "type": "Series"}], "genres": [{"label": "Etchings", "concepts": [{"id": "yfqryj26", "identifiers": [{"identifierType": {"id": "label-derived", "label": "Identifier derived from the label of the referent", "type": "IdentifierType"}, "value": "etchings", "type": "Identifier"}], "label": "Etchings", "type": "Genre"}], "type": "Genre"}], "lettering": "Martin inc.\n\nPage bears number 31 inscribed in pencil, in top right-hand corner\n\nOutline diagram of lower jaw is lettered for a key", "notes": [{"contents": ["Wellcome Collection 569755i"], "noteType": {"id": "reference", "label": "Reference", "type": "NoteType"}, "type": "Note"}], "holdings": [], "title": "Human skull, seen from below, with details of the lower jaw bone. Etching by Martin after J. Gamelin, 1778/1779.", "type": "Work", "contributors": [{"agent": {"id": "uykuavkt", "identifiers": [{"identifierType": {"id": "label-derived", "label": "Identifier derived from the label of the referent", "type": "IdentifierType"}, "value": "gamelin, jacques, 1739-1803", "type": "Identifier"}], "label": "Gamelin, Jacques, 1739-1803.", "type": "Person"}, "roles": [], "primary": true, "type": "Contributor"}, {"agent": {"id": "h2peyydc", "identifiers": [{"identifierType": {"id": "lc-names", "label": "Library of Congress Name authority records", "type": "IdentifierType"}, "value": "nr90012131", "type": "Identifier"}], "label": "Mart\u00edn.", "type": "Person"}, "roles": [], "primary": false, "type": "Contributor"}], "images": [{"id": "e3xtc9ne", "type": "Image"}], "availabilities": [{"id": "closed-stores", "label": "Closed stores", "type": "Availability"}, {"id": "online", "label": "Online", "type": "Availability"}], "parts": [], "referenceNumber": "569755i"} +{"succeededBy": [], "production": [{"label": "[Toulouse (pr\u00e8s La. Place Royale)] : [De l'imprimerie de J.F. Desclassan], [1779]", "agents": [{"label": "[De l'imprimerie de J.F. Desclassan]", "type": "Agent"}], "dates": [{"label": "[1779]", "type": "Period"}], "type": "ProductionEvent", "places": [{"label": "[Toulouse (pr\u00e8s La. 
Place Royale)]", "type": "Place"}]}], "physicalDescription": "1 print (album leaf 4) : etching ; platemark 35.2 x 25.1 cm", "subjects": [{"label": "Human anatomy", "concepts": [{"id": "s6s24vd7", "identifiers": [{"identifierType": {"id": "lc-subjects", "label": "Library of Congress Subject Headings (LCSH)", "type": "IdentifierType"}, "value": "sh85004839", "type": "Identifier"}], "label": "Human anatomy", "type": "Concept"}], "identifiers": [{"identifierType": {"id": "lc-subjects", "label": "Library of Congress Subject Headings (LCSH)", "type": "IdentifierType"}, "value": "sh85004839", "type": "Identifier"}], "id": "s6s24vd7", "type": "Subject"}], "items": [{"id": "p93jh8gy", "identifiers": [{"identifierType": {"id": "sierra-system-number", "label": "Sierra system number", "type": "IdentifierType"}, "value": "i11464057", "type": "Identifier"}, {"identifierType": {"id": "sierra-identifier", "label": "Sierra identifier", "type": "IdentifierType"}, "value": "1146405", "type": "Identifier"}], "locations": [{"locationType": {"id": "closed-stores", "label": "Closed stores", "type": "LocationType"}, "label": "Closed stores", "accessConditions": [{"method": {"id": "online-request", "label": "Online request", "type": "AccessMethod"}, "status": {"id": "open", "label": "Open", "type": "AccessStatus"}, "type": "AccessCondition"}], "type": "PhysicalLocation"}, {"url": "https://iiif.wellcomecollection.org/image/V0008796/info.json", "credit": "Wellcome Collection", "license": {"id": "pdm", "label": "Public Domain Mark", "url": "https://creativecommons.org/share-your-work/public-domain/pdm/", "type": "License"}, "accessConditions": [{"method": {"id": "view-online", "label": "View online", "type": "AccessMethod"}, "status": {"id": "open", "label": "Open", "type": "AccessStatus"}, "type": "AccessCondition"}], "locationType": {"id": "iiif-image", "label": "IIIF Image API", "type": "LocationType"}, "type": "DigitalLocation"}], "type": "Item"}], "designation": [], "workType": {"id": "k", "label": "Pictures", "type": "Format"}, "identifiers": [{"identifierType": {"id": "sierra-system-number", "label": "Sierra system number", "type": "IdentifierType"}, "value": "b15697290", "type": "Identifier"}, {"identifierType": {"id": "sierra-identifier", "label": "Sierra identifier", "type": "IdentifierType"}, "value": "1569729", "type": "Identifier"}, {"identifierType": {"id": "iconographic-number", "label": "Iconographic number", "type": "IdentifierType"}, "value": "569729i", "type": "Identifier"}, {"identifierType": {"id": "miro-image-number", "label": "Miro image number", "type": "IdentifierType"}, "value": "V0008796", "type": "Identifier"}], "thumbnail": {"url": "https://iiif.wellcomecollection.org/image/V0008796/full/300,/0/default.jpg", "license": {"id": "pdm", "label": "Public Domain Mark", "url": "https://creativecommons.org/share-your-work/public-domain/pdm/", "type": "License"}, "accessConditions": [], "locationType": {"id": "thumbnail-image", "label": "Thumbnail image", "type": "LocationType"}, "type": "DigitalLocation"}, "formerFrequency": [], "alternativeTitles": [], "id": "ydz8wd5r", "languages": [{"id": "fre", "label": "French", "type": "Language"}], "precededBy": [], "partOf": [{"title": "Nouveau recueil d'ost\u00e9ologie et de myologie.", "totalParts": 0, "totalDescendentParts": 0, "type": "Series"}], "genres": [{"label": "Etchings", "concepts": [{"id": "yfqryj26", "identifiers": [{"identifierType": {"id": "label-derived", "label": "Identifier derived from the label of the referent", "type": 
"IdentifierType"}, "value": "etchings", "type": "Identifier"}], "label": "Etchings", "type": "Genre"}], "type": "Genre"}], "lettering": "Permis d'imprimer Lartigue juge-mage\n\nPage bears number 4 inscribed in pencil, in top right-hand corner. Etching includes imprimatur signed by Andr\u00e9 Lartigue (b. 1723), juge-mage of Toulouse", "notes": [{"contents": ["Wellcome Collection 569729i"], "noteType": {"id": "reference", "label": "Reference", "type": "NoteType"}, "type": "Note"}], "holdings": [], "title": "A winged skeleton holding a drawing of a seated model, with human bones and skulls on the table and floor beside him. Etching by or after J. Gamelin, 1778/9.", "type": "Work", "contributors": [{"agent": {"id": "uykuavkt", "identifiers": [{"identifierType": {"id": "label-derived", "label": "Identifier derived from the label of the referent", "type": "IdentifierType"}, "value": "gamelin, jacques, 1739-1803", "type": "Identifier"}], "label": "Gamelin, Jacques, 1739-1803.", "type": "Person"}, "roles": [], "primary": true, "type": "Contributor"}], "images": [{"id": "q3vw2bvq", "type": "Image"}], "availabilities": [{"id": "closed-stores", "label": "Closed stores", "type": "Availability"}, {"id": "online", "label": "Online", "type": "Availability"}], "parts": [], "referenceNumber": "569729i"} +{"succeededBy": [], "production": [{"label": "[Toulouse (pr\u00e8s La. Place Royale)] : [De l'imprimerie de J.F. Desclassan], [1779]", "agents": [{"label": "[De l'imprimerie de J.F. Desclassan]", "type": "Agent"}], "dates": [{"label": "[1779]", "type": "Period"}], "type": "ProductionEvent", "places": [{"label": "[Toulouse (pr\u00e8s La. Place Royale)]", "type": "Place"}]}], "physicalDescription": "1 print (album leaf 16) : etching ; platemark 39.2 x 28.3 cm", "subjects": [{"label": "Human anatomy", "concepts": [{"id": "s6s24vd7", "identifiers": [{"identifierType": {"id": "lc-subjects", "label": "Library of Congress Subject Headings (LCSH)", "type": "IdentifierType"}, "value": "sh85004839", "type": "Identifier"}], "label": "Human anatomy", "type": "Concept"}], "identifiers": [{"identifierType": {"id": "lc-subjects", "label": "Library of Congress Subject Headings (LCSH)", "type": "IdentifierType"}, "value": "sh85004839", "type": "Identifier"}], "id": "s6s24vd7", "type": "Subject"}], "items": [{"id": "p93jh8gy", "identifiers": [{"identifierType": {"id": "sierra-system-number", "label": "Sierra system number", "type": "IdentifierType"}, "value": "i11464057", "type": "Identifier"}, {"identifierType": {"id": "sierra-identifier", "label": "Sierra identifier", "type": "IdentifierType"}, "value": "1146405", "type": "Identifier"}], "locations": [{"locationType": {"id": "closed-stores", "label": "Closed stores", "type": "LocationType"}, "label": "Closed stores", "accessConditions": [{"method": {"id": "online-request", "label": "Online request", "type": "AccessMethod"}, "status": {"id": "open", "label": "Open", "type": "AccessStatus"}, "type": "AccessCondition"}], "type": "PhysicalLocation"}, {"url": "https://iiif.wellcomecollection.org/image/V0008806/info.json", "credit": "Wellcome Collection", "license": {"id": "pdm", "label": "Public Domain Mark", "url": "https://creativecommons.org/share-your-work/public-domain/pdm/", "type": "License"}, "accessConditions": [{"method": {"id": "view-online", "label": "View online", "type": "AccessMethod"}, "status": {"id": "open", "label": "Open", "type": "AccessStatus"}, "type": "AccessCondition"}], "locationType": {"id": "iiif-image", "label": "IIIF Image API", "type": 
"LocationType"}, "type": "DigitalLocation"}], "type": "Item"}], "designation": [], "workType": {"id": "k", "label": "Pictures", "type": "Format"}, "identifiers": [{"identifierType": {"id": "sierra-system-number", "label": "Sierra system number", "type": "IdentifierType"}, "value": "b15697423", "type": "Identifier"}, {"identifierType": {"id": "sierra-identifier", "label": "Sierra identifier", "type": "IdentifierType"}, "value": "1569742", "type": "Identifier"}, {"identifierType": {"id": "iconographic-number", "label": "Iconographic number", "type": "IdentifierType"}, "value": "569742i", "type": "Identifier"}, {"identifierType": {"id": "miro-image-number", "label": "Miro image number", "type": "IdentifierType"}, "value": "V0008806", "type": "Identifier"}], "thumbnail": {"url": "https://iiif.wellcomecollection.org/image/V0008806/full/300,/0/default.jpg", "license": {"id": "pdm", "label": "Public Domain Mark", "url": "https://creativecommons.org/share-your-work/public-domain/pdm/", "type": "License"}, "accessConditions": [], "locationType": {"id": "thumbnail-image", "label": "Thumbnail image", "type": "LocationType"}, "type": "DigitalLocation"}, "formerFrequency": [], "alternativeTitles": [], "id": "f33w7jru", "languages": [{"id": "fre", "label": "French", "type": "Language"}], "precededBy": [], "partOf": [{"title": "Nouveau recueil d'ost\u00e9ologie et de myologie.", "totalParts": 0, "totalDescendentParts": 0, "type": "Series"}], "genres": [{"label": "Etchings", "concepts": [{"id": "yfqryj26", "identifiers": [{"identifierType": {"id": "label-derived", "label": "Identifier derived from the label of the referent", "type": "IdentifierType"}, "value": "etchings", "type": "Identifier"}], "label": "Etchings", "type": "Genre"}], "type": "Genre"}], "lettering": "O quanto ci deve dare pensiere ; Laval\u00e9e inc. 1778 ; Gamelin fec.\n\nPage bears number 16 inscribed in pencil, in top right-hand corner", "notes": [{"contents": ["Wellcome Collection 569742i"], "noteType": {"id": "reference", "label": "Reference", "type": "NoteType"}, "type": "Note"}], "holdings": [], "title": "Four skeletons with musical instruments, seated in various positions: one is singing, another holds a flute, while two others are shown playing the violin and cello. Etching by Laval\u00e9e after J. 
Gamelin, 1778.", "type": "Work", "contributors": [{"agent": {"id": "uykuavkt", "identifiers": [{"identifierType": {"id": "label-derived", "label": "Identifier derived from the label of the referent", "type": "IdentifierType"}, "value": "gamelin, jacques, 1739-1803", "type": "Identifier"}], "label": "Gamelin, Jacques, 1739-1803.", "type": "Person"}, "roles": [], "primary": true, "type": "Contributor"}, {"agent": {"id": "kpeywdvq", "identifiers": [{"identifierType": {"id": "label-derived", "label": "Identifier derived from the label of the referent", "type": "IdentifierType"}, "value": "lavalee", "type": "Identifier"}], "label": "Laval\u00e9e.", "type": "Person"}, "roles": [], "primary": false, "type": "Contributor"}], "images": [{"id": "vyyvxncx", "type": "Image"}], "availabilities": [{"id": "closed-stores", "label": "Closed stores", "type": "Availability"}, {"id": "online", "label": "Online", "type": "Availability"}], "parts": [], "referenceNumber": "569742i"} diff --git a/tests/test_catalogue_concepts_source.py b/tests/test_catalogue_concepts_source.py new file mode 100644 index 0000000000..edb9703c2c --- /dev/null +++ b/tests/test_catalogue_concepts_source.py @@ -0,0 +1,27 @@ +from test_mocks import MockRequest +from test_utils import load_fixture + +from sources.catalogue.concepts_source import CatalogueConceptsSource + + +def test_catalogue_concepts_source() -> None: + test_url = "https://example.com" + MockRequest.mock_responses( + [ + { + "method": "GET", + "url": test_url, + "status_code": 200, + "json_data": None, + "content_bytes": load_fixture("catalogue_example.json"), + "params": None, + } + ] + ) + + catalogue_concepts_source = CatalogueConceptsSource(test_url) + stream_result = list(catalogue_concepts_source.stream_raw()) + + # Do some simple checks on mesh source decoding based on known data + assert len(stream_result) == 12 + assert stream_result[0]["label"] == "Human anatomy" diff --git a/tests/test_catalogue_concepts_transformer.py b/tests/test_catalogue_concepts_transformer.py new file mode 100644 index 0000000000..00b9145c92 --- /dev/null +++ b/tests/test_catalogue_concepts_transformer.py @@ -0,0 +1,30 @@ +from test_mocks import MockRequest +from test_utils import load_fixture + +from transformers.catalogue.concepts_transformer import CatalogueConceptsTransformer + + +def test_mesh_concepts_transformer() -> None: + test_url = "https://example.com" + + MockRequest.mock_responses( + [ + { + "method": "GET", + "url": test_url, + "status_code": 200, + "json_data": None, + "content_bytes": load_fixture("catalogue_example.json"), + "params": None, + } + ] + ) + catalogue_concepts_transformer = CatalogueConceptsTransformer(test_url) + + # test transform_node + nodes = list( + catalogue_concepts_transformer.stream(entity_type="nodes", query_chunk_size=1) + ) + assert len(list(nodes)) == 12 + assert nodes[0][0].id == "s6s24vd7" + assert nodes[0][0].label == "Human anatomy" diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 7e1dba6c8f..5117787863 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -10,6 +10,7 @@ LOC_SUBJECT_HEADINGS_URL, MESH_URL, WIKIDATA_SPARQL_URL, + CATALOGUE_SNAPSHOT_URL ) from extractor import LambdaEvent, lambda_handler from transformers.base_transformer import EntityType, StreamDestination @@ -94,6 +95,14 @@ def mock_requests_lookup_table( "json_data": {"results": {"bindings": []}}, } ) + elif transformer_type == "catalogue_concepts": + mocked_responses.append( + { + "method": "GET", + "url": CATALOGUE_SNAPSHOT_URL, + 
"content_bytes": load_fixture("catalogue_example.json"), + } + ) return mocked_responses @@ -147,6 +156,7 @@ def test_lambda_handler( "wikidata_linked_loc_locations": [WIKIDATA_SPARQL_URL], "wikidata_linked_mesh_concepts": [WIKIDATA_SPARQL_URL], "wikidata_linked_mesh_locations": [WIKIDATA_SPARQL_URL], + "catalogue_concepts": [CATALOGUE_SNAPSHOT_URL] } assert transformer_type in transformer_types From 5a2f9094fe9f9930649bff614e659f535161816f Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Thu, 30 Jan 2025 16:15:16 +0000 Subject: [PATCH 197/310] Apply auto-formatting rules --- src/config.py | 4 +++- src/transformers/create_transformer.py | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/config.py b/src/config.py index ab680cca4f..f92363a32e 100644 --- a/src/config.py +++ b/src/config.py @@ -9,4 +9,6 @@ LOC_NAMES_URL = "https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" MESH_URL = "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.gz" WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql" -CATALOGUE_SNAPSHOT_URL = "https://data.wellcomecollection.org/catalogue/v2/works.json.gz" +CATALOGUE_SNAPSHOT_URL = ( + "https://data.wellcomecollection.org/catalogue/v2/works.json.gz" +) diff --git a/src/transformers/create_transformer.py b/src/transformers/create_transformer.py index 8bab2b3aa0..141d86b31d 100644 --- a/src/transformers/create_transformer.py +++ b/src/transformers/create_transformer.py @@ -1,6 +1,11 @@ from typing import Literal -from config import LOC_NAMES_URL, LOC_SUBJECT_HEADINGS_URL, MESH_URL, CATALOGUE_SNAPSHOT_URL +from config import ( + CATALOGUE_SNAPSHOT_URL, + LOC_NAMES_URL, + LOC_SUBJECT_HEADINGS_URL, + MESH_URL, +) from .base_transformer import BaseTransformer, EntityType from .catalogue.concepts_transformer import CatalogueConceptsTransformer From 4f2116217358b3aa67cfe3bb988445ab25862aff Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 30 Jan 2025 16:40:07 +0000 Subject: [PATCH 198/310] Add checks for other node properties --- tests/test_catalogue_concepts_transformer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_catalogue_concepts_transformer.py b/tests/test_catalogue_concepts_transformer.py index 00b9145c92..6999b2580e 100644 --- a/tests/test_catalogue_concepts_transformer.py +++ b/tests/test_catalogue_concepts_transformer.py @@ -28,3 +28,5 @@ def test_mesh_concepts_transformer() -> None: assert len(list(nodes)) == 12 assert nodes[0][0].id == "s6s24vd7" assert nodes[0][0].label == "Human anatomy" + assert nodes[0][0].type == "Concept" + assert nodes[0][0].source == "lc-subjects" From 676dbf992405318d192110764a0fb595233cfeb1 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Thu, 30 Jan 2025 16:48:31 +0000 Subject: [PATCH 199/310] Add installation comment --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 160b87ff69..25fbec9945 100644 --- a/README.md +++ b/README.md @@ -134,7 +134,7 @@ print(result) ### AWS Graph Notebook -Additionally, it is possible to connect to the cluster using [AWS graph notebook](https://github.com/aws/graph-notebook).The most straightforward option to do this locally is using [JupyterLab](https://jupyter.org/). 
To install `graph-notebook` and `jupyterlab` (note that this requires Python 3.9.x-3.10.14): +Additionally, it is possible to connect to the cluster using [AWS graph notebook](https://github.com/aws/graph-notebook).The most straightforward option to do this locally is using [JupyterLab](https://jupyter.org/). To make this work, you need to set this up in a different virtual environment from the one in this project (this is because `graph-notebook` currently requires Python 3.9.x-3.10.14). Once you have created a new environment with the correct Python version, install the following: ``` # install graph-notebook From 7c6df7ffbd1256dfa820921cedeaefce164bbcb6 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Thu, 30 Jan 2025 16:58:56 +0000 Subject: [PATCH 200/310] extract exclude to the common base class --- src/transformers/loc/common.py | 31 ++ src/transformers/loc/skos/raw_concept.py | 27 -- .../loc_subjects_diverse_example.jsonld | 3 + tests/fixtures/mads_781_pair.ndjson | 2 + ...05253.json => mads_composite_concept.json} | 0 tests/fixtures/mads_deprecated_concept.json | 83 ++++ tests/fixtures/mads_geographic_concept.json | 354 ++++++++++++++++++ tests/fixtures/mads_variant_concept.json | 287 ++++++++++++++ tests/fixtures/skos_deprecated_concept.json | 56 +++ .../loc/mads/test_raw_mads_concept.py | 2 +- 10 files changed, 817 insertions(+), 28 deletions(-) create mode 100644 tests/fixtures/loc_subjects_diverse_example.jsonld create mode 100644 tests/fixtures/mads_781_pair.ndjson rename tests/fixtures/{sh2010105253.json => mads_composite_concept.json} (100%) create mode 100644 tests/fixtures/mads_deprecated_concept.json create mode 100644 tests/fixtures/mads_geographic_concept.json create mode 100644 tests/fixtures/mads_variant_concept.json create mode 100644 tests/fixtures/skos_deprecated_concept.json diff --git a/src/transformers/loc/common.py b/src/transformers/loc/common.py index da10772bcd..86e1c34c40 100644 --- a/src/transformers/loc/common.py +++ b/src/transformers/loc/common.py @@ -17,6 +17,10 @@ def remove_id_prefix(raw_id: str) -> str: class RawLibraryOfCongressConcept: def __init__(self, raw_concept: dict): self.raw_concept = raw_concept + self._raw_concept_node = self._extract_concept_node() + + def _extract_concept_node(self): + pass @property def source_id(self) -> str: @@ -31,3 +35,30 @@ def source(self) -> Literal["lc-subjects", "lc-names"]: return "lc-names" raise ValueError("Unknown concept type.") + + @staticmethod + def _extract_label(raw_label: str | dict[str, str] | list[str]) -> str: + # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. + if isinstance(raw_label, str): + return raw_label + + # In cases where an LoC Name has multiple labels written using different writing systems, labels are returned + # as a list. When this happens, we extract the first item in the list, which always stores the Latin script + # version of the label as a string. + if isinstance(raw_label, list): + assert isinstance(raw_label[0], str) + return raw_label[0] + + return raw_label["@value"] + + def exclude(self) -> bool: + """Returns True if the concept should be excluded from the graph.""" + if self._raw_concept_node is None: + return True + + # Remove concepts whose IDs have the "-781" suffix. They are duplicates of concepts with non-suffixed IDs. + # The suffix represents the fact that the concept in question is part of the LCSH - Geographic collection. 
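+        # For example, sh2007009587-781 ("Netherlands--Afsluitdijk") duplicates sh2007009587 ("Afsluitdijk (Netherlands)").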
+ if self.source_id.endswith("-781"): + return True + + return False diff --git a/src/transformers/loc/skos/raw_concept.py b/src/transformers/loc/skos/raw_concept.py index 69aa02e6cb..13ce5b38fe 100644 --- a/src/transformers/loc/skos/raw_concept.py +++ b/src/transformers/loc/skos/raw_concept.py @@ -23,33 +23,6 @@ def _extract_concept_node(self) -> dict | None: return concept_node - @staticmethod - def _extract_label(raw_label: str | dict[str, str] | list[str]) -> str: - # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. - if isinstance(raw_label, str): - return raw_label - - # In cases where an LoC Name has multiple labels written using different writing systems, labels are returned - # as a list. When this happens, we extract the first item in the list, which always stores the Latin script - # version of the label as a string. - if isinstance(raw_label, list): - assert isinstance(raw_label[0], str) - return raw_label[0] - - return raw_label["@value"] - - def exclude(self) -> bool: - """Returns True if the concept should be excluded from the graph.""" - if self._raw_concept_node is None: - return True - - # Remove concepts whose IDs have the "-781" suffix. They are duplicates of concepts with non-suffixed IDs. - # The suffix represents the fact that the concept in question is part of the LCSH - Geographic collection. - if self.source_id.endswith("-781"): - return True - - return False - @property def label(self) -> str: assert self._raw_concept_node is not None diff --git a/tests/fixtures/loc_subjects_diverse_example.jsonld b/tests/fixtures/loc_subjects_diverse_example.jsonld new file mode 100644 index 0000000000..bb4ac019d2 --- /dev/null +++ b/tests/fixtures/loc_subjects_diverse_example.jsonld @@ -0,0 +1,3 @@ +{"@context": "http://v3/authorities/subjects/context.json", "@graph": [{"@id": "http://id.loc.gov/authorities/subjects/sh85040229", "@type": "skos:Concept", "skos:prefLabel": {"@language": "en", "@value": "Dwellings--England"}}, {"@id": "http://id.loc.gov/authorities/subjects/sh2017003861", "@type": "skos:Concept", "skos:broader": [{"@id": "http://id.loc.gov/authorities/subjects/sh85040229"}, {"@id": "http://id.loc.gov/authorities/subjects/sh85053109"}, {"@id": "http://id.loc.gov/authorities/subjects/sh92006359"}], "skos:changeNote": [{"@id": "_:n2de234bdaf9048a8bb64367927c52189b1"}, {"@id": "_:n2de234bdaf9048a8bb64367927c52189b2"}], "skos:editorialNote": "[This heading is not valid for use as a geographic subdivision.]", "skos:inScheme": {"@id": "http://id.loc.gov/authorities/subjects"}, "skos:notation": {"@type": "http://id.loc.gov/datatypes/codes/gac", "@value": "e-uk-en"}, "skos:prefLabel": {"@language": "en", "@value": "Caversham Park (Reading, England)"}}, {"@id": "_:n2de234bdaf9048a8bb64367927c52189b1", "@type": "cs:ChangeSet", "cs:changeReason": "revised", "cs:createdDate": {"@type": "xsd:dateTime", "@value": "2017-10-13T11:41:51"}, "cs:creatorName": {"@id": "http://id.loc.gov/vocabulary/organizations/dlc"}, "cs:subjectOfChange": {"@id": "http://id.loc.gov/authorities/subjects/sh2017003861"}}, {"@id": "_:n2de234bdaf9048a8bb64367927c52189b2", "@type": "cs:ChangeSet", "cs:changeReason": "new", "cs:createdDate": {"@type": "xsd:dateTime", "@value": "2017-07-06T00:00:00"}, "cs:creatorName": {"@id": "http://id.loc.gov/vocabulary/organizations/ukcu"}, "cs:subjectOfChange": {"@id": "http://id.loc.gov/authorities/subjects/sh2017003861"}}, {"@id": "http://id.loc.gov/authorities/subjects/sh85053109", "@type": "skos:Concept", 
"skos:prefLabel": {"@language": "en", "@value": "Gardens--England"}}, {"@id": "http://id.loc.gov/authorities/subjects/sh92006359", "@type": "skos:Concept", "skos:prefLabel": {"@language": "en", "@value": "Office buildings--England"}}], "@id": "/authorities/subjects/sh2017003861"} +{"@context": "http://v3/authorities/subjects/context.json", "@graph": [{"@id": "http://id.loc.gov/authorities/subjects/sh2001009772", "@type": "skosxl:Label", "skos:changeNote": [{"@id": "_:ncbe2fbbca4484db9968d00b56091d840b1"}, {"@id": "_:ncbe2fbbca4484db9968d00b56091d840b2"}, {"@language": "en", "@value": "This authority record has been deleted because he subject heading is covered by the name heading {Gummidge, Worzel (Fictitious character)} (DLC)nb2020011266"}], "skosxl:literalForm": {"@language": "en", "@value": "Worzel Gummidge (Fictitious character)"}}, {"@id": "_:ncbe2fbbca4484db9968d00b56091d840b1", "@type": "cs:ChangeSet", "cs:changeReason": "new", "cs:createdDate": {"@type": "xsd:dateTime", "@value": "2001-12-06T00:00:00"}, "cs:creatorName": {"@id": "http://id.loc.gov/vocabulary/organizations/uk"}, "cs:subjectOfChange": {"@id": "http://id.loc.gov/authorities/subjects/sh2001009772"}}, {"@id": "_:ncbe2fbbca4484db9968d00b56091d840b2", "@type": "cs:ChangeSet", "cs:changeReason": "deprecated", "cs:createdDate": {"@type": "xsd:dateTime", "@value": "2021-04-15T12:19:28"}, "cs:creatorName": {"@id": "http://id.loc.gov/vocabulary/organizations/uk"}, "cs:subjectOfChange": {"@id": "http://id.loc.gov/authorities/subjects/sh2001009772"}}], "@id": "/authorities/subjects/sh2001009772"} +{ "@context": "http://v3/authorities/subjects/context.json", "@graph": [ { "@id": "http://id.loc.gov/authorities/subjects/sh2006006405", "@type": "skos:Concept", "skos:prefLabel": { "@language": "en", "@value": "Object-oriented programming languages" } }, { "@id": "http://id.loc.gov/authorities/subjects/sh2006007256", "@type": "skos:Concept", "skos:prefLabel": { "@language": "en", "@value": "Scripting languages (Computer science)" } }, { "@id": "http://id.loc.gov/authorities/subjects/sh00000011", "@type": "skos:Concept", "skos:broader": [ { "@id": "http://id.loc.gov/authorities/subjects/sh2006006405" }, { "@id": "http://id.loc.gov/authorities/subjects/sh2006007256" }, { "@id": "http://id.loc.gov/authorities/subjects/sh2007005223" } ], "skos:changeNote": [ { "@id": "_:n6bdc44981812457dbabe82d08ead45aab1" }, { "@id": "_:n6bdc44981812457dbabe82d08ead45aab2" } ], "skos:inScheme": { "@id": "http://id.loc.gov/authorities/subjects" }, "skos:prefLabel": { "@language": "en", "@value": "ActionScript (Computer program language)" } }, { "@id": "_:n6bdc44981812457dbabe82d08ead45aab1", "@type": "cs:ChangeSet", "cs:changeReason": "new", "cs:createdDate": { "@type": "xsd:dateTime", "@value": "2000-09-27T00:00:00" }, "cs:creatorName": { "@id": "http://id.loc.gov/vocabulary/organizations/dlc" }, "cs:subjectOfChange": { "@id": "http://id.loc.gov/authorities/subjects/sh00000011" } }, { "@id": "_:n6bdc44981812457dbabe82d08ead45aab2", "@type": "cs:ChangeSet", "cs:changeReason": "revised", "cs:createdDate": { "@type": "xsd:dateTime", "@value": "2007-10-12T07:53:10" }, "cs:creatorName": { "@id": "http://id.loc.gov/vocabulary/organizations/abau" }, "cs:subjectOfChange": { "@id": "http://id.loc.gov/authorities/subjects/sh00000011" } }, { "@id": "http://id.loc.gov/authorities/subjects/sh2007005223", "@type": "skos:Concept", "skos:prefLabel": { "@language": "en", "@value": "Domain-specific programming languages" } } ], "@id": "/authorities/subjects/sh00000011" } 
\ No newline at end of file diff --git a/tests/fixtures/mads_781_pair.ndjson b/tests/fixtures/mads_781_pair.ndjson new file mode 100644 index 0000000000..849624405c --- /dev/null +++ b/tests/fixtures/mads_781_pair.ndjson @@ -0,0 +1,2 @@ +{"@context": "http://v3/authorities/subjects/context.json", "@graph": [{"@id": "http://id.loc.gov/authorities/subjects/sh2007009966", "@type": ["madsrdf:Authority", "madsrdf:ComplexSubject"], "madsrdf:authoritativeLabel": {"@language": "en", "@value": "Dikes (Engineering)--Netherlands"}, "madsrdf:componentList": {"@list": [{"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b10"}, {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b13"}]}}, {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b10", "@type": ["madsrdf:Authority", "madsrdf:Topic"], "madsrdf:authoritativeLabel": {"@language": "en", "@value": "Dikes (Engineering)"}, "madsrdf:elementList": {"@list": [{"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b11"}]}}, {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b11", "@type": "madsrdf:TopicElement", "madsrdf:elementValue": {"@language": "en", "@value": "Dikes (Engineering)"}}, {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b13", "@type": ["madsrdf:Authority", "madsrdf:Geographic"], "madsrdf:authoritativeLabel": {"@language": "en", "@value": "Netherlands"}, "madsrdf:elementList": {"@list": [{"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b14"}]}}, {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b14", "@type": "madsrdf:GeographicElement", "madsrdf:elementValue": {"@language": "en", "@value": "Netherlands"}}, {"@id": "http://id.loc.gov/authorities/subjects/sh2007009587", "@type": ["madsrdf:Authority", "madsrdf:Geographic"], "bflc:marcKey": "151 $aAfsluitdijk (Netherlands)", "identifiers:lccn": "sh2007009587", "madsrdf:adminMetadata": [{"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b1"}, {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b2"}], "madsrdf:authoritativeLabel": {"@language": "en", "@value": "Afsluitdijk (Netherlands)"}, "madsrdf:code": {"@type": "http://id.loc.gov/datatypes/codes/gac", "@value": "e-ne---"}, "madsrdf:elementList": {"@list": [{"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b3"}]}, "madsrdf:hasBroaderAuthority": {"@id": "http://id.loc.gov/authorities/subjects/sh2007009966"}, "madsrdf:hasSource": [{"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b5"}, {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b6"}], "madsrdf:hasVariant": {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b7"}, "madsrdf:isMemberOfMADSCollection": [{"@id": "http://id.loc.gov/authorities/subjects/collection_LCSHAuthorizedHeadings"}, {"@id": "http://id.loc.gov/authorities/subjects/collection_LCSH_General"}], "madsrdf:isMemberOfMADSScheme": {"@id": "http://id.loc.gov/authorities/subjects"}, "owl:sameAs": [{"@id": "http://id.loc.gov/authorities/sh2007009587#concept"}, {"@id": "info:lc/authorities/sh2007009587"}]}, {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b1", "@type": "ri:RecordInfo", "ri:languageOfCataloging": {"@id": "http://id.loc.gov/vocabulary/iso639-2/eng"}, "ri:recordChangeDate": {"@type": "xsd:dateTime", "@value": "2007-11-30T07:54:25"}, "ri:recordContentSource": {"@id": "http://id.loc.gov/vocabulary/organizations/dlc"}, "ri:recordStatus": "revised"}, {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b2", "@type": "ri:RecordInfo", "ri:languageOfCataloging": {"@id": "http://id.loc.gov/vocabulary/iso639-2/eng"}, "ri:recordChangeDate": {"@type": "xsd:dateTime", "@value": "2007-11-29T00:00:00"}, "ri:recordContentSource": {"@id": "http://id.loc.gov/vocabulary/organizations/uk"}, "ri:recordStatus": "new"}, {"@id": 
"_:n3ff6e8335a9b4669b3616f19a829d540b3", "@type": "madsrdf:GeographicElement", "madsrdf:elementValue": {"@language": "en", "@value": "Afsluitdijk (Netherlands)"}}, {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b5", "@type": "madsrdf:Source", "madsrdf:citationNote": {"@language": "en", "@value": "(The Afsluitdijk (English: Closure Dike) is a major dike in the Netherlands, constructed between 1927 and 1933 and running from Den Oever on Wieringen in North Holland province, to the village of Zurich (mun. Wunseradiel) in Friesland province, over a length of 32 km (20 miles))"}, "madsrdf:citationSource": "Wikipedia, 1 Nov. 2007", "madsrdf:citationStatus": "found"}, {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b6", "@type": "madsrdf:Source", "madsrdf:citationNote": {"@language": "en", "@value": "p. 4 of cover (The Afsluitdijk is an exceptional feat of civil engineering ... the Waddenzee on one side and the IJsselmeer on the other ... a dike through the sea)"}, "madsrdf:citationSource": "Work cat.: Brugman, G. Recht door zee, 2007:", "madsrdf:citationStatus": "found"}, {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b7", "@type": ["madsrdf:Geographic", "madsrdf:Variant"], "madsrdf:elementList": {"@list": [{"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b8"}]}, "madsrdf:variantLabel": {"@language": "en", "@value": "Closure Dike (Netherlands)"}}, {"@id": "_:n3ff6e8335a9b4669b3616f19a829d540b8", "@type": "madsrdf:GeographicElement", "madsrdf:elementValue": {"@language": "en", "@value": "Closure Dike (Netherlands)"}}], "@id": "/authorities/subjects/sh2007009587"} +{"@context": "http://v3/authorities/subjects/context.json", "@graph": [{"@id": "http://id.loc.gov/authorities/subjects/sh2007009587-781", "@type": ["madsrdf:Authority", "madsrdf:HierarchicalGeographic"], "bflc:marcKey": "181 $zNetherlands$zAfsluitdijk", "madsrdf:adminMetadata": [{"@id": "_:n7a5ea952dd65477390c330b6650c8857b1"}, {"@id": "_:n7a5ea952dd65477390c330b6650c8857b2"}], "madsrdf:authoritativeLabel": {"@language": "en", "@value": "Netherlands--Afsluitdijk"}, "madsrdf:componentList": {"@list": [{"@id": "_:n7a5ea952dd65477390c330b6650c8857b3"}, {"@id": "_:n7a5ea952dd65477390c330b6650c8857b6"}]}, "madsrdf:editorialNote": "[Resource automatically generated from LCCN sh2007009587]", "madsrdf:hasSource": {"@id": "_:n7a5ea952dd65477390c330b6650c8857b11"}, "madsrdf:isMemberOfMADSCollection": [{"@id": "http://id.loc.gov/authorities/subjects/collection_GeographicSubdivisions"}, {"@id": "http://id.loc.gov/authorities/subjects/collection_LCSH_General"}, {"@id": "http://id.loc.gov/authorities/subjects/collection_Subdivisions"}], "madsrdf:isMemberOfMADSScheme": {"@id": "http://id.loc.gov/authorities/subjects"}, "owl:sameAs": [{"@id": "http://id.loc.gov/authorities/sh2007009587-781#concept"}, {"@id": "info:lc/authorities/sh2007009587-781"}]}, {"@id": "_:n7a5ea952dd65477390c330b6650c8857b1", "@type": "ri:RecordInfo", "ri:languageOfCataloging": {"@id": "http://id.loc.gov/vocabulary/iso639-2/eng"}, "ri:recordChangeDate": {"@type": "xsd:dateTime", "@value": "2007-11-29T00:00:00"}, "ri:recordContentSource": {"@id": "http://id.loc.gov/vocabulary/organizations/uk"}, "ri:recordStatus": "new"}, {"@id": "_:n7a5ea952dd65477390c330b6650c8857b2", "@type": "ri:RecordInfo", "ri:languageOfCataloging": {"@id": "http://id.loc.gov/vocabulary/iso639-2/eng"}, "ri:recordChangeDate": {"@type": "xsd:dateTime", "@value": "2022-05-06T17:36:07"}, "ri:recordContentSource": {"@id": "http://id.loc.gov/vocabulary/organizations/dlc"}, "ri:recordStatus": "revised"}, {"@id": 
"_:n7a5ea952dd65477390c330b6650c8857b3", "@type": ["madsrdf:Authority", "madsrdf:Geographic"], "madsrdf:authoritativeLabel": {"@language": "en", "@value": "Netherlands"}, "madsrdf:elementList": {"@list": [{"@id": "_:n7a5ea952dd65477390c330b6650c8857b4"}]}}, {"@id": "_:n7a5ea952dd65477390c330b6650c8857b4", "@type": "madsrdf:GeographicElement", "madsrdf:elementValue": {"@language": "en", "@value": "Netherlands"}}, {"@id": "_:n7a5ea952dd65477390c330b6650c8857b6", "@type": ["madsrdf:Authority", "madsrdf:Geographic"], "madsrdf:authoritativeLabel": {"@language": "en", "@value": "Afsluitdijk"}, "madsrdf:elementList": {"@list": [{"@id": "_:n7a5ea952dd65477390c330b6650c8857b7"}]}}, {"@id": "_:n7a5ea952dd65477390c330b6650c8857b7", "@type": "madsrdf:GeographicElement", "madsrdf:elementValue": {"@language": "en", "@value": "Afsluitdijk"}}, {"@id": "_:n7a5ea952dd65477390c330b6650c8857b11", "@type": "madsrdf:Source", "madsrdf:citationSource": [{"@id": "http://id.loc.gov/authorities/names/sh2007009587"}, "Source: "], "madsrdf:citationStatus": "found"}], "@id": "/authorities/subjects/sh2007009587-781"} diff --git a/tests/fixtures/sh2010105253.json b/tests/fixtures/mads_composite_concept.json similarity index 100% rename from tests/fixtures/sh2010105253.json rename to tests/fixtures/mads_composite_concept.json diff --git a/tests/fixtures/mads_deprecated_concept.json b/tests/fixtures/mads_deprecated_concept.json new file mode 100644 index 0000000000..00e090b75a --- /dev/null +++ b/tests/fixtures/mads_deprecated_concept.json @@ -0,0 +1,83 @@ +{ + "@context": "http://v3/authorities/subjects/context.json", + "@graph": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh2001009772", + "@type": [ + "madsrdf:DeprecatedAuthority", + "madsrdf:Topic", + "madsrdf:Variant" + ], + "madsrdf:adminMetadata": [ + { + "@id": "_:nfcf2ac8a60404d37aaa15836972ce73fb1" + }, + { + "@id": "_:nfcf2ac8a60404d37aaa15836972ce73fb2" + } + ], + "madsrdf:deletionNote": { + "@language": "en", + "@value": "This authority record has been deleted because he subject heading is covered by the name heading {Gummidge, Worzel (Fictitious character)} (DLC)nb2020011266" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:nfcf2ac8a60404d37aaa15836972ce73fb3" + } + ] + }, + "madsrdf:variantLabel": { + "@language": "en", + "@value": "Worzel Gummidge (Fictitious character)" + }, + "owl:sameAs": [ + { + "@id": "http://id.loc.gov/authorities/sh2001009772#concept" + }, + { + "@id": "info:lc/authorities/sh2001009772" + } + ] + }, + { + "@id": "_:nfcf2ac8a60404d37aaa15836972ce73fb1", + "@type": "ri:RecordInfo", + "ri:languageOfCataloging": { + "@id": "http://id.loc.gov/vocabulary/iso639-2/eng" + }, + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "2021-04-15T12:19:28" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/uk" + }, + "ri:recordStatus": "deprecated" + }, + { + "@id": "_:nfcf2ac8a60404d37aaa15836972ce73fb2", + "@type": "ri:RecordInfo", + "ri:languageOfCataloging": { + "@id": "http://id.loc.gov/vocabulary/iso639-2/eng" + }, + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "2001-12-06T00:00:00" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/uk" + }, + "ri:recordStatus": "new" + }, + { + "@id": "_:nfcf2ac8a60404d37aaa15836972ce73fb3", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Worzel Gummidge (Fictitious character)" + } + } + ], + "@id": "/authorities/subjects/sh2001009772" 
+} \ No newline at end of file diff --git a/tests/fixtures/mads_geographic_concept.json b/tests/fixtures/mads_geographic_concept.json new file mode 100644 index 0000000000..e5fe5aa30e --- /dev/null +++ b/tests/fixtures/mads_geographic_concept.json @@ -0,0 +1,354 @@ +{ + "@context": "http://v3/authorities/subjects/context.json", + "@graph": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh92006359", + "@type": [ + "madsrdf:Authority", + "madsrdf:ComplexSubject" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Office buildings--England" + }, + "madsrdf:componentList": { + "@list": [ + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b23" + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b26" + } + ] + } + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b23", + "@type": [ + "madsrdf:Authority", + "madsrdf:Topic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Office buildings" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b24" + } + ] + } + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b24", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Office buildings" + } + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b26", + "@type": [ + "madsrdf:Authority", + "madsrdf:Geographic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "England" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b27" + } + ] + } + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b27", + "@type": "madsrdf:GeographicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "England" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh85053109", + "@type": [ + "madsrdf:Authority", + "madsrdf:ComplexSubject" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Gardens--England" + }, + "madsrdf:componentList": { + "@list": [ + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b15" + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b18" + } + ] + } + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b15", + "@type": [ + "madsrdf:Authority", + "madsrdf:Topic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Gardens" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b16" + } + ] + } + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b16", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Gardens" + } + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b18", + "@type": [ + "madsrdf:Authority", + "madsrdf:Geographic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "England" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b19" + } + ] + } + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b19", + "@type": "madsrdf:GeographicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "England" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh85040229", + "@type": [ + "madsrdf:Authority", + "madsrdf:ComplexSubject" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Dwellings--England" + }, + "madsrdf:componentList": { + "@list": [ + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b7" + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b10" + } + ] + } + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b7", + 
"@type": [ + "madsrdf:Authority", + "madsrdf:Topic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Dwellings" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b8" + } + ] + } + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b8", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Dwellings" + } + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b10", + "@type": [ + "madsrdf:Authority", + "madsrdf:Geographic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "England" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b11" + } + ] + } + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b11", + "@type": "madsrdf:GeographicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "England" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh2017003861", + "@type": [ + "madsrdf:Authority", + "madsrdf:Geographic" + ], + "bflc:marcKey": "151 $aCaversham Park (Reading, England)", + "identifiers:lccn": "sh2017003861", + "madsrdf:adminMetadata": [ + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b1" + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b2" + } + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Caversham Park (Reading, England)" + }, + "madsrdf:code": { + "@type": "http://id.loc.gov/datatypes/codes/gac", + "@value": "e-uk-en" + }, + "madsrdf:editorialNote": "[This heading is not valid for use as a geographic subdivision.]", + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b3" + } + ] + }, + "madsrdf:hasBroaderAuthority": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh85040229" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh85053109" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh92006359" + } + ], + "madsrdf:hasSource": [ + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b5" + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b6" + } + ], + "madsrdf:isMemberOfMADSCollection": [ + { + "@id": "http://id.loc.gov/authorities/subjects/collection_LCSHAuthorizedHeadings" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/collection_LCSH_General" + } + ], + "madsrdf:isMemberOfMADSScheme": { + "@id": "http://id.loc.gov/authorities/subjects" + }, + "owl:sameAs": [ + { + "@id": "http://id.loc.gov/authorities/sh2017003861#concept" + }, + { + "@id": "info:lc/authorities/sh2017003861" + } + ] + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b1", + "@type": "ri:RecordInfo", + "ri:languageOfCataloging": { + "@id": "http://id.loc.gov/vocabulary/iso639-2/eng" + }, + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "2017-10-13T11:41:51" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "ri:recordStatus": "revised" + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b2", + "@type": "ri:RecordInfo", + "ri:languageOfCataloging": { + "@id": "http://id.loc.gov/vocabulary/iso639-2/eng" + }, + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "2017-07-06T00:00:00" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/ukcu" + }, + "ri:recordStatus": "new" + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b3", + "@type": "madsrdf:GeographicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Caversham Park (Reading, England)" + } + }, + { + "@id": 
"_:ne7ebddd144ae4c6d9942e79e04d16e67b5", + "@type": "madsrdf:Source", + "madsrdf:citationNote": { + "@language": "en", + "@value": "title page (Caversham Park) page 3 (first mentioned in the Domesday Book and its ownership can be traced back to 1164, when it was held by the Earl of Pembroke) page 6 (garden is the work of Capability Brown) page 15 (the estate was sold and split up in 1921) page 16 (since 1943 Caversham Park has been the HQ of BBC Monitoring and home of Radio Berkshire)" + }, + "madsrdf:citationSource": "Work cat.: Viljoen, Ben. Lancelot Brown & the landscape of Caversham Park, 2016:", + "madsrdf:citationStatus": "found" + }, + { + "@id": "_:ne7ebddd144ae4c6d9942e79e04d16e67b6", + "@type": "madsrdf:Source", + "madsrdf:citationNote": { + "@language": "en", + "@value": "(Victorian stately home with parkland in the suburb of Caversham, on the outskirts of Reading; goes back to Norman times, a park and fortified manor house or castle which no longer exists; the current building inspired by Italian baroque palaces, was erected after a fire in 1850 by architect Horace Jones and is now home to the BBC's monitoring service)" + }, + "madsrdf:citationSource": "Wikipedia (English), June 6, 2017", + "madsrdf:citationStatus": "found" + } + ], + "@id": "/authorities/subjects/sh2017003861" +} diff --git a/tests/fixtures/mads_variant_concept.json b/tests/fixtures/mads_variant_concept.json new file mode 100644 index 0000000000..dc61f6af0e --- /dev/null +++ b/tests/fixtures/mads_variant_concept.json @@ -0,0 +1,287 @@ +{ + "@context": "http://v3/authorities/subjects/context.json", + "@graph": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh85074044", + "@type": [ + "madsrdf:Authority", + "madsrdf:ComplexSubject" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Lakes--Argentina" + }, + "madsrdf:componentList": { + "@list": [ + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b15" + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b18" + } + ] + } + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b15", + "@type": [ + "madsrdf:Authority", + "madsrdf:Topic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Lakes" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b16" + } + ] + } + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b16", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Lakes" + } + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b18", + "@type": [ + "madsrdf:Authority", + "madsrdf:Geographic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Argentina" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b19" + } + ] + } + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b19", + "@type": "madsrdf:GeographicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Argentina" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh00000023", + "@type": [ + "madsrdf:Authority", + "madsrdf:Geographic" + ], + "bflc:marcKey": "151 $aChanchera Lake (Argentina)", + "identifiers:lccn": "sh 00000023", + "madsrdf:adminMetadata": [ + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b1" + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b2" + } + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Chanchera Lake (Argentina)" + }, + "madsrdf:code": { + "@type": "http://id.loc.gov/datatypes/codes/gac", + "@value": "s-ag---" + }, + 
"madsrdf:elementList": { + "@list": [ + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b3" + } + ] + }, + "madsrdf:hasBroaderAuthority": { + "@id": "http://id.loc.gov/authorities/subjects/sh85074044" + }, + "madsrdf:hasSource": [ + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b5" + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b6" + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b7" + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b8" + } + ], + "madsrdf:hasVariant": [ + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b9" + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b12" + } + ], + "madsrdf:isMemberOfMADSCollection": [ + { + "@id": "http://id.loc.gov/authorities/subjects/collection_LCSHAuthorizedHeadings" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/collection_LCSH_General" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/collection_PatternHeadingH1145.5" + } + ], + "madsrdf:isMemberOfMADSScheme": { + "@id": "http://id.loc.gov/authorities/subjects" + }, + "madsrdf:usePatternCollection": { + "@id": "http://id.loc.gov/authorities/subjects/collection_PatternHeadingH1145.5" + }, + "owl:sameAs": [ + { + "@id": "http://id.loc.gov/authorities/sh00000023#concept" + }, + { + "@id": "info:lc/authorities/sh00000023" + } + ] + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b1", + "@type": "ri:RecordInfo", + "ri:languageOfCataloging": { + "@id": "http://id.loc.gov/vocabulary/iso639-2/eng" + }, + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "2000-03-22T00:00:00" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "ri:recordStatus": "new" + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b2", + "@type": "ri:RecordInfo", + "ri:languageOfCataloging": { + "@id": "http://id.loc.gov/vocabulary/iso639-2/eng" + }, + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "2024-09-25T14:01:14" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "ri:recordStatus": "revised" + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b3", + "@type": "madsrdf:GeographicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Chanchera Lake (Argentina)" + } + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b5", + "@type": "madsrdf:Source", + "madsrdf:citationNote": { + "@language": "en", + "@value": "p. 9, etc. (laguna La Chanchera; C\u00f3rdoba Province, Argentina)" + }, + "madsrdf:citationSource": "Work cat.: 99526422: La Chanchera, 1999:", + "madsrdf:citationStatus": "found" + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b6", + "@type": "madsrdf:Source", + "madsrdf:citationNote": { + "@language": "en", + "@value": "lake; 33\u00b049\u02b960\u02baS 063\u00b025\u02b960\u02baW" + }, + "madsrdf:citationSource": "GeoNames [algorithmically matched]", + "madsrdf:citationStatus": "found" + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b7", + "@type": "madsrdf:Source", + "madsrdf:citationSource": "Lippincott;Web. geog.;Argentina gaz.;Nat. Geog. atlas;Times atlas;Rand McNally new internat. 
atlas", + "madsrdf:citationStatus": "notfound" + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b8", + "@type": "madsrdf:Source", + "madsrdf:citationNote": { + "@language": "en", + "@value": "(La Chanchera, Laguna)" + }, + "madsrdf:citationSource": "GEOnet, March 22, 2000", + "madsrdf:citationStatus": "found" + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b9", + "@type": [ + "madsrdf:Geographic", + "madsrdf:Variant" + ], + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b10" + } + ] + }, + "madsrdf:variantLabel": { + "@language": "en", + "@value": "La Chancera, Laguna (Argentina)" + } + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b10", + "@type": "madsrdf:GeographicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "La Chancera, Laguna (Argentina)" + } + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b12", + "@type": [ + "madsrdf:Geographic", + "madsrdf:Variant" + ], + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b13" + } + ] + }, + "madsrdf:variantLabel": { + "@language": "en", + "@value": "Laguna La Chanchera (Argentina)" + } + }, + { + "@id": "_:nf0e643dd3ad44ac48213787159c80818b13", + "@type": "madsrdf:GeographicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Laguna La Chanchera (Argentina)" + } + } + ], + "@id": "/authorities/subjects/sh00000023" +} diff --git a/tests/fixtures/skos_deprecated_concept.json b/tests/fixtures/skos_deprecated_concept.json new file mode 100644 index 0000000000..ab74a36a96 --- /dev/null +++ b/tests/fixtures/skos_deprecated_concept.json @@ -0,0 +1,56 @@ +{ + "@context": "http://v3/authorities/subjects/context.json", + "@graph": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh2001009772", + "@type": "skosxl:Label", + "skos:changeNote": [ + { + "@id": "_:ncbe2fbbca4484db9968d00b56091d840b1" + }, + { + "@id": "_:ncbe2fbbca4484db9968d00b56091d840b2" + }, + { + "@language": "en", + "@value": "This authority record has been deleted because he subject heading is covered by the name heading {Gummidge, Worzel (Fictitious character)} (DLC)nb2020011266" + } + ], + "skosxl:literalForm": { + "@language": "en", + "@value": "Worzel Gummidge (Fictitious character)" + } + }, + { + "@id": "_:ncbe2fbbca4484db9968d00b56091d840b1", + "@type": "cs:ChangeSet", + "cs:changeReason": "new", + "cs:createdDate": { + "@type": "xsd:dateTime", + "@value": "2001-12-06T00:00:00" + }, + "cs:creatorName": { + "@id": "http://id.loc.gov/vocabulary/organizations/uk" + }, + "cs:subjectOfChange": { + "@id": "http://id.loc.gov/authorities/subjects/sh2001009772" + } + }, + { + "@id": "_:ncbe2fbbca4484db9968d00b56091d840b2", + "@type": "cs:ChangeSet", + "cs:changeReason": "deprecated", + "cs:createdDate": { + "@type": "xsd:dateTime", + "@value": "2021-04-15T12:19:28" + }, + "cs:creatorName": { + "@id": "http://id.loc.gov/vocabulary/organizations/uk" + }, + "cs:subjectOfChange": { + "@id": "http://id.loc.gov/authorities/subjects/sh2001009772" + } + } + ], + "@id": "/authorities/subjects/sh2001009772" +} \ No newline at end of file diff --git a/tests/transformers/loc/mads/test_raw_mads_concept.py b/tests/transformers/loc/mads/test_raw_mads_concept.py index b89475a45d..10f958466e 100644 --- a/tests/transformers/loc/mads/test_raw_mads_concept.py +++ b/tests/transformers/loc/mads/test_raw_mads_concept.py @@ -3,7 +3,7 @@ from test_utils import load_fixture from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept -sh2010105253 = 
json.loads(load_fixture("sh2010105253.json")) +sh2010105253 = json.loads(load_fixture("mads_composite_concept.json")) def test_source_id() -> None: From 1cb69e500d8cdc0ee76a899bb0a2c8ce830fdeff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 30 Jan 2025 16:37:37 +0000 Subject: [PATCH 201/310] Integrate ECS extractor into catalogue graph pipeline #5889 --- terraform/iam_state_machines.tf | 13 ++- terraform/security_groups.tf | 19 ++++ terraform/state_machine_extractor.tf | 51 +++++++++++ terraform/state_machine_extractors.tf | 46 +++++----- .../state_machine_single_extractor_loader.tf | 88 ++----------------- 5 files changed, 111 insertions(+), 106 deletions(-) create mode 100644 terraform/security_groups.tf create mode 100644 terraform/state_machine_extractor.tf diff --git a/terraform/iam_state_machines.tf b/terraform/iam_state_machines.tf index 6afd3f4e06..08152c10eb 100644 --- a/terraform/iam_state_machines.tf +++ b/terraform/iam_state_machines.tf @@ -28,12 +28,21 @@ resource "aws_iam_policy" "state_machine_policy" { { Effect = "Allow", Action = ["states:StartExecution"], - Resource = [aws_sfn_state_machine.catalogue_graph_bulk_loader.arn] + Resource = [ + aws_sfn_state_machine.catalogue_graph_extractor.arn, + aws_sfn_state_machine.catalogue_graph_extractors.arn, + aws_sfn_state_machine.catalogue_graph_bulk_loader.arn, + aws_sfn_state_machine.catalogue_graph_bulk_loaders.arn + ] }, { Effect = "Allow", Action = ["lambda:InvokeFunction"], - Resource = "*" + Resource = [ + module.extractor_lambda.lambda.arn, + module.bulk_loader_lambda.lambda.arn, + module.bulk_load_poller_lambda.lambda.arn + ] }, { Effect = "Allow", diff --git a/terraform/security_groups.tf b/terraform/security_groups.tf new file mode 100644 index 0000000000..7a9ff68b39 --- /dev/null +++ b/terraform/security_groups.tf @@ -0,0 +1,19 @@ +resource "aws_security_group" "egress" { + name = "${local.namespace}_egress" + description = "Allow egress traffic from the services" + vpc_id = data.aws_vpc.vpc.id + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + + cidr_blocks = [ + "0.0.0.0/0", + ] + } + + tags = { + Name = "${local.namespace}_egress" + } +} diff --git a/terraform/state_machine_extractor.tf b/terraform/state_machine_extractor.tf new file mode 100644 index 0000000000..1ac98b1370 --- /dev/null +++ b/terraform/state_machine_extractor.tf @@ -0,0 +1,51 @@ +resource "aws_sfn_state_machine" "catalogue_graph_extractor" { + name = "catalogue-graph-extractor" + role_arn = aws_iam_role.state_machine_execution_role.arn + + definition = jsonencode({ + Comment = "Run a single catalogue graph pipeline extractor task." 
+ QueryLanguage = "JSONata" + StartAt = "Extract" + States = { + Extract = { + Type = "Task" + Resource = "arn:aws:states:::ecs:runTask.sync" + Output = "{% $states.input %}" + Next = "Success" + Arguments = { + Cluster = aws_ecs_cluster.cluster.arn + TaskDefinition = module.extractor_ecs_task.task_definition_arn + LaunchType = "FARGATE" + NetworkConfiguration = { + AwsvpcConfiguration = { + AssignPublicIp = "DISABLED" + Subnets = local.private_subnets + SecurityGroups = [ + local.ec_privatelink_security_group_id, + aws_security_group.egress.id + ] + } + }, + Overrides = { + ContainerOverrides = [ + { + Name = "catalogue-graph_extractor" + Command = [ + "--transformer-type", + "{% $states.input.transformer_type %}", + "--entity-type", + "{% $states.input.entity_type %}", + "--stream-destination", + "{% $states.input.stream_destination %}" + ] + } + ] + } + } + }, + Success = { + Type = "Succeed" + } + }, + }) +} diff --git a/terraform/state_machine_extractors.tf b/terraform/state_machine_extractors.tf index 178669eadd..c035bf6f49 100644 --- a/terraform/state_machine_extractors.tf +++ b/terraform/state_machine_extractors.tf @@ -4,32 +4,28 @@ resource "aws_sfn_state_machine" "catalogue_graph_extractors" { definition = jsonencode({ Comment = "Extract raw concepts from all sources, transform them into nodes and edges, and stream them into an S3 bucket." - StartAt = "Trigger extractors" - States = { - "Trigger extractors" = { - Type = "Parallel" - Branches = flatten([ - for index, task_input in var.state_machine_inputs : { - StartAt = "Extract ${task_input.label}" - States = { - "Extract ${task_input.label}" = { - Type = "Task" - Resource = module.extractor_lambda.lambda.arn - Parameters = { - "transformer_type" = task_input.transformer_type, - "entity_type" = task_input.entity_type, - "stream_destination" = "s3" - } - End = true - } - } + StartAt = "Extract ${var.state_machine_inputs[0].label}" + + States = merge(tomap({ + for index, task_input in var.state_machine_inputs : + "Extract ${task_input.label}" => { + Type = "Task" + Resource = "arn:aws:states:::states:startExecution.sync:2", + Parameters = { + StateMachineArn = aws_sfn_state_machine.catalogue_graph_extractor.arn + Input = { + "stream_destination": "s3", + "transformer_type.$": "$$.Execution.Input.transformer_type", + "entity_type.$": "$$.Execution.Input.entity_type", + "sample_size.$": "$$.Execution.Input.sample_size" } - ]) - Next = "Success" - }, - "Success" : { - "Type" : "Succeed" + } + Next = index == length(var.state_machine_inputs) - 1 ? 
"Success" : "Extract ${var.state_machine_inputs[index + 1].label}" + } + }), { + Success = { + Type = "Succeed" } - } + }) }) } diff --git a/terraform/state_machine_single_extractor_loader.tf b/terraform/state_machine_single_extractor_loader.tf index 06ea9c8120..26d6757e54 100644 --- a/terraform/state_machine_single_extractor_loader.tf +++ b/terraform/state_machine_single_extractor_loader.tf @@ -8,13 +8,16 @@ resource "aws_sfn_state_machine" "catalogue_graph_single_extract_load" { States = { "Extract" = { Type = "Task" - Resource = module.extractor_lambda.lambda.arn + Resource = "arn:aws:states:::states:startExecution.sync:2", Next = "Load" - "Parameters" : { - "stream_destination" : "s3", - "transformer_type.$" : "$$.Execution.Input.transformer_type", - "entity_type.$" : "$$.Execution.Input.entity_type", - "sample_size.$" : "$$.Execution.Input.sample_size" + Parameters = { + StateMachineArn = aws_sfn_state_machine.catalogue_graph_extractor.arn + Input = { + "stream_destination": "s3", + "transformer_type.$": "$$.Execution.Input.transformer_type", + "entity_type.$": "$$.Execution.Input.entity_type", + "sample_size.$": "$$.Execution.Input.sample_size" + } } } "Load" = { @@ -32,76 +35,3 @@ resource "aws_sfn_state_machine" "catalogue_graph_single_extract_load" { }, }) } -resource "aws_sfn_state_machine" "catalogue_graph_single_extract_load_ecs" { - name = "catalogue-graph-single-extract-load-ecs" - role_arn = aws_iam_role.state_machine_execution_role.arn - - definition = jsonencode({ - Comment = "Extract nodes/edges from a single source and load them into the catalogue graph using an ECS task." - QueryLanguage = "JSONata" - StartAt = "Extract" - States = { - Extract = { - Type = "Task" - Resource = "arn:aws:states:::ecs:runTask.sync" - Output = "{% $states.input %}" - Next = "ShouldRunLoad" - Arguments = { - Cluster = aws_ecs_cluster.cluster.arn - TaskDefinition = module.extractor_ecs_task.task_definition_arn - LaunchType = "FARGATE" - NetworkConfiguration = { - AwsvpcConfiguration = { - AssignPublicIp = "DISABLED" - Subnets = local.private_subnets - SecurityGroups = [ - local.ec_privatelink_security_group_id, - aws_security_group.egress.id - ] - } - }, - Overrides = { - ContainerOverrides = [ - { - Name = "catalogue-graph_extractor" - Command = [ - "--transformer-type", - "{% $states.input.transformer_type %}", - "--entity-type", - "{% $states.input.entity_type %}", - "--stream-destination", - "{% $states.input.stream_destination %}" - ] - } - ] - } - } - }, - ShouldRunLoad = { - Type = "Choice" - Output = "{% $states.input %}" - Choices = [ - { - # This is how you do null coalescing in JSONata - # https://github.com/jsonata-js/jsonata/issues/370#issuecomment-556995173 - Condition = "{% [$states.input.run_bulk_load, false][0] %}", - Next = "Load" - } - ] - Default = "Success" - }, - Load = { - Type = "Task" - Resource = "arn:aws:states:::states:startExecution.sync:2" - Arguments = { - StateMachineArn = aws_sfn_state_machine.catalogue_graph_bulk_loader.arn - Input = "{% $states.input %}" - } - Next = "Success" - }, - Success = { - Type = "Succeed" - } - }, - }) -} From 7c57ebfa8cad7a20054fe59046f6707b876b9654 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Fri, 31 Jan 2025 09:33:11 +0000 Subject: [PATCH 202/310] add exclusion tests --- tests/transformers/loc/test_common.py | 171 ++++++++++++++++---------- 1 file changed, 103 insertions(+), 68 deletions(-) diff --git a/tests/transformers/loc/test_common.py b/tests/transformers/loc/test_common.py index be7aa87af4..25498c8035 100644 --- 
a/tests/transformers/loc/test_common.py +++ b/tests/transformers/loc/test_common.py @@ -1,74 +1,109 @@ import pytest from transformers.loc.common import remove_id_prefix, RawLibraryOfCongressConcept +from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept +from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept -def test_remove_prefix_noop() -> None: - """ - If there is no prefix to remove, remove_id_prefix will do nothing - """ - assert remove_id_prefix("sh1234567890") == "sh1234567890" - - -def test_remove_prefix_fully_qualified() -> None: - """ - remove_id_prefix removes fully-qualified URL-style prefixes - """ - assert ( - remove_id_prefix("http://id.loc.gov/authorities/subjects/sh1234567890") - == "sh1234567890" - ) - assert ( - remove_id_prefix("http://id.loc.gov/authorities/names/sh0987654321") - == "sh0987654321" - ) - - -def test_remove_prefix_relative() -> None: - """ - remove_id_prefix removes relative/local prefixes - """ - assert remove_id_prefix("/authorities/subjects/sh1234567890") == "sh1234567890" - assert remove_id_prefix("/authorities/names/sh0987654321") == "sh0987654321" - - -def test_remove_prefix_lookalikes() -> None: - """ - remove_id_prefix only removes specific known prefixes, - not just things that look a bit like them - """ - assert ( - remove_id_prefix("/authorities/banana/sh1234567890") - == "/authorities/banana/sh1234567890" - ) - assert ( - remove_id_prefix("https://id.loc.gov.uk/authorities/subjects/sh1234567890") - == "https://id.loc.gov.uk/authorities/subjects/sh1234567890" - ) - - -def test_source_subjects() -> None: - """ - Given an id with the prefix /authorities/subjects/, the source will be lc-subjects - """ - concept = RawLibraryOfCongressConcept( - {"@id": "/authorities/subjects/sh2010105253"} - ) - assert concept.source == "lc-subjects" - - -def test_source_names() -> None: - """ - Given an id with the prefix /authorities/subjects/, the source will be lc-subjects - """ - concept = RawLibraryOfCongressConcept( - {"@id": "/authorities/names/sh2010105253"} - ) - assert concept.source == "lc-names" - - -def test_source_invalid() -> None: - with (pytest.raises(ValueError)): + +class TestRemovePrefix: + def test_remove_prefix_noop(self) -> None: + """ + If there is no prefix to remove, remove_id_prefix will do nothing + """ + assert remove_id_prefix("sh1234567890") == "sh1234567890" + + + def test_remove_prefix_fully_qualified(self) -> None: + """ + remove_id_prefix removes fully-qualified URL-style prefixes + """ + assert ( + remove_id_prefix("http://id.loc.gov/authorities/subjects/sh1234567890") + == "sh1234567890" + ) + assert ( + remove_id_prefix("http://id.loc.gov/authorities/names/sh0987654321") + == "sh0987654321" + ) + + + def test_remove_prefix_relative(self) -> None: + """ + remove_id_prefix removes relative/local prefixes + """ + assert remove_id_prefix("/authorities/subjects/sh1234567890") == "sh1234567890" + assert remove_id_prefix("/authorities/names/sh0987654321") == "sh0987654321" + + + def test_remove_prefix_lookalikes(self) -> None: + """ + remove_id_prefix only removes specific known prefixes, + not just things that look a bit like them + """ + assert ( + remove_id_prefix("/authorities/banana/sh1234567890") + == "/authorities/banana/sh1234567890" + ) + assert ( + remove_id_prefix("https://id.loc.gov.uk/authorities/subjects/sh1234567890") + == "https://id.loc.gov.uk/authorities/subjects/sh1234567890" + ) + +class TestSource: + def test_source_subjects(self) -> None: + """ + Given an id with 
the prefix /authorities/subjects/, the source will be lc-subjects + """ concept = RawLibraryOfCongressConcept( - {"@id": "authorities/childrensSubjects/sj2021051581"} + {"@id": "/authorities/subjects/sh2010105253"} ) - concept.source + assert concept.source == "lc-subjects" + + + def test_source_names(self) -> None: + """ + Given an id with the prefix /authorities/subjects/, the source will be lc-subjects + """ + concept = RawLibraryOfCongressConcept( + {"@id": "/authorities/names/sh2010105253"} + ) + assert concept.source == "lc-names" + + + def test_source_invalid(self) -> None: + with (pytest.raises(ValueError)): + concept = RawLibraryOfCongressConcept( + {"@id": "authorities/childrensSubjects/sj2021051581"} + ) + concept.source + + +@pytest.mark.parametrize("sut_class", [RawLibraryOfCongressSKOSConcept, RawLibraryOfCongressMADSConcept]) +class TestExclusion: + def test_do_not_exclude(self, sut_class): + """ + A record with a corresponding node in its internal graph, and which is not a duplicate, + should be included in the output + """ + concept = sut_class( {"@id": "authorities/names/sh2010105253", "@graph":[]}) + # The SUT at this point doesn't actually care what the node is, just that it exists + concept._raw_concept_node = "Anything" + assert concept.exclude() == False + + def test_exclude_no_node(self, sut_class): + """ + If a record does not contain a corresponding node in its internal graph + then it should be excluded + """ + concept = sut_class( {"@id": "authorities/names/sh2010105253", "@graph":[]}) + assert concept.exclude() == True + + def test_exclude_marked_duplicates(self, sut_class): + """ + If a record's identifier is suffixed with -781, this marks the entry as a duplicate + which is to be excluded + """ + concept = sut_class( {"@id": "authorities/names/sh2010105253-781", "@graph":[]}) + concept._raw_concept_node = "Anything" + assert concept.exclude() == True + From ad33843b4a5007557d029ef966b538da40b2a2b2 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Fri, 31 Jan 2025 09:37:46 +0000 Subject: [PATCH 203/310] add exclusion tests --- src/transformers/loc/common.py | 2 +- tests/transformers/loc/test_common.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/loc/common.py b/src/transformers/loc/common.py index 86e1c34c40..26f26a7bb0 100644 --- a/src/transformers/loc/common.py +++ b/src/transformers/loc/common.py @@ -19,7 +19,7 @@ def __init__(self, raw_concept: dict): self.raw_concept = raw_concept self._raw_concept_node = self._extract_concept_node() - def _extract_concept_node(self): + def _extract_concept_node(self) -> dict | None: pass @property diff --git a/tests/transformers/loc/test_common.py b/tests/transformers/loc/test_common.py index 25498c8035..89c6812253 100644 --- a/tests/transformers/loc/test_common.py +++ b/tests/transformers/loc/test_common.py @@ -1,5 +1,5 @@ import pytest - +from typing import Type from transformers.loc.common import remove_id_prefix, RawLibraryOfCongressConcept from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept @@ -80,17 +80,17 @@ def test_source_invalid(self) -> None: @pytest.mark.parametrize("sut_class", [RawLibraryOfCongressSKOSConcept, RawLibraryOfCongressMADSConcept]) class TestExclusion: - def test_do_not_exclude(self, sut_class): + def test_do_not_exclude(self, sut_class:Type[RawLibraryOfCongressConcept])->None: """ A record with a corresponding node in its internal graph, and 
which is not a duplicate, should be included in the output """ concept = sut_class( {"@id": "authorities/names/sh2010105253", "@graph":[]}) # The SUT at this point doesn't actually care what the node is, just that it exists - concept._raw_concept_node = "Anything" + concept._raw_concept_node = {} assert concept.exclude() == False - def test_exclude_no_node(self, sut_class): + def test_exclude_no_node(self, sut_class:Type[RawLibraryOfCongressConcept])->None: """ If a record does not contain a corresponding node in its internal graph then it should be excluded @@ -98,12 +98,12 @@ def test_exclude_no_node(self, sut_class): concept = sut_class( {"@id": "authorities/names/sh2010105253", "@graph":[]}) assert concept.exclude() == True - def test_exclude_marked_duplicates(self, sut_class): + def test_exclude_marked_duplicates(self, sut_class:Type[RawLibraryOfCongressConcept]) -> None: """ If a record's identifier is suffixed with -781, this marks the entry as a duplicate which is to be excluded """ concept = sut_class( {"@id": "authorities/names/sh2010105253-781", "@graph":[]}) - concept._raw_concept_node = "Anything" + concept._raw_concept_node = {} assert concept.exclude() == True From 5ce58c0850dac902f2f9e4ede82e1a47c750f00c Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Fri, 31 Jan 2025 09:39:02 +0000 Subject: [PATCH 204/310] add exclusion tests --- tests/transformers/loc/test_common.py | 29 ++++++++++++++------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/tests/transformers/loc/test_common.py b/tests/transformers/loc/test_common.py index 89c6812253..37eadcc737 100644 --- a/tests/transformers/loc/test_common.py +++ b/tests/transformers/loc/test_common.py @@ -12,7 +12,6 @@ def test_remove_prefix_noop(self) -> None: """ assert remove_id_prefix("sh1234567890") == "sh1234567890" - def test_remove_prefix_fully_qualified(self) -> None: """ remove_id_prefix removes fully-qualified URL-style prefixes @@ -26,7 +25,6 @@ def test_remove_prefix_fully_qualified(self) -> None: == "sh0987654321" ) - def test_remove_prefix_relative(self) -> None: """ remove_id_prefix removes relative/local prefixes @@ -34,7 +32,6 @@ def test_remove_prefix_relative(self) -> None: assert remove_id_prefix("/authorities/subjects/sh1234567890") == "sh1234567890" assert remove_id_prefix("/authorities/names/sh0987654321") == "sh0987654321" - def test_remove_prefix_lookalikes(self) -> None: """ remove_id_prefix only removes specific known prefixes, @@ -49,6 +46,7 @@ def test_remove_prefix_lookalikes(self) -> None: == "https://id.loc.gov.uk/authorities/subjects/sh1234567890" ) + class TestSource: def test_source_subjects(self) -> None: """ @@ -59,7 +57,6 @@ def test_source_subjects(self) -> None: ) assert concept.source == "lc-subjects" - def test_source_names(self) -> None: """ Given an id with the prefix /authorities/subjects/, the source will be lc-subjects @@ -69,41 +66,45 @@ def test_source_names(self) -> None: ) assert concept.source == "lc-names" - def test_source_invalid(self) -> None: - with (pytest.raises(ValueError)): + with pytest.raises(ValueError): concept = RawLibraryOfCongressConcept( {"@id": "authorities/childrensSubjects/sj2021051581"} ) concept.source -@pytest.mark.parametrize("sut_class", [RawLibraryOfCongressSKOSConcept, RawLibraryOfCongressMADSConcept]) +@pytest.mark.parametrize( + "sut_class", [RawLibraryOfCongressSKOSConcept, RawLibraryOfCongressMADSConcept] +) class TestExclusion: - def test_do_not_exclude(self, sut_class:Type[RawLibraryOfCongressConcept])->None: + def 
test_do_not_exclude(self, sut_class: Type[RawLibraryOfCongressConcept]) -> None: """ A record with a corresponding node in its internal graph, and which is not a duplicate, should be included in the output """ - concept = sut_class( {"@id": "authorities/names/sh2010105253", "@graph":[]}) + concept = sut_class({"@id": "authorities/names/sh2010105253", "@graph": []}) # The SUT at this point doesn't actually care what the node is, just that it exists concept._raw_concept_node = {} assert concept.exclude() == False - def test_exclude_no_node(self, sut_class:Type[RawLibraryOfCongressConcept])->None: + def test_exclude_no_node( + self, sut_class: Type[RawLibraryOfCongressConcept] + ) -> None: """ If a record does not contain a corresponding node in its internal graph then it should be excluded """ - concept = sut_class( {"@id": "authorities/names/sh2010105253", "@graph":[]}) + concept = sut_class({"@id": "authorities/names/sh2010105253", "@graph": []}) assert concept.exclude() == True - def test_exclude_marked_duplicates(self, sut_class:Type[RawLibraryOfCongressConcept]) -> None: + def test_exclude_marked_duplicates( + self, sut_class: Type[RawLibraryOfCongressConcept] + ) -> None: """ If a record's identifier is suffixed with -781, this marks the entry as a duplicate which is to be excluded """ - concept = sut_class( {"@id": "authorities/names/sh2010105253-781", "@graph":[]}) + concept = sut_class({"@id": "authorities/names/sh2010105253-781", "@graph": []}) concept._raw_concept_node = {} assert concept.exclude() == True - From 364d7e1c7a28aa489e16a5d6cde6aa2c059b33f4 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Fri, 31 Jan 2025 09:40:32 +0000 Subject: [PATCH 205/310] apply autoformat script to tests --- scripts/autoformat.sh | 4 ++-- tests/transformers/loc/mads/test_raw_mads_concept.py | 1 + tests/transformers/loc/test_common.py | 6 ++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/autoformat.sh b/scripts/autoformat.sh index d4e8b0e564..c4ddb62a01 100755 --- a/scripts/autoformat.sh +++ b/scripts/autoformat.sh @@ -17,6 +17,6 @@ if [ "$CHECK" == "--check" ]; then isort --profile=black --check src/ tests/ else echo "Formatting code ..." 
- black src/ - isort --profile=black src/ + black src/ tests/ + isort --profile=black src/ tests/ fi diff --git a/tests/transformers/loc/mads/test_raw_mads_concept.py b/tests/transformers/loc/mads/test_raw_mads_concept.py index 10f958466e..5dd71c74c6 100644 --- a/tests/transformers/loc/mads/test_raw_mads_concept.py +++ b/tests/transformers/loc/mads/test_raw_mads_concept.py @@ -1,6 +1,7 @@ import json from test_utils import load_fixture + from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept sh2010105253 = json.loads(load_fixture("mads_composite_concept.json")) diff --git a/tests/transformers/loc/test_common.py b/tests/transformers/loc/test_common.py index 37eadcc737..8c3545ed1f 100644 --- a/tests/transformers/loc/test_common.py +++ b/tests/transformers/loc/test_common.py @@ -1,6 +1,8 @@ -import pytest from typing import Type -from transformers.loc.common import remove_id_prefix, RawLibraryOfCongressConcept + +import pytest + +from transformers.loc.common import RawLibraryOfCongressConcept, remove_id_prefix from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept From 0c9484ae47cd746408338ebfe527eea7b84e6278 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Fri, 31 Jan 2025 09:41:23 +0000 Subject: [PATCH 206/310] apply autoformat to tests --- scripts/autoformat.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/autoformat.sh b/scripts/autoformat.sh index d4e8b0e564..c4ddb62a01 100755 --- a/scripts/autoformat.sh +++ b/scripts/autoformat.sh @@ -17,6 +17,6 @@ if [ "$CHECK" == "--check" ]; then isort --profile=black --check src/ tests/ else echo "Formatting code ..." - black src/ - isort --profile=black src/ + black src/ tests/ + isort --profile=black src/ tests/ fi From 3517c917894f9f7ff1d87bb38fe0bc3614a8f242 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Fri, 31 Jan 2025 10:02:38 +0000 Subject: [PATCH 207/310] harmonise common tests a bit better --- src/transformers/loc/skos/raw_concept.py | 2 +- .../loc/mads/test_raw_mads_concept.py | 21 ------ tests/transformers/loc/test_common.py | 70 ++++++++++++------- 3 files changed, 47 insertions(+), 46 deletions(-) diff --git a/src/transformers/loc/skos/raw_concept.py b/src/transformers/loc/skos/raw_concept.py index 13ce5b38fe..a56ee331dc 100644 --- a/src/transformers/loc/skos/raw_concept.py +++ b/src/transformers/loc/skos/raw_concept.py @@ -7,7 +7,7 @@ def __init__(self, raw_concept: dict): self._raw_concept_node = self._extract_concept_node() def _extract_concept_node(self) -> dict | None: - graph: list[dict] = self.raw_concept["@graph"] + graph: list[dict] = self.raw_concept.get("@graph", []) # Some LoC concepts (e.g. deprecated concepts) do not store a concept node in their graph. # When this happens, return `None` because there is no concept for us to extract. diff --git a/tests/transformers/loc/mads/test_raw_mads_concept.py b/tests/transformers/loc/mads/test_raw_mads_concept.py index 5dd71c74c6..9af350f195 100644 --- a/tests/transformers/loc/mads/test_raw_mads_concept.py +++ b/tests/transformers/loc/mads/test_raw_mads_concept.py @@ -5,24 +5,3 @@ from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept sh2010105253 = json.loads(load_fixture("mads_composite_concept.json")) - - -def test_source_id() -> None: - """ - source_id is derived from the @id property in the source data. 
- It is the unqualified version of the full id - """ - concept = RawLibraryOfCongressMADSConcept( - {"@id": "/authorities/subjects/sh2010105253"} - ) - assert concept.source_id == "sh2010105253" - - -def test_source() -> None: - """ - source is discovered by examining the prefix to the id. - """ - concept = RawLibraryOfCongressMADSConcept( - {"@id": "/authorities/subjects/sh2010105253"} - ) - assert concept.source == "lc-subjects" diff --git a/tests/transformers/loc/test_common.py b/tests/transformers/loc/test_common.py index 8c3545ed1f..b317d5d624 100644 --- a/tests/transformers/loc/test_common.py +++ b/tests/transformers/loc/test_common.py @@ -7,72 +7,94 @@ from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept -class TestRemovePrefix: - def test_remove_prefix_noop(self) -> None: +@pytest.mark.parametrize( + "sut_class", [RawLibraryOfCongressSKOSConcept, RawLibraryOfCongressMADSConcept] +) +class TestSourceId: + def test_remove_prefix_noop( + self, sut_class: Type[RawLibraryOfCongressConcept] + ) -> None: """ If there is no prefix to remove, remove_id_prefix will do nothing """ - assert remove_id_prefix("sh1234567890") == "sh1234567890" + assert sut_class({"@id": "sh1234567890"}).source_id == "sh1234567890" - def test_remove_prefix_fully_qualified(self) -> None: + def test_remove_prefix_fully_qualified( + self, sut_class: Type[RawLibraryOfCongressConcept] + ) -> None: """ remove_id_prefix removes fully-qualified URL-style prefixes """ assert ( - remove_id_prefix("http://id.loc.gov/authorities/subjects/sh1234567890") + sut_class( + {"@id": "http://id.loc.gov/authorities/subjects/sh1234567890"} + ).source_id == "sh1234567890" ) assert ( - remove_id_prefix("http://id.loc.gov/authorities/names/sh0987654321") + sut_class( + {"@id": "http://id.loc.gov/authorities/names/sh0987654321"} + ).source_id == "sh0987654321" ) - def test_remove_prefix_relative(self) -> None: + def test_remove_prefix_relative( + self, sut_class: Type[RawLibraryOfCongressConcept] + ) -> None: """ remove_id_prefix removes relative/local prefixes """ - assert remove_id_prefix("/authorities/subjects/sh1234567890") == "sh1234567890" - assert remove_id_prefix("/authorities/names/sh0987654321") == "sh0987654321" + assert ( + sut_class({"@id": "/authorities/subjects/sh1234567890"}).source_id + == "sh1234567890" + ) + assert ( + sut_class({"@id": "/authorities/names/sh0987654321"}).source_id + == "sh0987654321" + ) - def test_remove_prefix_lookalikes(self) -> None: + def test_remove_prefix_lookalikes( + self, sut_class: Type[RawLibraryOfCongressConcept] + ) -> None: """ remove_id_prefix only removes specific known prefixes, not just things that look a bit like them """ assert ( - remove_id_prefix("/authorities/banana/sh1234567890") + sut_class({"@id": "/authorities/banana/sh1234567890"}).source_id == "/authorities/banana/sh1234567890" ) assert ( - remove_id_prefix("https://id.loc.gov.uk/authorities/subjects/sh1234567890") + sut_class( + {"@id": "https://id.loc.gov.uk/authorities/subjects/sh1234567890"} + ).source_id == "https://id.loc.gov.uk/authorities/subjects/sh1234567890" ) +@pytest.mark.parametrize( + "sut_class", [RawLibraryOfCongressSKOSConcept, RawLibraryOfCongressMADSConcept] +) class TestSource: - def test_source_subjects(self) -> None: + def test_source_subjects( + self, sut_class: Type[RawLibraryOfCongressConcept] + ) -> None: """ Given an id with the prefix /authorities/subjects/, the source will be lc-subjects """ - concept = RawLibraryOfCongressConcept( - {"@id": 
"/authorities/subjects/sh2010105253"} - ) + concept = sut_class({"@id": "/authorities/subjects/sh2010105253"}) assert concept.source == "lc-subjects" - def test_source_names(self) -> None: + def test_source_names(self, sut_class: Type[RawLibraryOfCongressConcept]) -> None: """ Given an id with the prefix /authorities/subjects/, the source will be lc-subjects """ - concept = RawLibraryOfCongressConcept( - {"@id": "/authorities/names/sh2010105253"} - ) + concept = sut_class({"@id": "/authorities/names/sh2010105253"}) assert concept.source == "lc-names" - def test_source_invalid(self) -> None: + def test_source_invalid(self, sut_class: Type[RawLibraryOfCongressConcept]) -> None: with pytest.raises(ValueError): - concept = RawLibraryOfCongressConcept( - {"@id": "authorities/childrensSubjects/sj2021051581"} - ) + concept = sut_class({"@id": "authorities/childrensSubjects/sj2021051581"}) concept.source From 5cf71d09dfd8955f65ef5c047591f7ee4fedc015 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Fri, 31 Jan 2025 11:10:21 +0000 Subject: [PATCH 208/310] extract MADS node --- src/transformers/loc/common.py | 4 +- src/transformers/loc/mads/raw_concept.py | 14 ++ tests/fixtures/mads_concept.json | 188 ++++++++++++++++++ tests/fixtures/skos_concept.json | 56 ++++++ .../loc/mads/test_raw_mads_concept.py | 30 +++ .../loc/skos/test_raw_skos_concept.py | 35 ++++ .../{test_common.py => test_raw_concept.py} | 0 7 files changed, 326 insertions(+), 1 deletion(-) create mode 100644 tests/fixtures/mads_concept.json create mode 100644 tests/fixtures/skos_concept.json create mode 100644 tests/transformers/loc/skos/test_raw_skos_concept.py rename tests/transformers/loc/{test_common.py => test_raw_concept.py} (100%) diff --git a/src/transformers/loc/common.py b/src/transformers/loc/common.py index 26f26a7bb0..34eef93aa7 100644 --- a/src/transformers/loc/common.py +++ b/src/transformers/loc/common.py @@ -20,7 +20,9 @@ def __init__(self, raw_concept: dict): self._raw_concept_node = self._extract_concept_node() def _extract_concept_node(self) -> dict | None: - pass + raise NotImplementedError( + "Define a method to extract the corresponding node from the internal @graph in a LoC record" + ) @property def source_id(self) -> str: diff --git a/src/transformers/loc/mads/raw_concept.py b/src/transformers/loc/mads/raw_concept.py index ba85778d91..0a6b987d04 100644 --- a/src/transformers/loc/mads/raw_concept.py +++ b/src/transformers/loc/mads/raw_concept.py @@ -4,3 +4,17 @@ class RawLibraryOfCongressMADSConcept(RawLibraryOfCongressConcept): def __init__(self, raw_concept: dict): super().__init__(raw_concept) + + def _extract_concept_node(self) -> dict | None: + graph: list[dict] = self.raw_concept.get("@graph", []) + for node in graph: + # madsrdf:Authority coresponds to the "idea or notion" + # So the node we are after is the one whose id matches, and is an Authority + # Ignore DeprecatedAuthority in this context, as they are to be excluded. 
+ # https://www.loc.gov/standards/mads/rdf/#t21 + if ( + self.source_id in node.get("@id", "") + and "madsrdf:Authority" in node["@type"] + ): + return node + return None diff --git a/tests/fixtures/mads_concept.json b/tests/fixtures/mads_concept.json new file mode 100644 index 0000000000..7b6130e37b --- /dev/null +++ b/tests/fixtures/mads_concept.json @@ -0,0 +1,188 @@ +{ + "@context": "http://v3/authorities/subjects/context.json", + "@graph": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh85042701", + "@type": [ + "madsrdf:Authority", + "madsrdf:Topic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Embroidery" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b9" + } + ] + } + }, + { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b9", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Embroidery" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh85129340", + "@type": [ + "madsrdf:Authority", + "madsrdf:Topic" + ], + "bflc:marcKey": "150 0$aStump work", + "identifiers:lccn": "sh 85129340", + "madsrdf:adminMetadata": [ + { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b1" + }, + { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b2" + } + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Stump work" + }, + "madsrdf:classification": { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b3" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b4" + } + ] + }, + "madsrdf:hasBroaderAuthority": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh85042701" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh85090601" + } + ], + "madsrdf:hasVariant": { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b6" + }, + "madsrdf:isMemberOfMADSCollection": [ + { + "@id": "http://id.loc.gov/authorities/subjects/collection_LCSHAuthorizedHeadings" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/collection_LCSH_General" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/collection_SubdivideGeographically" + } + ], + "madsrdf:isMemberOfMADSScheme": { + "@id": "http://id.loc.gov/authorities/subjects" + }, + "owl:sameAs": [ + { + "@id": "http://id.loc.gov/authorities/sh85129340#concept" + }, + { + "@id": "info:lc/authorities/sh85129340" + } + ] + }, + { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b1", + "@type": "ri:RecordInfo", + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "1994-12-06T06:43:15" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "ri:recordStatus": "revised" + }, + { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b2", + "@type": "ri:RecordInfo", + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "1986-02-11T00:00:00" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "ri:recordStatus": "new" + }, + { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b3", + "@type": "lcc:ClassNumber", + "madsrdf:code": "TT778.S75", + "madsrdf:hasExactExternalAuthority": { + "@id": "http://id.loc.gov/authorities/classification/TT778.S75" + } + }, + { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b4", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Stump work" + } + }, + { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b6", + "@type": [ + "madsrdf:Topic", + "madsrdf:Variant" + ], + "madsrdf:elementList": { + "@list": [ + { + "@id": 
"_:n6689c63343524db7abe8d4543ce4b897b7" + } + ] + }, + "madsrdf:variantLabel": { + "@language": "en", + "@value": "Stumpwork" + } + }, + { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b7", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Stumpwork" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh85090601", + "@type": [ + "madsrdf:Authority", + "madsrdf:Topic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Needlework" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b11" + } + ] + } + }, + { + "@id": "_:n6689c63343524db7abe8d4543ce4b897b11", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Needlework" + } + } + ], + "@id": "/authorities/subjects/sh85129340" +} \ No newline at end of file diff --git a/tests/fixtures/skos_concept.json b/tests/fixtures/skos_concept.json new file mode 100644 index 0000000000..00fd012361 --- /dev/null +++ b/tests/fixtures/skos_concept.json @@ -0,0 +1,56 @@ +{ + "@context": "http://v3/authorities/subjects/context.json", + "@graph": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh99005104", + "@type": "skos:Concept", + "skos:changeNote": [ + { + "@id": "_:n28dfff48bd194803959bb4d18b09c690b1" + }, + { + "@id": "_:n28dfff48bd194803959bb4d18b09c690b2" + } + ], + "skos:inScheme": { + "@id": "http://id.loc.gov/authorities/subjects" + }, + "skos:note": "Use as a topical subdivision under individual materials and types of materials.", + "skos:prefLabel": { + "@language": "en", + "@value": "Pickling" + } + }, + { + "@id": "_:n28dfff48bd194803959bb4d18b09c690b1", + "@type": "cs:ChangeSet", + "cs:changeReason": "revised", + "cs:createdDate": { + "@type": "xsd:dateTime", + "@value": "2007-11-06T07:52:55" + }, + "cs:creatorName": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "cs:subjectOfChange": { + "@id": "http://id.loc.gov/authorities/subjects/sh99005104" + } + }, + { + "@id": "_:n28dfff48bd194803959bb4d18b09c690b2", + "@type": "cs:ChangeSet", + "cs:changeReason": "new", + "cs:createdDate": { + "@type": "xsd:dateTime", + "@value": "1999-07-08T00:00:00" + }, + "cs:creatorName": { + "@id": "http://id.loc.gov/vocabulary/organizations/ien" + }, + "cs:subjectOfChange": { + "@id": "http://id.loc.gov/authorities/subjects/sh99005104" + } + } + ], + "@id": "/authorities/subjects/sh99005104" +} diff --git a/tests/transformers/loc/mads/test_raw_mads_concept.py b/tests/transformers/loc/mads/test_raw_mads_concept.py index 9af350f195..3f397642f4 100644 --- a/tests/transformers/loc/mads/test_raw_mads_concept.py +++ b/tests/transformers/loc/mads/test_raw_mads_concept.py @@ -5,3 +5,33 @@ from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept sh2010105253 = json.loads(load_fixture("mads_composite_concept.json")) + + +def test_exclude_no_graph() -> None: + """ + If there is no graph, then the concept is to be excluded + """ + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + assert concept.exclude() == True + + +def test_exclude_no_matching_concept_node() -> None: + """ + If the graph does not contain a node of type skos:Concept, it is to be excluded + """ + concept = RawLibraryOfCongressMADSConcept( + json.loads(load_fixture("mads_deprecated_concept.json")) + ) + assert concept.exclude() == True + + +def test_do_not_exclude() -> None: + """ + A complete, non-duplicate, non-deprecated record is to be 
included in the output + """ + concept = RawLibraryOfCongressMADSConcept( + json.loads(load_fixture("mads_concept.json")) + ) + assert concept.exclude() == False diff --git a/tests/transformers/loc/skos/test_raw_skos_concept.py b/tests/transformers/loc/skos/test_raw_skos_concept.py new file mode 100644 index 0000000000..f1c3694461 --- /dev/null +++ b/tests/transformers/loc/skos/test_raw_skos_concept.py @@ -0,0 +1,35 @@ +import json + +from test_utils import load_fixture + +from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept + + +def test_exclude_no_graph() -> None: + """ + If there is no graph, then the concept is to be excluded + """ + concept = RawLibraryOfCongressSKOSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + assert concept.exclude() == True + + +def test_exclude_no_matching_concept_node() -> None: + """ + If the graph does not contain a node of type skos:Concept, it is to be excluded + """ + concept = RawLibraryOfCongressSKOSConcept( + json.loads(load_fixture("skos_deprecated_concept.json")) + ) + assert concept.exclude() == True + + +def test_do_not_exclude() -> None: + """ + A complete, non-duplicate, non-deprecated record is to be included in the output + """ + concept = RawLibraryOfCongressSKOSConcept( + json.loads(load_fixture("skos_concept.json")) + ) + assert concept.exclude() == False diff --git a/tests/transformers/loc/test_common.py b/tests/transformers/loc/test_raw_concept.py similarity index 100% rename from tests/transformers/loc/test_common.py rename to tests/transformers/loc/test_raw_concept.py From f54e6a65c955375bc66abf9af8a83e4be5255209 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Fri, 31 Jan 2025 11:16:37 +0000 Subject: [PATCH 209/310] typo --- src/transformers/loc/mads/raw_concept.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/loc/mads/raw_concept.py b/src/transformers/loc/mads/raw_concept.py index 0a6b987d04..e9e510faba 100644 --- a/src/transformers/loc/mads/raw_concept.py +++ b/src/transformers/loc/mads/raw_concept.py @@ -8,7 +8,7 @@ def __init__(self, raw_concept: dict): def _extract_concept_node(self) -> dict | None: graph: list[dict] = self.raw_concept.get("@graph", []) for node in graph: - # madsrdf:Authority coresponds to the "idea or notion" + # madsrdf:Authority corresponds to the "idea or notion" # So the node we are after is the one whose id matches, and is an Authority # Ignore DeprecatedAuthority in this context, as they are to be excluded. 
# https://www.loc.gov/standards/mads/rdf/#t21 From 13433c14eecb5fc5133bb759042a76ef6388a225 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Fri, 31 Jan 2025 11:23:55 +0000 Subject: [PATCH 210/310] add label to mads --- src/transformers/loc/mads/raw_concept.py | 5 +++++ tests/transformers/loc/mads/test_raw_mads_concept.py | 10 ++++++++++ tests/transformers/loc/skos/test_raw_skos_concept.py | 10 ++++++++++ 3 files changed, 25 insertions(+) diff --git a/src/transformers/loc/mads/raw_concept.py b/src/transformers/loc/mads/raw_concept.py index e9e510faba..6849634e34 100644 --- a/src/transformers/loc/mads/raw_concept.py +++ b/src/transformers/loc/mads/raw_concept.py @@ -18,3 +18,8 @@ def _extract_concept_node(self) -> dict | None: ): return node return None + + @property + def label(self) -> str: + raw_preferred_label = self._raw_concept_node["madsrdf:authoritativeLabel"] + return self._extract_label(raw_preferred_label) diff --git a/tests/transformers/loc/mads/test_raw_mads_concept.py b/tests/transformers/loc/mads/test_raw_mads_concept.py index 3f397642f4..4f017150ce 100644 --- a/tests/transformers/loc/mads/test_raw_mads_concept.py +++ b/tests/transformers/loc/mads/test_raw_mads_concept.py @@ -35,3 +35,13 @@ def test_do_not_exclude() -> None: json.loads(load_fixture("mads_concept.json")) ) assert concept.exclude() == False + + +def test_label() -> None: + """ + Label is extracted from madsrdf:authoritativeLabel + """ + concept = RawLibraryOfCongressMADSConcept( + json.loads(load_fixture("mads_concept.json")) + ) + assert concept.label == "Stump work" diff --git a/tests/transformers/loc/skos/test_raw_skos_concept.py b/tests/transformers/loc/skos/test_raw_skos_concept.py index f1c3694461..5e51f3d4d8 100644 --- a/tests/transformers/loc/skos/test_raw_skos_concept.py +++ b/tests/transformers/loc/skos/test_raw_skos_concept.py @@ -33,3 +33,13 @@ def test_do_not_exclude() -> None: json.loads(load_fixture("skos_concept.json")) ) assert concept.exclude() == False + + +def test_label() -> None: + """ + Label is extracted from madsrdf:authoritativeLabel + """ + concept = RawLibraryOfCongressSKOSConcept( + json.loads(load_fixture("skos_concept.json")) + ) + assert concept.label == "Pickling" From 6d058dfc8c700ae0175e8d1b9476dcd3af6f0bbe Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Fri, 31 Jan 2025 12:01:02 +0000 Subject: [PATCH 211/310] add label to mads --- tests/fixtures/skos_geographic_concept.json | 95 +++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 tests/fixtures/skos_geographic_concept.json diff --git a/tests/fixtures/skos_geographic_concept.json b/tests/fixtures/skos_geographic_concept.json new file mode 100644 index 0000000000..d3178f6618 --- /dev/null +++ b/tests/fixtures/skos_geographic_concept.json @@ -0,0 +1,95 @@ +{ + "@context": "http://v3/authorities/subjects/context.json", + "@graph": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh85040229", + "@type": "skos:Concept", + "skos:prefLabel": { + "@language": "en", + "@value": "Dwellings--England" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh2017003861", + "@type": "skos:Concept", + "skos:broader": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh85040229" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh85053109" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh92006359" + } + ], + "skos:changeNote": [ + { + "@id": "_:n2de234bdaf9048a8bb64367927c52189b1" + }, + { + "@id": "_:n2de234bdaf9048a8bb64367927c52189b2" + } + ], + "skos:editorialNote": "[This 
heading is not valid for use as a geographic subdivision.]", + "skos:inScheme": { + "@id": "http://id.loc.gov/authorities/subjects" + }, + "skos:notation": { + "@type": "http://id.loc.gov/datatypes/codes/gac", + "@value": "e-uk-en" + }, + "skos:prefLabel": { + "@language": "en", + "@value": "Caversham Park (Reading, England)" + } + }, + { + "@id": "_:n2de234bdaf9048a8bb64367927c52189b1", + "@type": "cs:ChangeSet", + "cs:changeReason": "revised", + "cs:createdDate": { + "@type": "xsd:dateTime", + "@value": "2017-10-13T11:41:51" + }, + "cs:creatorName": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "cs:subjectOfChange": { + "@id": "http://id.loc.gov/authorities/subjects/sh2017003861" + } + }, + { + "@id": "_:n2de234bdaf9048a8bb64367927c52189b2", + "@type": "cs:ChangeSet", + "cs:changeReason": "new", + "cs:createdDate": { + "@type": "xsd:dateTime", + "@value": "2017-07-06T00:00:00" + }, + "cs:creatorName": { + "@id": "http://id.loc.gov/vocabulary/organizations/ukcu" + }, + "cs:subjectOfChange": { + "@id": "http://id.loc.gov/authorities/subjects/sh2017003861" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh85053109", + "@type": "skos:Concept", + "skos:prefLabel": { + "@language": "en", + "@value": "Gardens--England" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh92006359", + "@type": "skos:Concept", + "skos:prefLabel": { + "@language": "en", + "@value": "Office buildings--England" + } + } + ], + "@id": "/authorities/subjects/sh2017003861" +} From 3b5fd5b0075fa42a892b36e2169cea49c03fdbcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 31 Jan 2025 14:24:37 +0000 Subject: [PATCH 212/310] Initial Wikidata source unit testing --- .../linked_ontology_id_type_checker.py | 7 +-- src/sources/wikidata/sparql_query_builder.py | 8 +-- .../loc_concepts_transformer_output.csv | 3 + tests/fixtures/wikidata/all_ids_query.json | 4 ++ tests/fixtures/wikidata/all_ids_response.json | 18 ++++++ tests/fixtures/wikidata/linked_ids_query.json | 4 ++ .../wikidata/linked_ids_response.json | 26 +++++++++ .../wikidata/parents_instance_of_query.json | 4 ++ .../parents_instance_of_response.json | 26 +++++++++ .../wikidata/parents_subclass_of_query.json | 4 ++ .../parents_subclass_of_response.json | 16 ++++++ .../test_mesh_concepts_source.py | 0 .../sources/test_wikidata_concepts_source.py | 57 +++++++++++++++++++ tests/test_mocks.py | 39 ++++++------- .../test_mesh_concepts_transformer.py | 0 15 files changed, 189 insertions(+), 27 deletions(-) create mode 100644 tests/fixtures/loc_concepts_transformer_output.csv create mode 100644 tests/fixtures/wikidata/all_ids_query.json create mode 100644 tests/fixtures/wikidata/all_ids_response.json create mode 100644 tests/fixtures/wikidata/linked_ids_query.json create mode 100644 tests/fixtures/wikidata/linked_ids_response.json create mode 100644 tests/fixtures/wikidata/parents_instance_of_query.json create mode 100644 tests/fixtures/wikidata/parents_instance_of_response.json create mode 100644 tests/fixtures/wikidata/parents_subclass_of_query.json create mode 100644 tests/fixtures/wikidata/parents_subclass_of_response.json rename tests/{ => sources}/test_mesh_concepts_source.py (100%) create mode 100644 tests/sources/test_wikidata_concepts_source.py rename tests/{ => transformers}/test_mesh_concepts_transformer.py (100%) diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index 90f52ad315..66116a1409 100644 --- 
a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -1,11 +1,9 @@ -import os from functools import lru_cache import boto3 import smart_open -from config import S3_BULK_LOAD_BUCKET_NAME - +import config from .sparql_query_builder import NodeType, OntologyType @@ -30,7 +28,7 @@ def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: """Return all ids classified under a given `node_type` for the selected ontology.""" # Retrieve the bulk load file outputted by the relevant transformer so that we can extract ids from it. linked_nodes_file_name = f"{self.linked_ontology}_{node_type}__nodes.csv" - s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" + s3_url = f"s3://{config.S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" print( f"Retrieving ids of type '{node_type}' from ontology '{self.linked_ontology}' from S3.", @@ -39,6 +37,7 @@ def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: ) ids = set() + transport_params = {"client": boto3.client("s3")} with smart_open.open(s3_url, "r", transport_params=transport_params) as f: # Loop through all items in the file and extract the id from each item diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index abc343fea9..6e9f377704 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -95,7 +95,7 @@ def get_items_query(cls, item_ids: list[str], node_type: NodeType) -> str: Given a list of Wikidata `item_ids`, return a query to retrieve all required Wikidata fields for each id in the list. """ - ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in item_ids]) + ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in sorted(item_ids)]) query = f""" SELECT DISTINCT {cls._get_formatted_fields(node_type)} @@ -124,7 +124,7 @@ def get_linked_ids_query( Given a list of Wikidata `item_ids`, return a query to retrieve all linked ontology ids referenced by each item in the list. """ - ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in item_ids]) + ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in sorted(item_ids)]) query = f""" SELECT DISTINCT ?item ?linkedId @@ -134,7 +134,7 @@ def get_linked_ids_query( }} """ - return query + return SparqlQueryBuilder._compact_format_query(query) @classmethod def get_parents_query( @@ -146,7 +146,7 @@ def get_parents_query( Given a list of Wikidata `item_ids`, return a query to retrieve all parents of each item in the list. Parents are determined based on the 'subclass of' (P279) or the 'instance of' (P31) fields. """ - ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in item_ids]) + ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in sorted(item_ids)]) if relationship_type == "instance_of": relationship = "?child wdt:P31 ?item." 
diff --git a/tests/fixtures/loc_concepts_transformer_output.csv b/tests/fixtures/loc_concepts_transformer_output.csv new file mode 100644 index 0000000000..fedbad4077 --- /dev/null +++ b/tests/fixtures/loc_concepts_transformer_output.csv @@ -0,0 +1,3 @@ +:ID,:LABEL,id,label,source,alternative_ids,alternative_labels,description +sh00000001,SourceConcept,sh00000001,ActionScript (Computer program language),lc-subjects,,,null +sh00000002,SourceConcept,sh00000002,Tacos,lc-subjects,,,null diff --git a/tests/fixtures/wikidata/all_ids_query.json b/tests/fixtures/wikidata/all_ids_query.json new file mode 100644 index 0000000000..bf2726013c --- /dev/null +++ b/tests/fixtures/wikidata/all_ids_query.json @@ -0,0 +1,4 @@ +{ + "format": "json", + "query": "SELECT ?item WHERE { ?item wdt:P244 _:anyValueP244. }" +} diff --git a/tests/fixtures/wikidata/all_ids_response.json b/tests/fixtures/wikidata/all_ids_response.json new file mode 100644 index 0000000000..6cd31df097 --- /dev/null +++ b/tests/fixtures/wikidata/all_ids_response.json @@ -0,0 +1,18 @@ +{ + "results": { + "bindings": [ + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q1" + } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q2" + } + } + ] + } +} diff --git a/tests/fixtures/wikidata/linked_ids_query.json b/tests/fixtures/wikidata/linked_ids_query.json new file mode 100644 index 0000000000..d3874e7c50 --- /dev/null +++ b/tests/fixtures/wikidata/linked_ids_query.json @@ -0,0 +1,4 @@ +{ + "format": "json", + "query": "SELECT DISTINCT ?item ?linkedId WHERE { VALUES ?item { wd:Q1 wd:Q2 } ?item p:P244/ps:P244 ?linkedId. }" +} diff --git a/tests/fixtures/wikidata/linked_ids_response.json b/tests/fixtures/wikidata/linked_ids_response.json new file mode 100644 index 0000000000..039a2b4327 --- /dev/null +++ b/tests/fixtures/wikidata/linked_ids_response.json @@ -0,0 +1,26 @@ +{ + "results": { + "bindings": [ + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q1" + }, + "linkedId": { + "type": "uri", + "value": "sh00000001" + } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q2" + }, + "linkedId": { + "type": "uri", + "value": "sh00000001" + } + } + ] + } +} diff --git a/tests/fixtures/wikidata/parents_instance_of_query.json b/tests/fixtures/wikidata/parents_instance_of_query.json new file mode 100644 index 0000000000..8521f76e68 --- /dev/null +++ b/tests/fixtures/wikidata/parents_instance_of_query.json @@ -0,0 +1,4 @@ +{ + "format": "json", + "query": "SELECT DISTINCT ?child ?item WHERE { VALUES ?child { wd:Q1 wd:Q2 } ?child wdt:P31 ?item. 
}" +} diff --git a/tests/fixtures/wikidata/parents_instance_of_response.json b/tests/fixtures/wikidata/parents_instance_of_response.json new file mode 100644 index 0000000000..caeb9577f0 --- /dev/null +++ b/tests/fixtures/wikidata/parents_instance_of_response.json @@ -0,0 +1,26 @@ +{ + "results": { + "bindings": [ + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q1" + }, + "child": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q2" + } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q3" + }, + "child": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q2" + } + } + ] + } +} diff --git a/tests/fixtures/wikidata/parents_subclass_of_query.json b/tests/fixtures/wikidata/parents_subclass_of_query.json new file mode 100644 index 0000000000..318027fef7 --- /dev/null +++ b/tests/fixtures/wikidata/parents_subclass_of_query.json @@ -0,0 +1,4 @@ +{ + "format": "json", + "query": "SELECT DISTINCT ?child ?item WHERE { VALUES ?child { wd:Q1 wd:Q2 } ?child wdt:P279 ?item. }" +} diff --git a/tests/fixtures/wikidata/parents_subclass_of_response.json b/tests/fixtures/wikidata/parents_subclass_of_response.json new file mode 100644 index 0000000000..f524cc236d --- /dev/null +++ b/tests/fixtures/wikidata/parents_subclass_of_response.json @@ -0,0 +1,16 @@ +{ + "results": { + "bindings": [ + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q4" + }, + "child": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q1" + } + } + ] + } +} diff --git a/tests/test_mesh_concepts_source.py b/tests/sources/test_mesh_concepts_source.py similarity index 100% rename from tests/test_mesh_concepts_source.py rename to tests/sources/test_mesh_concepts_source.py diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py new file mode 100644 index 0000000000..7e3949e769 --- /dev/null +++ b/tests/sources/test_wikidata_concepts_source.py @@ -0,0 +1,57 @@ +from test_mocks import MockRequest, MockSmartOpen +import json +from typing import Literal +from config import WIKIDATA_SPARQL_URL + +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from test_utils import load_fixture + + +def add_mock_wikidata_request( + query_type: Literal[ + "all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of" + ] +) -> None: + params = json.loads(load_fixture(f"wikidata/{query_type}_query.json")) + response = json.loads(load_fixture(f"wikidata/{query_type}_response.json")) + MockRequest.mock_response( + method="GET", url=WIKIDATA_SPARQL_URL, params=params, json_data=response + ) + + +def test_wikidata_concepts_source() -> None: + MockSmartOpen.mock_s3_file( + "s3://bulk_load_test_bucket/loc_concepts__nodes.csv", + load_fixture("loc_concepts_transformer_output.csv").decode(), + ) + add_mock_wikidata_request("all_ids") + add_mock_wikidata_request("linked_ids") + add_mock_wikidata_request("parents_instance_of") + add_mock_wikidata_request("parents_subclass_of") + + mesh_concepts_source = WikidataLinkedOntologySource( + node_type="concepts", linked_ontology="loc", entity_type="edges" + ) + + stream_result = list(mesh_concepts_source.stream_raw()) + + assert len(stream_result) == 5 + + same_as_edges = set() + has_parent_edges = set() + for edge in stream_result: + if edge["type"] == "SAME_AS": + same_as_edges.add((edge["wikidata_id"], edge["linked_id"])) + elif edge["type"] == "HAS_PARENT": + has_parent_edges.add((edge["child_id"], 
edge["parent_id"])) + else: + raise ValueError(f"Unknown edge type {edge['type']}") + + assert len(same_as_edges) == 2 + assert ("Q1", "sh00000001") in same_as_edges + assert ("Q2", "sh00000001") in same_as_edges + + assert len(has_parent_edges) == 3 + assert ("Q1", "Q4") in has_parent_edges + assert ("Q2", "Q1") in has_parent_edges + assert ("Q2", "Q3") in has_parent_edges diff --git a/tests/test_mocks.py b/tests/test_mocks.py index 95064edaa1..53d8032942 100644 --- a/tests/test_mocks.py +++ b/tests/test_mocks.py @@ -3,7 +3,6 @@ from typing import Any, TypedDict from botocore.credentials import Credentials -from test_utils import load_fixture from utils.aws import INSTANCE_ENDPOINT_SECRET_NAME, LOAD_BALANCER_SECRET_NAME @@ -19,24 +18,26 @@ class MockSmartOpen: file_lookup: dict = {} - @staticmethod - def reset_mocks() -> None: - MockSmartOpen.file_lookup = {} - - @staticmethod - def get_mock_file(uri: str) -> Any: - return MockSmartOpen.file_lookup[uri] - - @staticmethod - def open(uri: str, mode: str, **kwargs: Any) -> Any: - # Create an in-memory text stream - mock_file = io.StringIO() - - # Save the file object in the file lookup - MockSmartOpen.file_lookup[uri] = mock_file + @classmethod + def reset_mocks(cls) -> None: + cls.file_lookup = {} + + @classmethod + def mock_s3_file(cls, uri: str, content: str) -> None: + cls.file_lookup[uri] = io.StringIO(content) + + @classmethod + def open(cls, uri: str, mode: str, **kwargs: Any) -> Any: + if mode == "w": + # Create an in-memory text stream and save the file object in the file lookup + cls.file_lookup[uri] = io.StringIO() + elif mode == "r": + if uri not in cls.file_lookup: + raise KeyError(f"Mock S3 file {uri} does not exist.") + else: + raise ValueError(f"Unsupported file mode: {mode}") - # create temp file and open it with given mode - return mock_file + return cls.file_lookup[uri] class MockAwsService: @@ -196,7 +197,7 @@ def request( ): return response["response"] - raise Exception(f"Unexpected request: {method} {url}") + raise Exception(f"Unexpected request: {method} {url} {params}") @staticmethod def get( diff --git a/tests/test_mesh_concepts_transformer.py b/tests/transformers/test_mesh_concepts_transformer.py similarity index 100% rename from tests/test_mesh_concepts_transformer.py rename to tests/transformers/test_mesh_concepts_transformer.py From 6818f1e1c20ece7b45302df18bb6fbb1d649be6b Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Fri, 31 Jan 2025 14:25:35 +0000 Subject: [PATCH 213/310] Apply auto-formatting rules --- src/sources/wikidata/linked_ontology_id_type_checker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index 66116a1409..425ced53d4 100644 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -4,6 +4,7 @@ import smart_open import config + from .sparql_query_builder import NodeType, OntologyType From 16ad27454ee6844653a744de70d5833dc5dbc4e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 31 Jan 2025 14:24:37 +0000 Subject: [PATCH 214/310] Initial Wikidata source unit testing #5898 --- .../linked_ontology_id_type_checker.py | 7 +-- src/sources/wikidata/sparql_query_builder.py | 8 +-- .../loc_concepts_transformer_output.csv | 3 + tests/fixtures/wikidata/all_ids_query.json | 4 ++ tests/fixtures/wikidata/all_ids_response.json | 18 ++++++ 
tests/fixtures/wikidata/linked_ids_query.json | 4 ++ .../wikidata/linked_ids_response.json | 26 +++++++++ .../wikidata/parents_instance_of_query.json | 4 ++ .../parents_instance_of_response.json | 26 +++++++++ .../wikidata/parents_subclass_of_query.json | 4 ++ .../parents_subclass_of_response.json | 16 ++++++ .../test_mesh_concepts_source.py | 0 .../sources/test_wikidata_concepts_source.py | 57 +++++++++++++++++++ tests/test_mocks.py | 39 ++++++------- .../test_mesh_concepts_transformer.py | 0 15 files changed, 189 insertions(+), 27 deletions(-) create mode 100644 tests/fixtures/loc_concepts_transformer_output.csv create mode 100644 tests/fixtures/wikidata/all_ids_query.json create mode 100644 tests/fixtures/wikidata/all_ids_response.json create mode 100644 tests/fixtures/wikidata/linked_ids_query.json create mode 100644 tests/fixtures/wikidata/linked_ids_response.json create mode 100644 tests/fixtures/wikidata/parents_instance_of_query.json create mode 100644 tests/fixtures/wikidata/parents_instance_of_response.json create mode 100644 tests/fixtures/wikidata/parents_subclass_of_query.json create mode 100644 tests/fixtures/wikidata/parents_subclass_of_response.json rename tests/{ => sources}/test_mesh_concepts_source.py (100%) create mode 100644 tests/sources/test_wikidata_concepts_source.py rename tests/{ => transformers}/test_mesh_concepts_transformer.py (100%) diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index 90f52ad315..66116a1409 100644 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -1,11 +1,9 @@ -import os from functools import lru_cache import boto3 import smart_open -from config import S3_BULK_LOAD_BUCKET_NAME - +import config from .sparql_query_builder import NodeType, OntologyType @@ -30,7 +28,7 @@ def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: """Return all ids classified under a given `node_type` for the selected ontology.""" # Retrieve the bulk load file outputted by the relevant transformer so that we can extract ids from it. linked_nodes_file_name = f"{self.linked_ontology}_{node_type}__nodes.csv" - s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" + s3_url = f"s3://{config.S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" print( f"Retrieving ids of type '{node_type}' from ontology '{self.linked_ontology}' from S3.", @@ -39,6 +37,7 @@ def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: ) ids = set() + transport_params = {"client": boto3.client("s3")} with smart_open.open(s3_url, "r", transport_params=transport_params) as f: # Loop through all items in the file and extract the id from each item diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index abc343fea9..6e9f377704 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -95,7 +95,7 @@ def get_items_query(cls, item_ids: list[str], node_type: NodeType) -> str: Given a list of Wikidata `item_ids`, return a query to retrieve all required Wikidata fields for each id in the list. 
""" - ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in item_ids]) + ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in sorted(item_ids)]) query = f""" SELECT DISTINCT {cls._get_formatted_fields(node_type)} @@ -124,7 +124,7 @@ def get_linked_ids_query( Given a list of Wikidata `item_ids`, return a query to retrieve all linked ontology ids referenced by each item in the list. """ - ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in item_ids]) + ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in sorted(item_ids)]) query = f""" SELECT DISTINCT ?item ?linkedId @@ -134,7 +134,7 @@ def get_linked_ids_query( }} """ - return query + return SparqlQueryBuilder._compact_format_query(query) @classmethod def get_parents_query( @@ -146,7 +146,7 @@ def get_parents_query( Given a list of Wikidata `item_ids`, return a query to retrieve all parents of each item in the list. Parents are determined based on the 'subclass of' (P279) or the 'instance of' (P31) fields. """ - ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in item_ids]) + ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in sorted(item_ids)]) if relationship_type == "instance_of": relationship = "?child wdt:P31 ?item." diff --git a/tests/fixtures/loc_concepts_transformer_output.csv b/tests/fixtures/loc_concepts_transformer_output.csv new file mode 100644 index 0000000000..fedbad4077 --- /dev/null +++ b/tests/fixtures/loc_concepts_transformer_output.csv @@ -0,0 +1,3 @@ +:ID,:LABEL,id,label,source,alternative_ids,alternative_labels,description +sh00000001,SourceConcept,sh00000001,ActionScript (Computer program language),lc-subjects,,,null +sh00000002,SourceConcept,sh00000002,Tacos,lc-subjects,,,null diff --git a/tests/fixtures/wikidata/all_ids_query.json b/tests/fixtures/wikidata/all_ids_query.json new file mode 100644 index 0000000000..bf2726013c --- /dev/null +++ b/tests/fixtures/wikidata/all_ids_query.json @@ -0,0 +1,4 @@ +{ + "format": "json", + "query": "SELECT ?item WHERE { ?item wdt:P244 _:anyValueP244. }" +} diff --git a/tests/fixtures/wikidata/all_ids_response.json b/tests/fixtures/wikidata/all_ids_response.json new file mode 100644 index 0000000000..6cd31df097 --- /dev/null +++ b/tests/fixtures/wikidata/all_ids_response.json @@ -0,0 +1,18 @@ +{ + "results": { + "bindings": [ + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q1" + } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q2" + } + } + ] + } +} diff --git a/tests/fixtures/wikidata/linked_ids_query.json b/tests/fixtures/wikidata/linked_ids_query.json new file mode 100644 index 0000000000..d3874e7c50 --- /dev/null +++ b/tests/fixtures/wikidata/linked_ids_query.json @@ -0,0 +1,4 @@ +{ + "format": "json", + "query": "SELECT DISTINCT ?item ?linkedId WHERE { VALUES ?item { wd:Q1 wd:Q2 } ?item p:P244/ps:P244 ?linkedId. 
}" +} diff --git a/tests/fixtures/wikidata/linked_ids_response.json b/tests/fixtures/wikidata/linked_ids_response.json new file mode 100644 index 0000000000..039a2b4327 --- /dev/null +++ b/tests/fixtures/wikidata/linked_ids_response.json @@ -0,0 +1,26 @@ +{ + "results": { + "bindings": [ + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q1" + }, + "linkedId": { + "type": "uri", + "value": "sh00000001" + } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q2" + }, + "linkedId": { + "type": "uri", + "value": "sh00000001" + } + } + ] + } +} diff --git a/tests/fixtures/wikidata/parents_instance_of_query.json b/tests/fixtures/wikidata/parents_instance_of_query.json new file mode 100644 index 0000000000..8521f76e68 --- /dev/null +++ b/tests/fixtures/wikidata/parents_instance_of_query.json @@ -0,0 +1,4 @@ +{ + "format": "json", + "query": "SELECT DISTINCT ?child ?item WHERE { VALUES ?child { wd:Q1 wd:Q2 } ?child wdt:P31 ?item. }" +} diff --git a/tests/fixtures/wikidata/parents_instance_of_response.json b/tests/fixtures/wikidata/parents_instance_of_response.json new file mode 100644 index 0000000000..caeb9577f0 --- /dev/null +++ b/tests/fixtures/wikidata/parents_instance_of_response.json @@ -0,0 +1,26 @@ +{ + "results": { + "bindings": [ + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q1" + }, + "child": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q2" + } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q3" + }, + "child": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q2" + } + } + ] + } +} diff --git a/tests/fixtures/wikidata/parents_subclass_of_query.json b/tests/fixtures/wikidata/parents_subclass_of_query.json new file mode 100644 index 0000000000..318027fef7 --- /dev/null +++ b/tests/fixtures/wikidata/parents_subclass_of_query.json @@ -0,0 +1,4 @@ +{ + "format": "json", + "query": "SELECT DISTINCT ?child ?item WHERE { VALUES ?child { wd:Q1 wd:Q2 } ?child wdt:P279 ?item. 
}" +} diff --git a/tests/fixtures/wikidata/parents_subclass_of_response.json b/tests/fixtures/wikidata/parents_subclass_of_response.json new file mode 100644 index 0000000000..f524cc236d --- /dev/null +++ b/tests/fixtures/wikidata/parents_subclass_of_response.json @@ -0,0 +1,16 @@ +{ + "results": { + "bindings": [ + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q4" + }, + "child": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q1" + } + } + ] + } +} diff --git a/tests/test_mesh_concepts_source.py b/tests/sources/test_mesh_concepts_source.py similarity index 100% rename from tests/test_mesh_concepts_source.py rename to tests/sources/test_mesh_concepts_source.py diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py new file mode 100644 index 0000000000..7e3949e769 --- /dev/null +++ b/tests/sources/test_wikidata_concepts_source.py @@ -0,0 +1,57 @@ +from test_mocks import MockRequest, MockSmartOpen +import json +from typing import Literal +from config import WIKIDATA_SPARQL_URL + +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from test_utils import load_fixture + + +def add_mock_wikidata_request( + query_type: Literal[ + "all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of" + ] +) -> None: + params = json.loads(load_fixture(f"wikidata/{query_type}_query.json")) + response = json.loads(load_fixture(f"wikidata/{query_type}_response.json")) + MockRequest.mock_response( + method="GET", url=WIKIDATA_SPARQL_URL, params=params, json_data=response + ) + + +def test_wikidata_concepts_source() -> None: + MockSmartOpen.mock_s3_file( + "s3://bulk_load_test_bucket/loc_concepts__nodes.csv", + load_fixture("loc_concepts_transformer_output.csv").decode(), + ) + add_mock_wikidata_request("all_ids") + add_mock_wikidata_request("linked_ids") + add_mock_wikidata_request("parents_instance_of") + add_mock_wikidata_request("parents_subclass_of") + + mesh_concepts_source = WikidataLinkedOntologySource( + node_type="concepts", linked_ontology="loc", entity_type="edges" + ) + + stream_result = list(mesh_concepts_source.stream_raw()) + + assert len(stream_result) == 5 + + same_as_edges = set() + has_parent_edges = set() + for edge in stream_result: + if edge["type"] == "SAME_AS": + same_as_edges.add((edge["wikidata_id"], edge["linked_id"])) + elif edge["type"] == "HAS_PARENT": + has_parent_edges.add((edge["child_id"], edge["parent_id"])) + else: + raise ValueError(f"Unknown edge type {edge['type']}") + + assert len(same_as_edges) == 2 + assert ("Q1", "sh00000001") in same_as_edges + assert ("Q2", "sh00000001") in same_as_edges + + assert len(has_parent_edges) == 3 + assert ("Q1", "Q4") in has_parent_edges + assert ("Q2", "Q1") in has_parent_edges + assert ("Q2", "Q3") in has_parent_edges diff --git a/tests/test_mocks.py b/tests/test_mocks.py index 95064edaa1..53d8032942 100644 --- a/tests/test_mocks.py +++ b/tests/test_mocks.py @@ -3,7 +3,6 @@ from typing import Any, TypedDict from botocore.credentials import Credentials -from test_utils import load_fixture from utils.aws import INSTANCE_ENDPOINT_SECRET_NAME, LOAD_BALANCER_SECRET_NAME @@ -19,24 +18,26 @@ class MockSmartOpen: file_lookup: dict = {} - @staticmethod - def reset_mocks() -> None: - MockSmartOpen.file_lookup = {} - - @staticmethod - def get_mock_file(uri: str) -> Any: - return MockSmartOpen.file_lookup[uri] - - @staticmethod - def open(uri: str, mode: str, **kwargs: Any) -> Any: - # Create an in-memory text 
stream - mock_file = io.StringIO() - - # Save the file object in the file lookup - MockSmartOpen.file_lookup[uri] = mock_file + @classmethod + def reset_mocks(cls) -> None: + cls.file_lookup = {} + + @classmethod + def mock_s3_file(cls, uri: str, content: str) -> None: + cls.file_lookup[uri] = io.StringIO(content) + + @classmethod + def open(cls, uri: str, mode: str, **kwargs: Any) -> Any: + if mode == "w": + # Create an in-memory text stream and save the file object in the file lookup + cls.file_lookup[uri] = io.StringIO() + elif mode == "r": + if uri not in cls.file_lookup: + raise KeyError(f"Mock S3 file {uri} does not exist.") + else: + raise ValueError(f"Unsupported file mode: {mode}") - # create temp file and open it with given mode - return mock_file + return cls.file_lookup[uri] class MockAwsService: @@ -196,7 +197,7 @@ def request( ): return response["response"] - raise Exception(f"Unexpected request: {method} {url}") + raise Exception(f"Unexpected request: {method} {url} {params}") @staticmethod def get( diff --git a/tests/test_mesh_concepts_transformer.py b/tests/transformers/test_mesh_concepts_transformer.py similarity index 100% rename from tests/test_mesh_concepts_transformer.py rename to tests/transformers/test_mesh_concepts_transformer.py From fcc96fe53b5fcd2c54b27c5e72d00e57ccccf7af Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Fri, 31 Jan 2025 14:55:25 +0000 Subject: [PATCH 215/310] add broaders to mads --- src/transformers/loc/common.py | 23 +++- src/transformers/loc/mads/raw_concept.py | 32 ++++- .../loc/mads/test_raw_mads_concept.py | 115 +++++++++++++----- .../loc/skos/test_raw_skos_concept.py | 75 ++++++++---- 4 files changed, 185 insertions(+), 60 deletions(-) diff --git a/src/transformers/loc/common.py b/src/transformers/loc/common.py index 34eef93aa7..d1ffc36fd6 100644 --- a/src/transformers/loc/common.py +++ b/src/transformers/loc/common.py @@ -20,9 +20,7 @@ def __init__(self, raw_concept: dict): self._raw_concept_node = self._extract_concept_node() def _extract_concept_node(self) -> dict | None: - raise NotImplementedError( - "Define a method to extract the corresponding node from the internal @graph in a LoC record" - ) + raise NotImplementedError @property def source_id(self) -> str: @@ -38,6 +36,25 @@ def source(self) -> Literal["lc-subjects", "lc-names"]: raise ValueError("Unknown concept type.") + @property + def label(self) -> str: + raise NotImplementedError + + @property + def is_geographic(self) -> bool: + """Returns True if the node represents a geographic concept""" + raise NotImplementedError + + @property + def broader_concept_ids(self) -> list[str]: + """Returns a list of IDs representing concepts which are broader than the current concept.""" + raise NotImplementedError + + @property + def related_concept_ids(self) -> list[str]: + """Returns a list of IDs representing concepts which are related to the current concept.""" + raise NotImplementedError + @staticmethod def _extract_label(raw_label: str | dict[str, str] | list[str]) -> str: # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. 
diff --git a/src/transformers/loc/mads/raw_concept.py b/src/transformers/loc/mads/raw_concept.py index 6849634e34..19689819de 100644 --- a/src/transformers/loc/mads/raw_concept.py +++ b/src/transformers/loc/mads/raw_concept.py @@ -1,4 +1,4 @@ -from transformers.loc.common import RawLibraryOfCongressConcept +from transformers.loc.common import RawLibraryOfCongressConcept, remove_id_prefix class RawLibraryOfCongressMADSConcept(RawLibraryOfCongressConcept): @@ -23,3 +23,33 @@ def _extract_concept_node(self) -> dict | None: def label(self) -> str: raw_preferred_label = self._raw_concept_node["madsrdf:authoritativeLabel"] return self._extract_label(raw_preferred_label) + + @property + def is_geographic(self) -> bool: + """Returns True if the node represents a geographic concept, as determined by @type""" + return "madsrdf:Geographic" in self._raw_concept_node.get("@type", []) + + @property + def broader_concept_ids(self) -> list[str]: + return _filter_irrelevant_ids( + [ + remove_id_prefix(broader["@id"]) + for broader in _as_list( + self._raw_concept_node.get("madsrdf:hasBroaderAuthority", []) + ) + ] + ) + + +def _filter_irrelevant_ids(ids: list[str]) -> list[str]: + return [concept_id for concept_id in ids if not concept_id.startswith("_:n")] + + +def _as_list(dict_or_list: dict | list[dict]) -> list[dict]: + # Some fields in the source data may contain one or more values + # When it contains multiple values, it will be a list, + # but in the case where they contain just one value, it is not. + # Wrap bare single values in a list, for consistency of processing downstream + if isinstance(dict_or_list, dict): + return [dict_or_list] + return dict_or_list diff --git a/tests/transformers/loc/mads/test_raw_mads_concept.py b/tests/transformers/loc/mads/test_raw_mads_concept.py index 4f017150ce..86e7182a8f 100644 --- a/tests/transformers/loc/mads/test_raw_mads_concept.py +++ b/tests/transformers/loc/mads/test_raw_mads_concept.py @@ -4,44 +4,99 @@ from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept -sh2010105253 = json.loads(load_fixture("mads_composite_concept.json")) - -def test_exclude_no_graph() -> None: +def test_label() -> None: """ - If there is no graph, then the concept is to be excluded + Label is extracted from madsrdf:authoritativeLabel """ concept = RawLibraryOfCongressMADSConcept( - {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + json.loads(load_fixture("mads_concept.json")) ) - assert concept.exclude() == True + assert concept.label == "Stump work" -def test_exclude_no_matching_concept_node() -> None: - """ - If the graph does not contain a node of type skos:Concept, it is to be excluded - """ - concept = RawLibraryOfCongressMADSConcept( - json.loads(load_fixture("mads_deprecated_concept.json")) - ) - assert concept.exclude() == True +class TestExclude: + def test_exclude_no_graph(self) -> None: + """ + If there is no graph, then the concept is to be excluded + """ + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + assert concept.exclude() == True + def test_exclude_no_matching_concept_node(self) -> None: + """ + If the graph does not contain a node of type skos:Concept, it is to be excluded + """ + concept = RawLibraryOfCongressMADSConcept( + json.loads(load_fixture("mads_deprecated_concept.json")) + ) + assert concept.exclude() == True -def test_do_not_exclude() -> None: - """ - A complete, non-duplicate, non-deprecated record is to be included in the output - """ - concept = 
RawLibraryOfCongressMADSConcept( - json.loads(load_fixture("mads_concept.json")) - ) - assert concept.exclude() == False + def test_do_not_exclude(self) -> None: + """ + A complete, non-duplicate, non-deprecated record is to be included in the output + """ + concept = RawLibraryOfCongressMADSConcept( + json.loads(load_fixture("mads_concept.json")) + ) + assert concept.exclude() == False -def test_label() -> None: - """ - Label is extracted from madsrdf:authoritativeLabel - """ - concept = RawLibraryOfCongressMADSConcept( - json.loads(load_fixture("mads_concept.json")) - ) - assert concept.label == "Stump work" +class TestGeographic: + def test_is_geographic(self) -> None: + """ + A concept is geographic if its @type list contains madsrdf:Geographic + """ + concept = RawLibraryOfCongressMADSConcept( + json.loads(load_fixture("mads_geographic_concept.json")) + ) + assert concept.is_geographic == True + + def test_is_not_geographic(self) -> None: + concept = RawLibraryOfCongressMADSConcept( + json.loads(load_fixture("mads_concept.json")) + ) + assert concept.is_geographic == False + + +class TestBroaderConcepts: + def test_real_example(self) -> None: + concept = RawLibraryOfCongressMADSConcept( + json.loads(load_fixture("mads_geographic_concept.json")) + ) + assert concept.broader_concept_ids == ["sh85040229", "sh85053109", "sh92006359"] + + def test_single_broader(self) -> None: + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + # example from sh00000014, Stuffed foods (Cooking) + concept._raw_concept_node = { + "madsrdf:hasBroaderAuthority": { + "@id": "http://id.loc.gov/authorities/subjects/sh85129334" + } + } + assert concept.broader_concept_ids == ["sh85129334"] + + def test_no_broaders(self) -> None: + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + concept._raw_concept_node = {} + assert concept.broader_concept_ids == [] + + def test_ignore_underscore_n(self) -> None: + # _:nbunchanumbers identifiers are to be ignored.
+ # example from /authorities/subjects/sh00008764, Bintan Island (Indonesia) + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + concept._raw_concept_node = { + "madsrdf:hasBroaderAuthority": [ + {"@id": "_:n428e364baf3942ff9c026b0033bac3d0b5"}, + {"@id": "http://id.loc.gov/authorities/subjects/sh85068533"}, + ] + } + assert concept.broader_concept_ids == ["sh85068533"] diff --git a/tests/transformers/loc/skos/test_raw_skos_concept.py b/tests/transformers/loc/skos/test_raw_skos_concept.py index 5e51f3d4d8..565e5323bd 100644 --- a/tests/transformers/loc/skos/test_raw_skos_concept.py +++ b/tests/transformers/loc/skos/test_raw_skos_concept.py @@ -5,41 +5,64 @@ from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept -def test_exclude_no_graph() -> None: +def test_label() -> None: """ - If there is no graph, then the concept is to be excluded + Label is extracted from madsrdf:authoritativeLabel """ concept = RawLibraryOfCongressSKOSConcept( - {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + json.loads(load_fixture("skos_concept.json")) ) - assert concept.exclude() == True + assert concept.label == "Pickling" -def test_exclude_no_matching_concept_node() -> None: - """ - If the graph does not contain a node of type skos:Concept, it is to be excluded - """ - concept = RawLibraryOfCongressSKOSConcept( - json.loads(load_fixture("skos_deprecated_concept.json")) - ) - assert concept.exclude() == True +class TestExclude: + def test_exclude_no_graph(self) -> None: + """ + If there is no graph, then the concept is to be excluded + """ + concept = RawLibraryOfCongressSKOSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + assert concept.exclude() == True + def test_exclude_no_matching_concept_node(self) -> None: + """ + If the graph does not contain a node of type skos:Concept, it is to be excluded + """ + concept = RawLibraryOfCongressSKOSConcept( + json.loads(load_fixture("skos_deprecated_concept.json")) + ) + assert concept.exclude() == True -def test_do_not_exclude() -> None: - """ - A complete, non-duplicate, non-deprecated record is to be included in the output - """ - concept = RawLibraryOfCongressSKOSConcept( - json.loads(load_fixture("skos_concept.json")) - ) - assert concept.exclude() == False + def test_do_not_exclude(self) -> None: + """ + A complete, non-duplicate, non-deprecated record is to be included in the output + """ + concept = RawLibraryOfCongressSKOSConcept( + json.loads(load_fixture("skos_concept.json")) + ) + assert concept.exclude() == False -def test_label() -> None: - """ - Label is extracted from madsrdf:authoritativeLabel - """ +class TestGeographic: + def test_is_geographic(self) -> None: + """ + A concept is geographic if there exists skos:notation with a gac type + """ + concept = RawLibraryOfCongressSKOSConcept( + json.loads(load_fixture("skos_geographic_concept.json")) + ) + assert concept.is_geographic == True + + def test_is_not_geographic(self) -> None: + concept = RawLibraryOfCongressSKOSConcept( + json.loads(load_fixture("skos_concept.json")) + ) + assert concept.is_geographic == False + + +def test_broader_concepts() -> None: concept = RawLibraryOfCongressSKOSConcept( - json.loads(load_fixture("skos_concept.json")) + json.loads(load_fixture("skos_geographic_concept.json")) ) - assert concept.label == "Pickling" + assert concept.broader_concept_ids == ["sh85040229", "sh85053109", "sh92006359"] From aaf3450b093711a12d7a710d2508e6088f6ed249 Mon Sep 
17 00:00:00 2001 From: Paul Butcher Date: Fri, 31 Jan 2025 15:05:54 +0000 Subject: [PATCH 216/310] add broaders to mads --- src/transformers/loc/mads/raw_concept.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/loc/mads/raw_concept.py b/src/transformers/loc/mads/raw_concept.py index 19689819de..d78c542a33 100644 --- a/src/transformers/loc/mads/raw_concept.py +++ b/src/transformers/loc/mads/raw_concept.py @@ -21,16 +21,19 @@ def _extract_concept_node(self) -> dict | None: @property def label(self) -> str: + assert self._raw_concept_node is not None raw_preferred_label = self._raw_concept_node["madsrdf:authoritativeLabel"] return self._extract_label(raw_preferred_label) @property def is_geographic(self) -> bool: + assert self._raw_concept_node is not None """Returns True if the node represents a geographic concept, as determined by @type""" return "madsrdf:Geographic" in self._raw_concept_node.get("@type", []) @property def broader_concept_ids(self) -> list[str]: + assert self._raw_concept_node is not None return _filter_irrelevant_ids( [ remove_id_prefix(broader["@id"]) From 706bb5c866fcce78c6090da59d6680bdb4a43d4d Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Fri, 31 Jan 2025 15:58:00 +0000 Subject: [PATCH 217/310] add relateds to mads --- src/transformers/loc/mads/raw_concept.py | 12 + tests/fixtures/mads_related_concept.json | 270 ++++++++++++++++++ tests/fixtures/skos_related_concept.json | 126 ++++++++ .../loc/mads/test_raw_mads_concept.py | 74 ++++- 4 files changed, 478 insertions(+), 4 deletions(-) create mode 100644 tests/fixtures/mads_related_concept.json create mode 100644 tests/fixtures/skos_related_concept.json diff --git a/src/transformers/loc/mads/raw_concept.py b/src/transformers/loc/mads/raw_concept.py index d78c542a33..e7b913266f 100644 --- a/src/transformers/loc/mads/raw_concept.py +++ b/src/transformers/loc/mads/raw_concept.py @@ -43,6 +43,18 @@ def broader_concept_ids(self) -> list[str]: ] ) + @property + def related_concept_ids(self) -> list[str]: + assert self._raw_concept_node is not None + return _filter_irrelevant_ids( + [ + remove_id_prefix(broader["@id"]) + for broader in _as_list( + self._raw_concept_node.get("madsrdf:hasReciprocalAuthority", []) + ) + ] + ) + def _filter_irrelevant_ids(ids: list[str]) -> list[str]: return [concept_id for concept_id in ids if not concept_id.startswith("_:n")] diff --git a/tests/fixtures/mads_related_concept.json b/tests/fixtures/mads_related_concept.json new file mode 100644 index 0000000000..df14997ab1 --- /dev/null +++ b/tests/fixtures/mads_related_concept.json @@ -0,0 +1,270 @@ +{ + "@context": "http://v3/authorities/subjects/context.json", + "@graph": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh90003066", + "@type": [ + "madsrdf:Authority", + "madsrdf:Topic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Memory management (Computer science)" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b18" + } + ] + } + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b18", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Memory management (Computer science)" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh88007957", + "@type": [ + "madsrdf:Authority", + "madsrdf:Topic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Cache memory" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b16" + } + ] + 
} + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b16", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Cache memory" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh00000122", + "@type": [ + "madsrdf:Authority", + "madsrdf:Topic" + ], + "bflc:marcKey": "150 $aLoop tiling (Computer science)", + "identifiers:lccn": "sh 00000122", + "madsrdf:adminMetadata": [ + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b1" + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b2" + } + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Loop tiling (Computer science)" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b3" + } + ] + }, + "madsrdf:hasBroaderAuthority": { + "@id": "http://id.loc.gov/authorities/subjects/sh88007957" + }, + "madsrdf:hasReciprocalAuthority": { + "@id": "http://id.loc.gov/authorities/subjects/sh90003066" + }, + "madsrdf:hasSource": [ + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b5" + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b6" + } + ], + "madsrdf:hasVariant": [ + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b7" + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b10" + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b13" + } + ], + "madsrdf:isMemberOfMADSCollection": [ + { + "@id": "http://id.loc.gov/authorities/subjects/collection_LCSHAuthorizedHeadings" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/collection_LCSH_General" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/collection_SubdivideGeographically" + } + ], + "madsrdf:isMemberOfMADSScheme": { + "@id": "http://id.loc.gov/authorities/subjects" + }, + "owl:sameAs": [ + { + "@id": "http://id.loc.gov/authorities/sh00000122#concept" + }, + { + "@id": "info:lc/authorities/sh00000122" + } + ] + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b1", + "@type": "ri:RecordInfo", + "ri:languageOfCataloging": { + "@id": "http://id.loc.gov/vocabulary/iso639-2/eng" + }, + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "2000-06-27T00:00:00" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "ri:recordStatus": "new" + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b2", + "@type": "ri:RecordInfo", + "ri:languageOfCataloging": { + "@id": "http://id.loc.gov/vocabulary/iso639-2/eng" + }, + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "2000-08-08T14:11:53" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "ri:recordStatus": "revised" + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b3", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Loop tiling (Computer science)" + } + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b5", + "@type": "madsrdf:Source", + "madsrdf:citationNote": { + "@language": "en", + "@value": "CIP galley (Loop tiling, as one of the most important compiler optimizations, is beneficial for both parallel machines and uniprocessors with a memory hierarchy)" + }, + "madsrdf:citationSource": "Work cat.: 00057639: Xue, Jingling. Loop tiling for parallelism, c2000:", + "madsrdf:citationStatus": "found" + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b6", + "@type": "madsrdf:Source", + "madsrdf:citationNote": { + "@language": "en", + "@value": "v. 48, no. 2, Feb., 1999 (Augmenting loop tiling with data alignment for improved cache performance. 
Loop blocking (tiling) is a well-known compiler optimization that helps improve cache performance by dividing the loop iteration space into smaller blocks (tiles))" + }, + "madsrdf:citationSource": "IEEE transactions on Computers:", + "madsrdf:citationStatus": "found" + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b7", + "@type": [ + "madsrdf:Topic", + "madsrdf:Variant" + ], + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b8" + } + ] + }, + "madsrdf:variantLabel": { + "@language": "en", + "@value": "Loop blocking (Computer science)" + } + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b8", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Loop blocking (Computer science)" + } + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b10", + "@type": [ + "madsrdf:Topic", + "madsrdf:Variant" + ], + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b11" + } + ] + }, + "madsrdf:variantLabel": { + "@language": "en", + "@value": "Blocking, Loop (Computer science)" + } + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b11", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Blocking, Loop (Computer science)" + } + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b13", + "@type": [ + "madsrdf:Topic", + "madsrdf:Variant" + ], + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b14" + } + ] + }, + "madsrdf:variantLabel": { + "@language": "en", + "@value": "Tiling, Loop (Computer science)" + } + }, + { + "@id": "_:n5c7514adfaab45d5bc99c13a2faadfc4b14", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Tiling, Loop (Computer science)" + } + } + ], + "@id": "/authorities/subjects/sh00000122" +} + diff --git a/tests/fixtures/skos_related_concept.json b/tests/fixtures/skos_related_concept.json new file mode 100644 index 0000000000..3584c01d00 --- /dev/null +++ b/tests/fixtures/skos_related_concept.json @@ -0,0 +1,126 @@ +{ + "@context": "http://v3/authorities/subjects/context.json", + "@graph": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh90003066", + "@type": "skos:Concept", + "skos:prefLabel": { + "@language": "en", + "@value": "Memory management (Computer science)" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh00000122", + "@type": "skos:Concept", + "skos:altLabel": [ + { + "@language": "en", + "@value": "Blocking, Loop (Computer science)" + }, + { + "@language": "en", + "@value": "Loop blocking (Computer science)" + }, + { + "@language": "en", + "@value": "Tiling, Loop (Computer science)" + } + ], + "skos:broader": { + "@id": "http://id.loc.gov/authorities/subjects/sh88007957" + }, + "skos:changeNote": [ + { + "@id": "_:neb0df0b7a7b64ee2b5193fb6292983feb1" + }, + { + "@id": "_:neb0df0b7a7b64ee2b5193fb6292983feb2" + } + ], + "skos:inScheme": { + "@id": "http://id.loc.gov/authorities/subjects" + }, + "skos:prefLabel": { + "@language": "en", + "@value": "Loop tiling (Computer science)" + }, + "skos:related": { + "@id": "http://id.loc.gov/authorities/subjects/sh90003066" + }, + "skosxl:altLabel": [ + { + "@id": "_:neb0df0b7a7b64ee2b5193fb6292983feb3" + }, + { + "@id": "_:neb0df0b7a7b64ee2b5193fb6292983feb4" + }, + { + "@id": "_:neb0df0b7a7b64ee2b5193fb6292983feb5" + } + ] + }, + { + "@id": "_:neb0df0b7a7b64ee2b5193fb6292983feb1", + "@type": "cs:ChangeSet", + "cs:changeReason": "new", + "cs:createdDate": { + 
"@type": "xsd:dateTime", + "@value": "2000-06-27T00:00:00" + }, + "cs:creatorName": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "cs:subjectOfChange": { + "@id": "http://id.loc.gov/authorities/subjects/sh00000122" + } + }, + { + "@id": "_:neb0df0b7a7b64ee2b5193fb6292983feb2", + "@type": "cs:ChangeSet", + "cs:changeReason": "revised", + "cs:createdDate": { + "@type": "xsd:dateTime", + "@value": "2000-08-08T14:11:53" + }, + "cs:creatorName": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "cs:subjectOfChange": { + "@id": "http://id.loc.gov/authorities/subjects/sh00000122" + } + }, + { + "@id": "_:neb0df0b7a7b64ee2b5193fb6292983feb3", + "@type": "skosxl:Label", + "skosxl:literalForm": { + "@language": "en", + "@value": "Loop blocking (Computer science)" + } + }, + { + "@id": "_:neb0df0b7a7b64ee2b5193fb6292983feb4", + "@type": "skosxl:Label", + "skosxl:literalForm": { + "@language": "en", + "@value": "Tiling, Loop (Computer science)" + } + }, + { + "@id": "_:neb0df0b7a7b64ee2b5193fb6292983feb5", + "@type": "skosxl:Label", + "skosxl:literalForm": { + "@language": "en", + "@value": "Blocking, Loop (Computer science)" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh88007957", + "@type": "skos:Concept", + "skos:prefLabel": { + "@language": "en", + "@value": "Cache memory" + } + } + ], + "@id": "/authorities/subjects/sh00000122" +} \ No newline at end of file diff --git a/tests/transformers/loc/mads/test_raw_mads_concept.py b/tests/transformers/loc/mads/test_raw_mads_concept.py index 86e7182a8f..53ce8fb33e 100644 --- a/tests/transformers/loc/mads/test_raw_mads_concept.py +++ b/tests/transformers/loc/mads/test_raw_mads_concept.py @@ -68,7 +68,14 @@ def test_real_example(self) -> None: ) assert concept.broader_concept_ids == ["sh85040229", "sh85053109", "sh92006359"] - def test_single_broader(self) -> None: + def test_none(self) -> None: + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + concept._raw_concept_node = {} + assert concept.broader_concept_ids == [] + + def test_single(self) -> None: concept = RawLibraryOfCongressMADSConcept( {"@id": "/authorities/subjects/sh2010105253", "@graph": []} ) @@ -80,12 +87,17 @@ def test_single_broader(self) -> None: } assert concept.broader_concept_ids == ["sh85129334"] - def test_no_broaders(self) -> None: + def test_multiple(self) -> None: concept = RawLibraryOfCongressMADSConcept( {"@id": "/authorities/subjects/sh2010105253", "@graph": []} ) - concept._raw_concept_node = {} - assert concept.broader_concept_ids == [] + concept._raw_concept_node = { + "madsrdf:hasBroaderAuthority": [ + {"@id": "http://id.loc.gov/authorities/subjects/sh85129334"}, + {"@id": "http://id.loc.gov/authorities/subjects/sh85068533"}, + ] + } + assert concept.broader_concept_ids == ["sh85129334", "sh85068533"] def test_ignore_underscore_n(self) -> None: # _:nbunchanumbers identifiers are to be ignored. @@ -100,3 +112,57 @@ def test_ignore_underscore_n(self) -> None: ] } assert concept.broader_concept_ids == ["sh85068533"] + + +class TestRelatedConcepts: + def test_real_example(self) -> None: + # A real-world example, taken directly from the export, + # This helps to give confidence that the whole test isn't just + # passing due to a bogus assumption when making artificial test data. 
+ concept = RawLibraryOfCongressMADSConcept( + json.loads(load_fixture("mads_related_concept.json")) + ) + assert concept.related_concept_ids == ["sh90003066"] + + def test_none(self) -> None: + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + concept._raw_concept_node = {} + assert concept.related_concept_ids == [] + + def test_single(self) -> None: + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + concept._raw_concept_node = { + "madsrdf:hasReciprocalAuthority": { + "@id": "http://id.loc.gov/authorities/subjects/sh90003066" + } + } + assert concept.related_concept_ids == ["sh90003066"] + + def test_multiple(self) -> None: + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + concept._raw_concept_node = { + "madsrdf:hasReciprocalAuthority": [ + {"@id": "http://id.loc.gov/authorities/subjects/sh123456789"}, + {"@id": "http://id.loc.gov/authorities/subjects/sh987654321"}, + ] + } + assert concept.related_concept_ids == ["sh123456789", "sh987654321"] + + def test_ignore_underscore_n(self) -> None: + # _:nbunchanumbers identifiers are to be ignored. + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + concept._raw_concept_node = { + "madsrdf:hasReciprocalAuthority": [ + {"@id": "_:n428e364baf3942ff9c026b0033bac3d0b5"}, + {"@id": "http://id.loc.gov/authorities/subjects/sh123456789"}, + ] + } + assert concept.related_concept_ids == ["sh123456789"] From 7346c8152e488e8f9b59d32fb615a4191c016315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 31 Jan 2025 17:27:41 +0000 Subject: [PATCH 218/310] Add Wikidata source testing for nodes #5898 --- README.md | 12 ++++ tests/fixtures/wikidata/items_query.json | 4 ++ tests/fixtures/wikidata/items_response.json | 70 +++++++++++++++++++ .../sources/test_wikidata_concepts_source.py | 37 +++++++++- 4 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 tests/fixtures/wikidata/items_query.json create mode 100644 tests/fixtures/wikidata/items_response.json diff --git a/README.md b/README.md index 25fbec9945..864df7b0d7 100644 --- a/README.md +++ b/README.md @@ -115,6 +115,18 @@ command line. For example, to check the status of a bulk load job, run the follo AWS_PROFILE=platform-developer python3.13 bulk_load_poller.py --load-id= ``` +To run an extractor, use the following: + +```shell +S3_BULK_LOAD_BUCKET_NAME=wellcomecollection-neptune-graph-loader \ +AWS_PROFILE=platform-developer \ +python3.13 extractor.py \ + --transformer-type=wikidata_linked_loc_concepts \ + --entity-type=nodes \ + --stream-destination=void \ + --sample-size=10 +``` + ## Local Neptune experimentation To run experimental Neptune queries locally, create a new Python file in the `src` directory, create a local Neptune diff --git a/tests/fixtures/wikidata/items_query.json b/tests/fixtures/wikidata/items_query.json new file mode 100644 index 0000000000..0d32141a24 --- /dev/null +++ b/tests/fixtures/wikidata/items_query.json @@ -0,0 +1,4 @@ +{ + "format": "json", + "query": "SELECT DISTINCT ?item (SAMPLE(?itemLabel) as ?itemLabel) (SAMPLE(?itemDescription) as ?itemDescription) (SAMPLE(?itemAltLabel) as ?itemAltLabel) WHERE { VALUES ?item { wd:Q1 wd:Q2 wd:Q3 wd:Q4 } SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\". ?item rdfs:label ?itemLabel. ?item schema:description ?itemDescription. 
?item skos:altLabel ?itemAltLabel. } } GROUP BY ?item" +} diff --git a/tests/fixtures/wikidata/items_response.json b/tests/fixtures/wikidata/items_response.json new file mode 100644 index 0000000000..d8e9f0cedd --- /dev/null +++ b/tests/fixtures/wikidata/items_response.json @@ -0,0 +1,70 @@ +{ + "results": { + "bindings": [ + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q1" + }, + "itemLabel": { + "xml:lang": "en", + "type": "literal", + "value": "Eustigmatophyceae" + }, + "itemDescription": { + "xml:lang": "en", + "type": "literal", + "value": "class of algae" + } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q2" + }, + "itemLabel": { + "xml:lang": "en", + "type": "literal", + "value": "American women in World War I" + }, + "itemDescription": { + "xml:lang": "en", + "type": "literal", + "value": "American women's role in World War I and effect of World War I in American women's life" + } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q3" + }, + "itemLabel": { + "xml:lang": "en", + "type": "literal", + "value": "American women in World War I" + }, + "itemDescription": { + "xml:lang": "en", + "type": "literal", + "value": "American women's role in World War I and effect of World War I in American women's life" + } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q4" + }, + "itemLabel": { + "xml:lang": "en", + "type": "literal", + "value": "American women in World War I" + }, + "itemDescription": { + "xml:lang": "en", + "type": "literal", + "value": "American women's role in World War I and effect of World War I in American women's life" + } + } + ] + } +} diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index 7e3949e769..d58161cb15 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -9,7 +9,7 @@ def add_mock_wikidata_request( query_type: Literal[ - "all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of" + "all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of", "items" ] ) -> None: params = json.loads(load_fixture(f"wikidata/{query_type}_query.json")) @@ -19,7 +19,7 @@ def add_mock_wikidata_request( ) -def test_wikidata_concepts_source() -> None: +def test_wikidata_concepts_source_edges() -> None: MockSmartOpen.mock_s3_file( "s3://bulk_load_test_bucket/loc_concepts__nodes.csv", load_fixture("loc_concepts_transformer_output.csv").decode(), @@ -55,3 +55,36 @@ def test_wikidata_concepts_source() -> None: assert ("Q1", "Q4") in has_parent_edges assert ("Q2", "Q1") in has_parent_edges assert ("Q2", "Q3") in has_parent_edges + + +def test_wikidata_concepts_source_nodes() -> None: + MockSmartOpen.mock_s3_file( + "s3://bulk_load_test_bucket/loc_concepts__nodes.csv", + load_fixture("loc_concepts_transformer_output.csv").decode(), + ) + MockSmartOpen.mock_s3_file( + "s3://bulk_load_test_bucket/loc_locations__nodes.csv", + load_fixture("loc_concepts_transformer_output.csv").decode(), + ) + MockSmartOpen.mock_s3_file( + "s3://bulk_load_test_bucket/loc_names__nodes.csv", + load_fixture("loc_concepts_transformer_output.csv").decode(), + ) + add_mock_wikidata_request("all_ids") + add_mock_wikidata_request("linked_ids") + add_mock_wikidata_request("parents_instance_of") + add_mock_wikidata_request("parents_subclass_of") + add_mock_wikidata_request("items") + + mesh_concepts_source = WikidataLinkedOntologySource( + 
node_type="concepts", linked_ontology="loc", entity_type="nodes" + ) + + stream_result = list(mesh_concepts_source.stream_raw()) + + assert len(stream_result) == 4 + + for raw_node in stream_result: + assert "item" in raw_node + assert "itemLabel" in raw_node + assert "itemDescription" in raw_node From 76494f0ba55b425f3866d53b54b52468dec2e5f9 Mon Sep 17 00:00:00 2001 From: Antonia Date: Mon, 3 Feb 2025 09:38:16 +0000 Subject: [PATCH 219/310] Update src/sources/catalogue/concepts_source.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Štěpán Brychta --- src/sources/catalogue/concepts_source.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sources/catalogue/concepts_source.py b/src/sources/catalogue/concepts_source.py index 538df806eb..ca5c1e1d8f 100644 --- a/src/sources/catalogue/concepts_source.py +++ b/src/sources/catalogue/concepts_source.py @@ -14,6 +14,6 @@ def stream_raw(self) -> Generator[dict]: """Streams raw concept nodes from a work's subjects, genres, and contributors.""" catalogue_source = GZipSource(self.url) for work in catalogue_source.stream_raw(): - for conecpt_key in CONCEPT_KEYS: - for raw_concept in work.get(conecpt_key, []): + for concept_key in CONCEPT_KEYS: + for raw_concept in work.get(concept_key, []): yield raw_concept From 79e409062956afbe57b494e6522f260ce24ba5c8 Mon Sep 17 00:00:00 2001 From: Antonia Date: Mon, 3 Feb 2025 09:38:35 +0000 Subject: [PATCH 220/310] Update src/transformers/catalogue/raw_concept.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Štěpán Brychta --- src/transformers/catalogue/raw_concept.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index 7ba3d909e0..cfef612986 100644 --- a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -11,7 +11,7 @@ def __init__(self, raw_concept: dict): def _extract_concept_node(raw_concept: dict) -> dict: """ Extracts raw concepts data from one of two formats: - Either as a dicitonary inside a list under "concepts", or as a dictionary under "agent". + Either as a dictionary inside a list under "concepts", or as a dictionary under "agent". """ if len(raw_concept.get("concepts", [])) > 0: raw_concept_node = raw_concept["concepts"][0] From 0dadc53be1625e4d9aedcf4e480076cc16107b00 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 3 Feb 2025 09:48:25 +0000 Subject: [PATCH 221/310] Combine concept checks --- src/transformers/catalogue/raw_concept.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index cfef612986..19bc48f025 100644 --- a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -27,11 +27,10 @@ def is_concept(self) -> bool: Determines whether a given block of JSON represents a Concept as returned from the Catalogue API. A Concept is a block of JSON with a type property and a list of identifiers. 
""" - if isinstance(self.raw_concept.get("type"), str): - if self.raw_concept["type"] in get_args( - ConceptType - ) and self.raw_concept.get("identifiers"): - return True + if self.raw_concept.get("type") in get_args( + ConceptType + ) and self.raw_concept.get("identifiers"): + return True return False From f7299acc2e5c6d2b7eafed46907411b6a9b19a91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 3 Feb 2025 09:48:57 +0000 Subject: [PATCH 222/310] Wikidata tests refactoring --- .../raw_names.jsonld} | 0 .../raw_subject_headings.jsonld} | 0 .../transformer_output_concepts_nodes.csv} | 0 .../transformer_output_locations_nodes.csv | 3 + .../loc/transformer_output_names_nodes.csv | 3 + .../raw_descriptors.xml} | 0 tests/sources/test_mesh_concepts_source.py | 2 +- .../sources/test_wikidata_concepts_source.py | 60 ++++++++----------- tests/test_extractor.py | 8 +-- .../test_mesh_concepts_transformer.py | 2 +- 10 files changed, 37 insertions(+), 41 deletions(-) rename tests/fixtures/{loc_names_example.jsonld => loc/raw_names.jsonld} (100%) rename tests/fixtures/{loc_subjects_example.jsonld => loc/raw_subject_headings.jsonld} (100%) rename tests/fixtures/{loc_concepts_transformer_output.csv => loc/transformer_output_concepts_nodes.csv} (100%) create mode 100644 tests/fixtures/loc/transformer_output_locations_nodes.csv create mode 100644 tests/fixtures/loc/transformer_output_names_nodes.csv rename tests/fixtures/{mesh_example.xml => mesh/raw_descriptors.xml} (100%) diff --git a/tests/fixtures/loc_names_example.jsonld b/tests/fixtures/loc/raw_names.jsonld similarity index 100% rename from tests/fixtures/loc_names_example.jsonld rename to tests/fixtures/loc/raw_names.jsonld diff --git a/tests/fixtures/loc_subjects_example.jsonld b/tests/fixtures/loc/raw_subject_headings.jsonld similarity index 100% rename from tests/fixtures/loc_subjects_example.jsonld rename to tests/fixtures/loc/raw_subject_headings.jsonld diff --git a/tests/fixtures/loc_concepts_transformer_output.csv b/tests/fixtures/loc/transformer_output_concepts_nodes.csv similarity index 100% rename from tests/fixtures/loc_concepts_transformer_output.csv rename to tests/fixtures/loc/transformer_output_concepts_nodes.csv diff --git a/tests/fixtures/loc/transformer_output_locations_nodes.csv b/tests/fixtures/loc/transformer_output_locations_nodes.csv new file mode 100644 index 0000000000..d259fa3a68 --- /dev/null +++ b/tests/fixtures/loc/transformer_output_locations_nodes.csv @@ -0,0 +1,3 @@ +:ID,:LABEL,id,label,source,alternative_ids,alternative_labels,description,latitude,longitude +sh00000015,SourceLocation,sh00000015,"Trail Creek Watershed (Jackson County, Or.)",lc-subjects,,,null,null,null +sh00000016,SourceLocation,sh00000016,"Trail Creek (Jackson County, Or.)",lc-subjects,,,null,null,null \ No newline at end of file diff --git a/tests/fixtures/loc/transformer_output_names_nodes.csv b/tests/fixtures/loc/transformer_output_names_nodes.csv new file mode 100644 index 0000000000..9d16eefeac --- /dev/null +++ b/tests/fixtures/loc/transformer_output_names_nodes.csv @@ -0,0 +1,3 @@ +:ID,:LABEL,id:String,label:String,source:String,alternative_ids:String,alternative_labels:String,description:String,date_of_birth:DateTime,date_of_death:DateTime,place_of_birth:String +n00000001,SourceName,n00000001,"McQuerry, Maureen, 1955-",lc-names,,"MacQuerry, Maureen, 1955-||Makkvyri, Morin, 1955-||McQuerry, Maureen Doyle, 1955-",,,, +n00000021,SourceName,n00000021,"Widmer, Thomas, 1962-",lc-names,,,,,, diff --git 
a/tests/fixtures/mesh_example.xml b/tests/fixtures/mesh/raw_descriptors.xml similarity index 100% rename from tests/fixtures/mesh_example.xml rename to tests/fixtures/mesh/raw_descriptors.xml diff --git a/tests/sources/test_mesh_concepts_source.py b/tests/sources/test_mesh_concepts_source.py index 072f7c9ed3..bdff2f225a 100644 --- a/tests/sources/test_mesh_concepts_source.py +++ b/tests/sources/test_mesh_concepts_source.py @@ -13,7 +13,7 @@ def test_mesh_concepts_source() -> None: "url": test_url, "status_code": 200, "json_data": None, - "content_bytes": load_fixture("mesh_example.xml"), + "content_bytes": load_fixture("mesh/raw_descriptors.xml"), "params": None, } ] diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index d58161cb15..372c8fe1c5 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -7,32 +7,37 @@ from test_utils import load_fixture -def add_mock_wikidata_request( - query_type: Literal[ - "all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of", "items" - ] -) -> None: - params = json.loads(load_fixture(f"wikidata/{query_type}_query.json")) - response = json.loads(load_fixture(f"wikidata/{query_type}_response.json")) - MockRequest.mock_response( - method="GET", url=WIKIDATA_SPARQL_URL, params=params, json_data=response - ) +WikidataQueryType = Literal[ + "all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of", "items" +] + + +def add_mock_wikidata_requests(query_types: list[WikidataQueryType]) -> None: + for query_type in query_types: + params = json.loads(load_fixture(f"wikidata/{query_type}_query.json")) + response = json.loads(load_fixture(f"wikidata/{query_type}_response.json")) + MockRequest.mock_response( + method="GET", url=WIKIDATA_SPARQL_URL, params=params, json_data=response + ) + + +def add_mock_loc_transformer_outputs() -> None: + for node_type in ["concepts", "locations", "names"]: + MockSmartOpen.mock_s3_file( + f"s3://bulk_load_test_bucket/loc_{node_type}__nodes.csv", + load_fixture(f"loc/transformer_output_{node_type}_nodes.csv").decode(), + ) def test_wikidata_concepts_source_edges() -> None: - MockSmartOpen.mock_s3_file( - "s3://bulk_load_test_bucket/loc_concepts__nodes.csv", - load_fixture("loc_concepts_transformer_output.csv").decode(), + add_mock_loc_transformer_outputs() + add_mock_wikidata_requests( + ["all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of"] ) - add_mock_wikidata_request("all_ids") - add_mock_wikidata_request("linked_ids") - add_mock_wikidata_request("parents_instance_of") - add_mock_wikidata_request("parents_subclass_of") mesh_concepts_source = WikidataLinkedOntologySource( node_type="concepts", linked_ontology="loc", entity_type="edges" ) - stream_result = list(mesh_concepts_source.stream_raw()) assert len(stream_result) == 5 @@ -58,23 +63,10 @@ def test_wikidata_concepts_source_edges() -> None: def test_wikidata_concepts_source_nodes() -> None: - MockSmartOpen.mock_s3_file( - "s3://bulk_load_test_bucket/loc_concepts__nodes.csv", - load_fixture("loc_concepts_transformer_output.csv").decode(), - ) - MockSmartOpen.mock_s3_file( - "s3://bulk_load_test_bucket/loc_locations__nodes.csv", - load_fixture("loc_concepts_transformer_output.csv").decode(), - ) - MockSmartOpen.mock_s3_file( - "s3://bulk_load_test_bucket/loc_names__nodes.csv", - load_fixture("loc_concepts_transformer_output.csv").decode(), + add_mock_loc_transformer_outputs() + add_mock_wikidata_requests( + ["all_ids", 
"linked_ids", "parents_instance_of", "parents_subclass_of", "items"] ) - add_mock_wikidata_request("all_ids") - add_mock_wikidata_request("linked_ids") - add_mock_wikidata_request("parents_instance_of") - add_mock_wikidata_request("parents_subclass_of") - add_mock_wikidata_request("items") mesh_concepts_source = WikidataLinkedOntologySource( node_type="concepts", linked_ontology="loc", entity_type="nodes" diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 7e1dba6c8f..879497e6c1 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -24,7 +24,6 @@ def mock_requests_lookup_table( destination: StreamDestination, transformer_type: TransformerType, ) -> Any: - mocked_responses: list[dict] = [] if destination == "graph": @@ -42,7 +41,7 @@ def mock_requests_lookup_table( { "method": "GET", "url": MESH_URL, - "content_bytes": load_fixture("mesh_example.xml"), + "content_bytes": load_fixture("mesh/raw_descriptors.xml"), } ) elif transformer_type in ["loc_concepts", "loc_locations", "loc_names"]: @@ -50,14 +49,14 @@ def mock_requests_lookup_table( { "method": "GET", "url": LOC_SUBJECT_HEADINGS_URL, - "content_bytes": load_fixture("loc_subjects_example.jsonld"), + "content_bytes": load_fixture("loc/raw_subject_headings.jsonld"), } ) mocked_responses.append( { "method": "GET", "url": LOC_NAMES_URL, - "content_bytes": load_fixture("loc_names_example.jsonld"), + "content_bytes": load_fixture("loc/raw_names.jsonld"), } ) elif transformer_type in [ @@ -128,7 +127,6 @@ def test_lambda_handler( lambda_event: LambdaEvent, mock_responses: list[MockResponseInput], ) -> None: - MockRequest.mock_responses(mock_responses) lambda_handler(lambda_event, None) diff --git a/tests/transformers/test_mesh_concepts_transformer.py b/tests/transformers/test_mesh_concepts_transformer.py index a81671260e..54ea6d5d90 100644 --- a/tests/transformers/test_mesh_concepts_transformer.py +++ b/tests/transformers/test_mesh_concepts_transformer.py @@ -14,7 +14,7 @@ def test_mesh_concepts_transformer() -> None: "url": test_url, "status_code": 200, "json_data": None, - "content_bytes": load_fixture("mesh_example.xml"), + "content_bytes": load_fixture("mesh/raw_descriptors.xml"), "params": None, } ] From 93a92289253b061ef27e7e893348959a2c0bd7ca Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Mon, 3 Feb 2025 10:27:49 +0000 Subject: [PATCH 223/310] add Narrowers --- src/transformers/loc/common.py | 5 + src/transformers/loc/mads/raw_concept.py | 27 +++ .../mads_narrower_authority_concept.json | 204 ++++++++++++++++++ .../loc/mads/test_raw_mads_concept.py | 55 +++++ 4 files changed, 291 insertions(+) create mode 100644 tests/fixtures/mads_narrower_authority_concept.json diff --git a/src/transformers/loc/common.py b/src/transformers/loc/common.py index d1ffc36fd6..98224c0a0d 100644 --- a/src/transformers/loc/common.py +++ b/src/transformers/loc/common.py @@ -50,6 +50,11 @@ def broader_concept_ids(self) -> list[str]: """Returns a list of IDs representing concepts which are broader than the current concept.""" raise NotImplementedError + @property + def narrower_concept_ids(self) -> list[str]: + """Returns a list of IDs representing concepts which are narrower than the current concept.""" + raise NotImplementedError + @property def related_concept_ids(self) -> list[str]: """Returns a list of IDs representing concepts which are related to the current concept.""" diff --git a/src/transformers/loc/mads/raw_concept.py b/src/transformers/loc/mads/raw_concept.py index e7b913266f..dade2be12f 100644 --- 
a/src/transformers/loc/mads/raw_concept.py +++ b/src/transformers/loc/mads/raw_concept.py @@ -43,6 +43,33 @@ def broader_concept_ids(self) -> list[str]: ] ) + @property + def narrower_concept_ids(self) -> list[str]: + return self._narrowers_from_narrower_authority() + self._narrowers_from_component_list() + + + def _narrowers_from_component_list(self): + assert self._raw_concept_node is not None + return _filter_irrelevant_ids( + [ + remove_id_prefix(broader["@id"]) + for broader in _as_list( + self._raw_concept_node.get("madsrdf:componentList", {}).get("@list", []) + ) + ] + ) + + def _narrowers_from_narrower_authority(self): + assert self._raw_concept_node is not None + return _filter_irrelevant_ids( + [ + remove_id_prefix(broader["@id"]) + for broader in _as_list( + self._raw_concept_node.get("madsrdf:hasNarrowerAuthority", []) + ) + ] + ) + @property def related_concept_ids(self) -> list[str]: assert self._raw_concept_node is not None diff --git a/tests/fixtures/mads_narrower_authority_concept.json b/tests/fixtures/mads_narrower_authority_concept.json new file mode 100644 index 0000000000..c28edb8797 --- /dev/null +++ b/tests/fixtures/mads_narrower_authority_concept.json @@ -0,0 +1,204 @@ +{ + "@context": "http://v3/authorities/subjects/context.json", + "@graph": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh2006001563", + "@type": [ + "madsrdf:Authority", + "madsrdf:Topic" + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Pitheciidae" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b11" + } + ] + } + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b11", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Pitheciidae" + } + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh00000032", + "@type": [ + "madsrdf:Authority", + "madsrdf:Topic" + ], + "bflc:marcKey": "150 $aPithecia", + "identifiers:lccn": "sh 00000032", + "madsrdf:adminMetadata": [ + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b1" + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b2" + } + ], + "madsrdf:authoritativeLabel": { + "@language": "en", + "@value": "Pithecia" + }, + "madsrdf:classification": { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b3" + }, + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b4" + } + ] + }, + "madsrdf:hasBroaderAuthority": { + "@id": "http://id.loc.gov/authorities/subjects/sh2006001563" + }, + "madsrdf:hasNarrowerAuthority": { + "@id": "http://id.loc.gov/authorities/subjects/sh00000029" + }, + "madsrdf:hasSource": [ + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b6" + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b7" + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b8" + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b9" + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b10" + } + ], + "madsrdf:isMemberOfMADSCollection": [ + { + "@id": "http://id.loc.gov/authorities/subjects/collection_LCSHAuthorizedHeadings" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/collection_LCSH_General" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/collection_SubdivideGeographically" + } + ], + "madsrdf:isMemberOfMADSScheme": { + "@id": "http://id.loc.gov/authorities/subjects" + }, + "owl:sameAs": [ + { + "@id": "http://id.loc.gov/authorities/sh00000032#concept" + }, + { + "@id": "info:lc/authorities/sh00000032" + } + ] + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b1", + "@type": 
"ri:RecordInfo", + "ri:languageOfCataloging": { + "@id": "http://id.loc.gov/vocabulary/iso639-2/eng" + }, + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "2006-03-01T00:00:00" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "ri:recordStatus": "new" + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b2", + "@type": "ri:RecordInfo", + "ri:languageOfCataloging": { + "@id": "http://id.loc.gov/vocabulary/iso639-2/eng" + }, + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "2006-04-25T13:58:06" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/dlc" + }, + "ri:recordStatus": "revised" + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b3", + "@type": "lcc:ClassNumber", + "madsrdf:code": "QL737.P959", + "madsrdf:hasExactExternalAuthority": { + "@id": "http://id.loc.gov/authorities/classification/QL737.P959" + } + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b4", + "@type": "madsrdf:TopicElement", + "madsrdf:elementValue": { + "@language": "en", + "@value": "Pithecia" + } + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b6", + "@type": "madsrdf:Source", + "madsrdf:citationNote": { + "@language": "en", + "@value": "p. 168 (Genus Pithecia Desmarest, 1804, in Family Pitheciidae, Saki Monkeys)" + }, + "madsrdf:citationSource": "Groves, C. Primate taxonomy, 2001:", + "madsrdf:citationStatus": "found" + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b7", + "@type": "madsrdf:Source", + "madsrdf:citationNote": { + "@language": "en", + "@value": "p. 3 (Pithecia Desmarest 1804)" + }, + "madsrdf:citationSource": "Work cat.: 98210231: Homburg, I. \u00d6kologie und Sozialverhalten von Weissgesicht-Sakis, 1998:", + "madsrdf:citationStatus": "found" + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b8", + "@type": "madsrdf:Source", + "madsrdf:citationNote": { + "@language": "en", + "@value": "p. 264 (Pithecia, Desmarest, 1804)" + }, + "madsrdf:citationSource": "Napier, J.R. A handbook of living primates, 1967:", + "madsrdf:citationStatus": "found" + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b9", + "@type": "madsrdf:Source", + "madsrdf:citationNote": { + "@language": "en", + "@value": "v. 1, p. 452 (Primates; Cebidae; Genus Pithecia)" + }, + "madsrdf:citationSource": "Walker's mammals world, 1991:", + "madsrdf:citationStatus": "found" + }, + { + "@id": "_:n206a4e00dc5c4a9dbd1bf5bba8caab38b10", + "@type": "madsrdf:Source", + "madsrdf:citationNote": { + "@language": "en", + "@value": "p. 17 (Family Pitheciidae. Sakis (Chiropotes spp. and Pithecia spp.))" + }, + "madsrdf:citationSource": "Shumaker, R.W. 
Primates in question, 3002:", + "madsrdf:citationStatus": "found" + } + ], + "@id": "/authorities/subjects/sh00000032" +} diff --git a/tests/transformers/loc/mads/test_raw_mads_concept.py b/tests/transformers/loc/mads/test_raw_mads_concept.py index 53ce8fb33e..6c4c583ab0 100644 --- a/tests/transformers/loc/mads/test_raw_mads_concept.py +++ b/tests/transformers/loc/mads/test_raw_mads_concept.py @@ -166,3 +166,58 @@ def test_ignore_underscore_n(self) -> None: ] } assert concept.related_concept_ids == ["sh123456789"] + +class TestNarrower: + + def test_get_no_narrowers(self): + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + concept._raw_concept_node = {} + assert concept.narrower_concept_ids == [] + + + def test_get_narrowers_from_components(self): + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + concept._raw_concept_node = { "madsrdf:componentList": { + "@list": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh85098685" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh99001366" + } + ] + },} + assert concept.narrower_concept_ids == ["sh85098685", "sh99001366"] + + + def test_get_narrowers_from_narrower_authority(self): + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + concept._raw_concept_node = { "madsrdf:hasNarrowerAuthority": { + "@id": "http://id.loc.gov/authorities/subjects/sh00000029" + }} + assert concept.narrower_concept_ids == ["sh00000029"] + + + def test_get_narrowers_from_both(self): + concept = RawLibraryOfCongressMADSConcept( + {"@id": "/authorities/subjects/sh2010105253", "@graph": []} + ) + concept._raw_concept_node = { "madsrdf:componentList": { + "@list": [ + { + "@id": "http://id.loc.gov/authorities/subjects/sh85098685" + }, + { + "@id": "http://id.loc.gov/authorities/subjects/sh99001366" + } + ] + },"madsrdf:hasNarrowerAuthority": { + "@id": "http://id.loc.gov/authorities/subjects/sh00000029" + }} + assert set(concept.narrower_concept_ids) == {"sh00000029", "sh85098685", "sh99001366"} From 2384e3f3e4893647082c4cf15b6a2bfdbf29b2ac Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 3 Feb 2025 10:32:33 +0000 Subject: [PATCH 224/310] Refactor type and source --- src/transformers/catalogue/raw_concept.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index 19bc48f025..97e87aa0b6 100644 --- a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -53,12 +53,8 @@ def label(self) -> str: @property def type(self) -> ConceptType: """Returns the concept type (one of "Person", "Concept", "Genre", etc.).""" - concept_type = self.raw_concept.get("type") - - if concept_type in get_args(ConceptType): - return cast(ConceptType, concept_type) - - raise ValueError("Concept type not recognised.") + concept_type: ConceptType = self.raw_concept["type"] + return concept_type def _get_identifier(self) -> dict: """Returns metadata about the source identifier.""" @@ -73,11 +69,7 @@ def _get_identifier(self) -> dict: @property def source(self) -> ConceptSource: """Returns the concept source (one of "lc-names", "label-derived", etc.).""" - identifier = self._get_identifier() - source = identifier["identifierType"]["id"] - if source in get_args(ConceptSource): - return cast(ConceptSource, source) - - raise ValueError("Concept 
source not recognised.") + source: ConceptSource = identifier["identifierType"]["id"] + return source From 78ed224b50fe302eda2fa70814818cf42b7cbf06 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Mon, 3 Feb 2025 10:36:55 +0000 Subject: [PATCH 225/310] add Narrowers --- src/transformers/loc/mads/raw_concept.py | 14 ++-- .../loc/mads/test_raw_mads_concept.py | 67 ++++++++++--------- 2 files changed, 43 insertions(+), 38 deletions(-) diff --git a/src/transformers/loc/mads/raw_concept.py b/src/transformers/loc/mads/raw_concept.py index dade2be12f..b2b2c6f597 100644 --- a/src/transformers/loc/mads/raw_concept.py +++ b/src/transformers/loc/mads/raw_concept.py @@ -45,21 +45,25 @@ def broader_concept_ids(self) -> list[str]: @property def narrower_concept_ids(self) -> list[str]: - return self._narrowers_from_narrower_authority() + self._narrowers_from_component_list() - + return ( + self._narrowers_from_narrower_authority() + + self._narrowers_from_component_list() + ) - def _narrowers_from_component_list(self): + def _narrowers_from_component_list(self) -> list[str]: assert self._raw_concept_node is not None return _filter_irrelevant_ids( [ remove_id_prefix(broader["@id"]) for broader in _as_list( - self._raw_concept_node.get("madsrdf:componentList", {}).get("@list", []) + self._raw_concept_node.get("madsrdf:componentList", {}).get( + "@list", [] + ) ) ] ) - def _narrowers_from_narrower_authority(self): + def _narrowers_from_narrower_authority(self) -> list[str]: assert self._raw_concept_node is not None return _filter_irrelevant_ids( [ diff --git a/tests/transformers/loc/mads/test_raw_mads_concept.py b/tests/transformers/loc/mads/test_raw_mads_concept.py index 6c4c583ab0..d9a12a8f2b 100644 --- a/tests/transformers/loc/mads/test_raw_mads_concept.py +++ b/tests/transformers/loc/mads/test_raw_mads_concept.py @@ -167,57 +167,58 @@ def test_ignore_underscore_n(self) -> None: } assert concept.related_concept_ids == ["sh123456789"] + class TestNarrower: - def test_get_no_narrowers(self): + def test_get_no_narrowers(self) -> None: concept = RawLibraryOfCongressMADSConcept( {"@id": "/authorities/subjects/sh2010105253", "@graph": []} ) concept._raw_concept_node = {} assert concept.narrower_concept_ids == [] - - def test_get_narrowers_from_components(self): + def test_get_narrowers_from_components(self) -> None: concept = RawLibraryOfCongressMADSConcept( {"@id": "/authorities/subjects/sh2010105253", "@graph": []} ) - concept._raw_concept_node = { "madsrdf:componentList": { - "@list": [ - { - "@id": "http://id.loc.gov/authorities/subjects/sh85098685" - }, - { - "@id": "http://id.loc.gov/authorities/subjects/sh99001366" - } - ] - },} + concept._raw_concept_node = { + "madsrdf:componentList": { + "@list": [ + {"@id": "http://id.loc.gov/authorities/subjects/sh85098685"}, + {"@id": "http://id.loc.gov/authorities/subjects/sh99001366"}, + ] + }, + } assert concept.narrower_concept_ids == ["sh85098685", "sh99001366"] - - def test_get_narrowers_from_narrower_authority(self): + def test_get_narrowers_from_narrower_authority(self) -> None: concept = RawLibraryOfCongressMADSConcept( {"@id": "/authorities/subjects/sh2010105253", "@graph": []} ) - concept._raw_concept_node = { "madsrdf:hasNarrowerAuthority": { - "@id": "http://id.loc.gov/authorities/subjects/sh00000029" - }} + concept._raw_concept_node = { + "madsrdf:hasNarrowerAuthority": { + "@id": "http://id.loc.gov/authorities/subjects/sh00000029" + } + } assert concept.narrower_concept_ids == ["sh00000029"] - - def test_get_narrowers_from_both(self): + def 
test_get_narrowers_from_both(self) -> None: concept = RawLibraryOfCongressMADSConcept( {"@id": "/authorities/subjects/sh2010105253", "@graph": []} ) - concept._raw_concept_node = { "madsrdf:componentList": { - "@list": [ - { - "@id": "http://id.loc.gov/authorities/subjects/sh85098685" - }, - { - "@id": "http://id.loc.gov/authorities/subjects/sh99001366" - } - ] - },"madsrdf:hasNarrowerAuthority": { - "@id": "http://id.loc.gov/authorities/subjects/sh00000029" - }} - assert set(concept.narrower_concept_ids) == {"sh00000029", "sh85098685", "sh99001366"} + concept._raw_concept_node = { + "madsrdf:componentList": { + "@list": [ + {"@id": "http://id.loc.gov/authorities/subjects/sh85098685"}, + {"@id": "http://id.loc.gov/authorities/subjects/sh99001366"}, + ] + }, + "madsrdf:hasNarrowerAuthority": { + "@id": "http://id.loc.gov/authorities/subjects/sh00000029" + }, + } + assert set(concept.narrower_concept_ids) == { + "sh00000029", + "sh85098685", + "sh99001366", + } From 005048fbe88f88c2f6baeaa1dd8eb495e9457c55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 3 Feb 2025 10:41:07 +0000 Subject: [PATCH 226/310] Add Wikidata transformer test --- .../sources/test_wikidata_concepts_source.py | 12 ++-- .../test_wikidata_concepts_transformer.py | 57 +++++++++++++++++++ 2 files changed, 63 insertions(+), 6 deletions(-) create mode 100644 tests/transformers/test_wikidata_concepts_transformer.py diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index 372c8fe1c5..1dcea54d56 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -12,7 +12,7 @@ ] -def add_mock_wikidata_requests(query_types: list[WikidataQueryType]) -> None: +def _add_mock_wikidata_requests(query_types: list[WikidataQueryType]) -> None: for query_type in query_types: params = json.loads(load_fixture(f"wikidata/{query_type}_query.json")) response = json.loads(load_fixture(f"wikidata/{query_type}_response.json")) @@ -21,7 +21,7 @@ def add_mock_wikidata_requests(query_types: list[WikidataQueryType]) -> None: ) -def add_mock_loc_transformer_outputs() -> None: +def _add_mock_loc_transformer_outputs() -> None: for node_type in ["concepts", "locations", "names"]: MockSmartOpen.mock_s3_file( f"s3://bulk_load_test_bucket/loc_{node_type}__nodes.csv", @@ -30,8 +30,8 @@ def add_mock_loc_transformer_outputs() -> None: def test_wikidata_concepts_source_edges() -> None: - add_mock_loc_transformer_outputs() - add_mock_wikidata_requests( + _add_mock_loc_transformer_outputs() + _add_mock_wikidata_requests( ["all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of"] ) @@ -63,8 +63,8 @@ def test_wikidata_concepts_source_edges() -> None: def test_wikidata_concepts_source_nodes() -> None: - add_mock_loc_transformer_outputs() - add_mock_wikidata_requests( + _add_mock_loc_transformer_outputs() + _add_mock_wikidata_requests( ["all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of", "items"] ) diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py new file mode 100644 index 0000000000..3ae87742c7 --- /dev/null +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -0,0 +1,57 @@ +from test_wikidata_concepts_source import ( + _add_mock_wikidata_requests, + _add_mock_loc_transformer_outputs, +) + +from models.graph_node import SourceConcept +from models.graph_edge import SourceConceptSameAs +from 
transformers.wikidata.concepts_transformer import WikidataConceptsTransformer + + +def test_wikidata_concepts_nodes_transformer() -> None: + _add_mock_loc_transformer_outputs() + _add_mock_wikidata_requests( + ["all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of", "items"] + ) + + transformer = WikidataConceptsTransformer( + entity_type="nodes", linked_ontology="loc" + ) + + nodes = list(transformer.stream(entity_type="nodes", query_chunk_size=100))[0] + + assert len(list(nodes)) == 4 + + assert nodes[0] == SourceConcept( + id="Q1", + label="Eustigmatophyceae", + source="wikidata", + alternative_ids=[], + alternative_labels=[], + description="class of algae", + ) + + +def test_wikidata_concepts_edges_transformer() -> None: + _add_mock_loc_transformer_outputs() + _add_mock_wikidata_requests( + ["all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of"] + ) + + transformer = WikidataConceptsTransformer( + entity_type="edges", linked_ontology="loc" + ) + + edges = list(transformer.stream(entity_type="edges", query_chunk_size=100))[0] + print(edges) + assert len(list(edges)) == 7 + + assert edges[0] == SourceConceptSameAs( + from_type="SourceConcept", + to_type="SourceConcept", + from_id="sh00000001", + to_id="Q1", + relationship="SAME_AS", + directed=False, + attributes={"source": "wikidata"}, + ) From 190fcbf7b8ddf5a5310ca283d914199c4c436e9d Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Mon, 3 Feb 2025 10:50:36 +0000 Subject: [PATCH 227/310] Apply auto-formatting rules --- tests/sources/test_wikidata_concepts_source.py | 6 +++--- tests/test_extractor.py | 4 ++-- tests/transformers/test_wikidata_concepts_transformer.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index 1dcea54d56..b6e88704f0 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -1,11 +1,11 @@ -from test_mocks import MockRequest, MockSmartOpen import json from typing import Literal -from config import WIKIDATA_SPARQL_URL -from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from test_mocks import MockRequest, MockSmartOpen from test_utils import load_fixture +from config import WIKIDATA_SPARQL_URL +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource WikidataQueryType = Literal[ "all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of", "items" diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 6b060ed41f..834b0f6f65 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -6,11 +6,11 @@ from typing_extensions import get_args from config import ( + CATALOGUE_SNAPSHOT_URL, LOC_NAMES_URL, LOC_SUBJECT_HEADINGS_URL, MESH_URL, WIKIDATA_SPARQL_URL, - CATALOGUE_SNAPSHOT_URL ) from extractor import LambdaEvent, lambda_handler from transformers.base_transformer import EntityType, StreamDestination @@ -154,7 +154,7 @@ def test_lambda_handler( "wikidata_linked_loc_locations": [WIKIDATA_SPARQL_URL], "wikidata_linked_mesh_concepts": [WIKIDATA_SPARQL_URL], "wikidata_linked_mesh_locations": [WIKIDATA_SPARQL_URL], - "catalogue_concepts": [CATALOGUE_SNAPSHOT_URL] + "catalogue_concepts": [CATALOGUE_SNAPSHOT_URL], } assert transformer_type in transformer_types diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index 
3ae87742c7..81cfc744d3 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -1,10 +1,10 @@ from test_wikidata_concepts_source import ( - _add_mock_wikidata_requests, _add_mock_loc_transformer_outputs, + _add_mock_wikidata_requests, ) -from models.graph_node import SourceConcept from models.graph_edge import SourceConceptSameAs +from models.graph_node import SourceConcept from transformers.wikidata.concepts_transformer import WikidataConceptsTransformer From 0a44fbcbfe2596356d42efd51356c6551b5f7faa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 3 Feb 2025 11:40:13 +0000 Subject: [PATCH 228/310] Expand Wikidata tests --- tests/fixtures/wikidata/all_ids_response.json | 6 +++ tests/fixtures/wikidata/raw_location.json | 21 ++++++++++ tests/fixtures/wikidata/raw_name.json | 16 ++++++++ .../test_catalogue_concepts_source.py | 0 .../sources/test_wikidata_concepts_source.py | 40 ++++++++++++++----- .../test_catalogue_concepts_transformer.py | 0 .../test_wikidata_concepts_transformer.py | 24 ++++++++--- tests/{ => utils}/test_xml.py | 0 8 files changed, 90 insertions(+), 17 deletions(-) create mode 100644 tests/fixtures/wikidata/raw_location.json create mode 100644 tests/fixtures/wikidata/raw_name.json rename tests/{ => sources}/test_catalogue_concepts_source.py (100%) rename tests/{ => transformers}/test_catalogue_concepts_transformer.py (100%) rename tests/{ => utils}/test_xml.py (100%) diff --git a/tests/fixtures/wikidata/all_ids_response.json b/tests/fixtures/wikidata/all_ids_response.json index 6cd31df097..dbe318698a 100644 --- a/tests/fixtures/wikidata/all_ids_response.json +++ b/tests/fixtures/wikidata/all_ids_response.json @@ -7,6 +7,12 @@ "value": "http://www.wikidata.org/entity/Q1" } }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q2" + } + }, { "item": { "type": "uri", diff --git a/tests/fixtures/wikidata/raw_location.json b/tests/fixtures/wikidata/raw_location.json new file mode 100644 index 0000000000..e76d7bee83 --- /dev/null +++ b/tests/fixtures/wikidata/raw_location.json @@ -0,0 +1,21 @@ +{ + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q33299614" + }, + "itemLabel": { + "xml:lang": "en", + "type": "literal", + "value": "Pleasant Lake" + }, + "itemDescription": { + "xml:lang": "en", + "type": "literal", + "value": "lake in Noble County, Indiana, United States" + }, + "coordinates": { + "datatype": "http://www.opengis.net/ont/geosparql#wktLiteral", + "type": "literal", + "value": "Point(-85.469166666 41.346111111)" + } +} diff --git a/tests/fixtures/wikidata/raw_name.json b/tests/fixtures/wikidata/raw_name.json new file mode 100644 index 0000000000..38ad4eddc7 --- /dev/null +++ b/tests/fixtures/wikidata/raw_name.json @@ -0,0 +1,16 @@ +{ + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q12720393" + }, + "itemLabel": { + "xml:lang": "en", + "type": "literal", + "value": "Alexandru Surdu" + }, + "itemDescription": { + "xml:lang": "en", + "type": "literal", + "value": "Romanian philosopher (1938-2020)" + } +} diff --git a/tests/test_catalogue_concepts_source.py b/tests/sources/test_catalogue_concepts_source.py similarity index 100% rename from tests/test_catalogue_concepts_source.py rename to tests/sources/test_catalogue_concepts_source.py diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index b6e88704f0..90360cdbce 100644 --- 
a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -2,17 +2,24 @@ from typing import Literal from test_mocks import MockRequest, MockSmartOpen +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from sources.wikidata.linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker from test_utils import load_fixture from config import WIKIDATA_SPARQL_URL -from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource -WikidataQueryType = Literal[ - "all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of", "items" -] +def _add_mock_wikidata_requests(node_type: Literal["edges", "nodes"]) -> None: + """Add all the required mock Wikidata requests/responses based on whether we are streaming nodes or edges""" + query_types = [ + "all_ids", + "linked_ids", + "parents_instance_of", + "parents_subclass_of", + ] + if node_type == "nodes": + query_types.append("items") -def _add_mock_wikidata_requests(query_types: list[WikidataQueryType]) -> None: for query_type in query_types: params = json.loads(load_fixture(f"wikidata/{query_type}_query.json")) response = json.loads(load_fixture(f"wikidata/{query_type}_response.json")) @@ -22,6 +29,9 @@ def _add_mock_wikidata_requests(query_types: list[WikidataQueryType]) -> None: def _add_mock_loc_transformer_outputs() -> None: + """ + Add mock LoC transformer output files to S3 so that the LinkedOntologyIdTypeChecker class can extract ids from them. + """ for node_type in ["concepts", "locations", "names"]: MockSmartOpen.mock_s3_file( f"s3://bulk_load_test_bucket/loc_{node_type}__nodes.csv", @@ -31,9 +41,7 @@ def _add_mock_loc_transformer_outputs() -> None: def test_wikidata_concepts_source_edges() -> None: _add_mock_loc_transformer_outputs() - _add_mock_wikidata_requests( - ["all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of"] - ) + _add_mock_wikidata_requests("edges") mesh_concepts_source = WikidataLinkedOntologySource( node_type="concepts", linked_ontology="loc", entity_type="edges" @@ -64,9 +72,7 @@ def test_wikidata_concepts_source_edges() -> None: def test_wikidata_concepts_source_nodes() -> None: _add_mock_loc_transformer_outputs() - _add_mock_wikidata_requests( - ["all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of", "items"] - ) + _add_mock_wikidata_requests("nodes") mesh_concepts_source = WikidataLinkedOntologySource( node_type="concepts", linked_ontology="loc", entity_type="nodes" @@ -80,3 +86,15 @@ def test_wikidata_concepts_source_nodes() -> None: assert "item" in raw_node assert "itemLabel" in raw_node assert "itemDescription" in raw_node + + +def test_wikidata_linked_ontology_id_checker(): + _add_mock_loc_transformer_outputs() + id_checker = LinkedOntologyIdTypeChecker("locations", "loc") + + assert id_checker.id_is_valid("sh00000001") + assert not id_checker.id_is_valid("sh00000001000") + + assert not id_checker.id_included_in_selected_type("sh00000001") + assert not id_checker.id_included_in_selected_type("tgrefwdw") + assert id_checker.id_included_in_selected_type("sh00000015") diff --git a/tests/test_catalogue_concepts_transformer.py b/tests/transformers/test_catalogue_concepts_transformer.py similarity index 100% rename from tests/test_catalogue_concepts_transformer.py rename to tests/transformers/test_catalogue_concepts_transformer.py diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index 81cfc744d3..a01e970d85 
100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -6,13 +6,15 @@ from models.graph_edge import SourceConceptSameAs from models.graph_node import SourceConcept from transformers.wikidata.concepts_transformer import WikidataConceptsTransformer +from transformers.wikidata.raw_concept import RawWikidataLocation. RawWikidataName +from test_utils import load_fixture +import json +import math def test_wikidata_concepts_nodes_transformer() -> None: _add_mock_loc_transformer_outputs() - _add_mock_wikidata_requests( - ["all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of", "items"] - ) + _add_mock_wikidata_requests("nodes") transformer = WikidataConceptsTransformer( entity_type="nodes", linked_ontology="loc" @@ -34,9 +36,7 @@ def test_wikidata_concepts_nodes_transformer() -> None: def test_wikidata_concepts_edges_transformer() -> None: _add_mock_loc_transformer_outputs() - _add_mock_wikidata_requests( - ["all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of"] - ) + _add_mock_wikidata_requests("edges") transformer = WikidataConceptsTransformer( entity_type="edges", linked_ontology="loc" @@ -55,3 +55,15 @@ def test_wikidata_concepts_edges_transformer() -> None: directed=False, attributes={"source": "wikidata"}, ) + + +def test_wikidata_raw_location(): + raw_location_input = json.loads(load_fixture("wikidata/raw_location.json")) + raw_location = RawWikidataLocation(raw_location_input) + + assert math.isclose(raw_location.latitude, 41.346111111) + assert math.isclose(raw_location.longitude, -85.469166666) + +def test_wikidata_raw_name(): + raw_name_input = json.loads(load_fixture("wikidata/raw_name.json")) + raw_name = RawWikidataName(raw_name_input) diff --git a/tests/test_xml.py b/tests/utils/test_xml.py similarity index 100% rename from tests/test_xml.py rename to tests/utils/test_xml.py From 0e56ab42da83e0f4e8ad2e0e3cfb7fc6a8f4b068 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 3 Feb 2025 15:30:31 +0000 Subject: [PATCH 229/310] Add edge type to models --- src/models/graph_edge.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/models/graph_edge.py b/src/models/graph_edge.py index 2d9afd125e..fa92aaaaa9 100644 --- a/src/models/graph_edge.py +++ b/src/models/graph_edge.py @@ -37,3 +37,9 @@ class SourceConceptHasParent(BaseEdge): to_type: str = "SourceConcept" relationship: str = "HAS_PARENT" directed: bool = True + +class ConceptHasSourceConcept(BaseEdge): + from_type: str = "Concept" + to_type: str = "SourceConcept" + relationship: str = "HAS_SOURCE_CONCEPT" + directed: bool = True From d6dcc81967635172e8607d177c490792b35c8c26 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Mon, 3 Feb 2025 15:32:00 +0000 Subject: [PATCH 230/310] Extract source id and qualifier --- src/transformers/catalogue/raw_concept.py | 41 +++++++++++++++++------ 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index 97e87aa0b6..139e1418e1 100644 --- a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -1,4 +1,5 @@ -from typing import cast, get_args +import re +from typing import get_args from models.graph_node import ConceptSource, ConceptType @@ -56,20 +57,40 @@ def type(self) -> ConceptType: concept_type: ConceptType = self.raw_concept["type"] return concept_type - def _get_identifier(self) -> dict: + @property + def raw_identifier(self) 
-> dict | None: """Returns metadata about the source identifier.""" - raw_identifier = self.raw_concept.get("identifiers", []) + identifier_metadata = self.raw_concept.get("identifiers", []) # There should be exactly one source identifier for each concept - assert len(raw_identifier) == 1 - identifier = raw_identifier[0] + assert len(identifier_metadata) == 1 + raw_identifier = identifier_metadata[0] - assert isinstance(identifier, dict) - return identifier + assert isinstance(raw_identifier, dict) + return raw_identifier @property def source(self) -> ConceptSource: """Returns the concept source (one of "lc-names", "label-derived", etc.).""" - identifier = self._get_identifier() - - source: ConceptSource = identifier["identifierType"]["id"] + source: ConceptSource = self.raw_identifier["identifierType"]["id"] return source + + @property + def mesh_qualifier(self) -> str | None: + """Returns MeSH qualifier ID, if present.""" + if self.source == "nlm-mesh": + qualifier = re.search(r'Q\d+', self.raw_identifier.get("value", "")) + if qualifier is not None: + return qualifier.group() + + return None + + @property + def source_concept_id(self) -> str | None: + """Returns ID of source concept, if present.""" + source_id = self.raw_identifier.get("value") + if isinstance(source_id, str): + if isinstance(self.mesh_qualifier, str): + source_id = source_id.replace(self.mesh_qualifier, "") + return source_id + + return None From 7a7da4d875640f338153cae745e87441049a6fcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 3 Feb 2025 13:42:27 +0000 Subject: [PATCH 231/310] Wikidata names fields bug fix --- src/clients/base_neptune_client.py | 9 +++-- src/sources/wikidata/sparql_client.py | 9 +++-- src/sources/wikidata/sparql_query_builder.py | 40 ++++++++++++++----- src/transformers/wikidata/raw_concept.py | 4 +- tests/fixtures/wikidata/items_query.json | 2 +- tests/fixtures/wikidata/raw_name.json | 26 ++++++++++-- .../test_wikidata_concepts_transformer.py | 11 ++++- 7 files changed, 74 insertions(+), 27 deletions(-) diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 058e7da2a6..97cffc66d9 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -7,9 +7,10 @@ import requests from botocore.auth import SigV4Auth from botocore.awsrequest import AWSRequest +import os -NEPTUNE_BACKOFF_DEFAULT_RETRIES = 3 -NEPTUNE_BACKOFF_DEFAULT_INTERVAL = 10 +NEPTUNE_REQUESTS_BACKOFF_RETRIES = int(os.environ.get("REQUESTS_BACKOFF_RETRIES", "3")) +NEPTUNE_REQUESTS_BACKOFF_INTERVAL = 10 def on_request_backoff(backoff_details: typing.Any) -> None: @@ -35,8 +36,8 @@ def _get_client_url(self) -> str: @backoff.on_exception( backoff.constant, Exception, - max_tries=NEPTUNE_BACKOFF_DEFAULT_RETRIES, - interval=NEPTUNE_BACKOFF_DEFAULT_INTERVAL, + max_tries=NEPTUNE_REQUESTS_BACKOFF_RETRIES, + interval=NEPTUNE_REQUESTS_BACKOFF_INTERVAL, on_backoff=on_request_backoff, ) def _make_request( diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index d4b9b14b22..09fdab48d2 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -4,6 +4,7 @@ import backoff import requests +import os from config import WIKIDATA_SPARQL_URL @@ -11,8 +12,8 @@ # See: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits # However, experimentally, running more than 4 queries in parallel consistently results in '429 Too Many Requests' errors. 
SPARQL_MAX_PARALLEL_QUERIES = 4 -SPARQL_BACKOFF_DEFAULT_RETRIES = 3 -SPARQL_BACKOFF_DEFAULT_INTERVAL = 10 +SPARQL_REQUESTS_BACKOFF_RETRIES = int(os.environ.get("REQUESTS_BACKOFF_RETRIES", "3")) +SPARQL_REQUESTS_BACKOFF_INTERVAL = 10 def on_request_backoff(backoff_details: typing.Any) -> None: @@ -45,8 +46,8 @@ def _get_user_agent_header() -> str: @backoff.on_exception( backoff.constant, Exception, - max_tries=SPARQL_BACKOFF_DEFAULT_RETRIES, - interval=SPARQL_BACKOFF_DEFAULT_INTERVAL, + max_tries=SPARQL_REQUESTS_BACKOFF_RETRIES, + interval=SPARQL_REQUESTS_BACKOFF_INTERVAL, on_backoff=on_request_backoff, ) def run_query(self, query: str) -> list[dict]: diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index 6e9f377704..4da3ada0b4 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -24,7 +24,7 @@ def _get_formatted_fields(node_type: NodeType) -> str: fields = ["?item", "?itemLabel", "?itemDescription", "?itemAltLabel"] if node_type == "names": - fields += ["?dateOfBirthLabel", "?dateOfDeathLabel", "?placeOfBirthLabel"] + fields += ["?dateOfBirth", "?dateOfDeath", "?placeOfBirthLabel"] elif node_type == "locations": fields += ["?coordinates"] @@ -69,6 +69,31 @@ def _get_formatted_field_mappings(node_type: NodeType) -> str: return "\n".join(definitions) + @staticmethod + def _get_label_mappings(node_type: NodeType) -> str: + """ + + :param node_type: + :return: + """ + extra_mappings = [] + if node_type == "names": + extra_mappings.append("?placeOfBirth rdfs:label ?placeOfBirthLabel.") + + label_mappings = f""" + OPTIONAL {{ + SERVICE wikibase:label {{ + bd:serviceParam wikibase:language "en". + ?item rdfs:label ?itemLabel. + ?item schema:description ?itemDescription. + ?item skos:altLabel ?itemAltLabel. + {'\n'.join(extra_mappings)} + }} + }} + """ + + return label_mappings + @staticmethod def get_all_ids_query(linked_ontology: OntologyType) -> str: """ @@ -100,16 +125,9 @@ def get_items_query(cls, item_ids: list[str], node_type: NodeType) -> str: query = f""" SELECT DISTINCT {cls._get_formatted_fields(node_type)} WHERE {{ - VALUES ?item {{ {ids_clause} }} - - {cls._get_formatted_field_mappings(node_type)} - - SERVICE wikibase:label {{ - bd:serviceParam wikibase:language "en". - ?item rdfs:label ?itemLabel. - ?item schema:description ?itemDescription. - ?item skos:altLabel ?itemAltLabel. 
- }} + VALUES ?item {{ {ids_clause} }} + {cls._get_formatted_field_mappings(node_type)} + {cls._get_label_mappings(node_type)} }} GROUP BY ?item """ diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index 6621a1996b..3e07919f41 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -111,11 +111,11 @@ def _extract_date(self, field_name: str) -> str | None: @property def date_of_birth(self) -> str | None: - return self._extract_date("dateOfBirthLabel") + return self._extract_date("dateOfBirth") @property def date_of_death(self) -> str | None: - return self._extract_date("dateOfDeathLabel") + return self._extract_date("dateOfDeath") @property def place_of_birth(self) -> str | None: diff --git a/tests/fixtures/wikidata/items_query.json b/tests/fixtures/wikidata/items_query.json index 0d32141a24..0a0ddfa48d 100644 --- a/tests/fixtures/wikidata/items_query.json +++ b/tests/fixtures/wikidata/items_query.json @@ -1,4 +1,4 @@ { "format": "json", - "query": "SELECT DISTINCT ?item (SAMPLE(?itemLabel) as ?itemLabel) (SAMPLE(?itemDescription) as ?itemDescription) (SAMPLE(?itemAltLabel) as ?itemAltLabel) WHERE { VALUES ?item { wd:Q1 wd:Q2 wd:Q3 wd:Q4 } SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\". ?item rdfs:label ?itemLabel. ?item schema:description ?itemDescription. ?item skos:altLabel ?itemAltLabel. } } GROUP BY ?item" + "query": "SELECT DISTINCT ?item (SAMPLE(?itemLabel) as ?itemLabel) (SAMPLE(?itemDescription) as ?itemDescription) (SAMPLE(?itemAltLabel) as ?itemAltLabel) WHERE { VALUES ?item { wd:Q1 wd:Q2 wd:Q3 wd:Q4 } OPTIONAL { SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\". ?item rdfs:label ?itemLabel. ?item schema:description ?itemDescription. ?item skos:altLabel ?itemAltLabel. } } } GROUP BY ?item" } diff --git a/tests/fixtures/wikidata/raw_name.json b/tests/fixtures/wikidata/raw_name.json index 38ad4eddc7..1d554752b3 100644 --- a/tests/fixtures/wikidata/raw_name.json +++ b/tests/fixtures/wikidata/raw_name.json @@ -1,16 +1,36 @@ { "item": { "type": "uri", - "value": "http://www.wikidata.org/entity/Q12720393" + "value": "http://www.wikidata.org/entity/Q15429764" }, "itemLabel": { "xml:lang": "en", "type": "literal", - "value": "Alexandru Surdu" + "value": "Walter McCaffrey" }, "itemDescription": { "xml:lang": "en", "type": "literal", - "value": "Romanian philosopher (1938-2020)" + "value": "American politician" + }, + "dateOfBirth": { + "datatype": "http://www.w3.org/2001/XMLSchema#dateTime", + "type": "literal", + "value": "1949-01-28T00:00:00Z" + }, + "dateOfDeath": { + "datatype": "http://www.w3.org/2001/XMLSchema#dateTime", + "type": "literal", + "value": "2013-07-10T00:00:00Z" + }, + "placeOfBirthLabel": { + "xml:lang": "en", + "type": "literal", + "value": "Queens" + }, + "itemAltLabel": { + "xml:lang": "en", + "type": "literal", + "value": "Walter Lawrence McCaffrey" } } diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index a01e970d85..584ea64fbc 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -6,7 +6,7 @@ from models.graph_edge import SourceConceptSameAs from models.graph_node import SourceConcept from transformers.wikidata.concepts_transformer import WikidataConceptsTransformer -from transformers.wikidata.raw_concept import RawWikidataLocation. 
RawWikidataName +from transformers.wikidata.raw_concept import RawWikidataLocation, RawWikidataName from test_utils import load_fixture import json import math @@ -64,6 +64,13 @@ def test_wikidata_raw_location(): assert math.isclose(raw_location.latitude, 41.346111111) assert math.isclose(raw_location.longitude, -85.469166666) + def test_wikidata_raw_name(): raw_name_input = json.loads(load_fixture("wikidata/raw_name.json")) - raw_name = RawWikidataName(raw_name_input) + raw_name = RawWikidataName(raw_name_input) + + assert raw_name.date_of_birth == "1949-01-28T00:00:00Z" + assert raw_name.date_of_death == "2013-07-10T00:00:00Z" + assert raw_name.place_of_birth == "Queens" + assert raw_name.label == "Walter McCaffrey" + assert raw_name.description == "American politician" From 995dbfb4e85d78db2980719750bd7c4ab7171a96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 3 Feb 2025 17:25:30 +0000 Subject: [PATCH 232/310] Add support for streaming to local file --- .gitignore | 2 + src/extractor.py | 20 ++--- src/transformers/base_transformer.py | 86 ++++++++++--------- .../wikidata/locations_transformer.py | 4 +- .../wikidata/names_transformer.py | 6 +- src/transformers/wikidata/raw_concept.py | 7 ++ .../sources/test_wikidata_concepts_source.py | 2 +- .../test_catalogue_concepts_transformer.py | 12 ++- .../test_mesh_concepts_transformer.py | 14 ++- .../test_wikidata_concepts_transformer.py | 11 +-- 10 files changed, 87 insertions(+), 77 deletions(-) diff --git a/.gitignore b/.gitignore index 1adc4e1d59..bd10885a55 100644 --- a/.gitignore +++ b/.gitignore @@ -382,3 +382,5 @@ tags # notebooks notebooks/* !notebooks/graph_exploration.ipynb + +transformer_outputs/* diff --git a/src/extractor.py b/src/extractor.py index 977c3b5636..d4920274f6 100755 --- a/src/extractor.py +++ b/src/extractor.py @@ -8,8 +8,6 @@ from transformers.create_transformer import TransformerType, create_transformer from utils.aws import get_neptune_client -CHUNK_SIZE = 256 - class LambdaEvent(typing.TypedDict): transformer_type: TransformerType @@ -34,9 +32,7 @@ def handler( if stream_destination == "graph": neptune_client = get_neptune_client(is_local) - transformer.stream_to_graph( - neptune_client, entity_type, CHUNK_SIZE, sample_size - ) + transformer.stream_to_graph(neptune_client, entity_type, sample_size) elif stream_destination == "s3": assert ( config.S3_BULK_LOAD_BUCKET_NAME is not None @@ -44,17 +40,19 @@ def handler( file_name = f"{transformer_type}__{entity_type}.csv" s3_uri = f"s3://{config.S3_BULK_LOAD_BUCKET_NAME}/{file_name}" - transformer.stream_to_s3(s3_uri, entity_type, CHUNK_SIZE, sample_size) + transformer.stream_to_s3(s3_uri, entity_type, sample_size) elif stream_destination == "sns": + topic_arn = config.GRAPH_QUERIES_SNS_TOPIC_ARN assert ( - config.GRAPH_QUERIES_SNS_TOPIC_ARN is not None + topic_arn is not None ), "To stream to SNS, the GRAPH_QUERIES_SNS_TOPIC_ARN environment variable must be defined." 
- transformer.stream_to_sns( - config.GRAPH_QUERIES_SNS_TOPIC_ARN, entity_type, CHUNK_SIZE, sample_size - ) + transformer.stream_to_sns(topic_arn, entity_type, sample_size) + elif stream_destination == "local": + file_name = f"{transformer_type}__{entity_type}.csv" + transformer.stream_to_local_file(file_name, entity_type, sample_size) elif stream_destination == "void": - for _ in transformer.stream(entity_type, CHUNK_SIZE, sample_size): + for _ in transformer.stream(entity_type, sample_size): pass else: raise ValueError("Unsupported stream destination.") diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 8a40a5c719..539dac1f14 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -2,10 +2,11 @@ import csv from collections.abc import Generator from itertools import islice -from typing import Any, Literal +from typing import Any, Literal, TextIO import boto3 import smart_open +import os from clients.base_neptune_client import BaseNeptuneClient from converters.cypher.bulk_load_converter import CypherBulkLoadConverter @@ -17,7 +18,9 @@ from utils.streaming import generator_to_chunks EntityType = Literal["nodes", "edges"] -StreamDestination = Literal["graph", "s3", "sns", "void"] +StreamDestination = Literal["graph", "s3", "sns", "local", "void"] + +CHUNK_SIZE = int(os.environ.get("TRANSFORMER_CHUNK_SIZE", "256")) class BaseTransformer: @@ -92,26 +95,38 @@ def _stream_entities( yield from entities + def _stream_to_bulk_load_file( + self, file: TextIO, entity_type: EntityType, sample_size: int | None = None + ) -> None: + """Streams entities to a file in the openCypher format for Neptune bulk load.""" + csv_writer = None + converter = CypherBulkLoadConverter(entity_type) + + for chunk in self._stream_chunks(entity_type, sample_size): + bulk_dicts = [] + for entity in chunk: + bulk_dict = converter.convert_to_bulk_cypher(entity) + bulk_dicts.append(bulk_dict) + + if csv_writer is None: + csv_writer = csv.DictWriter(file, fieldnames=bulk_dicts[0].keys()) + csv_writer.writeheader() + + csv_writer.writerows(bulk_dicts) + def _stream_chunks( - self, - entity_type: EntityType, - chunk_size: int, - sample_size: int | None = None, + self, entity_type: EntityType, sample_size: int | None = None ) -> Generator[list[BaseNode | BaseEdge]]: """ Extracts the specified entity type (nodes or edges) from its source, transforms each entity, and returns the results stream in fixed-size chunks. """ entities = self._stream_entities(entity_type, sample_size) - for chunk in generator_to_chunks(entities, chunk_size): + for chunk in generator_to_chunks(entities, CHUNK_SIZE): yield chunk def stream_to_s3( - self, - s3_uri: str, - entity_type: EntityType, - chunk_size: int, - sample_size: int | None = None, + self, s3_uri: str, entity_type: EntityType, sample_size: int | None = None ) -> None: """ Streams transformed entities (nodes or edges) into an S3 bucket for bulk loading into the Neptune cluster. 
@@ -119,33 +134,19 @@ def stream_to_s3( """ transport_params = {"client": boto3.client("s3")} with smart_open.open(s3_uri, "w", transport_params=transport_params) as f: - csv_writer = None - - converter = CypherBulkLoadConverter(entity_type) - for chunk in self._stream_chunks(entity_type, chunk_size, sample_size): - bulk_dicts = [] - for entity in chunk: - bulk_dict = converter.convert_to_bulk_cypher(entity) - bulk_dicts.append(bulk_dict) - - if csv_writer is None: - csv_writer = csv.DictWriter(f, fieldnames=bulk_dicts[0].keys()) - csv_writer.writeheader() - - csv_writer.writerows(bulk_dicts) + self._stream_to_bulk_load_file(f, entity_type, sample_size) def stream_to_graph( self, neptune_client: BaseNeptuneClient, entity_type: EntityType, - query_chunk_size: int, sample_size: int | None = None, ) -> None: """ Streams transformed entities (nodes or edges) directly into Neptune using multiple threads for parallel processing. Suitable for local testing. Not recommended for indexing large numbers of entities. """ - chunks = self._stream_chunks(entity_type, query_chunk_size, sample_size) + chunks = self._stream_chunks(entity_type, sample_size) def run_query(chunk: list[BaseNode | BaseEdge]) -> None: query = construct_upsert_cypher_query(chunk, entity_type) @@ -172,11 +173,7 @@ def run_query(chunk: list[BaseNode | BaseEdge]) -> None: futures.add(executor.submit(run_query, chunk)) def stream_to_sns( - self, - topic_arn: str, - entity_type: EntityType, - query_chunk_size: int, - sample_size: int | None = None, + self, topic_arn: str, entity_type: EntityType, sample_size: int | None = None ) -> None: """ Streams transformed entities (nodes or edges) into an SNS topic as openCypher queries, where they will be @@ -185,7 +182,7 @@ def stream_to_sns( queries = [] counter = 0 - for chunk in self._stream_chunks(entity_type, query_chunk_size, sample_size): + for chunk in self._stream_chunks(entity_type, sample_size): queries.append(construct_upsert_cypher_query(chunk, entity_type)) # SNS supports a maximum batch size of 10 @@ -202,13 +199,22 @@ def stream_to_sns( publish_batch_to_sns(topic_arn, queries) def stream( - self, - entity_type: EntityType, - query_chunk_size: int, - sample_size: int | None = None, - ) -> Generator[Any, Any, Any]: + self, entity_type: EntityType, sample_size: int | None = None + ) -> Generator: """ Streams transformed entities (nodes or edges) as a generator. Useful for development and testing purposes. """ - for chunk in self._stream_chunks(entity_type, query_chunk_size, sample_size): + for chunk in self._stream_chunks(entity_type, sample_size): yield chunk + + def stream_to_local_file( + self, file_name: str, entity_type: EntityType, sample_size: int | None = None + ) -> None: + """ + Streams transformed entities (nodes or edges) into the local `transformer_outputs` folder. + Useful for development and testing purposes. 
+ """ + file_path = f"../transformer_outputs/{file_name}" + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "w") as f: + self._stream_to_bulk_load_file(f, entity_type, sample_size) diff --git a/src/transformers/wikidata/locations_transformer.py b/src/transformers/wikidata/locations_transformer.py index 8b7be0e636..2c320b89b7 100644 --- a/src/transformers/wikidata/locations_transformer.py +++ b/src/transformers/wikidata/locations_transformer.py @@ -10,9 +10,9 @@ class WikidataLocationsTransformer(WikidataConceptsTransformer): - def __init__(self, entity_type: EntityType, ontology_type: OntologyType): + def __init__(self, entity_type: EntityType, linked_ontology: OntologyType): self.source = WikidataLinkedOntologySource( - "locations", ontology_type, entity_type + "locations", linked_ontology, entity_type ) def transform_node(self, raw_node: dict) -> SourceLocation | None: diff --git a/src/transformers/wikidata/names_transformer.py b/src/transformers/wikidata/names_transformer.py index 5c8b8318cb..424fe2f8c0 100644 --- a/src/transformers/wikidata/names_transformer.py +++ b/src/transformers/wikidata/names_transformer.py @@ -10,8 +10,10 @@ class WikidataNamesTransformer(WikidataConceptsTransformer): - def __init__(self, entity_type: EntityType, ontology_type: OntologyType): - self.source = WikidataLinkedOntologySource("names", ontology_type, entity_type) + def __init__(self, entity_type: EntityType, linked_ontology: OntologyType): + self.source = WikidataLinkedOntologySource( + "names", linked_ontology, entity_type + ) def transform_node(self, raw_node: dict) -> SourceName | None: raw_concept = RawWikidataName(raw_node) diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index 3e07919f41..d5d88b9be2 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -94,6 +94,13 @@ def latitude(self) -> float | None: class RawWikidataName(RawWikidataConcept): def _extract_date(self, field_name: str) -> str | None: + # Some Wikidata items store invalid dates of type 'uri', such as https://www.wikidata.org/wiki/Q20760409 + if ( + field_name in self.raw_concept + and self.raw_concept[field_name]["type"] == "uri" + ): + return None + date_value = self._extract_optional_field_value(field_name) # When a date is unknown, sometimes Wikidata returns a URL instead of a valid date, such as diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index 90360cdbce..7729920b91 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -88,7 +88,7 @@ def test_wikidata_concepts_source_nodes() -> None: assert "itemDescription" in raw_node -def test_wikidata_linked_ontology_id_checker(): +def test_wikidata_linked_ontology_id_checker() -> None: _add_mock_loc_transformer_outputs() id_checker = LinkedOntologyIdTypeChecker("locations", "loc") diff --git a/tests/transformers/test_catalogue_concepts_transformer.py b/tests/transformers/test_catalogue_concepts_transformer.py index 6999b2580e..52b62f48da 100644 --- a/tests/transformers/test_catalogue_concepts_transformer.py +++ b/tests/transformers/test_catalogue_concepts_transformer.py @@ -22,11 +22,9 @@ def test_mesh_concepts_transformer() -> None: catalogue_concepts_transformer = CatalogueConceptsTransformer(test_url) # test transform_node - nodes = list( - catalogue_concepts_transformer.stream(entity_type="nodes", query_chunk_size=1) - ) + nodes = 
list(catalogue_concepts_transformer.stream(entity_type="nodes"))[0] assert len(list(nodes)) == 12 - assert nodes[0][0].id == "s6s24vd7" - assert nodes[0][0].label == "Human anatomy" - assert nodes[0][0].type == "Concept" - assert nodes[0][0].source == "lc-subjects" + assert nodes[0].id == "s6s24vd7" + assert nodes[0].label == "Human anatomy" + assert nodes[0].type == "Concept" + assert nodes[0].source == "lc-subjects" diff --git a/tests/transformers/test_mesh_concepts_transformer.py b/tests/transformers/test_mesh_concepts_transformer.py index 54ea6d5d90..dd2166c83e 100644 --- a/tests/transformers/test_mesh_concepts_transformer.py +++ b/tests/transformers/test_mesh_concepts_transformer.py @@ -22,17 +22,13 @@ def test_mesh_concepts_transformer() -> None: mesh_concepts_transformer = MeSHConceptsTransformer(test_url) # test transform_node - nodes = list( - mesh_concepts_transformer.stream(entity_type="nodes", query_chunk_size=1) - ) + nodes = list(mesh_concepts_transformer._stream_nodes()) assert len(list(nodes)) == 7 - assert nodes[0][0].id == "D009930" - assert nodes[0][0].label == "Organic Chemicals" + assert nodes[0].id == "D009930" + assert nodes[0].label == "Organic Chemicals" - stream = mesh_concepts_transformer.stream(entity_type="edges", query_chunk_size=1) - # get first element, trying to get all of them will get edges we don't have in the test data - first_chunk = stream.__next__() - first_element = first_chunk[0] + stream = mesh_concepts_transformer._stream_edges() + first_element = stream.__next__() assert first_element.from_type == "SourceConcept" assert first_element.to_type == "SourceConcept" diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index 584ea64fbc..d685470e68 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -20,7 +20,7 @@ def test_wikidata_concepts_nodes_transformer() -> None: entity_type="nodes", linked_ontology="loc" ) - nodes = list(transformer.stream(entity_type="nodes", query_chunk_size=100))[0] + nodes = list(transformer._stream_entities(entity_type="nodes")) assert len(list(nodes)) == 4 @@ -42,8 +42,7 @@ def test_wikidata_concepts_edges_transformer() -> None: entity_type="edges", linked_ontology="loc" ) - edges = list(transformer.stream(entity_type="edges", query_chunk_size=100))[0] - print(edges) + edges = list(transformer._stream_entities(entity_type="edges")) assert len(list(edges)) == 7 assert edges[0] == SourceConceptSameAs( @@ -57,15 +56,17 @@ def test_wikidata_concepts_edges_transformer() -> None: ) -def test_wikidata_raw_location(): +def test_wikidata_raw_location() -> None: raw_location_input = json.loads(load_fixture("wikidata/raw_location.json")) raw_location = RawWikidataLocation(raw_location_input) + assert raw_location.latitude is not None + assert raw_location.longitude is not None assert math.isclose(raw_location.latitude, 41.346111111) assert math.isclose(raw_location.longitude, -85.469166666) -def test_wikidata_raw_name(): +def test_wikidata_raw_name() -> None: raw_name_input = json.loads(load_fixture("wikidata/raw_name.json")) raw_name = RawWikidataName(raw_name_input) From f372c8f03d11a302ce3174fb496f0238c9796844 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Tue, 4 Feb 2025 09:50:59 +0000 Subject: [PATCH 233/310] Apply auto-formatting rules --- src/clients/base_neptune_client.py | 2 +- src/sources/wikidata/sparql_client.py | 2 +- 
src/sources/wikidata/sparql_query_builder.py | 8 ++++---- src/transformers/base_transformer.py | 2 +- tests/sources/test_wikidata_concepts_source.py | 4 ++-- tests/transformers/test_wikidata_concepts_transformer.py | 7 ++++--- 6 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/clients/base_neptune_client.py b/src/clients/base_neptune_client.py index 97cffc66d9..23c36ec304 100644 --- a/src/clients/base_neptune_client.py +++ b/src/clients/base_neptune_client.py @@ -1,5 +1,6 @@ import datetime import json +import os import typing import backoff @@ -7,7 +8,6 @@ import requests from botocore.auth import SigV4Auth from botocore.awsrequest import AWSRequest -import os NEPTUNE_REQUESTS_BACKOFF_RETRIES = int(os.environ.get("REQUESTS_BACKOFF_RETRIES", "3")) NEPTUNE_REQUESTS_BACKOFF_INTERVAL = 10 diff --git a/src/sources/wikidata/sparql_client.py b/src/sources/wikidata/sparql_client.py index 09fdab48d2..94a52bc651 100644 --- a/src/sources/wikidata/sparql_client.py +++ b/src/sources/wikidata/sparql_client.py @@ -1,10 +1,10 @@ +import os import threading import time import typing import backoff import requests -import os from config import WIKIDATA_SPARQL_URL diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index 4da3ada0b4..6354760fa1 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -72,9 +72,9 @@ def _get_formatted_field_mappings(node_type: NodeType) -> str: @staticmethod def _get_label_mappings(node_type: NodeType) -> str: """ - - :param node_type: - :return: + + :param node_type: + :return: """ extra_mappings = [] if node_type == "names": @@ -90,7 +90,7 @@ def _get_label_mappings(node_type: NodeType) -> str: {'\n'.join(extra_mappings)} }} }} - """ + """ return label_mappings diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 539dac1f14..26796b287c 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -1,12 +1,12 @@ import concurrent.futures import csv +import os from collections.abc import Generator from itertools import islice from typing import Any, Literal, TextIO import boto3 import smart_open -import os from clients.base_neptune_client import BaseNeptuneClient from converters.cypher.bulk_load_converter import CypherBulkLoadConverter diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index 7729920b91..4fdfb82da0 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -2,11 +2,11 @@ from typing import Literal from test_mocks import MockRequest, MockSmartOpen -from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource -from sources.wikidata.linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker from test_utils import load_fixture from config import WIKIDATA_SPARQL_URL +from sources.wikidata.linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource def _add_mock_wikidata_requests(node_type: Literal["edges", "nodes"]) -> None: diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index d685470e68..fb9d575fbe 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -1,3 +1,7 @@ +import json 
+import math + +from test_utils import load_fixture from test_wikidata_concepts_source import ( _add_mock_loc_transformer_outputs, _add_mock_wikidata_requests, @@ -7,9 +11,6 @@ from models.graph_node import SourceConcept from transformers.wikidata.concepts_transformer import WikidataConceptsTransformer from transformers.wikidata.raw_concept import RawWikidataLocation, RawWikidataName -from test_utils import load_fixture -import json -import math def test_wikidata_concepts_nodes_transformer() -> None: From 43b2f2f70916cad528e7b0eec53bcbd2c0aa17f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 3 Feb 2025 17:25:30 +0000 Subject: [PATCH 234/310] Add support for streaming to local file --- .gitignore | 2 + src/extractor.py | 20 ++--- src/sources/wikidata/sparql_query_builder.py | 4 +- src/transformers/base_transformer.py | 86 ++++++++++--------- .../wikidata/locations_transformer.py | 4 +- .../wikidata/names_transformer.py | 6 +- src/transformers/wikidata/raw_concept.py | 7 ++ .../sources/test_wikidata_concepts_source.py | 2 +- .../test_catalogue_concepts_transformer.py | 12 ++- .../test_mesh_concepts_transformer.py | 14 ++- .../test_wikidata_concepts_transformer.py | 11 +-- 11 files changed, 88 insertions(+), 80 deletions(-) diff --git a/.gitignore b/.gitignore index 1adc4e1d59..bd10885a55 100644 --- a/.gitignore +++ b/.gitignore @@ -382,3 +382,5 @@ tags # notebooks notebooks/* !notebooks/graph_exploration.ipynb + +transformer_outputs/* diff --git a/src/extractor.py b/src/extractor.py index 977c3b5636..d4920274f6 100755 --- a/src/extractor.py +++ b/src/extractor.py @@ -8,8 +8,6 @@ from transformers.create_transformer import TransformerType, create_transformer from utils.aws import get_neptune_client -CHUNK_SIZE = 256 - class LambdaEvent(typing.TypedDict): transformer_type: TransformerType @@ -34,9 +32,7 @@ def handler( if stream_destination == "graph": neptune_client = get_neptune_client(is_local) - transformer.stream_to_graph( - neptune_client, entity_type, CHUNK_SIZE, sample_size - ) + transformer.stream_to_graph(neptune_client, entity_type, sample_size) elif stream_destination == "s3": assert ( config.S3_BULK_LOAD_BUCKET_NAME is not None @@ -44,17 +40,19 @@ def handler( file_name = f"{transformer_type}__{entity_type}.csv" s3_uri = f"s3://{config.S3_BULK_LOAD_BUCKET_NAME}/{file_name}" - transformer.stream_to_s3(s3_uri, entity_type, CHUNK_SIZE, sample_size) + transformer.stream_to_s3(s3_uri, entity_type, sample_size) elif stream_destination == "sns": + topic_arn = config.GRAPH_QUERIES_SNS_TOPIC_ARN assert ( - config.GRAPH_QUERIES_SNS_TOPIC_ARN is not None + topic_arn is not None ), "To stream to SNS, the GRAPH_QUERIES_SNS_TOPIC_ARN environment variable must be defined." 
- transformer.stream_to_sns( - config.GRAPH_QUERIES_SNS_TOPIC_ARN, entity_type, CHUNK_SIZE, sample_size - ) + transformer.stream_to_sns(topic_arn, entity_type, sample_size) + elif stream_destination == "local": + file_name = f"{transformer_type}__{entity_type}.csv" + transformer.stream_to_local_file(file_name, entity_type, sample_size) elif stream_destination == "void": - for _ in transformer.stream(entity_type, CHUNK_SIZE, sample_size): + for _ in transformer.stream(entity_type, sample_size): pass else: raise ValueError("Unsupported stream destination.") diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index 4da3ada0b4..1abbd0009e 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -72,9 +72,7 @@ def _get_formatted_field_mappings(node_type: NodeType) -> str: @staticmethod def _get_label_mappings(node_type: NodeType) -> str: """ - - :param node_type: - :return: + Returns SPARQL label mappings using the `wikibase:label` service. """ extra_mappings = [] if node_type == "names": diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 8a40a5c719..539dac1f14 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -2,10 +2,11 @@ import csv from collections.abc import Generator from itertools import islice -from typing import Any, Literal +from typing import Any, Literal, TextIO import boto3 import smart_open +import os from clients.base_neptune_client import BaseNeptuneClient from converters.cypher.bulk_load_converter import CypherBulkLoadConverter @@ -17,7 +18,9 @@ from utils.streaming import generator_to_chunks EntityType = Literal["nodes", "edges"] -StreamDestination = Literal["graph", "s3", "sns", "void"] +StreamDestination = Literal["graph", "s3", "sns", "local", "void"] + +CHUNK_SIZE = int(os.environ.get("TRANSFORMER_CHUNK_SIZE", "256")) class BaseTransformer: @@ -92,26 +95,38 @@ def _stream_entities( yield from entities + def _stream_to_bulk_load_file( + self, file: TextIO, entity_type: EntityType, sample_size: int | None = None + ) -> None: + """Streams entities to a file in the openCypher format for Neptune bulk load.""" + csv_writer = None + converter = CypherBulkLoadConverter(entity_type) + + for chunk in self._stream_chunks(entity_type, sample_size): + bulk_dicts = [] + for entity in chunk: + bulk_dict = converter.convert_to_bulk_cypher(entity) + bulk_dicts.append(bulk_dict) + + if csv_writer is None: + csv_writer = csv.DictWriter(file, fieldnames=bulk_dicts[0].keys()) + csv_writer.writeheader() + + csv_writer.writerows(bulk_dicts) + def _stream_chunks( - self, - entity_type: EntityType, - chunk_size: int, - sample_size: int | None = None, + self, entity_type: EntityType, sample_size: int | None = None ) -> Generator[list[BaseNode | BaseEdge]]: """ Extracts the specified entity type (nodes or edges) from its source, transforms each entity, and returns the results stream in fixed-size chunks. """ entities = self._stream_entities(entity_type, sample_size) - for chunk in generator_to_chunks(entities, chunk_size): + for chunk in generator_to_chunks(entities, CHUNK_SIZE): yield chunk def stream_to_s3( - self, - s3_uri: str, - entity_type: EntityType, - chunk_size: int, - sample_size: int | None = None, + self, s3_uri: str, entity_type: EntityType, sample_size: int | None = None ) -> None: """ Streams transformed entities (nodes or edges) into an S3 bucket for bulk loading into the Neptune cluster. 
@@ -119,33 +134,19 @@ def stream_to_s3( """ transport_params = {"client": boto3.client("s3")} with smart_open.open(s3_uri, "w", transport_params=transport_params) as f: - csv_writer = None - - converter = CypherBulkLoadConverter(entity_type) - for chunk in self._stream_chunks(entity_type, chunk_size, sample_size): - bulk_dicts = [] - for entity in chunk: - bulk_dict = converter.convert_to_bulk_cypher(entity) - bulk_dicts.append(bulk_dict) - - if csv_writer is None: - csv_writer = csv.DictWriter(f, fieldnames=bulk_dicts[0].keys()) - csv_writer.writeheader() - - csv_writer.writerows(bulk_dicts) + self._stream_to_bulk_load_file(f, entity_type, sample_size) def stream_to_graph( self, neptune_client: BaseNeptuneClient, entity_type: EntityType, - query_chunk_size: int, sample_size: int | None = None, ) -> None: """ Streams transformed entities (nodes or edges) directly into Neptune using multiple threads for parallel processing. Suitable for local testing. Not recommended for indexing large numbers of entities. """ - chunks = self._stream_chunks(entity_type, query_chunk_size, sample_size) + chunks = self._stream_chunks(entity_type, sample_size) def run_query(chunk: list[BaseNode | BaseEdge]) -> None: query = construct_upsert_cypher_query(chunk, entity_type) @@ -172,11 +173,7 @@ def run_query(chunk: list[BaseNode | BaseEdge]) -> None: futures.add(executor.submit(run_query, chunk)) def stream_to_sns( - self, - topic_arn: str, - entity_type: EntityType, - query_chunk_size: int, - sample_size: int | None = None, + self, topic_arn: str, entity_type: EntityType, sample_size: int | None = None ) -> None: """ Streams transformed entities (nodes or edges) into an SNS topic as openCypher queries, where they will be @@ -185,7 +182,7 @@ def stream_to_sns( queries = [] counter = 0 - for chunk in self._stream_chunks(entity_type, query_chunk_size, sample_size): + for chunk in self._stream_chunks(entity_type, sample_size): queries.append(construct_upsert_cypher_query(chunk, entity_type)) # SNS supports a maximum batch size of 10 @@ -202,13 +199,22 @@ def stream_to_sns( publish_batch_to_sns(topic_arn, queries) def stream( - self, - entity_type: EntityType, - query_chunk_size: int, - sample_size: int | None = None, - ) -> Generator[Any, Any, Any]: + self, entity_type: EntityType, sample_size: int | None = None + ) -> Generator: """ Streams transformed entities (nodes or edges) as a generator. Useful for development and testing purposes. """ - for chunk in self._stream_chunks(entity_type, query_chunk_size, sample_size): + for chunk in self._stream_chunks(entity_type, sample_size): yield chunk + + def stream_to_local_file( + self, file_name: str, entity_type: EntityType, sample_size: int | None = None + ) -> None: + """ + Streams transformed entities (nodes or edges) into the local `transformer_outputs` folder. + Useful for development and testing purposes. 
+ """ + file_path = f"../transformer_outputs/{file_name}" + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "w") as f: + self._stream_to_bulk_load_file(f, entity_type, sample_size) diff --git a/src/transformers/wikidata/locations_transformer.py b/src/transformers/wikidata/locations_transformer.py index 8b7be0e636..2c320b89b7 100644 --- a/src/transformers/wikidata/locations_transformer.py +++ b/src/transformers/wikidata/locations_transformer.py @@ -10,9 +10,9 @@ class WikidataLocationsTransformer(WikidataConceptsTransformer): - def __init__(self, entity_type: EntityType, ontology_type: OntologyType): + def __init__(self, entity_type: EntityType, linked_ontology: OntologyType): self.source = WikidataLinkedOntologySource( - "locations", ontology_type, entity_type + "locations", linked_ontology, entity_type ) def transform_node(self, raw_node: dict) -> SourceLocation | None: diff --git a/src/transformers/wikidata/names_transformer.py b/src/transformers/wikidata/names_transformer.py index 5c8b8318cb..424fe2f8c0 100644 --- a/src/transformers/wikidata/names_transformer.py +++ b/src/transformers/wikidata/names_transformer.py @@ -10,8 +10,10 @@ class WikidataNamesTransformer(WikidataConceptsTransformer): - def __init__(self, entity_type: EntityType, ontology_type: OntologyType): - self.source = WikidataLinkedOntologySource("names", ontology_type, entity_type) + def __init__(self, entity_type: EntityType, linked_ontology: OntologyType): + self.source = WikidataLinkedOntologySource( + "names", linked_ontology, entity_type + ) def transform_node(self, raw_node: dict) -> SourceName | None: raw_concept = RawWikidataName(raw_node) diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index 3e07919f41..d5d88b9be2 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -94,6 +94,13 @@ def latitude(self) -> float | None: class RawWikidataName(RawWikidataConcept): def _extract_date(self, field_name: str) -> str | None: + # Some Wikidata items store invalid dates of type 'uri', such as https://www.wikidata.org/wiki/Q20760409 + if ( + field_name in self.raw_concept + and self.raw_concept[field_name]["type"] == "uri" + ): + return None + date_value = self._extract_optional_field_value(field_name) # When a date is unknown, sometimes Wikidata returns a URL instead of a valid date, such as diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index 90360cdbce..7729920b91 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -88,7 +88,7 @@ def test_wikidata_concepts_source_nodes() -> None: assert "itemDescription" in raw_node -def test_wikidata_linked_ontology_id_checker(): +def test_wikidata_linked_ontology_id_checker() -> None: _add_mock_loc_transformer_outputs() id_checker = LinkedOntologyIdTypeChecker("locations", "loc") diff --git a/tests/transformers/test_catalogue_concepts_transformer.py b/tests/transformers/test_catalogue_concepts_transformer.py index 6999b2580e..52b62f48da 100644 --- a/tests/transformers/test_catalogue_concepts_transformer.py +++ b/tests/transformers/test_catalogue_concepts_transformer.py @@ -22,11 +22,9 @@ def test_mesh_concepts_transformer() -> None: catalogue_concepts_transformer = CatalogueConceptsTransformer(test_url) # test transform_node - nodes = list( - catalogue_concepts_transformer.stream(entity_type="nodes", query_chunk_size=1) - ) + nodes = 
list(catalogue_concepts_transformer.stream(entity_type="nodes"))[0] assert len(list(nodes)) == 12 - assert nodes[0][0].id == "s6s24vd7" - assert nodes[0][0].label == "Human anatomy" - assert nodes[0][0].type == "Concept" - assert nodes[0][0].source == "lc-subjects" + assert nodes[0].id == "s6s24vd7" + assert nodes[0].label == "Human anatomy" + assert nodes[0].type == "Concept" + assert nodes[0].source == "lc-subjects" diff --git a/tests/transformers/test_mesh_concepts_transformer.py b/tests/transformers/test_mesh_concepts_transformer.py index 54ea6d5d90..dd2166c83e 100644 --- a/tests/transformers/test_mesh_concepts_transformer.py +++ b/tests/transformers/test_mesh_concepts_transformer.py @@ -22,17 +22,13 @@ def test_mesh_concepts_transformer() -> None: mesh_concepts_transformer = MeSHConceptsTransformer(test_url) # test transform_node - nodes = list( - mesh_concepts_transformer.stream(entity_type="nodes", query_chunk_size=1) - ) + nodes = list(mesh_concepts_transformer._stream_nodes()) assert len(list(nodes)) == 7 - assert nodes[0][0].id == "D009930" - assert nodes[0][0].label == "Organic Chemicals" + assert nodes[0].id == "D009930" + assert nodes[0].label == "Organic Chemicals" - stream = mesh_concepts_transformer.stream(entity_type="edges", query_chunk_size=1) - # get first element, trying to get all of them will get edges we don't have in the test data - first_chunk = stream.__next__() - first_element = first_chunk[0] + stream = mesh_concepts_transformer._stream_edges() + first_element = stream.__next__() assert first_element.from_type == "SourceConcept" assert first_element.to_type == "SourceConcept" diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index 584ea64fbc..d685470e68 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -20,7 +20,7 @@ def test_wikidata_concepts_nodes_transformer() -> None: entity_type="nodes", linked_ontology="loc" ) - nodes = list(transformer.stream(entity_type="nodes", query_chunk_size=100))[0] + nodes = list(transformer._stream_entities(entity_type="nodes")) assert len(list(nodes)) == 4 @@ -42,8 +42,7 @@ def test_wikidata_concepts_edges_transformer() -> None: entity_type="edges", linked_ontology="loc" ) - edges = list(transformer.stream(entity_type="edges", query_chunk_size=100))[0] - print(edges) + edges = list(transformer._stream_entities(entity_type="edges")) assert len(list(edges)) == 7 assert edges[0] == SourceConceptSameAs( @@ -57,15 +56,17 @@ def test_wikidata_concepts_edges_transformer() -> None: ) -def test_wikidata_raw_location(): +def test_wikidata_raw_location() -> None: raw_location_input = json.loads(load_fixture("wikidata/raw_location.json")) raw_location = RawWikidataLocation(raw_location_input) + assert raw_location.latitude is not None + assert raw_location.longitude is not None assert math.isclose(raw_location.latitude, 41.346111111) assert math.isclose(raw_location.longitude, -85.469166666) -def test_wikidata_raw_name(): +def test_wikidata_raw_name() -> None: raw_name_input = json.loads(load_fixture("wikidata/raw_name.json")) raw_name = RawWikidataName(raw_name_input) From d94bd0879443612d26697e8eda042b66791b3d73 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Tue, 4 Feb 2025 09:54:32 +0000 Subject: [PATCH 235/310] Apply auto-formatting rules --- src/transformers/base_transformer.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 95a851b9f6..26796b287c 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -7,7 +7,6 @@ import boto3 import smart_open -import os from clients.base_neptune_client import BaseNeptuneClient from converters.cypher.bulk_load_converter import CypherBulkLoadConverter From 361d80ae3e7ba65549b947288628f4becfb3e21b Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Tue, 4 Feb 2025 10:30:12 +0000 Subject: [PATCH 236/310] implement alternativeLabales in MADS --- src/transformers/loc/common.py | 4 + src/transformers/loc/concepts_transformer.py | 11 +- src/transformers/loc/locations_transformer.py | 10 +- src/transformers/loc/mads/raw_concept.py | 21 ++ src/transformers/loc/names_transformer.py | 3 + src/transformers/loc/skos/raw_concept.py | 4 + tests/fixtures/mads_composite_name.json | 202 ++++++++++++++++++ .../loc/mads/test_raw_mads_concept.py | 11 + 8 files changed, 260 insertions(+), 6 deletions(-) create mode 100644 tests/fixtures/mads_composite_name.json diff --git a/src/transformers/loc/common.py b/src/transformers/loc/common.py index 98224c0a0d..df0fb68ef8 100644 --- a/src/transformers/loc/common.py +++ b/src/transformers/loc/common.py @@ -36,6 +36,10 @@ def source(self) -> Literal["lc-subjects", "lc-names"]: raise ValueError("Unknown concept type.") + @property + def alternative_labels(self) -> list[str]: + raise NotImplementedError + @property def label(self) -> str: raise NotImplementedError diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index a647e342b0..aa36729c69 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -4,7 +4,7 @@ from models.graph_node import SourceConcept from sources.gzip_source import GZipSource from transformers.base_transformer import BaseTransformer -from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept +from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept class LibraryOfCongressConceptsTransformer(BaseTransformer): @@ -12,7 +12,7 @@ def __init__(self, url: str): self.source = GZipSource(url) def transform_node(self, raw_node: dict) -> SourceConcept | None: - raw_concept = RawLibraryOfCongressSKOSConcept(raw_node) + raw_concept = RawLibraryOfCongressMADSConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: return None @@ -27,7 +27,7 @@ def transform_node(self, raw_node: dict) -> SourceConcept | None: def extract_edges( self, raw_node: dict ) -> Generator[SourceConceptNarrowerThan | SourceConceptRelatedTo]: - raw_concept = RawLibraryOfCongressSKOSConcept(raw_node) + raw_concept = RawLibraryOfCongressMADSConcept(raw_node) if raw_concept.exclude() or raw_concept.is_geographic: return @@ -37,6 +37,11 @@ def extract_edges( from_id=raw_concept.source_id, to_id=broader_id ) + for narrower_id in raw_concept.narrower_concept_ids: + yield SourceConceptNarrowerThan( + from_id=narrower_id, to_id=raw_concept.source_id + ) + for related_id in raw_concept.related_concept_ids: yield SourceConceptRelatedTo( from_id=raw_concept.source_id, to_id=related_id diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index afbc2c0a95..dc873e0c2a 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -4,7 +4,7 @@ from models.graph_node import SourceLocation from 
sources.gzip_source import MultiGZipSource from transformers.base_transformer import BaseTransformer -from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept +from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept class LibraryOfCongressLocationsTransformer(BaseTransformer): @@ -12,7 +12,7 @@ def __init__(self, subject_headings_url: str, names_url: str): self.source = MultiGZipSource([subject_headings_url, names_url]) def transform_node(self, raw_node: dict) -> SourceLocation | None: - raw_concept = RawLibraryOfCongressSKOSConcept(raw_node) + raw_concept = RawLibraryOfCongressMADSConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: return None @@ -27,7 +27,7 @@ def transform_node(self, raw_node: dict) -> SourceLocation | None: def extract_edges( self, raw_node: dict ) -> Generator[SourceConceptNarrowerThan | SourceConceptRelatedTo]: - raw_concept = RawLibraryOfCongressSKOSConcept(raw_node) + raw_concept = RawLibraryOfCongressMADSConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: return @@ -36,6 +36,10 @@ def extract_edges( yield SourceConceptNarrowerThan( from_id=raw_concept.source_id, to_id=broader_id ) + for narrower_id in raw_concept.narrower_concept_ids: + yield SourceConceptNarrowerThan( + from_id=narrower_id, to_id=raw_concept.source_id + ) for related_id in raw_concept.related_concept_ids: yield SourceConceptRelatedTo( diff --git a/src/transformers/loc/mads/raw_concept.py b/src/transformers/loc/mads/raw_concept.py index b2b2c6f597..ffbcd4bef1 100644 --- a/src/transformers/loc/mads/raw_concept.py +++ b/src/transformers/loc/mads/raw_concept.py @@ -86,6 +86,27 @@ def related_concept_ids(self) -> list[str]: ] ) + @property + def alternative_labels(self) -> list[str]: + """Returns a list of alternative labels for the concept.""" + assert self._raw_concept_node is not None + + raw_alternative_identifiers = [ + entry["@id"] + for entry in self._raw_concept_node.get("madsrdf:hasVariant", []) + ] + if raw_alternative_identifiers: + identifier_lookup = { + n["@id"]: n["madsrdf:variantLabel"]["@value"] + for n in self.raw_concept.get("@graph", []) + if "madsrdf:Variant" in n["@type"] + } + return [ + identifier_lookup[identifier] + for identifier in raw_alternative_identifiers + ] + return [] + def _filter_irrelevant_ids(ids: list[str]) -> list[str]: return [concept_id for concept_id in ids if not concept_id.startswith("_:n")] diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index 903a69f419..f0e2405a15 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -25,6 +25,9 @@ def transform_node(self, raw_node: dict) -> SourceName | None: ) def extract_edges(self, raw_node: dict) -> Generator[SourceConceptRelatedTo]: + # Although there are some broader and narrower relationships specified + # in the MADS instance of the names export, there are not many that are likely + # to be relevant, so the names transformer uses the lighter-weight SKOS export. 
raw_concept = RawLibraryOfCongressSKOSConcept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: diff --git a/src/transformers/loc/skos/raw_concept.py b/src/transformers/loc/skos/raw_concept.py index a56ee331dc..6ad2ef3f56 100644 --- a/src/transformers/loc/skos/raw_concept.py +++ b/src/transformers/loc/skos/raw_concept.py @@ -71,6 +71,10 @@ def broader_concept_ids(self) -> list[str]: sko_link_type = "broader" return self.linked_concepts_ids(sko_link_type) + @property + def narrower_concept_ids(self) -> list[str]: + return [] + @property def related_concept_ids(self) -> list[str]: """Returns a list of IDs representing concepts which are related to the current concept.""" diff --git a/tests/fixtures/mads_composite_name.json b/tests/fixtures/mads_composite_name.json new file mode 100644 index 0000000000..d0eed5a2da --- /dev/null +++ b/tests/fixtures/mads_composite_name.json @@ -0,0 +1,202 @@ +{ + "@context": "http://v3/authorities/names/context.json", + "@graph": [ + { + "@id": "http://id.loc.gov/authorities/names/nr99036560", + "@type": [ + "madsrdf:Authority", + "madsrdf:NameTitle" + ], + "identifiers:lccn": "nr 99036560", + "madsrdf:adminMetadata": [ + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab1" + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab2" + } + ], + "madsrdf:authoritativeLabel": "Plutarch. Bioi Themistokleous kai Kamillou", + "madsrdf:componentList": { + "@list": [ + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab3" + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab6" + } + ] + }, + "madsrdf:hasExactExternalAuthority": { + "@id": "http://viaf.org/viaf/sourceID/LC%7Cnr+99036560#skos:Concept" + }, + "madsrdf:hasNarrowerAuthority": { + "@id": "http://id.loc.gov/authorities/names/nr99036558" + }, + "madsrdf:hasSource": [ + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab11" + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab12" + } + ], + "madsrdf:isMemberOfMADSCollection": [ + { + "@id": "http://id.loc.gov/authorities/names/collection_FRBRWork" + }, + { + "@id": "http://id.loc.gov/authorities/names/collection_LCNAF" + }, + { + "@id": "http://id.loc.gov/authorities/names/collection_NamesAuthorizedHeadings" + } + ], + "madsrdf:isMemberOfMADSScheme": { + "@id": "http://id.loc.gov/authorities/names" + } + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab1", + "@type": "ri:RecordInfo", + "ri:languageOfCataloging": { + "@id": "http://id.loc.gov/vocabulary/iso639-2/eng" + }, + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "1999-11-18T00:00:00" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/upb" + }, + "ri:recordStatus": "new" + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab2", + "@type": "ri:RecordInfo", + "ri:languageOfCataloging": { + "@id": "http://id.loc.gov/vocabulary/iso639-2/eng" + }, + "ri:recordChangeDate": { + "@type": "xsd:dateTime", + "@value": "1999-12-03T01:36:43" + }, + "ri:recordContentSource": { + "@id": "http://id.loc.gov/vocabulary/organizations/upb" + }, + "ri:recordStatus": "revised" + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab3", + "@type": [ + "madsrdf:Authority", + "madsrdf:PersonalName" + ], + "madsrdf:authoritativeLabel": "Plutarch.", + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab4" + } + ] + } + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab4", + "@type": "madsrdf:FullNameElement", + "madsrdf:elementValue": "Plutarch." 
+ }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab6", + "@type": [ + "madsrdf:Authority", + "madsrdf:Title" + ], + "madsrdf:authoritativeLabel": "Bioi Themistokleous kai Kamillou", + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab7" + } + ] + } + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab7", + "@type": "madsrdf:TitleElement", + "madsrdf:elementValue": "Bioi Themistokleous kai Kamillou" + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab11", + "@type": "madsrdf:Source", + "madsrdf:citationNote": "(hdg. for Italian & Greek ed.: Plutarch. Bioi Themistokleous kai Kamillou. Italian & Greek)", + "madsrdf:citationSource": "LC in NAF, Nov. 18, 1999", + "madsrdf:citationStatus": "found" + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab12", + "@type": "madsrdf:Source", + "madsrdf:citationSource": "Life of Themistocles, 1998", + "madsrdf:citationStatus": "found" + }, + { + "@id": "http://id.loc.gov/authorities/names/nr99036558", + "@type": [ + "madsrdf:Authority", + "madsrdf:NameTitle" + ], + "madsrdf:authoritativeLabel": "Plutarch. Themistocles", + "madsrdf:componentList": { + "@list": [ + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab13" + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab16" + } + ] + } + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab13", + "@type": [ + "madsrdf:Authority", + "madsrdf:PersonalName" + ], + "madsrdf:authoritativeLabel": "Plutarch.", + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab14" + } + ] + } + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab14", + "@type": "madsrdf:FullNameElement", + "madsrdf:elementValue": "Plutarch." + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab16", + "@type": [ + "madsrdf:Authority", + "madsrdf:Title" + ], + "madsrdf:authoritativeLabel": "Themistocles", + "madsrdf:elementList": { + "@list": [ + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab17" + } + ] + } + }, + { + "@id": "_:n5a7585822cbc434d8cb684ef6244012ab17", + "@type": "madsrdf:TitleElement", + "madsrdf:elementValue": "Themistocles" + } + ], + "@id": "/authorities/names/nr99036560" +} diff --git a/tests/transformers/loc/mads/test_raw_mads_concept.py b/tests/transformers/loc/mads/test_raw_mads_concept.py index d9a12a8f2b..b2b3240f00 100644 --- a/tests/transformers/loc/mads/test_raw_mads_concept.py +++ b/tests/transformers/loc/mads/test_raw_mads_concept.py @@ -222,3 +222,14 @@ def test_get_narrowers_from_both(self) -> None: "sh85098685", "sh99001366", } + + +def test_alternative_labels() -> None: + concept = RawLibraryOfCongressMADSConcept( + json.loads(load_fixture("mads_related_concept.json")) + ) + assert set(concept.alternative_labels) == { + "Loop blocking (Computer science)", + "Blocking, Loop (Computer science)", + "Tiling, Loop (Computer science)", + } From 45b238313552e70dd4404c23ce9055c42797b87c Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Tue, 4 Feb 2025 10:31:02 +0000 Subject: [PATCH 237/310] use mads for subject headings --- src/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config.py b/src/config.py index 486556df30..ffcbf67292 100644 --- a/src/config.py +++ b/src/config.py @@ -4,7 +4,7 @@ GRAPH_QUERIES_SNS_TOPIC_ARN = os.environ.get("GRAPH_QUERIES_SNS_TOPIC_ARN") LOC_SUBJECT_HEADINGS_URL = ( - "https://id.loc.gov/download/authorities/subjects.skosrdf.jsonld.gz" + "https://id.loc.gov/download/authorities/subjects.madsrdf.jsonld.gz" ) LOC_NAMES_URL = 
"https://id.loc.gov/download/authorities/names.skosrdf.jsonld.gz" MESH_URL = "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2025.gz" From 22cbcc7c12a662f578bb2df87deec4f360d92555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 4 Feb 2025 11:45:32 +0000 Subject: [PATCH 238/310] Expand Wikidata tests --- .../test_wikidata_concepts_transformer.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index fb9d575fbe..11689127ca 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -1,5 +1,6 @@ import json import math +import pytest from test_utils import load_fixture from test_wikidata_concepts_source import ( @@ -76,3 +77,40 @@ def test_wikidata_raw_name() -> None: assert raw_name.place_of_birth == "Queens" assert raw_name.label == "Walter McCaffrey" assert raw_name.description == "American politician" + + +def test_wikidata_raw_location_invalid_coordinates() -> None: + raw_location = RawWikidataLocation({}) + assert raw_location.latitude is None + assert raw_location.longitude is None + + raw_location = RawWikidataLocation({"type": "uri", "value": "some-url"}) + assert raw_location.latitude is None + assert raw_location.longitude is None + + raw_location = RawWikidataLocation( + { + "item": {"type": "uri", "value": "some-id"}, + "coordinates": {"type": "literal", "value": "invalid value"}, + } + ) + with pytest.raises(AssertionError): + _ = raw_location.latitude + + with pytest.raises(AssertionError): + _ = raw_location.longitude + + +def test_wikidata_raw_name_invalid_dates() -> None: + raw_name = RawWikidataName( + {"dateOfBirth": {"type": "literal", "value": "+0000-00-00T00:00:00Z"}} + ) + assert raw_name.date_of_birth is None + + raw_name = RawWikidataName({"dateOfBirth": {"type": "uri", "value": "some-uri"}}) + assert raw_name.date_of_birth is None + + raw_name = RawWikidataName( + {"dateOfBirth": {"type": "literal", "value": "https://some-url"}} + ) + assert raw_name.date_of_birth is None From 6d7fe6594294b8ac1713e377327a54e6f91ff4d1 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Tue, 4 Feb 2025 11:46:32 +0000 Subject: [PATCH 239/310] Apply auto-formatting rules --- tests/transformers/test_wikidata_concepts_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index 11689127ca..da6f4ce5d5 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -1,7 +1,7 @@ import json import math -import pytest +import pytest from test_utils import load_fixture from test_wikidata_concepts_source import ( _add_mock_loc_transformer_outputs, From 5df565cebfc3091c8fd092d822f40253d490d488 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Tue, 4 Feb 2025 14:34:00 +0000 Subject: [PATCH 240/310] Add functionality to fetch from s3 --- src/transformers/catalogue/raw_concept.py | 2 +- src/utils/aws.py | 30 +++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index 139e1418e1..e426cbda53 100644 --- a/src/transformers/catalogue/raw_concept.py +++ 
b/src/transformers/catalogue/raw_concept.py @@ -58,7 +58,7 @@ def type(self) -> ConceptType: return concept_type @property - def raw_identifier(self) -> dict | None: + def raw_identifier(self) -> dict: """Returns metadata about the source identifier.""" identifier_metadata = self.raw_concept.get("identifiers", []) # There should be exactly one source identifier for each concept diff --git a/src/utils/aws.py b/src/utils/aws.py index f29a867c4a..a42e50d34e 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -1,14 +1,25 @@ import json +from collections.abc import Generator +from typing import Any, Literal import boto3 +from functools import lru_cache + +import csv +import smart_open from clients.base_neptune_client import BaseNeptuneClient from clients.lambda_neptune_client import LambdaNeptuneClient from clients.local_neptune_client import LocalNeptuneClient +from config import S3_BULK_LOAD_BUCKET_NAME + LOAD_BALANCER_SECRET_NAME = "NeptuneTest/LoadBalancerUrl" INSTANCE_ENDPOINT_SECRET_NAME = "NeptuneTest/InstanceEndpoint" +NodeType = Literal["concepts", "names", "locations"] +OntologyType = Literal["mesh", "loc", "wikidata_linked_mesh_concepts", "wikidata_linked_loc_concepts"] + def get_secret(secret_name: str) -> str: """Returns an AWS Secrets Manager secret string associated with a given secret name.""" @@ -52,3 +63,22 @@ def get_neptune_client(is_local: bool) -> BaseNeptuneClient: ) else: return LambdaNeptuneClient(get_secret(INSTANCE_ENDPOINT_SECRET_NAME)) + + +@lru_cache +def fetch_from_s3(node_type: NodeType, source: OntologyType) -> Generator[Any]: + """Retrieves the bulk load file outputted by the relevant transformer so that we can extract data from it.""" + linked_nodes_file_name = f"{source}_{node_type}__nodes.csv" + s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" + + print(f"Retrieving ids of type '{node_type}' from ontology '{source}' from S3.") + + transport_params = {"client": boto3.client("s3")} + with smart_open.open(s3_url, "r", transport_params=transport_params) as f: + csv_reader = csv.reader(f) + + for i, row in enumerate(csv_reader): + # Skip header + if i == 0: + continue + yield row From 702dd45997fab097f1b561e0a297aeedde04df31 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Tue, 4 Feb 2025 14:34:16 +0000 Subject: [PATCH 241/310] Add ontology lookup --- .../linked_ontology_id_type_checker.py | 30 ++----------------- .../catalogue/id_label_checker.py | 28 +++++++++++++++++ 2 files changed, 31 insertions(+), 27 deletions(-) create mode 100644 src/transformers/catalogue/id_label_checker.py diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index 90f52ad315..7381e99307 100644 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -1,12 +1,4 @@ -import os -from functools import lru_cache - -import boto3 -import smart_open - -from config import S3_BULK_LOAD_BUCKET_NAME - -from .sparql_query_builder import NodeType, OntologyType +from utils.aws import fetch_from_s3, NodeType, OntologyType class LinkedOntologyIdTypeChecker: @@ -25,28 +17,12 @@ def __init__(self, node_type: NodeType, linked_ontology: OntologyType): linked_ontology != "mesh" ), "Invalid node_type for ontology type MeSH." 
- @lru_cache def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: """Return all ids classified under a given `node_type` for the selected ontology.""" # Retrieve the bulk load file outputted by the relevant transformer so that we can extract ids from it. - linked_nodes_file_name = f"{self.linked_ontology}_{node_type}__nodes.csv" - s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" - - print( - f"Retrieving ids of type '{node_type}' from ontology '{self.linked_ontology}' from S3.", - end=" ", - flush=True, - ) - ids = set() - transport_params = {"client": boto3.client("s3")} - with smart_open.open(s3_url, "r", transport_params=transport_params) as f: - # Loop through all items in the file and extract the id from each item - for i, line in enumerate(f): - # Skip header - if i == 0: - continue - ids.add(line.split(",")[0]) + for row in fetch_from_s3(node_type, self.linked_ontology): + ids.add(row[0]) print(f"({len(ids)} ids retrieved.)") diff --git a/src/transformers/catalogue/id_label_checker.py b/src/transformers/catalogue/id_label_checker.py new file mode 100644 index 0000000000..66cb3d2982 --- /dev/null +++ b/src/transformers/catalogue/id_label_checker.py @@ -0,0 +1,28 @@ +from typing import Any + +from utils.aws import fetch_from_s3, NodeType, OntologyType + + +class IdLabelChecker(dict): + """ + A bidirectional dictionary for checking catalogue concepts against data from source ontologies. + """ + + def __init__(self, *args: Any, **kwargs: Any): + super().__init__(*args, **kwargs) + self.inverse: dict = {} + for key, value in self.items(): + self.inverse.setdefault(value, []).append(key) + + @classmethod + def from_source(cls, node_type: NodeType, source: OntologyType) -> dict: + """Fetch source node data from s3 bulk upload files and create ID-label mapping.""" + id_label_dict = {} + + for row in fetch_from_s3(node_type, source): + # Extract source id and label at position 0 and 3, respectively + id_label_dict[row[0]] = row[3] + + print(f"({len(id_label_dict)} ids and labels retrieved.)") + + return cls(**id_label_dict) From edf3a177541d4623444c8c703b696c35ca719db9 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Tue, 4 Feb 2025 14:35:43 +0000 Subject: [PATCH 242/310] Apply auto-formatting rules --- src/models/graph_edge.py | 1 + .../wikidata/linked_ontology_id_type_checker.py | 2 +- src/transformers/catalogue/id_label_checker.py | 6 +++--- src/transformers/catalogue/raw_concept.py | 4 ++-- src/utils/aws.py | 12 ++++++------ tests/test_extractor.py | 4 ++-- 6 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/models/graph_edge.py b/src/models/graph_edge.py index fa92aaaaa9..7ad3dae82d 100644 --- a/src/models/graph_edge.py +++ b/src/models/graph_edge.py @@ -38,6 +38,7 @@ class SourceConceptHasParent(BaseEdge): relationship: str = "HAS_PARENT" directed: bool = True + class ConceptHasSourceConcept(BaseEdge): from_type: str = "Concept" to_type: str = "SourceConcept" diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index 7381e99307..024154d7ae 100644 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -1,4 +1,4 @@ -from utils.aws import fetch_from_s3, NodeType, OntologyType +from utils.aws import NodeType, OntologyType, fetch_from_s3 class LinkedOntologyIdTypeChecker: diff --git a/src/transformers/catalogue/id_label_checker.py 
b/src/transformers/catalogue/id_label_checker.py index 66cb3d2982..89e1f13f37 100644 --- a/src/transformers/catalogue/id_label_checker.py +++ b/src/transformers/catalogue/id_label_checker.py @@ -1,6 +1,6 @@ from typing import Any -from utils.aws import fetch_from_s3, NodeType, OntologyType +from utils.aws import NodeType, OntologyType, fetch_from_s3 class IdLabelChecker(dict): @@ -20,8 +20,8 @@ def from_source(cls, node_type: NodeType, source: OntologyType) -> dict: id_label_dict = {} for row in fetch_from_s3(node_type, source): - # Extract source id and label at position 0 and 3, respectively - id_label_dict[row[0]] = row[3] + # Extract source id and label at position 0 and 3, respectively + id_label_dict[row[0]] = row[3] print(f"({len(id_label_dict)} ids and labels retrieved.)") diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index e426cbda53..3159dfc0cc 100644 --- a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -73,12 +73,12 @@ def source(self) -> ConceptSource: """Returns the concept source (one of "lc-names", "label-derived", etc.).""" source: ConceptSource = self.raw_identifier["identifierType"]["id"] return source - + @property def mesh_qualifier(self) -> str | None: """Returns MeSH qualifier ID, if present.""" if self.source == "nlm-mesh": - qualifier = re.search(r'Q\d+', self.raw_identifier.get("value", "")) + qualifier = re.search(r"Q\d+", self.raw_identifier.get("value", "")) if qualifier is not None: return qualifier.group() diff --git a/src/utils/aws.py b/src/utils/aws.py index a42e50d34e..1f1ba58e0a 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -1,24 +1,24 @@ +import csv import json from collections.abc import Generator +from functools import lru_cache from typing import Any, Literal import boto3 -from functools import lru_cache - -import csv import smart_open from clients.base_neptune_client import BaseNeptuneClient from clients.lambda_neptune_client import LambdaNeptuneClient from clients.local_neptune_client import LocalNeptuneClient - from config import S3_BULK_LOAD_BUCKET_NAME LOAD_BALANCER_SECRET_NAME = "NeptuneTest/LoadBalancerUrl" INSTANCE_ENDPOINT_SECRET_NAME = "NeptuneTest/InstanceEndpoint" NodeType = Literal["concepts", "names", "locations"] -OntologyType = Literal["mesh", "loc", "wikidata_linked_mesh_concepts", "wikidata_linked_loc_concepts"] +OntologyType = Literal[ + "mesh", "loc", "wikidata_linked_mesh_concepts", "wikidata_linked_loc_concepts" +] def get_secret(secret_name: str) -> str: @@ -76,7 +76,7 @@ def fetch_from_s3(node_type: NodeType, source: OntologyType) -> Generator[Any]: transport_params = {"client": boto3.client("s3")} with smart_open.open(s3_url, "r", transport_params=transport_params) as f: csv_reader = csv.reader(f) - + for i, row in enumerate(csv_reader): # Skip header if i == 0: diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 5117787863..8c48da7edb 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -6,11 +6,11 @@ from typing_extensions import get_args from config import ( + CATALOGUE_SNAPSHOT_URL, LOC_NAMES_URL, LOC_SUBJECT_HEADINGS_URL, MESH_URL, WIKIDATA_SPARQL_URL, - CATALOGUE_SNAPSHOT_URL ) from extractor import LambdaEvent, lambda_handler from transformers.base_transformer import EntityType, StreamDestination @@ -156,7 +156,7 @@ def test_lambda_handler( "wikidata_linked_loc_locations": [WIKIDATA_SPARQL_URL], "wikidata_linked_mesh_concepts": [WIKIDATA_SPARQL_URL], 
"wikidata_linked_mesh_locations": [WIKIDATA_SPARQL_URL], - "catalogue_concepts": [CATALOGUE_SNAPSHOT_URL] + "catalogue_concepts": [CATALOGUE_SNAPSHOT_URL], } assert transformer_type in transformer_types From 7c9335c3a6247a78117d2182fb1c900ca1f347c6 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Tue, 4 Feb 2025 14:49:58 +0000 Subject: [PATCH 243/310] Cover reading of ndjson files with tests --- src/sources/gzip_source.py | 6 +- src/transformers/loc/common.py | 92 ----- src/transformers/loc/concepts_transformer.py | 2 +- src/transformers/loc/locations_transformer.py | 6 +- src/transformers/loc/mads/__init__.py | 0 src/transformers/loc/mads/raw_concept.py | 122 ------- src/transformers/loc/names_transformer.py | 2 +- src/transformers/loc/raw_concept.py | 317 ++++++++++++++++++ src/transformers/loc/skos/__init__.py | 0 src/transformers/loc/skos/raw_concept.py | 97 ------ tests/transformers/loc/test_concepts.py | 103 ++++++ tests/transformers/loc/test_raw_concept.py | 4 +- .../loc/{mads => }/test_raw_mads_concept.py | 2 +- .../loc/{skos => }/test_raw_skos_concept.py | 2 +- 14 files changed, 433 insertions(+), 322 deletions(-) delete mode 100644 src/transformers/loc/common.py delete mode 100644 src/transformers/loc/mads/__init__.py delete mode 100644 src/transformers/loc/mads/raw_concept.py create mode 100644 src/transformers/loc/raw_concept.py delete mode 100644 src/transformers/loc/skos/__init__.py delete mode 100644 src/transformers/loc/skos/raw_concept.py create mode 100644 tests/transformers/loc/test_concepts.py rename tests/transformers/loc/{mads => }/test_raw_mads_concept.py (99%) rename tests/transformers/loc/{skos => }/test_raw_skos_concept.py (96%) diff --git a/src/sources/gzip_source.py b/src/sources/gzip_source.py index 59755a9366..44e3ebd750 100644 --- a/src/sources/gzip_source.py +++ b/src/sources/gzip_source.py @@ -1,6 +1,7 @@ import gzip import json from collections.abc import Generator +from json import JSONDecodeError import requests @@ -16,7 +17,10 @@ def stream_raw(self) -> Generator[dict]: with gzip.GzipFile(fileobj=response.raw) as file: for line_bytes in file: - yield json.loads(line_bytes.decode("utf8")) + try: + yield json.loads(line_bytes.decode("utf8")) + except JSONDecodeError as e: + print(e) class MultiGZipSource(BaseSource): diff --git a/src/transformers/loc/common.py b/src/transformers/loc/common.py deleted file mode 100644 index df0fb68ef8..0000000000 --- a/src/transformers/loc/common.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import Literal - -ID_PREFIXES_TO_REMOVE = ( - "/authorities/subjects/", - "http://id.loc.gov/authorities/subjects/", - "/authorities/names/", - "http://id.loc.gov/authorities/names/", -) - - -def remove_id_prefix(raw_id: str) -> str: - for prefix in ID_PREFIXES_TO_REMOVE: - raw_id = raw_id.removeprefix(prefix) - return raw_id - - -class RawLibraryOfCongressConcept: - def __init__(self, raw_concept: dict): - self.raw_concept = raw_concept - self._raw_concept_node = self._extract_concept_node() - - def _extract_concept_node(self) -> dict | None: - raise NotImplementedError - - @property - def source_id(self) -> str: - return remove_id_prefix(self.raw_concept["@id"]) - - @property - def source(self) -> Literal["lc-subjects", "lc-names"]: - if "subjects" in self.raw_concept["@id"]: - return "lc-subjects" - - if "names" in self.raw_concept["@id"]: - return "lc-names" - - raise ValueError("Unknown concept type.") - - @property - def alternative_labels(self) -> list[str]: - raise NotImplementedError - - @property - def label(self) -> str: - 
raise NotImplementedError - - @property - def is_geographic(self) -> bool: - """Returns True if the node represents a geographic concept""" - raise NotImplementedError - - @property - def broader_concept_ids(self) -> list[str]: - """Returns a list of IDs representing concepts which are broader than the current concept.""" - raise NotImplementedError - - @property - def narrower_concept_ids(self) -> list[str]: - """Returns a list of IDs representing concepts which are narrower than the current concept.""" - raise NotImplementedError - - @property - def related_concept_ids(self) -> list[str]: - """Returns a list of IDs representing concepts which are related to the current concept.""" - raise NotImplementedError - - @staticmethod - def _extract_label(raw_label: str | dict[str, str] | list[str]) -> str: - # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. - if isinstance(raw_label, str): - return raw_label - - # In cases where an LoC Name has multiple labels written using different writing systems, labels are returned - # as a list. When this happens, we extract the first item in the list, which always stores the Latin script - # version of the label as a string. - if isinstance(raw_label, list): - assert isinstance(raw_label[0], str) - return raw_label[0] - - return raw_label["@value"] - - def exclude(self) -> bool: - """Returns True if the concept should be excluded from the graph.""" - if self._raw_concept_node is None: - return True - - # Remove concepts whose IDs have the "-781" suffix. They are duplicates of concepts with non-suffixed IDs. - # The suffix represents the fact that the concept in question is part of the LCSH - Geographic collection. - if self.source_id.endswith("-781"): - return True - - return False diff --git a/src/transformers/loc/concepts_transformer.py b/src/transformers/loc/concepts_transformer.py index aa36729c69..e40db9061b 100644 --- a/src/transformers/loc/concepts_transformer.py +++ b/src/transformers/loc/concepts_transformer.py @@ -4,7 +4,7 @@ from models.graph_node import SourceConcept from sources.gzip_source import GZipSource from transformers.base_transformer import BaseTransformer -from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept +from transformers.loc.raw_concept import RawLibraryOfCongressMADSConcept class LibraryOfCongressConceptsTransformer(BaseTransformer): diff --git a/src/transformers/loc/locations_transformer.py b/src/transformers/loc/locations_transformer.py index dc873e0c2a..1de588ba70 100644 --- a/src/transformers/loc/locations_transformer.py +++ b/src/transformers/loc/locations_transformer.py @@ -4,7 +4,7 @@ from models.graph_node import SourceLocation from sources.gzip_source import MultiGZipSource from transformers.base_transformer import BaseTransformer -from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept +from transformers.loc.raw_concept import raw_loc_concept class LibraryOfCongressLocationsTransformer(BaseTransformer): @@ -12,7 +12,7 @@ def __init__(self, subject_headings_url: str, names_url: str): self.source = MultiGZipSource([subject_headings_url, names_url]) def transform_node(self, raw_node: dict) -> SourceLocation | None: - raw_concept = RawLibraryOfCongressMADSConcept(raw_node) + raw_concept = raw_loc_concept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: return None @@ -27,7 +27,7 @@ def transform_node(self, raw_node: dict) -> SourceLocation | None: def extract_edges( self, raw_node: dict ) -> 
Generator[SourceConceptNarrowerThan | SourceConceptRelatedTo]: - raw_concept = RawLibraryOfCongressMADSConcept(raw_node) + raw_concept = raw_loc_concept(raw_node) if raw_concept.exclude() or not raw_concept.is_geographic: return diff --git a/src/transformers/loc/mads/__init__.py b/src/transformers/loc/mads/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/transformers/loc/mads/raw_concept.py b/src/transformers/loc/mads/raw_concept.py deleted file mode 100644 index ffbcd4bef1..0000000000 --- a/src/transformers/loc/mads/raw_concept.py +++ /dev/null @@ -1,122 +0,0 @@ -from transformers.loc.common import RawLibraryOfCongressConcept, remove_id_prefix - - -class RawLibraryOfCongressMADSConcept(RawLibraryOfCongressConcept): - def __init__(self, raw_concept: dict): - super().__init__(raw_concept) - - def _extract_concept_node(self) -> dict | None: - graph: list[dict] = self.raw_concept.get("@graph", []) - for node in graph: - # madsrdf:Authority corresponds to the "idea or notion" - # So the node we are after is the one whose id matches, and is an Authority - # Ignore DeprecatedAuthority in this context, as they are to be excluded. - # https://www.loc.gov/standards/mads/rdf/#t21 - if ( - self.source_id in node.get("@id", "") - and "madsrdf:Authority" in node["@type"] - ): - return node - return None - - @property - def label(self) -> str: - assert self._raw_concept_node is not None - raw_preferred_label = self._raw_concept_node["madsrdf:authoritativeLabel"] - return self._extract_label(raw_preferred_label) - - @property - def is_geographic(self) -> bool: - assert self._raw_concept_node is not None - """Returns True if the node represents a geographic concept, as determined by @type""" - return "madsrdf:Geographic" in self._raw_concept_node.get("@type", []) - - @property - def broader_concept_ids(self) -> list[str]: - assert self._raw_concept_node is not None - return _filter_irrelevant_ids( - [ - remove_id_prefix(broader["@id"]) - for broader in _as_list( - self._raw_concept_node.get("madsrdf:hasBroaderAuthority", []) - ) - ] - ) - - @property - def narrower_concept_ids(self) -> list[str]: - return ( - self._narrowers_from_narrower_authority() - + self._narrowers_from_component_list() - ) - - def _narrowers_from_component_list(self) -> list[str]: - assert self._raw_concept_node is not None - return _filter_irrelevant_ids( - [ - remove_id_prefix(broader["@id"]) - for broader in _as_list( - self._raw_concept_node.get("madsrdf:componentList", {}).get( - "@list", [] - ) - ) - ] - ) - - def _narrowers_from_narrower_authority(self) -> list[str]: - assert self._raw_concept_node is not None - return _filter_irrelevant_ids( - [ - remove_id_prefix(broader["@id"]) - for broader in _as_list( - self._raw_concept_node.get("madsrdf:hasNarrowerAuthority", []) - ) - ] - ) - - @property - def related_concept_ids(self) -> list[str]: - assert self._raw_concept_node is not None - return _filter_irrelevant_ids( - [ - remove_id_prefix(broader["@id"]) - for broader in _as_list( - self._raw_concept_node.get("madsrdf:hasReciprocalAuthority", []) - ) - ] - ) - - @property - def alternative_labels(self) -> list[str]: - """Returns a list of alternative labels for the concept.""" - assert self._raw_concept_node is not None - - raw_alternative_identifiers = [ - entry["@id"] - for entry in self._raw_concept_node.get("madsrdf:hasVariant", []) - ] - if raw_alternative_identifiers: - identifier_lookup = { - n["@id"]: n["madsrdf:variantLabel"]["@value"] - for n in self.raw_concept.get("@graph", []) - 
if "madsrdf:Variant" in n["@type"] - } - return [ - identifier_lookup[identifier] - for identifier in raw_alternative_identifiers - ] - return [] - - -def _filter_irrelevant_ids(ids: list[str]) -> list[str]: - return [concept_id for concept_id in ids if not concept_id.startswith("_:n")] - - -def _as_list(dict_or_list: dict | list[dict]) -> list[dict]: - # Some fields in the source data may contain one or more values - # When it contains multiple values, it will be a list, - # but in the case where they contain just one value, it is not. - # Wrap bare single values in a list, for consistency of processing downstream - if isinstance(dict_or_list, dict): - return [dict_or_list] - return dict_or_list diff --git a/src/transformers/loc/names_transformer.py b/src/transformers/loc/names_transformer.py index f0e2405a15..a8828524bf 100644 --- a/src/transformers/loc/names_transformer.py +++ b/src/transformers/loc/names_transformer.py @@ -4,7 +4,7 @@ from models.graph_node import SourceName from sources.gzip_source import GZipSource from transformers.base_transformer import BaseTransformer -from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept +from transformers.loc.raw_concept import RawLibraryOfCongressSKOSConcept class LibraryOfCongressNamesTransformer(BaseTransformer): diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py new file mode 100644 index 0000000000..022444e58a --- /dev/null +++ b/src/transformers/loc/raw_concept.py @@ -0,0 +1,317 @@ +from typing import Literal + +ID_PREFIXES_TO_REMOVE = ( + "/authorities/subjects/", + "http://id.loc.gov/authorities/subjects/", + "/authorities/names/", + "http://id.loc.gov/authorities/names/", +) + + +def remove_id_prefix(raw_id: str) -> str: + for prefix in ID_PREFIXES_TO_REMOVE: + raw_id = raw_id.removeprefix(prefix) + return raw_id + + +class RawLibraryOfCongressConcept: + def __init__(self, raw_concept: dict): + self.raw_concept = raw_concept + self._raw_concept_node = self._extract_concept_node() + + def _extract_concept_node(self) -> dict | None: + raise NotImplementedError + + @property + def source_id(self) -> str: + return remove_id_prefix(self.raw_concept["@id"]) + + @property + def source(self) -> Literal["lc-subjects", "lc-names"]: + if "subjects" in self.raw_concept["@id"]: + return "lc-subjects" + + if "names" in self.raw_concept["@id"]: + return "lc-names" + + raise ValueError("Unknown concept type.") + + @property + def alternative_labels(self) -> list[str]: + raise NotImplementedError + + @property + def label(self) -> str: + raise NotImplementedError + + @property + def is_geographic(self) -> bool: + """Returns True if the node represents a geographic concept""" + raise NotImplementedError + + @property + def broader_concept_ids(self) -> list[str]: + """Returns a list of IDs representing concepts which are broader than the current concept.""" + raise NotImplementedError + + @property + def narrower_concept_ids(self) -> list[str]: + """Returns a list of IDs representing concepts which are narrower than the current concept.""" + raise NotImplementedError + + @property + def related_concept_ids(self) -> list[str]: + """Returns a list of IDs representing concepts which are related to the current concept.""" + raise NotImplementedError + + @staticmethod + def _extract_label(raw_label: str | dict[str, str] | list[str]) -> str: + # Labels are either stored directly as strings, or as nested JSON objects with a `@value` property. 
+ if isinstance(raw_label, str): + return raw_label + + # In cases where an LoC Name has multiple labels written using different writing systems, labels are returned + # as a list. When this happens, we extract the first item in the list, which always stores the Latin script + # version of the label as a string. + if isinstance(raw_label, list): + assert isinstance(raw_label[0], str) + return raw_label[0] + + return raw_label["@value"] + + def exclude(self) -> bool: + """Returns True if the concept should be excluded from the graph.""" + if self._raw_concept_node is None: + return True + + # Remove concepts whose IDs have the "-781" suffix. They are duplicates of concepts with non-suffixed IDs. + # The suffix represents the fact that the concept in question is part of the LCSH - Geographic collection. + if self.source_id.endswith("-781"): + return True + + return False + + +class RawLibraryOfCongressSKOSConcept(RawLibraryOfCongressConcept): + def __init__(self, raw_concept: dict): + super().__init__(raw_concept) + self._raw_concept_node = self._extract_concept_node() + + def _extract_concept_node(self) -> dict | None: + graph: list[dict] = self.raw_concept.get("@graph", []) + + # Some LoC concepts (e.g. deprecated concepts) do not store a concept node in their graph. + # When this happens, return `None` because there is no concept for us to extract. + concept_node = next( + ( + node + for node in graph + if self.source_id in node.get("@id", "") + and node["@type"] == "skos:Concept" + ), + None, + ) + + return concept_node + + @property + def label(self) -> str: + assert self._raw_concept_node is not None + + raw_preferred_label = self._raw_concept_node["skos:prefLabel"] + return self._extract_label(raw_preferred_label) + + @property + def alternative_labels(self) -> list[str]: + """Returns a list of alternative labels for the concept.""" + assert self._raw_concept_node is not None + + raw_alternative_labels = self._raw_concept_node.get("skos:altLabel", []) + + # Raw alternative labels are either returned in a list of labels, or as a single label + # in the same format as `skos:prefLabel` + if isinstance(raw_alternative_labels, list): + return [self._extract_label(item) for item in raw_alternative_labels] + + return [self._extract_label(raw_alternative_labels)] + + def linked_concepts_ids(self, sko_link: str) -> list[str]: + """Returns a list of IDs representing concepts which are linked to the current concept""" + assert self._raw_concept_node is not None + + linked_concepts = self._raw_concept_node.get(f"skos:{sko_link}", []) + + # Sometimes linked concepts are returned as a list of concepts, and sometimes as just a single JSON + if isinstance(linked_concepts, dict): + linked_concepts = [linked_concepts] + + linked_ids = [] + for concept in linked_concepts: + # Some linked concepts have IDs in the format `_:n`. + # These IDs do not exist in the LoC source files or the LoC website, so we filter them out. 
+ if concept["@id"].startswith("_:n"): + continue + + linked_ids.append(remove_id_prefix(concept["@id"])) + + return linked_ids + + @property + def broader_concept_ids(self) -> list[str]: + """Returns a list of IDs representing concepts which are broader than the current concept.""" + sko_link_type = "broader" + return self.linked_concepts_ids(sko_link_type) + + @property + def narrower_concept_ids(self) -> list[str]: + return [] + + @property + def related_concept_ids(self) -> list[str]: + """Returns a list of IDs representing concepts which are related to the current concept.""" + sko_link_type = "related" + return self.linked_concepts_ids(sko_link_type) + + @property + def is_geographic(self) -> bool: + """Returns True if the node represents a geographic concept, as determined by `skos:notation`.""" + if self._raw_concept_node is None: + return False + + # Notations are sometimes returned as a single notation (with a `@type` property, and a `@value` property), + # and sometimes as a list of notations. + notation = self._raw_concept_node.get("skos:notation", []) + if isinstance(notation, dict): + notation = [notation] + + notation_types = {item.get("@type") for item in notation} + return "http://id.loc.gov/datatypes/codes/gac" in notation_types + + +class RawLibraryOfCongressMADSConcept(RawLibraryOfCongressConcept): + def __init__(self, raw_concept: dict): + super().__init__(raw_concept) + + def _extract_concept_node(self) -> dict | None: + graph: list[dict] = self.raw_concept.get("@graph", []) + for node in graph: + # madsrdf:Authority corresponds to the "idea or notion" + # So the node we are after is the one whose id matches, and is an Authority + # Ignore DeprecatedAuthority in this context, as they are to be excluded. + # https://www.loc.gov/standards/mads/rdf/#t21 + if ( + self.source_id in node.get("@id", "") + and "madsrdf:Authority" in node["@type"] + ): + return node + return None + + @property + def label(self) -> str: + assert self._raw_concept_node is not None + raw_preferred_label = self._raw_concept_node["madsrdf:authoritativeLabel"] + return self._extract_label(raw_preferred_label) + + @property + def is_geographic(self) -> bool: + assert self._raw_concept_node is not None + """Returns True if the node represents a geographic concept, as determined by @type""" + return "madsrdf:Geographic" in self._raw_concept_node.get("@type", []) + + @property + def broader_concept_ids(self) -> list[str]: + assert self._raw_concept_node is not None + return _filter_irrelevant_ids( + [ + remove_id_prefix(broader["@id"]) + for broader in _as_list( + self._raw_concept_node.get("madsrdf:hasBroaderAuthority", []) + ) + ] + ) + + @property + def narrower_concept_ids(self) -> list[str]: + return ( + self._narrowers_from_narrower_authority() + + self._narrowers_from_component_list() + ) + + def _narrowers_from_component_list(self) -> list[str]: + assert self._raw_concept_node is not None + return _filter_irrelevant_ids( + [ + remove_id_prefix(broader["@id"]) + for broader in _as_list( + self._raw_concept_node.get("madsrdf:componentList", {}).get( + "@list", [] + ) + ) + ] + ) + + def _narrowers_from_narrower_authority(self) -> list[str]: + assert self._raw_concept_node is not None + return _filter_irrelevant_ids( + [ + remove_id_prefix(broader["@id"]) + for broader in _as_list( + self._raw_concept_node.get("madsrdf:hasNarrowerAuthority", []) + ) + ] + ) + + @property + def related_concept_ids(self) -> list[str]: + assert self._raw_concept_node is not None + return _filter_irrelevant_ids( + [ + 
remove_id_prefix(broader["@id"]) + for broader in _as_list( + self._raw_concept_node.get("madsrdf:hasReciprocalAuthority", []) + ) + ] + ) + + @property + def alternative_labels(self) -> list[str]: + """Returns a list of alternative labels for the concept.""" + assert self._raw_concept_node is not None + + raw_alternative_identifiers = [ + entry["@id"] + for entry in self._raw_concept_node.get("madsrdf:hasVariant", []) + ] + if raw_alternative_identifiers: + identifier_lookup = { + n["@id"]: n["madsrdf:variantLabel"]["@value"] + for n in self.raw_concept.get("@graph", []) + if "madsrdf:Variant" in n["@type"] + } + return [ + identifier_lookup[identifier] + for identifier in raw_alternative_identifiers + ] + return [] + + +def _filter_irrelevant_ids(ids: list[str]) -> list[str]: + return [concept_id for concept_id in ids if not concept_id.startswith("_:n")] + + +def _as_list(dict_or_list: dict | list[dict]) -> list[dict]: + # Some fields in the source data may contain one or more values + # When it contains multiple values, it will be a list, + # but in the case where they contain just one value, it is not. + # Wrap bare single values in a list, for consistency of processing downstream + if isinstance(dict_or_list, dict): + return [dict_or_list] + return dict_or_list + +def raw_loc_concept(raw_concept: dict) -> RawLibraryOfCongressConcept: + for node in raw_concept.get("@graph"): + if "skos:changeNote" in node: + return RawLibraryOfCongressSKOSConcept(raw_concept) + if "madsrdf:adminMetadata" in node: + return RawLibraryOfCongressMADSConcept(raw_concept) + raise ValueError("LoC concept data was neither MADS nor SKOS") diff --git a/src/transformers/loc/skos/__init__.py b/src/transformers/loc/skos/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/transformers/loc/skos/raw_concept.py b/src/transformers/loc/skos/raw_concept.py deleted file mode 100644 index 6ad2ef3f56..0000000000 --- a/src/transformers/loc/skos/raw_concept.py +++ /dev/null @@ -1,97 +0,0 @@ -from transformers.loc.common import RawLibraryOfCongressConcept, remove_id_prefix - - -class RawLibraryOfCongressSKOSConcept(RawLibraryOfCongressConcept): - def __init__(self, raw_concept: dict): - super().__init__(raw_concept) - self._raw_concept_node = self._extract_concept_node() - - def _extract_concept_node(self) -> dict | None: - graph: list[dict] = self.raw_concept.get("@graph", []) - - # Some LoC concepts (e.g. deprecated concepts) do not store a concept node in their graph. - # When this happens, return `None` because there is no concept for us to extract. 
- concept_node = next( - ( - node - for node in graph - if self.source_id in node.get("@id", "") - and node["@type"] == "skos:Concept" - ), - None, - ) - - return concept_node - - @property - def label(self) -> str: - assert self._raw_concept_node is not None - - raw_preferred_label = self._raw_concept_node["skos:prefLabel"] - return self._extract_label(raw_preferred_label) - - @property - def alternative_labels(self) -> list[str]: - """Returns a list of alternative labels for the concept.""" - assert self._raw_concept_node is not None - - raw_alternative_labels = self._raw_concept_node.get("skos:altLabel", []) - - # Raw alternative labels are either returned in a list of labels, or as a single label - # in the same format as `skos:prefLabel` - if isinstance(raw_alternative_labels, list): - return [self._extract_label(item) for item in raw_alternative_labels] - - return [self._extract_label(raw_alternative_labels)] - - def linked_concepts_ids(self, sko_link: str) -> list[str]: - """Returns a list of IDs representing concepts which are linked to the current concept""" - assert self._raw_concept_node is not None - - linked_concepts = self._raw_concept_node.get(f"skos:{sko_link}", []) - - # Sometimes linked concepts are returned as a list of concepts, and sometimes as just a single JSON - if isinstance(linked_concepts, dict): - linked_concepts = [linked_concepts] - - linked_ids = [] - for concept in linked_concepts: - # Some linked concepts have IDs in the format `_:n`. - # These IDs do not exist in the LoC source files or the LoC website, so we filter them out. - if concept["@id"].startswith("_:n"): - continue - - linked_ids.append(remove_id_prefix(concept["@id"])) - - return linked_ids - - @property - def broader_concept_ids(self) -> list[str]: - """Returns a list of IDs representing concepts which are broader than the current concept.""" - sko_link_type = "broader" - return self.linked_concepts_ids(sko_link_type) - - @property - def narrower_concept_ids(self) -> list[str]: - return [] - - @property - def related_concept_ids(self) -> list[str]: - """Returns a list of IDs representing concepts which are related to the current concept.""" - sko_link_type = "related" - return self.linked_concepts_ids(sko_link_type) - - @property - def is_geographic(self) -> bool: - """Returns True if the node represents a geographic concept, as determined by `skos:notation`.""" - if self._raw_concept_node is None: - return False - - # Notations are sometimes returned as a single notation (with a `@type` property, and a `@value` property), - # and sometimes as a list of notations. 
- notation = self._raw_concept_node.get("skos:notation", []) - if isinstance(notation, dict): - notation = [notation] - - notation_types = {item.get("@type") for item in notation} - return "http://id.loc.gov/datatypes/codes/gac" in notation_types diff --git a/tests/transformers/loc/test_concepts.py b/tests/transformers/loc/test_concepts.py new file mode 100644 index 0000000000..c485f3de17 --- /dev/null +++ b/tests/transformers/loc/test_concepts.py @@ -0,0 +1,103 @@ +import json +from test_mocks import MockRequest +from test_utils import load_fixture +from transformers.loc.concepts_transformer import LibraryOfCongressConceptsTransformer +from transformers.loc.locations_transformer import LibraryOfCongressLocationsTransformer + + +def jsons_to_ndjson(json_fixtures) -> bytes: + # Given a bunch of formatted JSON files, concatenate them into ndjson + return "\n".join(json.dumps(json.loads(load_fixture(fixture))) for fixture in json_fixtures).encode('utf-8') + + +def test_loc_concept_transformer_resilience() -> None: + test_url = "https://example.com" + + MockRequest.mock_responses( + [ + { + "method": "GET", + "url": test_url, + "status_code": 200, + "json_data": None, + "content_bytes": + "\n[}{}[\n".encode("utf-8") + # This line fails, but should not break the overall process + jsons_to_ndjson( + [ + "mads_geographic_concept.json", # geographic concepts are not included in the concepts transformer output + "mads_composite_concept.json", + "mads_deprecated_concept.json", # This one is deprecated, so is not included in the output + "mads_narrower_authority_concept.json" + ] + ), + "params": None, + } + ] + ) + concepts_transformer = LibraryOfCongressConceptsTransformer(test_url) + + nodes = list(concepts_transformer.stream(entity_type="nodes", query_chunk_size=1)) + # mads_composite_concept and mads_narrower_authority_concept + assert len(list(nodes)) == 2 + + + +def test_loc_location_transformer_resilience() -> None: + test_url_subjects = "https://example.com/subjects" + test_url_names = "https://example.com/names" + + MockRequest.mock_responses( + [ + { + "method": "GET", + "url": test_url_subjects, + "status_code": 200, + "json_data": None, + "content_bytes": + "\n[}{}[\n".encode("utf-8") + # This line fails, but should not break the overall process + jsons_to_ndjson( + [ + "mads_geographic_concept.json", # Only geographic concepts included in the location transformer output + "mads_composite_concept.json", + "mads_deprecated_concept.json", + "mads_narrower_authority_concept.json" + ] + ), + "params": None, + }, + { + "method": "GET", + "url": test_url_names, + "status_code": 200, + "json_data": None, + "content_bytes": + "\n[}{}[\n".encode("utf-8") + # This line fails, but should not break the overall process + load_fixture("loc_names_example.jsonld"), + "params": None, + } + ] + ) + locations_transformer = LibraryOfCongressLocationsTransformer(test_url_subjects, test_url_names) + nodes = list(locations_transformer.stream(entity_type="nodes", query_chunk_size=1)) + # Caversham Park from mads_geographic_concept + # and Budapest (Hungary) from loc_names_example + assert len(list(nodes)) == 2 + + + +def test_empty_source(): + """If there is nothing to process, nothing is emitted""" + MockRequest.mock_responses( + [ + { + "method": "GET", + "url": "/dev/null", + "status_code": 200, + "json_data": None, + "content_bytes": b"", + "params": None, + } + ] + ) + transformer = LibraryOfCongressConceptsTransformer("/dev/null") + assert list(transformer.stream(entity_type="nodes", query_chunk_size=1)) 
== [] \ No newline at end of file diff --git a/tests/transformers/loc/test_raw_concept.py b/tests/transformers/loc/test_raw_concept.py index b317d5d624..75a0d7266f 100644 --- a/tests/transformers/loc/test_raw_concept.py +++ b/tests/transformers/loc/test_raw_concept.py @@ -2,9 +2,7 @@ import pytest -from transformers.loc.common import RawLibraryOfCongressConcept, remove_id_prefix -from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept -from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept +from transformers.loc.raw_concept import RawLibraryOfCongressConcept, RawLibraryOfCongressMADSConcept, RawLibraryOfCongressSKOSConcept @pytest.mark.parametrize( diff --git a/tests/transformers/loc/mads/test_raw_mads_concept.py b/tests/transformers/loc/test_raw_mads_concept.py similarity index 99% rename from tests/transformers/loc/mads/test_raw_mads_concept.py rename to tests/transformers/loc/test_raw_mads_concept.py index b2b3240f00..96f9ae5253 100644 --- a/tests/transformers/loc/mads/test_raw_mads_concept.py +++ b/tests/transformers/loc/test_raw_mads_concept.py @@ -2,7 +2,7 @@ from test_utils import load_fixture -from transformers.loc.mads.raw_concept import RawLibraryOfCongressMADSConcept +from transformers.loc.raw_concept import RawLibraryOfCongressMADSConcept def test_label() -> None: diff --git a/tests/transformers/loc/skos/test_raw_skos_concept.py b/tests/transformers/loc/test_raw_skos_concept.py similarity index 96% rename from tests/transformers/loc/skos/test_raw_skos_concept.py rename to tests/transformers/loc/test_raw_skos_concept.py index 565e5323bd..bee5dbaca0 100644 --- a/tests/transformers/loc/skos/test_raw_skos_concept.py +++ b/tests/transformers/loc/test_raw_skos_concept.py @@ -2,7 +2,7 @@ from test_utils import load_fixture -from transformers.loc.skos.raw_concept import RawLibraryOfCongressSKOSConcept +from transformers.loc.raw_concept import RawLibraryOfCongressSKOSConcept def test_label() -> None: From b2000292bacad4b8185dc6d1af2cfe29a68b6975 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Tue, 4 Feb 2025 14:51:31 +0000 Subject: [PATCH 244/310] Cover reading of ndjson files with tests --- src/transformers/loc/raw_concept.py | 3 +- tests/transformers/loc/test_concepts.py | 51 ++++++++++++---------- tests/transformers/loc/test_raw_concept.py | 6 ++- 3 files changed, 36 insertions(+), 24 deletions(-) diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index 022444e58a..b87aec5ff8 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -308,8 +308,9 @@ def _as_list(dict_or_list: dict | list[dict]) -> list[dict]: return [dict_or_list] return dict_or_list + def raw_loc_concept(raw_concept: dict) -> RawLibraryOfCongressConcept: - for node in raw_concept.get("@graph"): + for node in raw_concept.get("@graph",[]): if "skos:changeNote" in node: return RawLibraryOfCongressSKOSConcept(raw_concept) if "madsrdf:adminMetadata" in node: diff --git a/tests/transformers/loc/test_concepts.py b/tests/transformers/loc/test_concepts.py index c485f3de17..24b9aa2748 100644 --- a/tests/transformers/loc/test_concepts.py +++ b/tests/transformers/loc/test_concepts.py @@ -1,13 +1,17 @@ import json + from test_mocks import MockRequest from test_utils import load_fixture + from transformers.loc.concepts_transformer import LibraryOfCongressConceptsTransformer from transformers.loc.locations_transformer import LibraryOfCongressLocationsTransformer -def jsons_to_ndjson(json_fixtures) -> bytes: +def 
jsons_to_ndjson(json_fixtures:list[str]) -> bytes: # Given a bunch of formatted JSON files, concatenate them into ndjson - return "\n".join(json.dumps(json.loads(load_fixture(fixture))) for fixture in json_fixtures).encode('utf-8') + return "\n".join( + json.dumps(json.loads(load_fixture(fixture))) for fixture in json_fixtures + ).encode("utf-8") def test_loc_concept_transformer_resilience() -> None: @@ -20,14 +24,15 @@ def test_loc_concept_transformer_resilience() -> None: "url": test_url, "status_code": 200, "json_data": None, - "content_bytes": - "\n[}{}[\n".encode("utf-8") + # This line fails, but should not break the overall process - jsons_to_ndjson( + "content_bytes": "\n[}{}[\n".encode( + "utf-8" + ) # This line fails, but should not break the overall process + + jsons_to_ndjson( [ - "mads_geographic_concept.json", # geographic concepts are not included in the concepts transformer output + "mads_geographic_concept.json", # geographic concepts are not included in the concepts transformer output "mads_composite_concept.json", - "mads_deprecated_concept.json", # This one is deprecated, so is not included in the output - "mads_narrower_authority_concept.json" + "mads_deprecated_concept.json", # This one is deprecated, so is not included in the output + "mads_narrower_authority_concept.json", ] ), "params": None, @@ -41,7 +46,6 @@ def test_loc_concept_transformer_resilience() -> None: assert len(list(nodes)) == 2 - def test_loc_location_transformer_resilience() -> None: test_url_subjects = "https://example.com/subjects" test_url_names = "https://example.com/names" @@ -53,14 +57,15 @@ def test_loc_location_transformer_resilience() -> None: "url": test_url_subjects, "status_code": 200, "json_data": None, - "content_bytes": - "\n[}{}[\n".encode("utf-8") + # This line fails, but should not break the overall process - jsons_to_ndjson( + "content_bytes": "\n[}{}[\n".encode( + "utf-8" + ) # This line fails, but should not break the overall process + + jsons_to_ndjson( [ - "mads_geographic_concept.json", # Only geographic concepts included in the location transformer output + "mads_geographic_concept.json", # Only geographic concepts included in the location transformer output "mads_composite_concept.json", "mads_deprecated_concept.json", - "mads_narrower_authority_concept.json" + "mads_narrower_authority_concept.json", ] ), "params": None, @@ -70,22 +75,24 @@ def test_loc_location_transformer_resilience() -> None: "url": test_url_names, "status_code": 200, "json_data": None, - "content_bytes": - "\n[}{}[\n".encode("utf-8") + # This line fails, but should not break the overall process - load_fixture("loc_names_example.jsonld"), + "content_bytes": "\n[}{}[\n".encode( + "utf-8" + ) # This line fails, but should not break the overall process + + load_fixture("loc_names_example.jsonld"), "params": None, - } + }, ] ) - locations_transformer = LibraryOfCongressLocationsTransformer(test_url_subjects, test_url_names) + locations_transformer = LibraryOfCongressLocationsTransformer( + test_url_subjects, test_url_names + ) nodes = list(locations_transformer.stream(entity_type="nodes", query_chunk_size=1)) # Caversham Park from mads_geographic_concept # and Budapest (Hungary) from loc_names_example assert len(list(nodes)) == 2 - -def test_empty_source(): +def test_empty_source()->None: """If there is nothing to process, nothing is emitted""" MockRequest.mock_responses( [ @@ -100,4 +107,4 @@ def test_empty_source(): ] ) transformer = LibraryOfCongressConceptsTransformer("/dev/null") - assert 
list(transformer.stream(entity_type="nodes", query_chunk_size=1)) == [] \ No newline at end of file + assert list(transformer.stream(entity_type="nodes", query_chunk_size=1)) == [] diff --git a/tests/transformers/loc/test_raw_concept.py b/tests/transformers/loc/test_raw_concept.py index 75a0d7266f..daa568610b 100644 --- a/tests/transformers/loc/test_raw_concept.py +++ b/tests/transformers/loc/test_raw_concept.py @@ -2,7 +2,11 @@ import pytest -from transformers.loc.raw_concept import RawLibraryOfCongressConcept, RawLibraryOfCongressMADSConcept, RawLibraryOfCongressSKOSConcept +from transformers.loc.raw_concept import ( + RawLibraryOfCongressConcept, + RawLibraryOfCongressMADSConcept, + RawLibraryOfCongressSKOSConcept, +) @pytest.mark.parametrize( From fa4806f503ca472b21351c50a5e41090e2165d7d Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Tue, 4 Feb 2025 14:51:46 +0000 Subject: [PATCH 245/310] Cover reading of ndjson files with tests --- src/transformers/loc/raw_concept.py | 2 +- tests/transformers/loc/test_concepts.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index b87aec5ff8..97e851b6a6 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -310,7 +310,7 @@ def _as_list(dict_or_list: dict | list[dict]) -> list[dict]: def raw_loc_concept(raw_concept: dict) -> RawLibraryOfCongressConcept: - for node in raw_concept.get("@graph",[]): + for node in raw_concept.get("@graph", []): if "skos:changeNote" in node: return RawLibraryOfCongressSKOSConcept(raw_concept) if "madsrdf:adminMetadata" in node: diff --git a/tests/transformers/loc/test_concepts.py b/tests/transformers/loc/test_concepts.py index 24b9aa2748..a9e538af69 100644 --- a/tests/transformers/loc/test_concepts.py +++ b/tests/transformers/loc/test_concepts.py @@ -7,7 +7,7 @@ from transformers.loc.locations_transformer import LibraryOfCongressLocationsTransformer -def jsons_to_ndjson(json_fixtures:list[str]) -> bytes: +def jsons_to_ndjson(json_fixtures: list[str]) -> bytes: # Given a bunch of formatted JSON files, concatenate them into ndjson return "\n".join( json.dumps(json.loads(load_fixture(fixture))) for fixture in json_fixtures @@ -92,7 +92,7 @@ def test_loc_location_transformer_resilience() -> None: assert len(list(nodes)) == 2 -def test_empty_source()->None: +def test_empty_source() -> None: """If there is nothing to process, nothing is emitted""" MockRequest.mock_responses( [ From 12f16c9436a471511f92ec451d663666d3ba337a Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Tue, 4 Feb 2025 15:55:21 +0000 Subject: [PATCH 246/310] Apply auto-formatting rules --- tests/test_extractor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 5117787863..8c48da7edb 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -6,11 +6,11 @@ from typing_extensions import get_args from config import ( + CATALOGUE_SNAPSHOT_URL, LOC_NAMES_URL, LOC_SUBJECT_HEADINGS_URL, MESH_URL, WIKIDATA_SPARQL_URL, - CATALOGUE_SNAPSHOT_URL ) from extractor import LambdaEvent, lambda_handler from transformers.base_transformer import EntityType, StreamDestination @@ -156,7 +156,7 @@ def test_lambda_handler( "wikidata_linked_loc_locations": [WIKIDATA_SPARQL_URL], "wikidata_linked_mesh_concepts": [WIKIDATA_SPARQL_URL], "wikidata_linked_mesh_locations": [WIKIDATA_SPARQL_URL], - "catalogue_concepts": 
[CATALOGUE_SNAPSHOT_URL] + "catalogue_concepts": [CATALOGUE_SNAPSHOT_URL], } assert transformer_type in transformer_types From 4bced41184f3ad4d8666133df48f5b399de67b6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 4 Feb 2025 14:46:22 +0000 Subject: [PATCH 247/310] Switch from black to ruff --- pyproject.toml | 19 ++++++++++++++ scripts/autoformat.sh | 12 ++++----- src/bulk_loader.py | 1 - src/clients/lambda_neptune_client.py | 2 -- src/converters/cypher/bulk_load_converter.py | 16 +++++++----- src/dev_requirements.in | 3 +-- src/dev_requirements.txt | 20 +++------------ src/extractor.py | 12 ++++----- src/query_builders/cypher.py | 2 +- src/sources/catalogue/concepts_source.py | 3 +-- .../linked_ontology_id_type_checker.py | 6 ++--- .../wikidata/linked_ontology_source.py | 5 ++-- src/sources/wikidata/sparql_query_builder.py | 2 +- src/transformers/base_transformer.py | 14 ++++------- src/transformers/catalogue/raw_concept.py | 12 ++++----- src/transformers/loc/raw_concept.py | 8 +----- src/transformers/mesh/concepts_transformer.py | 1 - .../mesh/locations_transformer.py | 1 - src/transformers/mesh/raw_concept.py | 2 +- .../wikidata/locations_transformer.py | 4 +-- src/transformers/wikidata/raw_concept.py | 25 +++++++++---------- src/utils/streaming.py | 3 +-- tests/conftest.py | 3 ++- tests/test_extractor.py | 3 ++- .../test_wikidata_concepts_transformer.py | 20 +++++++-------- 25 files changed, 95 insertions(+), 104 deletions(-) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..ad85a6e067 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,19 @@ +[tool.ruff] +line-length = 88 + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I" +] +ignore = ["E501", "B019"] diff --git a/scripts/autoformat.sh b/scripts/autoformat.sh index c4ddb62a01..5274def754 100755 --- a/scripts/autoformat.sh +++ b/scripts/autoformat.sh @@ -12,11 +12,11 @@ cd "$ROOT" CHECK=${1:-} if [ "$CHECK" == "--check" ]; then - echo "Checking code formatting (run ./scripts/autoformat.sh to fix any issues!)..." - black --check src/ tests/ - isort --profile=black --check src/ tests/ + echo "Checking code formatting and linting (run ./scripts/autoformat.sh to fix any issues!)..." + ruff format src/ tests/ --check + ruff check src/ tests/ else - echo "Formatting code ..." - black src/ tests/ - isort --profile=black src/ tests/ + echo "Formatting and linting code ..." 
+ ruff format src/ tests/ + ruff check src/ tests/ --fix fi diff --git a/src/bulk_loader.py b/src/bulk_loader.py index 856f54b809..1dd09a6ee3 100644 --- a/src/bulk_loader.py +++ b/src/bulk_loader.py @@ -1,5 +1,4 @@ import argparse -import os import typing from config import S3_BULK_LOAD_BUCKET_NAME diff --git a/src/clients/lambda_neptune_client.py b/src/clients/lambda_neptune_client.py index 544d49809b..3ceb978090 100644 --- a/src/clients/lambda_neptune_client.py +++ b/src/clients/lambda_neptune_client.py @@ -1,5 +1,3 @@ -import os - import boto3 from .base_neptune_client import BaseNeptuneClient diff --git a/src/converters/cypher/bulk_load_converter.py b/src/converters/cypher/bulk_load_converter.py index 59ea03fa8c..7a9e15e29d 100644 --- a/src/converters/cypher/bulk_load_converter.py +++ b/src/converters/cypher/bulk_load_converter.py @@ -19,12 +19,16 @@ def _get_bulk_loader_column_header(self, model: BaseModel, field_name: str) -> s """ # Most fields are stored as strings field_type = "String" - if isinstance(model, SourceLocation): - if field_name in {"longitude", "latitude"}: - field_type = "Float" - if isinstance(model, SourceName): - if field_name in {"date_of_birth", "date_of_death"}: - field_type = "DateTime" + if isinstance(model, SourceLocation) and field_name in { + "longitude", + "latitude", + }: + field_type = "Float" + if isinstance(model, SourceName) and field_name in { + "date_of_birth", + "date_of_death", + }: + field_type = "DateTime" return f"{field_name}:{field_type}" diff --git a/src/dev_requirements.in b/src/dev_requirements.in index c14ceff04b..b16442bb25 100644 --- a/src/dev_requirements.in +++ b/src/dev_requirements.in @@ -1,6 +1,5 @@ -black coverage -isort mypy pytest pytest-cov +ruff diff --git a/src/dev_requirements.txt b/src/dev_requirements.txt index a6513dc561..a76e7a3a02 100644 --- a/src/dev_requirements.txt +++ b/src/dev_requirements.txt @@ -4,32 +4,18 @@ # # pip-compile dev_requirements.in # -black==24.10.0 - # via -r dev_requirements.in -click==8.1.8 - # via black coverage[toml]==7.6.10 # via # -r dev_requirements.in # pytest-cov iniconfig==2.0.0 # via pytest -isort==5.13.2 - # via -r dev_requirements.in mypy==1.14.1 # via -r dev_requirements.in mypy-extensions==1.0.0 - # via - # black - # mypy + # via mypy packaging==24.2 - # via - # black - # pytest -pathspec==0.12.1 - # via black -platformdirs==4.3.6 - # via black + # via pytest pluggy==1.5.0 # via pytest pytest==8.3.4 @@ -38,5 +24,7 @@ pytest==8.3.4 # pytest-cov pytest-cov==6.0.0 # via -r dev_requirements.in +ruff==0.9.4 + # via -r dev_requirements.in typing-extensions==4.12.2 # via mypy diff --git a/src/extractor.py b/src/extractor.py index d4920274f6..d34d3db433 100755 --- a/src/extractor.py +++ b/src/extractor.py @@ -34,18 +34,18 @@ def handler( neptune_client = get_neptune_client(is_local) transformer.stream_to_graph(neptune_client, entity_type, sample_size) elif stream_destination == "s3": - assert ( - config.S3_BULK_LOAD_BUCKET_NAME is not None - ), "To stream to S3, the S3_BULK_LOAD_BUCKET_NAME environment variable must be defined." + assert config.S3_BULK_LOAD_BUCKET_NAME is not None, ( + "To stream to S3, the S3_BULK_LOAD_BUCKET_NAME environment variable must be defined." 
+ ) file_name = f"{transformer_type}__{entity_type}.csv" s3_uri = f"s3://{config.S3_BULK_LOAD_BUCKET_NAME}/{file_name}" transformer.stream_to_s3(s3_uri, entity_type, sample_size) elif stream_destination == "sns": topic_arn = config.GRAPH_QUERIES_SNS_TOPIC_ARN - assert ( - topic_arn is not None - ), "To stream to SNS, the GRAPH_QUERIES_SNS_TOPIC_ARN environment variable must be defined." + assert topic_arn is not None, ( + "To stream to SNS, the GRAPH_QUERIES_SNS_TOPIC_ARN environment variable must be defined." + ) transformer.stream_to_sns(topic_arn, entity_type, sample_size) elif stream_destination == "local": diff --git a/src/query_builders/cypher.py b/src/query_builders/cypher.py index 2da8cdb59a..c5d70853a6 100644 --- a/src/query_builders/cypher.py +++ b/src/query_builders/cypher.py @@ -33,7 +33,7 @@ def construct_upsert_edges_query(edges: list[BaseEdge]) -> str: relationship = edges[0].relationship attributes = edges[0].attributes or dict() - field_set = [f"n.{f} = data.{f}" for f in attributes.keys()] + field_set = [f"n.{f} = data.{f}" for f in attributes] field_set_statement = ", ".join(field_set) if len(field_set_statement) == 0: diff --git a/src/sources/catalogue/concepts_source.py b/src/sources/catalogue/concepts_source.py index ca5c1e1d8f..b2cfb7e6cb 100644 --- a/src/sources/catalogue/concepts_source.py +++ b/src/sources/catalogue/concepts_source.py @@ -15,5 +15,4 @@ def stream_raw(self) -> Generator[dict]: catalogue_source = GZipSource(self.url) for work in catalogue_source.stream_raw(): for concept_key in CONCEPT_KEYS: - for raw_concept in work.get(concept_key, []): - yield raw_concept + yield from work.get(concept_key, []) diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index 425ced53d4..aa28b1b05b 100644 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -20,9 +20,9 @@ def __init__(self, node_type: NodeType, linked_ontology: OntologyType): # MeSH only has concepts and locations, so make sure we don't attempt to extract names. if node_type == "names": - assert ( - linked_ontology != "mesh" - ), "Invalid node_type for ontology type MeSH." + assert linked_ontology != "mesh", ( + "Invalid node_type for ontology type MeSH." + ) @lru_cache def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index f2c8877e7a..896d3f7ac7 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -116,13 +116,12 @@ def _parallelise_requests( ) -> Generator: """Accept an `items` generator and a `run_sparql_query` method. Split `items` chunks and apply `run_sparql_query` to each chunk. 
Return a single generator of results.""" - for raw_response_item in process_stream_in_parallel( + yield from process_stream_in_parallel( items, run_sparql_query, SPARQL_ITEMS_CHUNK_SIZE, SPARQL_MAX_PARALLEL_QUERIES, - ): - yield raw_response_item + ) def _stream_filtered_wikidata_ids(self) -> Generator[str]: """Streams all wikidata ids to be processed as nodes given the selected `node_type`.""" diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index 082a05fc3d..e67c7157fd 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -85,7 +85,7 @@ def _get_label_mappings(node_type: NodeType) -> str: ?item rdfs:label ?itemLabel. ?item schema:description ?itemDescription. ?item skos:altLabel ?itemAltLabel. - {'\n'.join(extra_mappings)} + {"\n".join(extra_mappings)} }} }} """ diff --git a/src/transformers/base_transformer.py b/src/transformers/base_transformer.py index 26796b287c..b50fb9d459 100644 --- a/src/transformers/base_transformer.py +++ b/src/transformers/base_transformer.py @@ -122,8 +122,7 @@ def _stream_chunks( and returns the results stream in fixed-size chunks. """ entities = self._stream_entities(entity_type, sample_size) - for chunk in generator_to_chunks(entities, CHUNK_SIZE): - yield chunk + yield from generator_to_chunks(entities, CHUNK_SIZE) def stream_to_s3( self, s3_uri: str, entity_type: EntityType, sample_size: int | None = None @@ -180,9 +179,8 @@ def stream_to_sns( consumed by the `indexer` Lambda function. """ queries = [] - counter = 0 - for chunk in self._stream_chunks(entity_type, sample_size): + for i, chunk in enumerate(self._stream_chunks(entity_type, sample_size)): queries.append(construct_upsert_cypher_query(chunk, entity_type)) # SNS supports a maximum batch size of 10 @@ -190,9 +188,8 @@ def stream_to_sns( publish_batch_to_sns(topic_arn, queries) queries = [] - counter += 1 - if counter % 100 == 0: - print(f"Published {counter} messages to SNS.") + if (i + 1) % 100 == 0: + print(f"Published {i + 1} messages to SNS.") # Publish remaining messages (if any) if len(queries) > 0: @@ -204,8 +201,7 @@ def stream( """ Streams transformed entities (nodes or edges) as a generator. Useful for development and testing purposes. """ - for chunk in self._stream_chunks(entity_type, sample_size): - yield chunk + yield from self._stream_chunks(entity_type, sample_size) def stream_to_local_file( self, file_name: str, entity_type: EntityType, sample_size: int | None = None diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index 97e87aa0b6..f06f60d5f9 100644 --- a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -1,4 +1,4 @@ -from typing import cast, get_args +from typing import get_args from models.graph_node import ConceptSource, ConceptType @@ -27,12 +27,10 @@ def is_concept(self) -> bool: Determines whether a given block of JSON represents a Concept as returned from the Catalogue API. A Concept is a block of JSON with a type property and a list of identifiers. 
""" - if self.raw_concept.get("type") in get_args( - ConceptType - ) and self.raw_concept.get("identifiers"): - return True - - return False + return ( + self.raw_concept.get("type") in get_args(ConceptType) + and self.raw_concept.get("identifiers") is not None + ) @property def wellcome_id(self) -> str: diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index 4f2f26769d..1fb9c94998 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -54,15 +54,9 @@ def _extract_label(raw_label: str | dict[str, str] | list[str]) -> str: def exclude(self) -> bool: """Returns True if the concept should be excluded from the graph.""" - if self._raw_concept_node is None: - return True - # Remove concepts whose IDs have the "-781" suffix. They are duplicates of concepts with non-suffixed IDs. # The suffix represents the fact that the concept in question is part of the LCSH - Geographic collection. - if self.source_id.endswith("-781"): - return True - - return False + return self._raw_concept_node is None or self.source_id.endswith("-781") @property def source_id(self) -> str: diff --git a/src/transformers/mesh/concepts_transformer.py b/src/transformers/mesh/concepts_transformer.py index 54c0849dd7..59c0edca0d 100644 --- a/src/transformers/mesh/concepts_transformer.py +++ b/src/transformers/mesh/concepts_transformer.py @@ -1,4 +1,3 @@ -import xml.etree.ElementTree as ET from collections.abc import Generator from models.graph_edge import SourceConceptHasParent, SourceConceptRelatedTo diff --git a/src/transformers/mesh/locations_transformer.py b/src/transformers/mesh/locations_transformer.py index 13786fb9ce..437e18f880 100644 --- a/src/transformers/mesh/locations_transformer.py +++ b/src/transformers/mesh/locations_transformer.py @@ -1,4 +1,3 @@ -import xml.etree.ElementTree as ET from collections.abc import Generator from models.graph_edge import BaseEdge diff --git a/src/transformers/mesh/raw_concept.py b/src/transformers/mesh/raw_concept.py index 59d30b2b15..65f2f6a909 100644 --- a/src/transformers/mesh/raw_concept.py +++ b/src/transformers/mesh/raw_concept.py @@ -1,5 +1,5 @@ import xml.etree.ElementTree as ET -from typing import Any, Literal +from typing import Literal from sources.mesh.concepts_source import RawMeshNode from utils.xml import assert_get_text diff --git a/src/transformers/wikidata/locations_transformer.py b/src/transformers/wikidata/locations_transformer.py index 2c320b89b7..f7860818e2 100644 --- a/src/transformers/wikidata/locations_transformer.py +++ b/src/transformers/wikidata/locations_transformer.py @@ -24,6 +24,6 @@ def transform_node(self, raw_node: dict) -> SourceLocation | None: source=raw_concept.source, alternative_labels=raw_concept.alternative_labels, description=raw_concept.description, - latitude=raw_concept.latitude, - longitude=raw_concept.longitude, + latitude=raw_concept.coordinates["latitude"], + longitude=raw_concept.coordinates["longitude"], ) diff --git a/src/transformers/wikidata/raw_concept.py b/src/transformers/wikidata/raw_concept.py index d5d88b9be2..636a6f1335 100644 --- a/src/transformers/wikidata/raw_concept.py +++ b/src/transformers/wikidata/raw_concept.py @@ -1,10 +1,14 @@ import re -from functools import lru_cache -from typing import Literal +from typing import Literal, TypedDict from sources.wikidata.linked_ontology_source import extract_wikidata_id +class Coordinates(TypedDict): + longitude: float | None + latitude: float | None + + class RawWikidataConcept: def __init__(self, 
raw_concept: dict): self.raw_concept = raw_concept @@ -59,8 +63,7 @@ def source(self) -> Literal["wikidata"]: class RawWikidataLocation(RawWikidataConcept): - @lru_cache - def _get_coordinates(self) -> dict[str, float | None]: + def _extract_coordinates(self) -> Coordinates: """Extracts coordinates from a raw string in the format `Point( )` (e.g. `Point(9.83 53.54)`)""" # Some items do not return valid coordinates (e.g. Q17064702, whose coordinates just say 'unknown value' on the # Wikidata website). When this happens, the 'type' of the 'coordinates' property always appears to be 'uri'. @@ -75,21 +78,17 @@ def _get_coordinates(self) -> dict[str, float | None]: pattern = r"Point\((.*)\s(.*)\)" matched_coordinates = re.search(pattern, raw_coordinates) - assert ( - matched_coordinates is not None - ), f"Could not extract coordinates from raw value '{raw_coordinates}'. Wikidata id: {self.source_id}" + assert matched_coordinates is not None, ( + f"Could not extract coordinates from raw value '{raw_coordinates}'. Wikidata id: {self.source_id}" + ) longitude = float(matched_coordinates.group(1)) latitude = float(matched_coordinates.group(2)) return {"longitude": longitude, "latitude": latitude} @property - def longitude(self) -> float | None: - return self._get_coordinates()["longitude"] - - @property - def latitude(self) -> float | None: - return self._get_coordinates()["latitude"] + def coordinates(self) -> Coordinates: + return self._extract_coordinates() class RawWikidataName(RawWikidataConcept): diff --git a/src/utils/streaming.py b/src/utils/streaming.py index a7505e9237..52612915c2 100644 --- a/src/utils/streaming.py +++ b/src/utils/streaming.py @@ -53,5 +53,4 @@ def process_stream_in_parallel( for future in done: items = future.result() - for item in items: - yield item + yield from items diff --git a/tests/conftest.py b/tests/conftest.py index ec534c6a0f..61ada586bc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ -from typing import Any, Generator +from collections.abc import Generator +from typing import Any import pytest from _pytest.monkeypatch import MonkeyPatch diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 834b0f6f65..f462ad34ac 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -1,4 +1,5 @@ -from typing import Any, Generator +from collections.abc import Generator +from typing import Any import pytest from test_mocks import MOCK_INSTANCE_ENDPOINT, MockRequest, MockResponseInput diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index da6f4ce5d5..26d3c24e74 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -62,10 +62,10 @@ def test_wikidata_raw_location() -> None: raw_location_input = json.loads(load_fixture("wikidata/raw_location.json")) raw_location = RawWikidataLocation(raw_location_input) - assert raw_location.latitude is not None - assert raw_location.longitude is not None - assert math.isclose(raw_location.latitude, 41.346111111) - assert math.isclose(raw_location.longitude, -85.469166666) + assert raw_location.coordinates["latitude"] is not None + assert raw_location.coordinates["longitude"] is not None + assert math.isclose(raw_location.coordinates["latitude"], 41.346111111) + assert math.isclose(raw_location.coordinates["longitude"], -85.469166666) def test_wikidata_raw_name() -> None: @@ -81,12 +81,12 @@ def test_wikidata_raw_name() -> None: def 
test_wikidata_raw_location_invalid_coordinates() -> None: raw_location = RawWikidataLocation({}) - assert raw_location.latitude is None - assert raw_location.longitude is None + assert raw_location.coordinates["latitude"] is None + assert raw_location.coordinates["longitude"] is None raw_location = RawWikidataLocation({"type": "uri", "value": "some-url"}) - assert raw_location.latitude is None - assert raw_location.longitude is None + assert raw_location.coordinates["latitude"] is None + assert raw_location.coordinates["longitude"] is None raw_location = RawWikidataLocation( { @@ -95,10 +95,10 @@ def test_wikidata_raw_location_invalid_coordinates() -> None: } ) with pytest.raises(AssertionError): - _ = raw_location.latitude + _ = raw_location.coordinates["latitude"] with pytest.raises(AssertionError): - _ = raw_location.longitude + _ = raw_location.coordinates["longitude"] def test_wikidata_raw_name_invalid_dates() -> None: From b4e9bdfad5a2e9f2ee648cd2916ed370596e449e Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Wed, 5 Feb 2025 10:27:13 +0000 Subject: [PATCH 248/310] Update src/transformers/loc/raw_concept.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Štěpán Brychta --- src/transformers/loc/raw_concept.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index 97e851b6a6..e7ef53473c 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -214,8 +214,8 @@ def label(self) -> str: @property def is_geographic(self) -> bool: - assert self._raw_concept_node is not None """Returns True if the node represents a geographic concept, as determined by @type""" + assert self._raw_concept_node is not None return "madsrdf:Geographic" in self._raw_concept_node.get("@type", []) @property From 597f162ce8859accffe3e7b6c6ef2109ee2f40eb Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Wed, 5 Feb 2025 10:44:48 +0000 Subject: [PATCH 249/310] do not be too resilient --- src/sources/gzip_source.py | 5 +---- tests/transformers/loc/test_concepts.py | 15 +++------------ 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/sources/gzip_source.py b/src/sources/gzip_source.py index 44e3ebd750..03bbd4b6b1 100644 --- a/src/sources/gzip_source.py +++ b/src/sources/gzip_source.py @@ -17,10 +17,7 @@ def stream_raw(self) -> Generator[dict]: with gzip.GzipFile(fileobj=response.raw) as file: for line_bytes in file: - try: - yield json.loads(line_bytes.decode("utf8")) - except JSONDecodeError as e: - print(e) + yield json.loads(line_bytes.decode("utf8")) class MultiGZipSource(BaseSource): diff --git a/tests/transformers/loc/test_concepts.py b/tests/transformers/loc/test_concepts.py index a9e538af69..8d3c996dda 100644 --- a/tests/transformers/loc/test_concepts.py +++ b/tests/transformers/loc/test_concepts.py @@ -24,10 +24,7 @@ def test_loc_concept_transformer_resilience() -> None: "url": test_url, "status_code": 200, "json_data": None, - "content_bytes": "\n[}{}[\n".encode( - "utf-8" - ) # This line fails, but should not break the overall process - + jsons_to_ndjson( + "content_bytes": jsons_to_ndjson( [ "mads_geographic_concept.json", # geographic concepts are not included in the concepts transformer output "mads_composite_concept.json", @@ -57,10 +54,7 @@ def test_loc_location_transformer_resilience() -> None: "url": test_url_subjects, "status_code": 200, "json_data": None, - "content_bytes": "\n[}{}[\n".encode( - 
"utf-8" - ) # This line fails, but should not break the overall process - + jsons_to_ndjson( + "content_bytes": jsons_to_ndjson( [ "mads_geographic_concept.json", # Only geographic concepts included in the location transformer output "mads_composite_concept.json", @@ -75,10 +69,7 @@ def test_loc_location_transformer_resilience() -> None: "url": test_url_names, "status_code": 200, "json_data": None, - "content_bytes": "\n[}{}[\n".encode( - "utf-8" - ) # This line fails, but should not break the overall process - + load_fixture("loc_names_example.jsonld"), + "content_bytes": load_fixture("loc_names_example.jsonld"), "params": None, }, ] From 1fc3f23a56ae3a7dd696ef9007dbbf77f0137a37 Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Wed, 5 Feb 2025 11:40:33 +0000 Subject: [PATCH 250/310] remove redundant extract --- src/transformers/loc/raw_concept.py | 1 - .../transformers/loc/test_raw_mads_concept.py | 28 ----------------- .../transformers/loc/test_raw_skos_concept.py | 31 +------------------ 3 files changed, 1 insertion(+), 59 deletions(-) diff --git a/src/transformers/loc/raw_concept.py b/src/transformers/loc/raw_concept.py index e7ef53473c..52d7162b2b 100644 --- a/src/transformers/loc/raw_concept.py +++ b/src/transformers/loc/raw_concept.py @@ -95,7 +95,6 @@ def exclude(self) -> bool: class RawLibraryOfCongressSKOSConcept(RawLibraryOfCongressConcept): def __init__(self, raw_concept: dict): super().__init__(raw_concept) - self._raw_concept_node = self._extract_concept_node() def _extract_concept_node(self) -> dict | None: graph: list[dict] = self.raw_concept.get("@graph", []) diff --git a/tests/transformers/loc/test_raw_mads_concept.py b/tests/transformers/loc/test_raw_mads_concept.py index 96f9ae5253..6b05bbb847 100644 --- a/tests/transformers/loc/test_raw_mads_concept.py +++ b/tests/transformers/loc/test_raw_mads_concept.py @@ -15,34 +15,6 @@ def test_label() -> None: assert concept.label == "Stump work" -class TestExclude: - def test_exclude_no_graph(self) -> None: - """ - If there is no graph, then the concept is to be excluded - """ - concept = RawLibraryOfCongressMADSConcept( - {"@id": "/authorities/subjects/sh2010105253", "@graph": []} - ) - assert concept.exclude() == True - - def test_exclude_no_matching_concept_node(self) -> None: - """ - If the graph does not contain a node of type skos:Concept, it is to be excluded - """ - concept = RawLibraryOfCongressMADSConcept( - json.loads(load_fixture("mads_deprecated_concept.json")) - ) - assert concept.exclude() == True - - def test_do_not_exclude(self) -> None: - """ - A complete, non-duplicate, non-deprecated record is to be included in the output - """ - concept = RawLibraryOfCongressMADSConcept( - json.loads(load_fixture("mads_concept.json")) - ) - assert concept.exclude() == False - class TestGeographic: def test_is_geographic(self) -> None: diff --git a/tests/transformers/loc/test_raw_skos_concept.py b/tests/transformers/loc/test_raw_skos_concept.py index bee5dbaca0..895c8fe62b 100644 --- a/tests/transformers/loc/test_raw_skos_concept.py +++ b/tests/transformers/loc/test_raw_skos_concept.py @@ -7,7 +7,7 @@ def test_label() -> None: """ - Label is extracted from madsrdf:authoritativeLabel + Label is extracted from skos:prefLabel """ concept = RawLibraryOfCongressSKOSConcept( json.loads(load_fixture("skos_concept.json")) @@ -15,35 +15,6 @@ def test_label() -> None: assert concept.label == "Pickling" -class TestExclude: - def test_exclude_no_graph(self) -> None: - """ - If there is no graph, then the concept is to be excluded - """ - 
concept = RawLibraryOfCongressSKOSConcept( - {"@id": "/authorities/subjects/sh2010105253", "@graph": []} - ) - assert concept.exclude() == True - - def test_exclude_no_matching_concept_node(self) -> None: - """ - If the graph does not contain a node of type skos:Concept, it is to be excluded - """ - concept = RawLibraryOfCongressSKOSConcept( - json.loads(load_fixture("skos_deprecated_concept.json")) - ) - assert concept.exclude() == True - - def test_do_not_exclude(self) -> None: - """ - A complete, non-duplicate, non-deprecated record is to be included in the output - """ - concept = RawLibraryOfCongressSKOSConcept( - json.loads(load_fixture("skos_concept.json")) - ) - assert concept.exclude() == False - - class TestGeographic: def test_is_geographic(self) -> None: """ From 580250050ed8a954b19610f47fa394d53391158d Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Wed, 5 Feb 2025 11:56:53 +0000 Subject: [PATCH 251/310] harmonise geo tests --- tests/transformers/loc/test_raw_concept.py | 28 +++++++++++++++++++ .../transformers/loc/test_raw_mads_concept.py | 18 ------------ .../transformers/loc/test_raw_skos_concept.py | 17 ----------- 3 files changed, 28 insertions(+), 35 deletions(-) diff --git a/tests/transformers/loc/test_raw_concept.py b/tests/transformers/loc/test_raw_concept.py index daa568610b..747d6e9dfc 100644 --- a/tests/transformers/loc/test_raw_concept.py +++ b/tests/transformers/loc/test_raw_concept.py @@ -1,6 +1,8 @@ +import json from typing import Type import pytest +from test_utils import load_fixture from transformers.loc.raw_concept import ( RawLibraryOfCongressConcept, @@ -134,3 +136,29 @@ def test_exclude_marked_duplicates( concept = sut_class({"@id": "authorities/names/sh2010105253-781", "@graph": []}) concept._raw_concept_node = {} assert concept.exclude() == True + + +@pytest.mark.parametrize( + "sut_class,format", + [ + (RawLibraryOfCongressSKOSConcept, "skos"), + (RawLibraryOfCongressMADSConcept, "mads"), + ], +) +class TestGeographic: + def test_is_geographic( + self, sut_class: Type[RawLibraryOfCongressConcept], source_format: str + ) -> None: + """ + A concept is geographic if its @type list contains madsrdf:Geographic or http://id.loc.gov/datatypes/codes/gac" + """ + concept = sut_class( + json.loads(load_fixture(f"{source_format}_geographic_concept.json")) + ) + assert concept.is_geographic == True + + def test_is_not_geographic( + self, sut_class: Type[RawLibraryOfCongressConcept], source_format: str + ) -> None: + concept = sut_class(json.loads(load_fixture(f"{source_format}_concept.json"))) + assert concept.is_geographic == False diff --git a/tests/transformers/loc/test_raw_mads_concept.py b/tests/transformers/loc/test_raw_mads_concept.py index 6b05bbb847..ef6fe6264d 100644 --- a/tests/transformers/loc/test_raw_mads_concept.py +++ b/tests/transformers/loc/test_raw_mads_concept.py @@ -15,24 +15,6 @@ def test_label() -> None: assert concept.label == "Stump work" - -class TestGeographic: - def test_is_geographic(self) -> None: - """ - A concept is geographic if its @type list contains madsrdf:Geographic - """ - concept = RawLibraryOfCongressMADSConcept( - json.loads(load_fixture("mads_geographic_concept.json")) - ) - assert concept.is_geographic == True - - def test_is_not_geographic(self) -> None: - concept = RawLibraryOfCongressMADSConcept( - json.loads(load_fixture("mads_concept.json")) - ) - assert concept.is_geographic == False - - class TestBroaderConcepts: def test_real_example(self) -> None: concept = RawLibraryOfCongressMADSConcept( diff --git 
a/tests/transformers/loc/test_raw_skos_concept.py b/tests/transformers/loc/test_raw_skos_concept.py index 895c8fe62b..4644c10131 100644 --- a/tests/transformers/loc/test_raw_skos_concept.py +++ b/tests/transformers/loc/test_raw_skos_concept.py @@ -15,23 +15,6 @@ def test_label() -> None: assert concept.label == "Pickling" -class TestGeographic: - def test_is_geographic(self) -> None: - """ - A concept is geographic if there exists skos:notation with a gac type - """ - concept = RawLibraryOfCongressSKOSConcept( - json.loads(load_fixture("skos_geographic_concept.json")) - ) - assert concept.is_geographic == True - - def test_is_not_geographic(self) -> None: - concept = RawLibraryOfCongressSKOSConcept( - json.loads(load_fixture("skos_concept.json")) - ) - assert concept.is_geographic == False - - def test_broader_concepts() -> None: concept = RawLibraryOfCongressSKOSConcept( json.loads(load_fixture("skos_geographic_concept.json")) From 71e6ff1982b0e159c473fb4c6e9f7d7784bddbce Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Wed, 5 Feb 2025 12:44:23 +0000 Subject: [PATCH 252/310] Apply auto-formatting rules --- tests/test_extractor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 5117787863..8c48da7edb 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -6,11 +6,11 @@ from typing_extensions import get_args from config import ( + CATALOGUE_SNAPSHOT_URL, LOC_NAMES_URL, LOC_SUBJECT_HEADINGS_URL, MESH_URL, WIKIDATA_SPARQL_URL, - CATALOGUE_SNAPSHOT_URL ) from extractor import LambdaEvent, lambda_handler from transformers.base_transformer import EntityType, StreamDestination @@ -156,7 +156,7 @@ def test_lambda_handler( "wikidata_linked_loc_locations": [WIKIDATA_SPARQL_URL], "wikidata_linked_mesh_concepts": [WIKIDATA_SPARQL_URL], "wikidata_linked_mesh_locations": [WIKIDATA_SPARQL_URL], - "catalogue_concepts": [CATALOGUE_SNAPSHOT_URL] + "catalogue_concepts": [CATALOGUE_SNAPSHOT_URL], } assert transformer_type in transformer_types From 404a25107ddbb59a22705ed9d554b7cf82b22fad Mon Sep 17 00:00:00 2001 From: Paul Butcher Date: Wed, 5 Feb 2025 13:08:47 +0000 Subject: [PATCH 253/310] harmonise geo tests --- tests/transformers/loc/test_raw_concept.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/transformers/loc/test_raw_concept.py b/tests/transformers/loc/test_raw_concept.py index 747d6e9dfc..6a9625ded8 100644 --- a/tests/transformers/loc/test_raw_concept.py +++ b/tests/transformers/loc/test_raw_concept.py @@ -139,7 +139,7 @@ def test_exclude_marked_duplicates( @pytest.mark.parametrize( - "sut_class,format", + "sut_class,source_format", [ (RawLibraryOfCongressSKOSConcept, "skos"), (RawLibraryOfCongressMADSConcept, "mads"), From 533c547321c4af8200a42edeb61327619924209d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Wed, 5 Feb 2025 13:44:33 +0000 Subject: [PATCH 254/310] Add new Wikidata edges and refactor Wikidata source --- src/extractor.py | 8 +- src/models/graph_edge.py | 7 + .../linked_ontology_id_type_checker.py | 71 ------- .../wikidata/linked_ontology_source.py | 180 +++++++++--------- src/sources/wikidata/sparql_query_builder.py | 75 +++----- .../wikidata/concepts_transformer.py | 36 ++-- .../wikidata/locations_transformer.py | 6 +- .../wikidata/names_transformer.py | 6 +- src/utils/aws.py | 25 +++ src/utils/ontology_id_checker.py | 48 +++++ src/utils/types.py | 6 + 11 files changed, 229 insertions(+), 239 
deletions(-) delete mode 100644 src/sources/wikidata/linked_ontology_id_type_checker.py create mode 100644 src/utils/ontology_id_checker.py create mode 100644 src/utils/types.py diff --git a/src/extractor.py b/src/extractor.py index d4920274f6..cb4ca44474 100755 --- a/src/extractor.py +++ b/src/extractor.py @@ -28,16 +28,16 @@ def handler( f"transformer and streaming them into {stream_destination}." ) + assert ( + config.S3_BULK_LOAD_BUCKET_NAME is not None + ), "the S3_BULK_LOAD_BUCKET_NAME environment variable must be defined." + transformer: BaseTransformer = create_transformer(transformer_type, entity_type) if stream_destination == "graph": neptune_client = get_neptune_client(is_local) transformer.stream_to_graph(neptune_client, entity_type, sample_size) elif stream_destination == "s3": - assert ( - config.S3_BULK_LOAD_BUCKET_NAME is not None - ), "To stream to S3, the S3_BULK_LOAD_BUCKET_NAME environment variable must be defined." - file_name = f"{transformer_type}__{entity_type}.csv" s3_uri = f"s3://{config.S3_BULK_LOAD_BUCKET_NAME}/{file_name}" transformer.stream_to_s3(s3_uri, entity_type, sample_size) diff --git a/src/models/graph_edge.py b/src/models/graph_edge.py index 2d9afd125e..7820a65dd1 100644 --- a/src/models/graph_edge.py +++ b/src/models/graph_edge.py @@ -37,3 +37,10 @@ class SourceConceptHasParent(BaseEdge): to_type: str = "SourceConcept" relationship: str = "HAS_PARENT" directed: bool = True + + +class SourceConceptHasFieldOfWork(BaseEdge): + from_type: str = "SourceName" + to_type: str = "SourceConcept" + relationship: str = "HAS_FIELD_OF_WORK" + directed: bool = True diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py deleted file mode 100644 index 425ced53d4..0000000000 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ /dev/null @@ -1,71 +0,0 @@ -from functools import lru_cache - -import boto3 -import smart_open - -import config - -from .sparql_query_builder import NodeType, OntologyType - - -class LinkedOntologyIdTypeChecker: - """ - A class for checking whether ids from a given linked ontology (LoC or MeSH) are classified under - a selected node type (concepts, locations, or names). - """ - - def __init__(self, node_type: NodeType, linked_ontology: OntologyType): - self.node_type = node_type - self.linked_ontology = linked_ontology - - # MeSH only has concepts and locations, so make sure we don't attempt to extract names. - if node_type == "names": - assert ( - linked_ontology != "mesh" - ), "Invalid node_type for ontology type MeSH." - - @lru_cache - def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: - """Return all ids classified under a given `node_type` for the selected ontology.""" - # Retrieve the bulk load file outputted by the relevant transformer so that we can extract ids from it. 
- linked_nodes_file_name = f"{self.linked_ontology}_{node_type}__nodes.csv" - s3_url = f"s3://{config.S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" - - print( - f"Retrieving ids of type '{node_type}' from ontology '{self.linked_ontology}' from S3.", - end=" ", - flush=True, - ) - - ids = set() - - transport_params = {"client": boto3.client("s3")} - with smart_open.open(s3_url, "r", transport_params=transport_params) as f: - # Loop through all items in the file and extract the id from each item - for i, line in enumerate(f): - # Skip header - if i == 0: - continue - ids.add(line.split(",")[0]) - - print(f"({len(ids)} ids retrieved.)") - - return ids - - def id_included_in_selected_type(self, linked_id: str) -> bool: - """ - Return `True` if a given linked ontology id is classified under the selected node type (concepts, - locations, or names). - """ - return linked_id in self._get_linked_ontology_ids(self.node_type) - - def id_is_valid(self, linked_id: str) -> bool: - """Returns 'True' if the given id from the selected linked ontology is valid.""" - is_valid = False - is_valid |= linked_id in self._get_linked_ontology_ids("concepts") - is_valid |= linked_id in self._get_linked_ontology_ids("locations") - - if self.linked_ontology == "loc": - is_valid |= linked_id in self._get_linked_ontology_ids("names") - - return is_valid diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index f2c8877e7a..58908cfd40 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -4,17 +4,32 @@ from sources.base_source import BaseSource from transformers.base_transformer import EntityType +from utils.ontology_id_checker import is_id_classified_as_node_type, is_id_in_ontology from utils.streaming import process_stream_in_parallel - -from .linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker +from utils.types import NodeType, OntologyType from .sparql_client import SPARQL_MAX_PARALLEL_QUERIES, WikidataSparqlClient -from .sparql_query_builder import NodeType, OntologyType, SparqlQueryBuilder +from .sparql_query_builder import SparqlQueryBuilder, WikidataEdgeQueryType + SPARQL_ITEMS_CHUNK_SIZE = 400 WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" +def _parallelise_sparql_requests( + items: Iterator, run_sparql_query: Callable[[list], list] +) -> Generator: + """Accept an `items` generator and a `run_sparql_query` method. Split `items` chunks and apply + `run_sparql_query` to each chunk. Return a single generator of results.""" + for raw_response_item in process_stream_in_parallel( + items, + run_sparql_query, + SPARQL_ITEMS_CHUNK_SIZE, + SPARQL_MAX_PARALLEL_QUERIES, + ): + yield raw_response_item + + def extract_wikidata_id(item: dict, key: str = "item") -> str | None: """ Accepts a raw `item` dictionary returned by the Wikidata SPARQL endpoint and returns the Wikidata id of the item. 
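For orientation, a rough sketch of the unwrapping that `extract_wikidata_id` performs on a single SPARQL JSON binding. The binding shape follows the standard Wikidata SPARQL results format and the prefix constant mirrors `WIKIDATA_ID_PREFIX` above; the helper name `extract_id` and its exact handling of non-entity values are illustrative assumptions, not the module's implementation.

```python
WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/"


def extract_id(item: dict, key: str = "item") -> str | None:
    # Each SPARQL result field arrives as a {"type": ..., "value": ...} binding.
    value = item.get(key, {}).get("value", "")
    if not value.startswith(WIKIDATA_ID_PREFIX):
        return None
    return value.removeprefix(WIKIDATA_ID_PREFIX)


binding = {"item": {"type": "uri", "value": "http://www.wikidata.org/entity/Q42"}}
assert extract_id(binding) == "Q42"
assert extract_id({"item": {"type": "literal", "value": "unknown value"}}) is None
```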
@@ -57,7 +72,6 @@ def __init__( self.node_type = node_type self.linked_ontology = linked_ontology self.entity_type = entity_type - self.id_type_checker = LinkedOntologyIdTypeChecker(node_type, linked_ontology) @lru_cache def _get_all_ids(self) -> list[str]: @@ -81,48 +95,50 @@ def _get_all_ids(self) -> list[str]: print(f"({len(all_valid_ids)} ids retrieved.)") return list(all_valid_ids) - def _get_linked_id_mappings(self, wikidata_ids: list[str]) -> list[dict]: - query = SparqlQueryBuilder.get_linked_ids_query( - wikidata_ids, self.linked_ontology - ) - return self.client.run_query(query) - def _get_wikidata_items(self, wikidata_ids: list[str]) -> list: query = SparqlQueryBuilder.get_items_query(wikidata_ids, self.node_type) return self.client.run_query(query) - def _get_parent_id_mappings(self, child_wikidata_ids: list[str]) -> list[dict]: + def _stream_all_edges_by_type( + self, edge_type: WikidataEdgeQueryType + ) -> Generator[dict]: """ - Given a list of child wikidata ids, checks for all parents of each item in the list and returns a list - of mappings between child and parent ids. + Given an `edge_type`, return a generator of all edges starting from all Wikidata items linking to the selected + ontology. + + Edges are extracted via the following steps: + 1. Run a SPARQL query which retrieves _all_ Wikidata items referencing an id from the linked ontology. + 2. Split the returned ids into chunks. For each chunk, run a second SPARQL query to retrieve the requested + edges for all ids in the chunk. (It is possible to modify the query in step 1 to return the edges directly, + but this makes the query unreliable - sometimes it times out or returns invalid JSON. Getting the edges + in chunks is much slower, but it works every time.) """ - # Get all parent ids referenced via the Wikidata 'subclass of' field - subclass_of_query = SparqlQueryBuilder.get_parents_query( - child_wikidata_ids, "subclass_of" - ) - subclass_of_results = self.client.run_query(subclass_of_query) - # Get all parent ids referenced via the Wikidata 'instance of' field - instance_of_query = SparqlQueryBuilder.get_parents_query( - child_wikidata_ids, "instance_of" - ) - instance_of_results = self.client.run_query(instance_of_query) - - return subclass_of_results + instance_of_results - - @staticmethod - def _parallelise_requests( - items: Iterator, run_sparql_query: Callable[[list], list] - ) -> Generator: - """Accept an `items` generator and a `run_sparql_query` method. Split `items` chunks and apply - `run_sparql_query` to each chunk. 
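The chunking and parallelism described here are delegated to `process_stream_in_parallel`. Below is a simplified sketch of that shape; unlike the real `utils.streaming` helper, which keeps several chunks in flight as futures, this version submits and drains one chunk at a time, so treat it as an outline under that assumption rather than the actual implementation.

```python
from collections.abc import Callable, Generator, Iterator
from concurrent.futures import ThreadPoolExecutor
from itertools import islice


def process_in_chunks(
    items: Iterator[str],
    run_sparql_query: Callable[[list[str]], list[dict]],
    chunk_size: int,
    max_parallel: int,
) -> Generator[dict]:
    with ThreadPoolExecutor(max_workers=max_parallel) as executor:
        # The real utility keeps up to `max_parallel` chunks in flight at once;
        # this sketch submits and drains one chunk at a time for brevity.
        while chunk := list(islice(items, chunk_size)):
            yield from executor.submit(run_sparql_query, chunk).result()


results = process_in_chunks(
    iter(["Q1", "Q2", "Q3", "Q4", "Q5"]),
    lambda ids: [{"id": wikidata_id} for wikidata_id in ids],
    chunk_size=2,
    max_parallel=4,
)
assert [r["id"] for r in results] == ["Q1", "Q2", "Q3", "Q4", "Q5"]
```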
Return a single generator of results.""" - for raw_response_item in process_stream_in_parallel( - items, - run_sparql_query, - SPARQL_ITEMS_CHUNK_SIZE, - SPARQL_MAX_PARALLEL_QUERIES, - ): - yield raw_response_item + def get_edges(wikidata_ids: list[str]) -> list[dict]: + query = SparqlQueryBuilder.get_edge_query(wikidata_ids, edge_type) + return self.client.run_query(query) + + all_ids = self._get_all_ids() + for raw_mapping in _parallelise_sparql_requests(iter(all_ids), get_edges): + from_id = extract_wikidata_id(raw_mapping, "fromItem") + + if edge_type in ("same_as_mesh", "same_as_loc"): + to_id = raw_mapping["toItem"]["value"] + else: + to_id = extract_wikidata_id(raw_mapping, "toItem") + + if from_id is not None and to_id is not None: + yield {"from_id": from_id, "to_id": to_id} + + def _stream_all_same_as_edges(self) -> Generator[dict]: + if self.linked_ontology == "loc": + yield from self._stream_all_edges_by_type("same_as_loc") + elif self.linked_ontology == "mesh": + yield from self._stream_all_edges_by_type("same_as_mesh") + + def _stream_all_has_parent_edges(self) -> Generator[dict]: + yield from self._stream_all_edges_by_type("parent_instance_of") + yield from self._stream_all_edges_by_type("parent_subclass_of") def _stream_filtered_wikidata_ids(self) -> Generator[str]: """Streams all wikidata ids to be processed as nodes given the selected `node_type`.""" @@ -130,15 +146,20 @@ def _stream_filtered_wikidata_ids(self) -> Generator[str]: # Stream all SAME_AS edges and extract Wikidata ids from them, making sure to deduplicate # (a given Wikidata id can appear in more than one edge). - for item in self._stream_all_same_as_edges(): - wikidata_id = item["wikidata_id"] - linked_id = item["linked_id"] - if self.id_type_checker.id_is_valid(linked_id) and wikidata_id not in seen: + for edge in self._stream_all_same_as_edges(): + wikidata_id = edge["from_id"] + linked_id = edge["to_id"] + if ( + is_id_in_ontology(linked_id, self.linked_ontology) + and wikidata_id not in seen + ): # Add Wikidata id to `seen` no matter if it's part of the selected node type # to make sure it is not processed again as a parent below. seen.add(wikidata_id) - if self.id_type_checker.id_included_in_selected_type(linked_id): + if is_id_classified_as_node_type( + linked_id, self.linked_ontology, self.node_type + ): yield wikidata_id # Stream HAS_PARENT edges and extract Wikidata ids of all parents (children are streamed above). Filter out @@ -146,50 +167,11 @@ def _stream_filtered_wikidata_ids(self) -> Generator[str]: # reference a MeSH/LoC id. We categorise all of them as _concepts_, no matter whether the children are # categorised as concepts, names, or locations. if self.node_type == "concepts": - for item in self._stream_all_has_parent_edges(): - wikidata_id = item["parent_id"] - if wikidata_id not in seen: - seen.add(wikidata_id) - yield wikidata_id - - def _stream_all_same_as_edges(self) -> Generator[dict]: - """ - Stream raw 'SAME_AS' edges, mapping Wikidata ids to ids from the selected linked ontology. - - Edges are extracted via the following steps: - 1. Run a SPARQL query which retrieves _all_ Wikidata items referencing an id from the linked ontology. - 2. Split the returned ids into chunks. For each chunk, run a second SPARQL query to retrieve a mapping - between Wikidata ids and ids from the linked ontology. (It is possible to modify the query in step 1 to - return all the mappings at once, but this makes the query unreliable - sometimes it times out or returns - invalid JSON. 
Getting the mappings in chunks is much slower, but it works every time.) - """ - all_linked_ids = self._get_all_ids() - for raw_mapping in self._parallelise_requests( - iter(all_linked_ids), self._get_linked_id_mappings - ): - yield { - "wikidata_id": extract_wikidata_id(raw_mapping), - "linked_id": raw_mapping["linkedId"]["value"], - "type": "SAME_AS", - } - - def _stream_all_has_parent_edges(self) -> Generator[dict]: - """ - Stream raw 'HAS_PARENT' Wikidata edges, mapping child items to parent items. - """ - all_linked_ids = self._get_all_ids() - for raw_mapping in self._parallelise_requests( - iter(all_linked_ids), self._get_parent_id_mappings - ): - parent_id = extract_wikidata_id(raw_mapping) - child_id = extract_wikidata_id(raw_mapping, "child") - - if parent_id is not None and child_id is not None: - yield { - "child_id": child_id, - "parent_id": parent_id, - "type": "HAS_PARENT", - } + for edge in self._stream_all_has_parent_edges(): + parent_wikidata_id = edge["to_id"] + if parent_wikidata_id not in seen: + seen.add(parent_wikidata_id) + yield parent_wikidata_id def _stream_raw_edges(self) -> Generator[dict]: """ @@ -202,16 +184,24 @@ def _stream_raw_edges(self) -> Generator[dict]: # For example, if we are streaming Wikidata 'names' edges linked to LoC ids but the LoC id linked to some # Wikidata id is classified as a 'location', we skip it. This filtering process also removes mappings which # include invalid LoC ids (of which there are several thousand). - if self.id_type_checker.id_included_in_selected_type(edge["linked_id"]): - streamed_wikidata_ids.add(edge["wikidata_id"]) - yield edge + if is_id_classified_as_node_type( + edge["to_id"], self.linked_ontology, self.node_type + ): + streamed_wikidata_ids.add(edge["from_id"]) + yield {**edge, "type": "SAME_AS"} print("Streaming HAS_PARENT edges...") for edge in self._stream_all_has_parent_edges(): - # Only include an edge if its `child_id` was already streamed in a SAME_AS edge, indicating that + # Only include an edge if its `from_id` was already streamed in a SAME_AS edge, indicating that # the child item belongs under the selected `node_type`. - if edge["child_id"] in streamed_wikidata_ids: - yield edge + if edge["from_id"] in streamed_wikidata_ids: + yield {**edge, "type": "HAS_PARENT"} + + if self.node_type == "names": + print("Streaming HAS_FIELD_OF_WORK edges...") + for edge in self._stream_all_edges_by_type("field_of_work"): + if is_id_in_ontology(edge["to_id"], "wikidata"): + yield {**edge, "type": "HAS_FIELD_OF_WORK"} def _stream_raw_nodes(self) -> Generator[dict]: """ @@ -221,7 +211,7 @@ def _stream_raw_nodes(self) -> Generator[dict]: Wikidata fields required to create a node. 
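A condensed, standalone restatement of the dedup-and-filter flow in `_stream_filtered_wikidata_ids` above, with the ontology lookups replaced by plain callables standing in for `is_id_in_ontology` and `is_id_classified_as_node_type` (the lambdas and example ids are illustrative only):

```python
from collections.abc import Callable, Generator, Iterable


def filtered_wikidata_ids(
    same_as_edges: Iterable[dict],
    is_in_ontology: Callable[[str], bool],
    is_selected_node_type: Callable[[str], bool],
) -> Generator[str]:
    seen: set[str] = set()
    for edge in same_as_edges:
        wikidata_id, linked_id = edge["from_id"], edge["to_id"]
        if is_in_ontology(linked_id) and wikidata_id not in seen:
            # Mark the id as seen even when it is filtered out below, so that
            # it is never processed twice.
            seen.add(wikidata_id)
            if is_selected_node_type(linked_id):
                yield wikidata_id


edges = [
    {"from_id": "Q1", "to_id": "sh85040229"},
    {"from_id": "Q1", "to_id": "sh85053109"},  # same Wikidata id again: skipped
    {"from_id": "Q2", "to_id": "invalid-id"},  # linked id not in ontology: skipped
]
ids = filtered_wikidata_ids(edges, lambda i: i.startswith("sh"), lambda i: True)
assert list(ids) == ["Q1"]
```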
""" all_ids = self._stream_filtered_wikidata_ids() - yield from self._parallelise_requests(all_ids, self._get_wikidata_items) + yield from _parallelise_sparql_requests(all_ids, self._get_wikidata_items) def stream_raw(self) -> Generator[dict]: if self.entity_type == "nodes": diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index 082a05fc3d..8c0872082b 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -1,7 +1,15 @@ from typing import Literal -NodeType = Literal["concepts", "names", "locations"] -OntologyType = Literal["mesh", "loc"] +from utils.types import NodeType, OntologyType + + +WikidataEdgeQueryType = Literal[ + "same_as_loc", + "same_as_mesh", + "parent_instance_of", + "parent_subclass_of", + "field_of_work" +] class SparqlQueryBuilder: @@ -41,15 +49,6 @@ def _get_formatted_fields(node_type: NodeType) -> str: return " ".join(fields_with_aggregation) - @staticmethod - def _get_linked_ontology_filter(linked_ontology: OntologyType) -> str: - if linked_ontology == "loc": - return "?item p:P244/ps:P244 ?linkedId." - elif linked_ontology == "mesh": - return "?item p:P486/ps:P486 ?linkedId." - - raise ValueError(f"Invalid linked ontology type: {linked_ontology}") - @staticmethod def _get_formatted_field_mappings(node_type: NodeType) -> str: """ @@ -133,51 +132,31 @@ def get_items_query(cls, item_ids: list[str], node_type: NodeType) -> str: return SparqlQueryBuilder._compact_format_query(query) @classmethod - def get_linked_ids_query( - cls, item_ids: list[str], linked_ontology: OntologyType - ) -> str: - """ - Given a list of Wikidata `item_ids`, return a query to retrieve all linked ontology ids referenced by each - item in the list. - """ - ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in sorted(item_ids)]) - - query = f""" - SELECT DISTINCT ?item ?linkedId - WHERE {{ - VALUES ?item {{ {ids_clause} }} - {cls._get_linked_ontology_filter(linked_ontology)} - }} - """ - - return SparqlQueryBuilder._compact_format_query(query) - - @classmethod - def get_parents_query( - cls, - item_ids: list[str], - relationship_type: Literal["instance_of", "subclass_of"], - ) -> str: + def get_edge_query(cls, item_ids: list[str], edge_type: WikidataEdgeQueryType) -> str: """ - Given a list of Wikidata `item_ids`, return a query to retrieve all parents of each item in the list. - Parents are determined based on the 'subclass of' (P279) or the 'instance of' (P31) fields. + Given a list of Wikidata `item_ids`, return a query to retrieve all edges of type `edge_type` linking each + item in the list to a different Wikidata item. """ ids_clause = " ".join([f"wd:{wikidata_id}" for wikidata_id in sorted(item_ids)]) - if relationship_type == "instance_of": - relationship = "?child wdt:P31 ?item." - elif relationship_type == "subclass_of": - relationship = "?child wdt:P279 ?item." 
+ if edge_type == "same_as_loc": + property_path = "p:P244/ps:P244" + elif edge_type == "same_as_mesh": + property_path = "p:P486/ps:P486" + elif edge_type == "parent_instance_of": + property_path = "wdt:P31" + elif edge_type == "parent_subclass_of": + property_path = "wdt:P279" + elif edge_type == "field_of_work": + property_path = "wdt:P101" else: - raise ValueError( - f"Unknown parent/child relationship type: {relationship_type}" - ) + raise ValueError(f"Unknown edge type: {edge_type}") query = f""" - SELECT DISTINCT ?child ?item + SELECT DISTINCT ?fromItem ?toItem WHERE {{ - VALUES ?child {{ {ids_clause} }} - {relationship} + VALUES ?fromItem {{ {ids_clause} }} + ?fromItem {property_path} ?toItem. }} """ diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index 4704f6ff06..57617e5e5d 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -1,11 +1,14 @@ from collections.abc import Generator -from models.graph_edge import SourceConceptHasParent, SourceConceptSameAs -from models.graph_node import SourceConcept -from sources.wikidata.linked_ontology_source import ( - OntologyType, - WikidataLinkedOntologySource, +from models.graph_edge import ( + SourceConceptHasFieldOfWork, + SourceConceptHasParent, + SourceConceptSameAs, + BaseEdge, ) +from models.graph_node import SourceConcept +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from utils.types import OntologyType from transformers.base_transformer import BaseTransformer, EntityType from .raw_concept import RawWikidataConcept @@ -28,21 +31,28 @@ def transform_node(self, raw_node: dict) -> SourceConcept | None: description=raw_concept.description, ) - def extract_edges( - self, raw_edge: dict - ) -> Generator[SourceConceptSameAs | SourceConceptHasParent]: + def extract_edges(self, raw_edge: dict) -> Generator[BaseEdge]: if raw_edge["type"] == "SAME_AS": - linked_id, wikidata_id = raw_edge["linked_id"], raw_edge["wikidata_id"] edge_attributes = {"source": "wikidata"} yield SourceConceptSameAs( - from_id=linked_id, to_id=wikidata_id, attributes=edge_attributes + from_id=raw_edge["from_id"], + to_id=raw_edge["to_id"], + attributes=edge_attributes, ) yield SourceConceptSameAs( - from_id=wikidata_id, to_id=linked_id, attributes=edge_attributes + from_id=raw_edge["to_id"], + to_id=raw_edge["from_id"], + attributes=edge_attributes, ) elif raw_edge["type"] == "HAS_PARENT": yield SourceConceptHasParent( - from_id=raw_edge["child_id"], to_id=raw_edge["parent_id"] + from_id=raw_edge["from_id"], + to_id=raw_edge["to_id"], + ) + elif raw_edge["type"] == "HAS_FIELD_OF_WORK": + yield SourceConceptHasFieldOfWork( + from_id=raw_edge["from_id"], + to_id=raw_edge["to_id"], ) else: - raise ValueError(f"Unknown edge type f{raw_edge['type']}") + raise ValueError(f"Unknown edge type {raw_edge['type']}") diff --git a/src/transformers/wikidata/locations_transformer.py b/src/transformers/wikidata/locations_transformer.py index 2c320b89b7..46642c2f69 100644 --- a/src/transformers/wikidata/locations_transformer.py +++ b/src/transformers/wikidata/locations_transformer.py @@ -1,8 +1,6 @@ from models.graph_node import SourceLocation -from sources.wikidata.linked_ontology_source import ( - OntologyType, - WikidataLinkedOntologySource, -) +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from utils.types import OntologyType from transformers.base_transformer import EntityType 
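To make the new `get_edge_query` concrete, this is roughly the query text the template above produces for a couple of ids and the `"same_as_loc"` edge type. The ids are arbitrary examples, and the whitespace differs from the compacted form returned by `_compact_format_query`.

```python
item_ids = ["Q42", "Q5"]
property_path = "p:P244/ps:P244"  # the "same_as_loc" mapping above
ids_clause = " ".join(f"wd:{wikidata_id}" for wikidata_id in sorted(item_ids))

query = f"""
SELECT DISTINCT ?fromItem ?toItem
WHERE {{
  VALUES ?fromItem {{ {ids_clause} }}
  ?fromItem {property_path} ?toItem.
}}
"""
print(query)
```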
from .concepts_transformer import WikidataConceptsTransformer diff --git a/src/transformers/wikidata/names_transformer.py b/src/transformers/wikidata/names_transformer.py index 424fe2f8c0..1fb800de38 100644 --- a/src/transformers/wikidata/names_transformer.py +++ b/src/transformers/wikidata/names_transformer.py @@ -1,8 +1,6 @@ from models.graph_node import SourceName -from sources.wikidata.linked_ontology_source import ( - OntologyType, - WikidataLinkedOntologySource, -) +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from utils.types import OntologyType from transformers.base_transformer import EntityType from .concepts_transformer import WikidataConceptsTransformer diff --git a/src/utils/aws.py b/src/utils/aws.py index f29a867c4a..1042da852a 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -1,10 +1,16 @@ +import csv import json +from functools import lru_cache +from typing import Any, Generator import boto3 +import smart_open +from utils.types import NodeType, OntologyType from clients.base_neptune_client import BaseNeptuneClient from clients.lambda_neptune_client import LambdaNeptuneClient from clients.local_neptune_client import LocalNeptuneClient +from config import S3_BULK_LOAD_BUCKET_NAME LOAD_BALANCER_SECRET_NAME = "NeptuneTest/LoadBalancerUrl" INSTANCE_ENDPOINT_SECRET_NAME = "NeptuneTest/InstanceEndpoint" @@ -52,3 +58,22 @@ def get_neptune_client(is_local: bool) -> BaseNeptuneClient: ) else: return LambdaNeptuneClient(get_secret(INSTANCE_ENDPOINT_SECRET_NAME)) + + +@lru_cache +def fetch_transformer_output_from_s3( + ontology_type: OntologyType, node_type: NodeType +) -> Generator[list[Any]]: + """Retrieves the bulk load file outputted by the relevant transformer so that we can extract data from it.""" + linked_nodes_file_name = f"{ontology_type}_{node_type}__nodes.csv" + s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" + + transport_params = {"client": boto3.client("s3")} + with smart_open.open(s3_url, "r", transport_params=transport_params) as f: + csv_reader = csv.reader(f) + + for i, row in enumerate(csv_reader): + # Skip header + if i == 0: + continue + yield row diff --git a/src/utils/ontology_id_checker.py b/src/utils/ontology_id_checker.py new file mode 100644 index 0000000000..29c831e85e --- /dev/null +++ b/src/utils/ontology_id_checker.py @@ -0,0 +1,48 @@ +from functools import lru_cache + +from utils.aws import fetch_transformer_output_from_s3 +from utils.types import NodeType, OntologyType + +@lru_cache +def _get_ids_for_ontology_and_node_type( + ontology_type: OntologyType, node_type: NodeType +) -> set[str]: + """Return all ids classified under a given `node_type` for the selected ontology.""" + if node_type == "names" and ontology_type in ("mesh", "wikidata_linked_mesh"): + return set() + + print( + f"Retrieving ids of type '{node_type}' from ontology '{ontology_type}' from S3.", + end=" ", + flush=True, + ) + ids = {row[0] for row in fetch_transformer_output_from_s3(ontology_type, node_type)} + print(f"({len(ids)} ids retrieved.)") + + return ids + + +def is_id_classified_as_node_type( + item_id: str, item_ontology: OntologyType, node_type: NodeType +) -> bool: + """ + Return `True` if a given ontology id is classified under the selected node type (concepts, locations, or names). 
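A toy sketch of the memoise-then-check pattern used by the id checker: ids for each (ontology, node type) pair are loaded once, cached with `lru_cache`, and then membership-tested as a set. The `FAKE_BULK_LOAD_IDS` mapping and `ids_for` helper are stand-ins for the S3-backed `fetch_transformer_output_from_s3`, not the real module.

```python
from functools import lru_cache

# Toy stand-in for the S3 bulk load files; these ids are arbitrary examples.
FAKE_BULK_LOAD_IDS = {
    ("loc", "concepts"): {"sh90003066", "sh2010105253"},
    ("loc", "locations"): {"sh85040229", "sh85053109"},
    ("loc", "names"): set(),
}


@lru_cache
def ids_for(ontology: str, node_type: str) -> frozenset[str]:
    # The real module streams a bulk load CSV from S3 and keeps the id column;
    # here we just look the set up in the toy mapping above.
    return frozenset(FAKE_BULK_LOAD_IDS.get((ontology, node_type), set()))


def is_classified_as(item_id: str, ontology: str, node_type: str) -> bool:
    return item_id in ids_for(ontology, node_type)


assert is_classified_as("sh85040229", "loc", "locations")
assert not is_classified_as("sh85040229", "loc", "concepts")
```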
+ """ + return item_id in _get_ids_for_ontology_and_node_type(item_ontology, node_type) + + +def is_id_in_ontology(item_id: str, item_ontology: OntologyType) -> bool: + """ + Returns 'True' if the given id from the selected ontology is valid and if it has a corresponding + SourceConcept/SourceLocation/SourceName node in the catalogue graph. + """ + if item_ontology == "wikidata": + return is_id_in_ontology(item_id, "wikidata_linked_loc") or is_id_in_ontology( + item_id, "wikidata_linked_mesh" + ) + + return ( + is_id_classified_as_node_type(item_id, item_ontology, "concepts") + or is_id_classified_as_node_type(item_id, item_ontology, "locations") + or is_id_classified_as_node_type(item_id, item_ontology, "names") + ) diff --git a/src/utils/types.py b/src/utils/types.py new file mode 100644 index 0000000000..da668d0e37 --- /dev/null +++ b/src/utils/types.py @@ -0,0 +1,6 @@ +from typing import Literal + +NodeType = Literal["concepts", "names", "locations"] +OntologyType = Literal[ + "mesh", "loc", "wikidata_linked_mesh", "wikidata_linked_loc", "wikidata" +] From 1316a832b5d532f9e2ed2af2a8299efb10f4e0ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 6 Feb 2025 10:17:14 +0000 Subject: [PATCH 255/310] Split and expand Wikidata tests --- .../test_wikidata_concepts_transformer.py | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index da6f4ce5d5..40872e11a3 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -79,15 +79,19 @@ def test_wikidata_raw_name() -> None: assert raw_name.description == "American politician" -def test_wikidata_raw_location_invalid_coordinates() -> None: +def test_wikidata_raw_location_empty_coordinates() -> None: raw_location = RawWikidataLocation({}) assert raw_location.latitude is None assert raw_location.longitude is None + +def test_wikidata_raw_location_uri_type_coordinates() -> None: raw_location = RawWikidataLocation({"type": "uri", "value": "some-url"}) assert raw_location.latitude is None assert raw_location.longitude is None + +def test_wikidata_raw_location_invalid_coordinates() -> None: raw_location = RawWikidataLocation( { "item": {"type": "uri", "value": "some-id"}, @@ -101,16 +105,34 @@ def test_wikidata_raw_location_invalid_coordinates() -> None: _ = raw_location.longitude -def test_wikidata_raw_name_invalid_dates() -> None: +def test_wikidata_raw_name_invalid_date() -> None: raw_name = RawWikidataName( - {"dateOfBirth": {"type": "literal", "value": "+0000-00-00T00:00:00Z"}} + { + "dateOfBirth": {"type": "literal", "value": "+0000-00-00T00:00:00Z"}, + "dateOfDeath": {"type": "literal", "value": "+0000-00-00T00:00:00Z"}, + }, ) assert raw_name.date_of_birth is None + assert raw_name.date_of_death is None + - raw_name = RawWikidataName({"dateOfBirth": {"type": "uri", "value": "some-uri"}}) +def test_wikidata_raw_name_uri_type_date() -> None: + raw_name = RawWikidataName( + { + "dateOfBirth": {"type": "uri", "value": "some-uri"}, + "dateOfDeath": {"type": "uri", "value": "some-uri"}, + } + ) assert raw_name.date_of_birth is None + assert raw_name.date_of_death is None + +def test_wikidata_raw_name_uri_date() -> None: raw_name = RawWikidataName( - {"dateOfBirth": {"type": "literal", "value": "https://some-url"}} + { + "dateOfBirth": {"type": "literal", "value": "https://some-url"}, + 
"dateOfDeath": {"type": "literal", "value": "https://some-url"}, + } ) assert raw_name.date_of_birth is None + assert raw_name.date_of_death is None From 6714b07b349849c26ba6669941eb935da6133af8 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Thu, 6 Feb 2025 10:23:58 +0000 Subject: [PATCH 256/310] workflow, deploy scripts, gha role --- .github/workflows/ci.yml | 57 +++++++++++++++++++++++++++ scripts/deploy_lambda_service.sh | 41 +++++++++++++++++++ scripts/deploy_lambda_zip.sh | 36 +++++++++++++++++ terraform/gha_role.tf | 51 ++++++++++++++++++++++++ terraform/state_machine_extractors.tf | 3 -- terraform/terraform.tf | 4 +- 6 files changed, 188 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 scripts/deploy_lambda_service.sh create mode 100644 scripts/deploy_lambda_zip.sh create mode 100644 terraform/gha_role.tf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000..2d91414de2 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,57 @@ +name: "Run CI pipeline" + +on: + push: + branches: + - main + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v5 + with: + python-version: '3.13' + - name: CI setup + run: | + ./scripts/ci-setup.sh + - name: Test + run: | + ./scripts/test.sh + + build: + runs-on: ubuntu-latest + needs: test + if: needs.test.result == "success" + steps: + - uses: actions/checkout@v3 + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-west-1 + role-to-assume: ${{ secrets.CATALOGUE_GRAPH_CI_ROLE_ARN }} # to be created + - name: Build and push artefacts + run: | + ./scripts/build.sh --push + + deploy: + runs-on: ubuntu-latest + needs: build + if: needs.build.result == "success" + steps: + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-west-1 + role-to-assume: ${{ secrets.CATALOGUE_GRAPH_CI_ROLE_ARN }} # to be created + - name: Deploy bulk-loader lambda + run: | + ./scripts/deploy_lambda_zip.sh catalogue-graph-bulk-loader + - name: Deploy bulk-load-poller lambda + run: | + ./scripts/deploy_lambda_zip.sh catalogue-graph-bulk-load-poller + - name: Deploy indexer lambda + run: | + ./scripts/deploy_lambda_zip.sh catalogue-graph-indexer + - name: Deploy extractor lambda + run: | + ./scripts/deploy_lambda_service.sh catalogue-graph-extractor \ No newline at end of file diff --git a/scripts/deploy_lambda_service.sh b/scripts/deploy_lambda_service.sh new file mode 100644 index 0000000000..5b1885b67c --- /dev/null +++ b/scripts/deploy_lambda_service.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset +set -o pipefail + + +SERVICE_NAME=$1 + +FUNCTION_NAME="${SERVICE_NAME}" +REPOSITORY_URI="760097843905.dkr.ecr.eu-west-1.amazonaws.com" + +IMAGE_URI="${REPOSITORY_URI}/uk.ac.wellcome/${SERVICE_NAME}:dev" + +echo "Deploying ${IMAGE_URI} to ${SERVICE_NAME}, @ $(date) ..." + +echo "Current lambda configuration for ${SERVICE_NAME}:" +aws lambda get-function-configuration \ + --function-name "$SERVICE_NAME" \ + --no-cli-pager + +echo "Updating lambda configuration ..." 
+echo "Using ${IMAGE_URI}:" +aws lambda update-function-code \ + --function-name "$SERVICE_NAME" \ + --image-uri "${IMAGE_URI}" \ + --publish \ + --no-cli-pager + +echo "Updated lambda configuration, (waiting for update @ $(date)}):" +aws lambda wait function-updated \ + --function-name "$SERVICE_NAME" \ + --no-cli-pager + +echo "New lambda configuration complete (@ $(date)), config after change:" +aws lambda get-function-configuration \ + --function-name "$SERVICE_NAME" \ + --no-cli-pager + +echo "Done deploying ${SERVICE_NAME} @ $(date)! 🚀" +done \ No newline at end of file diff --git a/scripts/deploy_lambda_zip.sh b/scripts/deploy_lambda_zip.sh new file mode 100644 index 0000000000..49e8edaec2 --- /dev/null +++ b/scripts/deploy_lambda_zip.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +# Usage: ./deploy_lambda_zip.sh +# Example: ./deploy_lambda_zip.sh monitoring/ingest_inspector_backend ingest_inspector_backend + +set -o errexit +set -o nounset +set -o pipefail + +LAMBDA_NAME=$1 + +S3_BUCKET="wellcomecollection-platform-infra" +S3_KEY="lambdas/catalogue_graph/lambda-dev.zip" + +echo "Identifying function: $LAMBDA_NAME" +FUNCTION_ARN=$(aws lambda get-function-configuration \ + --function-name $LAMBDA_NAME \ + --query "FunctionArn" \ + --output text) + +echo "Updating function: $FUNCTION_ARN from s3://$S3_BUCKET/$S3_KEY" +REVISION_ID=$(aws lambda update-function-code \ + --function-name $LAMBDA_NAME \ + --s3-bucket $S3_BUCKET \ + --s3-key $S3_KEY \ + --query "RevisionId" \ + --output text \ + --publish) + +echo "Revision id: $REVISION_ID" + +echo "Awaiting function update" +aws lambda wait function-updated \ + --function-name $LAMBDA_NAME + +echo "Done" \ No newline at end of file diff --git a/terraform/gha_role.tf b/terraform/gha_role.tf new file mode 100644 index 0000000000..76506e19b9 --- /dev/null +++ b/terraform/gha_role.tf @@ -0,0 +1,51 @@ +module "gha_catalogue_graph_ci_role" { + source = "github.com/wellcomecollection/terraform-aws-gha-role?ref=v1.0.0" + + policy_document = data.aws_iam_policy_document.gha_catalogue_graph_ci.json + github_repository = "wellcomecollection/catalogue-graph" + role_name = "catalogue-graph-ci" + github_oidc_provider_arn = data.terraform_remote_state.aws_account_infrastructure.outputs.github_openid_connect_provider_arn +} + +data "aws_iam_policy_document" "gha_catalogue_graph_ci" { + statement { + actions = [ + "s3:PutObject" + ] + resources = [ + "arn:aws:s3:::wellcomecollection-platform-infra/lambdas/catalogue_graph/*" + ] + } + statement { + actions = [ + "ecr:BatchCheckLayerAvailability", + "ecr:Describe*", + "ecr:Get*", + "ecr:List*", + "ecr:TagResource", + "ecr:PutImage", + "ecr:InitiateLayerUpload", + "ecr:UploadLayerPart", + "ecr:CompleteLayerUpload", + ] + resources = [ + "arn:aws:ecr:eu-west-1:760097843905:repository/uk.ac.wellcome/catalogue_graph_extractor" + ] + } + statement { + actions = [ + "lambda:GetFunctionConfiguration", + "lambda:UpdateFunctionCode" + ] + resources = [ + "arn:aws:lambda:eu-west-1:760097843905:function:catalogue-graph-extractor", + "arn:aws:lambda:eu-west-1:760097843905:function:catalogue-graph-bulk-loader", + "arn:aws:lambda:eu-west-1:760097843905:function:catalogue-graph-bulk-load-poller", + "arn:aws:lambda:eu-west-1:760097843905:function:catalogue-graph-indexer" + ] + } +} + +output "gha_catalogue_graph_ci_role_arn" { + value = module.gha_catalogue_graph_ci_role.outputs.role_arn +} \ No newline at end of file diff --git a/terraform/state_machine_extractors.tf b/terraform/state_machine_extractors.tf index 
44fc6583c4..c035bf6f49 100644 --- a/terraform/state_machine_extractors.tf +++ b/terraform/state_machine_extractors.tf @@ -1,6 +1,3 @@ -locals { - extractor_lambda = "${module.extractor_lambda.lambda.arn}:${module.extractor_lambda.lambda.version}" -} resource "aws_sfn_state_machine" "catalogue_graph_extractors" { name = "catalogue-graph-extractors" role_arn = aws_iam_role.state_machine_execution_role.arn diff --git a/terraform/terraform.tf b/terraform/terraform.tf index 0f83de8b8a..827029002e 100644 --- a/terraform/terraform.tf +++ b/terraform/terraform.tf @@ -29,7 +29,9 @@ data "terraform_remote_state" "shared_infra" { backend = "s3" config = { - role_arn = "arn:aws:iam::760097843905:role/platform-read_only" + assume_role = { + role_arn = "arn:aws:iam::760097843905:role/platform-read_only" + } bucket = "wellcomecollection-platform-infra" key = "terraform/platform-infrastructure/shared.tfstate" region = "eu-west-1" From 10f50dc4dbcf8c5a15a4f38eb20e2d2250e6a8ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 6 Feb 2025 11:06:05 +0000 Subject: [PATCH 257/310] Organise test fixtures into folders --- .../loc_subjects_diverse_example.jsonld | 0 tests/fixtures/{ => loc}/mads_781_pair.ndjson | 0 .../{ => loc}/mads_composite_concept.json | 0 .../{ => loc}/mads_composite_name.json | 0 tests/fixtures/{ => loc}/mads_concept.json | 0 .../{ => loc}/mads_deprecated_concept.json | 0 .../{ => loc}/mads_geographic_concept.json | 0 .../mads_narrower_authority_concept.json | 0 .../{ => loc}/mads_related_concept.json | 0 .../{ => loc}/mads_variant_concept.json | 0 tests/fixtures/{ => loc}/skos_concept.json | 0 .../{ => loc}/skos_deprecated_concept.json | 0 .../{ => loc}/skos_geographic_concept.json | 0 .../{ => loc}/skos_related_concept.json | 0 tests/transformers/loc/test_concepts.py | 24 +++++++++---------- tests/transformers/loc/test_raw_concept.py | 6 +++-- .../transformers/loc/test_raw_mads_concept.py | 9 ++++--- .../transformers/loc/test_raw_skos_concept.py | 4 ++-- 18 files changed, 22 insertions(+), 21 deletions(-) rename tests/fixtures/{ => loc}/loc_subjects_diverse_example.jsonld (100%) rename tests/fixtures/{ => loc}/mads_781_pair.ndjson (100%) rename tests/fixtures/{ => loc}/mads_composite_concept.json (100%) rename tests/fixtures/{ => loc}/mads_composite_name.json (100%) rename tests/fixtures/{ => loc}/mads_concept.json (100%) rename tests/fixtures/{ => loc}/mads_deprecated_concept.json (100%) rename tests/fixtures/{ => loc}/mads_geographic_concept.json (100%) rename tests/fixtures/{ => loc}/mads_narrower_authority_concept.json (100%) rename tests/fixtures/{ => loc}/mads_related_concept.json (100%) rename tests/fixtures/{ => loc}/mads_variant_concept.json (100%) rename tests/fixtures/{ => loc}/skos_concept.json (100%) rename tests/fixtures/{ => loc}/skos_deprecated_concept.json (100%) rename tests/fixtures/{ => loc}/skos_geographic_concept.json (100%) rename tests/fixtures/{ => loc}/skos_related_concept.json (100%) diff --git a/tests/fixtures/loc_subjects_diverse_example.jsonld b/tests/fixtures/loc/loc_subjects_diverse_example.jsonld similarity index 100% rename from tests/fixtures/loc_subjects_diverse_example.jsonld rename to tests/fixtures/loc/loc_subjects_diverse_example.jsonld diff --git a/tests/fixtures/mads_781_pair.ndjson b/tests/fixtures/loc/mads_781_pair.ndjson similarity index 100% rename from tests/fixtures/mads_781_pair.ndjson rename to tests/fixtures/loc/mads_781_pair.ndjson diff --git a/tests/fixtures/mads_composite_concept.json 
b/tests/fixtures/loc/mads_composite_concept.json similarity index 100% rename from tests/fixtures/mads_composite_concept.json rename to tests/fixtures/loc/mads_composite_concept.json diff --git a/tests/fixtures/mads_composite_name.json b/tests/fixtures/loc/mads_composite_name.json similarity index 100% rename from tests/fixtures/mads_composite_name.json rename to tests/fixtures/loc/mads_composite_name.json diff --git a/tests/fixtures/mads_concept.json b/tests/fixtures/loc/mads_concept.json similarity index 100% rename from tests/fixtures/mads_concept.json rename to tests/fixtures/loc/mads_concept.json diff --git a/tests/fixtures/mads_deprecated_concept.json b/tests/fixtures/loc/mads_deprecated_concept.json similarity index 100% rename from tests/fixtures/mads_deprecated_concept.json rename to tests/fixtures/loc/mads_deprecated_concept.json diff --git a/tests/fixtures/mads_geographic_concept.json b/tests/fixtures/loc/mads_geographic_concept.json similarity index 100% rename from tests/fixtures/mads_geographic_concept.json rename to tests/fixtures/loc/mads_geographic_concept.json diff --git a/tests/fixtures/mads_narrower_authority_concept.json b/tests/fixtures/loc/mads_narrower_authority_concept.json similarity index 100% rename from tests/fixtures/mads_narrower_authority_concept.json rename to tests/fixtures/loc/mads_narrower_authority_concept.json diff --git a/tests/fixtures/mads_related_concept.json b/tests/fixtures/loc/mads_related_concept.json similarity index 100% rename from tests/fixtures/mads_related_concept.json rename to tests/fixtures/loc/mads_related_concept.json diff --git a/tests/fixtures/mads_variant_concept.json b/tests/fixtures/loc/mads_variant_concept.json similarity index 100% rename from tests/fixtures/mads_variant_concept.json rename to tests/fixtures/loc/mads_variant_concept.json diff --git a/tests/fixtures/skos_concept.json b/tests/fixtures/loc/skos_concept.json similarity index 100% rename from tests/fixtures/skos_concept.json rename to tests/fixtures/loc/skos_concept.json diff --git a/tests/fixtures/skos_deprecated_concept.json b/tests/fixtures/loc/skos_deprecated_concept.json similarity index 100% rename from tests/fixtures/skos_deprecated_concept.json rename to tests/fixtures/loc/skos_deprecated_concept.json diff --git a/tests/fixtures/skos_geographic_concept.json b/tests/fixtures/loc/skos_geographic_concept.json similarity index 100% rename from tests/fixtures/skos_geographic_concept.json rename to tests/fixtures/loc/skos_geographic_concept.json diff --git a/tests/fixtures/skos_related_concept.json b/tests/fixtures/loc/skos_related_concept.json similarity index 100% rename from tests/fixtures/skos_related_concept.json rename to tests/fixtures/loc/skos_related_concept.json diff --git a/tests/transformers/loc/test_concepts.py b/tests/transformers/loc/test_concepts.py index 8d3c996dda..df9b62924e 100644 --- a/tests/transformers/loc/test_concepts.py +++ b/tests/transformers/loc/test_concepts.py @@ -26,10 +26,10 @@ def test_loc_concept_transformer_resilience() -> None: "json_data": None, "content_bytes": jsons_to_ndjson( [ - "mads_geographic_concept.json", # geographic concepts are not included in the concepts transformer output - "mads_composite_concept.json", - "mads_deprecated_concept.json", # This one is deprecated, so is not included in the output - "mads_narrower_authority_concept.json", + "loc/mads_geographic_concept.json", # geographic concepts are not included in the concepts transformer output + "loc/mads_composite_concept.json", + 
"loc/mads_deprecated_concept.json", # This one is deprecated, so is not included in the output + "loc/mads_narrower_authority_concept.json", ] ), "params": None, @@ -38,7 +38,7 @@ def test_loc_concept_transformer_resilience() -> None: ) concepts_transformer = LibraryOfCongressConceptsTransformer(test_url) - nodes = list(concepts_transformer.stream(entity_type="nodes", query_chunk_size=1)) + nodes = concepts_transformer._stream_nodes() # mads_composite_concept and mads_narrower_authority_concept assert len(list(nodes)) == 2 @@ -56,10 +56,10 @@ def test_loc_location_transformer_resilience() -> None: "json_data": None, "content_bytes": jsons_to_ndjson( [ - "mads_geographic_concept.json", # Only geographic concepts included in the location transformer output - "mads_composite_concept.json", - "mads_deprecated_concept.json", - "mads_narrower_authority_concept.json", + "loc/mads_geographic_concept.json", # Only geographic concepts included in the location transformer output + "loc/mads_composite_concept.json", + "loc/mads_deprecated_concept.json", + "loc/mads_narrower_authority_concept.json", ] ), "params": None, @@ -69,7 +69,7 @@ def test_loc_location_transformer_resilience() -> None: "url": test_url_names, "status_code": 200, "json_data": None, - "content_bytes": load_fixture("loc_names_example.jsonld"), + "content_bytes": load_fixture("loc/raw_names.jsonld"), "params": None, }, ] @@ -77,7 +77,7 @@ def test_loc_location_transformer_resilience() -> None: locations_transformer = LibraryOfCongressLocationsTransformer( test_url_subjects, test_url_names ) - nodes = list(locations_transformer.stream(entity_type="nodes", query_chunk_size=1)) + nodes = locations_transformer._stream_nodes() # Caversham Park from mads_geographic_concept # and Budapest (Hungary) from loc_names_example assert len(list(nodes)) == 2 @@ -98,4 +98,4 @@ def test_empty_source() -> None: ] ) transformer = LibraryOfCongressConceptsTransformer("/dev/null") - assert list(transformer.stream(entity_type="nodes", query_chunk_size=1)) == [] + assert list(transformer._stream_nodes()) == [] diff --git a/tests/transformers/loc/test_raw_concept.py b/tests/transformers/loc/test_raw_concept.py index 6a9625ded8..d1fbc7d8f3 100644 --- a/tests/transformers/loc/test_raw_concept.py +++ b/tests/transformers/loc/test_raw_concept.py @@ -153,12 +153,14 @@ def test_is_geographic( A concept is geographic if its @type list contains madsrdf:Geographic or http://id.loc.gov/datatypes/codes/gac" """ concept = sut_class( - json.loads(load_fixture(f"{source_format}_geographic_concept.json")) + json.loads(load_fixture(f"loc/{source_format}_geographic_concept.json")) ) assert concept.is_geographic == True def test_is_not_geographic( self, sut_class: Type[RawLibraryOfCongressConcept], source_format: str ) -> None: - concept = sut_class(json.loads(load_fixture(f"{source_format}_concept.json"))) + concept = sut_class( + json.loads(load_fixture(f"loc/{source_format}_concept.json")) + ) assert concept.is_geographic == False diff --git a/tests/transformers/loc/test_raw_mads_concept.py b/tests/transformers/loc/test_raw_mads_concept.py index ef6fe6264d..369964f5f2 100644 --- a/tests/transformers/loc/test_raw_mads_concept.py +++ b/tests/transformers/loc/test_raw_mads_concept.py @@ -10,7 +10,7 @@ def test_label() -> None: Label is extracted from madsrdf:authoritativeLabel """ concept = RawLibraryOfCongressMADSConcept( - json.loads(load_fixture("mads_concept.json")) + json.loads(load_fixture("loc/mads_concept.json")) ) assert concept.label == "Stump work" @@ -18,7 +18,7 
@@ def test_label() -> None: class TestBroaderConcepts: def test_real_example(self) -> None: concept = RawLibraryOfCongressMADSConcept( - json.loads(load_fixture("mads_geographic_concept.json")) + json.loads(load_fixture("loc/mads_geographic_concept.json")) ) assert concept.broader_concept_ids == ["sh85040229", "sh85053109", "sh92006359"] @@ -74,7 +74,7 @@ def test_real_example(self) -> None: # This helps to give confidence that the whole test isn't just # passing due to a bogus assumption when making artificial test data. concept = RawLibraryOfCongressMADSConcept( - json.loads(load_fixture("mads_related_concept.json")) + json.loads(load_fixture("loc/mads_related_concept.json")) ) assert concept.related_concept_ids == ["sh90003066"] @@ -123,7 +123,6 @@ def test_ignore_underscore_n(self) -> None: class TestNarrower: - def test_get_no_narrowers(self) -> None: concept = RawLibraryOfCongressMADSConcept( {"@id": "/authorities/subjects/sh2010105253", "@graph": []} @@ -180,7 +179,7 @@ def test_get_narrowers_from_both(self) -> None: def test_alternative_labels() -> None: concept = RawLibraryOfCongressMADSConcept( - json.loads(load_fixture("mads_related_concept.json")) + json.loads(load_fixture("loc/mads_related_concept.json")) ) assert set(concept.alternative_labels) == { "Loop blocking (Computer science)", diff --git a/tests/transformers/loc/test_raw_skos_concept.py b/tests/transformers/loc/test_raw_skos_concept.py index 4644c10131..86f5750316 100644 --- a/tests/transformers/loc/test_raw_skos_concept.py +++ b/tests/transformers/loc/test_raw_skos_concept.py @@ -10,13 +10,13 @@ def test_label() -> None: Label is extracted from skos:prefLabel """ concept = RawLibraryOfCongressSKOSConcept( - json.loads(load_fixture("skos_concept.json")) + json.loads(load_fixture("loc/skos_concept.json")) ) assert concept.label == "Pickling" def test_broader_concepts() -> None: concept = RawLibraryOfCongressSKOSConcept( - json.loads(load_fixture("skos_geographic_concept.json")) + json.loads(load_fixture("loc/skos_geographic_concept.json")) ) assert concept.broader_concept_ids == ["sh85040229", "sh85053109", "sh92006359"] From 53260553030f191849e08c555cab2d224bfe1801 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 6 Feb 2025 11:14:03 +0000 Subject: [PATCH 258/310] Changes to make ruff linter happy --- src/sources/gzip_source.py | 1 - tests/transformers/loc/test_raw_concept.py | 37 +++++++++++----------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/sources/gzip_source.py b/src/sources/gzip_source.py index 03bbd4b6b1..59755a9366 100644 --- a/src/sources/gzip_source.py +++ b/src/sources/gzip_source.py @@ -1,7 +1,6 @@ import gzip import json from collections.abc import Generator -from json import JSONDecodeError import requests diff --git a/tests/transformers/loc/test_raw_concept.py b/tests/transformers/loc/test_raw_concept.py index d1fbc7d8f3..9bf7a7af4b 100644 --- a/tests/transformers/loc/test_raw_concept.py +++ b/tests/transformers/loc/test_raw_concept.py @@ -1,5 +1,4 @@ import json -from typing import Type import pytest from test_utils import load_fixture @@ -16,7 +15,7 @@ ) class TestSourceId: def test_remove_prefix_noop( - self, sut_class: Type[RawLibraryOfCongressConcept] + self, sut_class: type[RawLibraryOfCongressConcept] ) -> None: """ If there is no prefix to remove, remove_id_prefix will do nothing @@ -24,7 +23,7 @@ def test_remove_prefix_noop( assert sut_class({"@id": "sh1234567890"}).source_id == "sh1234567890" def 
test_remove_prefix_fully_qualified( - self, sut_class: Type[RawLibraryOfCongressConcept] + self, sut_class: type[RawLibraryOfCongressConcept] ) -> None: """ remove_id_prefix removes fully-qualified URL-style prefixes @@ -43,7 +42,7 @@ def test_remove_prefix_fully_qualified( ) def test_remove_prefix_relative( - self, sut_class: Type[RawLibraryOfCongressConcept] + self, sut_class: type[RawLibraryOfCongressConcept] ) -> None: """ remove_id_prefix removes relative/local prefixes @@ -58,7 +57,7 @@ def test_remove_prefix_relative( ) def test_remove_prefix_lookalikes( - self, sut_class: Type[RawLibraryOfCongressConcept] + self, sut_class: type[RawLibraryOfCongressConcept] ) -> None: """ remove_id_prefix only removes specific known prefixes, @@ -81,7 +80,7 @@ def test_remove_prefix_lookalikes( ) class TestSource: def test_source_subjects( - self, sut_class: Type[RawLibraryOfCongressConcept] + self, sut_class: type[RawLibraryOfCongressConcept] ) -> None: """ Given an id with the prefix /authorities/subjects/, the source will be lc-subjects @@ -89,24 +88,24 @@ def test_source_subjects( concept = sut_class({"@id": "/authorities/subjects/sh2010105253"}) assert concept.source == "lc-subjects" - def test_source_names(self, sut_class: Type[RawLibraryOfCongressConcept]) -> None: + def test_source_names(self, sut_class: type[RawLibraryOfCongressConcept]) -> None: """ Given an id with the prefix /authorities/subjects/, the source will be lc-subjects """ concept = sut_class({"@id": "/authorities/names/sh2010105253"}) assert concept.source == "lc-names" - def test_source_invalid(self, sut_class: Type[RawLibraryOfCongressConcept]) -> None: + def test_source_invalid(self, sut_class: type[RawLibraryOfCongressConcept]) -> None: with pytest.raises(ValueError): concept = sut_class({"@id": "authorities/childrensSubjects/sj2021051581"}) - concept.source + _ = concept.source @pytest.mark.parametrize( "sut_class", [RawLibraryOfCongressSKOSConcept, RawLibraryOfCongressMADSConcept] ) class TestExclusion: - def test_do_not_exclude(self, sut_class: Type[RawLibraryOfCongressConcept]) -> None: + def test_do_not_exclude(self, sut_class: type[RawLibraryOfCongressConcept]) -> None: """ A record with a corresponding node in its internal graph, and which is not a duplicate, should be included in the output @@ -114,20 +113,20 @@ def test_do_not_exclude(self, sut_class: Type[RawLibraryOfCongressConcept]) -> N concept = sut_class({"@id": "authorities/names/sh2010105253", "@graph": []}) # The SUT at this point doesn't actually care what the node is, just that it exists concept._raw_concept_node = {} - assert concept.exclude() == False + assert concept.exclude() is False def test_exclude_no_node( - self, sut_class: Type[RawLibraryOfCongressConcept] + self, sut_class: type[RawLibraryOfCongressConcept] ) -> None: """ If a record does not contain a corresponding node in its internal graph then it should be excluded """ concept = sut_class({"@id": "authorities/names/sh2010105253", "@graph": []}) - assert concept.exclude() == True + assert concept.exclude() def test_exclude_marked_duplicates( - self, sut_class: Type[RawLibraryOfCongressConcept] + self, sut_class: type[RawLibraryOfCongressConcept] ) -> None: """ If a record's identifier is suffixed with -781, this marks the entry as a duplicate @@ -135,7 +134,7 @@ def test_exclude_marked_duplicates( """ concept = sut_class({"@id": "authorities/names/sh2010105253-781", "@graph": []}) concept._raw_concept_node = {} - assert concept.exclude() == True + assert concept.exclude() 
@pytest.mark.parametrize( @@ -147,7 +146,7 @@ def test_exclude_marked_duplicates( ) class TestGeographic: def test_is_geographic( - self, sut_class: Type[RawLibraryOfCongressConcept], source_format: str + self, sut_class: type[RawLibraryOfCongressConcept], source_format: str ) -> None: """ A concept is geographic if its @type list contains madsrdf:Geographic or http://id.loc.gov/datatypes/codes/gac" @@ -155,12 +154,12 @@ def test_is_geographic( concept = sut_class( json.loads(load_fixture(f"loc/{source_format}_geographic_concept.json")) ) - assert concept.is_geographic == True + assert concept.is_geographic def test_is_not_geographic( - self, sut_class: Type[RawLibraryOfCongressConcept], source_format: str + self, sut_class: type[RawLibraryOfCongressConcept], source_format: str ) -> None: concept = sut_class( json.loads(load_fixture(f"loc/{source_format}_concept.json")) ) - assert concept.is_geographic == False + assert concept.is_geographic is False From cd354365ea40092f795123634bed6ea132b0acfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 6 Feb 2025 11:55:29 +0000 Subject: [PATCH 259/310] Adjust Wikidata tests --- src/extractor.py | 12 +++++----- .../wikidata/linked_ontology_source.py | 22 +++++++++---------- src/sources/wikidata/sparql_query_builder.py | 7 +++--- .../wikidata/concepts_transformer.py | 4 ++-- .../wikidata/locations_transformer.py | 2 +- .../wikidata/names_transformer.py | 2 +- src/utils/aws.py | 9 ++++---- src/utils/ontology_id_checker.py | 1 + tests/fixtures/wikidata/linked_ids_query.json | 2 +- .../wikidata/linked_ids_response.json | 8 +++---- .../wikidata/parents_instance_of_query.json | 2 +- .../parents_instance_of_response.json | 8 +++---- .../wikidata/parents_subclass_of_query.json | 2 +- .../parents_subclass_of_response.json | 4 ++-- .../sources/test_wikidata_concepts_source.py | 19 ++++++++-------- .../test_wikidata_concepts_transformer.py | 10 +++++++++ 16 files changed, 63 insertions(+), 51 deletions(-) diff --git a/src/extractor.py b/src/extractor.py index cb4ca44474..2be14bde3c 100755 --- a/src/extractor.py +++ b/src/extractor.py @@ -28,9 +28,9 @@ def handler( f"transformer and streaming them into {stream_destination}." ) - assert ( - config.S3_BULK_LOAD_BUCKET_NAME is not None - ), "the S3_BULK_LOAD_BUCKET_NAME environment variable must be defined." + assert config.S3_BULK_LOAD_BUCKET_NAME is not None, ( + "The S3_BULK_LOAD_BUCKET_NAME environment variable must be defined." + ) transformer: BaseTransformer = create_transformer(transformer_type, entity_type) @@ -43,9 +43,9 @@ def handler( transformer.stream_to_s3(s3_uri, entity_type, sample_size) elif stream_destination == "sns": topic_arn = config.GRAPH_QUERIES_SNS_TOPIC_ARN - assert ( - topic_arn is not None - ), "To stream to SNS, the GRAPH_QUERIES_SNS_TOPIC_ARN environment variable must be defined." + assert topic_arn is not None, ( + "To stream to SNS, the GRAPH_QUERIES_SNS_TOPIC_ARN environment variable must be defined." 
+ ) transformer.stream_to_sns(topic_arn, entity_type, sample_size) elif stream_destination == "local": diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 58908cfd40..263d644c02 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -7,10 +7,10 @@ from utils.ontology_id_checker import is_id_classified_as_node_type, is_id_in_ontology from utils.streaming import process_stream_in_parallel from utils.types import NodeType, OntologyType + from .sparql_client import SPARQL_MAX_PARALLEL_QUERIES, WikidataSparqlClient from .sparql_query_builder import SparqlQueryBuilder, WikidataEdgeQueryType - SPARQL_ITEMS_CHUNK_SIZE = 400 WIKIDATA_ID_PREFIX = "http://www.wikidata.org/entity/" @@ -19,15 +19,14 @@ def _parallelise_sparql_requests( items: Iterator, run_sparql_query: Callable[[list], list] ) -> Generator: - """Accept an `items` generator and a `run_sparql_query` method. Split `items` chunks and apply + """Accept an `items` generator and a `run_sparql_query` method. Split `items` into chunks and apply `run_sparql_query` to each chunk. Return a single generator of results.""" - for raw_response_item in process_stream_in_parallel( + yield from process_stream_in_parallel( items, run_sparql_query, SPARQL_ITEMS_CHUNK_SIZE, SPARQL_MAX_PARALLEL_QUERIES, - ): - yield raw_response_item + ) def extract_wikidata_id(item: dict, key: str = "item") -> str | None: @@ -122,6 +121,8 @@ def get_edges(wikidata_ids: list[str]) -> list[dict]: for raw_mapping in _parallelise_sparql_requests(iter(all_ids), get_edges): from_id = extract_wikidata_id(raw_mapping, "fromItem") + # The 'toItem' ids of SAME_AS edges are MeSH/LoC ids, so we take the raw value instead of extracting + # the ids via the `extract_wikidata_id` function if edge_type in ("same_as_mesh", "same_as_loc"): to_id = raw_mapping["toItem"]["value"] else: @@ -147,12 +148,9 @@ def _stream_filtered_wikidata_ids(self) -> Generator[str]: # Stream all SAME_AS edges and extract Wikidata ids from them, making sure to deduplicate # (a given Wikidata id can appear in more than one edge). for edge in self._stream_all_same_as_edges(): - wikidata_id = edge["from_id"] - linked_id = edge["to_id"] - if ( - is_id_in_ontology(linked_id, self.linked_ontology) - and wikidata_id not in seen - ): + wikidata_id, linked_id = edge["from_id"], edge["to_id"] + linked_id_is_valid = is_id_in_ontology(linked_id, self.linked_ontology) + if linked_id_is_valid and wikidata_id not in seen: # Add Wikidata id to `seen` no matter if it's part of the selected node type # to make sure it is not processed again as a parent below. 
seen.add(wikidata_id) @@ -197,9 +195,11 @@ def _stream_raw_edges(self) -> Generator[dict]: if edge["from_id"] in streamed_wikidata_ids: yield {**edge, "type": "HAS_PARENT"} + # HAS_FIELD_OF_WORK edges only apply to people (SourceName) nodes if self.node_type == "names": print("Streaming HAS_FIELD_OF_WORK edges...") for edge in self._stream_all_edges_by_type("field_of_work"): + # Only include an edge if its `to_id` has a corresponding concept node in the graph if is_id_in_ontology(edge["to_id"], "wikidata"): yield {**edge, "type": "HAS_FIELD_OF_WORK"} diff --git a/src/sources/wikidata/sparql_query_builder.py b/src/sources/wikidata/sparql_query_builder.py index a923e468eb..db37ad0fe3 100644 --- a/src/sources/wikidata/sparql_query_builder.py +++ b/src/sources/wikidata/sparql_query_builder.py @@ -2,13 +2,12 @@ from utils.types import NodeType, OntologyType - WikidataEdgeQueryType = Literal[ "same_as_loc", "same_as_mesh", "parent_instance_of", "parent_subclass_of", - "field_of_work" + "field_of_work", ] @@ -132,7 +131,9 @@ def get_items_query(cls, item_ids: list[str], node_type: NodeType) -> str: return SparqlQueryBuilder._compact_format_query(query) @classmethod - def get_edge_query(cls, item_ids: list[str], edge_type: WikidataEdgeQueryType) -> str: + def get_edge_query( + cls, item_ids: list[str], edge_type: WikidataEdgeQueryType + ) -> str: """ Given a list of Wikidata `item_ids`, return a query to retrieve all edges of type `edge_type` linking each item in the list to a different Wikidata item. diff --git a/src/transformers/wikidata/concepts_transformer.py b/src/transformers/wikidata/concepts_transformer.py index 57617e5e5d..9de8902d50 100644 --- a/src/transformers/wikidata/concepts_transformer.py +++ b/src/transformers/wikidata/concepts_transformer.py @@ -1,15 +1,15 @@ from collections.abc import Generator from models.graph_edge import ( + BaseEdge, SourceConceptHasFieldOfWork, SourceConceptHasParent, SourceConceptSameAs, - BaseEdge, ) from models.graph_node import SourceConcept from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource -from utils.types import OntologyType from transformers.base_transformer import BaseTransformer, EntityType +from utils.types import OntologyType from .raw_concept import RawWikidataConcept diff --git a/src/transformers/wikidata/locations_transformer.py b/src/transformers/wikidata/locations_transformer.py index 1b6e487d3d..72c6b50509 100644 --- a/src/transformers/wikidata/locations_transformer.py +++ b/src/transformers/wikidata/locations_transformer.py @@ -1,7 +1,7 @@ from models.graph_node import SourceLocation from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource -from utils.types import OntologyType from transformers.base_transformer import EntityType +from utils.types import OntologyType from .concepts_transformer import WikidataConceptsTransformer from .raw_concept import RawWikidataLocation diff --git a/src/transformers/wikidata/names_transformer.py b/src/transformers/wikidata/names_transformer.py index 1fb800de38..1e7c11dc65 100644 --- a/src/transformers/wikidata/names_transformer.py +++ b/src/transformers/wikidata/names_transformer.py @@ -1,7 +1,7 @@ from models.graph_node import SourceName from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource -from utils.types import OntologyType from transformers.base_transformer import EntityType +from utils.types import OntologyType from .concepts_transformer import WikidataConceptsTransformer from .raw_concept import RawWikidataName 
diff --git a/src/utils/aws.py b/src/utils/aws.py index 1042da852a..49cfe9d127 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -1,16 +1,17 @@ import csv import json +from collections.abc import Generator from functools import lru_cache -from typing import Any, Generator +from typing import Any import boto3 import smart_open -from utils.types import NodeType, OntologyType +import config from clients.base_neptune_client import BaseNeptuneClient from clients.lambda_neptune_client import LambdaNeptuneClient from clients.local_neptune_client import LocalNeptuneClient -from config import S3_BULK_LOAD_BUCKET_NAME +from utils.types import NodeType, OntologyType LOAD_BALANCER_SECRET_NAME = "NeptuneTest/LoadBalancerUrl" INSTANCE_ENDPOINT_SECRET_NAME = "NeptuneTest/InstanceEndpoint" @@ -66,7 +67,7 @@ def fetch_transformer_output_from_s3( ) -> Generator[list[Any]]: """Retrieves the bulk load file outputted by the relevant transformer so that we can extract data from it.""" linked_nodes_file_name = f"{ontology_type}_{node_type}__nodes.csv" - s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" + s3_url = f"s3://{config.S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" transport_params = {"client": boto3.client("s3")} with smart_open.open(s3_url, "r", transport_params=transport_params) as f: diff --git a/src/utils/ontology_id_checker.py b/src/utils/ontology_id_checker.py index 29c831e85e..cfc37c92f8 100644 --- a/src/utils/ontology_id_checker.py +++ b/src/utils/ontology_id_checker.py @@ -3,6 +3,7 @@ from utils.aws import fetch_transformer_output_from_s3 from utils.types import NodeType, OntologyType + @lru_cache def _get_ids_for_ontology_and_node_type( ontology_type: OntologyType, node_type: NodeType diff --git a/tests/fixtures/wikidata/linked_ids_query.json b/tests/fixtures/wikidata/linked_ids_query.json index d3874e7c50..5ead8e25f9 100644 --- a/tests/fixtures/wikidata/linked_ids_query.json +++ b/tests/fixtures/wikidata/linked_ids_query.json @@ -1,4 +1,4 @@ { "format": "json", - "query": "SELECT DISTINCT ?item ?linkedId WHERE { VALUES ?item { wd:Q1 wd:Q2 } ?item p:P244/ps:P244 ?linkedId. }" + "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q2 } ?fromItem p:P244/ps:P244 ?toItem. }" } diff --git a/tests/fixtures/wikidata/linked_ids_response.json b/tests/fixtures/wikidata/linked_ids_response.json index 039a2b4327..b24a88f36f 100644 --- a/tests/fixtures/wikidata/linked_ids_response.json +++ b/tests/fixtures/wikidata/linked_ids_response.json @@ -2,21 +2,21 @@ "results": { "bindings": [ { - "item": { + "fromItem": { "type": "uri", "value": "http://www.wikidata.org/entity/Q1" }, - "linkedId": { + "toItem": { "type": "uri", "value": "sh00000001" } }, { - "item": { + "fromItem": { "type": "uri", "value": "http://www.wikidata.org/entity/Q2" }, - "linkedId": { + "toItem": { "type": "uri", "value": "sh00000001" } diff --git a/tests/fixtures/wikidata/parents_instance_of_query.json b/tests/fixtures/wikidata/parents_instance_of_query.json index 8521f76e68..e526936a61 100644 --- a/tests/fixtures/wikidata/parents_instance_of_query.json +++ b/tests/fixtures/wikidata/parents_instance_of_query.json @@ -1,4 +1,4 @@ { "format": "json", - "query": "SELECT DISTINCT ?child ?item WHERE { VALUES ?child { wd:Q1 wd:Q2 } ?child wdt:P31 ?item. }" + "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q2 } ?fromItem wdt:P31 ?toItem. 
}" } diff --git a/tests/fixtures/wikidata/parents_instance_of_response.json b/tests/fixtures/wikidata/parents_instance_of_response.json index caeb9577f0..795c9f8d28 100644 --- a/tests/fixtures/wikidata/parents_instance_of_response.json +++ b/tests/fixtures/wikidata/parents_instance_of_response.json @@ -2,21 +2,21 @@ "results": { "bindings": [ { - "item": { + "toItem": { "type": "uri", "value": "http://www.wikidata.org/entity/Q1" }, - "child": { + "fromItem": { "type": "uri", "value": "http://www.wikidata.org/entity/Q2" } }, { - "item": { + "toItem": { "type": "uri", "value": "http://www.wikidata.org/entity/Q3" }, - "child": { + "fromItem": { "type": "uri", "value": "http://www.wikidata.org/entity/Q2" } diff --git a/tests/fixtures/wikidata/parents_subclass_of_query.json b/tests/fixtures/wikidata/parents_subclass_of_query.json index 318027fef7..6161b1366b 100644 --- a/tests/fixtures/wikidata/parents_subclass_of_query.json +++ b/tests/fixtures/wikidata/parents_subclass_of_query.json @@ -1,4 +1,4 @@ { "format": "json", - "query": "SELECT DISTINCT ?child ?item WHERE { VALUES ?child { wd:Q1 wd:Q2 } ?child wdt:P279 ?item. }" + "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q2 } ?fromItem wdt:P279 ?toItem. }" } diff --git a/tests/fixtures/wikidata/parents_subclass_of_response.json b/tests/fixtures/wikidata/parents_subclass_of_response.json index f524cc236d..ead29a2d8a 100644 --- a/tests/fixtures/wikidata/parents_subclass_of_response.json +++ b/tests/fixtures/wikidata/parents_subclass_of_response.json @@ -2,11 +2,11 @@ "results": { "bindings": [ { - "item": { + "toItem": { "type": "uri", "value": "http://www.wikidata.org/entity/Q4" }, - "child": { + "fromItem": { "type": "uri", "value": "http://www.wikidata.org/entity/Q1" } diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index 4fdfb82da0..26ac75f5bb 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -5,8 +5,8 @@ from test_utils import load_fixture from config import WIKIDATA_SPARQL_URL -from sources.wikidata.linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from utils.ontology_id_checker import is_id_classified_as_node_type, is_id_in_ontology def _add_mock_wikidata_requests(node_type: Literal["edges", "nodes"]) -> None: @@ -30,7 +30,7 @@ def _add_mock_wikidata_requests(node_type: Literal["edges", "nodes"]) -> None: def _add_mock_loc_transformer_outputs() -> None: """ - Add mock LoC transformer output files to S3 so that the LinkedOntologyIdTypeChecker class can extract ids from them. + Add mock LoC transformer output files to S3 so that we can extract ids from them. 
""" for node_type in ["concepts", "locations", "names"]: MockSmartOpen.mock_s3_file( @@ -54,9 +54,9 @@ def test_wikidata_concepts_source_edges() -> None: has_parent_edges = set() for edge in stream_result: if edge["type"] == "SAME_AS": - same_as_edges.add((edge["wikidata_id"], edge["linked_id"])) + same_as_edges.add((edge["from_id"], edge["to_id"])) elif edge["type"] == "HAS_PARENT": - has_parent_edges.add((edge["child_id"], edge["parent_id"])) + has_parent_edges.add((edge["from_id"], edge["to_id"])) else: raise ValueError(f"Unknown edge type {edge['type']}") @@ -90,11 +90,10 @@ def test_wikidata_concepts_source_nodes() -> None: def test_wikidata_linked_ontology_id_checker() -> None: _add_mock_loc_transformer_outputs() - id_checker = LinkedOntologyIdTypeChecker("locations", "loc") - assert id_checker.id_is_valid("sh00000001") - assert not id_checker.id_is_valid("sh00000001000") + assert is_id_in_ontology("sh00000001", "loc") + assert not is_id_in_ontology("sh00000001000", "loc") - assert not id_checker.id_included_in_selected_type("sh00000001") - assert not id_checker.id_included_in_selected_type("tgrefwdw") - assert id_checker.id_included_in_selected_type("sh00000015") + assert not is_id_classified_as_node_type("sh00000001", "loc", "locations") + assert not is_id_classified_as_node_type("tgrefwdw", "loc", "locations") + assert is_id_classified_as_node_type("sh00000015", "loc", "locations") diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index 9d69b3d068..f45ba2368f 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -48,6 +48,16 @@ def test_wikidata_concepts_edges_transformer() -> None: assert len(list(edges)) == 7 assert edges[0] == SourceConceptSameAs( + from_type="SourceConcept", + to_type="SourceConcept", + from_id="Q1", + to_id="sh00000001", + relationship="SAME_AS", + directed=False, + attributes={"source": "wikidata"}, + ) + + assert edges[1] == SourceConceptSameAs( from_type="SourceConcept", to_type="SourceConcept", from_id="sh00000001", From f3abb8f193e721da766c550e65fdcbf604b6d50b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 6 Feb 2025 14:27:44 +0000 Subject: [PATCH 260/310] Expand Wikidata unit testing to cover names --- .../wikidata/linked_ontology_source.py | 4 +- .../all_ids_query.json | 0 .../all_ids_response.json | 12 ++++++ .../concepts}/items_query.json | 0 .../concepts}/items_response.json | 0 .../has_field_of_work_query.json} | 2 +- .../has_field_of_work_response.json | 16 ++++++++ .../wikidata_linked_loc/linked_ids_query.json | 4 ++ .../linked_ids_response.json | 25 ++++++++++++ .../parents_instance_of_query.json | 2 +- .../parents_instance_of_response.json | 0 .../parents_subclass_of_query.json | 2 +- .../parents_subclass_of_response.json | 0 .../raw_location.json | 0 .../raw_name.json | 0 .../transformer_output_concepts_nodes.csv | 3 ++ .../transformer_output_locations_nodes.csv | 1 + .../transformer_output_names_nodes.csv | 3 ++ .../sources/test_wikidata_concepts_source.py | 40 ++++++++++++------- tests/sources/test_wikidata_names_source.py | 35 ++++++++++++++++ .../test_wikidata_concepts_transformer.py | 16 ++++---- 21 files changed, 140 insertions(+), 25 deletions(-) rename tests/fixtures/{wikidata => wikidata_linked_loc}/all_ids_query.json (100%) rename tests/fixtures/{wikidata => wikidata_linked_loc}/all_ids_response.json (62%) rename tests/fixtures/{wikidata 
=> wikidata_linked_loc/concepts}/items_query.json (100%) rename tests/fixtures/{wikidata => wikidata_linked_loc/concepts}/items_response.json (100%) rename tests/fixtures/{wikidata/linked_ids_query.json => wikidata_linked_loc/has_field_of_work_query.json} (55%) create mode 100644 tests/fixtures/wikidata_linked_loc/has_field_of_work_response.json create mode 100644 tests/fixtures/wikidata_linked_loc/linked_ids_query.json rename tests/fixtures/{wikidata => wikidata_linked_loc}/linked_ids_response.json (52%) rename tests/fixtures/{wikidata => wikidata_linked_loc}/parents_instance_of_query.json (56%) rename tests/fixtures/{wikidata => wikidata_linked_loc}/parents_instance_of_response.json (100%) rename tests/fixtures/{wikidata => wikidata_linked_loc}/parents_subclass_of_query.json (55%) rename tests/fixtures/{wikidata => wikidata_linked_loc}/parents_subclass_of_response.json (100%) rename tests/fixtures/{wikidata => wikidata_linked_loc}/raw_location.json (100%) rename tests/fixtures/{wikidata => wikidata_linked_loc}/raw_name.json (100%) create mode 100644 tests/fixtures/wikidata_linked_loc/transformer_output_concepts_nodes.csv create mode 100644 tests/fixtures/wikidata_linked_loc/transformer_output_locations_nodes.csv create mode 100644 tests/fixtures/wikidata_linked_loc/transformer_output_names_nodes.csv create mode 100644 tests/sources/test_wikidata_names_source.py diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 263d644c02..4da71536f0 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -200,7 +200,9 @@ def _stream_raw_edges(self) -> Generator[dict]: print("Streaming HAS_FIELD_OF_WORK edges...") for edge in self._stream_all_edges_by_type("field_of_work"): # Only include an edge if its `to_id` has a corresponding concept node in the graph - if is_id_in_ontology(edge["to_id"], "wikidata"): + if edge["from_id"] in streamed_wikidata_ids and is_id_in_ontology( + edge["to_id"], "wikidata" + ): yield {**edge, "type": "HAS_FIELD_OF_WORK"} def _stream_raw_nodes(self) -> Generator[dict]: diff --git a/tests/fixtures/wikidata/all_ids_query.json b/tests/fixtures/wikidata_linked_loc/all_ids_query.json similarity index 100% rename from tests/fixtures/wikidata/all_ids_query.json rename to tests/fixtures/wikidata_linked_loc/all_ids_query.json diff --git a/tests/fixtures/wikidata/all_ids_response.json b/tests/fixtures/wikidata_linked_loc/all_ids_response.json similarity index 62% rename from tests/fixtures/wikidata/all_ids_response.json rename to tests/fixtures/wikidata_linked_loc/all_ids_response.json index dbe318698a..e79f9a11a7 100644 --- a/tests/fixtures/wikidata/all_ids_response.json +++ b/tests/fixtures/wikidata_linked_loc/all_ids_response.json @@ -18,6 +18,18 @@ "type": "uri", "value": "http://www.wikidata.org/entity/Q2" } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q100" + } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q101" + } } ] } diff --git a/tests/fixtures/wikidata/items_query.json b/tests/fixtures/wikidata_linked_loc/concepts/items_query.json similarity index 100% rename from tests/fixtures/wikidata/items_query.json rename to tests/fixtures/wikidata_linked_loc/concepts/items_query.json diff --git a/tests/fixtures/wikidata/items_response.json b/tests/fixtures/wikidata_linked_loc/concepts/items_response.json similarity index 100% rename from tests/fixtures/wikidata/items_response.json rename to 
tests/fixtures/wikidata_linked_loc/concepts/items_response.json diff --git a/tests/fixtures/wikidata/linked_ids_query.json b/tests/fixtures/wikidata_linked_loc/has_field_of_work_query.json similarity index 55% rename from tests/fixtures/wikidata/linked_ids_query.json rename to tests/fixtures/wikidata_linked_loc/has_field_of_work_query.json index 5ead8e25f9..2145fa2336 100644 --- a/tests/fixtures/wikidata/linked_ids_query.json +++ b/tests/fixtures/wikidata_linked_loc/has_field_of_work_query.json @@ -1,4 +1,4 @@ { "format": "json", - "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q2 } ?fromItem p:P244/ps:P244 ?toItem. }" + "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q100 wd:Q101 wd:Q2 } ?fromItem wdt:P101 ?toItem. }" } diff --git a/tests/fixtures/wikidata_linked_loc/has_field_of_work_response.json b/tests/fixtures/wikidata_linked_loc/has_field_of_work_response.json new file mode 100644 index 0000000000..1d22e3d0f9 --- /dev/null +++ b/tests/fixtures/wikidata_linked_loc/has_field_of_work_response.json @@ -0,0 +1,16 @@ +{ + "results": { + "bindings": [ + { + "fromItem": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q100" + }, + "toItem": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q1" + } + } + ] + } +} diff --git a/tests/fixtures/wikidata_linked_loc/linked_ids_query.json b/tests/fixtures/wikidata_linked_loc/linked_ids_query.json new file mode 100644 index 0000000000..b10ef6e29e --- /dev/null +++ b/tests/fixtures/wikidata_linked_loc/linked_ids_query.json @@ -0,0 +1,4 @@ +{ + "format": "json", + "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q100 wd:Q101 wd:Q2 } ?fromItem p:P244/ps:P244 ?toItem. }" +} diff --git a/tests/fixtures/wikidata/linked_ids_response.json b/tests/fixtures/wikidata_linked_loc/linked_ids_response.json similarity index 52% rename from tests/fixtures/wikidata/linked_ids_response.json rename to tests/fixtures/wikidata_linked_loc/linked_ids_response.json index b24a88f36f..e735dc9272 100644 --- a/tests/fixtures/wikidata/linked_ids_response.json +++ b/tests/fixtures/wikidata_linked_loc/linked_ids_response.json @@ -20,7 +20,32 @@ "type": "uri", "value": "sh00000001" } + }, + { + "fromItem": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q100" + }, + "toItem": { + "type": "uri", + "value": "n00000001" + } + }, + { + "fromItem": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q101" + }, + "toItem": { + "type": "uri", + "value": "n00000021" + } } ] } } + + + + + diff --git a/tests/fixtures/wikidata/parents_instance_of_query.json b/tests/fixtures/wikidata_linked_loc/parents_instance_of_query.json similarity index 56% rename from tests/fixtures/wikidata/parents_instance_of_query.json rename to tests/fixtures/wikidata_linked_loc/parents_instance_of_query.json index e526936a61..f71e695d94 100644 --- a/tests/fixtures/wikidata/parents_instance_of_query.json +++ b/tests/fixtures/wikidata_linked_loc/parents_instance_of_query.json @@ -1,4 +1,4 @@ { "format": "json", - "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q2 } ?fromItem wdt:P31 ?toItem. }" + "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q100 wd:Q101 wd:Q2 } ?fromItem wdt:P31 ?toItem. 
}" } diff --git a/tests/fixtures/wikidata/parents_instance_of_response.json b/tests/fixtures/wikidata_linked_loc/parents_instance_of_response.json similarity index 100% rename from tests/fixtures/wikidata/parents_instance_of_response.json rename to tests/fixtures/wikidata_linked_loc/parents_instance_of_response.json diff --git a/tests/fixtures/wikidata/parents_subclass_of_query.json b/tests/fixtures/wikidata_linked_loc/parents_subclass_of_query.json similarity index 55% rename from tests/fixtures/wikidata/parents_subclass_of_query.json rename to tests/fixtures/wikidata_linked_loc/parents_subclass_of_query.json index 6161b1366b..58e9543e8a 100644 --- a/tests/fixtures/wikidata/parents_subclass_of_query.json +++ b/tests/fixtures/wikidata_linked_loc/parents_subclass_of_query.json @@ -1,4 +1,4 @@ { "format": "json", - "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q2 } ?fromItem wdt:P279 ?toItem. }" + "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q100 wd:Q101 wd:Q2 } ?fromItem wdt:P279 ?toItem. }" } diff --git a/tests/fixtures/wikidata/parents_subclass_of_response.json b/tests/fixtures/wikidata_linked_loc/parents_subclass_of_response.json similarity index 100% rename from tests/fixtures/wikidata/parents_subclass_of_response.json rename to tests/fixtures/wikidata_linked_loc/parents_subclass_of_response.json diff --git a/tests/fixtures/wikidata/raw_location.json b/tests/fixtures/wikidata_linked_loc/raw_location.json similarity index 100% rename from tests/fixtures/wikidata/raw_location.json rename to tests/fixtures/wikidata_linked_loc/raw_location.json diff --git a/tests/fixtures/wikidata/raw_name.json b/tests/fixtures/wikidata_linked_loc/raw_name.json similarity index 100% rename from tests/fixtures/wikidata/raw_name.json rename to tests/fixtures/wikidata_linked_loc/raw_name.json diff --git a/tests/fixtures/wikidata_linked_loc/transformer_output_concepts_nodes.csv b/tests/fixtures/wikidata_linked_loc/transformer_output_concepts_nodes.csv new file mode 100644 index 0000000000..22d4c903c5 --- /dev/null +++ b/tests/fixtures/wikidata_linked_loc/transformer_output_concepts_nodes.csv @@ -0,0 +1,3 @@ +:ID,:LABEL,id,label,source,alternative_ids,alternative_labels,description +Q1,SourceConcept,Q1,ActionScript (Computer program language),wikidata,,,null +Q2,SourceConcept,Q2,Tacos,wikidata,,,null diff --git a/tests/fixtures/wikidata_linked_loc/transformer_output_locations_nodes.csv b/tests/fixtures/wikidata_linked_loc/transformer_output_locations_nodes.csv new file mode 100644 index 0000000000..a386832ddf --- /dev/null +++ b/tests/fixtures/wikidata_linked_loc/transformer_output_locations_nodes.csv @@ -0,0 +1 @@ +:ID,:LABEL,id,label,source,alternative_ids,alternative_labels,description,latitude,longitude \ No newline at end of file diff --git a/tests/fixtures/wikidata_linked_loc/transformer_output_names_nodes.csv b/tests/fixtures/wikidata_linked_loc/transformer_output_names_nodes.csv new file mode 100644 index 0000000000..033e07e77d --- /dev/null +++ b/tests/fixtures/wikidata_linked_loc/transformer_output_names_nodes.csv @@ -0,0 +1,3 @@ +:ID,:LABEL,id:String,label:String,source:String,alternative_ids:String,alternative_labels:String,description:String,date_of_birth:DateTime,date_of_death:DateTime,place_of_birth:String +Q100,SourceName,Q100,"McQuerry, Maureen, 1955-",wikidata,,"MacQuerry, Maureen, 1955-||Makkvyri, Morin, 1955-||McQuerry, Maureen Doyle, 1955-",,,, +Q101,SourceName,Q101,"Widmer, Thomas, 1962-",wikidata,,,,,, diff --git 
a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index 26ac75f5bb..421e5b2522 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -9,7 +9,9 @@ from utils.ontology_id_checker import is_id_classified_as_node_type, is_id_in_ontology -def _add_mock_wikidata_requests(node_type: Literal["edges", "nodes"]) -> None: +def _add_mock_wikidata_requests( + entity_type: Literal["edges", "nodes"], node_type: Literal["concepts", "names"] +) -> None: """Add all the required mock Wikidata requests/responses based on whether we are streaming nodes or edges""" query_types = [ "all_ids", @@ -17,31 +19,41 @@ def _add_mock_wikidata_requests(node_type: Literal["edges", "nodes"]) -> None: "parents_instance_of", "parents_subclass_of", ] - if node_type == "nodes": - query_types.append("items") + if entity_type == "nodes": + query_types.append(f"{node_type}/items") + if node_type == "names": + query_types.append("has_field_of_work") for query_type in query_types: - params = json.loads(load_fixture(f"wikidata/{query_type}_query.json")) - response = json.loads(load_fixture(f"wikidata/{query_type}_response.json")) + params = json.loads( + load_fixture(f"wikidata_linked_loc/{query_type}_query.json") + ) + response = json.loads( + load_fixture(f"wikidata_linked_loc/{query_type}_response.json") + ) MockRequest.mock_response( method="GET", url=WIKIDATA_SPARQL_URL, params=params, json_data=response ) -def _add_mock_loc_transformer_outputs() -> None: +def _add_mock_transformer_outputs( + ontology: Literal["loc", "wikidata_linked_loc"] +) -> None: """ - Add mock LoC transformer output files to S3 so that we can extract ids from them. + Add mock transformer output files to S3 so that we can extract ids from them. 
""" for node_type in ["concepts", "locations", "names"]: MockSmartOpen.mock_s3_file( - f"s3://bulk_load_test_bucket/loc_{node_type}__nodes.csv", - load_fixture(f"loc/transformer_output_{node_type}_nodes.csv").decode(), + f"s3://bulk_load_test_bucket/{ontology}_{node_type}__nodes.csv", + load_fixture( + f"{ontology}/transformer_output_{node_type}_nodes.csv" + ).decode(), ) def test_wikidata_concepts_source_edges() -> None: - _add_mock_loc_transformer_outputs() - _add_mock_wikidata_requests("edges") + _add_mock_transformer_outputs("loc") + _add_mock_wikidata_requests("edges", "concepts") mesh_concepts_source = WikidataLinkedOntologySource( node_type="concepts", linked_ontology="loc", entity_type="edges" @@ -71,8 +83,8 @@ def test_wikidata_concepts_source_edges() -> None: def test_wikidata_concepts_source_nodes() -> None: - _add_mock_loc_transformer_outputs() - _add_mock_wikidata_requests("nodes") + _add_mock_transformer_outputs("loc") + _add_mock_wikidata_requests("nodes", "concepts") mesh_concepts_source = WikidataLinkedOntologySource( node_type="concepts", linked_ontology="loc", entity_type="nodes" @@ -89,7 +101,7 @@ def test_wikidata_concepts_source_nodes() -> None: def test_wikidata_linked_ontology_id_checker() -> None: - _add_mock_loc_transformer_outputs() + _add_mock_transformer_outputs("loc") assert is_id_in_ontology("sh00000001", "loc") assert not is_id_in_ontology("sh00000001000", "loc") diff --git a/tests/sources/test_wikidata_names_source.py b/tests/sources/test_wikidata_names_source.py new file mode 100644 index 0000000000..d3437e6fd4 --- /dev/null +++ b/tests/sources/test_wikidata_names_source.py @@ -0,0 +1,35 @@ +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from test_wikidata_concepts_source import ( + _add_mock_wikidata_requests, + _add_mock_transformer_outputs, +) + + +def test_wikidata_names_source_edges() -> None: + _add_mock_transformer_outputs("wikidata_linked_loc") + _add_mock_transformer_outputs("loc") + _add_mock_wikidata_requests("edges", "names") + + mesh_concepts_source = WikidataLinkedOntologySource( + node_type="names", linked_ontology="loc", entity_type="edges" + ) + stream_result = list(mesh_concepts_source.stream_raw()) + + assert len(stream_result) == 3 + + same_as_edges = set() + has_field_of_work_edges = set() + for edge in stream_result: + if edge["type"] == "SAME_AS": + same_as_edges.add((edge["from_id"], edge["to_id"])) + elif edge["type"] == "HAS_FIELD_OF_WORK": + has_field_of_work_edges.add((edge["from_id"], edge["to_id"])) + else: + raise ValueError(f"Unknown edge type {edge['type']}") + + assert len(same_as_edges) == 2 + assert ("Q100", "n00000001") in same_as_edges + assert ("Q101", "n00000021") in same_as_edges + + assert len(has_field_of_work_edges) == 1 + assert ("Q100", "Q1") in has_field_of_work_edges diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index f45ba2368f..a3e7dc5cc1 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -4,7 +4,7 @@ import pytest from test_utils import load_fixture from test_wikidata_concepts_source import ( - _add_mock_loc_transformer_outputs, + _add_mock_transformer_outputs, _add_mock_wikidata_requests, ) @@ -15,8 +15,8 @@ def test_wikidata_concepts_nodes_transformer() -> None: - _add_mock_loc_transformer_outputs() - _add_mock_wikidata_requests("nodes") + _add_mock_transformer_outputs("loc") + 
_add_mock_wikidata_requests("nodes", "concepts") transformer = WikidataConceptsTransformer( entity_type="nodes", linked_ontology="loc" @@ -37,8 +37,8 @@ def test_wikidata_concepts_nodes_transformer() -> None: def test_wikidata_concepts_edges_transformer() -> None: - _add_mock_loc_transformer_outputs() - _add_mock_wikidata_requests("edges") + _add_mock_transformer_outputs("loc") + _add_mock_wikidata_requests("edges", "concepts") transformer = WikidataConceptsTransformer( entity_type="edges", linked_ontology="loc" @@ -69,7 +69,9 @@ def test_wikidata_concepts_edges_transformer() -> None: def test_wikidata_raw_location() -> None: - raw_location_input = json.loads(load_fixture("wikidata/raw_location.json")) + raw_location_input = json.loads( + load_fixture("wikidata_linked_loc/raw_location.json") + ) raw_location = RawWikidataLocation(raw_location_input) assert raw_location.coordinates["latitude"] is not None @@ -79,7 +81,7 @@ def test_wikidata_raw_location() -> None: def test_wikidata_raw_name() -> None: - raw_name_input = json.loads(load_fixture("wikidata/raw_name.json")) + raw_name_input = json.loads(load_fixture("wikidata_linked_loc/raw_name.json")) raw_name = RawWikidataName(raw_name_input) assert raw_name.date_of_birth == "1949-01-28T00:00:00Z" From ed40b9d4f67eea9e5c13c6a03766db029bc54a9a Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Thu, 6 Feb 2025 14:28:42 +0000 Subject: [PATCH 261/310] Apply auto-formatting rules --- tests/sources/test_wikidata_concepts_source.py | 2 +- tests/sources/test_wikidata_names_source.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index 421e5b2522..bb3ea6a96d 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -37,7 +37,7 @@ def _add_mock_wikidata_requests( def _add_mock_transformer_outputs( - ontology: Literal["loc", "wikidata_linked_loc"] + ontology: Literal["loc", "wikidata_linked_loc"], ) -> None: """ Add mock transformer output files to S3 so that we can extract ids from them. 
diff --git a/tests/sources/test_wikidata_names_source.py b/tests/sources/test_wikidata_names_source.py index d3437e6fd4..54e49f8ed5 100644 --- a/tests/sources/test_wikidata_names_source.py +++ b/tests/sources/test_wikidata_names_source.py @@ -1,9 +1,10 @@ -from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource from test_wikidata_concepts_source import ( - _add_mock_wikidata_requests, _add_mock_transformer_outputs, + _add_mock_wikidata_requests, ) +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource + def test_wikidata_names_source_edges() -> None: _add_mock_transformer_outputs("wikidata_linked_loc") From e8396b3e2d2455801b6007914945cd7952f63be9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 6 Feb 2025 14:27:44 +0000 Subject: [PATCH 262/310] Expand Wikidata unit testing to cover names --- .../wikidata/linked_ontology_source.py | 4 +- .../all_ids_query.json | 0 .../all_ids_response.json | 12 ++++ .../concepts}/items_query.json | 0 .../concepts}/items_response.json | 0 .../has_field_of_work_query.json} | 2 +- .../has_field_of_work_response.json | 16 +++++ .../wikidata_linked_loc/linked_ids_query.json | 4 ++ .../linked_ids_response.json | 25 ++++++++ .../names/items_query.json | 4 ++ .../names/items_response.json | 63 +++++++++++++++++++ .../parents_instance_of_query.json | 2 +- .../parents_instance_of_response.json | 0 .../parents_subclass_of_query.json | 2 +- .../parents_subclass_of_response.json | 0 .../raw_location.json | 0 .../raw_name.json | 0 .../transformer_output_concepts_nodes.csv | 3 + .../transformer_output_locations_nodes.csv | 1 + .../transformer_output_names_nodes.csv | 3 + .../sources/test_wikidata_concepts_source.py | 40 +++++++----- tests/sources/test_wikidata_names_source.py | 57 +++++++++++++++++ .../test_wikidata_concepts_transformer.py | 16 ++--- 23 files changed, 229 insertions(+), 25 deletions(-) rename tests/fixtures/{wikidata => wikidata_linked_loc}/all_ids_query.json (100%) rename tests/fixtures/{wikidata => wikidata_linked_loc}/all_ids_response.json (62%) rename tests/fixtures/{wikidata => wikidata_linked_loc/concepts}/items_query.json (100%) rename tests/fixtures/{wikidata => wikidata_linked_loc/concepts}/items_response.json (100%) rename tests/fixtures/{wikidata/linked_ids_query.json => wikidata_linked_loc/has_field_of_work_query.json} (55%) create mode 100644 tests/fixtures/wikidata_linked_loc/has_field_of_work_response.json create mode 100644 tests/fixtures/wikidata_linked_loc/linked_ids_query.json rename tests/fixtures/{wikidata => wikidata_linked_loc}/linked_ids_response.json (52%) create mode 100644 tests/fixtures/wikidata_linked_loc/names/items_query.json create mode 100644 tests/fixtures/wikidata_linked_loc/names/items_response.json rename tests/fixtures/{wikidata => wikidata_linked_loc}/parents_instance_of_query.json (56%) rename tests/fixtures/{wikidata => wikidata_linked_loc}/parents_instance_of_response.json (100%) rename tests/fixtures/{wikidata => wikidata_linked_loc}/parents_subclass_of_query.json (55%) rename tests/fixtures/{wikidata => wikidata_linked_loc}/parents_subclass_of_response.json (100%) rename tests/fixtures/{wikidata => wikidata_linked_loc}/raw_location.json (100%) rename tests/fixtures/{wikidata => wikidata_linked_loc}/raw_name.json (100%) create mode 100644 tests/fixtures/wikidata_linked_loc/transformer_output_concepts_nodes.csv create mode 100644 tests/fixtures/wikidata_linked_loc/transformer_output_locations_nodes.csv create mode 100644 
tests/fixtures/wikidata_linked_loc/transformer_output_names_nodes.csv create mode 100644 tests/sources/test_wikidata_names_source.py diff --git a/src/sources/wikidata/linked_ontology_source.py b/src/sources/wikidata/linked_ontology_source.py index 263d644c02..4da71536f0 100644 --- a/src/sources/wikidata/linked_ontology_source.py +++ b/src/sources/wikidata/linked_ontology_source.py @@ -200,7 +200,9 @@ def _stream_raw_edges(self) -> Generator[dict]: print("Streaming HAS_FIELD_OF_WORK edges...") for edge in self._stream_all_edges_by_type("field_of_work"): # Only include an edge if its `to_id` has a corresponding concept node in the graph - if is_id_in_ontology(edge["to_id"], "wikidata"): + if edge["from_id"] in streamed_wikidata_ids and is_id_in_ontology( + edge["to_id"], "wikidata" + ): yield {**edge, "type": "HAS_FIELD_OF_WORK"} def _stream_raw_nodes(self) -> Generator[dict]: diff --git a/tests/fixtures/wikidata/all_ids_query.json b/tests/fixtures/wikidata_linked_loc/all_ids_query.json similarity index 100% rename from tests/fixtures/wikidata/all_ids_query.json rename to tests/fixtures/wikidata_linked_loc/all_ids_query.json diff --git a/tests/fixtures/wikidata/all_ids_response.json b/tests/fixtures/wikidata_linked_loc/all_ids_response.json similarity index 62% rename from tests/fixtures/wikidata/all_ids_response.json rename to tests/fixtures/wikidata_linked_loc/all_ids_response.json index dbe318698a..e79f9a11a7 100644 --- a/tests/fixtures/wikidata/all_ids_response.json +++ b/tests/fixtures/wikidata_linked_loc/all_ids_response.json @@ -18,6 +18,18 @@ "type": "uri", "value": "http://www.wikidata.org/entity/Q2" } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q100" + } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q101" + } } ] } diff --git a/tests/fixtures/wikidata/items_query.json b/tests/fixtures/wikidata_linked_loc/concepts/items_query.json similarity index 100% rename from tests/fixtures/wikidata/items_query.json rename to tests/fixtures/wikidata_linked_loc/concepts/items_query.json diff --git a/tests/fixtures/wikidata/items_response.json b/tests/fixtures/wikidata_linked_loc/concepts/items_response.json similarity index 100% rename from tests/fixtures/wikidata/items_response.json rename to tests/fixtures/wikidata_linked_loc/concepts/items_response.json diff --git a/tests/fixtures/wikidata/linked_ids_query.json b/tests/fixtures/wikidata_linked_loc/has_field_of_work_query.json similarity index 55% rename from tests/fixtures/wikidata/linked_ids_query.json rename to tests/fixtures/wikidata_linked_loc/has_field_of_work_query.json index 5ead8e25f9..2145fa2336 100644 --- a/tests/fixtures/wikidata/linked_ids_query.json +++ b/tests/fixtures/wikidata_linked_loc/has_field_of_work_query.json @@ -1,4 +1,4 @@ { "format": "json", - "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q2 } ?fromItem p:P244/ps:P244 ?toItem. }" + "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q100 wd:Q101 wd:Q2 } ?fromItem wdt:P101 ?toItem. 
}" } diff --git a/tests/fixtures/wikidata_linked_loc/has_field_of_work_response.json b/tests/fixtures/wikidata_linked_loc/has_field_of_work_response.json new file mode 100644 index 0000000000..1d22e3d0f9 --- /dev/null +++ b/tests/fixtures/wikidata_linked_loc/has_field_of_work_response.json @@ -0,0 +1,16 @@ +{ + "results": { + "bindings": [ + { + "fromItem": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q100" + }, + "toItem": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q1" + } + } + ] + } +} diff --git a/tests/fixtures/wikidata_linked_loc/linked_ids_query.json b/tests/fixtures/wikidata_linked_loc/linked_ids_query.json new file mode 100644 index 0000000000..b10ef6e29e --- /dev/null +++ b/tests/fixtures/wikidata_linked_loc/linked_ids_query.json @@ -0,0 +1,4 @@ +{ + "format": "json", + "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q100 wd:Q101 wd:Q2 } ?fromItem p:P244/ps:P244 ?toItem. }" +} diff --git a/tests/fixtures/wikidata/linked_ids_response.json b/tests/fixtures/wikidata_linked_loc/linked_ids_response.json similarity index 52% rename from tests/fixtures/wikidata/linked_ids_response.json rename to tests/fixtures/wikidata_linked_loc/linked_ids_response.json index b24a88f36f..e735dc9272 100644 --- a/tests/fixtures/wikidata/linked_ids_response.json +++ b/tests/fixtures/wikidata_linked_loc/linked_ids_response.json @@ -20,7 +20,32 @@ "type": "uri", "value": "sh00000001" } + }, + { + "fromItem": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q100" + }, + "toItem": { + "type": "uri", + "value": "n00000001" + } + }, + { + "fromItem": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q101" + }, + "toItem": { + "type": "uri", + "value": "n00000021" + } } ] } } + + + + + diff --git a/tests/fixtures/wikidata_linked_loc/names/items_query.json b/tests/fixtures/wikidata_linked_loc/names/items_query.json new file mode 100644 index 0000000000..5df880c459 --- /dev/null +++ b/tests/fixtures/wikidata_linked_loc/names/items_query.json @@ -0,0 +1,4 @@ +{ + "format": "json", + "query": "SELECT DISTINCT ?item (SAMPLE(?itemLabel) as ?itemLabel) (SAMPLE(?itemDescription) as ?itemDescription) (SAMPLE(?itemAltLabel) as ?itemAltLabel) (SAMPLE(?dateOfBirth) as ?dateOfBirth) (SAMPLE(?dateOfDeath) as ?dateOfDeath) (SAMPLE(?placeOfBirthLabel) as ?placeOfBirthLabel) WHERE { VALUES ?item { wd:Q100 wd:Q101 } OPTIONAL { ?item wdt:P569 ?dateOfBirth. } OPTIONAL { ?item wdt:P570 ?dateOfDeath. } OPTIONAL { ?item wdt:P19 ?placeOfBirth. } OPTIONAL { SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\". ?item rdfs:label ?itemLabel. ?item schema:description ?itemDescription. ?item skos:altLabel ?itemAltLabel. ?placeOfBirth rdfs:label ?placeOfBirthLabel. 
} } } GROUP BY ?item" +} diff --git a/tests/fixtures/wikidata_linked_loc/names/items_response.json b/tests/fixtures/wikidata_linked_loc/names/items_response.json new file mode 100644 index 0000000000..b797091d27 --- /dev/null +++ b/tests/fixtures/wikidata_linked_loc/names/items_response.json @@ -0,0 +1,63 @@ +{ + "results": { + "bindings": [ + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q100" + }, + "itemLabel": { + "xml:lang": "en", + "type": "literal", + "value": "Emanuele Salce" + }, + "itemDescription": { + "xml:lang": "en", + "type": "literal", + "value": "Italian actor" + }, + "dateOfBirth": { + "datatype": "http://www.w3.org/2001/XMLSchema#dateTime", + "type": "literal", + "value": "1966-08-07T00:00:00Z" + }, + "placeOfBirthLabel": { + "xml:lang": "en", + "type": "literal", + "value": "London" + } + }, + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q101" + }, + "itemLabel": { + "xml:lang": "en", + "type": "literal", + "value": "Beata Obertyńska" + }, + "itemDescription": { + "xml:lang": "en", + "type": "literal", + "value": "Polish writer" + }, + "dateOfBirth": { + "datatype": "http://www.w3.org/2001/XMLSchema#dateTime", + "type": "literal", + "value": "1898-07-18T00:00:00Z" + }, + "placeOfBirthLabel": { + "xml:lang": "en", + "type": "literal", + "value": "Skole" + }, + "dateOfDeath": { + "datatype": "http://www.w3.org/2001/XMLSchema#dateTime", + "type": "literal", + "value": "1980-05-21T00:00:00Z" + } + } + ] + } +} diff --git a/tests/fixtures/wikidata/parents_instance_of_query.json b/tests/fixtures/wikidata_linked_loc/parents_instance_of_query.json similarity index 56% rename from tests/fixtures/wikidata/parents_instance_of_query.json rename to tests/fixtures/wikidata_linked_loc/parents_instance_of_query.json index e526936a61..f71e695d94 100644 --- a/tests/fixtures/wikidata/parents_instance_of_query.json +++ b/tests/fixtures/wikidata_linked_loc/parents_instance_of_query.json @@ -1,4 +1,4 @@ { "format": "json", - "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q2 } ?fromItem wdt:P31 ?toItem. }" + "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q100 wd:Q101 wd:Q2 } ?fromItem wdt:P31 ?toItem. }" } diff --git a/tests/fixtures/wikidata/parents_instance_of_response.json b/tests/fixtures/wikidata_linked_loc/parents_instance_of_response.json similarity index 100% rename from tests/fixtures/wikidata/parents_instance_of_response.json rename to tests/fixtures/wikidata_linked_loc/parents_instance_of_response.json diff --git a/tests/fixtures/wikidata/parents_subclass_of_query.json b/tests/fixtures/wikidata_linked_loc/parents_subclass_of_query.json similarity index 55% rename from tests/fixtures/wikidata/parents_subclass_of_query.json rename to tests/fixtures/wikidata_linked_loc/parents_subclass_of_query.json index 6161b1366b..58e9543e8a 100644 --- a/tests/fixtures/wikidata/parents_subclass_of_query.json +++ b/tests/fixtures/wikidata_linked_loc/parents_subclass_of_query.json @@ -1,4 +1,4 @@ { "format": "json", - "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q2 } ?fromItem wdt:P279 ?toItem. }" + "query": "SELECT DISTINCT ?fromItem ?toItem WHERE { VALUES ?fromItem { wd:Q1 wd:Q100 wd:Q101 wd:Q2 } ?fromItem wdt:P279 ?toItem. 
}" } diff --git a/tests/fixtures/wikidata/parents_subclass_of_response.json b/tests/fixtures/wikidata_linked_loc/parents_subclass_of_response.json similarity index 100% rename from tests/fixtures/wikidata/parents_subclass_of_response.json rename to tests/fixtures/wikidata_linked_loc/parents_subclass_of_response.json diff --git a/tests/fixtures/wikidata/raw_location.json b/tests/fixtures/wikidata_linked_loc/raw_location.json similarity index 100% rename from tests/fixtures/wikidata/raw_location.json rename to tests/fixtures/wikidata_linked_loc/raw_location.json diff --git a/tests/fixtures/wikidata/raw_name.json b/tests/fixtures/wikidata_linked_loc/raw_name.json similarity index 100% rename from tests/fixtures/wikidata/raw_name.json rename to tests/fixtures/wikidata_linked_loc/raw_name.json diff --git a/tests/fixtures/wikidata_linked_loc/transformer_output_concepts_nodes.csv b/tests/fixtures/wikidata_linked_loc/transformer_output_concepts_nodes.csv new file mode 100644 index 0000000000..22d4c903c5 --- /dev/null +++ b/tests/fixtures/wikidata_linked_loc/transformer_output_concepts_nodes.csv @@ -0,0 +1,3 @@ +:ID,:LABEL,id,label,source,alternative_ids,alternative_labels,description +Q1,SourceConcept,Q1,ActionScript (Computer program language),wikidata,,,null +Q2,SourceConcept,Q2,Tacos,wikidata,,,null diff --git a/tests/fixtures/wikidata_linked_loc/transformer_output_locations_nodes.csv b/tests/fixtures/wikidata_linked_loc/transformer_output_locations_nodes.csv new file mode 100644 index 0000000000..a386832ddf --- /dev/null +++ b/tests/fixtures/wikidata_linked_loc/transformer_output_locations_nodes.csv @@ -0,0 +1 @@ +:ID,:LABEL,id,label,source,alternative_ids,alternative_labels,description,latitude,longitude \ No newline at end of file diff --git a/tests/fixtures/wikidata_linked_loc/transformer_output_names_nodes.csv b/tests/fixtures/wikidata_linked_loc/transformer_output_names_nodes.csv new file mode 100644 index 0000000000..033e07e77d --- /dev/null +++ b/tests/fixtures/wikidata_linked_loc/transformer_output_names_nodes.csv @@ -0,0 +1,3 @@ +:ID,:LABEL,id:String,label:String,source:String,alternative_ids:String,alternative_labels:String,description:String,date_of_birth:DateTime,date_of_death:DateTime,place_of_birth:String +Q100,SourceName,Q100,"McQuerry, Maureen, 1955-",wikidata,,"MacQuerry, Maureen, 1955-||Makkvyri, Morin, 1955-||McQuerry, Maureen Doyle, 1955-",,,, +Q101,SourceName,Q101,"Widmer, Thomas, 1962-",wikidata,,,,,, diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index 26ac75f5bb..421e5b2522 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -9,7 +9,9 @@ from utils.ontology_id_checker import is_id_classified_as_node_type, is_id_in_ontology -def _add_mock_wikidata_requests(node_type: Literal["edges", "nodes"]) -> None: +def _add_mock_wikidata_requests( + entity_type: Literal["edges", "nodes"], node_type: Literal["concepts", "names"] +) -> None: """Add all the required mock Wikidata requests/responses based on whether we are streaming nodes or edges""" query_types = [ "all_ids", @@ -17,31 +19,41 @@ def _add_mock_wikidata_requests(node_type: Literal["edges", "nodes"]) -> None: "parents_instance_of", "parents_subclass_of", ] - if node_type == "nodes": - query_types.append("items") + if entity_type == "nodes": + query_types.append(f"{node_type}/items") + if node_type == "names": + query_types.append("has_field_of_work") for query_type in query_types: - params = 
json.loads(load_fixture(f"wikidata/{query_type}_query.json")) - response = json.loads(load_fixture(f"wikidata/{query_type}_response.json")) + params = json.loads( + load_fixture(f"wikidata_linked_loc/{query_type}_query.json") + ) + response = json.loads( + load_fixture(f"wikidata_linked_loc/{query_type}_response.json") + ) MockRequest.mock_response( method="GET", url=WIKIDATA_SPARQL_URL, params=params, json_data=response ) -def _add_mock_loc_transformer_outputs() -> None: +def _add_mock_transformer_outputs( + ontology: Literal["loc", "wikidata_linked_loc"] +) -> None: """ - Add mock LoC transformer output files to S3 so that we can extract ids from them. + Add mock transformer output files to S3 so that we can extract ids from them. """ for node_type in ["concepts", "locations", "names"]: MockSmartOpen.mock_s3_file( - f"s3://bulk_load_test_bucket/loc_{node_type}__nodes.csv", - load_fixture(f"loc/transformer_output_{node_type}_nodes.csv").decode(), + f"s3://bulk_load_test_bucket/{ontology}_{node_type}__nodes.csv", + load_fixture( + f"{ontology}/transformer_output_{node_type}_nodes.csv" + ).decode(), ) def test_wikidata_concepts_source_edges() -> None: - _add_mock_loc_transformer_outputs() - _add_mock_wikidata_requests("edges") + _add_mock_transformer_outputs("loc") + _add_mock_wikidata_requests("edges", "concepts") mesh_concepts_source = WikidataLinkedOntologySource( node_type="concepts", linked_ontology="loc", entity_type="edges" @@ -71,8 +83,8 @@ def test_wikidata_concepts_source_edges() -> None: def test_wikidata_concepts_source_nodes() -> None: - _add_mock_loc_transformer_outputs() - _add_mock_wikidata_requests("nodes") + _add_mock_transformer_outputs("loc") + _add_mock_wikidata_requests("nodes", "concepts") mesh_concepts_source = WikidataLinkedOntologySource( node_type="concepts", linked_ontology="loc", entity_type="nodes" @@ -89,7 +101,7 @@ def test_wikidata_concepts_source_nodes() -> None: def test_wikidata_linked_ontology_id_checker() -> None: - _add_mock_loc_transformer_outputs() + _add_mock_transformer_outputs("loc") assert is_id_in_ontology("sh00000001", "loc") assert not is_id_in_ontology("sh00000001000", "loc") diff --git a/tests/sources/test_wikidata_names_source.py b/tests/sources/test_wikidata_names_source.py new file mode 100644 index 0000000000..2600f27511 --- /dev/null +++ b/tests/sources/test_wikidata_names_source.py @@ -0,0 +1,57 @@ +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource +from test_wikidata_concepts_source import ( + _add_mock_wikidata_requests, + _add_mock_transformer_outputs, +) + + +def test_wikidata_names_source_edges() -> None: + _add_mock_transformer_outputs("wikidata_linked_loc") + _add_mock_transformer_outputs("loc") + _add_mock_wikidata_requests("edges", "names") + + mesh_concepts_source = WikidataLinkedOntologySource( + node_type="names", linked_ontology="loc", entity_type="edges" + ) + stream_result = list(mesh_concepts_source.stream_raw()) + + assert len(stream_result) == 3 + + same_as_edges = set() + has_field_of_work_edges = set() + for edge in stream_result: + if edge["type"] == "SAME_AS": + same_as_edges.add((edge["from_id"], edge["to_id"])) + elif edge["type"] == "HAS_FIELD_OF_WORK": + has_field_of_work_edges.add((edge["from_id"], edge["to_id"])) + else: + raise ValueError(f"Unknown edge type {edge['type']}") + + assert len(same_as_edges) == 2 + assert ("Q100", "n00000001") in same_as_edges + assert ("Q101", "n00000021") in same_as_edges + + assert len(has_field_of_work_edges) == 1 + assert ("Q100", "Q1") in 
has_field_of_work_edges + + +def test_wikidata_names_source_nodes() -> None: + _add_mock_transformer_outputs("loc") + _add_mock_wikidata_requests("nodes", "names") + + mesh_concepts_source = WikidataLinkedOntologySource( + node_type="names", linked_ontology="loc", entity_type="nodes" + ) + stream_result = list(mesh_concepts_source.stream_raw()) + + assert len(stream_result) == 2 + + for raw_node in stream_result: + assert "item" in raw_node + assert "itemLabel" in raw_node + assert "itemDescription" in raw_node + assert "placeOfBirthLabel" in raw_node + assert "dateOfBirth" in raw_node + + assert "dateOfDeath" in stream_result[1] + assert "dateOfDeath" not in stream_result[0] diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index f45ba2368f..a3e7dc5cc1 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -4,7 +4,7 @@ import pytest from test_utils import load_fixture from test_wikidata_concepts_source import ( - _add_mock_loc_transformer_outputs, + _add_mock_transformer_outputs, _add_mock_wikidata_requests, ) @@ -15,8 +15,8 @@ def test_wikidata_concepts_nodes_transformer() -> None: - _add_mock_loc_transformer_outputs() - _add_mock_wikidata_requests("nodes") + _add_mock_transformer_outputs("loc") + _add_mock_wikidata_requests("nodes", "concepts") transformer = WikidataConceptsTransformer( entity_type="nodes", linked_ontology="loc" @@ -37,8 +37,8 @@ def test_wikidata_concepts_nodes_transformer() -> None: def test_wikidata_concepts_edges_transformer() -> None: - _add_mock_loc_transformer_outputs() - _add_mock_wikidata_requests("edges") + _add_mock_transformer_outputs("loc") + _add_mock_wikidata_requests("edges", "concepts") transformer = WikidataConceptsTransformer( entity_type="edges", linked_ontology="loc" @@ -69,7 +69,9 @@ def test_wikidata_concepts_edges_transformer() -> None: def test_wikidata_raw_location() -> None: - raw_location_input = json.loads(load_fixture("wikidata/raw_location.json")) + raw_location_input = json.loads( + load_fixture("wikidata_linked_loc/raw_location.json") + ) raw_location = RawWikidataLocation(raw_location_input) assert raw_location.coordinates["latitude"] is not None @@ -79,7 +81,7 @@ def test_wikidata_raw_location() -> None: def test_wikidata_raw_name() -> None: - raw_name_input = json.loads(load_fixture("wikidata/raw_name.json")) + raw_name_input = json.loads(load_fixture("wikidata_linked_loc/raw_name.json")) raw_name = RawWikidataName(raw_name_input) assert raw_name.date_of_birth == "1949-01-28T00:00:00Z" From 6eee1ebae62a190a8a833f31a66446c8cadb6d9b Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Thu, 6 Feb 2025 14:57:14 +0000 Subject: [PATCH 263/310] Apply auto-formatting rules --- tests/sources/test_wikidata_concepts_source.py | 2 +- tests/sources/test_wikidata_names_source.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index 421e5b2522..bb3ea6a96d 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -37,7 +37,7 @@ def _add_mock_wikidata_requests( def _add_mock_transformer_outputs( - ontology: Literal["loc", "wikidata_linked_loc"] + ontology: Literal["loc", "wikidata_linked_loc"], ) -> None: """ Add mock transformer output files to S3 so that we can extract ids from them. 
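For readers following the fixture renames above: the `_add_mock_wikidata_requests` helper reworked in this patch maps an (entity_type, node_type) pair onto query/response fixture pairs under tests/fixtures/wikidata_linked_loc/. A minimal sketch of that mapping follows; it is illustrative only and not part of the patch series — the function name expected_fixture_prefixes, the flat ordering of the two if-branches, and the inclusion of "linked_ids" (elided between hunks above) are assumptions of this sketch rather than code copied from the repository.

def expected_fixture_prefixes(entity_type: str, node_type: str) -> list[str]:
    # The id and edge queries are mocked for every test run.
    prefixes = ["all_ids", "linked_ids", "parents_instance_of", "parents_subclass_of"]
    # Streaming nodes additionally needs the per-node-type items fixtures
    # (wikidata_linked_loc/concepts/items_*.json or wikidata_linked_loc/names/items_*.json).
    if entity_type == "nodes":
        prefixes.append(f"{node_type}/items")
    # Names also exercise the HAS_FIELD_OF_WORK query fixtures.
    if node_type == "names":
        prefixes.append("has_field_of_work")
    return [f"wikidata_linked_loc/{prefix}" for prefix in prefixes]

# For example, expected_fixture_prefixes("edges", "names") lists the fixture pairs
# mocked for test_wikidata_names_source_edges: all_ids, linked_ids, both parents
# queries, and has_field_of_work.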
diff --git a/tests/sources/test_wikidata_names_source.py b/tests/sources/test_wikidata_names_source.py index 2600f27511..fe7185f7c0 100644 --- a/tests/sources/test_wikidata_names_source.py +++ b/tests/sources/test_wikidata_names_source.py @@ -1,9 +1,10 @@ -from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource from test_wikidata_concepts_source import ( - _add_mock_wikidata_requests, _add_mock_transformer_outputs, + _add_mock_wikidata_requests, ) +from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource + def test_wikidata_names_source_edges() -> None: _add_mock_transformer_outputs("wikidata_linked_loc") From a14c4acfdf4476afc7d25f41ff0fb0e8139fb243 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Mon, 10 Feb 2025 10:26:24 +0000 Subject: [PATCH 264/310] add github secret --- terraform/gha_role.tf | 2 +- terraform/github/gha_secret.tf | 14 ++++++++++++++ terraform/github/provider.tf | 14 ++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 terraform/github/gha_secret.tf create mode 100644 terraform/github/provider.tf diff --git a/terraform/gha_role.tf b/terraform/gha_role.tf index 76506e19b9..08e381232b 100644 --- a/terraform/gha_role.tf +++ b/terraform/gha_role.tf @@ -47,5 +47,5 @@ data "aws_iam_policy_document" "gha_catalogue_graph_ci" { } output "gha_catalogue_graph_ci_role_arn" { - value = module.gha_catalogue_graph_ci_role.outputs.role_arn + value = module.gha_catalogue_graph_ci_role.role_arn } \ No newline at end of file diff --git a/terraform/github/gha_secret.tf b/terraform/github/gha_secret.tf new file mode 100644 index 0000000000..8e40df6a6f --- /dev/null +++ b/terraform/github/gha_secret.tf @@ -0,0 +1,14 @@ +// this is required here in the child module to ensure correct resolution +terraform { + required_providers { + github = { + source = "integrations/github" + } + } +} + +resource "github_actions_secret" "catalogue_graph_ci" { + repository = "wellcomecollection/catalogue-graph" + secret_name = "CATALOGUE_GRAPH_CI_ROLE_ARN" + plaintext_value = module.gha_catalogue_graph_ci_role_arn +} \ No newline at end of file diff --git a/terraform/github/provider.tf b/terraform/github/provider.tf new file mode 100644 index 0000000000..18b0a688a5 --- /dev/null +++ b/terraform/github/provider.tf @@ -0,0 +1,14 @@ +# Configure the GitHub Provider +# Create a fine-grained personal access token in Github: +# Go to your Github account > Settings > Developer settings > PAT > Fine-grained tokens +# Give it a name, description and a short (7 days) expiration +# In Organization permissions select Secret - Manage Actions organization secrets: read and write +# export TF_VAR_github_token= before applying the tf +provider "github" { + owner = "wellcomecollection" + token = var.github_token +} + +variable "github_token" { + type = string +} \ No newline at end of file From 57e82cbd09b1ade59c6d5ea41fe18d6506418614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Mon, 10 Feb 2025 16:37:12 +0000 Subject: [PATCH 265/310] Secrets Manager cleanup #5903 --- README.md | 2 +- notebooks/graph_exploration.ipynb | 2 +- src/utils/aws.py | 4 ++-- terraform/lambda_indexer.tf | 2 +- terraform/load_balancer.tf | 12 ++++++++---- terraform/locals.tf | 2 ++ terraform/neptune.tf | 2 +- 7 files changed, 16 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 864df7b0d7..a08ea98d86 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ To connect to the catalogue graph, add the following 
configuration into your Jup ``` %%graph_notebook_config { - "host": , + "host": , "neptune_service": "neptune-db", "port": 8182, "ssl": true, diff --git a/notebooks/graph_exploration.ipynb b/notebooks/graph_exploration.ipynb index edbdbcfc59..cf87a5016b 100644 --- a/notebooks/graph_exploration.ipynb +++ b/notebooks/graph_exploration.ipynb @@ -39,7 +39,7 @@ "source": [ "%%graph_notebook_config\n", "{\n", - " \"host\": ,\n", + " \"host\": ,\n", " \"neptune_service\": \"neptune-db\",\n", " \"port\": 8182,\n", " \"ssl\": true,\n", diff --git a/src/utils/aws.py b/src/utils/aws.py index f29a867c4a..6309ac2cd9 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -6,8 +6,8 @@ from clients.lambda_neptune_client import LambdaNeptuneClient from clients.local_neptune_client import LocalNeptuneClient -LOAD_BALANCER_SECRET_NAME = "NeptuneTest/LoadBalancerUrl" -INSTANCE_ENDPOINT_SECRET_NAME = "NeptuneTest/InstanceEndpoint" +LOAD_BALANCER_SECRET_NAME = "catalogue-graph/neptune-nlb-url" +INSTANCE_ENDPOINT_SECRET_NAME = "catalogue-graph/neptune-cluster-endpoint" def get_secret(secret_name: str) -> str: diff --git a/terraform/lambda_indexer.tf b/terraform/lambda_indexer.tf index c1a38f9def..156be06191 100644 --- a/terraform/lambda_indexer.tf +++ b/terraform/lambda_indexer.tf @@ -24,7 +24,7 @@ data "aws_iam_policy_document" "allow_secret_read" { statement { actions = ["secretsmanager:GetSecretValue"] resources = [ - "arn:aws:secretsmanager:eu-west-1:760097843905:secret:NeptuneTest/*" + "arn:aws:secretsmanager:eu-west-1:760097843905:secret:${local.namespace}/*" ] } } diff --git a/terraform/load_balancer.tf b/terraform/load_balancer.tf index 367c27484c..a6227425b2 100644 --- a/terraform/load_balancer.tf +++ b/terraform/load_balancer.tf @@ -25,11 +25,15 @@ resource "aws_lb_target_group_attachment" "neptune_instance_attachment" { # this setup is still more convenient than only being able to connect from within the VPC. 
# If it starts bothering us, we can create a Lambda function for dynamically updating the target group IP, as outlined # here: https://aws-samples.github.io/aws-dbs-refarch-graph/src/connecting-using-a-load-balancer/ - target_id = "172.42.174.101" + target_id = data.aws_secretsmanager_secret_version.neptune_cluster_private_ip.secret_string } -locals { - catalogue_graph_nlb_url = "catalogue-graph.wellcomecollection.org" +resource "aws_secretsmanager_secret" "neptune_cluster_private_ip" { + name = "${local.namespace}/neptune-nlb-private-ip" +} + +data "aws_secretsmanager_secret_version" "neptune_cluster_private_ip" { + secret_id = aws_secretsmanager_secret.neptune_cluster_private_ip.id } # A custom certificate which will be used for TLS termination @@ -78,7 +82,7 @@ resource "aws_vpc_security_group_egress_rule" "neptune_lb_egress" { } resource "aws_secretsmanager_secret" "neptune_nlb_url" { - name = "NeptuneTest/LoadBalancerUrl" + name = "${local.namespace}/neptune-nlb-url" } resource "aws_secretsmanager_secret_version" "neptune_nlb_endpoint_url" { diff --git a/terraform/locals.tf b/terraform/locals.tf index 5d478cfc1f..f73f2bef63 100644 --- a/terraform/locals.tf +++ b/terraform/locals.tf @@ -12,6 +12,8 @@ locals { public_subnets = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_public_subnets ec_privatelink_security_group_id = local.shared_infra["ec_developer_privatelink_sg_id"] + + catalogue_graph_nlb_url = "catalogue-graph.wellcomecollection.org" } data "aws_vpc" "vpc" { diff --git a/terraform/neptune.tf b/terraform/neptune.tf index eebafe2c73..2350911f62 100644 --- a/terraform/neptune.tf +++ b/terraform/neptune.tf @@ -86,7 +86,7 @@ resource "aws_vpc_security_group_egress_rule" "neptune_egress" { } resource "aws_secretsmanager_secret" "neptune_cluster_endpoint" { - name = "NeptuneTest/InstanceEndpoint" + name = "${local.namespace}/neptune-cluster-endpoint" } resource "aws_secretsmanager_secret_version" "neptune_cluster_endpoint_value" { From 3cf2d73d3351a998a8b4a76d49afbdc16d4fa829 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Tue, 11 Feb 2025 09:46:46 +0000 Subject: [PATCH 266/310] address cr comments --- scripts/deploy_lambda_service.sh | 13 +++----- terraform/github/.terraform.lock.hcl | 46 ++++++++++++++++++++++++++++ terraform/{ => github}/gha_role.tf | 4 --- terraform/github/gha_secret.tf | 2 +- terraform/github/terraform.tf | 12 ++++++++ 5 files changed, 64 insertions(+), 13 deletions(-) create mode 100644 terraform/github/.terraform.lock.hcl rename terraform/{ => github}/gha_role.tf (93%) create mode 100644 terraform/github/terraform.tf diff --git a/scripts/deploy_lambda_service.sh b/scripts/deploy_lambda_service.sh index 5b1885b67c..849a0b7b79 100644 --- a/scripts/deploy_lambda_service.sh +++ b/scripts/deploy_lambda_service.sh @@ -4,6 +4,7 @@ set -o errexit set -o nounset set -o pipefail +export AWS_PAGER="" SERVICE_NAME=$1 @@ -16,26 +17,22 @@ echo "Deploying ${IMAGE_URI} to ${SERVICE_NAME}, @ $(date) ..." echo "Current lambda configuration for ${SERVICE_NAME}:" aws lambda get-function-configuration \ - --function-name "$SERVICE_NAME" \ - --no-cli-pager + --function-name "$SERVICE_NAME" echo "Updating lambda configuration ..." 
echo "Using ${IMAGE_URI}:" aws lambda update-function-code \ --function-name "$SERVICE_NAME" \ --image-uri "${IMAGE_URI}" \ - --publish \ - --no-cli-pager + --publish echo "Updated lambda configuration, (waiting for update @ $(date)}):" aws lambda wait function-updated \ - --function-name "$SERVICE_NAME" \ - --no-cli-pager + --function-name "$SERVICE_NAME" echo "New lambda configuration complete (@ $(date)), config after change:" aws lambda get-function-configuration \ - --function-name "$SERVICE_NAME" \ - --no-cli-pager + --function-name "$SERVICE_NAME" echo "Done deploying ${SERVICE_NAME} @ $(date)! 🚀" done \ No newline at end of file diff --git a/terraform/github/.terraform.lock.hcl b/terraform/github/.terraform.lock.hcl new file mode 100644 index 0000000000..f1904210f9 --- /dev/null +++ b/terraform/github/.terraform.lock.hcl @@ -0,0 +1,46 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. + +provider "registry.terraform.io/hashicorp/aws" { + version = "5.86.1" + hashes = [ + "h1:IekGV22ML8NcKlhaAceeWdHdXAWfFLJYaslIEkpMHps=", + "zh:0c5901c55f9bc0d353c48aa29e08d7152055dd296f3b60e1fe1634af8a7d32e4", + "zh:26ddfc89d2a410492e31f1014bbf5388f871cb67d01e80255bde3e22a468e8a6", + "zh:380c57474796e680c4477c4a69810db9389ce2717ff2da8d0f06716247dd1295", + "zh:53bf6f567be4348ddd566792fccddd9db6104111e619aa4042afb594b9a5cc75", + "zh:575c41544fd4ac969d59ecdff66428583c228a20a4893d238414e932bb2f2dc0", + "zh:63d9473a2f55f4941e98cb2fcc7031b4266c1cdc40a8f96d52b7d29504984da3", + "zh:6ec72fbc68f608a4e947a0b1356b14791330a425b7ebd3125e8023693bb37ec8", + "zh:729a0853f9ca42b60993d6233b80e1fea52cc5c9401693cef83ade502f51e3e8", + "zh:750eda82a9bde02a999677cdeb1e6d69b0d7af783e8d629c813da9be3ee6d493", + "zh:90f70d5b31bdae6b7f3aee9b2b618168a32f434eb976b935d907c95271e7e692", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:9cbf0464984b19a5a9027e8b03ebf1b56761c73f97171013b29f2d525ba91587", + "zh:aec08a2374a5cdaac3df3d6a39d98aaf58a3e0a500259b791a2dc5693280bc4b", + "zh:b638d8bd8ad11f14f7811696edcf744df07ea0f5c6033f59f3b325f921b7f54c", + "zh:bb862a4d11da06fff7c04978769cd100547bbf4735f64bfe2374b289e41a5147", + ] +} + +provider "registry.terraform.io/integrations/github" { + version = "6.5.0" + hashes = [ + "h1:rMuaCjyJo4zR9CKZoB1kCpZ9pZke4rlfd+ea0vCpaVg=", + "zh:3088bfd30c51ebfcb7c8d829465ec7b3c19af684cf1aff1ea1111ad3c6421c11", + "zh:34f9054b0123f9fa7ab8ebc73591d2cf502f1cc75e7594bde42ce799fcac32b6", + "zh:406dc2e63d43a24ac4f1b004e5c60ada3347207ea750bbd51e6199eb7f044f9f", + "zh:43e7b6cb7e5062d9b7b7cf4d23f6ea99fb9605fb014fede62cda307051063c05", + "zh:6a0923ebcc09cb98c488c11582375d2145ba965d1e6f2f69c077be8e1224020b", + "zh:a2331f06b7ed57e83eadb784211067d675826f67cf0ed051c8ab20335d83de9a", + "zh:a3f82213c98319f20438bdb92145ce1b0407cd8b8eec9745c036db10deb3d3a2", + "zh:b4b8db8537d8e6fb3f05ed875726823e1dc6925c479db8749016e71568ebafc4", + "zh:cdcf76f6f6f5c638db540490ab35bb1aacfc27204f1197004da5e950024afc06", + "zh:de36cea60efe2b74cec958f88ec5c39d467ad9443c9c9e311424c3db229c4e78", + "zh:dfb8949edc6722da66c78a19ccb1b81ac855439a28ca3badfdac5c10bbf2190d", + "zh:e1a81734cc81f4f51dd11ca8a62b420f68e72d00835ed54f84d71bd56d19f37f", + "zh:ec0d51640c3e3cf933c73d0ed79ba8b395d1b94fed8117a6438dba872aa5561f", + "zh:ec59b7c420a2358e9750e9c6a8a5ef26ccbb8a2cae417e115e86d63520759ea5", + "zh:fbd1fee2c9df3aa19cf8851ce134dea6e45ea01cb85695c1726670c285797e25", + ] +} diff --git a/terraform/gha_role.tf b/terraform/github/gha_role.tf similarity index 93% rename from 
terraform/gha_role.tf rename to terraform/github/gha_role.tf index 08e381232b..da61386f6e 100644 --- a/terraform/gha_role.tf +++ b/terraform/github/gha_role.tf @@ -44,8 +44,4 @@ data "aws_iam_policy_document" "gha_catalogue_graph_ci" { "arn:aws:lambda:eu-west-1:760097843905:function:catalogue-graph-indexer" ] } -} - -output "gha_catalogue_graph_ci_role_arn" { - value = module.gha_catalogue_graph_ci_role.role_arn } \ No newline at end of file diff --git a/terraform/github/gha_secret.tf b/terraform/github/gha_secret.tf index 8e40df6a6f..d8536052ce 100644 --- a/terraform/github/gha_secret.tf +++ b/terraform/github/gha_secret.tf @@ -10,5 +10,5 @@ terraform { resource "github_actions_secret" "catalogue_graph_ci" { repository = "wellcomecollection/catalogue-graph" secret_name = "CATALOGUE_GRAPH_CI_ROLE_ARN" - plaintext_value = module.gha_catalogue_graph_ci_role_arn + plaintext_value = module.gha_catalogue_graph_ci_role.role_arn } \ No newline at end of file diff --git a/terraform/github/terraform.tf b/terraform/github/terraform.tf new file mode 100644 index 0000000000..e380e33812 --- /dev/null +++ b/terraform/github/terraform.tf @@ -0,0 +1,12 @@ +data "terraform_remote_state" "aws_account_infrastructure" { + backend = "s3" + + config = { + assume_role = { + role_arn = "arn:aws:iam::760097843905:role/platform-read_only" + } + bucket = "wellcomecollection-platform-infra" + key = "terraform/aws-account-infrastructure/platform.tfstate" + region = "eu-west-1" + } +} \ No newline at end of file From e151a4333bea0b40fe873f4d3dd1df056b39109c Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Tue, 11 Feb 2025 09:58:37 +0000 Subject: [PATCH 267/310] Add check for valid source ID --- src/transformers/catalogue/raw_concept.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index 7380289f6c..78a2f7e3af 100644 --- a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -92,3 +92,17 @@ def source_concept_id(self) -> str | None: return source_id return None + + @property + def has_valid_source_concept(self) -> bool: + """Checks if the source concept ID format matches the specified source.""" + if (self.source == "nlm-mesh") and not self.source_concept_id.startswith("D"): + return False + + if (self.source == "lc-subjects") and not self.source_concept_id.startswith("sh"): + return False + + if (self.source == "lc-names") and not self.source_concept_id.startswith("n"): + return False + + return True From 8bce6275e30c72a84bd3c32b261f9804cfa9ebaf Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Tue, 11 Feb 2025 09:59:40 +0000 Subject: [PATCH 268/310] Allow for multiple sources in IdLabelChecker --- src/transformers/catalogue/id_label_checker.py | 16 ++++++++++++---- src/utils/aws.py | 2 -- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/transformers/catalogue/id_label_checker.py b/src/transformers/catalogue/id_label_checker.py index 89e1f13f37..4ca5191247 100644 --- a/src/transformers/catalogue/id_label_checker.py +++ b/src/transformers/catalogue/id_label_checker.py @@ -1,3 +1,4 @@ +from itertools import product from typing import Any from utils.aws import NodeType, OntologyType, fetch_from_s3 @@ -15,13 +16,20 @@ def __init__(self, *args: Any, **kwargs: Any): self.inverse.setdefault(value, []).append(key) @classmethod - def from_source(cls, node_type: NodeType, source: OntologyType) -> dict: + def from_source(cls, node_type: NodeType | 
list[NodeType], source: OntologyType | list[OntologyType]) -> dict: """Fetch source node data from s3 bulk upload files and create ID-label mapping.""" id_label_dict = {} - for row in fetch_from_s3(node_type, source): - # Extract source id and label at position 0 and 3, respectively - id_label_dict[row[0]] = row[3] + if not isinstance(node_type, list): + node_type = [node_type] + + if not isinstance(source, list): + source = [source] + + for nt, s in product(node_type, source): + for row in fetch_from_s3(nt, s): + # Extract source id and label at position 0 and 3, respectively + id_label_dict[row[0]] = row[3].lower() print(f"({len(id_label_dict)} ids and labels retrieved.)") diff --git a/src/utils/aws.py b/src/utils/aws.py index 6c0bef1a3d..86a1641f55 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -1,7 +1,6 @@ import csv import json from collections.abc import Generator -from functools import lru_cache from typing import Any, Literal import boto3 @@ -65,7 +64,6 @@ def get_neptune_client(is_local: bool) -> BaseNeptuneClient: return LambdaNeptuneClient(get_secret(INSTANCE_ENDPOINT_SECRET_NAME)) -@lru_cache def fetch_from_s3(node_type: NodeType, source: OntologyType) -> Generator[Any]: """Retrieves the bulk load file outputted by the relevant transformer so that we can extract data from it.""" linked_nodes_file_name = f"{source}_{node_type}__nodes.csv" From d58d7630c724266c63fbfe3474332774d424ea0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 11 Feb 2025 10:50:31 +0000 Subject: [PATCH 269/310] Remove unnecessary lru_cache decorator --- src/utils/aws.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/utils/aws.py b/src/utils/aws.py index 49cfe9d127..6379b9e7ba 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -1,7 +1,6 @@ import csv import json from collections.abc import Generator -from functools import lru_cache from typing import Any import boto3 @@ -61,7 +60,6 @@ def get_neptune_client(is_local: bool) -> BaseNeptuneClient: return LambdaNeptuneClient(get_secret(INSTANCE_ENDPOINT_SECRET_NAME)) -@lru_cache def fetch_transformer_output_from_s3( ontology_type: OntologyType, node_type: NodeType ) -> Generator[list[Any]]: From 1b115356264ddd6920cb8807c6481c0a741f0b27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 11 Feb 2025 11:36:29 +0000 Subject: [PATCH 270/310] Move Neptune cluster to catalogue VPC #5904 --- terraform/locals.tf | 6 +++--- terraform/terraform.tf | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/terraform/locals.tf b/terraform/locals.tf index f73f2bef63..237d6029d9 100644 --- a/terraform/locals.tf +++ b/terraform/locals.tf @@ -7,9 +7,9 @@ locals { shared_infra = data.terraform_remote_state.shared_infra.outputs - vpc_id = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_id - private_subnets = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_private_subnets - public_subnets = data.terraform_remote_state.aws_account_infrastructure.outputs.developer_vpc_public_subnets + vpc_id = data.terraform_remote_state.catalogue_aws_account_infrastructure.outputs.catalogue_vpc_delta_id + private_subnets = data.terraform_remote_state.catalogue_aws_account_infrastructure.outputs.catalogue_vpc_delta_private_subnets + public_subnets = data.terraform_remote_state.catalogue_aws_account_infrastructure.outputs.catalogue_vpc_delta_public_subnets ec_privatelink_security_group_id = 
local.shared_infra["ec_developer_privatelink_sg_id"] diff --git a/terraform/terraform.tf b/terraform/terraform.tf index 0f83de8b8a..4ad6b29a48 100644 --- a/terraform/terraform.tf +++ b/terraform/terraform.tf @@ -12,7 +12,7 @@ terraform { } } -data "terraform_remote_state" "aws_account_infrastructure" { +data "terraform_remote_state" "catalogue_aws_account_infrastructure" { backend = "s3" config = { @@ -20,7 +20,7 @@ data "terraform_remote_state" "aws_account_infrastructure" { role_arn = "arn:aws:iam::760097843905:role/platform-read_only" } bucket = "wellcomecollection-platform-infra" - key = "terraform/aws-account-infrastructure/platform.tfstate" + key = "terraform/aws-account-infrastructure/catalogue.tfstate" region = "eu-west-1" } } From 5a0924686edcd90cbd04cdfda65f6d15b02b13b1 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Tue, 11 Feb 2025 12:15:58 +0000 Subject: [PATCH 271/310] address other cr comments --- .github/workflows/ci.yml | 9 ++-- scripts/deploy_lambda_service.sh | 38 -------------- terraform/extractor_ecs_task.tf | 42 +++++++++++++++ terraform/lambda_extractor.tf | 72 -------------------------- terraform/state_machine_bulk_loader.tf | 8 +-- 5 files changed, 47 insertions(+), 122 deletions(-) delete mode 100644 scripts/deploy_lambda_service.sh delete mode 100644 terraform/lambda_extractor.tf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2d91414de2..63cd63daac 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 +29,7 @@ jobs: - uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-west-1 - role-to-assume: ${{ secrets.CATALOGUE_GRAPH_CI_ROLE_ARN }} # to be created + role-to-assume: ${{ secrets.CATALOGUE_GRAPH_CI_ROLE_ARN }} - name: Build and push artefacts run: | ./scripts/build.sh --push @@ -42,7 +42,7 @@ jobs: - uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-west-1 - role-to-assume: ${{ secrets.CATALOGUE_GRAPH_CI_ROLE_ARN }} # to be created + role-to-assume: ${{ secrets.CATALOGUE_GRAPH_CI_ROLE_ARN }} - name: Deploy bulk-loader lambda run: | ./scripts/deploy_lambda_zip.sh catalogue-graph-bulk-loader @@ -51,7 +51,4 @@ jobs: ./scripts/deploy_lambda_zip.sh catalogue-graph-bulk-load-poller - name: Deploy indexer lambda run: | - ./scripts/deploy_lambda_zip.sh catalogue-graph-indexer - - name: Deploy extractor lambda - run: | - ./scripts/deploy_lambda_service.sh catalogue-graph-extractor \ No newline at end of file + ./scripts/deploy_lambda_zip.sh catalogue-graph-indexer \ No newline at end of file diff --git a/scripts/deploy_lambda_service.sh b/scripts/deploy_lambda_service.sh deleted file mode 100644 index 849a0b7b79..0000000000 --- a/scripts/deploy_lambda_service.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env bash - -set -o errexit -set -o nounset -set -o pipefail - -export AWS_PAGER="" - -SERVICE_NAME=$1 - -FUNCTION_NAME="${SERVICE_NAME}" -REPOSITORY_URI="760097843905.dkr.ecr.eu-west-1.amazonaws.com" - -IMAGE_URI="${REPOSITORY_URI}/uk.ac.wellcome/${SERVICE_NAME}:dev" - -echo "Deploying ${IMAGE_URI} to ${SERVICE_NAME}, @ $(date) ..." - -echo "Current lambda configuration for ${SERVICE_NAME}:" -aws lambda get-function-configuration \ - --function-name "$SERVICE_NAME" - -echo "Updating lambda configuration ..." 
-echo "Using ${IMAGE_URI}:" -aws lambda update-function-code \ - --function-name "$SERVICE_NAME" \ - --image-uri "${IMAGE_URI}" \ - --publish - -echo "Updated lambda configuration, (waiting for update @ $(date)}):" -aws lambda wait function-updated \ - --function-name "$SERVICE_NAME" - -echo "New lambda configuration complete (@ $(date)), config after change:" -aws lambda get-function-configuration \ - --function-name "$SERVICE_NAME" - -echo "Done deploying ${SERVICE_NAME} @ $(date)! 🚀" -done \ No newline at end of file diff --git a/terraform/extractor_ecs_task.tf b/terraform/extractor_ecs_task.tf index b89a3fe94e..1897830b58 100644 --- a/terraform/extractor_ecs_task.tf +++ b/terraform/extractor_ecs_task.tf @@ -27,3 +27,45 @@ resource "aws_iam_role_policy" "ecs_stream_to_s3_policy" { role = module.extractor_ecs_task.task_role_name policy = data.aws_iam_policy_document.stream_to_s3.json } + + +# openCypher queries will be streamed to this SNS topic (when SNS is chosen as the streaming destination) +module "catalogue_graph_queries_topic" { + source = "github.com/wellcomecollection/terraform-aws-sns-topic.git?ref=v1.0.0" + name = "catalogue_graph_queries" +} + +data "aws_iam_policy_document" "stream_to_sns" { + statement { + actions = [ + "sns:Publish", + ] + + resources = [ + module.catalogue_graph_queries_topic.arn + ] + } +} + +data "aws_iam_policy_document" "stream_to_s3" { + statement { + actions = [ + "s3:PutObject", + "s3:GetObject" + ] + + resources = [ + "${aws_s3_bucket.neptune_bulk_upload_bucket.arn}/*" + ] + } +} + +resource "aws_iam_role_policy" "stream_to_sns_policy" { + role = module.extractor_lambda.lambda_role.name + policy = data.aws_iam_policy_document.stream_to_sns.json +} + +resource "aws_iam_role_policy" "stream_to_s3_policy" { + role = module.extractor_lambda.lambda_role.name + policy = data.aws_iam_policy_document.stream_to_s3.json +} \ No newline at end of file diff --git a/terraform/lambda_extractor.tf b/terraform/lambda_extractor.tf deleted file mode 100644 index a09f9c1366..0000000000 --- a/terraform/lambda_extractor.tf +++ /dev/null @@ -1,72 +0,0 @@ -module "extractor_lambda" { - source = "git@github.com:wellcomecollection/terraform-aws-lambda?ref=v1.2.0" - - name = "catalogue-graph-extractor" - description = "Extracts source concepts and turns them into Cypher queries." - runtime = "python3.13" - - filename = "../target/build.zip" - source_code_hash = filesha256("../target/build.zip") - - handler = "extractor.lambda_handler" - - // This Lambda does not need a lot of memory, but it downloads and processes large datasets (with up to 10 million - // items) and therefore needs the additional compute and networking capacity which comes with increased memory. 
- memory_size = 4096 - timeout = 15 * 60 // 15 minutes - - vpc_config = { - subnet_ids = local.private_subnets - security_group_ids = [aws_security_group.graph_indexer_lambda_security_group.id] - } - - environment = { - variables = { - S3_BULK_LOAD_BUCKET_NAME = aws_s3_bucket.neptune_bulk_upload_bucket.bucket - GRAPH_QUERIES_SNS_TOPIC_ARN = module.catalogue_graph_queries_topic.arn - } - } - - # error_alarm_topic_arn = data.terraform_remote_state.monitoring.outputs["platform_lambda_error_alerts_topic_arn"] -} - -# openCypher queries will be streamed to this SNS topic (when SNS is chosen as the streaming destination) -module "catalogue_graph_queries_topic" { - source = "github.com/wellcomecollection/terraform-aws-sns-topic.git?ref=v1.0.0" - name = "catalogue_graph_queries" -} - -data "aws_iam_policy_document" "stream_to_sns" { - statement { - actions = [ - "sns:Publish", - ] - - resources = [ - module.catalogue_graph_queries_topic.arn - ] - } -} - -data "aws_iam_policy_document" "stream_to_s3" { - statement { - actions = [ - "s3:PutObject", - "s3:GetObject" - ] - - resources = [ - "${aws_s3_bucket.neptune_bulk_upload_bucket.arn}/*" - ] - } -} - -resource "aws_iam_role_policy" "stream_to_sns_policy" { - role = module.extractor_lambda.lambda_role.name - policy = data.aws_iam_policy_document.stream_to_sns.json -} - -resource "aws_iam_role_policy" "stream_to_s3_policy" { - role = module.extractor_lambda.lambda_role.name - policy = data.aws_iam_policy_document.stream_to_s3.json -} diff --git a/terraform/state_machine_bulk_loader.tf b/terraform/state_machine_bulk_loader.tf index fddf4b8b59..c9f62079af 100644 --- a/terraform/state_machine_bulk_loader.tf +++ b/terraform/state_machine_bulk_loader.tf @@ -1,7 +1,3 @@ -locals { - bulk_loader_lambda = "${module.bulk_loader_lambda.lambda.arn}:${module.bulk_loader_lambda.lambda.version}" - bulk_load_poller_lambda = "${module.bulk_load_poller_lambda.lambda.arn}:${module.bulk_load_poller_lambda.lambda.version}" -} resource "aws_sfn_state_machine" "catalogue_graph_bulk_loader" { name = "catalogue-graph-bulk-loader" role_arn = aws_iam_role.state_machine_execution_role.arn @@ -16,7 +12,7 @@ resource "aws_sfn_state_machine" "catalogue_graph_bulk_loader" { "Resource" : "arn:aws:states:::lambda:invoke", "Output" : "{% $states.result.Payload %}", "Arguments" : { - "FunctionName" : local.bulk_loader_lambda, + "FunctionName" : module.bulk_loader_lambda.lambda.arn, "Payload" : "{% $states.input %}" }, "Next" : "Wait 30 seconds" @@ -31,7 +27,7 @@ resource "aws_sfn_state_machine" "catalogue_graph_bulk_loader" { "Resource" : "arn:aws:states:::lambda:invoke", "Output" : "{% $states.result.Payload %}", "Arguments" : { - "FunctionName" : local.bulk_load_poller_lambda, + "FunctionName" : module.bulk_load_poller_lambda.lambda.arn, "Payload" : "{% $states.input %}" }, "Next" : "Load complete?" 
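For orientation, the catalogue-graph-bulk-loader state machine updated above drives a start-load → wait 30 seconds → poll → "Load complete?" loop between the bulk loader and bulk load poller lambdas. A rough Python sketch of the polling step follows; it is illustrative only and not taken from this patch series — it assumes the poller is handed a Neptune bulk-load id and calls Neptune's loader status endpoint, and it ignores request signing and error handling. The NEPTUNE_ENDPOINT constant and the is_load_complete key are placeholders, not names from the repository.

import requests

NEPTUNE_ENDPOINT = "https://example-neptune-cluster:8182"  # placeholder value

def poll_bulk_load(load_id: str) -> dict:
    # Ask Neptune's loader API how the bulk load identified by load_id is doing.
    response = requests.get(f"{NEPTUNE_ENDPOINT}/loader/{load_id}", timeout=30)
    response.raise_for_status()
    status = response.json()["payload"]["overallStatus"]["status"]
    # The state machine's "Load complete?" choice loops back to "Wait 30 seconds"
    # until the payload returned here signals completion.
    return {"load_id": load_id, "is_load_complete": status == "LOAD_COMPLETED"}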
From 78a6107d99b87937e7bd2d002205da8f0ff7ee03 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Tue, 11 Feb 2025 14:11:56 +0000 Subject: [PATCH 272/310] Add missing Subject type --- src/models/graph_node.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/models/graph_node.py b/src/models/graph_node.py index a58cfb5fe0..bbdbe78e4e 100644 --- a/src/models/graph_node.py +++ b/src/models/graph_node.py @@ -48,6 +48,7 @@ class SourceName(SourceConcept): "Meeting", "Genre", "Period", + "Subject" ] ConceptSource = Literal[ From d72528fede5eaeb230f37fb917631332a7e21a09 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Tue, 11 Feb 2025 14:12:16 +0000 Subject: [PATCH 273/310] Refactor source concept checks --- src/transformers/catalogue/raw_concept.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index 78a2f7e3af..b0e2df37f2 100644 --- a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -96,13 +96,14 @@ def source_concept_id(self) -> str | None: @property def has_valid_source_concept(self) -> bool: """Checks if the source concept ID format matches the specified source.""" - if (self.source == "nlm-mesh") and not self.source_concept_id.startswith("D"): - return False + if isinstance(self.source_concept_id, str): + if (self.source == "nlm-mesh") and self.source_concept_id.startswith("D"): + return True + + if (self.source == "lc-subjects") and self.source_concept_id.startswith("sh"): + return True + + if (self.source == "lc-names") and self.source_concept_id.startswith("n"): + return True - if (self.source == "lc-subjects") and not self.source_concept_id.startswith("sh"): - return False - - if (self.source == "lc-names") and not self.source_concept_id.startswith("n"): - return False - - return True + return False From 6497e580c7e4eabec2a72284cabf5157da1eb629 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Tue, 11 Feb 2025 14:13:32 +0000 Subject: [PATCH 274/310] Add hsc edge extraction --- .../catalogue/concepts_transformer.py | 41 +++++++++++++++++-- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/src/transformers/catalogue/concepts_transformer.py b/src/transformers/catalogue/concepts_transformer.py index daa133d12f..a3021898bd 100644 --- a/src/transformers/catalogue/concepts_transformer.py +++ b/src/transformers/catalogue/concepts_transformer.py @@ -1,16 +1,18 @@ from collections.abc import Generator -from models.graph_edge import BaseEdge +from models.graph_edge import ConceptHasSourceConcept from models.graph_node import Concept from sources.catalogue.concepts_source import CatalogueConceptsSource from transformers.base_transformer import BaseTransformer from .raw_concept import RawCatalogueConcept +from .id_label_checker import IdLabelChecker class CatalogueConceptsTransformer(BaseTransformer): def __init__(self, url: str): self.source = CatalogueConceptsSource(url) + self.id_label_checker = IdLabelChecker.from_source(node_type=["concepts", "locations"], source=["loc", "mesh"]) def transform_node(self, raw_node: dict) -> Concept | None: raw_concept = RawCatalogueConcept(raw_node) @@ -25,6 +27,37 @@ def transform_node(self, raw_node: dict) -> Concept | None: type=raw_concept.type, ) - def extract_edges(self, raw_node: RawCatalogueConcept) -> Generator[BaseEdge]: - # TODO: Extract `HAS_SOURCE_CONCEPT` edges - yield from () + def extract_edges(self, raw_node: dict) -> 
Generator[ConceptHasSourceConcept]: + raw_concept = RawCatalogueConcept(raw_node) + + if not raw_concept.is_concept: + return + + else: + if (raw_concept.source == "label-derived") and (raw_concept.type not in ["Person", "Organisation", "Agent"]): + # Generate edges via label + assert hasattr(self.id_label_checker, "inverse") + for source_concept_id in self.id_label_checker.inverse.get(raw_concept.label.lower(), []): + yield ConceptHasSourceConcept( + from_id=raw_concept.wellcome_id, + to_id=source_concept_id, + attributes={ + "qualifier": None, + "matched_by": "label" + } + ) + + elif raw_concept.has_valid_source_concept: + # Generate edges via ID + if (raw_concept.source != "nlm-mesh") or (self.id_label_checker.get(raw_concept.source_concept_id) == raw_concept.label.lower()): + yield ConceptHasSourceConcept( + from_id=raw_concept.wellcome_id, + to_id=str(raw_concept.source_concept_id), + attributes={ + "qualifier": raw_concept.mesh_qualifier, + "matched_by": "identifier" + } + ) + + else: + return From e941b7dc926ae1167617ffd50a78b0523dce8127 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Tue, 11 Feb 2025 14:14:52 +0000 Subject: [PATCH 275/310] Apply auto-formatting rules --- src/models/graph_node.py | 2 +- .../catalogue/concepts_transformer.py | 28 +++++++++++-------- .../catalogue/id_label_checker.py | 6 +++- src/transformers/catalogue/raw_concept.py | 12 ++++---- 4 files changed, 30 insertions(+), 18 deletions(-) diff --git a/src/models/graph_node.py b/src/models/graph_node.py index bbdbe78e4e..b869b97a2e 100644 --- a/src/models/graph_node.py +++ b/src/models/graph_node.py @@ -48,7 +48,7 @@ class SourceName(SourceConcept): "Meeting", "Genre", "Period", - "Subject" + "Subject", ] ConceptSource = Literal[ diff --git a/src/transformers/catalogue/concepts_transformer.py b/src/transformers/catalogue/concepts_transformer.py index a3021898bd..027d2afce9 100644 --- a/src/transformers/catalogue/concepts_transformer.py +++ b/src/transformers/catalogue/concepts_transformer.py @@ -5,14 +5,16 @@ from sources.catalogue.concepts_source import CatalogueConceptsSource from transformers.base_transformer import BaseTransformer -from .raw_concept import RawCatalogueConcept from .id_label_checker import IdLabelChecker +from .raw_concept import RawCatalogueConcept class CatalogueConceptsTransformer(BaseTransformer): def __init__(self, url: str): self.source = CatalogueConceptsSource(url) - self.id_label_checker = IdLabelChecker.from_source(node_type=["concepts", "locations"], source=["loc", "mesh"]) + self.id_label_checker = IdLabelChecker.from_source( + node_type=["concepts", "locations"], source=["loc", "mesh"] + ) def transform_node(self, raw_node: dict) -> Concept | None: raw_concept = RawCatalogueConcept(raw_node) @@ -34,29 +36,33 @@ def extract_edges(self, raw_node: dict) -> Generator[ConceptHasSourceConcept]: return else: - if (raw_concept.source == "label-derived") and (raw_concept.type not in ["Person", "Organisation", "Agent"]): + if (raw_concept.source == "label-derived") and ( + raw_concept.type not in ["Person", "Organisation", "Agent"] + ): # Generate edges via label assert hasattr(self.id_label_checker, "inverse") - for source_concept_id in self.id_label_checker.inverse.get(raw_concept.label.lower(), []): + for source_concept_id in self.id_label_checker.inverse.get( + raw_concept.label.lower(), [] + ): yield ConceptHasSourceConcept( from_id=raw_concept.wellcome_id, to_id=source_concept_id, - attributes={ - "qualifier": None, - "matched_by": "label" - } + 
attributes={"qualifier": None, "matched_by": "label"}, ) elif raw_concept.has_valid_source_concept: # Generate edges via ID - if (raw_concept.source != "nlm-mesh") or (self.id_label_checker.get(raw_concept.source_concept_id) == raw_concept.label.lower()): + if (raw_concept.source != "nlm-mesh") or ( + self.id_label_checker.get(raw_concept.source_concept_id) + == raw_concept.label.lower() + ): yield ConceptHasSourceConcept( from_id=raw_concept.wellcome_id, to_id=str(raw_concept.source_concept_id), attributes={ "qualifier": raw_concept.mesh_qualifier, - "matched_by": "identifier" - } + "matched_by": "identifier", + }, ) else: diff --git a/src/transformers/catalogue/id_label_checker.py b/src/transformers/catalogue/id_label_checker.py index 4ca5191247..e70c4a1611 100644 --- a/src/transformers/catalogue/id_label_checker.py +++ b/src/transformers/catalogue/id_label_checker.py @@ -16,7 +16,11 @@ def __init__(self, *args: Any, **kwargs: Any): self.inverse.setdefault(value, []).append(key) @classmethod - def from_source(cls, node_type: NodeType | list[NodeType], source: OntologyType | list[OntologyType]) -> dict: + def from_source( + cls, + node_type: NodeType | list[NodeType], + source: OntologyType | list[OntologyType], + ) -> dict: """Fetch source node data from s3 bulk upload files and create ID-label mapping.""" id_label_dict = {} diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index b0e2df37f2..c3300dab17 100644 --- a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -92,18 +92,20 @@ def source_concept_id(self) -> str | None: return source_id return None - + @property def has_valid_source_concept(self) -> bool: """Checks if the source concept ID format matches the specified source.""" if isinstance(self.source_concept_id, str): if (self.source == "nlm-mesh") and self.source_concept_id.startswith("D"): return True - - if (self.source == "lc-subjects") and self.source_concept_id.startswith("sh"): + + if (self.source == "lc-subjects") and self.source_concept_id.startswith( + "sh" + ): return True - + if (self.source == "lc-names") and self.source_concept_id.startswith("n"): return True - + return False From f4951046a2c10ff4646d696acc7f0b23e86f4f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 11 Feb 2025 15:04:56 +0000 Subject: [PATCH 276/310] Remove leftover extractor lambda resources --- terraform/extractor_ecs_task.tf | 10 ---------- terraform/iam_state_machines.tf | 1 - 2 files changed, 11 deletions(-) diff --git a/terraform/extractor_ecs_task.tf b/terraform/extractor_ecs_task.tf index 1897830b58..c572a5df34 100644 --- a/terraform/extractor_ecs_task.tf +++ b/terraform/extractor_ecs_task.tf @@ -59,13 +59,3 @@ data "aws_iam_policy_document" "stream_to_s3" { ] } } - -resource "aws_iam_role_policy" "stream_to_sns_policy" { - role = module.extractor_lambda.lambda_role.name - policy = data.aws_iam_policy_document.stream_to_sns.json -} - -resource "aws_iam_role_policy" "stream_to_s3_policy" { - role = module.extractor_lambda.lambda_role.name - policy = data.aws_iam_policy_document.stream_to_s3.json -} \ No newline at end of file diff --git a/terraform/iam_state_machines.tf b/terraform/iam_state_machines.tf index 08152c10eb..167228aa6f 100644 --- a/terraform/iam_state_machines.tf +++ b/terraform/iam_state_machines.tf @@ -39,7 +39,6 @@ resource "aws_iam_policy" "state_machine_policy" { Effect = "Allow", Action = ["lambda:InvokeFunction"], Resource = [ - 
module.extractor_lambda.lambda.arn, module.bulk_loader_lambda.lambda.arn, module.bulk_load_poller_lambda.lambda.arn ] From 30a29f9f8d6f6c483a5a76eed9968da3daf2e6e0 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Tue, 11 Feb 2025 15:17:43 +0000 Subject: [PATCH 277/310] Add transformer for catalogue edges --- terraform/variables.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/terraform/variables.tf b/terraform/variables.tf index 16134a7380..9532b59d09 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -98,5 +98,10 @@ variable "state_machine_inputs" { "transformer_type" : "catalogue_concepts", "entity_type" : "nodes" }, + { + "label" : "Catalogue Concept Edges", + "transformer_type" : "catalogue_concepts", + "entity_type" : "edges" + }, ] } From 405632595d40865ddc1609ed16f35259fe773189 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Tue, 11 Feb 2025 15:36:37 +0000 Subject: [PATCH 278/310] indent + correct repo name + backend for new tfstack --- .github/workflows/ci.yml | 8 ++++---- terraform/github/gha_secret.tf | 2 +- terraform/github/terraform.tf | 14 ++++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 63cd63daac..085e09cfdf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,11 +14,11 @@ jobs: with: python-version: '3.13' - name: CI setup - run: | - ./scripts/ci-setup.sh + run: | + ./scripts/ci-setup.sh - name: Test - run: | - ./scripts/test.sh + run: | + ./scripts/test.sh build: runs-on: ubuntu-latest diff --git a/terraform/github/gha_secret.tf b/terraform/github/gha_secret.tf index d8536052ce..bb3d1294c0 100644 --- a/terraform/github/gha_secret.tf +++ b/terraform/github/gha_secret.tf @@ -8,7 +8,7 @@ terraform { } resource "github_actions_secret" "catalogue_graph_ci" { - repository = "wellcomecollection/catalogue-graph" + repository = "catalogue-graph" secret_name = "CATALOGUE_GRAPH_CI_ROLE_ARN" plaintext_value = module.gha_catalogue_graph_ci_role.role_arn } \ No newline at end of file diff --git a/terraform/github/terraform.tf b/terraform/github/terraform.tf index e380e33812..fd6aaf846c 100644 --- a/terraform/github/terraform.tf +++ b/terraform/github/terraform.tf @@ -1,3 +1,17 @@ +terraform { + required_version = ">= 0.11" + + backend "s3" { + assume_role = { + role_arn = "arn:aws:iam::760097843905:role/platform-developer" + } + bucket = "wellcomecollection-platform-infra" + key = "terraform/catalogue/ci.tfstate" + dynamodb_table = "terraform-locktable" + region = "eu-west-1" + } +} + data "terraform_remote_state" "aws_account_infrastructure" { backend = "s3" From c8f31f5173ccb7fa9a9e6a5fa9926978cc60a690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Tue, 11 Feb 2025 16:24:57 +0000 Subject: [PATCH 279/310] Update privatelink security group --- terraform/locals.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/locals.tf b/terraform/locals.tf index 237d6029d9..1787c77748 100644 --- a/terraform/locals.tf +++ b/terraform/locals.tf @@ -11,7 +11,7 @@ locals { private_subnets = data.terraform_remote_state.catalogue_aws_account_infrastructure.outputs.catalogue_vpc_delta_private_subnets public_subnets = data.terraform_remote_state.catalogue_aws_account_infrastructure.outputs.catalogue_vpc_delta_public_subnets - ec_privatelink_security_group_id = local.shared_infra["ec_developer_privatelink_sg_id"] + ec_privatelink_security_group_id = local.shared_infra["ec_platform_privatelink_sg_id"] 
catalogue_graph_nlb_url = "catalogue-graph.wellcomecollection.org" } From dc983f78939ca78e4a0b5b44c3e2b935ac4d79b3 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Tue, 11 Feb 2025 16:27:20 +0000 Subject: [PATCH 280/310] Use csv.DictReader to fetch data from s3 --- .../linked_ontology_id_type_checker.py | 2 +- .../catalogue/concepts_transformer.py | 56 +++++++++---------- .../catalogue/id_label_checker.py | 2 +- src/utils/aws.py | 7 +-- 4 files changed, 30 insertions(+), 37 deletions(-) diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index f80c6b2d1d..5715416e13 100644 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -22,7 +22,7 @@ def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: # Retrieve the bulk load file outputted by the relevant transformer so that we can extract ids from it. ids = set() for row in fetch_from_s3(node_type, self.linked_ontology): - ids.add(row[0]) + ids.add(row[":ID"]) print(f"({len(ids)} ids retrieved.)") diff --git a/src/transformers/catalogue/concepts_transformer.py b/src/transformers/catalogue/concepts_transformer.py index 027d2afce9..e644150a23 100644 --- a/src/transformers/catalogue/concepts_transformer.py +++ b/src/transformers/catalogue/concepts_transformer.py @@ -35,35 +35,31 @@ def extract_edges(self, raw_node: dict) -> Generator[ConceptHasSourceConcept]: if not raw_concept.is_concept: return - else: - if (raw_concept.source == "label-derived") and ( - raw_concept.type not in ["Person", "Organisation", "Agent"] + if (raw_concept.source == "label-derived") and ( + raw_concept.type not in ["Person", "Organisation", "Agent"] + ): + # Generate edges via label + assert hasattr(self.id_label_checker, "inverse") + for source_concept_id in self.id_label_checker.inverse.get( + raw_concept.label.lower(), [] ): - # Generate edges via label - assert hasattr(self.id_label_checker, "inverse") - for source_concept_id in self.id_label_checker.inverse.get( - raw_concept.label.lower(), [] - ): - yield ConceptHasSourceConcept( - from_id=raw_concept.wellcome_id, - to_id=source_concept_id, - attributes={"qualifier": None, "matched_by": "label"}, - ) + yield ConceptHasSourceConcept( + from_id=raw_concept.wellcome_id, + to_id=source_concept_id, + attributes={"qualifier": None, "matched_by": "label"}, + ) - elif raw_concept.has_valid_source_concept: - # Generate edges via ID - if (raw_concept.source != "nlm-mesh") or ( - self.id_label_checker.get(raw_concept.source_concept_id) - == raw_concept.label.lower() - ): - yield ConceptHasSourceConcept( - from_id=raw_concept.wellcome_id, - to_id=str(raw_concept.source_concept_id), - attributes={ - "qualifier": raw_concept.mesh_qualifier, - "matched_by": "identifier", - }, - ) - - else: - return + if raw_concept.has_valid_source_concept: + # Generate edges via ID + if (raw_concept.source != "nlm-mesh") or ( + self.id_label_checker.get(raw_concept.source_concept_id) + == raw_concept.label.lower() + ): + yield ConceptHasSourceConcept( + from_id=raw_concept.wellcome_id, + to_id=str(raw_concept.source_concept_id), + attributes={ + "qualifier": raw_concept.mesh_qualifier, + "matched_by": "identifier", + }, + ) diff --git a/src/transformers/catalogue/id_label_checker.py b/src/transformers/catalogue/id_label_checker.py index e70c4a1611..a5608a863e 100644 --- a/src/transformers/catalogue/id_label_checker.py +++ b/src/transformers/catalogue/id_label_checker.py @@ -33,7 
+33,7 @@ def from_source( for nt, s in product(node_type, source): for row in fetch_from_s3(nt, s): # Extract source id and label at position 0 and 3, respectively - id_label_dict[row[0]] = row[3].lower() + id_label_dict[row[":ID"]] = row["label:String"].lower() print(f"({len(id_label_dict)} ids and labels retrieved.)") diff --git a/src/utils/aws.py b/src/utils/aws.py index 86a1641f55..673bee203a 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -73,10 +73,7 @@ def fetch_from_s3(node_type: NodeType, source: OntologyType) -> Generator[Any]: transport_params = {"client": boto3.client("s3")} with smart_open.open(s3_url, "r", transport_params=transport_params) as f: - csv_reader = csv.reader(f) + csv_reader = csv.DictReader(f) - for i, row in enumerate(csv_reader): - # Skip header - if i == 0: - continue + for row in csv_reader: yield row From ebcd690f17463402ce0785dc1fbf0c568ad9ea30 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Tue, 11 Feb 2025 16:36:07 +0000 Subject: [PATCH 281/310] Update import config --- src/utils/aws.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils/aws.py b/src/utils/aws.py index 673bee203a..90c65c66ee 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -9,7 +9,7 @@ from clients.base_neptune_client import BaseNeptuneClient from clients.lambda_neptune_client import LambdaNeptuneClient from clients.local_neptune_client import LocalNeptuneClient -from config import S3_BULK_LOAD_BUCKET_NAME +import config LOAD_BALANCER_SECRET_NAME = "catalogue-graph/neptune-nlb-url" INSTANCE_ENDPOINT_SECRET_NAME = "catalogue-graph/neptune-cluster-endpoint" @@ -67,7 +67,7 @@ def get_neptune_client(is_local: bool) -> BaseNeptuneClient: def fetch_from_s3(node_type: NodeType, source: OntologyType) -> Generator[Any]: """Retrieves the bulk load file outputted by the relevant transformer so that we can extract data from it.""" linked_nodes_file_name = f"{source}_{node_type}__nodes.csv" - s3_url = f"s3://{S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" + s3_url = f"s3://{config.S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" print(f"Retrieving ids of type '{node_type}' from ontology '{source}' from S3.") From 69cff52f639f45499297b99177143ce977a79254 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 12 Feb 2025 09:44:36 +0000 Subject: [PATCH 282/310] Fix tests --- .../linked_ontology_id_type_checker.py | 3 +++ .../loc/transformer_output_concepts_nodes.csv | 2 +- .../transformer_output_locations_nodes.csv | 2 +- .../transformer_output_concepts_nodes.csv | 7 +++++++ .../transformer_output_locations_nodes.csv | 7 +++++++ .../sources/test_wikidata_concepts_source.py | 19 ++++--------------- tests/test_extractor.py | 3 ++- tests/test_utils.py | 15 +++++++++++++++ .../test_catalogue_concepts_transformer.py | 6 ++++-- .../test_wikidata_concepts_transformer.py | 11 ++++------- 10 files changed, 48 insertions(+), 27 deletions(-) create mode 100644 tests/fixtures/mesh/transformer_output_concepts_nodes.csv create mode 100644 tests/fixtures/mesh/transformer_output_locations_nodes.csv diff --git a/src/sources/wikidata/linked_ontology_id_type_checker.py b/src/sources/wikidata/linked_ontology_id_type_checker.py index 5715416e13..f04ab07c9b 100644 --- a/src/sources/wikidata/linked_ontology_id_type_checker.py +++ b/src/sources/wikidata/linked_ontology_id_type_checker.py @@ -1,3 +1,5 @@ +from functools import lru_cache + from utils.aws import NodeType, OntologyType, fetch_from_s3 @@ -17,6 +19,7 @@ def __init__(self, node_type: NodeType, 
linked_ontology: OntologyType): "Invalid node_type for ontology type MeSH." ) + @lru_cache def _get_linked_ontology_ids(self, node_type: NodeType) -> set[str]: """Return all ids classified under a given `node_type` for the selected ontology.""" # Retrieve the bulk load file outputted by the relevant transformer so that we can extract ids from it. diff --git a/tests/fixtures/loc/transformer_output_concepts_nodes.csv b/tests/fixtures/loc/transformer_output_concepts_nodes.csv index fedbad4077..abd628797f 100644 --- a/tests/fixtures/loc/transformer_output_concepts_nodes.csv +++ b/tests/fixtures/loc/transformer_output_concepts_nodes.csv @@ -1,3 +1,3 @@ -:ID,:LABEL,id,label,source,alternative_ids,alternative_labels,description +:ID,:LABEL,id:String,label:String,source:String,alternative_ids:String,alternative_labels:String,description:String sh00000001,SourceConcept,sh00000001,ActionScript (Computer program language),lc-subjects,,,null sh00000002,SourceConcept,sh00000002,Tacos,lc-subjects,,,null diff --git a/tests/fixtures/loc/transformer_output_locations_nodes.csv b/tests/fixtures/loc/transformer_output_locations_nodes.csv index d259fa3a68..40f31ec7ef 100644 --- a/tests/fixtures/loc/transformer_output_locations_nodes.csv +++ b/tests/fixtures/loc/transformer_output_locations_nodes.csv @@ -1,3 +1,3 @@ -:ID,:LABEL,id,label,source,alternative_ids,alternative_labels,description,latitude,longitude +:ID,:LABEL,id:String,label:String,source:String,alternative_ids:String,alternative_labels:String,description:String,latitude,longitude sh00000015,SourceLocation,sh00000015,"Trail Creek Watershed (Jackson County, Or.)",lc-subjects,,,null,null,null sh00000016,SourceLocation,sh00000016,"Trail Creek (Jackson County, Or.)",lc-subjects,,,null,null,null \ No newline at end of file diff --git a/tests/fixtures/mesh/transformer_output_concepts_nodes.csv b/tests/fixtures/mesh/transformer_output_concepts_nodes.csv new file mode 100644 index 0000000000..832d66ffec --- /dev/null +++ b/tests/fixtures/mesh/transformer_output_concepts_nodes.csv @@ -0,0 +1,7 @@ +:ID,:LABEL,id:String,label:String,source:String,alternative_ids:String,alternative_labels:String,description:String +D000001,SourceConcept,D000001,Calcimycin,nlm-mesh,D02.355.291.933.125||D02.540.576.625.125||D03.633.100.221.173||D04.345.241.654.125||D04.345.674.625.125,A-23187,"An ionophorous, polyether antibiotic from Streptomyces chartreusensis. It binds and transports CALCIUM and other divalent cations across membranes and uncouples oxidative phosphorylation while inhibiting ATPase of rat liver mitochondria. The substance is used mostly as a biochemical tool to study the role of divalent cations in various biological systems. + " +D000002,SourceConcept,D000002,Temefos,nlm-mesh,D02.705.400.625.800||D02.705.539.345.800||D02.886.300.692.800,Difos||Abate,"An organothiophosphate insecticide. + " +D000003,SourceConcept,D000003,Abattoirs,nlm-mesh,J01.576.423.200.700.100||J03.540.020,,"Places where animals are slaughtered and dressed for market. 
+ " \ No newline at end of file diff --git a/tests/fixtures/mesh/transformer_output_locations_nodes.csv b/tests/fixtures/mesh/transformer_output_locations_nodes.csv new file mode 100644 index 0000000000..d34703565f --- /dev/null +++ b/tests/fixtures/mesh/transformer_output_locations_nodes.csv @@ -0,0 +1,7 @@ +:ID,:LABEL,id:String,label:String,source:String,alternative_ids:String,alternative_labels:String,description:String,latitude:Float,longitude:Float +D000346,SourceLocation,D000346,Afghanistan,nlm-mesh,Z01.252.245.782.250,,"Country located north and west of Pakistan, and east of Iran. The capital is Kabul. + ",, +D000349,SourceLocation,D000349,Africa,nlm-mesh,Z01.058,,"The continent south of EUROPE, east of the ATLANTIC OCEAN and west of the INDIAN OCEAN. + ",, +D000350,SourceLocation,D000350,"Africa, Central",nlm-mesh,Z01.058.290.100,,"The geographical area of Africa comprising CAMEROON; CENTRAL AFRICAN REPUBLIC; CHAD; CONGO; DEMOCRATIC REPUBLIC OF THE CONGO; EQUATORIAL GUINEA; GABON; RWANDA.and SAO TOME AND PRINCIPE. + ",, \ No newline at end of file diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index 4fdfb82da0..a24d12b74d 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -2,7 +2,7 @@ from typing import Literal from test_mocks import MockRequest, MockSmartOpen -from test_utils import load_fixture +from test_utils import load_fixture, add_mock_transformer_outputs from config import WIKIDATA_SPARQL_URL from sources.wikidata.linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker @@ -28,19 +28,8 @@ def _add_mock_wikidata_requests(node_type: Literal["edges", "nodes"]) -> None: ) -def _add_mock_loc_transformer_outputs() -> None: - """ - Add mock LoC transformer output files to S3 so that the LinkedOntologyIdTypeChecker class can extract ids from them. 
- """ - for node_type in ["concepts", "locations", "names"]: - MockSmartOpen.mock_s3_file( - f"s3://bulk_load_test_bucket/loc_{node_type}__nodes.csv", - load_fixture(f"loc/transformer_output_{node_type}_nodes.csv").decode(), - ) - - def test_wikidata_concepts_source_edges() -> None: - _add_mock_loc_transformer_outputs() + add_mock_transformer_outputs(sources=["loc"], node_types=["concepts", "locations", "names"]) _add_mock_wikidata_requests("edges") mesh_concepts_source = WikidataLinkedOntologySource( @@ -71,7 +60,7 @@ def test_wikidata_concepts_source_edges() -> None: def test_wikidata_concepts_source_nodes() -> None: - _add_mock_loc_transformer_outputs() + add_mock_transformer_outputs(sources=["loc"], node_types=["concepts", "locations", "names"]) _add_mock_wikidata_requests("nodes") mesh_concepts_source = WikidataLinkedOntologySource( @@ -89,7 +78,7 @@ def test_wikidata_concepts_source_nodes() -> None: def test_wikidata_linked_ontology_id_checker() -> None: - _add_mock_loc_transformer_outputs() + add_mock_transformer_outputs(sources=["loc"], node_types=["concepts", "locations", "names"]) id_checker = LinkedOntologyIdTypeChecker("locations", "loc") assert id_checker.id_is_valid("sh00000001") diff --git a/tests/test_extractor.py b/tests/test_extractor.py index f462ad34ac..6901572a28 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -3,7 +3,7 @@ import pytest from test_mocks import MOCK_INSTANCE_ENDPOINT, MockRequest, MockResponseInput -from test_utils import load_fixture +from test_utils import load_fixture, add_mock_transformer_outputs from typing_extensions import get_args from config import ( @@ -138,6 +138,7 @@ def test_lambda_handler( mock_responses: list[MockResponseInput], ) -> None: MockRequest.mock_responses(mock_responses) + add_mock_transformer_outputs(sources=["loc", "mesh"], node_types=["concepts", "locations"]) lambda_handler(lambda_event, None) transformer_type = lambda_event["transformer_type"] diff --git a/tests/test_utils.py b/tests/test_utils.py index 7b34986fe9..b9d97d6751 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,21 @@ import os +from itertools import product +from typing import Literal + +from test_mocks import MockSmartOpen def load_fixture(file_name: str) -> bytes: with open(f"{os.path.dirname(__file__)}/fixtures/{file_name}", "rb") as f: return f.read() + + +def add_mock_transformer_outputs(sources: Literal["loc", "mesh"], node_types: Literal["concepts", "locations", "names"]) -> None: + """ + Add mock transformer output files to S3 so that the IdLabelChecker class can extract ids and labels from them. 
+ """ + for source, node_type in product(sources, node_types): + MockSmartOpen.mock_s3_file( + f"s3://bulk_load_test_bucket/{source}_{node_type}__nodes.csv", + load_fixture(f"{source}/transformer_output_{node_type}_nodes.csv").decode(), + ) diff --git a/tests/transformers/test_catalogue_concepts_transformer.py b/tests/transformers/test_catalogue_concepts_transformer.py index 52b62f48da..c219b5d20d 100644 --- a/tests/transformers/test_catalogue_concepts_transformer.py +++ b/tests/transformers/test_catalogue_concepts_transformer.py @@ -1,10 +1,12 @@ from test_mocks import MockRequest -from test_utils import load_fixture +from test_utils import load_fixture, add_mock_transformer_outputs from transformers.catalogue.concepts_transformer import CatalogueConceptsTransformer -def test_mesh_concepts_transformer() -> None: +def test_catalogue_concepts_transformer() -> None: + add_mock_transformer_outputs(sources=["loc", "mesh"], node_types=["concepts", "locations"]) + test_url = "https://example.com" MockRequest.mock_responses( diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index 9d69b3d068..9f6af581ac 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -2,11 +2,8 @@ import math import pytest -from test_utils import load_fixture -from test_wikidata_concepts_source import ( - _add_mock_loc_transformer_outputs, - _add_mock_wikidata_requests, -) +from test_utils import load_fixture, add_mock_transformer_outputs +from test_wikidata_concepts_source import _add_mock_wikidata_requests from models.graph_edge import SourceConceptSameAs from models.graph_node import SourceConcept @@ -15,7 +12,7 @@ def test_wikidata_concepts_nodes_transformer() -> None: - _add_mock_loc_transformer_outputs() + add_mock_transformer_outputs(sources=["loc"], node_types=["concepts", "locations", "names"]) _add_mock_wikidata_requests("nodes") transformer = WikidataConceptsTransformer( @@ -37,7 +34,7 @@ def test_wikidata_concepts_nodes_transformer() -> None: def test_wikidata_concepts_edges_transformer() -> None: - _add_mock_loc_transformer_outputs() + add_mock_transformer_outputs(sources=["loc"], node_types=["concepts", "locations", "names"]) _add_mock_wikidata_requests("edges") transformer = WikidataConceptsTransformer( From f534ad676287af8b093aae682d9b1be94f505d65 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 12 Feb 2025 09:47:33 +0000 Subject: [PATCH 283/310] Fix formatting --- tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index b9d97d6751..2f63d6081b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -10,7 +10,7 @@ def load_fixture(file_name: str) -> bytes: return f.read() -def add_mock_transformer_outputs(sources: Literal["loc", "mesh"], node_types: Literal["concepts", "locations", "names"]) -> None: +def add_mock_transformer_outputs(sources: list[Literal["loc", "mesh"]], node_types: list[Literal["concepts", "locations", "names"]]) -> None: """ Add mock transformer output files to S3 so that the IdLabelChecker class can extract ids and labels from them. 
""" From 83c20efdc8c106b2c89449b7ef0aea5a8481896b Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 09:49:58 +0000 Subject: [PATCH 284/310] sloppy indentation --- .github/workflows/ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 085e09cfdf..6701af2441 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,7 +32,7 @@ jobs: role-to-assume: ${{ secrets.CATALOGUE_GRAPH_CI_ROLE_ARN }} - name: Build and push artefacts run: | - ./scripts/build.sh --push + ./scripts/build.sh --push deploy: runs-on: ubuntu-latest @@ -45,10 +45,10 @@ jobs: role-to-assume: ${{ secrets.CATALOGUE_GRAPH_CI_ROLE_ARN }} - name: Deploy bulk-loader lambda run: | - ./scripts/deploy_lambda_zip.sh catalogue-graph-bulk-loader + ./scripts/deploy_lambda_zip.sh catalogue-graph-bulk-loader - name: Deploy bulk-load-poller lambda run: | - ./scripts/deploy_lambda_zip.sh catalogue-graph-bulk-load-poller + ./scripts/deploy_lambda_zip.sh catalogue-graph-bulk-load-poller - name: Deploy indexer lambda run: | - ./scripts/deploy_lambda_zip.sh catalogue-graph-indexer \ No newline at end of file + ./scripts/deploy_lambda_zip.sh catalogue-graph-indexer \ No newline at end of file From a8b2738f731b178a1e519a5788fc8990a98c8437 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 12 Feb 2025 10:00:03 +0000 Subject: [PATCH 285/310] Fix formatting --- src/transformers/catalogue/concepts_transformer.py | 5 ++++- src/utils/aws.py | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/catalogue/concepts_transformer.py b/src/transformers/catalogue/concepts_transformer.py index e644150a23..fd96ee361d 100644 --- a/src/transformers/catalogue/concepts_transformer.py +++ b/src/transformers/catalogue/concepts_transformer.py @@ -49,7 +49,10 @@ def extract_edges(self, raw_node: dict) -> Generator[ConceptHasSourceConcept]: attributes={"qualifier": None, "matched_by": "label"}, ) - if raw_concept.has_valid_source_concept: + if raw_concept.has_valid_source_concept and ((raw_concept.source != "nlm-mesh") or ( + self.id_label_checker.get(raw_concept.source_concept_id) + == raw_concept.label.lower() + )): # Generate edges via ID if (raw_concept.source != "nlm-mesh") or ( self.id_label_checker.get(raw_concept.source_concept_id) diff --git a/src/utils/aws.py b/src/utils/aws.py index 90c65c66ee..63da11c5cb 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -75,5 +75,4 @@ def fetch_from_s3(node_type: NodeType, source: OntologyType) -> Generator[Any]: with smart_open.open(s3_url, "r", transport_params=transport_params) as f: csv_reader = csv.DictReader(f) - for row in csv_reader: - yield row + yield from csv_reader From 8aa51f5bd0aa05c5bc8e6299bf174867331c7b1b Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 12 Feb 2025 10:03:12 +0000 Subject: [PATCH 286/310] Remove obsolete concept check --- .../catalogue/concepts_transformer.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/transformers/catalogue/concepts_transformer.py b/src/transformers/catalogue/concepts_transformer.py index fd96ee361d..ab8cf31c49 100644 --- a/src/transformers/catalogue/concepts_transformer.py +++ b/src/transformers/catalogue/concepts_transformer.py @@ -54,15 +54,11 @@ def extract_edges(self, raw_node: dict) -> Generator[ConceptHasSourceConcept]: == raw_concept.label.lower() )): # Generate edges via ID - if (raw_concept.source != "nlm-mesh") or ( - 
self.id_label_checker.get(raw_concept.source_concept_id) - == raw_concept.label.lower() - ): - yield ConceptHasSourceConcept( - from_id=raw_concept.wellcome_id, - to_id=str(raw_concept.source_concept_id), - attributes={ - "qualifier": raw_concept.mesh_qualifier, - "matched_by": "identifier", - }, - ) + yield ConceptHasSourceConcept( + from_id=raw_concept.wellcome_id, + to_id=str(raw_concept.source_concept_id), + attributes={ + "qualifier": raw_concept.mesh_qualifier, + "matched_by": "identifier", + }, + ) From 6e53c16fa12713e7f342ff84915f07ba13aabcaa Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Wed, 12 Feb 2025 10:04:17 +0000 Subject: [PATCH 287/310] Apply auto-formatting rules --- .../catalogue/concepts_transformer.py | 7 +++++-- src/utils/aws.py | 2 +- tests/sources/test_wikidata_concepts_source.py | 16 +++++++++++----- tests/test_extractor.py | 6 ++++-- tests/test_utils.py | 5 ++++- .../test_catalogue_concepts_transformer.py | 6 ++++-- .../test_wikidata_concepts_transformer.py | 10 +++++++--- 7 files changed, 36 insertions(+), 16 deletions(-) diff --git a/src/transformers/catalogue/concepts_transformer.py b/src/transformers/catalogue/concepts_transformer.py index ab8cf31c49..2568273544 100644 --- a/src/transformers/catalogue/concepts_transformer.py +++ b/src/transformers/catalogue/concepts_transformer.py @@ -49,10 +49,13 @@ def extract_edges(self, raw_node: dict) -> Generator[ConceptHasSourceConcept]: attributes={"qualifier": None, "matched_by": "label"}, ) - if raw_concept.has_valid_source_concept and ((raw_concept.source != "nlm-mesh") or ( + if raw_concept.has_valid_source_concept and ( + (raw_concept.source != "nlm-mesh") + or ( self.id_label_checker.get(raw_concept.source_concept_id) == raw_concept.label.lower() - )): + ) + ): # Generate edges via ID yield ConceptHasSourceConcept( from_id=raw_concept.wellcome_id, diff --git a/src/utils/aws.py b/src/utils/aws.py index 63da11c5cb..8f6f16df5f 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -6,10 +6,10 @@ import boto3 import smart_open +import config from clients.base_neptune_client import BaseNeptuneClient from clients.lambda_neptune_client import LambdaNeptuneClient from clients.local_neptune_client import LocalNeptuneClient -import config LOAD_BALANCER_SECRET_NAME = "catalogue-graph/neptune-nlb-url" INSTANCE_ENDPOINT_SECRET_NAME = "catalogue-graph/neptune-cluster-endpoint" diff --git a/tests/sources/test_wikidata_concepts_source.py b/tests/sources/test_wikidata_concepts_source.py index a24d12b74d..40c0195e07 100644 --- a/tests/sources/test_wikidata_concepts_source.py +++ b/tests/sources/test_wikidata_concepts_source.py @@ -1,8 +1,8 @@ import json from typing import Literal -from test_mocks import MockRequest, MockSmartOpen -from test_utils import load_fixture, add_mock_transformer_outputs +from test_mocks import MockRequest +from test_utils import add_mock_transformer_outputs, load_fixture from config import WIKIDATA_SPARQL_URL from sources.wikidata.linked_ontology_id_type_checker import LinkedOntologyIdTypeChecker @@ -29,7 +29,9 @@ def _add_mock_wikidata_requests(node_type: Literal["edges", "nodes"]) -> None: def test_wikidata_concepts_source_edges() -> None: - add_mock_transformer_outputs(sources=["loc"], node_types=["concepts", "locations", "names"]) + add_mock_transformer_outputs( + sources=["loc"], node_types=["concepts", "locations", "names"] + ) _add_mock_wikidata_requests("edges") mesh_concepts_source = WikidataLinkedOntologySource( @@ -60,7 +62,9 @@ def 
test_wikidata_concepts_source_edges() -> None: def test_wikidata_concepts_source_nodes() -> None: - add_mock_transformer_outputs(sources=["loc"], node_types=["concepts", "locations", "names"]) + add_mock_transformer_outputs( + sources=["loc"], node_types=["concepts", "locations", "names"] + ) _add_mock_wikidata_requests("nodes") mesh_concepts_source = WikidataLinkedOntologySource( @@ -78,7 +82,9 @@ def test_wikidata_concepts_source_nodes() -> None: def test_wikidata_linked_ontology_id_checker() -> None: - add_mock_transformer_outputs(sources=["loc"], node_types=["concepts", "locations", "names"]) + add_mock_transformer_outputs( + sources=["loc"], node_types=["concepts", "locations", "names"] + ) id_checker = LinkedOntologyIdTypeChecker("locations", "loc") assert id_checker.id_is_valid("sh00000001") diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 6901572a28..f5cf56e0a9 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -3,7 +3,7 @@ import pytest from test_mocks import MOCK_INSTANCE_ENDPOINT, MockRequest, MockResponseInput -from test_utils import load_fixture, add_mock_transformer_outputs +from test_utils import add_mock_transformer_outputs, load_fixture from typing_extensions import get_args from config import ( @@ -138,7 +138,9 @@ def test_lambda_handler( mock_responses: list[MockResponseInput], ) -> None: MockRequest.mock_responses(mock_responses) - add_mock_transformer_outputs(sources=["loc", "mesh"], node_types=["concepts", "locations"]) + add_mock_transformer_outputs( + sources=["loc", "mesh"], node_types=["concepts", "locations"] + ) lambda_handler(lambda_event, None) transformer_type = lambda_event["transformer_type"] diff --git a/tests/test_utils.py b/tests/test_utils.py index 2f63d6081b..a8eed441eb 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -10,7 +10,10 @@ def load_fixture(file_name: str) -> bytes: return f.read() -def add_mock_transformer_outputs(sources: list[Literal["loc", "mesh"]], node_types: list[Literal["concepts", "locations", "names"]]) -> None: +def add_mock_transformer_outputs( + sources: list[Literal["loc", "mesh"]], + node_types: list[Literal["concepts", "locations", "names"]], +) -> None: """ Add mock transformer output files to S3 so that the IdLabelChecker class can extract ids and labels from them. 
""" diff --git a/tests/transformers/test_catalogue_concepts_transformer.py b/tests/transformers/test_catalogue_concepts_transformer.py index c219b5d20d..c53ad53227 100644 --- a/tests/transformers/test_catalogue_concepts_transformer.py +++ b/tests/transformers/test_catalogue_concepts_transformer.py @@ -1,11 +1,13 @@ from test_mocks import MockRequest -from test_utils import load_fixture, add_mock_transformer_outputs +from test_utils import add_mock_transformer_outputs, load_fixture from transformers.catalogue.concepts_transformer import CatalogueConceptsTransformer def test_catalogue_concepts_transformer() -> None: - add_mock_transformer_outputs(sources=["loc", "mesh"], node_types=["concepts", "locations"]) + add_mock_transformer_outputs( + sources=["loc", "mesh"], node_types=["concepts", "locations"] + ) test_url = "https://example.com" diff --git a/tests/transformers/test_wikidata_concepts_transformer.py b/tests/transformers/test_wikidata_concepts_transformer.py index 9f6af581ac..66ed85f97c 100644 --- a/tests/transformers/test_wikidata_concepts_transformer.py +++ b/tests/transformers/test_wikidata_concepts_transformer.py @@ -2,7 +2,7 @@ import math import pytest -from test_utils import load_fixture, add_mock_transformer_outputs +from test_utils import add_mock_transformer_outputs, load_fixture from test_wikidata_concepts_source import _add_mock_wikidata_requests from models.graph_edge import SourceConceptSameAs @@ -12,7 +12,9 @@ def test_wikidata_concepts_nodes_transformer() -> None: - add_mock_transformer_outputs(sources=["loc"], node_types=["concepts", "locations", "names"]) + add_mock_transformer_outputs( + sources=["loc"], node_types=["concepts", "locations", "names"] + ) _add_mock_wikidata_requests("nodes") transformer = WikidataConceptsTransformer( @@ -34,7 +36,9 @@ def test_wikidata_concepts_nodes_transformer() -> None: def test_wikidata_concepts_edges_transformer() -> None: - add_mock_transformer_outputs(sources=["loc"], node_types=["concepts", "locations", "names"]) + add_mock_transformer_outputs( + sources=["loc"], node_types=["concepts", "locations", "names"] + ) _add_mock_wikidata_requests("edges") transformer = WikidataConceptsTransformer( From 002b67914aa3b2769f721b83389f11761c704222 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 10:08:18 +0000 Subject: [PATCH 288/310] fix condition --- .github/workflows/ci.yml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6701af2441..541d9659b3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,9 +1,6 @@ name: "Run CI pipeline" -on: - push: - branches: - - main +on: push jobs: test: @@ -23,7 +20,7 @@ jobs: build: runs-on: ubuntu-latest needs: test - if: needs.test.result == "success" + if: ${{ needs.test.result == 'success' }} steps: - uses: actions/checkout@v3 - uses: aws-actions/configure-aws-credentials@v4 @@ -37,7 +34,7 @@ jobs: deploy: runs-on: ubuntu-latest needs: build - if: needs.build.result == "success" + if: ${{ needs.build.result == 'success' }} steps: - uses: aws-actions/configure-aws-credentials@v4 with: From a84e7eb8dd80afb2b70253d4d26e7093029a860f Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 10:15:19 +0000 Subject: [PATCH 289/310] add token write permission --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 541d9659b3..81729913fb 100644 --- a/.github/workflows/ci.yml +++ 
b/.github/workflows/ci.yml @@ -1,5 +1,8 @@ name: "Run CI pipeline" +permissions: + id-token: write + on: push jobs: From 386a29053f7428420a0f773a8946f1453de0d42c Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 10:21:07 +0000 Subject: [PATCH 290/310] where is the repo --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 81729913fb..eb596902a8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,9 +1,9 @@ name: "Run CI pipeline" +on: push + permissions: id-token: write - -on: push jobs: test: From e9d3a3d4fe4e1ed526633d27829d166fb9fc826d Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 10:46:03 +0000 Subject: [PATCH 291/310] more permission --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index eb596902a8..f08ee048d6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,6 +3,7 @@ name: "Run CI pipeline" on: push permissions: + contents: read id-token: write jobs: From 8fc093c3a6fc025d54b119600d521795c98b9ec8 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 10:51:26 +0000 Subject: [PATCH 292/310] log in to ECR --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f08ee048d6..55a7aec1e0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,6 +31,8 @@ jobs: with: aws-region: eu-west-1 role-to-assume: ${{ secrets.CATALOGUE_GRAPH_CI_ROLE_ARN }} + - name: Log in to private ECR + uses: aws-actions/amazon-ecr-login@v2 - name: Build and push artefacts run: | ./scripts/build.sh --push From e816265f068301aed86e31416916b19a87fd6f5b Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 11:00:28 +0000 Subject: [PATCH 293/310] ecr token permission + checkout before deploy --- .github/workflows/ci.yml | 1 + terraform/github/gha_role.tf | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 55a7aec1e0..305c396171 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -42,6 +42,7 @@ jobs: needs: build if: ${{ needs.build.result == 'success' }} steps: + - uses: actions/checkout@v3 - uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-west-1 diff --git a/terraform/github/gha_role.tf b/terraform/github/gha_role.tf index da61386f6e..ba018ab2e0 100644 --- a/terraform/github/gha_role.tf +++ b/terraform/github/gha_role.tf @@ -32,6 +32,15 @@ data "aws_iam_policy_document" "gha_catalogue_graph_ci" { "arn:aws:ecr:eu-west-1:760097843905:repository/uk.ac.wellcome/catalogue_graph_extractor" ] } + + statement { + actions = [ + "ecr:GetAuthorizationToken" + ] + resources = [ + "*" + ] + } statement { actions = [ "lambda:GetFunctionConfiguration", From 25f5e21d30abd2a16ec6e6166a5096fa98837ea6 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 11:28:12 +0000 Subject: [PATCH 294/310] permission denied --- scripts/deploy_lambda_zip.sh | 3 --- terraform/github/gha_role.tf | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/deploy_lambda_zip.sh b/scripts/deploy_lambda_zip.sh index 49e8edaec2..b275e20889 100644 --- a/scripts/deploy_lambda_zip.sh +++ b/scripts/deploy_lambda_zip.sh @@ -1,8 +1,5 @@ #!/usr/bin/env bash -# Usage: ./deploy_lambda_zip.sh -# Example: ./deploy_lambda_zip.sh 
monitoring/ingest_inspector_backend ingest_inspector_backend - set -o errexit set -o nounset set -o pipefail diff --git a/terraform/github/gha_role.tf b/terraform/github/gha_role.tf index ba018ab2e0..1daac85301 100644 --- a/terraform/github/gha_role.tf +++ b/terraform/github/gha_role.tf @@ -13,6 +13,7 @@ data "aws_iam_policy_document" "gha_catalogue_graph_ci" { "s3:PutObject" ] resources = [ + "arn:aws:s3:::wellcomecollection-platform-infra/lambdas/catalogue_graph", "arn:aws:s3:::wellcomecollection-platform-infra/lambdas/catalogue_graph/*" ] } From d60f914ee5585854f358657c283bebbaea5f0381 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 11:38:20 +0000 Subject: [PATCH 295/310] make script executable --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 305c396171..a040b2b1bd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,6 +47,9 @@ jobs: with: aws-region: eu-west-1 role-to-assume: ${{ secrets.CATALOGUE_GRAPH_CI_ROLE_ARN }} + - name: Make script executable + run: | + chmod +x ./scripts/deploy_lambda_zip.sh - name: Deploy bulk-loader lambda run: | ./scripts/deploy_lambda_zip.sh catalogue-graph-bulk-loader From 6d2ca53b21e8f49f8db030cebc4d623e0a8ebeb6 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 11:44:41 +0000 Subject: [PATCH 296/310] even more permissions --- .github/workflows/ci.yml | 3 --- terraform/github/gha_role.tf | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a040b2b1bd..305c396171 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,9 +47,6 @@ jobs: with: aws-region: eu-west-1 role-to-assume: ${{ secrets.CATALOGUE_GRAPH_CI_ROLE_ARN }} - - name: Make script executable - run: | - chmod +x ./scripts/deploy_lambda_zip.sh - name: Deploy bulk-loader lambda run: | ./scripts/deploy_lambda_zip.sh catalogue-graph-bulk-loader diff --git a/terraform/github/gha_role.tf b/terraform/github/gha_role.tf index 1daac85301..2ffd1f82e8 100644 --- a/terraform/github/gha_role.tf +++ b/terraform/github/gha_role.tf @@ -10,7 +10,8 @@ module "gha_catalogue_graph_ci_role" { data "aws_iam_policy_document" "gha_catalogue_graph_ci" { statement { actions = [ - "s3:PutObject" + "s3:PutObject", + "s3:GetObject" ] resources = [ "arn:aws:s3:::wellcomecollection-platform-infra/lambdas/catalogue_graph", From 1d345532e3d54889addc0eed78ee3555e02ec46f Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 11:53:05 +0000 Subject: [PATCH 297/310] exec --- scripts/deploy_lambda_zip.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/deploy_lambda_zip.sh diff --git a/scripts/deploy_lambda_zip.sh b/scripts/deploy_lambda_zip.sh old mode 100644 new mode 100755 From 25da7c58c66bfbe8307e6dd423a15a607c5fdf2b Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 11:53:20 +0000 Subject: [PATCH 298/310] exec --- scripts/deploy_lambda_zip.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 scripts/deploy_lambda_zip.sh diff --git a/scripts/deploy_lambda_zip.sh b/scripts/deploy_lambda_zip.sh old mode 100755 new mode 100644 From 41c43b4f0ef64a01cd0b0d44be8dac1eee476d29 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 12:03:48 +0000 Subject: [PATCH 299/310] exe again --- scripts/deploy_lambda_zip.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 
100755 scripts/deploy_lambda_zip.sh diff --git a/scripts/deploy_lambda_zip.sh b/scripts/deploy_lambda_zip.sh old mode 100644 new mode 100755 From 7d248a7f19ed5c67cc5c8b89dd9add1b183c68ff Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 12:09:14 +0000 Subject: [PATCH 300/310] fail test to check build doesnt run --- scripts/deploy_lambda_zip.sh | 0 tests/test_extractor.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) mode change 100755 => 100644 scripts/deploy_lambda_zip.sh diff --git a/scripts/deploy_lambda_zip.sh b/scripts/deploy_lambda_zip.sh old mode 100755 new mode 100644 diff --git a/tests/test_extractor.py b/tests/test_extractor.py index f462ad34ac..62e03ea48f 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -41,7 +41,7 @@ def mock_requests_lookup_table( if transformer_type in ["mesh_concepts", "mesh_locations"]: mocked_responses.append( { - "method": "GET", + "method": "POTATO", "url": MESH_URL, "content_bytes": load_fixture("mesh/raw_descriptors.xml"), } From 24babea81131f51d8d6020bc1d94907412dc2d8c Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 12 Feb 2025 12:09:40 +0000 Subject: [PATCH 301/310] Move concept validity checks --- .../catalogue/concepts_transformer.py | 17 ++++------------- src/transformers/catalogue/raw_concept.py | 14 ++++++++++++-- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/transformers/catalogue/concepts_transformer.py b/src/transformers/catalogue/concepts_transformer.py index 2568273544..73f7243617 100644 --- a/src/transformers/catalogue/concepts_transformer.py +++ b/src/transformers/catalogue/concepts_transformer.py @@ -17,7 +17,7 @@ def __init__(self, url: str): ) def transform_node(self, raw_node: dict) -> Concept | None: - raw_concept = RawCatalogueConcept(raw_node) + raw_concept = RawCatalogueConcept(raw_node, self.id_label_checker) if not raw_concept.is_concept: return None @@ -30,7 +30,7 @@ def transform_node(self, raw_node: dict) -> Concept | None: ) def extract_edges(self, raw_node: dict) -> Generator[ConceptHasSourceConcept]: - raw_concept = RawCatalogueConcept(raw_node) + raw_concept = RawCatalogueConcept(raw_node, self.id_label_checker) if not raw_concept.is_concept: return @@ -39,23 +39,14 @@ def extract_edges(self, raw_node: dict) -> Generator[ConceptHasSourceConcept]: raw_concept.type not in ["Person", "Organisation", "Agent"] ): # Generate edges via label - assert hasattr(self.id_label_checker, "inverse") - for source_concept_id in self.id_label_checker.inverse.get( - raw_concept.label.lower(), [] - ): + for source_concept_id in raw_concept.label_derived_source_concept_ids: yield ConceptHasSourceConcept( from_id=raw_concept.wellcome_id, to_id=source_concept_id, attributes={"qualifier": None, "matched_by": "label"}, ) - if raw_concept.has_valid_source_concept and ( - (raw_concept.source != "nlm-mesh") - or ( - self.id_label_checker.get(raw_concept.source_concept_id) - == raw_concept.label.lower() - ) - ): + if raw_concept.has_valid_source_concept: # Generate edges via ID yield ConceptHasSourceConcept( from_id=raw_concept.wellcome_id, diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index c3300dab17..b219a2da0e 100644 --- a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -1,12 +1,15 @@ import re from typing import get_args +from .id_label_checker import IdLabelChecker + from models.graph_node import ConceptSource, ConceptType class RawCatalogueConcept: - def 
__init__(self, raw_concept: dict): + def __init__(self, raw_concept: dict, id_label_checker: IdLabelChecker): self.raw_concept = self._extract_concept_node(raw_concept) + self.id_label_checker = id_label_checker @staticmethod def _extract_concept_node(raw_concept: dict) -> dict: @@ -92,12 +95,19 @@ def source_concept_id(self) -> str | None: return source_id return None + + @property + def label_derived_source_concept_ids(self) -> list[str]: + label_derived_ids = self.id_label_checker.inverse.get(self.label.lower(), []) + assert isinstance(label_derived_ids, list) + return label_derived_ids @property def has_valid_source_concept(self) -> bool: """Checks if the source concept ID format matches the specified source.""" if isinstance(self.source_concept_id, str): - if (self.source == "nlm-mesh") and self.source_concept_id.startswith("D"): + if (self.source == "nlm-mesh") and self.source_concept_id.startswith("D") and (self.id_label_checker.get(self.source_concept_id) + == self.label.lower()): return True if (self.source == "lc-subjects") and self.source_concept_id.startswith( From 9874fada91a473fb1b4f1652ca194f2313efbc3d Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 12:10:31 +0000 Subject: [PATCH 302/310] make test alright again --- tests/test_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 62e03ea48f..f462ad34ac 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -41,7 +41,7 @@ def mock_requests_lookup_table( if transformer_type in ["mesh_concepts", "mesh_locations"]: mocked_responses.append( { - "method": "POTATO", + "method": "GET", "url": MESH_URL, "content_bytes": load_fixture("mesh/raw_descriptors.xml"), } From f03601133419976b878f4558c887800c99db61f2 Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 12 Feb 2025 12:12:31 +0000 Subject: [PATCH 303/310] Add wikidata transformer outputs to sources --- tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index a8eed441eb..5a5fb303b2 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -11,7 +11,7 @@ def load_fixture(file_name: str) -> bytes: def add_mock_transformer_outputs( - sources: list[Literal["loc", "mesh"]], + sources: list[Literal["loc", "mesh", "wikidata_linked_loc", "wikidata_linked_mesh"]], node_types: list[Literal["concepts", "locations", "names"]], ) -> None: """ From 0d73bf73e60e0242f443fd5ae98317f7a098f4c9 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 12:13:42 +0000 Subject: [PATCH 304/310] exe again --- scripts/deploy_lambda_zip.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/deploy_lambda_zip.sh diff --git a/scripts/deploy_lambda_zip.sh b/scripts/deploy_lambda_zip.sh old mode 100644 new mode 100755 From ec88c7c346ca915570012ae058ceaf663a9af6a0 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Wed, 12 Feb 2025 12:13:44 +0000 Subject: [PATCH 305/310] Apply auto-formatting rules --- src/transformers/catalogue/raw_concept.py | 16 +++++++++++----- src/utils/aws.py | 4 +++- src/utils/ontology_id_checker.py | 4 +++- tests/sources/test_wikidata_names_source.py | 5 +++-- tests/test_utils.py | 4 +++- 5 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/transformers/catalogue/raw_concept.py b/src/transformers/catalogue/raw_concept.py index b219a2da0e..26f560c440 100644 --- 
a/src/transformers/catalogue/raw_concept.py +++ b/src/transformers/catalogue/raw_concept.py @@ -1,10 +1,10 @@ import re from typing import get_args -from .id_label_checker import IdLabelChecker - from models.graph_node import ConceptSource, ConceptType +from .id_label_checker import IdLabelChecker + class RawCatalogueConcept: def __init__(self, raw_concept: dict, id_label_checker: IdLabelChecker): @@ -95,7 +95,7 @@ def source_concept_id(self) -> str | None: return source_id return None - + @property def label_derived_source_concept_ids(self) -> list[str]: label_derived_ids = self.id_label_checker.inverse.get(self.label.lower(), []) @@ -106,8 +106,14 @@ def label_derived_source_concept_ids(self) -> list[str]: def has_valid_source_concept(self) -> bool: """Checks if the source concept ID format matches the specified source.""" if isinstance(self.source_concept_id, str): - if (self.source == "nlm-mesh") and self.source_concept_id.startswith("D") and (self.id_label_checker.get(self.source_concept_id) - == self.label.lower()): + if ( + (self.source == "nlm-mesh") + and self.source_concept_id.startswith("D") + and ( + self.id_label_checker.get(self.source_concept_id) + == self.label.lower() + ) + ): return True if (self.source == "lc-subjects") and self.source_concept_id.startswith( diff --git a/src/utils/aws.py b/src/utils/aws.py index f1a96752de..9985b20638 100644 --- a/src/utils/aws.py +++ b/src/utils/aws.py @@ -60,7 +60,9 @@ def get_neptune_client(is_local: bool) -> BaseNeptuneClient: return LambdaNeptuneClient(get_secret(INSTANCE_ENDPOINT_SECRET_NAME)) -def fetch_transformer_output_from_s3(node_type: NodeType, source: OntologyType) -> Generator[Any]: +def fetch_transformer_output_from_s3( + node_type: NodeType, source: OntologyType +) -> Generator[Any]: """Retrieves the bulk load file outputted by the relevant transformer so that we can extract data from it.""" linked_nodes_file_name = f"{source}_{node_type}__nodes.csv" s3_url = f"s3://{config.S3_BULK_LOAD_BUCKET_NAME}/{linked_nodes_file_name}" diff --git a/src/utils/ontology_id_checker.py b/src/utils/ontology_id_checker.py index f86ca5db76..6ca760afa4 100644 --- a/src/utils/ontology_id_checker.py +++ b/src/utils/ontology_id_checker.py @@ -17,7 +17,9 @@ def _get_ids_for_ontology_and_node_type( end=" ", flush=True, ) - ids = {row[":ID"] for row in fetch_transformer_output_from_s3(node_type, ontology_type)} + ids = { + row[":ID"] for row in fetch_transformer_output_from_s3(node_type, ontology_type) + } print(f"({len(ids)} ids retrieved.)") return ids diff --git a/tests/sources/test_wikidata_names_source.py b/tests/sources/test_wikidata_names_source.py index b314721366..5156ab30e0 100644 --- a/tests/sources/test_wikidata_names_source.py +++ b/tests/sources/test_wikidata_names_source.py @@ -1,12 +1,13 @@ -from test_wikidata_concepts_source import _add_mock_wikidata_requests from test_utils import add_mock_transformer_outputs +from test_wikidata_concepts_source import _add_mock_wikidata_requests from sources.wikidata.linked_ontology_source import WikidataLinkedOntologySource def test_wikidata_names_source_edges() -> None: add_mock_transformer_outputs( - sources=["loc", "wikidata_linked_loc"], node_types=["names", "concepts", "locations"] + sources=["loc", "wikidata_linked_loc"], + node_types=["names", "concepts", "locations"], ) _add_mock_wikidata_requests("edges", "names") diff --git a/tests/test_utils.py b/tests/test_utils.py index 5a5fb303b2..982517df76 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -11,7 +11,9 @@ def 
load_fixture(file_name: str) -> bytes: def add_mock_transformer_outputs( - sources: list[Literal["loc", "mesh", "wikidata_linked_loc", "wikidata_linked_mesh"]], + sources: list[ + Literal["loc", "mesh", "wikidata_linked_loc", "wikidata_linked_mesh"] + ], node_types: list[Literal["concepts", "locations", "names"]], ) -> None: """ From da3fd3c1ffbed1e809f2f49510e69814404cd161 Mon Sep 17 00:00:00 2001 From: Agnes Garoux Date: Wed, 12 Feb 2025 12:17:06 +0000 Subject: [PATCH 306/310] only run on push to main --- .github/workflows/ci.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 305c396171..7e69b665a7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,6 +1,9 @@ name: "Run CI pipeline" -on: push +on: + push: + branches: + - main permissions: contents: read From a263ab573107b826a3e14b60b88aa6563c8e3610 Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Wed, 12 Feb 2025 14:17:02 +0000 Subject: [PATCH 307/310] Apply auto-formatting rules --- catalogue_graph/src/extractor.py | 12 ++++++------ .../src/transformers/wikidata/raw_concept.py | 6 +++--- catalogue_graph/terraform/github/gha_role.tf | 14 +++++++------- catalogue_graph/terraform/github/gha_secret.tf | 6 +++--- catalogue_graph/terraform/iam_state_machines.tf | 8 ++++---- .../terraform/state_machine_extractors.tf | 10 +++++----- .../state_machine_single_extractor_loader.tf | 8 ++++---- catalogue_graph/terraform/terraform.tf | 6 +++--- 8 files changed, 35 insertions(+), 35 deletions(-) diff --git a/catalogue_graph/src/extractor.py b/catalogue_graph/src/extractor.py index 2be14bde3c..3a7c0437d3 100755 --- a/catalogue_graph/src/extractor.py +++ b/catalogue_graph/src/extractor.py @@ -28,9 +28,9 @@ def handler( f"transformer and streaming them into {stream_destination}." ) - assert config.S3_BULK_LOAD_BUCKET_NAME is not None, ( - "The S3_BULK_LOAD_BUCKET_NAME environment variable must be defined." - ) + assert ( + config.S3_BULK_LOAD_BUCKET_NAME is not None + ), "The S3_BULK_LOAD_BUCKET_NAME environment variable must be defined." transformer: BaseTransformer = create_transformer(transformer_type, entity_type) @@ -43,9 +43,9 @@ def handler( transformer.stream_to_s3(s3_uri, entity_type, sample_size) elif stream_destination == "sns": topic_arn = config.GRAPH_QUERIES_SNS_TOPIC_ARN - assert topic_arn is not None, ( - "To stream to SNS, the GRAPH_QUERIES_SNS_TOPIC_ARN environment variable must be defined." - ) + assert ( + topic_arn is not None + ), "To stream to SNS, the GRAPH_QUERIES_SNS_TOPIC_ARN environment variable must be defined." transformer.stream_to_sns(topic_arn, entity_type, sample_size) elif stream_destination == "local": diff --git a/catalogue_graph/src/transformers/wikidata/raw_concept.py b/catalogue_graph/src/transformers/wikidata/raw_concept.py index 636a6f1335..607abf0006 100644 --- a/catalogue_graph/src/transformers/wikidata/raw_concept.py +++ b/catalogue_graph/src/transformers/wikidata/raw_concept.py @@ -78,9 +78,9 @@ def _extract_coordinates(self) -> Coordinates: pattern = r"Point\((.*)\s(.*)\)" matched_coordinates = re.search(pattern, raw_coordinates) - assert matched_coordinates is not None, ( - f"Could not extract coordinates from raw value '{raw_coordinates}'. Wikidata id: {self.source_id}" - ) + assert ( + matched_coordinates is not None + ), f"Could not extract coordinates from raw value '{raw_coordinates}'. 
Wikidata id: {self.source_id}" longitude = float(matched_coordinates.group(1)) latitude = float(matched_coordinates.group(2)) diff --git a/catalogue_graph/terraform/github/gha_role.tf b/catalogue_graph/terraform/github/gha_role.tf index 2ffd1f82e8..776c380cb7 100644 --- a/catalogue_graph/terraform/github/gha_role.tf +++ b/catalogue_graph/terraform/github/gha_role.tf @@ -1,15 +1,15 @@ module "gha_catalogue_graph_ci_role" { source = "github.com/wellcomecollection/terraform-aws-gha-role?ref=v1.0.0" - policy_document = data.aws_iam_policy_document.gha_catalogue_graph_ci.json - github_repository = "wellcomecollection/catalogue-graph" - role_name = "catalogue-graph-ci" - github_oidc_provider_arn = data.terraform_remote_state.aws_account_infrastructure.outputs.github_openid_connect_provider_arn + policy_document = data.aws_iam_policy_document.gha_catalogue_graph_ci.json + github_repository = "wellcomecollection/catalogue-graph" + role_name = "catalogue-graph-ci" + github_oidc_provider_arn = data.terraform_remote_state.aws_account_infrastructure.outputs.github_openid_connect_provider_arn } data "aws_iam_policy_document" "gha_catalogue_graph_ci" { statement { - actions = [ + actions = [ "s3:PutObject", "s3:GetObject" ] @@ -19,7 +19,7 @@ data "aws_iam_policy_document" "gha_catalogue_graph_ci" { ] } statement { - actions = [ + actions = [ "ecr:BatchCheckLayerAvailability", "ecr:Describe*", "ecr:Get*", @@ -44,7 +44,7 @@ data "aws_iam_policy_document" "gha_catalogue_graph_ci" { ] } statement { - actions = [ + actions = [ "lambda:GetFunctionConfiguration", "lambda:UpdateFunctionCode" ] diff --git a/catalogue_graph/terraform/github/gha_secret.tf b/catalogue_graph/terraform/github/gha_secret.tf index bb3d1294c0..52e812bab1 100644 --- a/catalogue_graph/terraform/github/gha_secret.tf +++ b/catalogue_graph/terraform/github/gha_secret.tf @@ -8,7 +8,7 @@ terraform { } resource "github_actions_secret" "catalogue_graph_ci" { - repository = "catalogue-graph" - secret_name = "CATALOGUE_GRAPH_CI_ROLE_ARN" - plaintext_value = module.gha_catalogue_graph_ci_role.role_arn + repository = "catalogue-graph" + secret_name = "CATALOGUE_GRAPH_CI_ROLE_ARN" + plaintext_value = module.gha_catalogue_graph_ci_role.role_arn } \ No newline at end of file diff --git a/catalogue_graph/terraform/iam_state_machines.tf b/catalogue_graph/terraform/iam_state_machines.tf index 167228aa6f..6d0bb1a780 100644 --- a/catalogue_graph/terraform/iam_state_machines.tf +++ b/catalogue_graph/terraform/iam_state_machines.tf @@ -26,8 +26,8 @@ resource "aws_iam_policy" "state_machine_policy" { Resource = "*" }, { - Effect = "Allow", - Action = ["states:StartExecution"], + Effect = "Allow", + Action = ["states:StartExecution"], Resource = [ aws_sfn_state_machine.catalogue_graph_extractor.arn, aws_sfn_state_machine.catalogue_graph_extractors.arn, @@ -36,8 +36,8 @@ resource "aws_iam_policy" "state_machine_policy" { ] }, { - Effect = "Allow", - Action = ["lambda:InvokeFunction"], + Effect = "Allow", + Action = ["lambda:InvokeFunction"], Resource = [ module.bulk_loader_lambda.lambda.arn, module.bulk_load_poller_lambda.lambda.arn diff --git a/catalogue_graph/terraform/state_machine_extractors.tf b/catalogue_graph/terraform/state_machine_extractors.tf index c035bf6f49..bcbe9c3c3b 100644 --- a/catalogue_graph/terraform/state_machine_extractors.tf +++ b/catalogue_graph/terraform/state_machine_extractors.tf @@ -14,15 +14,15 @@ resource "aws_sfn_state_machine" "catalogue_graph_extractors" { Parameters = { StateMachineArn = 
aws_sfn_state_machine.catalogue_graph_extractor.arn Input = { - "stream_destination": "s3", - "transformer_type.$": "$$.Execution.Input.transformer_type", - "entity_type.$": "$$.Execution.Input.entity_type", - "sample_size.$": "$$.Execution.Input.sample_size" + "stream_destination" : "s3", + "transformer_type.$" : "$$.Execution.Input.transformer_type", + "entity_type.$" : "$$.Execution.Input.entity_type", + "sample_size.$" : "$$.Execution.Input.sample_size" } } Next = index == length(var.state_machine_inputs) - 1 ? "Success" : "Extract ${var.state_machine_inputs[index + 1].label}" } - }), { + }), { Success = { Type = "Succeed" } diff --git a/catalogue_graph/terraform/state_machine_single_extractor_loader.tf b/catalogue_graph/terraform/state_machine_single_extractor_loader.tf index 26d6757e54..4f0c423226 100644 --- a/catalogue_graph/terraform/state_machine_single_extractor_loader.tf +++ b/catalogue_graph/terraform/state_machine_single_extractor_loader.tf @@ -13,10 +13,10 @@ resource "aws_sfn_state_machine" "catalogue_graph_single_extract_load" { Parameters = { StateMachineArn = aws_sfn_state_machine.catalogue_graph_extractor.arn Input = { - "stream_destination": "s3", - "transformer_type.$": "$$.Execution.Input.transformer_type", - "entity_type.$": "$$.Execution.Input.entity_type", - "sample_size.$": "$$.Execution.Input.sample_size" + "stream_destination" : "s3", + "transformer_type.$" : "$$.Execution.Input.transformer_type", + "entity_type.$" : "$$.Execution.Input.entity_type", + "sample_size.$" : "$$.Execution.Input.sample_size" } } } diff --git a/catalogue_graph/terraform/terraform.tf b/catalogue_graph/terraform/terraform.tf index 6f0dabc0cb..6a2c6c63b7 100644 --- a/catalogue_graph/terraform/terraform.tf +++ b/catalogue_graph/terraform/terraform.tf @@ -32,9 +32,9 @@ data "terraform_remote_state" "shared_infra" { assume_role = { role_arn = "arn:aws:iam::760097843905:role/platform-read_only" } - bucket = "wellcomecollection-platform-infra" - key = "terraform/platform-infrastructure/shared.tfstate" - region = "eu-west-1" + bucket = "wellcomecollection-platform-infra" + key = "terraform/platform-infrastructure/shared.tfstate" + region = "eu-west-1" } } From 49f21bb42d8bcba9b2e89b63c5947e89d2ca6bee Mon Sep 17 00:00:00 2001 From: Antonia Langfelder Date: Wed, 12 Feb 2025 15:25:18 +0000 Subject: [PATCH 308/310] Add deduplication --- src/transformers/catalogue/concepts_transformer.py | 11 +++++++++++ .../test_catalogue_concepts_transformer.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/transformers/catalogue/concepts_transformer.py b/src/transformers/catalogue/concepts_transformer.py index 73f7243617..bad25401ce 100644 --- a/src/transformers/catalogue/concepts_transformer.py +++ b/src/transformers/catalogue/concepts_transformer.py @@ -15,12 +15,18 @@ def __init__(self, url: str): self.id_label_checker = IdLabelChecker.from_source( node_type=["concepts", "locations"], source=["loc", "mesh"] ) + self.id_lookup: set = set() def transform_node(self, raw_node: dict) -> Concept | None: raw_concept = RawCatalogueConcept(raw_node, self.id_label_checker) if not raw_concept.is_concept: return None + + if raw_concept.wellcome_id in self.id_lookup: + return None + + self.id_lookup.add(raw_concept.wellcome_id) return Concept( id=raw_concept.wellcome_id, @@ -34,6 +40,11 @@ def extract_edges(self, raw_node: dict) -> Generator[ConceptHasSourceConcept]: if not raw_concept.is_concept: return + + if raw_concept.wellcome_id in self.id_lookup: + return + + 
self.id_lookup.add(raw_concept.wellcome_id) if (raw_concept.source == "label-derived") and ( raw_concept.type not in ["Person", "Organisation", "Agent"] diff --git a/tests/transformers/test_catalogue_concepts_transformer.py b/tests/transformers/test_catalogue_concepts_transformer.py index c53ad53227..1491921780 100644 --- a/tests/transformers/test_catalogue_concepts_transformer.py +++ b/tests/transformers/test_catalogue_concepts_transformer.py @@ -27,7 +27,7 @@ def test_catalogue_concepts_transformer() -> None: # test transform_node nodes = list(catalogue_concepts_transformer.stream(entity_type="nodes"))[0] - assert len(list(nodes)) == 12 + assert len(list(nodes)) == 6 assert nodes[0].id == "s6s24vd7" assert nodes[0].label == "Human anatomy" assert nodes[0].type == "Concept" From 476112e6b557d5b19cc038a5ed56464a2a3191ff Mon Sep 17 00:00:00 2001 From: Github on behalf of Wellcome Collection Date: Wed, 12 Feb 2025 15:26:31 +0000 Subject: [PATCH 309/310] Apply auto-formatting rules --- src/transformers/catalogue/concepts_transformer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/catalogue/concepts_transformer.py b/src/transformers/catalogue/concepts_transformer.py index bad25401ce..b2c2f72f9c 100644 --- a/src/transformers/catalogue/concepts_transformer.py +++ b/src/transformers/catalogue/concepts_transformer.py @@ -22,10 +22,10 @@ def transform_node(self, raw_node: dict) -> Concept | None: if not raw_concept.is_concept: return None - + if raw_concept.wellcome_id in self.id_lookup: return None - + self.id_lookup.add(raw_concept.wellcome_id) return Concept( @@ -40,10 +40,10 @@ def extract_edges(self, raw_node: dict) -> Generator[ConceptHasSourceConcept]: if not raw_concept.is_concept: return - + if raw_concept.wellcome_id in self.id_lookup: return - + self.id_lookup.add(raw_concept.wellcome_id) if (raw_concept.source == "label-derived") and ( From 4867f9222a10e725e1ac5df3647a657a2ffe052b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Wed, 12 Feb 2025 15:56:08 +0000 Subject: [PATCH 310/310] Fix linting issues #5903 --- builds/run_linting.sh | 2 +- .../src/sources/wikidata/sparql_query_builder.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/builds/run_linting.sh b/builds/run_linting.sh index 98a8450194..1d89917af9 100755 --- a/builds/run_linting.sh +++ b/builds/run_linting.sh @@ -12,5 +12,5 @@ docker run --tty --rm \ --volume "$ROOT:/data" \ --workdir /data \ "$ECR_REGISTRY/wellcome/flake8:latest" \ - --exclude .git,__pycache__,target,.terraform \ + --exclude .git,__pycache__,target,.terraform,catalogue_graph \ --ignore=E501,E122,E126,E203,W503 diff --git a/catalogue_graph/src/sources/wikidata/sparql_query_builder.py b/catalogue_graph/src/sources/wikidata/sparql_query_builder.py index db37ad0fe3..7d863e799d 100644 --- a/catalogue_graph/src/sources/wikidata/sparql_query_builder.py +++ b/catalogue_graph/src/sources/wikidata/sparql_query_builder.py @@ -84,7 +84,7 @@ def _get_label_mappings(node_type: NodeType) -> str: ?item schema:description ?itemDescription. ?item skos:altLabel ?itemAltLabel. 
{"\n".join(extra_mappings)} - }} + }} }} """ @@ -103,7 +103,7 @@ def get_all_ids_query(linked_ontology: OntologyType) -> str: raise ValueError(f"Invalid linked ontology type: {linked_ontology}") get_ids_query = f""" - SELECT ?item WHERE {{ + SELECT ?item WHERE {{ {field_filter} }} """ @@ -154,7 +154,7 @@ def get_edge_query( raise ValueError(f"Unknown edge type: {edge_type}") query = f""" - SELECT DISTINCT ?fromItem ?toItem + SELECT DISTINCT ?fromItem ?toItem WHERE {{ VALUES ?fromItem {{ {ids_clause} }} ?fromItem {property_path} ?toItem.