diff --git a/how-to/use-the-api.md b/how-to/use-the-api.md index dedf7e9dd..0af873993 100644 --- a/how-to/use-the-api.md +++ b/how-to/use-the-api.md @@ -9,12 +9,18 @@ `GET /trove/index-value-search`: search values for specific properties on index-cards ## Posting index-cards -> NOTE: currently used only by other COS projects, not yet for public use +> NOTE: currently used only by other COS projects, not yet for public use; authorization required `POST /trove/ingest?focus_iri=...&record_identifier=...`: currently supports only `Content-Type: text/turtle` +query params: +- `focus_iri` (required): full iri of the focus resource, exactly as used in the request body +- `record_identifier` (required): a source-specific identifier for the metadata record (no format restrictions) -- sending another record with the same `record_identifier` is considered a full update (only the most recent is used) +- `nonurgent`: if present (regardless of value), ingestion may be given a lower priority -- recommended for bulk or background operations +- `is_supplementary`: if present (regardless of value), this record's metadata will be added to all pre-existing index-cards from the same user with the same `focus_iri` (if any), but will neither get an index-card of its own nor affect the last-updated timestamp (e.g. 
in OAI-PMH) of the index-cards it supplements + ## Deleting index-cards `DELETE /trove/ingest?record_identifier=...`: request diff --git a/share/migrations/0074_sourceuniqueidentifier_is_supplementary.py b/share/migrations/0074_sourceuniqueidentifier_is_supplementary.py new file mode 100644 index 000000000..b25b35fa2 --- /dev/null +++ b/share/migrations/0074_sourceuniqueidentifier_is_supplementary.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.25 on 2024-09-19 20:33 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('share', '0073_remove_indexbackfill_backfill_phase_index'), + ] + + operations = [ + migrations.AddField( + model_name='sourceuniqueidentifier', + name='is_supplementary', + field=models.BooleanField(null=True), + ), + ] diff --git a/share/models/source_unique_identifier.py b/share/models/source_unique_identifier.py index 5bd9c88cb..bc3bbaf5e 100644 --- a/share/models/source_unique_identifier.py +++ b/share/models/source_unique_identifier.py @@ -14,6 +14,7 @@ class SourceUniqueIdentifier(models.Model): identifier = models.TextField() # no restrictions on identifier format source_config = models.ForeignKey('SourceConfig', on_delete=models.CASCADE) focus_identifier = models.ForeignKey('trove.ResourceIdentifier', null=True, on_delete=models.PROTECT, related_name='suid_set') + is_supplementary = models.BooleanField(null=True) class JSONAPIMeta(BaseJSONAPIMeta): pass diff --git a/tests/trove/digestive_tract/test_derive.py b/tests/trove/digestive_tract/test_derive.py new file mode 100644 index 000000000..79fefe859 --- /dev/null +++ b/tests/trove/digestive_tract/test_derive.py @@ -0,0 +1,66 @@ +import json + +from django.test import TestCase +from primitive_metadata import primitive_rdf as rdf + +from tests import factories +from trove import digestive_tract +from trove import models as trove_db +from trove.vocab.namespaces import TROVE +from trove.util.iris import get_sufficiently_unique_iri + + 
+_BLARG = rdf.IriNamespace('https://blarg.example/') + + +class TestDigestiveTractDerive(TestCase): + @classmethod + def setUpTestData(cls): + cls.focus_iri = _BLARG.this + _focus_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(cls.focus_iri) + _raw = factories.RawDatumFactory() + cls.indexcard = trove_db.Indexcard.objects.create(source_record_suid=_raw.suid) + cls.indexcard.focus_identifier_set.add(_focus_ident) + cls.latest_rdf = trove_db.LatestIndexcardRdf.objects.create( + indexcard=cls.indexcard, + from_raw_datum=_raw, + focus_iri=cls.focus_iri, + rdf_as_turtle='''@prefix blarg: . +blarg:this + a blarg:Thing ; + blarg:like blarg:that . +''', + ) + + def test_derive(self): + (_derived,) = digestive_tract.derive(self.indexcard) + self.assertEqual(_derived.upriver_indexcard_id, self.indexcard.id) + self.assertEqual(_derived.deriver_identifier.sufficiently_unique_iri, get_sufficiently_unique_iri(TROVE['derive/osfmap_json'])) + self.assertEqual(json.loads(_derived.derived_text), { + '@id': _BLARG.this, + 'resourceType': [{'@id': _BLARG.Thing}], + _BLARG.like: [{'@id': _BLARG.that}], + }) + + def test_derive_with_supplementary(self): + _supp_raw = factories.RawDatumFactory( + suid=factories.SourceUniqueIdentifierFactory(is_supplementary=True), + ) + trove_db.SupplementaryIndexcardRdf.objects.create( + indexcard=self.indexcard, + from_raw_datum=_supp_raw, + supplementary_suid=_supp_raw.suid, + focus_iri=self.focus_iri, + rdf_as_turtle='''@prefix blarg: . +blarg:this blarg:unlike blarg:nonthing . 
+''', + ) + (_derived,) = digestive_tract.derive(self.indexcard) + self.assertEqual(_derived.upriver_indexcard_id, self.indexcard.id) + self.assertEqual(_derived.deriver_identifier.sufficiently_unique_iri, get_sufficiently_unique_iri(TROVE['derive/osfmap_json'])) + self.assertEqual(json.loads(_derived.derived_text), { + '@id': _BLARG.this, + 'resourceType': [{'@id': _BLARG.Thing}], + _BLARG.like: [{'@id': _BLARG.that}], + _BLARG.unlike: [{'@id': _BLARG.nonthing}], + }) diff --git a/tests/trove/digestive_tract/test_extract.py b/tests/trove/digestive_tract/test_extract.py new file mode 100644 index 000000000..a8f19c7ab --- /dev/null +++ b/tests/trove/digestive_tract/test_extract.py @@ -0,0 +1,91 @@ +from django.test import TestCase +from primitive_metadata import primitive_rdf as rdf + +from tests import factories +from trove import digestive_tract +from trove import models as trove_db + + +_BLARG = rdf.IriNamespace('https://blarg.example/') + + +class TestDigestiveTractExtract(TestCase): + @classmethod + def setUpTestData(cls): + _focus_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_BLARG.this) + cls.raw = factories.RawDatumFactory( + mediatype='text/turtle', + datum='''@prefix blarg: . +blarg:this + a blarg:Thing ; + blarg:like blarg:that . +''', + suid__focus_identifier=_focus_ident, + ) + cls.supplementary_raw = factories.RawDatumFactory( + mediatype='text/turtle', + datum='''@prefix blarg: . +blarg:this blarg:like blarg:another ; + blarg:unlike blarg:nonthing . 
+''', + suid=factories.SourceUniqueIdentifierFactory( + source_config=cls.raw.suid.source_config, + focus_identifier=cls.raw.suid.focus_identifier, + is_supplementary=True, + ), + ) + + def test_setup(self): + self.assertEqual(trove_db.Indexcard.objects.all().count(), 0) + self.assertEqual(trove_db.LatestIndexcardRdf.objects.all().count(), 0) + self.assertEqual(trove_db.ArchivedIndexcardRdf.objects.all().count(), 0) + self.assertEqual(trove_db.SupplementaryIndexcardRdf.objects.all().count(), 0) + + def test_extract(self): + (_indexcard,) = digestive_tract.extract(self.raw) + self.assertEqual(_indexcard.source_record_suid_id, self.raw.suid_id) + _focus_idents = list( + _indexcard.focus_identifier_set.values_list('sufficiently_unique_iri', flat=True), + ) + self.assertEqual(_focus_idents, ['://blarg.example/this']) + _focustype_idents = list( + _indexcard.focustype_identifier_set.values_list('sufficiently_unique_iri', flat=True), + ) + self.assertEqual(_focustype_idents, ['://blarg.example/Thing']) + self.assertEqual(list(_indexcard.supplementary_rdf_set.all()), []) + _latest_rdf = _indexcard.latest_rdf + self.assertEqual(_latest_rdf.from_raw_datum_id, self.raw.id) + self.assertEqual(_latest_rdf.indexcard_id, _indexcard.id) + self.assertEqual(_latest_rdf.focus_iri, _BLARG.this) + self.assertEqual(_latest_rdf.as_rdf_tripledict(), { + _BLARG.this: { + rdf.RDF.type: {_BLARG.Thing}, + _BLARG.like: {_BLARG.that}, + }, + }) + + def test_extract_supplementary_without_prior(self): + _cards = digestive_tract.extract(self.supplementary_raw) + self.assertEqual(_cards, []) + self.assertEqual(trove_db.Indexcard.objects.all().count(), 0) + self.assertEqual(trove_db.LatestIndexcardRdf.objects.all().count(), 0) + self.assertEqual(trove_db.ArchivedIndexcardRdf.objects.all().count(), 0) + self.assertEqual(trove_db.SupplementaryIndexcardRdf.objects.all().count(), 0) + + def test_extract_supplementary(self): + (_orig_indexcard,) = digestive_tract.extract(self.raw) + _orig_timestamp = 
_orig_indexcard.latest_rdf.modified + (_indexcard,) = digestive_tract.extract(self.supplementary_raw) + self.assertEqual(_orig_indexcard.id, _indexcard.id) + self.assertEqual(_indexcard.source_record_suid_id, self.raw.suid_id) + (_supp_rdf,) = _indexcard.supplementary_rdf_set.all() + self.assertEqual(_supp_rdf.from_raw_datum_id, self.supplementary_raw.id) + self.assertEqual(_supp_rdf.indexcard_id, _indexcard.id) + self.assertEqual(_supp_rdf.focus_iri, _BLARG.this) + self.assertEqual(_supp_rdf.as_rdf_tripledict(), { + _BLARG.this: { + _BLARG.like: {_BLARG.another}, + _BLARG.unlike: {_BLARG.nonthing}, + }, + }) + self.assertEqual(_indexcard.latest_rdf.modified, _orig_timestamp) diff --git a/tests/trove/digestive_tract/test_swallow.py b/tests/trove/digestive_tract/test_swallow.py new file mode 100644 index 000000000..62a81309e --- /dev/null +++ b/tests/trove/digestive_tract/test_swallow.py @@ -0,0 +1,77 @@ +from unittest import mock +from django.test import TestCase + +from tests import factories +from trove import digestive_tract +from share import models as share_db + + +class TestDigestiveTractSwallow(TestCase): + @classmethod + def setUpTestData(cls): + cls.user = factories.ShareUserFactory() + cls.turtle = ''' +@prefix blarg: . +blarg:this + a blarg:Thing ; + blarg:like blarg:that . 
+''' + + def test_setup(self): + self.assertEqual(share_db.RawDatum.objects.all().count(), 0) + + def test_swallow(self): + with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task: + digestive_tract.swallow( + from_user=self.user, + record=self.turtle, + record_identifier='blarg', + record_mediatype='text/turtle', + focus_iri='https://blarg.example/this', + ) + (_raw,) = share_db.RawDatum.objects.all() + self.assertEqual(_raw.datum, self.turtle) + self.assertEqual(_raw.mediatype, 'text/turtle') + self.assertEqual(_raw.suid.identifier, 'blarg') + self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/this') + self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id) + self.assertFalse(_raw.suid.is_supplementary) + _mock_task.delay.assert_called_once_with(_raw.id, urgent=False) + + def test_swallow_urgent(self): + with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task: + digestive_tract.swallow( + from_user=self.user, + record=self.turtle, + record_identifier='blarg', + record_mediatype='text/turtle', + focus_iri='https://blarg.example/this', + urgent=True + ) + (_raw,) = share_db.RawDatum.objects.all() + self.assertEqual(_raw.datum, self.turtle) + self.assertEqual(_raw.mediatype, 'text/turtle') + self.assertEqual(_raw.suid.identifier, 'blarg') + self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/this') + self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id) + self.assertFalse(_raw.suid.is_supplementary) + _mock_task.delay.assert_called_once_with(_raw.id, urgent=True) + + def test_swallow_supplementary(self): + with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task: + digestive_tract.swallow( + from_user=self.user, + record=self.turtle, + record_identifier='blarg', + record_mediatype='text/turtle', + focus_iri='https://blarg.example/this', + is_supplementary=True, + ) + (_raw,) = 
share_db.RawDatum.objects.all() + self.assertEqual(_raw.datum, self.turtle) + self.assertEqual(_raw.mediatype, 'text/turtle') + self.assertEqual(_raw.suid.identifier, 'blarg') + self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/this') + self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id) + self.assertTrue(_raw.suid.is_supplementary) + _mock_task.delay.assert_called_once_with(_raw.id, urgent=False) diff --git a/tests/trove/views/test_ingest.py b/tests/trove/views/test_ingest.py index f7fb49f74..2b828f399 100644 --- a/tests/trove/views/test_ingest.py +++ b/tests/trove/views/test_ingest.py @@ -1,6 +1,139 @@ +from http import HTTPStatus +from unittest import mock +from urllib.parse import urlencode + from django.test import TestCase +from tests import factories + class TestIngest(TestCase): - def test_simple_ingest(self): - pass # TODO + @classmethod + def setUpTestData(cls): + cls.user = factories.ShareUserFactory(is_trusted=True) + + def test_post(self): + with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract: + _resp = self.client.post( + '/trove/ingest?' + urlencode({ + 'focus_iri': 'https://foo.example/blarg', + 'record_identifier': 'blarg', + }), + content_type='text/turtle', + data='turtleturtleturtle', + HTTP_AUTHORIZATION=self.user.authorization(), + ) + self.assertEqual(_resp.status_code, HTTPStatus.CREATED) + _mock_tract.swallow.assert_called_once_with( + from_user=self.user, + record='turtleturtleturtle', + record_identifier='blarg', + record_mediatype='text/turtle', + focus_iri='https://foo.example/blarg', + urgent=True, + is_supplementary=False, + ) + + def test_post_nonurgent(self): + with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract: + _resp = self.client.post( + '/trove/ingest?' 
+ urlencode({ + 'focus_iri': 'https://foo.example/blarg', + 'record_identifier': 'blarg', + 'nonurgent': '', + }), + content_type='text/turtle', + data='turtleturtleturtle', + HTTP_AUTHORIZATION=self.user.authorization(), + ) + self.assertEqual(_resp.status_code, HTTPStatus.CREATED) + _mock_tract.swallow.assert_called_once_with( + from_user=self.user, + record='turtleturtleturtle', + record_identifier='blarg', + record_mediatype='text/turtle', + focus_iri='https://foo.example/blarg', + urgent=False, + is_supplementary=False, + ) + + def test_post_supplementary(self): + with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract: + _resp = self.client.post( + '/trove/ingest?' + urlencode({ + 'focus_iri': 'https://foo.example/blarg', + 'record_identifier': 'blarg', + 'is_supplementary': '', + }), + content_type='text/turtle', + data='turtleturtleturtle', + HTTP_AUTHORIZATION=self.user.authorization(), + ) + self.assertEqual(_resp.status_code, HTTPStatus.CREATED) + _mock_tract.swallow.assert_called_once_with( + from_user=self.user, + record='turtleturtleturtle', + record_identifier='blarg', + record_mediatype='text/turtle', + focus_iri='https://foo.example/blarg', + urgent=True, + is_supplementary=True, + ) + + def test_delete(self): + with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract: + _resp = self.client.delete( + '/trove/ingest?record_identifier=blarg', + HTTP_AUTHORIZATION=self.user.authorization(), + ) + self.assertEqual(_resp.status_code, HTTPStatus.OK) + _mock_tract.expel.assert_called_once_with( + from_user=self.user, + record_identifier='blarg', + ) + + def test_anonymous_post(self): + with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract: + _resp = self.client.post( + '/trove/ingest?' 
+ urlencode({ + 'focus_iri': 'https://foo.example/blarg', + 'record_identifier': 'blarg', + 'is_supplementary': '', + }), + content_type='text/turtle', + data='turtleturtleturtle', + ) + self.assertEqual(_resp.status_code, HTTPStatus.UNAUTHORIZED) + self.assertFalse(_mock_tract.swallow.called) + + def test_nontrusted_post(self): + _nontrusted_user = factories.ShareUserFactory() + with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract: + _resp = self.client.post( + '/trove/ingest?' + urlencode({ + 'focus_iri': 'https://foo.example/blarg', + 'record_identifier': 'blarg', + 'is_supplementary': '', + }), + content_type='text/turtle', + data='turtleturtleturtle', + HTTP_AUTHORIZATION=_nontrusted_user.authorization(), + ) + self.assertEqual(_resp.status_code, HTTPStatus.FORBIDDEN) + self.assertFalse(_mock_tract.swallow.called) + + def test_anonymous_delete(self): + with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract: + _resp = self.client.delete('/trove/ingest?record_identifier=blarg') + self.assertEqual(_resp.status_code, HTTPStatus.UNAUTHORIZED) + self.assertFalse(_mock_tract.expel.called) + + def test_nontrusted_delete(self): + _nontrusted_user = factories.ShareUserFactory() + with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract: + _resp = self.client.delete( + '/trove/ingest?record_identifier=blarg', + HTTP_AUTHORIZATION=_nontrusted_user.authorization(), + ) + self.assertEqual(_resp.status_code, HTTPStatus.FORBIDDEN) + self.assertFalse(_mock_tract.expel.called) diff --git a/trove/derive/_base.py b/trove/derive/_base.py index 823f494e9..a16dc8fe0 100644 --- a/trove/derive/_base.py +++ b/trove/derive/_base.py @@ -1,4 +1,5 @@ import abc +from collections.abc import Iterable from primitive_metadata import primitive_rdf @@ -10,10 +11,16 @@ class IndexcardDeriver(abc.ABC): focus_iri: str data: primitive_rdf.RdfGraph - def __init__(self, upriver_rdf: IndexcardRdf): + def __init__( + self, + upriver_rdf: IndexcardRdf, + 
supplementary_rdf_set: Iterable[IndexcardRdf] = (), + ): self.upriver_rdf = upriver_rdf self.focus_iri = upriver_rdf.focus_iri self.data = primitive_rdf.RdfGraph(upriver_rdf.as_rdf_tripledict()) + for _supplementary_rdf in supplementary_rdf_set: + self.data.add_tripledict(_supplementary_rdf.as_rdf_tripledict()) def q(self, pathset): # convenience for querying self.data on self.focus_iri diff --git a/trove/digestive_tract.py b/trove/digestive_tract.py index 402ed4ac1..2c3e21397 100644 --- a/trove/digestive_tract.py +++ b/trove/digestive_tract.py @@ -40,6 +40,7 @@ def swallow( focus_iri: str, datestamp=None, # default "now" urgent=False, + is_supplementary=False, ): '''swallow: store a given record by checksum; queue for extraction @@ -55,7 +56,12 @@ def swallow( _suid, _suid_created = share_db.SourceUniqueIdentifier.objects.get_or_create( source_config=_source_config, identifier=record_identifier, + defaults={ + 'is_supplementary': is_supplementary, + }, ) + if bool(_suid.is_supplementary) != is_supplementary: + raise DigestiveError(f'suid is_supplementary should not change! 
suid={_suid}, is_supplementary changed from {bool(_suid.is_supplementary)} to {is_supplementary}') _focus_identifier = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri) if _suid.focus_identifier is None: _suid.focus_identifier = _focus_identifier @@ -104,11 +110,12 @@ def swallow__sharev2_legacy( def extract(raw: share_db.RawDatum, *, undelete_indexcards=False) -> list[trove_db.Indexcard]: '''extract: gather rdf graph from a record; store as index card(s) - will create (or update): + may create (or update): ResourceIdentifier (for each described resource and its types) Indexcard (with identifiers and type-identifiers for each described resource) - ArchivedIndexcardRdf (all extracted metadata) - LatestIndexcardRdf (all extracted metadata, if latest raw) + ArchivedIndexcardRdf (all extracted metadata, if non-supplementary) + LatestIndexcardRdf (all extracted metadata, if latest raw and non-supplementary) + SupplementaryIndexcardRdf (all extracted metadata, if supplementary) may delete: LatestIndexcardRdf (previously extracted from the record, but no longer present) ''' @@ -137,6 +144,11 @@ def extract(raw: share_db.RawDatum, *, undelete_indexcards=False) -> list[trove_ (_iri, RDFS.isDefinedBy, _focus_iri), ) _tripledicts_by_focus_iri[_iri] = _term_tripledict + if raw.suid.is_supplementary: + return trove_db.Indexcard.objects.supplement_indexcards_from_tripledicts( + from_raw_datum=raw, + rdf_tripledicts_by_focus_iri=_tripledicts_by_focus_iri, + ) return trove_db.Indexcard.objects.save_indexcards_from_tripledicts( from_raw_datum=raw, rdf_tripledicts_by_focus_iri=_tripledicts_by_focus_iri, @@ -150,10 +162,18 @@ def derive(indexcard: trove_db.Indexcard, deriver_iris=None): will create, update, or delete: DerivedIndexcard ''' - if indexcard.deleted or not indexcard.latest_rdf: - return + if indexcard.deleted: + return [] + try: + _latest_rdf = indexcard.latest_rdf + except trove_db.LatestIndexcardRdf.DoesNotExist: + return [] + _derived_list = [] for 
_deriver_class in get_deriver_classes(deriver_iris): - _deriver = _deriver_class(upriver_rdf=indexcard.latest_rdf) + _deriver = _deriver_class( + upriver_rdf=_latest_rdf, + supplementary_rdf_set=indexcard.supplementary_rdf_set.all(), + ) _deriver_identifier = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_deriver.deriver_iri()) if _deriver.should_skip(): trove_db.DerivedIndexcard.objects.filter( @@ -163,14 +183,16 @@ def derive(indexcard: trove_db.Indexcard, deriver_iris=None): else: _derived_text = _deriver.derive_card_as_text() _derived_checksum_iri = ChecksumIri.digest('sha-256', salt='', raw_data=_derived_text) - trove_db.DerivedIndexcard.objects.update_or_create( + _derived, _ = trove_db.DerivedIndexcard.objects.update_or_create( upriver_indexcard=indexcard, deriver_identifier=_deriver_identifier, defaults={ - 'derived_text': _deriver.derive_card_as_text(), + 'derived_text': _derived_text, 'derived_checksum_iri': _derived_checksum_iri, }, ) + _derived_list.append(_derived) + return _derived_list def expel(from_user: share_db.ShareUser, record_identifier: str): @@ -178,6 +200,11 @@ def expel(from_user: share_db.ShareUser, record_identifier: str): source_config__source__user=from_user, identifier=record_identifier, ) + ( + trove_db.SupplementaryIndexcardRdf.objects + .filter(supplementary_suid__in=_suid_qs) + .delete() + ) for _indexcard in trove_db.Indexcard.objects.filter(source_record_suid__in=_suid_qs): _indexcard.pls_delete() @@ -247,6 +274,9 @@ def task__schedule_all_for_deriver(deriver_iri: str, notify_index=False): def _sharev2_legacy_ingest(raw, urgent: bool): assert raw.mediatype is None, 'raw datum has a mediatype -- did you mean to call non-legacy extract?' 
_extractor = get_rdf_extractor_class(None)(raw.suid.source_config) + if typing.TYPE_CHECKING: + from trove.extract.legacy_sharev2 import LegacySharev2Extractor + assert isinstance(_extractor, LegacySharev2Extractor) _sharev2graph = _extractor.extract_sharev2_graph(raw.datum) _centralnode = _sharev2graph.get_central_node(guess=True) _normd = share_db.NormalizedData.objects.create( diff --git a/trove/migrations/0006_supplementary_indexcard_rdf.py b/trove/migrations/0006_supplementary_indexcard_rdf.py new file mode 100644 index 000000000..1dbf504ab --- /dev/null +++ b/trove/migrations/0006_supplementary_indexcard_rdf.py @@ -0,0 +1,33 @@ +# Generated by Django 3.2.25 on 2024-09-19 20:33 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('share', '0074_sourceuniqueidentifier_is_supplementary'), + ('trove', '0005_indexes_for_oaipmh'), + ] + + operations = [ + migrations.CreateModel( + name='SupplementaryIndexcardRdf', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created', models.DateTimeField(auto_now_add=True)), + ('modified', models.DateTimeField(auto_now=True)), + ('turtle_checksum_iri', models.TextField(db_index=True)), + ('focus_iri', models.TextField()), + ('rdf_as_turtle', models.TextField()), + ('from_raw_datum', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='+', to='share.rawdatum')), + ('indexcard', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='trove_supplementaryindexcardrdf_set', to='trove.indexcard')), + ('supplementary_suid', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='supplementary_rdf_set', to='share.sourceuniqueidentifier')), + ], + ), + migrations.AddConstraint( + model_name='supplementaryindexcardrdf', + constraint=models.UniqueConstraint(fields=('indexcard', 'supplementary_suid'), 
name='trove_supplementaryindexcardrdf_uniq_supplement'), + ), + ] diff --git a/trove/models/__init__.py b/trove/models/__init__.py index 42bc2d0ca..acaadd7c4 100644 --- a/trove/models/__init__.py +++ b/trove/models/__init__.py @@ -4,7 +4,15 @@ 'IndexcardRdf', 'LatestIndexcardRdf', 'ArchivedIndexcardRdf', + 'SupplementaryIndexcardRdf', 'DerivedIndexcard', ) -from .indexcard import Indexcard, IndexcardRdf, LatestIndexcardRdf, ArchivedIndexcardRdf, DerivedIndexcard +from .indexcard import ( + ArchivedIndexcardRdf, + DerivedIndexcard, + Indexcard, + IndexcardRdf, + LatestIndexcardRdf, + SupplementaryIndexcardRdf, +) from .resource_identifier import ResourceIdentifier diff --git a/trove/models/indexcard.py b/trove/models/indexcard.py index b7a411ffa..35b11fa27 100644 --- a/trove/models/indexcard.py +++ b/trove/models/indexcard.py @@ -1,4 +1,4 @@ -from typing import Optional +from __future__ import annotations import uuid from django.db import models @@ -27,6 +27,7 @@ def save_indexcards_from_tripledicts( rdf_tripledicts_by_focus_iri: dict[str, rdf.RdfTripleDictionary], undelete: bool = False, ) -> list['Indexcard']: + assert not from_raw_datum.suid.is_supplementary from_raw_datum.no_output = (not rdf_tripledicts_by_focus_iri) from_raw_datum.save(update_fields=['no_output']) _indexcards = [] @@ -55,6 +56,36 @@ def save_indexcards_from_tripledicts( _indexcard_to_delete.pls_delete() return _indexcards + @transaction.atomic + def supplement_indexcards_from_tripledicts( + self, *, + from_raw_datum: share_db.RawDatum, + rdf_tripledicts_by_focus_iri: dict[str, rdf.RdfTripleDictionary], + ) -> list[Indexcard]: + assert from_raw_datum.suid.is_supplementary + from_raw_datum.no_output = (not rdf_tripledicts_by_focus_iri) + from_raw_datum.save(update_fields=['no_output']) + if not from_raw_datum.is_latest(): + return [] + _indexcards = [] + for _focus_iri, _tripledict in rdf_tripledicts_by_focus_iri.items(): + _indexcards.extend(self.supplement_indexcards( + 
from_raw_datum=from_raw_datum, + rdf_tripledict=_tripledict, + focus_iri=_focus_iri, + )) + _seen_indexcard_ids = {_card.id for _card in _indexcards} + # supplementary data seen previously on this suid (but not this time) should be deleted + for _supplement_to_delete in ( + SupplementaryIndexcardRdf.objects + .filter(supplementary_suid=from_raw_datum.suid) + .exclude(from_raw_datum=from_raw_datum) + ): + if _supplement_to_delete.indexcard_id not in _seen_indexcard_ids: + _indexcards.append(_supplement_to_delete.indexcard) + _supplement_to_delete.delete() + return _indexcards + @transaction.atomic def save_indexcard_from_tripledict( self, *, @@ -63,8 +94,7 @@ def save_indexcard_from_tripledict( focus_iri: str, undelete: bool = False, ): - if focus_iri not in rdf_tripledict: - raise DigestiveError(f'expected {focus_iri} in {set(rdf_tripledict.keys())}') + assert not from_raw_datum.suid.is_supplementary _focus_identifier_set = ( ResourceIdentifier.objects .save_equivalent_identifier_set(rdf_tripledict, focus_iri) @@ -84,14 +114,35 @@ def save_indexcard_from_tripledict( _indexcard.save() _indexcard.focus_identifier_set.set(_focus_identifier_set) _indexcard.focustype_identifier_set.set(_focustype_identifier_set) - IndexcardRdf.save_indexcard_rdf( - indexcard=_indexcard, + _indexcard.update_rdf( from_raw_datum=from_raw_datum, rdf_tripledict=rdf_tripledict, focus_iri=focus_iri, ) return _indexcard + @transaction.atomic + def supplement_indexcards( + self, *, + from_raw_datum: share_db.RawDatum, + rdf_tripledict: rdf.RdfTripleDictionary, + focus_iri: str, + ) -> list[Indexcard]: + assert from_raw_datum.suid.is_supplementary + # supplement indexcards with the same focus from the same source_config + # (if none exist, fine, nothing gets supplemented) + _indexcards = list(Indexcard.objects.filter( + source_record_suid__source_config_id=from_raw_datum.suid.source_config_id, + focus_identifier_set__in=ResourceIdentifier.objects.queryset_for_iri(focus_iri), + )) + for _indexcard 
in _indexcards: + _indexcard.update_supplementary_rdf( + from_raw_datum=from_raw_datum, + rdf_tripledict=rdf_tripledict, + focus_iri=focus_iri, + ) + return _indexcards + class Indexcard(models.Model): objects = IndexcardManager() @@ -127,10 +178,10 @@ class Meta: ] @property - def latest_rdf(self) -> Optional['LatestIndexcardRdf']: + def latest_rdf(self) -> LatestIndexcardRdf: '''convenience for the "other side" of LatestIndexcardRdf.indexcard ''' - return self.trove_latestindexcardrdf_set.first() + return self.trove_latestindexcardrdf_set.get() # may raise DoesNotExist @property def archived_rdf_set(self): @@ -140,6 +191,14 @@ def archived_rdf_set(self): ''' return self.trove_archivedindexcardrdf_set + @property + def supplementary_rdf_set(self): + '''convenience for the "other side" of SupplementaryIndexcardRdf.indexcard + + returns a RelatedManager + ''' + return self.trove_supplementaryindexcardrdf_set + def get_iri(self): return trove_indexcard_iri(self.uuid) @@ -166,6 +225,61 @@ def __repr__(self): def __str__(self): return repr(self) + @transaction.atomic + def update_rdf( + self, + from_raw_datum: share_db.RawDatum, + focus_iri: str, + rdf_tripledict: rdf.RdfTripleDictionary, + ) -> 'IndexcardRdf': + if focus_iri not in rdf_tripledict: + raise DigestiveError(f'expected {focus_iri} in {set(rdf_tripledict.keys())}') + _rdf_as_turtle, _turtle_checksum_iri = _turtlify(rdf_tripledict) + _archived, _archived_created = ArchivedIndexcardRdf.objects.get_or_create( + indexcard=self, + from_raw_datum=from_raw_datum, + turtle_checksum_iri=_turtle_checksum_iri, + defaults={ + 'rdf_as_turtle': _rdf_as_turtle, + 'focus_iri': focus_iri, + }, + ) + if (not _archived_created) and (_archived.rdf_as_turtle != _rdf_as_turtle): + raise DigestiveError(f'hash collision? 
{_archived}\n===\n{_rdf_as_turtle}') + if not self.deleted and from_raw_datum.is_latest(): + _latest_indexcard_rdf, _created = LatestIndexcardRdf.objects.update_or_create( + indexcard=self, + defaults={ + 'from_raw_datum': from_raw_datum, + 'turtle_checksum_iri': _turtle_checksum_iri, + 'rdf_as_turtle': _rdf_as_turtle, + 'focus_iri': focus_iri, + }, + ) + return _latest_indexcard_rdf + return _archived + + def update_supplementary_rdf( + self, + from_raw_datum: share_db.RawDatum, + focus_iri: str, + rdf_tripledict: rdf.RdfTripleDictionary, + ) -> SupplementaryIndexcardRdf: + if focus_iri not in rdf_tripledict: + raise DigestiveError(f'expected {focus_iri} in {set(rdf_tripledict.keys())}') + _rdf_as_turtle, _turtle_checksum_iri = _turtlify(rdf_tripledict) + _supplement_rdf, _ = SupplementaryIndexcardRdf.objects.update_or_create( + indexcard=self, + supplementary_suid=from_raw_datum.suid, + defaults={ + 'from_raw_datum': from_raw_datum, + 'turtle_checksum_iri': _turtle_checksum_iri, + 'rdf_as_turtle': _rdf_as_turtle, + 'focus_iri': focus_iri, + }, + ) + return _supplement_rdf + class IndexcardRdf(models.Model): # auto: @@ -205,44 +319,6 @@ def __repr__(self): def __str__(self): return repr(self) - @transaction.atomic - @staticmethod - def save_indexcard_rdf( - indexcard: Indexcard, - from_raw_datum: share_db.RawDatum, - rdf_tripledict: rdf.RdfTripleDictionary, - focus_iri: str, - ) -> 'IndexcardRdf': - if focus_iri not in rdf_tripledict: - raise DigestiveError(f'expected {focus_iri} in {set(rdf_tripledict.keys())}') - _rdf_as_turtle = rdf.turtle_from_tripledict(rdf_tripledict) - _turtle_checksum_iri = str( - ChecksumIri.digest('sha-256', salt='', raw_data=_rdf_as_turtle), - ) - _archived, _archived_created = ArchivedIndexcardRdf.objects.get_or_create( - indexcard=indexcard, - from_raw_datum=from_raw_datum, - turtle_checksum_iri=_turtle_checksum_iri, - defaults={ - 'rdf_as_turtle': _rdf_as_turtle, - 'focus_iri': focus_iri, - }, - ) - if (not _archived_created) and 
(_archived.rdf_as_turtle != _rdf_as_turtle): - raise DigestiveError(f'hash collision? {_archived}\n===\n{_rdf_as_turtle}') - if not indexcard.deleted and from_raw_datum.is_latest(): - _latest_indexcard_rdf, _created = LatestIndexcardRdf.objects.update_or_create( - indexcard=indexcard, - defaults={ - 'from_raw_datum': from_raw_datum, - 'turtle_checksum_iri': _turtle_checksum_iri, - 'rdf_as_turtle': _rdf_as_turtle, - 'focus_iri': focus_iri, - }, - ) - return _latest_indexcard_rdf - return _archived - class LatestIndexcardRdf(IndexcardRdf): # just the most recent version of this indexcard @@ -254,7 +330,7 @@ class Meta: ), ] indexes = [ - models.Index(fields=('modified',)), + models.Index(fields=('modified',)), # for OAI-PMH selective harvest ] @@ -269,6 +345,23 @@ class Meta: ] +class SupplementaryIndexcardRdf(IndexcardRdf): + # supplementary (non-descriptive) metadata from the same source (just the most recent) + supplementary_suid = models.ForeignKey( + share_db.SourceUniqueIdentifier, + on_delete=models.CASCADE, + related_name='supplementary_rdf_set', + ) + + class Meta: + constraints = [ + models.UniqueConstraint( + fields=('indexcard', 'supplementary_suid'), + name='%(app_label)s_%(class)s_uniq_supplement', + ), + ] + + class DerivedIndexcard(models.Model): # auto: created = models.DateTimeField(auto_now_add=True) @@ -309,3 +402,15 @@ def as_rdf_literal(self) -> rdf.Literal: self.derived_text, datatype_iris=self.deriver_cls.derived_datatype_iris(), ) + + +### +# local helpers + +def _turtlify(rdf_tripledict: rdf.RdfTripleDictionary) -> tuple[str, str]: + '''return turtle serialization and checksum iri of that serialization''' + _rdf_as_turtle = rdf.turtle_from_tripledict(rdf_tripledict) + _turtle_checksum_iri = str( + ChecksumIri.digest('sha-256', salt='', raw_data=_rdf_as_turtle), + ) + return (_rdf_as_turtle, _turtle_checksum_iri) diff --git a/trove/views/ingest.py b/trove/views/ingest.py index c213c9cf2..3f4dc654b 100644 --- a/trove/views/ingest.py +++ 
b/trove/views/ingest.py @@ -1,3 +1,4 @@ +from http import HTTPStatus import logging from django import http @@ -17,16 +18,18 @@ def get(self, request): def post(self, request): # TODO: better error responses (jsonapi? shacl:ValidationReport?) - # TODO: permissions, validate focus_iri domain with user Source? + # TODO: permissions by focus_iri domain (compare with user's Source)? if not request.user.is_authenticated: - return http.HttpResponse(status=401) + return http.HttpResponse(status=HTTPStatus.UNAUTHORIZED) + if not request.user.is_trusted: + return http.HttpResponse(status=HTTPStatus.FORBIDDEN) # TODO: declare/validate params with dataclass _focus_iri = request.GET.get('focus_iri') if not _focus_iri: - return http.HttpResponse('focus_iri queryparam required', status=400) + return http.HttpResponse('focus_iri queryparam required', status=HTTPStatus.BAD_REQUEST) _record_identifier = request.GET.get('record_identifier') if not _record_identifier: - return http.HttpResponse('record_identifier queryparam required', status=400) + return http.HttpResponse('record_identifier queryparam required', status=HTTPStatus.BAD_REQUEST) try: digestive_tract.swallow( from_user=request.user, @@ -35,24 +38,27 @@ def post(self, request): record_mediatype=request.content_type, focus_iri=_focus_iri, urgent=(request.GET.get('nonurgent') is None), + is_supplementary=(request.GET.get('is_supplementary') is not None), ) except exceptions.IngestError as e: logger.exception(str(e)) - return http.HttpResponse(str(e), status=400) + return http.HttpResponse(str(e), status=HTTPStatus.BAD_REQUEST) else: # TODO: include link to view status (return task id from `swallow`?) 
- return http.HttpResponse(status=201) + return http.HttpResponse(status=HTTPStatus.CREATED) def delete(self, request): # TODO: cleaner permissions if not request.user.is_authenticated: - return http.HttpResponse(status=401) + return http.HttpResponse(status=HTTPStatus.UNAUTHORIZED) + if not request.user.is_trusted: + return http.HttpResponse(status=HTTPStatus.FORBIDDEN) # TODO: declare/validate params with dataclass _record_identifier = request.GET.get('record_identifier') if not _record_identifier: - return http.HttpResponse('record_identifier queryparam required', status=400) + return http.HttpResponse('record_identifier queryparam required', status=HTTPStatus.BAD_REQUEST) digestive_tract.expel( from_user=request.user, record_identifier=_record_identifier, ) - return http.HttpResponse(status=200) + return http.HttpResponse(status=HTTPStatus.OK)