diff --git a/how-to/use-the-api.md b/how-to/use-the-api.md
index dedf7e9dd..0af873993 100644
--- a/how-to/use-the-api.md
+++ b/how-to/use-the-api.md
@@ -9,12 +9,18 @@
`GET /trove/index-value-search`: search values for specific properties on index-cards
## Posting index-cards
-> NOTE: currently used only by other COS projects, not yet for public use
+> NOTE: currently used only by other COS projects and not yet open for public use; authorization is required
`POST /trove/ingest?focus_iri=...&record_identifier=...`:
currently supports only `Content-Type: text/turtle`
+query params:
+- `focus_iri` (required): full iri of the focus resource, exactly as used in the request body
+- `record_identifier` (required): a source-specific identifier for the metadata record (no format restrictions) -- sending another record with the same `record_identifier` is considered a full update (only the most recent is used)
+- `nonurgent` (optional): if present (regardless of value), ingestion may be given a lower priority -- recommended for bulk or background operations
+- `is_supplementary` (optional): if present (regardless of value), this record's metadata will be added to all pre-existing index-cards from the same user with the same `focus_iri` (if any), but the record will neither get an index-card of its own nor affect the last-updated timestamp (e.g. in OAI-PMH) of the index-cards it supplements
+
## Deleting index-cards
`DELETE /trove/ingest?record_identifier=...`: request
diff --git a/share/migrations/0074_sourceuniqueidentifier_is_supplementary.py b/share/migrations/0074_sourceuniqueidentifier_is_supplementary.py
new file mode 100644
index 000000000..b25b35fa2
--- /dev/null
+++ b/share/migrations/0074_sourceuniqueidentifier_is_supplementary.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.2.25 on 2024-09-19 20:33
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('share', '0073_remove_indexbackfill_backfill_phase_index'),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name='sourceuniqueidentifier',
+ name='is_supplementary',
+ field=models.BooleanField(null=True),
+ ),
+ ]
diff --git a/share/models/source_unique_identifier.py b/share/models/source_unique_identifier.py
index 5bd9c88cb..bc3bbaf5e 100644
--- a/share/models/source_unique_identifier.py
+++ b/share/models/source_unique_identifier.py
@@ -14,6 +14,7 @@ class SourceUniqueIdentifier(models.Model):
identifier = models.TextField() # no restrictions on identifier format
source_config = models.ForeignKey('SourceConfig', on_delete=models.CASCADE)
focus_identifier = models.ForeignKey('trove.ResourceIdentifier', null=True, on_delete=models.PROTECT, related_name='suid_set')
+ is_supplementary = models.BooleanField(null=True)
class JSONAPIMeta(BaseJSONAPIMeta):
pass
diff --git a/tests/trove/digestive_tract/test_derive.py b/tests/trove/digestive_tract/test_derive.py
new file mode 100644
index 000000000..79fefe859
--- /dev/null
+++ b/tests/trove/digestive_tract/test_derive.py
@@ -0,0 +1,66 @@
+import json
+
+from django.test import TestCase
+from primitive_metadata import primitive_rdf as rdf
+
+from tests import factories
+from trove import digestive_tract
+from trove import models as trove_db
+from trove.vocab.namespaces import TROVE
+from trove.util.iris import get_sufficiently_unique_iri
+
+
+_BLARG = rdf.IriNamespace('https://blarg.example/')
+
+
+class TestDigestiveTractDerive(TestCase):
+ @classmethod
+ def setUpTestData(cls):
+ cls.focus_iri = _BLARG.this
+ _focus_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(cls.focus_iri)
+ _raw = factories.RawDatumFactory()
+ cls.indexcard = trove_db.Indexcard.objects.create(source_record_suid=_raw.suid)
+ cls.indexcard.focus_identifier_set.add(_focus_ident)
+ cls.latest_rdf = trove_db.LatestIndexcardRdf.objects.create(
+ indexcard=cls.indexcard,
+ from_raw_datum=_raw,
+ focus_iri=cls.focus_iri,
+ rdf_as_turtle='''@prefix blarg: <https://blarg.example/> .
+blarg:this
+ a blarg:Thing ;
+ blarg:like blarg:that .
+''',
+ )
+
+ def test_derive(self):
+ (_derived,) = digestive_tract.derive(self.indexcard)
+ self.assertEqual(_derived.upriver_indexcard_id, self.indexcard.id)
+ self.assertEqual(_derived.deriver_identifier.sufficiently_unique_iri, get_sufficiently_unique_iri(TROVE['derive/osfmap_json']))
+ self.assertEqual(json.loads(_derived.derived_text), {
+ '@id': _BLARG.this,
+ 'resourceType': [{'@id': _BLARG.Thing}],
+ _BLARG.like: [{'@id': _BLARG.that}],
+ })
+
+ def test_derive_with_supplementary(self):
+ _supp_raw = factories.RawDatumFactory(
+ suid=factories.SourceUniqueIdentifierFactory(is_supplementary=True),
+ )
+ trove_db.SupplementaryIndexcardRdf.objects.create(
+ indexcard=self.indexcard,
+ from_raw_datum=_supp_raw,
+ supplementary_suid=_supp_raw.suid,
+ focus_iri=self.focus_iri,
+ rdf_as_turtle='''@prefix blarg: <https://blarg.example/> .
+blarg:this blarg:unlike blarg:nonthing .
+''',
+ )
+ (_derived,) = digestive_tract.derive(self.indexcard)
+ self.assertEqual(_derived.upriver_indexcard_id, self.indexcard.id)
+ self.assertEqual(_derived.deriver_identifier.sufficiently_unique_iri, get_sufficiently_unique_iri(TROVE['derive/osfmap_json']))
+ self.assertEqual(json.loads(_derived.derived_text), {
+ '@id': _BLARG.this,
+ 'resourceType': [{'@id': _BLARG.Thing}],
+ _BLARG.like: [{'@id': _BLARG.that}],
+ _BLARG.unlike: [{'@id': _BLARG.nonthing}],
+ })
diff --git a/tests/trove/digestive_tract/test_extract.py b/tests/trove/digestive_tract/test_extract.py
new file mode 100644
index 000000000..a8f19c7ab
--- /dev/null
+++ b/tests/trove/digestive_tract/test_extract.py
@@ -0,0 +1,91 @@
+from django.test import TestCase
+from primitive_metadata import primitive_rdf as rdf
+
+from tests import factories
+from trove import digestive_tract
+from trove import models as trove_db
+
+
+_BLARG = rdf.IriNamespace('https://blarg.example/')
+
+
+class TestDigestiveTractExtract(TestCase):
+ @classmethod
+ def setUpTestData(cls):
+ _focus_ident = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_BLARG.this)
+ cls.raw = factories.RawDatumFactory(
+ mediatype='text/turtle',
+ datum='''@prefix blarg: <https://blarg.example/> .
+blarg:this
+ a blarg:Thing ;
+ blarg:like blarg:that .
+''',
+ suid__focus_identifier=_focus_ident,
+ )
+ cls.supplementary_raw = factories.RawDatumFactory(
+ mediatype='text/turtle',
+ datum='''@prefix blarg: <https://blarg.example/> .
+blarg:this blarg:like blarg:another ;
+ blarg:unlike blarg:nonthing .
+''',
+ suid=factories.SourceUniqueIdentifierFactory(
+ source_config=cls.raw.suid.source_config,
+ focus_identifier=cls.raw.suid.focus_identifier,
+ is_supplementary=True,
+ ),
+ )
+
+ def test_setup(self):
+ self.assertEqual(trove_db.Indexcard.objects.all().count(), 0)
+ self.assertEqual(trove_db.LatestIndexcardRdf.objects.all().count(), 0)
+ self.assertEqual(trove_db.ArchivedIndexcardRdf.objects.all().count(), 0)
+ self.assertEqual(trove_db.SupplementaryIndexcardRdf.objects.all().count(), 0)
+
+ def test_extract(self):
+ (_indexcard,) = digestive_tract.extract(self.raw)
+ self.assertEqual(_indexcard.source_record_suid_id, self.raw.suid_id)
+ _focus_idents = list(
+ _indexcard.focus_identifier_set.values_list('sufficiently_unique_iri', flat=True),
+ )
+ self.assertEqual(_focus_idents, ['://blarg.example/this'])
+ _focustype_idents = list(
+ _indexcard.focustype_identifier_set.values_list('sufficiently_unique_iri', flat=True),
+ )
+ self.assertEqual(_focustype_idents, ['://blarg.example/Thing'])
+ self.assertEqual(list(_indexcard.supplementary_rdf_set.all()), [])
+ _latest_rdf = _indexcard.latest_rdf
+ self.assertEqual(_latest_rdf.from_raw_datum_id, self.raw.id)
+ self.assertEqual(_latest_rdf.indexcard_id, _indexcard.id)
+ self.assertEqual(_latest_rdf.focus_iri, _BLARG.this)
+ self.assertEqual(_latest_rdf.as_rdf_tripledict(), {
+ _BLARG.this: {
+ rdf.RDF.type: {_BLARG.Thing},
+ _BLARG.like: {_BLARG.that},
+ },
+ })
+
+ def test_extract_supplementary_without_prior(self):
+ _cards = digestive_tract.extract(self.supplementary_raw)
+ self.assertEqual(_cards, [])
+ self.assertEqual(trove_db.Indexcard.objects.all().count(), 0)
+ self.assertEqual(trove_db.LatestIndexcardRdf.objects.all().count(), 0)
+ self.assertEqual(trove_db.ArchivedIndexcardRdf.objects.all().count(), 0)
+ self.assertEqual(trove_db.SupplementaryIndexcardRdf.objects.all().count(), 0)
+
+ def test_extract_supplementary(self):
+ (_orig_indexcard,) = digestive_tract.extract(self.raw)
+ _orig_timestamp = _orig_indexcard.latest_rdf.modified
+ (_indexcard,) = digestive_tract.extract(self.supplementary_raw)
+ self.assertEqual(_orig_indexcard.id, _indexcard.id)
+ self.assertEqual(_indexcard.source_record_suid_id, self.raw.suid_id)
+ (_supp_rdf,) = _indexcard.supplementary_rdf_set.all()
+ self.assertEqual(_supp_rdf.from_raw_datum_id, self.supplementary_raw.id)
+ self.assertEqual(_supp_rdf.indexcard_id, _indexcard.id)
+ self.assertEqual(_supp_rdf.focus_iri, _BLARG.this)
+ self.assertEqual(_supp_rdf.as_rdf_tripledict(), {
+ _BLARG.this: {
+ _BLARG.like: {_BLARG.another},
+ _BLARG.unlike: {_BLARG.nonthing},
+ },
+ })
+ self.assertEqual(_indexcard.latest_rdf.modified, _orig_timestamp)
diff --git a/tests/trove/digestive_tract/test_swallow.py b/tests/trove/digestive_tract/test_swallow.py
new file mode 100644
index 000000000..62a81309e
--- /dev/null
+++ b/tests/trove/digestive_tract/test_swallow.py
@@ -0,0 +1,77 @@
+from unittest import mock
+from django.test import TestCase
+
+from tests import factories
+from trove import digestive_tract
+from share import models as share_db
+
+
+class TestDigestiveTractSwallow(TestCase):
+ @classmethod
+ def setUpTestData(cls):
+ cls.user = factories.ShareUserFactory()
+ cls.turtle = '''
+@prefix blarg: <https://blarg.example/> .
+blarg:this
+ a blarg:Thing ;
+ blarg:like blarg:that .
+'''
+
+ def test_setup(self):
+ self.assertEqual(share_db.RawDatum.objects.all().count(), 0)
+
+ def test_swallow(self):
+ with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task:
+ digestive_tract.swallow(
+ from_user=self.user,
+ record=self.turtle,
+ record_identifier='blarg',
+ record_mediatype='text/turtle',
+ focus_iri='https://blarg.example/this',
+ )
+ (_raw,) = share_db.RawDatum.objects.all()
+ self.assertEqual(_raw.datum, self.turtle)
+ self.assertEqual(_raw.mediatype, 'text/turtle')
+ self.assertEqual(_raw.suid.identifier, 'blarg')
+ self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/this')
+ self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id)
+ self.assertFalse(_raw.suid.is_supplementary)
+ _mock_task.delay.assert_called_once_with(_raw.id, urgent=False)
+
+ def test_swallow_urgent(self):
+ with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task:
+ digestive_tract.swallow(
+ from_user=self.user,
+ record=self.turtle,
+ record_identifier='blarg',
+ record_mediatype='text/turtle',
+ focus_iri='https://blarg.example/this',
+ urgent=True
+ )
+ (_raw,) = share_db.RawDatum.objects.all()
+ self.assertEqual(_raw.datum, self.turtle)
+ self.assertEqual(_raw.mediatype, 'text/turtle')
+ self.assertEqual(_raw.suid.identifier, 'blarg')
+ self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/this')
+ self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id)
+ self.assertFalse(_raw.suid.is_supplementary)
+ _mock_task.delay.assert_called_once_with(_raw.id, urgent=True)
+
+ def test_swallow_supplementary(self):
+ with mock.patch('trove.digestive_tract.task__extract_and_derive') as _mock_task:
+ digestive_tract.swallow(
+ from_user=self.user,
+ record=self.turtle,
+ record_identifier='blarg',
+ record_mediatype='text/turtle',
+ focus_iri='https://blarg.example/this',
+ is_supplementary=True,
+ )
+ (_raw,) = share_db.RawDatum.objects.all()
+ self.assertEqual(_raw.datum, self.turtle)
+ self.assertEqual(_raw.mediatype, 'text/turtle')
+ self.assertEqual(_raw.suid.identifier, 'blarg')
+ self.assertEqual(_raw.suid.focus_identifier.sufficiently_unique_iri, '://blarg.example/this')
+ self.assertEqual(_raw.suid.source_config.source.user_id, self.user.id)
+ self.assertTrue(_raw.suid.is_supplementary)
+ _mock_task.delay.assert_called_once_with(_raw.id, urgent=False)
diff --git a/tests/trove/views/test_ingest.py b/tests/trove/views/test_ingest.py
index f7fb49f74..2b828f399 100644
--- a/tests/trove/views/test_ingest.py
+++ b/tests/trove/views/test_ingest.py
@@ -1,6 +1,139 @@
+from http import HTTPStatus
+from unittest import mock
+from urllib.parse import urlencode
+
from django.test import TestCase
+from tests import factories
+
class TestIngest(TestCase):
- def test_simple_ingest(self):
- pass # TODO
+ @classmethod
+ def setUpTestData(cls):
+ cls.user = factories.ShareUserFactory(is_trusted=True)
+
+ def test_post(self):
+ with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract:
+ _resp = self.client.post(
+ '/trove/ingest?' + urlencode({
+ 'focus_iri': 'https://foo.example/blarg',
+ 'record_identifier': 'blarg',
+ }),
+ content_type='text/turtle',
+ data='turtleturtleturtle',
+ HTTP_AUTHORIZATION=self.user.authorization(),
+ )
+ self.assertEqual(_resp.status_code, HTTPStatus.CREATED)
+ _mock_tract.swallow.assert_called_once_with(
+ from_user=self.user,
+ record='turtleturtleturtle',
+ record_identifier='blarg',
+ record_mediatype='text/turtle',
+ focus_iri='https://foo.example/blarg',
+ urgent=True,
+ is_supplementary=False,
+ )
+
+ def test_post_nonurgent(self):
+ with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract:
+ _resp = self.client.post(
+ '/trove/ingest?' + urlencode({
+ 'focus_iri': 'https://foo.example/blarg',
+ 'record_identifier': 'blarg',
+ 'nonurgent': '',
+ }),
+ content_type='text/turtle',
+ data='turtleturtleturtle',
+ HTTP_AUTHORIZATION=self.user.authorization(),
+ )
+ self.assertEqual(_resp.status_code, HTTPStatus.CREATED)
+ _mock_tract.swallow.assert_called_once_with(
+ from_user=self.user,
+ record='turtleturtleturtle',
+ record_identifier='blarg',
+ record_mediatype='text/turtle',
+ focus_iri='https://foo.example/blarg',
+ urgent=False,
+ is_supplementary=False,
+ )
+
+ def test_post_supplementary(self):
+ with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract:
+ _resp = self.client.post(
+ '/trove/ingest?' + urlencode({
+ 'focus_iri': 'https://foo.example/blarg',
+ 'record_identifier': 'blarg',
+ 'is_supplementary': '',
+ }),
+ content_type='text/turtle',
+ data='turtleturtleturtle',
+ HTTP_AUTHORIZATION=self.user.authorization(),
+ )
+ self.assertEqual(_resp.status_code, HTTPStatus.CREATED)
+ _mock_tract.swallow.assert_called_once_with(
+ from_user=self.user,
+ record='turtleturtleturtle',
+ record_identifier='blarg',
+ record_mediatype='text/turtle',
+ focus_iri='https://foo.example/blarg',
+ urgent=True,
+ is_supplementary=True,
+ )
+
+ def test_delete(self):
+ with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract:
+ _resp = self.client.delete(
+ '/trove/ingest?record_identifier=blarg',
+ HTTP_AUTHORIZATION=self.user.authorization(),
+ )
+ self.assertEqual(_resp.status_code, HTTPStatus.OK)
+ _mock_tract.expel.assert_called_once_with(
+ from_user=self.user,
+ record_identifier='blarg',
+ )
+
+ def test_anonymous_post(self):
+ with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract:
+ _resp = self.client.post(
+ '/trove/ingest?' + urlencode({
+ 'focus_iri': 'https://foo.example/blarg',
+ 'record_identifier': 'blarg',
+ 'is_supplementary': '',
+ }),
+ content_type='text/turtle',
+ data='turtleturtleturtle',
+ )
+ self.assertEqual(_resp.status_code, HTTPStatus.UNAUTHORIZED)
+ self.assertFalse(_mock_tract.swallow.called)
+
+ def test_nontrusted_post(self):
+ _nontrusted_user = factories.ShareUserFactory()
+ with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract:
+ _resp = self.client.post(
+ '/trove/ingest?' + urlencode({
+ 'focus_iri': 'https://foo.example/blarg',
+ 'record_identifier': 'blarg',
+ 'is_supplementary': '',
+ }),
+ content_type='text/turtle',
+ data='turtleturtleturtle',
+ HTTP_AUTHORIZATION=_nontrusted_user.authorization(),
+ )
+ self.assertEqual(_resp.status_code, HTTPStatus.FORBIDDEN)
+ self.assertFalse(_mock_tract.swallow.called)
+
+ def test_anonymous_delete(self):
+ with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract:
+ _resp = self.client.delete('/trove/ingest?record_identifier=blarg')
+ self.assertEqual(_resp.status_code, HTTPStatus.UNAUTHORIZED)
+ self.assertFalse(_mock_tract.expel.called)
+
+ def test_nontrusted_delete(self):
+ _nontrusted_user = factories.ShareUserFactory()
+ with mock.patch('trove.views.ingest.digestive_tract') as _mock_tract:
+ _resp = self.client.delete(
+ '/trove/ingest?record_identifier=blarg',
+ HTTP_AUTHORIZATION=_nontrusted_user.authorization(),
+ )
+ self.assertEqual(_resp.status_code, HTTPStatus.FORBIDDEN)
+ self.assertFalse(_mock_tract.expel.called)
diff --git a/trove/derive/_base.py b/trove/derive/_base.py
index 823f494e9..a16dc8fe0 100644
--- a/trove/derive/_base.py
+++ b/trove/derive/_base.py
@@ -1,4 +1,5 @@
import abc
+from collections.abc import Iterable
from primitive_metadata import primitive_rdf
@@ -10,10 +11,16 @@ class IndexcardDeriver(abc.ABC):
focus_iri: str
data: primitive_rdf.RdfGraph
- def __init__(self, upriver_rdf: IndexcardRdf):
+ def __init__(
+ self,
+ upriver_rdf: IndexcardRdf,
+ supplementary_rdf_set: Iterable[IndexcardRdf] = (),
+ ):
self.upriver_rdf = upriver_rdf
self.focus_iri = upriver_rdf.focus_iri
self.data = primitive_rdf.RdfGraph(upriver_rdf.as_rdf_tripledict())
+ for _supplementary_rdf in supplementary_rdf_set:
+ self.data.add_tripledict(_supplementary_rdf.as_rdf_tripledict())
def q(self, pathset):
# convenience for querying self.data on self.focus_iri
diff --git a/trove/digestive_tract.py b/trove/digestive_tract.py
index 402ed4ac1..2c3e21397 100644
--- a/trove/digestive_tract.py
+++ b/trove/digestive_tract.py
@@ -40,6 +40,7 @@ def swallow(
focus_iri: str,
datestamp=None, # default "now"
urgent=False,
+ is_supplementary=False,
):
'''swallow: store a given record by checksum; queue for extraction
@@ -55,7 +56,12 @@ def swallow(
_suid, _suid_created = share_db.SourceUniqueIdentifier.objects.get_or_create(
source_config=_source_config,
identifier=record_identifier,
+ defaults={
+ 'is_supplementary': is_supplementary,
+ },
)
+ if bool(_suid.is_supplementary) != is_supplementary:
+ raise DigestiveError(f'suid is_supplementary should not change! suid={_suid}, is_supplementary changed from {bool(_suid.is_supplementary)} to {is_supplementary}')
_focus_identifier = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri)
if _suid.focus_identifier is None:
_suid.focus_identifier = _focus_identifier
@@ -104,11 +110,12 @@ def swallow__sharev2_legacy(
def extract(raw: share_db.RawDatum, *, undelete_indexcards=False) -> list[trove_db.Indexcard]:
'''extract: gather rdf graph from a record; store as index card(s)
- will create (or update):
+ may create (or update):
ResourceIdentifier (for each described resource and its types)
Indexcard (with identifiers and type-identifiers for each described resource)
- ArchivedIndexcardRdf (all extracted metadata)
- LatestIndexcardRdf (all extracted metadata, if latest raw)
+ ArchivedIndexcardRdf (all extracted metadata, if non-supplementary)
+ LatestIndexcardRdf (all extracted metadata, if latest raw and non-supplementary)
+ SupplementaryIndexcardRdf (all extracted metadata, if supplementary)
may delete:
LatestIndexcardRdf (previously extracted from the record, but no longer present)
'''
@@ -137,6 +144,11 @@ def extract(raw: share_db.RawDatum, *, undelete_indexcards=False) -> list[trove_
(_iri, RDFS.isDefinedBy, _focus_iri),
)
_tripledicts_by_focus_iri[_iri] = _term_tripledict
+ if raw.suid.is_supplementary:
+ return trove_db.Indexcard.objects.supplement_indexcards_from_tripledicts(
+ from_raw_datum=raw,
+ rdf_tripledicts_by_focus_iri=_tripledicts_by_focus_iri,
+ )
return trove_db.Indexcard.objects.save_indexcards_from_tripledicts(
from_raw_datum=raw,
rdf_tripledicts_by_focus_iri=_tripledicts_by_focus_iri,
@@ -150,10 +162,18 @@ def derive(indexcard: trove_db.Indexcard, deriver_iris=None):
will create, update, or delete:
DerivedIndexcard
'''
- if indexcard.deleted or not indexcard.latest_rdf:
- return
+ if indexcard.deleted:
+ return []
+ try:
+ _latest_rdf = indexcard.latest_rdf
+ except trove_db.LatestIndexcardRdf.DoesNotExist:
+ return []
+ _derived_list = []
for _deriver_class in get_deriver_classes(deriver_iris):
- _deriver = _deriver_class(upriver_rdf=indexcard.latest_rdf)
+ _deriver = _deriver_class(
+ upriver_rdf=_latest_rdf,
+ supplementary_rdf_set=indexcard.supplementary_rdf_set.all(),
+ )
_deriver_identifier = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_deriver.deriver_iri())
if _deriver.should_skip():
trove_db.DerivedIndexcard.objects.filter(
@@ -163,14 +183,16 @@ def derive(indexcard: trove_db.Indexcard, deriver_iris=None):
else:
_derived_text = _deriver.derive_card_as_text()
_derived_checksum_iri = ChecksumIri.digest('sha-256', salt='', raw_data=_derived_text)
- trove_db.DerivedIndexcard.objects.update_or_create(
+ _derived, _ = trove_db.DerivedIndexcard.objects.update_or_create(
upriver_indexcard=indexcard,
deriver_identifier=_deriver_identifier,
defaults={
- 'derived_text': _deriver.derive_card_as_text(),
+ 'derived_text': _derived_text,
'derived_checksum_iri': _derived_checksum_iri,
},
)
+ _derived_list.append(_derived)
+ return _derived_list
def expel(from_user: share_db.ShareUser, record_identifier: str):
@@ -178,6 +200,11 @@ def expel(from_user: share_db.ShareUser, record_identifier: str):
source_config__source__user=from_user,
identifier=record_identifier,
)
+ (
+ trove_db.SupplementaryIndexcardRdf.objects
+ .filter(supplementary_suid__in=_suid_qs)
+ .delete()
+ )
for _indexcard in trove_db.Indexcard.objects.filter(source_record_suid__in=_suid_qs):
_indexcard.pls_delete()
@@ -247,6 +274,9 @@ def task__schedule_all_for_deriver(deriver_iri: str, notify_index=False):
def _sharev2_legacy_ingest(raw, urgent: bool):
assert raw.mediatype is None, 'raw datum has a mediatype -- did you mean to call non-legacy extract?'
_extractor = get_rdf_extractor_class(None)(raw.suid.source_config)
+ if typing.TYPE_CHECKING:
+ from trove.extract.legacy_sharev2 import LegacySharev2Extractor
+ assert isinstance(_extractor, LegacySharev2Extractor)
_sharev2graph = _extractor.extract_sharev2_graph(raw.datum)
_centralnode = _sharev2graph.get_central_node(guess=True)
_normd = share_db.NormalizedData.objects.create(
diff --git a/trove/migrations/0006_supplementary_indexcard_rdf.py b/trove/migrations/0006_supplementary_indexcard_rdf.py
new file mode 100644
index 000000000..1dbf504ab
--- /dev/null
+++ b/trove/migrations/0006_supplementary_indexcard_rdf.py
@@ -0,0 +1,33 @@
+# Generated by Django 3.2.25 on 2024-09-19 20:33
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('share', '0074_sourceuniqueidentifier_is_supplementary'),
+ ('trove', '0005_indexes_for_oaipmh'),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name='SupplementaryIndexcardRdf',
+ fields=[
+ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('created', models.DateTimeField(auto_now_add=True)),
+ ('modified', models.DateTimeField(auto_now=True)),
+ ('turtle_checksum_iri', models.TextField(db_index=True)),
+ ('focus_iri', models.TextField()),
+ ('rdf_as_turtle', models.TextField()),
+ ('from_raw_datum', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='+', to='share.rawdatum')),
+ ('indexcard', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='trove_supplementaryindexcardrdf_set', to='trove.indexcard')),
+ ('supplementary_suid', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='supplementary_rdf_set', to='share.sourceuniqueidentifier')),
+ ],
+ ),
+ migrations.AddConstraint(
+ model_name='supplementaryindexcardrdf',
+ constraint=models.UniqueConstraint(fields=('indexcard', 'supplementary_suid'), name='trove_supplementaryindexcardrdf_uniq_supplement'),
+ ),
+ ]
diff --git a/trove/models/__init__.py b/trove/models/__init__.py
index 42bc2d0ca..acaadd7c4 100644
--- a/trove/models/__init__.py
+++ b/trove/models/__init__.py
@@ -4,7 +4,15 @@
'IndexcardRdf',
'LatestIndexcardRdf',
'ArchivedIndexcardRdf',
+ 'SupplementaryIndexcardRdf',
'DerivedIndexcard',
)
-from .indexcard import Indexcard, IndexcardRdf, LatestIndexcardRdf, ArchivedIndexcardRdf, DerivedIndexcard
+from .indexcard import (
+ ArchivedIndexcardRdf,
+ DerivedIndexcard,
+ Indexcard,
+ IndexcardRdf,
+ LatestIndexcardRdf,
+ SupplementaryIndexcardRdf,
+)
from .resource_identifier import ResourceIdentifier
diff --git a/trove/models/indexcard.py b/trove/models/indexcard.py
index b7a411ffa..35b11fa27 100644
--- a/trove/models/indexcard.py
+++ b/trove/models/indexcard.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from __future__ import annotations
import uuid
from django.db import models
@@ -27,6 +27,7 @@ def save_indexcards_from_tripledicts(
rdf_tripledicts_by_focus_iri: dict[str, rdf.RdfTripleDictionary],
undelete: bool = False,
) -> list['Indexcard']:
+ assert not from_raw_datum.suid.is_supplementary
from_raw_datum.no_output = (not rdf_tripledicts_by_focus_iri)
from_raw_datum.save(update_fields=['no_output'])
_indexcards = []
@@ -55,6 +56,36 @@ def save_indexcards_from_tripledicts(
_indexcard_to_delete.pls_delete()
return _indexcards
+ @transaction.atomic
+ def supplement_indexcards_from_tripledicts(
+ self, *,
+ from_raw_datum: share_db.RawDatum,
+ rdf_tripledicts_by_focus_iri: dict[str, rdf.RdfTripleDictionary],
+ ) -> list[Indexcard]:
+ assert from_raw_datum.suid.is_supplementary
+ from_raw_datum.no_output = (not rdf_tripledicts_by_focus_iri)
+ from_raw_datum.save(update_fields=['no_output'])
+ if not from_raw_datum.is_latest():
+ return []
+ _indexcards = []
+ for _focus_iri, _tripledict in rdf_tripledicts_by_focus_iri.items():
+ _indexcards.extend(self.supplement_indexcards(
+ from_raw_datum=from_raw_datum,
+ rdf_tripledict=_tripledict,
+ focus_iri=_focus_iri,
+ ))
+ _seen_indexcard_ids = {_card.id for _card in _indexcards}
+ # supplementary data seen previously on this suid (but not this time) should be deleted
+ for _supplement_to_delete in (
+ SupplementaryIndexcardRdf.objects
+ .filter(supplementary_suid=from_raw_datum.suid)
+ .exclude(from_raw_datum=from_raw_datum)
+ ):
+ if _supplement_to_delete.indexcard_id not in _seen_indexcard_ids:
+ _indexcards.append(_supplement_to_delete.indexcard)
+ _supplement_to_delete.delete()
+ return _indexcards
+
@transaction.atomic
def save_indexcard_from_tripledict(
self, *,
@@ -63,8 +94,7 @@ def save_indexcard_from_tripledict(
focus_iri: str,
undelete: bool = False,
):
- if focus_iri not in rdf_tripledict:
- raise DigestiveError(f'expected {focus_iri} in {set(rdf_tripledict.keys())}')
+ assert not from_raw_datum.suid.is_supplementary
_focus_identifier_set = (
ResourceIdentifier.objects
.save_equivalent_identifier_set(rdf_tripledict, focus_iri)
@@ -84,14 +114,35 @@ def save_indexcard_from_tripledict(
_indexcard.save()
_indexcard.focus_identifier_set.set(_focus_identifier_set)
_indexcard.focustype_identifier_set.set(_focustype_identifier_set)
- IndexcardRdf.save_indexcard_rdf(
- indexcard=_indexcard,
+ _indexcard.update_rdf(
from_raw_datum=from_raw_datum,
rdf_tripledict=rdf_tripledict,
focus_iri=focus_iri,
)
return _indexcard
+ @transaction.atomic
+ def supplement_indexcards(
+ self, *,
+ from_raw_datum: share_db.RawDatum,
+ rdf_tripledict: rdf.RdfTripleDictionary,
+ focus_iri: str,
+ ) -> list[Indexcard]:
+ assert from_raw_datum.suid.is_supplementary
+ # supplement indexcards with the same focus from the same source_config
+ # (if none exist, fine, nothing gets supplemented)
+ _indexcards = list(Indexcard.objects.filter(
+ source_record_suid__source_config_id=from_raw_datum.suid.source_config_id,
+ focus_identifier_set__in=ResourceIdentifier.objects.queryset_for_iri(focus_iri),
+ ))
+ for _indexcard in _indexcards:
+ _indexcard.update_supplementary_rdf(
+ from_raw_datum=from_raw_datum,
+ rdf_tripledict=rdf_tripledict,
+ focus_iri=focus_iri,
+ )
+ return _indexcards
+
class Indexcard(models.Model):
objects = IndexcardManager()
@@ -127,10 +178,10 @@ class Meta:
]
@property
- def latest_rdf(self) -> Optional['LatestIndexcardRdf']:
+ def latest_rdf(self) -> LatestIndexcardRdf:
'''convenience for the "other side" of LatestIndexcardRdf.indexcard
'''
- return self.trove_latestindexcardrdf_set.first()
+ return self.trove_latestindexcardrdf_set.get() # may raise DoesNotExist
@property
def archived_rdf_set(self):
@@ -140,6 +191,14 @@ def archived_rdf_set(self):
'''
return self.trove_archivedindexcardrdf_set
+ @property
+ def supplementary_rdf_set(self):
+ '''convenience for the "other side" of SupplementaryIndexcardRdf.indexcard
+
+ returns a RelatedManager
+ '''
+ return self.trove_supplementaryindexcardrdf_set
+
def get_iri(self):
return trove_indexcard_iri(self.uuid)
@@ -166,6 +225,61 @@ def __repr__(self):
def __str__(self):
return repr(self)
+ @transaction.atomic
+ def update_rdf(
+ self,
+ from_raw_datum: share_db.RawDatum,
+ focus_iri: str,
+ rdf_tripledict: rdf.RdfTripleDictionary,
+ ) -> 'IndexcardRdf':
+ if focus_iri not in rdf_tripledict:
+ raise DigestiveError(f'expected {focus_iri} in {set(rdf_tripledict.keys())}')
+ _rdf_as_turtle, _turtle_checksum_iri = _turtlify(rdf_tripledict)
+ _archived, _archived_created = ArchivedIndexcardRdf.objects.get_or_create(
+ indexcard=self,
+ from_raw_datum=from_raw_datum,
+ turtle_checksum_iri=_turtle_checksum_iri,
+ defaults={
+ 'rdf_as_turtle': _rdf_as_turtle,
+ 'focus_iri': focus_iri,
+ },
+ )
+ if (not _archived_created) and (_archived.rdf_as_turtle != _rdf_as_turtle):
+ raise DigestiveError(f'hash collision? {_archived}\n===\n{_rdf_as_turtle}')
+ if not self.deleted and from_raw_datum.is_latest():
+ _latest_indexcard_rdf, _created = LatestIndexcardRdf.objects.update_or_create(
+ indexcard=self,
+ defaults={
+ 'from_raw_datum': from_raw_datum,
+ 'turtle_checksum_iri': _turtle_checksum_iri,
+ 'rdf_as_turtle': _rdf_as_turtle,
+ 'focus_iri': focus_iri,
+ },
+ )
+ return _latest_indexcard_rdf
+ return _archived
+
+ def update_supplementary_rdf(
+ self,
+ from_raw_datum: share_db.RawDatum,
+ focus_iri: str,
+ rdf_tripledict: rdf.RdfTripleDictionary,
+ ) -> SupplementaryIndexcardRdf:
+ if focus_iri not in rdf_tripledict:
+ raise DigestiveError(f'expected {focus_iri} in {set(rdf_tripledict.keys())}')
+ _rdf_as_turtle, _turtle_checksum_iri = _turtlify(rdf_tripledict)
+ _supplement_rdf, _ = SupplementaryIndexcardRdf.objects.update_or_create(
+ indexcard=self,
+ supplementary_suid=from_raw_datum.suid,
+ defaults={
+ 'from_raw_datum': from_raw_datum,
+ 'turtle_checksum_iri': _turtle_checksum_iri,
+ 'rdf_as_turtle': _rdf_as_turtle,
+ 'focus_iri': focus_iri,
+ },
+ )
+ return _supplement_rdf
+
class IndexcardRdf(models.Model):
# auto:
@@ -205,44 +319,6 @@ def __repr__(self):
def __str__(self):
return repr(self)
- @transaction.atomic
- @staticmethod
- def save_indexcard_rdf(
- indexcard: Indexcard,
- from_raw_datum: share_db.RawDatum,
- rdf_tripledict: rdf.RdfTripleDictionary,
- focus_iri: str,
- ) -> 'IndexcardRdf':
- if focus_iri not in rdf_tripledict:
- raise DigestiveError(f'expected {focus_iri} in {set(rdf_tripledict.keys())}')
- _rdf_as_turtle = rdf.turtle_from_tripledict(rdf_tripledict)
- _turtle_checksum_iri = str(
- ChecksumIri.digest('sha-256', salt='', raw_data=_rdf_as_turtle),
- )
- _archived, _archived_created = ArchivedIndexcardRdf.objects.get_or_create(
- indexcard=indexcard,
- from_raw_datum=from_raw_datum,
- turtle_checksum_iri=_turtle_checksum_iri,
- defaults={
- 'rdf_as_turtle': _rdf_as_turtle,
- 'focus_iri': focus_iri,
- },
- )
- if (not _archived_created) and (_archived.rdf_as_turtle != _rdf_as_turtle):
- raise DigestiveError(f'hash collision? {_archived}\n===\n{_rdf_as_turtle}')
- if not indexcard.deleted and from_raw_datum.is_latest():
- _latest_indexcard_rdf, _created = LatestIndexcardRdf.objects.update_or_create(
- indexcard=indexcard,
- defaults={
- 'from_raw_datum': from_raw_datum,
- 'turtle_checksum_iri': _turtle_checksum_iri,
- 'rdf_as_turtle': _rdf_as_turtle,
- 'focus_iri': focus_iri,
- },
- )
- return _latest_indexcard_rdf
- return _archived
-
class LatestIndexcardRdf(IndexcardRdf):
# just the most recent version of this indexcard
@@ -254,7 +330,7 @@ class Meta:
),
]
indexes = [
- models.Index(fields=('modified',)),
+ models.Index(fields=('modified',)), # for OAI-PMH selective harvest
]
@@ -269,6 +345,23 @@ class Meta:
]
+class SupplementaryIndexcardRdf(IndexcardRdf):
+ # supplementary (non-descriptive) metadata from the same source -- only the most recent per (indexcard, supplementary_suid) is kept
+ supplementary_suid = models.ForeignKey(
+ share_db.SourceUniqueIdentifier,
+ on_delete=models.CASCADE,  # deleting the suid deletes its supplementary rdf
+ related_name='supplementary_rdf_set',
+ )
+
+ class Meta:
+ constraints = [
+ models.UniqueConstraint(
+ fields=('indexcard', 'supplementary_suid'),  # at most one supplement per indexcard per suid
+ name='%(app_label)s_%(class)s_uniq_supplement',
+ ),
+ ]
+
+
class DerivedIndexcard(models.Model):
# auto:
created = models.DateTimeField(auto_now_add=True)
@@ -309,3 +402,15 @@ def as_rdf_literal(self) -> rdf.Literal:
self.derived_text,
datatype_iris=self.deriver_cls.derived_datatype_iris(),
)
+
+
+###
+# local helpers
+
+def _turtlify(rdf_tripledict: rdf.RdfTripleDictionary) -> tuple[str, str]:
+ '''return a pair: (turtle serialization of the given tripledict, sha-256 checksum iri of that serialization as a str)'''
+ _rdf_as_turtle = rdf.turtle_from_tripledict(rdf_tripledict)
+ _turtle_checksum_iri = str(
+ ChecksumIri.digest('sha-256', salt='', raw_data=_rdf_as_turtle),  # unsalted digest of the turtle text
+ )
+ return (_rdf_as_turtle, _turtle_checksum_iri)
diff --git a/trove/views/ingest.py b/trove/views/ingest.py
index c213c9cf2..3f4dc654b 100644
--- a/trove/views/ingest.py
+++ b/trove/views/ingest.py
@@ -1,3 +1,4 @@
+from http import HTTPStatus
import logging
from django import http
@@ -17,16 +18,18 @@ def get(self, request):
def post(self, request):
# TODO: better error responses (jsonapi? shacl:ValidationReport?)
- # TODO: permissions, validate focus_iri domain with user Source?
+ # TODO: permissions by focus_iri domain (compare with user's Source)?
if not request.user.is_authenticated:
- return http.HttpResponse(status=401)
+ return http.HttpResponse(status=HTTPStatus.UNAUTHORIZED)
+ if not request.user.is_trusted:
+ return http.HttpResponse(status=HTTPStatus.FORBIDDEN)
# TODO: declare/validate params with dataclass
_focus_iri = request.GET.get('focus_iri')
if not _focus_iri:
- return http.HttpResponse('focus_iri queryparam required', status=400)
+ return http.HttpResponse('focus_iri queryparam required', status=HTTPStatus.BAD_REQUEST)
_record_identifier = request.GET.get('record_identifier')
if not _record_identifier:
- return http.HttpResponse('record_identifier queryparam required', status=400)
+ return http.HttpResponse('record_identifier queryparam required', status=HTTPStatus.BAD_REQUEST)
try:
digestive_tract.swallow(
from_user=request.user,
@@ -35,24 +38,27 @@ def post(self, request):
record_mediatype=request.content_type,
focus_iri=_focus_iri,
urgent=(request.GET.get('nonurgent') is None),
+ is_supplementary=(request.GET.get('is_supplementary') is not None),
)
except exceptions.IngestError as e:
logger.exception(str(e))
- return http.HttpResponse(str(e), status=400)
+ return http.HttpResponse(str(e), status=HTTPStatus.BAD_REQUEST)
else:
# TODO: include link to view status (return task id from `swallow`?)
- return http.HttpResponse(status=201)
+ return http.HttpResponse(status=HTTPStatus.CREATED)
def delete(self, request):
# TODO: cleaner permissions
if not request.user.is_authenticated:
- return http.HttpResponse(status=401)
+ return http.HttpResponse(status=HTTPStatus.UNAUTHORIZED)
+ if not request.user.is_trusted:
+ return http.HttpResponse(status=HTTPStatus.FORBIDDEN)
# TODO: declare/validate params with dataclass
_record_identifier = request.GET.get('record_identifier')
if not _record_identifier:
- return http.HttpResponse('record_identifier queryparam required', status=400)
+ return http.HttpResponse('record_identifier queryparam required', status=HTTPStatus.BAD_REQUEST)
digestive_tract.expel(
from_user=request.user,
record_identifier=_record_identifier,
)
- return http.HttpResponse(status=200)
+ return http.HttpResponse(status=HTTPStatus.OK)