Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENG-6265] supplementary metadata #825

Merged
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
supplementary metadata
aaxelb committed Sep 19, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 5a7dd8656182c466c3fc30ea6d3b9877e7517447
18 changes: 18 additions & 0 deletions share/migrations/0074_sourceuniqueidentifier_is_supplementary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 3.2.25 on 2024-09-19 20:33

from django.db import migrations, models


class Migration(migrations.Migration):
    # Schema migration for the `share` app: adds the `is_supplementary`
    # column to the `sourceuniqueidentifier` model.

    # applied on top of the previous `share` migration
    dependencies = [
        ('share', '0073_remove_indexbackfill_backfill_phase_index'),
    ]

    operations = [
        migrations.AddField(
            model_name='sourceuniqueidentifier',
            name='is_supplementary',
            # nullable and declared without a default, so rows that already
            # exist hold NULL rather than an explicit True/False
            field=models.BooleanField(null=True),
        ),
    ]
1 change: 1 addition & 0 deletions share/models/source_unique_identifier.py
Original file line number Diff line number Diff line change
@@ -14,6 +14,7 @@ class SourceUniqueIdentifier(models.Model):
identifier = models.TextField() # no restrictions on identifier format
source_config = models.ForeignKey('SourceConfig', on_delete=models.CASCADE)
focus_identifier = models.ForeignKey('trove.ResourceIdentifier', null=True, on_delete=models.PROTECT, related_name='suid_set')
is_supplementary = models.BooleanField(null=True)

class JSONAPIMeta(BaseJSONAPIMeta):
pass
9 changes: 8 additions & 1 deletion trove/derive/_base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import abc
from collections.abc import Iterable

from primitive_metadata import primitive_rdf

@@ -10,10 +11,16 @@ class IndexcardDeriver(abc.ABC):
focus_iri: str
data: primitive_rdf.RdfGraph

def __init__(self, upriver_rdf: IndexcardRdf):
def __init__(
    self,
    upriver_rdf: IndexcardRdf,
    supplementary_rdf_set: Iterable[IndexcardRdf] = (),
):
    '''set up a deriver for one index card's rdf

    upriver_rdf: the rdf to derive from; also supplies `focus_iri`
    supplementary_rdf_set: (optional) additional rdf whose triples are
        merged into the same graph as `upriver_rdf`
    '''
    self.upriver_rdf = upriver_rdf
    self.focus_iri = upriver_rdf.focus_iri
    # start from the upriver triples, then fold in each supplement
    _graph = primitive_rdf.RdfGraph(upriver_rdf.as_rdf_tripledict())
    for _supplement in supplementary_rdf_set:
        _graph.add_tripledict(_supplement.as_rdf_tripledict())
    self.data = _graph

def q(self, pathset):
# convenience for querying self.data on self.focus_iri
26 changes: 22 additions & 4 deletions trove/digestive_tract.py
Original file line number Diff line number Diff line change
@@ -40,6 +40,7 @@ def swallow(
focus_iri: str,
datestamp=None, # default "now"
urgent=False,
is_supplementary=False,
):
'''swallow: store a given record by checksum; queue for extraction

@@ -55,7 +56,12 @@ def swallow(
_suid, _suid_created = share_db.SourceUniqueIdentifier.objects.get_or_create(
source_config=_source_config,
identifier=record_identifier,
defaults={
'is_supplementary': is_supplementary,
},
)
if bool(_suid.is_supplementary) != is_supplementary:
raise DigestiveError(f'suid is_supplementary should not change! suid={_suid}, is_supplementary changed from {bool(_suid.is_supplementary)} to {is_supplementary}')
_focus_identifier = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri)
if _suid.focus_identifier is None:
_suid.focus_identifier = _focus_identifier
@@ -104,11 +110,12 @@ def swallow__sharev2_legacy(
def extract(raw: share_db.RawDatum, *, undelete_indexcards=False) -> list[trove_db.Indexcard]:
'''extract: gather rdf graph from a record; store as index card(s)

will create (or update):
may create (or update):
ResourceIdentifier (for each described resource and its types)
Indexcard (with identifiers and type-identifiers for each described resource)
ArchivedIndexcardRdf (all extracted metadata)
LatestIndexcardRdf (all extracted metadata, if latest raw)
ArchivedIndexcardRdf (all extracted metadata, if non-supplementary)
LatestIndexcardRdf (all extracted metadata, if latest raw and non-supplementary)
SupplementaryIndexcardRdf (all extracted metadata, if supplementary)
may delete:
LatestIndexcardRdf (previously extracted from the record, but no longer present)
'''
@@ -153,7 +160,10 @@ def derive(indexcard: trove_db.Indexcard, deriver_iris=None):
if indexcard.deleted or not indexcard.latest_rdf:
return
for _deriver_class in get_deriver_classes(deriver_iris):
_deriver = _deriver_class(upriver_rdf=indexcard.latest_rdf)
_deriver = _deriver_class(
upriver_rdf=indexcard.latest_rdf,
supplementary_rdf_set=indexcard.supplementary_rdf_set.all(),
)
_deriver_identifier = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_deriver.deriver_iri())
if _deriver.should_skip():
trove_db.DerivedIndexcard.objects.filter(
@@ -178,6 +188,11 @@ def expel(from_user: share_db.ShareUser, record_identifier: str):
source_config__source__user=from_user,
identifier=record_identifier,
)
(
trove_db.SupplementaryIndexcardRdf.objects
.filter(supplementary_suid__in=_suid_qs)
.delete()
)
for _indexcard in trove_db.Indexcard.objects.filter(source_record_suid__in=_suid_qs):
_indexcard.pls_delete()

@@ -247,6 +262,9 @@ def task__schedule_all_for_deriver(deriver_iri: str, notify_index=False):
def _sharev2_legacy_ingest(raw, urgent: bool):
assert raw.mediatype is None, 'raw datum has a mediatype -- did you mean to call non-legacy extract?'
_extractor = get_rdf_extractor_class(None)(raw.suid.source_config)
if typing.TYPE_CHECKING:
from trove.extract.legacy_sharev2 import LegacySharev2Extractor
assert isinstance(_extractor, LegacySharev2Extractor)
_sharev2graph = _extractor.extract_sharev2_graph(raw.datum)
_centralnode = _sharev2graph.get_central_node(guess=True)
_normd = share_db.NormalizedData.objects.create(
33 changes: 33 additions & 0 deletions trove/migrations/0006_supplementary_indexcard_rdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Generated by Django 3.2.25 on 2024-09-19 20:33

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Schema migration for the `trove` app: creates the
    # `SupplementaryIndexcardRdf` model and a uniqueness constraint on it.

    # applied after the `share` migration that adds `is_supplementary`
    # and after the previous `trove` migration
    dependencies = [
        ('share', '0074_sourceuniqueidentifier_is_supplementary'),
        ('trove', '0005_indexes_for_oaipmh'),
    ]

    operations = [
        migrations.CreateModel(
            name='SupplementaryIndexcardRdf',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # timestamps maintained by django (set on insert / on every save)
                ('created', models.DateTimeField(auto_now_add=True)),
                ('modified', models.DateTimeField(auto_now=True)),
                ('turtle_checksum_iri', models.TextField(db_index=True)),
                ('focus_iri', models.TextField()),
                ('rdf_as_turtle', models.TextField()),
                # related_name='+' means no reverse accessor is added on rawdatum
                ('from_raw_datum', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='+', to='share.rawdatum')),
                # NOTE(review): the reverse accessor on indexcard is
                # `trove_supplementaryindexcardrdf_set`; code that reads
                # `indexcard.supplementary_rdf_set` presumably relies on a
                # property defined on the Indexcard model -- confirm
                ('indexcard', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='trove_supplementaryindexcardrdf_set', to='trove.indexcard')),
                ('supplementary_suid', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='supplementary_rdf_set', to='share.sourceuniqueidentifier')),
            ],
        ),
        # at most one supplementary rdf row per (indexcard, supplementary_suid) pair
        migrations.AddConstraint(
            model_name='supplementaryindexcardrdf',
            constraint=models.UniqueConstraint(fields=('indexcard', 'supplementary_suid'), name='trove_supplementaryindexcardrdf_uniq_supplement'),
        ),
    ]
10 changes: 9 additions & 1 deletion trove/models/__init__.py
Original file line number Diff line number Diff line change
@@ -4,7 +4,15 @@
'IndexcardRdf',
'LatestIndexcardRdf',
'ArchivedIndexcardRdf',
'SupplementaryIndexcardRdf',
'DerivedIndexcard',
)
from .indexcard import Indexcard, IndexcardRdf, LatestIndexcardRdf, ArchivedIndexcardRdf, DerivedIndexcard
from .indexcard import (
ArchivedIndexcardRdf,
DerivedIndexcard,
Indexcard,
IndexcardRdf,
LatestIndexcardRdf,
SupplementaryIndexcardRdf,
)
from .resource_identifier import ResourceIdentifier
Loading