Skip to content

Commit

Permalink
don't include id/doi in identifiers. datacite/lupo#498
Browse files Browse the repository at this point in the history
  • Loading branch information
Martin Fenner committed Jul 12, 2020
1 parent 0706dbd commit 8abcce5
Show file tree
Hide file tree
Showing 453 changed files with 7,253 additions and 8,494 deletions.
2 changes: 1 addition & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
PATH
remote: .
specs:
bolognese (1.6.12)
bolognese (1.7)
activesupport (>= 4.2.5)
benchmark_methods (~> 0.7)
bibtex-ruby (>= 5.1.0)
Expand Down
5 changes: 2 additions & 3 deletions lib/bolognese/datacite_utils.rb
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,10 @@ def insert_resource_type(xml)
end

def insert_alternate_identifiers(xml)
alternate_identifiers = Array.wrap(identifiers).select { |r| r["identifierType"] != "DOI" }
return xml unless alternate_identifiers.present?
return xml unless identifiers.present?

xml.alternateIdentifiers do
Array.wrap(alternate_identifiers).each do |alternate_identifier|
Array.wrap(identifiers).each do |alternate_identifier|
xml.alternateIdentifier(alternate_identifier["identifier"], 'alternateIdentifierType' => alternate_identifier["identifierType"])
end
end
Expand Down
2 changes: 1 addition & 1 deletion lib/bolognese/doi_utils.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def doi_resolver(doi, options = {})

def doi_api_url(doi, options = {})
sandbox = Array(/handle.test.datacite.org/.match(doi)).last
sandbox.present? || options[:sandbox] ? "https://api.test.datacite.org/dois/" + doi_from_url(doi) : "https://api.datacite.org/dois/" + doi_from_url(doi)
sandbox.present? || options[:sandbox] ? "https://api.test.datacite.org/dois/#{doi_from_url(doi)}?include=media,client" : "https://api.datacite.org/dois/#{doi_from_url(doi)}?include=media,client"
end

def normalize_doi(doi, options = {})
Expand Down
9 changes: 7 additions & 2 deletions lib/bolognese/metadata.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class Metadata

attr_accessor :string, :from, :sandbox, :meta, :regenerate, :issue, :show_errors
attr_reader :doc, :page_start, :page_end
attr_writer :id, :provider_id, :client_id, :doi, :identifiers, :creators, :contributors, :titles, :publisher,
attr_writer :id, :provider_id, :client_id, :doi, :identifiers, :alternate_identifiers, :creators, :contributors, :titles, :publisher,
:rights_list, :dates, :publication_year, :volume, :url, :version_info,
:subjects, :contributor, :descriptions, :language, :sizes,
:formats, :schema_version, :meta, :container, :agency,
Expand Down Expand Up @@ -99,6 +99,7 @@ def initialize(input: nil, from: nil, **options)
:titles,
:types,
:identifiers,
:alternate_identifiers,
:container,
:publisher,
:funding_references,
Expand Down Expand Up @@ -215,7 +216,11 @@ def publisher
end

def identifiers
@identifiers ||= meta.fetch("identifiers", nil)
@identifiers ||= Array.wrap(@alternate_identifiers).map { |a| { "identifierType" => a["alternateIdentifierType"], "identifier" => a["alternateIdentifier"] } }.presence || meta.fetch("identifiers", nil)
# (Array.wrap(@identifiers) +
# Array.wrap(@alternate_identifiers).select { |r| r["alternateIdentifierType"] != "DOI" }.map do |a|
# { "identifierType" => a["alternateIdentifierType"], "identifier" => a["alternateIdentifier"] }
# end).uniq ||= meta.fetch("identifiers", nil)
end

def content_url
Expand Down
1 change: 0 additions & 1 deletion lib/bolognese/readers/bibtex_reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ def read_bibtex(string: nil, **options)

{ "id" => normalize_doi(doi),
"types" => types,
"identifiers" => [{ "identifier" => normalize_doi(doi), "identifierType" => "DOI" }],
"doi" => doi,
"url" => meta.try(:url).to_s.presence,
"titles" => meta.try(:title).present? ? [{ "title" => meta.try(:title).to_s }] : [],
Expand Down
16 changes: 2 additions & 14 deletions lib/bolognese/readers/citeproc_reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -86,19 +86,8 @@ def read_citeproc(string: nil, **options)
nil
end

identifiers = [normalize_id(meta.fetch("id", nil)), normalize_doi(meta.fetch("DOI", nil))].compact.map do |r|
r = normalize_id(r)
id = normalize_id(meta.fetch("id", nil) || meta.fetch("DOI", nil))

if r.start_with?("https://doi.org")
{ "identifierType" => "DOI", "identifier" => r }
else
{ "identifierType" => "URL", "identifier" => r }
end
end.uniq

id = Array.wrap(identifiers).first.to_h.fetch("identifier", nil)
doi = Array.wrap(identifiers).find { |r| r["identifierType"] == "DOI" }.to_h.fetch("identifier", nil)

state = id.present? || read_options.present? ? "findable" : "not_found"
subjects = Array.wrap(meta.fetch("categories", nil)).reduce([]) do |sum, subject|
sum += name_to_fos(subject)
Expand All @@ -107,9 +96,8 @@ def read_citeproc(string: nil, **options)
end

{ "id" => id,
"identifiers" => identifiers,
"types" => types,
"doi" => doi_from_url(doi),
"doi" => doi_from_url(id),
"url" => normalize_id(meta.fetch("URL", nil)),
"titles" => [{ "title" => meta.fetch("title", nil) }],
"creators" => creators,
Expand Down
11 changes: 4 additions & 7 deletions lib/bolognese/readers/codemeta_reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,16 @@ def read_codemeta(string: nil, **options)

meta = string.present? ? Maremma.from_json(string) : {}

identifiers = ([meta.fetch("@id", nil)] + Array.wrap(meta.fetch("identifier", nil))).map do |r|
identifiers = Array.wrap(meta.fetch("identifier", nil)).map do |r|
r = normalize_id(r) if r.is_a?(String)
if r.is_a?(String) && r.start_with?("https://doi.org")
{ "identifierType" => "DOI", "identifier" => r }
elsif r.is_a?(String)
if r.is_a?(String) && !r.start_with?("https://doi.org")
{ "identifierType" => "URL", "identifier" => r }
elsif r.is_a?(Hash)
{ "identifierType" => get_identifier_type(r["propertyID"]), "identifier" => r["value"] }
end
end.compact.uniq

id = Array.wrap(identifiers).first.to_h.fetch("identifier", nil)
doi = Array.wrap(identifiers).find { |r| r["identifierType"] == "DOI" }.to_h.fetch("identifier", nil)
id = normalize_id(options[:doi] || meta.fetch("@id", nil) || meta.fetch("identifier", nil))

has_agents = meta.fetch("agents", nil)
authors = has_agents.nil? ? meta.fetch("authors", nil) : has_agents
Expand Down Expand Up @@ -70,7 +67,7 @@ def read_codemeta(string: nil, **options)
{ "id" => id,
"types" => types,
"identifiers" => identifiers,
"doi" => doi_from_url(doi),
"doi" => doi_from_url(id),
"url" => normalize_id(meta.fetch("codeRepository", nil)),
"titles" => titles,
"creators" => creators,
Expand Down
8 changes: 3 additions & 5 deletions lib/bolognese/readers/crossref_reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -150,14 +150,12 @@ def read_crossref(string: nil, **options)
"volume" => bibliographic_metadata.fetch("volume", nil) }.compact
end

identifiers = [{ "identifierType" => "DOI", "identifier" => normalize_doi(options[:doi] || options[:id] || bibliographic_metadata.dig("doi_data", "doi")) }, crossref_alternate_identifiers(bibliographic_metadata)].compact

id = Array.wrap(identifiers).first.to_h.fetch("identifier", nil)
doi = Array.wrap(identifiers).find { |r| r["identifierType"] == "DOI" }.to_h.fetch("identifier", nil)
id = normalize_doi(options[:doi] || options[:id] || bibliographic_metadata.dig("doi_data", "doi"))
identifiers = [crossref_alternate_identifiers(bibliographic_metadata)].compact

{ "id" => id,
"types" => types,
"doi" => doi_from_url(doi),
"doi" => doi_from_url(id),
"url" => parse_attributes(bibliographic_metadata.dig("doi_data", "resource"), first: true),
"titles" => titles,
"identifiers" => identifiers,
Expand Down
8 changes: 3 additions & 5 deletions lib/bolognese/readers/datacite_reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,12 @@ def read_datacite(string: nil, **options)
id = normalize_doi(meta.dig("identifier", "__content__") || options[:id], sandbox: options[:sandbox])
end

identifiers = [{ "identifierType" => "DOI", "identifier" => id }] + Array.wrap(meta.dig("alternateIdentifiers", "alternateIdentifier")).map do |r|
identifiers = Array.wrap(meta.dig("alternateIdentifiers", "alternateIdentifier")).map do |r|
if r["__content__"].present?
{ "identifierType" => get_identifier_type(r["alternateIdentifierType"]), "identifier" => r["__content__"] }
end
end.compact

doi = Array.wrap(identifiers).find { |r| r["identifierType"] == "DOI" }.to_h.fetch("identifier", nil)

resource_type_general = meta.dig("resourceType", "resourceTypeGeneral")
resource_type = meta.dig("resourceType", "__content__")
schema_org = Bolognese::Utils::CR_TO_SO_TRANSLATIONS[resource_type.to_s.underscore.camelcase] || Bolognese::Utils::DC_TO_SO_TRANSLATIONS[resource_type_general.to_s.dasherize] || "CreativeWork"
Expand Down Expand Up @@ -217,11 +215,11 @@ def read_datacite(string: nil, **options)
end
end.compact

state = doi.present? || read_options.present? ? "findable" : "not_found"
state = id.present? || read_options.present? ? "findable" : "not_found"

{ "id" => id,
"types" => types,
"doi" => doi_from_url(doi),
"doi" => doi_from_url(id),
"identifiers" => identifiers,
"url" => options.fetch(:url, nil).to_s.strip.presence,
"titles" => titles,
Expand Down
12 changes: 3 additions & 9 deletions lib/bolognese/readers/ris_reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,8 @@ def read_ris(string: nil, **options)
"ris" => ris_type
}.compact

identifiers = [normalize_doi(options[:doi]) || normalize_doi(meta.fetch("DO", nil))].map do |r|
{ "identifierType" => "DOI", "identifier" => normalize_id(r) }
end.compact

id = Array.wrap(identifiers).first.to_h.fetch("identifier", nil)
doi = Array.wrap(identifiers).find { |r| r["identifierType"] == "DOI" }.to_h.fetch("identifier", nil)

id = normalize_doi(options[:doi] || meta.fetch("DO", nil))

author = Array.wrap(meta.fetch("AU", nil)).map { |a| { "creatorName" => a } }
date_parts = meta.fetch("PY", nil).to_s.split("/")
created_date_parts = meta.fetch("Y1", nil).to_s.split("/")
Expand Down Expand Up @@ -90,8 +85,7 @@ def read_ris(string: nil, **options)

{ "id" => id,
"types" => types,
"identifiers" => identifiers,
"doi" => doi_from_url(doi),
"doi" => doi_from_url(id),
"url" => meta.fetch("UR", nil),
"titles" => meta.fetch("T1", nil).present? ? [{ "title" => meta.fetch("T1", nil) }] : nil,
"creators" => get_authors(author),
Expand Down
8 changes: 3 additions & 5 deletions lib/bolognese/readers/schema_org_reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,16 @@ def read_schema_org(string: nil, **options)

meta = string.present? ? Maremma.from_json(string) : {}

identifiers = ([options[:doi] || meta.fetch("@id", nil)] + Array.wrap(meta.fetch("identifier", nil))).map do |r|
identifiers = Array.wrap(meta.fetch("identifier", nil)).map do |r|
r = normalize_id(r) if r.is_a?(String)
if r.is_a?(String) && r.start_with?("https://doi.org")
{ "identifierType" => "DOI", "identifier" => r }
elsif r.is_a?(String)
if r.is_a?(String) && !r.start_with?("https://doi.org")
{ "identifierType" => "URL", "identifier" => r }
elsif r.is_a?(Hash)
{ "identifierType" => get_identifier_type(r["propertyID"]), "identifier" => r["value"] }
end
end.compact.uniq

id = Array.wrap(identifiers).first.to_h.fetch("identifier", nil)
id = normalize_id(options[:doi] || meta.fetch("@id", nil) || meta.fetch("identifier", nil))

schema_org = meta.fetch("@type", nil) && meta.fetch("@type").camelcase
resource_type_general = Bolognese::Utils::SO_TO_DC_TRANSLATIONS[schema_org]
Expand Down
2 changes: 1 addition & 1 deletion lib/bolognese/version.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module Bolognese
VERSION = "1.6.12"
VERSION = "1.7"
end
2 changes: 1 addition & 1 deletion spec/author_utils_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
subject = Bolognese::Metadata.new(input: input, from: "datacite")
meta = Maremma.from_xml(subject.raw).fetch("resource", {})
response = subject.get_one_author(meta.dig("creators", "creator").first)
expect(response).to eq("nameType"=>"Personal", "name"=>"Ollomo, Benjamin", "givenName"=>"Benjamin", "familyName"=>"Ollomo", "nameIdentifiers" => [], "affiliation" => [])
expect(response).to eq("nameType"=>"Personal", "name"=>"Ollomo, Benjamin", "givenName"=>"Benjamin", "familyName"=>"Ollomo", "nameIdentifiers" => [], "affiliation" => [{"affiliationIdentifier"=>"https://ror.org/01wyqb997", "affiliationIdentifierScheme"=>"ROR", "name"=>"Centre International de Recherches Médicales de Franceville"}])
end

it "has name in display-order" do
Expand Down
4 changes: 2 additions & 2 deletions spec/cli_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,12 @@
let(:input) { "10.5061/dryad.8515" }

it 'default' do
expect { subject.convert input }.to output(/Phylogeny, Malaria, Parasites, Taxonomy, Mitochondrial genome, Africa, Plasmodium/).to_stdout
expect { subject.convert input }.to output(/Plasmodium, malaria, taxonomy, mitochondrial genome, phylogeny, Parasites/).to_stdout
end

it 'to schema_org' do
subject.options = { to: "schema_org" }
expect { subject.convert input }.to output(/Phylogeny, Malaria, Parasites, Taxonomy, Mitochondrial genome, Africa, Plasmodium/).to_stdout
expect { subject.convert input }.to output(/Plasmodium, malaria, taxonomy, mitochondrial genome, phylogeny, Parasites/).to_stdout
end

it 'to bibtex' do
Expand Down
29 changes: 14 additions & 15 deletions spec/datacite_utils_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
it "insert" do
xml = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') { |xml| subject.insert_creators(xml) }.to_xml
response = Maremma.from_xml(xml)
expect(response.dig("creators", "creator").first).to eq("creatorName"=>{"__content__"=>"Ollomo, Benjamin", "nameType"=>"Personal"}, "familyName"=>"Ollomo", "givenName"=>"Benjamin")
expect(response.dig("creators", "creator").first).to eq("affiliation" => {"__content__"=>"Centre International de Recherches Médicales de Franceville", "affiliationIdentifier"=>"https://ror.org/01wyqb997", "affiliationIdentifierScheme"=>"ROR"}, "creatorName"=>{"__content__"=>"Ollomo, Benjamin", "nameType"=>"Personal"}, "familyName"=>"Ollomo", "givenName"=>"Benjamin")
end
end

Expand Down Expand Up @@ -61,7 +61,7 @@
it "insert" do
xml = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') { |xml| subject.insert_publisher(xml) }.to_xml
response = Maremma.from_xml(xml)
expect(response["publisher"]).to eq("Dryad Digital Repository")
expect(response["publisher"]).to eq("Dryad")
end
end

Expand All @@ -77,17 +77,17 @@
it "insert" do
xml = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') { |xml| subject.insert_resource_type(xml) }.to_xml
response = Maremma.from_xml(xml)
expect(response["resourceType"]).to eq("resourceTypeGeneral"=>"Dataset", "__content__"=>"DataPackage")
expect(response["resourceType"]).to eq("resourceTypeGeneral"=>"Dataset", "__content__"=>"dataset")
end
end

context "insert_alternate_identifiers" do
it "insert" do
xml = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') { |xml| subject.insert_alternate_identifiers(xml) }.to_xml
response = Maremma.from_xml(xml)
expect(response.dig("alternateIdentifiers", "alternateIdentifier")).to eq("alternateIdentifierType"=>"citation", "__content__"=>"Ollomo B, Durand P, Prugnolle F, Douzery EJP, Arnathau C, Nkoghe D, Leroy E, Renaud F (2009) A new malaria agent in African hominids. PLoS Pathogens 5(5): e1000446.")
end
end
# context "insert_alternate_identifiers" do
# it "insert" do
# xml = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') { |xml| subject.insert_alternate_identifiers(xml) }.to_xml
# response = Maremma.from_xml(xml)
# expect(response.dig("alternateIdentifiers", "alternateIdentifier")).to be_nil
# end
# end

context "insert_dates" do
it "insert" do
Expand All @@ -101,7 +101,7 @@
it "insert" do
xml = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') { |xml| subject.insert_subjects(xml) }.to_xml
response = Maremma.from_xml(xml)
expect(response.dig("subjects", "subject")).to eq(["Phylogeny", "Malaria", "Parasites", "Taxonomy", "Mitochondrial genome", "Africa", "Plasmodium"])
expect(response.dig("subjects", "subject")).to eq(["Plasmodium", "malaria", "taxonomy", "mitochondrial genome", "phylogeny", "Parasites"])
end
end

Expand Down Expand Up @@ -143,15 +143,14 @@

context "insert_related_identifiers" do
it "related_identifier" do
expect(subject.related_identifiers.length).to eq(6)
expect(subject.related_identifiers.first).to eq("relatedIdentifier"=>"10.5061/dryad.8515/1", "relatedIdentifierType"=>"DOI", "relationType"=>"HasPart")
expect(subject.related_identifiers.length).to eq(1)
expect(subject.related_identifiers.first).to eq("relatedIdentifier"=>"10.1371/journal.ppat.1000446", "relatedIdentifierType"=>"DOI", "relationType"=>"IsSupplementTo")
end

it "insert" do
xml = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') { |xml| subject.insert_related_identifiers(xml) }.to_xml
response = Maremma.from_xml(xml)
expect(response.dig("relatedIdentifiers", "relatedIdentifier").length).to eq(6)
expect(response.dig("relatedIdentifiers", "relatedIdentifier").first).to eq("__content__"=>"10.5061/dryad.8515/1", "relatedIdentifierType"=>"DOI", "relationType"=>"HasPart")
expect(response.dig("relatedIdentifiers", "relatedIdentifier")).to eq("__content__"=>"10.1371/journal.ppat.1000446", "relatedIdentifierType"=>"DOI", "relationType"=>"IsSupplementTo")
end
end

Expand Down
14 changes: 7 additions & 7 deletions spec/doi_utils_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,43 +55,43 @@
it "doi" do
doi = "10.5061/DRYAD.8515"
response = subject.doi_api_url(doi)
expect(response).to eq("https://api.datacite.org/dois/10.5061/dryad.8515")
expect(response).to eq("https://api.datacite.org/dois/10.5061/dryad.8515?include=media,client")
end

it "doi with protocol" do
doi = "doi:10.5061/DRYAD.8515"
response = subject.doi_api_url(doi)
expect(response).to eq("https://api.datacite.org/dois/10.5061/dryad.8515")
expect(response).to eq("https://api.datacite.org/dois/10.5061/dryad.8515?include=media,client")
end

it "https url" do
doi = "https://doi.org/10.5061/dryad.8515"
response = subject.doi_api_url(doi)
expect(response).to eq("https://api.datacite.org/dois/10.5061/dryad.8515")
expect(response).to eq("https://api.datacite.org/dois/10.5061/dryad.8515?include=media,client")
end

it "dx.doi.org url" do
doi = "http://dx.doi.org/10.5061/dryad.8515"
response = subject.doi_api_url(doi)
expect(response).to eq("https://api.datacite.org/dois/10.5061/dryad.8515")
expect(response).to eq("https://api.datacite.org/dois/10.5061/dryad.8515?include=media,client")
end

it "test resolver" do
doi = "https://handle.test.datacite.org/10.5061/dryad.8515"
response = subject.doi_api_url(doi)
expect(response).to eq("https://api.test.datacite.org/dois/10.5061/dryad.8515")
expect(response).to eq("https://api.test.datacite.org/dois/10.5061/dryad.8515?include=media,client")
end

it "test resolver http" do
doi = "http://handle.test.datacite.org/10.5061/dryad.8515"
response = subject.doi_api_url(doi)
expect(response).to eq("https://api.test.datacite.org/dois/10.5061/dryad.8515")
expect(response).to eq("https://api.test.datacite.org/dois/10.5061/dryad.8515?include=media,client")
end

it "force test resolver" do
doi = "https://doi.org/10.5061/dryad.8515"
response = subject.doi_api_url(doi, sandbox: true)
expect(response).to eq("https://api.test.datacite.org/dois/10.5061/dryad.8515")
expect(response).to eq("https://api.test.datacite.org/dois/10.5061/dryad.8515?include=media,client")
end
end

Expand Down
1 change: 0 additions & 1 deletion spec/fixtures/schema_org_gtex.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
"@type": "Dataset",
"@id": "https://doi.org/10.25491/d50j-3083",
"identifier": [
"https://doi.org/10.25491/d50j-3083",
{
"@type": "PropertyValue",
"propertyID": "md5",
Expand Down
Loading

0 comments on commit 8abcce5

Please sign in to comment.