Skip to content

Commit

Permalink
backfill crossref dois for events. datacite/levriero#56
Browse files Browse the repository at this point in the history
  • Loading branch information
Martin Fenner committed Jun 17, 2019
1 parent 1f2da3f commit 4060e3e
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 2 deletions.
62 changes: 62 additions & 0 deletions app/jobs/crossref_doi_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Background job that registers a Crossref DOI with DataCite when it is not
# known there yet. The job looks the DOI up via Doi.find_by_id and, if nothing
# is found, sends a JSON-API PUT request for it using the client
# "crossref.citations".
class CrossrefDoiJob < ActiveJob::Base
  queue_as :lupo_background

  # retry_on ActiveRecord::Deadlocked, wait: 10.seconds, attempts: 3
  # retry_on Faraday::TimeoutError, wait: 10.minutes, attempts: 3

  # discard_on ActiveJob::DeserializationError

  # Accepts a bare DOI ("10.1234/abc"), a "doi:" prefixed DOI, or an http(s)
  # URL on doi.org / dx.doi.org / handle.test.datacite.org. The dots in the
  # host names are escaped so that they only match a literal ".".
  DOI_URL_PATTERN = /\A(?:(http|https):\/\/(dx\.)?(doi\.org|handle\.test\.datacite\.org)\/)?(doi:)?(10\.\d{4,5}\/.+)\z/.freeze

  # Registers the DOI identified by +id+ unless DataCite already has it.
  #
  # id - DOI, "doi:" string or DOI URL (see DOI_URL_PATTERN).
  #
  # Returns an empty Hash when +id+ is not a DOI or the DOI already exists,
  # otherwise the return value of the logger call for the outcome.
  def perform(id)
    logger = Logger.new(STDOUT)

    doi = doi_from_url(id)
    return {} unless doi.present?

    # check whether DOI has been registered with DataCite already
    result = Doi.find_by_id(doi).results.first
    return {} unless result.blank?

    # otherwise store Crossref metadata with DataCite
    # using client crossref.citations and DataCite XML.
    # The Base64-encoded id itself is sent as the "xml" attribute;
    # presumably the receiving API resolves the metadata from it (TODO confirm).
    attributes = {
      "xml" => Base64.strict_encode64(id),
      "source" => "levriero",
      "event" => "publish"
    }

    data = {
      "data" => {
        "type" => "dois",
        "attributes" => attributes,
        "relationships" => {
          "client" => {
            "data" => {
              "type" => "clients",
              "id" => "crossref.citations"
            }
          }
        }
      }
    }

    url = "http://localhost/dois/#{doi}"
    response = Maremma.put(url, accept: 'application/vnd.api+json',
                                content_type: 'application/vnd.api+json',
                                data: data.to_json,
                                username: ENV["ADMIN_USERNAME"],
                                password: ENV["ADMIN_PASSWORD"])

    if [200, 201].include?(response.status)
      logger.info "DOI #{doi} created."
    else
      logger.warn response.body["errors"]
    end
  end

  # Extracts the lower-cased DOI from a DOI, "doi:" string or DOI URL.
  #
  # Returns the DOI String (URI path without the leading slash), or nil when
  # +url+ does not match DOI_URL_PATTERN.
  def doi_from_url(url)
    return unless DOI_URL_PATTERN.match(url)

    uri = Addressable::URI.parse(url)
    uri.path.gsub(/^\//, '').downcase
  end
end
4 changes: 2 additions & 2 deletions app/jobs/url_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@ def perform(doi_id)
response = Doi.get_doi(doi: doi.doi)
url = response.body.dig('data', 'values', 0, 'data', 'value')
if url.present?
if (doi.is_registered_or_findable? || %w(europ ethz).include?(doi.provider_id)) && doi.minted.blank?
if (doi.is_registered_or_findable? || %w(europ ethz crossref).include?(doi.provider_id)) && doi.minted.blank?
doi.update_attributes(url: url, minted: Time.zone.now)
else
doi.update_attributes(url: url)
end

doi.update_attributes(aasm_state: "findable") if %w(europ ethz).include?(doi.provider_id)
doi.update_attributes(aasm_state: "findable") if %w(europ ethz crossref).include?(doi.provider_id)

doi.__elasticsearch__.index_document

Expand Down
4 changes: 4 additions & 0 deletions app/models/concerns/indexable.rb
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ def query(query, options={})
from = 0
search_after = [options.dig(:page, :cursor)]
sort = [{ _id: { order: 'asc' }}]
elsif self.name == "Event" && options.dig(:page, :cursor).present?
from = 0
search_after = [options.dig(:page, :cursor)]
sort = [{ _id: { order: 'asc' }}]
elsif self.name == "Activity" && options.dig(:page, :cursor).present?
from = 0
search_after = [options.dig(:page, :cursor)]
Expand Down
74 changes: 74 additions & 0 deletions app/models/event.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ class Event < ActiveRecord::Base
# include event processing
include Processable

# include helper methods for models
include Modelable

# include doi normalization
include Identifiable

Expand Down Expand Up @@ -340,6 +343,76 @@ def self.import_by_id(options={})
logger.info "[Elasticsearch] Imported #{count} events with IDs #{id} - #{(id + 499)}."
end

# Enqueues a CrossrefDoiJob for the subj_id of every event with source
# "crossref", paging through the results with a cursor.
#
# options - Hash; only :size (page size, default 1000) is read here.
#
# Returns the total reported by the last query response.
def self.update_crossref(options={})
  log = Logger.new(STDOUT)
  page_size = (options[:size] || 1000).to_i

  # probe query: fetch a single hit just to learn the total
  response = Event.query(nil, source_id: "crossref", page: { size: 1, cursor: 0 })
  log.info "[Update] #{response.results.total} events for source crossref."
  return response.results.total unless response.results.total > 0

  # walk through results using cursor
  cursor = 0
  until response.results.results.length.zero?
    response = Event.query(nil, source_id: "crossref", page: { size: page_size, cursor: cursor })
    hits = response.results.results
    break if hits.length.zero?

    log.info "[Update] Updating #{hits.length} crossref events starting with _id #{cursor + 1}."

    # the sort value of the last hit is the cursor for the next page
    cursor = response.results.to_a.last[:sort].first.to_i

    hits.each { |event| CrossrefDoiJob.perform_later(event.subj_id) }
  end

  response.results.total
end

# Enqueues a CrossrefDoiJob for the obj_id of every event with source
# "datacite_crossref", paging through the results with a cursor.
#
# options - Hash; only :size (page size, default 1000) is read here.
#
# Returns the total reported by the last query response.
def self.update_datacite_crossref(options={})
  logger = Logger.new(STDOUT)

  # single source of truth for the source id, so that the probe query,
  # the paging queries and the log messages cannot drift apart
  source_id = "datacite_crossref"
  size = (options[:size] || 1000).to_i

  # probe query: fetch a single hit just to learn the total
  response = Event.query(nil, source_id: source_id, page: { size: 1, cursor: 0 })
  logger.info "[Update] #{response.results.total} events for source #{source_id}."

  if response.results.total > 0
    # walk through results using cursor
    cursor = 0

    while response.results.results.length > 0 do
      # page through the same source as the probe query above
      response = Event.query(nil, source_id: source_id, page: { size: size, cursor: cursor })
      break unless response.results.results.length > 0

      logger.info "[Update] Updating #{response.results.results.length} #{source_id} events starting with _id #{cursor + 1}."

      # the sort value of the last hit is the cursor for the next page
      cursor = response.results.to_a.last[:sort].first.to_i

      response.results.results.each do |e|
        CrossrefDoiJob.perform_later(e.obj_id)
      end
    end
  end

  response.results.total
end

# Creates a Doi record for a Crossref DOI that DataCite does not know yet.
#
# id - DOI or DOI URL, passed through doi_from_url.
#
# Returns an empty Hash when no DOI can be extracted from +id+, nil when
# Doi.find_by_id already returns a result, otherwise the result of Doi.create.
def self.get_crossref_metadata(id)
  doi = doi_from_url(id)
  return {} if doi.blank?

  # check whether DOI has been registered with DataCite already
  existing = Doi.find_by_id(doi).results.first
  return if existing.present?

  # otherwise store Crossref metadata with DataCite
  # using client crossref.citations and DataCite XML
  encoded = Base64.strict_encode64(id)
  Doi.create(
    { xml: encoded, source: "levriero", event: "publish", client_id: "crossref.citations" },
    :without_protection => true
  )
end

# Returns the record's uuid as its parameter representation,
# replacing the default that would use the id.
def to_param # overridden, use uuid instead of id
uuid
end
Expand All @@ -356,6 +429,7 @@ def send_callback
"timestamp" => timestamp }}
Maremma.post(callback, data: data.to_json, token: ENV['API_KEY'])
end

def access_method
if relation_type_id.to_s =~ /(requests|investigations)/
relation_type_id.split("-").last if relation_type_id.present?
Expand Down
1 change: 1 addition & 0 deletions config/application.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
# Default values for environment-driven settings. `||=` only assigns when the
# variable is not already set, so values provided by the environment win.
ENV['SITE_TITLE'] ||= "DataCite REST API"
ENV['LOG_LEVEL'] ||= "info"
ENV['CONCURRENCY'] ||= "25"
ENV['API_URL'] ||= "https://api.test.datacite.org"
ENV['CDN_URL'] ||= "https://assets.datacite.org"
ENV['BRACCO_URL'] ||= "https://doi.datacite.org"
ENV['GITHUB_URL'] ||= "https://github.com/datacite/lupo"
Expand Down
20 changes: 20 additions & 0 deletions lib/tasks/event.rake
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,23 @@ namespace :event do
Event.import_by_ids(from_id: from_id, until_id: until_id)
end
end

namespace :crossref do
  desc 'Import dois for all events'
  task import_doi: :environment do
    # FROM_ID defaults to 1; UNTIL_ID defaults to the highest event id,
    # which is only looked up when the variable is not set.
    lower_bound = ENV.fetch('FROM_ID', 1).to_i
    upper_bound = ENV.fetch('UNTIL_ID') { Event.maximum(:id) }.to_i

    Event.update_crossref(from_id: lower_bound, until_id: upper_bound)
  end
end

namespace :datacite_crossref do
  desc 'Import dois for all events'
  task import_doi: :environment do
    # FROM_ID defaults to 1; UNTIL_ID defaults to the highest event id,
    # which is only looked up when the variable is not set.
    lower_bound = ENV.fetch('FROM_ID', 1).to_i
    upper_bound = ENV.fetch('UNTIL_ID') { Event.maximum(:id) }.to_i

    Event.update_datacite_crossref(from_id: lower_bound, until_id: upper_bound)
  end
end

0 comments on commit 4060e3e

Please sign in to comment.