From 7c73641b06120c3c532e9251d327ef7e731aab4a Mon Sep 17 00:00:00 2001 From: Kristian Garza Date: Fri, 4 Oct 2019 23:11:05 +0200 Subject: [PATCH] updates registrant IDs for events that have not been updated. for events from crossref and to crossref a rake task to be used only for this --- app/jobs/event_registrant_update_by_id_job.rb | 78 +++++++++++++++++++ app/jobs/event_registrant_update_job.rb | 7 ++ app/models/event.rb | 29 +++++++ lib/tasks/event.rake | 7 ++ spec/jobs/event_update_by_id_job.rb | 16 ++++ 5 files changed, 137 insertions(+) create mode 100644 app/jobs/event_registrant_update_by_id_job.rb create mode 100644 app/jobs/event_registrant_update_job.rb create mode 100644 spec/jobs/event_update_by_id_job.rb diff --git a/app/jobs/event_registrant_update_by_id_job.rb b/app/jobs/event_registrant_update_by_id_job.rb new file mode 100644 index 000000000..0198ec745 --- /dev/null +++ b/app/jobs/event_registrant_update_by_id_job.rb @@ -0,0 +1,78 @@ +class EventRegistrantUpdateByIdJob < ActiveJob::Base + queue_as :lupo_background + + + rescue_from ActiveJob::DeserializationError, Elasticsearch::Transport::Transport::Errors::BadRequest do |error| + logger = Logger.new(STDOUT) + logger.error error.message + end + + def perform(id, options={}) + logger = Logger.new(STDOUT) + + item = Event.where(uuid: id).first + return false unless item.present? + logger.info "djdjdj" + logger.info id + logger.info item.source_id + + + case item.source_id + when "datacite-crossref" + registrant_id = get_crossref_member_id(item.obj_id) if get_doi_ra(item.obj_id) == "Crossref" + logger.info registrant_id + + obj = item.obj.merge("registrant_id" => registrant_id) unless registrant_id.nil? + logger.info obj + item.update_attributes(obj: obj) if obj.present? + when "crossref" + registrant_id = get_crossref_member_id(item.subj) if get_doi_ra(item.subj) == "Crossref" + logger.info registrant_id + + subj = item.subj.merge("registrant_id" => registrant_id) unless registrant_id.nil? + logger.info subj + item.update_attributes(subj: subj) if subj.present? + end + + logger.error item.errors.full_messages.map { |message| { title: message } } if item.errors.any? + logger.info "#{item.uuid} Updated" if item.errors.blank? && registrant_id + end + + def get_crossref_member_id(id, options={}) + logger = Logger.new(STDOUT) + doi = doi_from_url(id) + # return "crossref.citations" unless doi.present? + + url = "https://api.crossref.org/works/#{Addressable::URI.encode(doi)}?mailto=info@datacite.org" + sleep(0.01) # to avoid crossref rate limitting + response = Maremma.get(url, host: true) + logger.info "[Crossref Response] [#{response.status}] for DOI #{doi} metadata" + return "" if response.status == 404 ### for cases when DOI is not in the crossreaf api + return "crossref.citations" if response.status != 200 ### for cases any other errors + + message = response.body.dig("data", "message") + + "crossref.#{message["member"]}" + end + + def get_doi_ra(doi) + prefix = validate_prefix(doi) + return nil if prefix.blank? + + url = "https://doi.org/ra/#{prefix}" + result = Maremma.get(url) + + result.body.dig("data", 0, "RA") + end + + def validate_prefix(doi) + Array(/\A(?:(http|https):\/(\/)?(dx\.)?(doi.org|handle.test.datacite.org)\/)?(doi:)?(10\.\d{4,5}).*\z/.match(doi)).last + end + + def doi_from_url(url) + if /\A(?:(http|https):\/\/(dx\.)?(doi.org|handle.test.datacite.org)\/)?(doi:)?(10\.\d{4,5}\/.+)\z/.match(url) + uri = Addressable::URI.parse(url) + uri.path.gsub(/^\//, '').downcase + end + end +end diff --git a/app/jobs/event_registrant_update_job.rb b/app/jobs/event_registrant_update_job.rb new file mode 100644 index 000000000..a6efd7657 --- /dev/null +++ b/app/jobs/event_registrant_update_job.rb @@ -0,0 +1,7 @@ +class EventRegistrantUpdateJob < ActiveJob::Base + queue_as :lupo_background + + def perform(ids, options={}) + ids.each { |id| EventRegistrantUpdateByIdJob.perform_later(id, options) } + end +end diff --git a/app/models/event.rb b/app/models/event.rb index c5a88c120..9cf905a30 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -447,6 +447,35 @@ def self.update_datacite_ra(options={}) response.results.total end + def self.update_registrant(options={}) + logger = Logger.new(STDOUT) + + size = (options[:size] || 1000).to_i + cursor = (options[:cursor] || []) + # ra = options[:ra] || "crossref" + source_id = "datacite-crossref,crossref" + + response = Event.query(nil, source_id: source_id, page: { size: 1, cursor: cursor }) + logger.info "[Update] #{response.results.total} events for sources #{source_id}." + + # walk through results using cursor + if response.results.total > 0 + while response.results.results.length > 0 do + response = Event.query(nil, source_id: source_id, page: { size: size, cursor: cursor }) + break unless response.results.results.length > 0 + + logger.info "[Update] Updating #{response.results.results.length} #{source_id} events starting with _id #{response.results.to_a.first[:_id]}." + cursor = response.results.to_a.last[:sort] + + ids = response.results.results.map(&:uuid).uniq + + EventRegistrantUpdateJob.perform_later(ids, options) + end + end + + response.results.total + end + def self.update_datacite_orcid_auto_update(options={}) logger = Logger.new(STDOUT) diff --git a/lib/tasks/event.rake b/lib/tasks/event.rake index 3f485156a..3b1f28e5b 100644 --- a/lib/tasks/event.rake +++ b/lib/tasks/event.rake @@ -53,6 +53,13 @@ namespace :event do Event.import_by_ids(from_id: from_id, until_id: until_id) end + + desc 'update registrant metadata' + task :update_registrant => :environment do + cursor = ENV['CURSOR'].to_s.split(",") || [Event.minimum(:id),Event.minimum(:id)] + + Event.update_registrant(cursor: cursor, size: ENV['SIZE']) + end end namespace :crossref do diff --git a/spec/jobs/event_update_by_id_job.rb b/spec/jobs/event_update_by_id_job.rb new file mode 100644 index 000000000..e996a9c0b --- /dev/null +++ b/spec/jobs/event_update_by_id_job.rb @@ -0,0 +1,16 @@ +require 'rails_helper' + +describe EventUpdateByIdJob, type: :job do + let(:event) { create(:event) } + subject(:job) { EventUpdateByIdJob.perform_later(event.uuid) } + + it 'queues the job' do + expect { job }.to have_enqueued_job(EventUpdateByIdJob) + .on_queue("test_lupo_background") + end + + after do + clear_enqueued_jobs + clear_performed_jobs + end +end