From 654a7fb55e3bdf4710511f7272dd10ad78df7184 Mon Sep 17 00:00:00 2001 From: Kristian Garza Date: Sun, 6 Oct 2019 17:11:03 +0200 Subject: [PATCH 1/9] fix typo on subj_id --- app/jobs/event_registrant_update_by_id_job.rb | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/app/jobs/event_registrant_update_by_id_job.rb b/app/jobs/event_registrant_update_by_id_job.rb index 0198ec745..2e51f29e7 100644 --- a/app/jobs/event_registrant_update_by_id_job.rb +++ b/app/jobs/event_registrant_update_by_id_job.rb @@ -12,9 +12,7 @@ def perform(id, options={}) item = Event.where(uuid: id).first return false unless item.present? - logger.info "djdjdj" - logger.info id - logger.info item.source_id + case item.source_id @@ -26,7 +24,7 @@ def perform(id, options={}) logger.info obj item.update_attributes(obj: obj) if obj.present? when "crossref" - registrant_id = get_crossref_member_id(item.subj) if get_doi_ra(item.subj) == "Crossref" + registrant_id = get_crossref_member_id(item.subj) if get_doi_ra(item.subj_id) == "Crossref" logger.info registrant_id subj = item.subj.merge("registrant_id" => registrant_id) unless registrant_id.nil? From c4b90b1765e2547323ed79e3d39baafa9ed170c1 Mon Sep 17 00:00:00 2001 From: Kristian Garza Date: Sun, 6 Oct 2019 18:59:58 +0200 Subject: [PATCH 2/9] fix typo in subj_id --- app/jobs/event_registrant_update_by_id_job.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/jobs/event_registrant_update_by_id_job.rb b/app/jobs/event_registrant_update_by_id_job.rb index 2e51f29e7..d9f317f33 100644 --- a/app/jobs/event_registrant_update_by_id_job.rb +++ b/app/jobs/event_registrant_update_by_id_job.rb @@ -24,7 +24,7 @@ def perform(id, options={}) logger.info obj item.update_attributes(obj: obj) if obj.present? when "crossref" - registrant_id = get_crossref_member_id(item.subj) if get_doi_ra(item.subj_id) == "Crossref" + registrant_id = get_crossref_member_id(item.subj_id) if get_doi_ra(item.subj_id) == "Crossref" logger.info registrant_id subj = item.subj.merge("registrant_id" => registrant_id) unless registrant_id.nil? From fef91dcc4bdc229369d81b8812a00c9872023ec4 Mon Sep 17 00:00:00 2001 From: Kristian Garza Date: Mon, 7 Oct 2019 00:04:58 +0200 Subject: [PATCH 3/9] adding cache for speed up the process --- app/jobs/event_registrant_update_by_id_job.rb | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/app/jobs/event_registrant_update_by_id_job.rb b/app/jobs/event_registrant_update_by_id_job.rb index d9f317f33..056840d59 100644 --- a/app/jobs/event_registrant_update_by_id_job.rb +++ b/app/jobs/event_registrant_update_by_id_job.rb @@ -17,14 +17,14 @@ def perform(id, options={}) case item.source_id when "datacite-crossref" - registrant_id = get_crossref_member_id(item.obj_id) if get_doi_ra(item.obj_id) == "Crossref" + registrant_id = cached_get_crossref_member_id(item.obj_id) if cached_get_doi_ra(item.obj_id) == "Crossref" logger.info registrant_id obj = item.obj.merge("registrant_id" => registrant_id) unless registrant_id.nil? logger.info obj item.update_attributes(obj: obj) if obj.present? when "crossref" - registrant_id = get_crossref_member_id(item.subj_id) if get_doi_ra(item.subj_id) == "Crossref" + registrant_id = cached_get_crossref_member_id(item.subj_id) if cached_get_doi_ra(item.subj_id) == "Crossref" logger.info registrant_id subj = item.subj.merge("registrant_id" => registrant_id) unless registrant_id.nil? @@ -53,6 +53,20 @@ def get_crossref_member_id(id, options={}) "crossref.#{message["member"]}" end + def cached_get_doi_ra(doi) + Rails.cache.fetch("ras/#{doi}") do + puts "did not find key in cache, executing block ..." + get_doi_ra(doi) + end + end + + def cached_get_crossref_member_id(doi) + Rails.cache.fetch("members_ids/#{doi}") do + puts "did not find key in cache, executing block ..." + get_crossref_member_id(doi) + end + end + def get_doi_ra(doi) prefix = validate_prefix(doi) return nil if prefix.blank? From b12452545a4b7f7583d504b5b8feb4de15e5ffaf Mon Sep 17 00:00:00 2001 From: Kristian Garza Date: Mon, 7 Oct 2019 00:08:30 +0200 Subject: [PATCH 4/9] enable filtering update by citation types --- app/models/event.rb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/app/models/event.rb b/app/models/event.rb index 9cf905a30..c346403c3 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -453,15 +453,16 @@ def self.update_registrant(options={}) size = (options[:size] || 1000).to_i cursor = (options[:cursor] || []) # ra = options[:ra] || "crossref" - source_id = "datacite-crossref,crossref" + source_id = options[:source_id] || "datacite-crossref,crossref" + citation_type = options[:citation_type] || "Dataset-ScholarlyArticle" - response = Event.query(nil, source_id: source_id, page: { size: 1, cursor: cursor }) + response = Event.query(nil, source_id: source_id, citation_type: citation_type, page: { size: 1, cursor: cursor }) logger.info "[Update] #{response.results.total} events for sources #{source_id}." # walk through results using cursor if response.results.total > 0 while response.results.results.length > 0 do - response = Event.query(nil, source_id: source_id, page: { size: size, cursor: cursor }) + response = Event.query(nil, source_id: source_id, citation_type: citation_type, page: { size: size, cursor: cursor }) break unless response.results.results.length > 0 logger.info "[Update] Updating #{response.results.results.length} #{source_id} events starting with _id #{response.results.to_a.first[:_id]}." From b8857a3a7d5b07d54e3b24bce673ee48ed707a0e Mon Sep 17 00:00:00 2001 From: Kristian Garza Date: Mon, 7 Oct 2019 18:27:00 +0200 Subject: [PATCH 5/9] enable filtering by registrant --- app/models/event.rb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/app/models/event.rb b/app/models/event.rb index c346403c3..e36d91f60 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -453,16 +453,17 @@ def self.update_registrant(options={}) size = (options[:size] || 1000).to_i cursor = (options[:cursor] || []) # ra = options[:ra] || "crossref" - source_id = options[:source_id] || "datacite-crossref,crossref" + source_id = options[:source_id] || "datacite-crossref" citation_type = options[:citation_type] || "Dataset-ScholarlyArticle" + query = options[:query] || "registrant_id:*crossref.citations" - response = Event.query(nil, source_id: source_id, citation_type: citation_type, page: { size: 1, cursor: cursor }) + response = Event.query(query, source_id: source_id, citation_type: citation_type, page: { size: 1, cursor: cursor }) logger.info "[Update] #{response.results.total} events for sources #{source_id}." # walk through results using cursor if response.results.total > 0 while response.results.results.length > 0 do - response = Event.query(nil, source_id: source_id, citation_type: citation_type, page: { size: size, cursor: cursor }) + response = Event.query(query, source_id: source_id, citation_type: citation_type, page: { size: size, cursor: cursor }) break unless response.results.results.length > 0 logger.info "[Update] Updating #{response.results.results.length} #{source_id} events starting with _id #{response.results.to_a.first[:_id]}." From f35e74eebb0b5d833f033241e2a21b5546dfb9a1 Mon Sep 17 00:00:00 2001 From: Kristian Garza Date: Tue, 8 Oct 2019 13:57:00 +0200 Subject: [PATCH 6/9] to avoid crossref rate limitting --- app/jobs/event_registrant_update_by_id_job.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/jobs/event_registrant_update_by_id_job.rb b/app/jobs/event_registrant_update_by_id_job.rb index 056840d59..f05547983 100644 --- a/app/jobs/event_registrant_update_by_id_job.rb +++ b/app/jobs/event_registrant_update_by_id_job.rb @@ -42,7 +42,7 @@ def get_crossref_member_id(id, options={}) # return "crossref.citations" unless doi.present? url = "https://api.crossref.org/works/#{Addressable::URI.encode(doi)}?mailto=info@datacite.org" - sleep(0.01) # to avoid crossref rate limitting + sleep(0.03) # to avoid crossref rate limitting response = Maremma.get(url, host: true) logger.info "[Crossref Response] [#{response.status}] for DOI #{doi} metadata" return "" if response.status == 404 ### for cases when DOI is not in the crossreaf api From e931fea89bae903c0e7e1bcaa2969075c60649bb Mon Sep 17 00:00:00 2001 From: Kristian Garza Date: Tue, 8 Oct 2019 16:27:03 +0200 Subject: [PATCH 7/9] there are more queues in production, we need to slow down the changes --- app/jobs/event_registrant_update_by_id_job.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/jobs/event_registrant_update_by_id_job.rb b/app/jobs/event_registrant_update_by_id_job.rb index f05547983..69ff64115 100644 --- a/app/jobs/event_registrant_update_by_id_job.rb +++ b/app/jobs/event_registrant_update_by_id_job.rb @@ -42,7 +42,7 @@ def get_crossref_member_id(id, options={}) # return "crossref.citations" unless doi.present? url = "https://api.crossref.org/works/#{Addressable::URI.encode(doi)}?mailto=info@datacite.org" - sleep(0.03) # to avoid crossref rate limitting + sleep(0.24) # to avoid crossref rate limitting response = Maremma.get(url, host: true) logger.info "[Crossref Response] [#{response.status}] for DOI #{doi} metadata" return "" if response.status == 404 ### for cases when DOI is not in the crossreaf api @@ -55,14 +55,14 @@ def get_crossref_member_id(id, options={}) def cached_get_doi_ra(doi) Rails.cache.fetch("ras/#{doi}") do - puts "did not find key in cache, executing block ..." + puts "#{doi} [RA] did not find key in cache, executing block ..." get_doi_ra(doi) end end def cached_get_crossref_member_id(doi) Rails.cache.fetch("members_ids/#{doi}") do - puts "did not find key in cache, executing block ..." + puts "#{doi} [Crossref Member] did not find key in cache, executing block ..." get_crossref_member_id(doi) end end From 63cf4738ea05a371c252d76ab69fc6843e261d0f Mon Sep 17 00:00:00 2001 From: Kristian Garza Date: Tue, 8 Oct 2019 20:25:02 +0200 Subject: [PATCH 8/9] avoid cache when its wrong --- app/jobs/event_registrant_update_by_id_job.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/app/jobs/event_registrant_update_by_id_job.rb b/app/jobs/event_registrant_update_by_id_job.rb index 69ff64115..caf98344e 100644 --- a/app/jobs/event_registrant_update_by_id_job.rb +++ b/app/jobs/event_registrant_update_by_id_job.rb @@ -19,6 +19,10 @@ def perform(id, options={}) when "datacite-crossref" registrant_id = cached_get_crossref_member_id(item.obj_id) if cached_get_doi_ra(item.obj_id) == "Crossref" logger.info registrant_id + if registrant_id == "crossref.citations" + sleep(0.50) + registrant_id = get_crossref_member_id(item.obj_id) + end obj = item.obj.merge("registrant_id" => registrant_id) unless registrant_id.nil? logger.info obj @@ -26,6 +30,7 @@ def perform(id, options={}) when "crossref" registrant_id = cached_get_crossref_member_id(item.subj_id) if cached_get_doi_ra(item.subj_id) == "Crossref" logger.info registrant_id + registrant_id = get_crossref_member_id(item.subj_id) if registrant_id == "crossref.citations" ## try without cache subj = item.subj.merge("registrant_id" => registrant_id) unless registrant_id.nil? logger.info subj From db6d66dcc50b8a990e22e5e41dc0026231d713a5 Mon Sep 17 00:00:00 2001 From: Kristian Garza Date: Wed, 9 Oct 2019 16:49:34 +0200 Subject: [PATCH 9/9] add crossref source to rake task --- app/jobs/event_registrant_update_by_id_job.rb | 5 ++++- app/models/event.rb | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/app/jobs/event_registrant_update_by_id_job.rb b/app/jobs/event_registrant_update_by_id_job.rb index caf98344e..88ed558f5 100644 --- a/app/jobs/event_registrant_update_by_id_job.rb +++ b/app/jobs/event_registrant_update_by_id_job.rb @@ -30,7 +30,10 @@ def perform(id, options={}) when "crossref" registrant_id = cached_get_crossref_member_id(item.subj_id) if cached_get_doi_ra(item.subj_id) == "Crossref" logger.info registrant_id - registrant_id = get_crossref_member_id(item.subj_id) if registrant_id == "crossref.citations" ## try without cache + if registrant_id == "crossref.citations" + sleep(0.50) + registrant_id = get_crossref_member_id(item.subj_id) + end subj = item.subj.merge("registrant_id" => registrant_id) unless registrant_id.nil? logger.info subj diff --git a/app/models/event.rb b/app/models/event.rb index e36d91f60..d0746179b 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -453,7 +453,7 @@ def self.update_registrant(options={}) size = (options[:size] || 1000).to_i cursor = (options[:cursor] || []) # ra = options[:ra] || "crossref" - source_id = options[:source_id] || "datacite-crossref" + source_id = options[:source_id] || "datacite-crossref,crossref" citation_type = options[:citation_type] || "Dataset-ScholarlyArticle" query = options[:query] || "registrant_id:*crossref.citations"