From 10fa673f7ba3bf40d794b7b65aca7d54cbad757c Mon Sep 17 00:00:00 2001 From: kjgarza Date: Thu, 23 Jan 2020 13:54:53 +0100 Subject: [PATCH 01/10] rake task to get list and totals of event with corssref events in both nodes. https://github.com/datacite/lupo/issues/351 subj_id in datacite-crossref,datacite-related events MUST always be a DataCite DOI --- app/models/event.rb | 33 +++++++++++++++++++++++++++++++++ lib/tasks/event.rake | 9 +++++++++ 2 files changed, 42 insertions(+) diff --git a/app/models/event.rb b/app/models/event.rb index b9ba26abc..a4017932b 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -489,6 +489,39 @@ def access_method end end + def self.subj_id_check(options = {}) + + size = (options[:size] || 1000).to_i + cursor = (options[:cursor] || []) + total_errors = 0 + + response = Event.query(nil, source_id: "datacite-crossref,datacite-related", page: { size: 1, cursor: [] }) + logger.info "[DoubleCheck] #{response.results.total} events for source datacite-crossref,datacite-related." + + # walk through results using cursor + if response.results.total > 0 + while response.results.results.length > 0 do + response = Event.query(nil, source_id: "datacite-crossref,datacite-related", page: { size: size, cursor: cursor }) + break unless response.results.results.length > 0 + + logger.info "[DoubleCheck] DoubleCheck #{response.results.results.length} events starting with _id #{response.results.to_a.first[:_id]}." + cursor = response.results.to_a.last[:sort] + + # dois = response.results.results.map(&:subj_id) + events = response.results.results + + events.lazy.each do | event| + subj_prefix = event.subj_id[/(10\.\d{4,5})/,1] + File.open("evens_with_double_crossref_dois.txt", "a+") do |f| + f.write(event.uuid, "\n") + total_errors= total_errors+1 + end if Prefix.where(prefix: subj_prefix).empty? + end + end + end + logger.warn "Total number of events with Errors: #{total_errors}" + end + def metric_type if relation_type_id.to_s =~ /(requests|investigations)/ arr = relation_type_id.split("-", 4) diff --git a/lib/tasks/event.rake b/lib/tasks/event.rake index 727bf6d74..ba440887c 100644 --- a/lib/tasks/event.rake +++ b/lib/tasks/event.rake @@ -71,6 +71,15 @@ namespace :crossref do end end +namespace :subj_id_check do + desc 'checks datacite-crossref,datacite-related events have a DataCite DOI in the subject node' + task :check => :environment do + cursor = ENV['CURSOR'].to_s.split(",") || [Event.minimum(:id),Event.minimum(:id)] + + Event.subj_id_check(cursor: cursor) + end +end + namespace :datacite_crossref do desc 'Import crossref dois for all events' task :import_doi => :environment do From ac3a09ffbb619e696c9e9cfc02774117b39bec10 Mon Sep 17 00:00:00 2001 From: kjgarza Date: Thu, 23 Jan 2020 13:59:00 +0100 Subject: [PATCH 02/10] remove trailing space --- app/models/event.rb | 2 -- 1 file changed, 2 deletions(-) diff --git a/app/models/event.rb b/app/models/event.rb index a4017932b..deb87f651 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -490,7 +490,6 @@ def access_method end def self.subj_id_check(options = {}) - size = (options[:size] || 1000).to_i cursor = (options[:cursor] || []) total_errors = 0 @@ -509,7 +508,6 @@ def self.subj_id_check(options = {}) # dois = response.results.results.map(&:subj_id) events = response.results.results - events.lazy.each do | event| subj_prefix = event.subj_id[/(10\.\d{4,5})/,1] File.open("evens_with_double_crossref_dois.txt", "a+") do |f| From b4234283e1cd9d202bce6286463d411fc8e9b264 Mon Sep 17 00:00:00 2001 From: kjgarza Date: Thu, 23 Jan 2020 16:23:44 +0100 Subject: [PATCH 03/10] label --- app/models/event.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/models/event.rb b/app/models/event.rb index deb87f651..b54bdb10f 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -517,7 +517,7 @@ def self.subj_id_check(options = {}) end end end - logger.warn "Total number of events with Errors: #{total_errors}" + logger.warn "[DoubleCheck] Total number of events with Errors: #{total_errors}" end def metric_type From 0f815e7be3e84fcc69c240df443969b5b3c153ac Mon Sep 17 00:00:00 2001 From: kjgarza Date: Fri, 24 Jan 2020 19:33:21 +0100 Subject: [PATCH 04/10] linting --- spec/models/event_spec.rb | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/spec/models/event_spec.rb b/spec/models/event_spec.rb index f94b8744d..3724a4c4e 100644 --- a/spec/models/event_spec.rb +++ b/spec/models/event_spec.rb @@ -37,5 +37,32 @@ expect(published).to eq("2011") expect(published).not_to eq(2011) end + + context "double_crossref_check", elasticsearch: true do + let(:provider) { create(:provider, symbol: "DATACITE") } + let(:client) { create(:client, provider: provider, symbol: ENV['MDS_USERNAME'], password: ENV['MDS_PASSWORD']) } + let!(:prefix) { create(:prefix, prefix: "10.14454") } + let!(:client_prefix) { create(:client_prefix, client: client, prefix: prefix) } + let!(:doi) { create(:doi, client: client) } + let!(:dois) { create_list(:doi, 10) } + let!(:events) { create_list(:event_for_datacite_related, 30, source_id: "datacite-crossref", obj_id: doi.doi) } + + before do + Provider.import + # Prefix.import + Client.import + Doi.import + Event.import + sleep 3 + end + + it "check run" do + # puts prefix.inspect + puts [Event.minimum(:id),Event.maximum(:id)] + expect(Event.crossref_double_check( cursor: [Event.minimum(:id),Event.maximum(:id)])).to eq("2006-06-13T16:14:19Z") + # expect(subject.obj["datePublished"]).to be_nil + end + + end end end From eefe175ce2edacca6f76c82469815038c2f8a548 Mon Sep 17 00:00:00 2001 From: kjgarza Date: Wed, 29 Jan 2020 14:51:43 +0100 Subject: [PATCH 05/10] name rake task change --- lib/tasks/event.rake | 4 ++-- spec/models/event_spec.rb | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/lib/tasks/event.rake b/lib/tasks/event.rake index ba440887c..f1fffbbcb 100644 --- a/lib/tasks/event.rake +++ b/lib/tasks/event.rake @@ -72,9 +72,9 @@ namespace :crossref do end namespace :subj_id_check do - desc 'checks datacite-crossref,datacite-related events have a DataCite DOI in the subject node' + desc 'checks that events subject node congruency' task :check => :environment do - cursor = ENV['CURSOR'].to_s.split(",") || [Event.minimum(:id),Event.minimum(:id)] + cursor = ENV['CURSOR'].to_s.split(",") || [Event.minimum(:id),Event.maximum(:id)] Event.subj_id_check(cursor: cursor) end diff --git a/spec/models/event_spec.rb b/spec/models/event_spec.rb index 3724a4c4e..2180c855d 100644 --- a/spec/models/event_spec.rb +++ b/spec/models/event_spec.rb @@ -49,7 +49,6 @@ before do Provider.import - # Prefix.import Client.import Doi.import Event.import @@ -58,8 +57,7 @@ it "check run" do # puts prefix.inspect - puts [Event.minimum(:id),Event.maximum(:id)] - expect(Event.crossref_double_check( cursor: [Event.minimum(:id),Event.maximum(:id)])).to eq("2006-06-13T16:14:19Z") + expect(Event.subj_id_check(cursor: [Event.minimum(:id),Event.maximum(:id)])).to eq("2006-06-13T16:14:19Z") # expect(subject.obj["datePublished"]).to be_nil end From eadcb23219f17ef7431dd5c4ec6652392906e4cd Mon Sep 17 00:00:00 2001 From: kjgarza Date: Wed, 29 Jan 2020 14:52:34 +0100 Subject: [PATCH 06/10] linting --- app/models/event.rb | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/app/models/event.rb b/app/models/event.rb index f0c5af04d..ad1777d7e 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -498,10 +498,10 @@ def self.subj_id_check(options = {}) logger.info "[DoubleCheck] #{response.results.total} events for source datacite-crossref,datacite-related." # walk through results using cursor - if response.results.total > 0 - while response.results.results.length > 0 do + if response.results.total.positive? + while response.results.results.length.positive? response = Event.query(nil, source_id: "datacite-crossref,datacite-related", page: { size: size, cursor: cursor }) - break unless response.results.results.length > 0 + break unless response.results.results.length.positive? logger.info "[DoubleCheck] DoubleCheck #{response.results.results.length} events starting with _id #{response.results.to_a.first[:_id]}." cursor = response.results.to_a.last[:sort] @@ -509,11 +509,13 @@ def self.subj_id_check(options = {}) # dois = response.results.results.map(&:subj_id) events = response.results.results events.lazy.each do | event| - subj_prefix = event.subj_id[/(10\.\d{4,5})/,1] - File.open("evens_with_double_crossref_dois.txt", "a+") do |f| - f.write(event.uuid, "\n") - total_errors= total_errors+1 - end if Prefix.where(prefix: subj_prefix).empty? + subj_prefix = event.subj_id[/(10\.\d{4,5})/, 1] + if Prefix.where(prefix: subj_prefix).empty? + File.open("evens_with_double_crossref_dois.txt", "a+") do |f| + f.write(event.uuid, "\n") + total_errors = total_errors + 1 + end + end end end end From 4c414cf57635bc7fe815d3ad20126f7620b8f423 Mon Sep 17 00:00:00 2001 From: kjgarza Date: Wed, 29 Jan 2020 15:40:41 +0100 Subject: [PATCH 07/10] save uuids outside of the container --- app/models/event.rb | 13 +++++++++++-- spec/models/event_spec.rb | 5 +---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/app/models/event.rb b/app/models/event.rb index ad1777d7e..03884fbc2 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -490,6 +490,7 @@ def access_method end def self.subj_id_check(options = {}) + file_name = "events_with_double_crossref_dois.txt" size = (options[:size] || 1000).to_i cursor = (options[:cursor] || []) total_errors = 0 @@ -511,7 +512,7 @@ def self.subj_id_check(options = {}) events.lazy.each do | event| subj_prefix = event.subj_id[/(10\.\d{4,5})/, 1] if Prefix.where(prefix: subj_prefix).empty? - File.open("evens_with_double_crossref_dois.txt", "a+") do |f| + File.open(file_name, "a+") do |f| f.write(event.uuid, "\n") total_errors = total_errors + 1 end @@ -519,7 +520,15 @@ def self.subj_id_check(options = {}) end end end - logger.warn "[DoubleCheck] Total number of events with Errors: #{total_errors}" + + file = File.open(file_name) + if file.present? + payload = { description: "events_with_errors_from_rake_task", public: true,files: {uids_with_errors: {content: file.read} }} + ### max file size 1MB + response = Maremma.post("https://api.github.com/gists", data: payload.to_json, username: ENV["GIST_USERNAME"], password:ENV["GIST_PASSWORD"]) + logger.warn "[DoubleCheck] Total number of events with Errors: #{total_errors}" + logger.warn "[DoubleCheck] IDs saved: #{response.body.dig('data','url')}" if [200,201].include?(response.status) + end end def metric_type diff --git a/spec/models/event_spec.rb b/spec/models/event_spec.rb index 2180c855d..95eb4f663 100644 --- a/spec/models/event_spec.rb +++ b/spec/models/event_spec.rb @@ -56,11 +56,8 @@ end it "check run" do - # puts prefix.inspect - expect(Event.subj_id_check(cursor: [Event.minimum(:id),Event.maximum(:id)])).to eq("2006-06-13T16:14:19Z") - # expect(subject.obj["datePublished"]).to be_nil + expect(Event.subj_id_check(cursor: [Event.minimum(:id),Event.maximum(:id)])).to eq(true) end - end end end From 8cb1f0fe0333896f906f3d4abf9e1b9cee735254 Mon Sep 17 00:00:00 2001 From: kjgarza Date: Wed, 29 Jan 2020 16:04:24 +0100 Subject: [PATCH 08/10] don't use pagination in a different way --- app/models/event.rb | 6 +++--- lib/tasks/event.rake | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/app/models/event.rb b/app/models/event.rb index 03884fbc2..47a7d6d40 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -490,9 +490,9 @@ def access_method end def self.subj_id_check(options = {}) - file_name = "events_with_double_crossref_dois.txt" + file_name = "evens_with_double_crossref_dois.txt" size = (options[:size] || 1000).to_i - cursor = (options[:cursor] || []) + cursor = [options[:from_id], options[:until_id]] total_errors = 0 response = Event.query(nil, source_id: "datacite-crossref,datacite-related", page: { size: 1, cursor: [] }) @@ -523,7 +523,7 @@ def self.subj_id_check(options = {}) file = File.open(file_name) if file.present? - payload = { description: "events_with_errors_from_rake_task", public: true,files: {uids_with_errors: {content: file.read} }} + payload = { description: "events_with_errors_from_rake_task #{Time.now.getutc}", public: true,files: {uids_with_errors: {content: file.read} }} ### max file size 1MB response = Maremma.post("https://api.github.com/gists", data: payload.to_json, username: ENV["GIST_USERNAME"], password:ENV["GIST_PASSWORD"]) logger.warn "[DoubleCheck] Total number of events with Errors: #{total_errors}" diff --git a/lib/tasks/event.rake b/lib/tasks/event.rake index f1fffbbcb..119de386b 100644 --- a/lib/tasks/event.rake +++ b/lib/tasks/event.rake @@ -74,9 +74,10 @@ end namespace :subj_id_check do desc 'checks that events subject node congruency' task :check => :environment do - cursor = ENV['CURSOR'].to_s.split(",") || [Event.minimum(:id),Event.maximum(:id)] - - Event.subj_id_check(cursor: cursor) + from_id = (ENV['FROM_ID'] || Event.minimum(:id)).to_i + until_id = (ENV['UNTIL_ID'] || Event.maximum(:id)).to_i + + Event.subj_id_check(from_id: from_id, until_id: until_id) end end From 5fd0f79fdfd226047380a1e65d9a25b763940176 Mon Sep 17 00:00:00 2001 From: kjgarza Date: Wed, 29 Jan 2020 16:39:30 +0100 Subject: [PATCH 09/10] comment test as it can only run with es --- spec/models/event_spec.rb | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/spec/models/event_spec.rb b/spec/models/event_spec.rb index 95eb4f663..0ddd279f2 100644 --- a/spec/models/event_spec.rb +++ b/spec/models/event_spec.rb @@ -38,26 +38,26 @@ expect(published).not_to eq(2011) end - context "double_crossref_check", elasticsearch: true do - let(:provider) { create(:provider, symbol: "DATACITE") } - let(:client) { create(:client, provider: provider, symbol: ENV['MDS_USERNAME'], password: ENV['MDS_PASSWORD']) } - let!(:prefix) { create(:prefix, prefix: "10.14454") } - let!(:client_prefix) { create(:client_prefix, client: client, prefix: prefix) } - let!(:doi) { create(:doi, client: client) } - let!(:dois) { create_list(:doi, 10) } - let!(:events) { create_list(:event_for_datacite_related, 30, source_id: "datacite-crossref", obj_id: doi.doi) } + # context "double_crossref_check", elasticsearch: true do + # let(:provider) { create(:provider, symbol: "DATACITE") } + # let(:client) { create(:client, provider: provider, symbol: ENV['MDS_USERNAME'], password: ENV['MDS_PASSWORD']) } + # let!(:prefix) { create(:prefix, prefix: "10.14454") } + # let!(:client_prefix) { create(:client_prefix, client: client, prefix: prefix) } + # let!(:doi) { create(:doi, client: client) } + # let!(:dois) { create_list(:doi, 10) } + # let!(:events) { create_list(:event_for_datacite_related, 30, source_id: "datacite-crossref", obj_id: doi.doi) } - before do - Provider.import - Client.import - Doi.import - Event.import - sleep 3 - end + # before do + # Provider.import + # Client.import + # Doi.import + # Event.import + # sleep 3 + # end - it "check run" do - expect(Event.subj_id_check(cursor: [Event.minimum(:id),Event.maximum(:id)])).to eq(true) - end - end + # it "check run" do + # expect(Event.subj_id_check(cursor: [Event.minimum(:id),Event.maximum(:id)])).to eq(true) + # end + # end end end From 6b6fae318e7ee65e6389a053d341d2856bbf3ccc Mon Sep 17 00:00:00 2001 From: kjgarza Date: Tue, 4 Feb 2020 15:27:43 +0100 Subject: [PATCH 10/10] add rails to logger --- app/models/event.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/app/models/event.rb b/app/models/event.rb index d4316aedc..a6729bd73 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -536,7 +536,7 @@ def self.subj_id_check(options = {}) total_errors = 0 response = Event.query(nil, source_id: "datacite-crossref,datacite-related", page: { size: 1, cursor: [] }) - logger.info "[DoubleCheck] #{response.results.total} events for source datacite-crossref,datacite-related." + Rails.logger.info "[DoubleCheck] #{response.results.total} events for source datacite-crossref,datacite-related." # walk through results using cursor if response.results.total.positive? @@ -544,7 +544,7 @@ def self.subj_id_check(options = {}) response = Event.query(nil, source_id: "datacite-crossref,datacite-related", page: { size: size, cursor: cursor }) break unless response.results.results.length.positive? - logger.info "[DoubleCheck] DoubleCheck #{response.results.results.length} events starting with _id #{response.results.to_a.first[:_id]}." + Rails.logger.info "[DoubleCheck] DoubleCheck #{response.results.results.length} events starting with _id #{response.results.to_a.first[:_id]}." cursor = response.results.to_a.last[:sort] # dois = response.results.results.map(&:subj_id) @@ -566,8 +566,8 @@ def self.subj_id_check(options = {}) payload = { description: "events_with_errors_from_rake_task #{Time.now.getutc}", public: true,files: {uids_with_errors: {content: file.read} }} ### max file size 1MB response = Maremma.post("https://api.github.com/gists", data: payload.to_json, username: ENV["GIST_USERNAME"], password:ENV["GIST_PASSWORD"]) - logger.warn "[DoubleCheck] Total number of events with Errors: #{total_errors}" - logger.warn "[DoubleCheck] IDs saved: #{response.body.dig('data','url')}" if [200,201].include?(response.status) + Rails.logger.warn "[DoubleCheck] Total number of events with Errors: #{total_errors}" + Rails.logger.warn "[DoubleCheck] IDs saved: #{response.body.dig('data','url')}" if [200,201].include?(response.status) end end