From 1310ae863b2afda7864f767fdf58de519be35bfa Mon Sep 17 00:00:00 2001 From: Richard Hallett Date: Tue, 2 Apr 2019 15:09:30 +0200 Subject: [PATCH] Add task for importing missing by empty attribute Ref datacite/lupo#196 --- app/jobs/doi_import_one_job.rb | 14 +++++++++ app/models/doi.rb | 56 ++++++++++++++++++++++++++++------ lib/tasks/doi.rake | 12 ++++++++ 3 files changed, 72 insertions(+), 10 deletions(-) create mode 100644 app/jobs/doi_import_one_job.rb diff --git a/app/jobs/doi_import_one_job.rb b/app/jobs/doi_import_one_job.rb new file mode 100644 index 000000000..e88d54f12 --- /dev/null +++ b/app/jobs/doi_import_one_job.rb @@ -0,0 +1,14 @@ +class DoiImportOneJob < ActiveJob::Base + queue_as :lupo_background + + rescue_from(Elasticsearch::Transport::Transport::Errors::BadRequest) do |e| + logger = Logger.new(STDOUT) + logger.info("[Import DOI] Failed to index a doi, exception was: " + e.message) + end + + def perform(doi) + logger = Logger.new(STDOUT) + logger.info("[Import DOI] Attempting to import doi: " + doi) + Doi.import_one(doi_id: doi) + end + end \ No newline at end of file diff --git a/app/models/doi.rb b/app/models/doi.rb index b765cd512..cd8e823bc 100644 --- a/app/models/doi.rb +++ b/app/models/doi.rb @@ -2,7 +2,7 @@ class Doi < ActiveRecord::Base audited only: [:doi, :url, :creators, :contributors, :titles, :publisher, :publication_year, :types, :descriptions, :container, :sizes, :formats, :version_info, :language, :dates, :identifiers, :related_identifiers, :funding_references, :geo_locations, :rights_list, :subjects, :schema_version, :content_url, :landing_page, :aasm_state, :source, :reason] - + include Metadatable include Cacheable include Licensable @@ -382,7 +382,7 @@ def self.import_one(doi_id: nil) logger.error "[MySQL] No metadata for DOI " + doi.doi + " found: " + doi.current_metadata.inspect return nil end - + meta = doi.read_datacite(string: string, sandbox: doi.sandbox) attrs = %w(creators contributors titles publisher publication_year types descriptions container sizes formats language dates identifiers related_identifiers funding_references geo_locations rights_list subjects content_url).map do |a| [a.to_sym, meta[a]] @@ -411,6 +411,42 @@ def self.import_all(options={}) (from_date..until_date).to_a.length end + def self.import_missing_by_empty_attribute(options={}) + logger = Logger.new(STDOUT) + + attribute = options[:attribute] + + # Find all dois by missing attribute + count = 0 + + query = options[:query] || "-_exists_:#{attribute} -aasm_state:draft" + size = (options[:size] || 1000).to_i + + response = Doi.query(query, page: { size: 1, cursor: 0 }) + logger.info "[Metadata Missing Fix] #{response.results.total} DOIs found missing attribute #{attribute}." + + if response.results.total > 0 + # walk through results using cursor + cursor = 0 + + while response.results.results.length > 0 do + response = Doi.query(query, page: { size: size, cursor: cursor }) + break unless response.results.results.length > 0 + + logger.info "[Metadata Missing Fix] Attempting fix for #{response.results.results.length} DOIs starting with _id #{cursor + 1}." + cursor = response.results.results.last[:sort].first.to_i + + response.results.results.each do |d| + # Import One as a background job + DoiImportOneJob.perform_later(d.doi) + end + end + end + + response.results.total + end + + def self.import_missing(options={}) from_date = options[:from_date].present? ? Date.parse(options[:from_date]) : Date.current until_date = options[:until_date].present? ? Date.parse(options[:until_date]) : Date.current @@ -454,12 +490,12 @@ def self.import_by_day(options={}) logger.error "[MySQL] No metadata for DOI " + doi.doi + " found." return nil end - + meta = doi.read_datacite(string: string, sandbox: doi.sandbox) attrs = %w(creators contributors titles publisher publication_year types descriptions container sizes formats language dates identifiers related_identifiers funding_references geo_locations rights_list subjects content_url).map do |a| [a.to_sym, meta[a]] end.to_h.merge(schema_version: meta["schema_version"] || "http://datacite.org/schema/kernel-4", version_info: meta["version"], xml: string) - + # update_columns will NOT trigger validations and Elasticsearch indexing doi.update_columns(attrs) @@ -471,7 +507,7 @@ def self.import_by_day(options={}) count += 1 end end - + if count > 0 logger.info "[MySQL] Imported metadata for #{count} DOIs created on #{options[:from_date]}." end @@ -511,12 +547,12 @@ def self.import_by_day_missing(options={}) logger.error "[MySQL] No metadata for DOI " + doi.doi + " found." return nil end - + meta = doi.read_datacite(string: string, sandbox: doi.sandbox) attrs = %w(creators contributors titles publisher publication_year types descriptions container sizes formats language dates identifiers related_identifiers funding_references geo_locations rights_list subjects content_url).map do |a| [a.to_sym, meta[a]] end.to_h.merge(schema_version: meta["schema_version"] || "http://datacite.org/schema/kernel-4", version_info: meta["version"], xml: string) - + # update_columns will NOT trigger validations and Elasticsearch indexing doi.update_columns(attrs) @@ -528,7 +564,7 @@ def self.import_by_day_missing(options={}) count += 1 end end - + if count > 0 logger.info "[MySQL] Imported metadata for #{count} DOIs created on #{options[:from_date]}." end @@ -749,7 +785,7 @@ def validatable? # providers europ and ethz do their own handle registration, so fetch url from handle system instead def update_url return nil if current_user.nil? || !is_registered_or_findable? - + if %w(europ ethz).include?(provider_id) || %w(Crossref).include?(agency) UrlJob.perform_later(doi) else @@ -919,7 +955,7 @@ def self.transfer(options={}) if options[:client_id] && options[:target_id] && response.results.total > 0 # walk through results using cursor cursor = 0 - + while response.results.results.length > 0 do response = Doi.query(query, client_id: options[:client_id], page: { size: size, cursor: cursor }) break unless response.results.results.length > 0 diff --git a/lib/tasks/doi.rake b/lib/tasks/doi.rake index 0502c72bc..25d71b6b8 100644 --- a/lib/tasks/doi.rake +++ b/lib/tasks/doi.rake @@ -71,6 +71,18 @@ namespace :doi do Doi.import_missing(from_date: from_date, until_date: until_date) end + desc 'Import missing DOIs by empty attribute' + task :import_missing_by_empty_attribute => :environment do + if ENV['ATTRIBUTE'].present? + attribute = ENV['ATTRIBUTE'] + else + puts "ENV['ATTRIBUTE'] is required" + exit + end + + Doi.import_missing_by_empty_attribute(attribute: attribute) + end + desc 'Import one DOI' task :import_one => :environment do if ENV['DOI'].nil?