diff --git a/app/models/doi.rb b/app/models/doi.rb index 4349a64d0..16a3ed9e5 100644 --- a/app/models/doi.rb +++ b/app/models/doi.rb @@ -440,41 +440,55 @@ def self.import_by_day(options={}) def self.import_by_day_missing(options={}) return nil unless options[:from_date].present? from_date = Date.parse(options[:from_date]) - client_id = options[:client_id] count = 0 logger = Logger.new(STDOUT) - collection = Doi.where(xml: nil).where(created: from_date.midnight..from_date.end_of_day) - collection = collection.where(datacentre: client_id) if client_id.present? + response = Doi.query("-creators:* +created:[#{from_date.strftime("%F")} TO #{from_date.strftime("%F")}]", page: { size: 0, cursor: 1 }) + logger.info "#{response.results.total} DOIs found with missing metadata created on #{from_date.strftime("%F")}." - collection.find_each do |doi| - begin - string = doi.current_metadata.present? ? doi.clean_xml(doi.current_metadata.xml) : nil - unless string.present? - logger.error "[MySQL] No metadata for DOI " + doi.doi + " found." - return nil + if response.results.total > 0 + # walk through results using cursor + prev_cursor = 0 + cursor = 1 + + while cursor > prev_cursor do + response = Doi.query("-creators:* +created:[#{from_date.strftime("%F")} TO #{from_date.strftime("%F")}]", page: { size: 1000, cursor: cursor }) + prev_cursor = cursor + cursor = Array.wrap(response.results.results.last.to_h[:sort]).first.to_i + + response.records.each do |doi| + begin + # ignore broken xml + string = doi.current_metadata.present? ? doi.from_xml(doi.current_metadata.xml.to_s.force_encoding("UTF-8")) : nil + unless string.present? + logger.error "[MySQL] No metadata for DOI " + doi.doi + " found." + return nil + end + + meta = doi.read_datacite(string: string, sandbox: doi.sandbox) + attrs = %w(creators contributors titles publisher publication_year types descriptions container sizes formats language dates identifiers related_identifiers funding_references geo_locations rights_list subjects content_url).map do |a| + [a.to_sym, meta[a]] + end.to_h.merge(schema_version: meta["schema_version"] || "http://datacite.org/schema/kernel-4", version_info: meta["version"], xml: string) + + # update_columns will NOT trigger validations and Elasticsearch indexing + doi.update_columns(attrs) + + doi.__elasticsearch__.index_document + rescue TypeError, NoMethodError, RuntimeError, ActiveRecord::StatementInvalid, ActiveRecord::LockWaitTimeout => error + logger.error "[MySQL] Error importing metadata for " + doi.doi + ": " + error.message + Bugsnag.notify(error) + else + count += 1 + end + end + + if count > 0 + logger.info "[MySQL] Imported metadata for #{count} DOIs created on #{options[:from_date]}." end - - meta = doi.read_datacite(string: string, sandbox: doi.sandbox) - attrs = %w(creators contributors titles publisher publication_year types descriptions container sizes formats language dates identifiers related_identifiers funding_references geo_locations rights_list subjects content_url).map do |a| - [a.to_sym, meta[a]] - end.to_h.merge(schema_version: meta["schema_version"] || "http://datacite.org/schema/kernel-4", version_info: meta["version"], xml: string) - - # update_columns will NOT trigger validations and Elasticsearch indexing - doi.update_columns(attrs) - rescue TypeError, NoMethodError, RuntimeError, ActiveRecord::StatementInvalid, ActiveRecord::LockWaitTimeout => error - logger.error "[MySQL] Error importing metadata for " + doi.doi + ": " + error.message - Bugsnag.notify(error) - else - count += 1 end end - - if count > 0 - logger.info "[MySQL] Imported metadata for #{count} DOIs created on #{options[:from_date]}." - end end def self.index(options={}) @@ -768,7 +782,7 @@ def self.set_handle while cursor > prev_cursor do response = Doi.query("-registered:* +url:* -aasm_state:draft -provider_id:ethz -provider_id:europ", page: { size: 1000, cursor: cursor }) prev_cursor = cursor - cursor = Array.wrap(response.results.results.last.to_h[:sort]).first + cursor = Array.wrap(response.results.results.last.to_h[:sort]).first.to_i response.results.results.each do |d| HandleJob.perform_later(d.doi) @@ -791,7 +805,7 @@ def self.set_url while cursor > prev_cursor do response = Doi.query("-url:* (+provider_id:ethz OR -aasm_status:draft)", page: { size: 1000, cursor: cursor }) prev_cursor = cursor - cursor = Array.wrap(response.results.results.last.to_h[:sort]).first + cursor = Array.wrap(response.results.results.last.to_h[:sort]).first.to_i response.results.results.each do |d| UrlJob.perform_later(d.doi) @@ -814,7 +828,7 @@ def self.set_minted while cursor > prev_cursor do response = Doi.query("url:* +provider_id:ethz +aasm_state:draft", page: { size: 1000, cursor: cursor }) prev_cursor = cursor - cursor = Array.wrap(response.results.results.last.to_h[:sort]).first + cursor = Array.wrap(response.results.results.last.to_h[:sort]).first.to_i response.results.results.each do |d| UrlJob.perform_later(d.doi) diff --git a/config/initializers/_version.rb b/config/initializers/_version.rb index 9bbd4864c..1e043e44b 100644 --- a/config/initializers/_version.rb +++ b/config/initializers/_version.rb @@ -1,5 +1,5 @@ module Lupo class Application - VERSION = "2.3.30" + VERSION = "2.3.31" end end \ No newline at end of file