From 7034e81694d636cb0d1cd3068487edc5a64f572a Mon Sep 17 00:00:00 2001
From: Martin Fenner
Date: Sat, 19 Dec 2020 10:33:25 +0100
Subject: [PATCH] add import dois not indexed rake task datacite/datacite#965

---
Review notes (not part of the commit message):
- Doi.import_by_client: total_count defaults to nil, so the per-batch log
  guard now accepts nil. `nil > 500` raises NoMethodError, which the rescue
  clause of that method does not list.
- Client.import_dois_not_indexed: the DOI count is taken once with
  dois.count instead of calling dois.length three times.
- Context-line indentation was reconstructed; verify against the target tree.

 app/models/client.rb  | 26 ++++++++++++++++++++++++++
 app/models/doi.rb     | 16 ++++++++++------
 lib/tasks/client.rake |  5 +++++
 3 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/app/models/client.rb b/app/models/client.rb
index 077ea98fe..422dbf760 100644
--- a/app/models/client.rb
+++ b/app/models/client.rb
@@ -712,6 +712,32 @@ def self.export_doi_counts(query: nil)
     csv.join("")
   end
 
+  # import all DOIs not indexed in Elasticsearch.
+  # query is passed through to export_doi_counts, whose CSV output
+  # (rows carry a "Repository ID" column) drives the import.
+  def self.import_dois_not_indexed(query: nil)
+    table = CSV.parse(export_doi_counts(query: query), headers: true)
+
+    # loop through repositories that have DOIs not indexed in Elasticsearch
+    table.each do |row|
+      client = ::Client.where(deleted_at: nil).where(symbol: row["Repository ID"]).first
+      if client.nil?
+        puts "Client not found for client ID #{row["Repository ID"]}."
+        exit
+      end
+
+      # count once in the database; assuming dois is an ActiveRecord
+      # association, dois.length would load every DOI record into memory
+      doi_count = client.dois.count
+
+      # import DOIs for client. Ignore repositories with more than 10K DOIs
+      if doi_count <= 10000
+        puts "#{doi_count} DOIs for repository #{client.symbol} will be imported."
+        Doi.import_by_client(client_id: client.symbol, total_count: doi_count)
+      end
+    end
+  end
+
   protected
     def check_issn
       Array.wrap(issn).each do |i|
diff --git a/app/models/doi.rb b/app/models/doi.rb
index 6e191fcef..21b01bcba 100644
--- a/app/models/doi.rb
+++ b/app/models/doi.rb
@@ -1243,7 +1243,7 @@ def self.import_one(doi_id: nil)
     message
   end
 
-  def self.import_by_client(client_id: nil)
+  def self.import_by_client(client_id: nil, total_count: nil)
     client = ::Client.where(symbol: client_id).first
     return nil if client.blank?
 
@@ -1271,18 +1271,22 @@ def self.import_by_client(client_id: nil)
       end
 
       count += dois.length
-      Rails.logger.info "[Elasticsearch] Imported #{count} DOIs for client #{client_id}."
+      # total_count defaults to nil (the client rake task omits it); keep
+      # logging every batch in that case instead of raising on nil > 500
+      if total_count.nil? || total_count > 500
+        Rails.logger.info "[Elasticsearch] Imported #{count} DOIs for repository #{client_id}."
+      end
     end
 
     if errors > 1
-      Rails.logger.error "[Elasticsearch] #{errors} errors importing #{count} DOIs for client #{client_id}."
+      Rails.logger.error "[Elasticsearch] #{errors} errors importing #{count} DOIs for repository #{client_id}."
     elsif count > 0
-      Rails.logger.info "[Elasticsearch] Imported a total of #{count} DOIs for client #{client_id}."
+      Rails.logger.info "[Elasticsearch] Imported a total of #{count} DOIs for repository #{client_id}."
     end
 
     count
   rescue Elasticsearch::Transport::Transport::Errors::RequestEntityTooLarge, Faraday::ConnectionFailed, ActiveRecord::LockWaitTimeout => e
-    Rails.logger.error "[Elasticsearch] Error #{e.message} importing DOIs for client #{client_id}."
+    Rails.logger.error "[Elasticsearch] Error #{e.message} importing DOIs for repository #{client_id}."
   end
 
   def self.index_by_id(options = {})
@@ -2095,7 +2099,7 @@ def self.loop_through_dois(options = {})
     query = options[:query].presence
 
     response = Doi.query(query, filter.merge(page: { size: 1, cursor: [] }))
-    message = "#{label} #{response.results.total} Dois with #{label}."
+    message = "#{label} #{response.results.total} Dois."
 
     # walk through results using cursor
     if response.results.total.positive?
diff --git a/lib/tasks/client.rake b/lib/tasks/client.rake
index 136cc00ce..7a9bada66 100644
--- a/lib/tasks/client.rake
+++ b/lib/tasks/client.rake
@@ -102,6 +102,11 @@ namespace :client do
     Doi.import_by_client(client_id: ENV["CLIENT_ID"])
   end
 
+  desc "Import dois not indexed"
+  task import_dois_not_indexed: :environment do
+    puts Client.import_dois_not_indexed(query: ENV["QUERY"])
+  end
+
   desc "Export doi counts"
   task export_doi_counts: :environment do
     puts Client.export_doi_counts(query: ENV["QUERY"])