From 495b4afe561c5d30bb4ec383e92950699d52d858 Mon Sep 17 00:00:00 2001
From: Martin Fenner
Date: Fri, 4 Jan 2019 00:22:34 +0100
Subject: [PATCH] support indexing by id

---
Revised after review; the index lines below still carry the blob ids of
the original commit.
 * Doi.index_by_id reports a single error and a single indexed DOI
   (thresholds are "> 0" instead of "> 1").
 * Doi.index_by_ids prints the ID range of the block it just queued
   rather than the overall range on every iteration.
 * The fallback path says "Queued indexing" because it only enqueues
   IndexJobs.

 app/jobs/doi_index_by_id_job.rb | 11 +++++
 app/models/doi.rb               | 70 +++++++++++++++++++++++++++++++++
 config/initializers/_version.rb |  2 +-
 lib/tasks/doi.rake              |  9 ++++
 4 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 app/jobs/doi_index_by_id_job.rb

diff --git a/app/jobs/doi_index_by_id_job.rb b/app/jobs/doi_index_by_id_job.rb
new file mode 100644
index 000000000..7689e4a7c
--- /dev/null
+++ b/app/jobs/doi_index_by_id_job.rb
@@ -0,0 +1,11 @@
+# Background job that indexes one block of DOIs, identified by the
+# database ID the block starts at.
+class DoiIndexByIdJob < ActiveJob::Base
+  queue_as :lupo_background
+
+  # options - Hash handed straight through to Doi.index_by_id
+  #           (which reads :id, the first database ID of the block).
+  def perform(options={})
+    Doi.index_by_id(options)
+  end
+end
diff --git a/app/models/doi.rb b/app/models/doi.rb
index 9e45a6202..90d736f92 100644
--- a/app/models/doi.rb
+++ b/app/models/doi.rb
@@ -521,6 +521,76 @@ def self.index_by_day(options={})
     logger.info "[Elasticsearch] Indexed #{count} DOIs created on #{options[:from_date]}."
   end
 
+  # Queue one DoiIndexByIdJob per block of 250 consecutive database IDs.
+  # Each job always covers 250 IDs, so the last block may extend past
+  # until_id.
+  #
+  # options - Hash with optional keys:
+  #           :from_id  - first ID to index (default: 1)
+  #           :until_id - last ID to index (default: from_id + 249)
+  def self.index_by_ids(options={})
+    from_id = (options[:from_id] || 1).to_i
+    until_id = (options[:until_id] || from_id + 249).to_i
+
+    # step through the range in blocks of 250 ids, from from_id to until_id
+    (from_id..until_id).step(250).each do |id|
+      DoiIndexByIdJob.perform_later(id: id)
+      puts "Queued indexing for DOIs with IDs #{id} - #{(id + 249)}."
+    end
+  end
+
+  # Bulk-index the DOIs with IDs options[:id] .. options[:id] + 249 into
+  # Elasticsearch. If the bulk request raises one of the rescued errors,
+  # an IndexJob is queued for every DOI in that range instead.
+  #
+  # options - Hash with key :id, the first database ID of the block.
+  #
+  # Returns nil when options[:id] is blank.
+  def self.index_by_id(options={})
+    return nil unless options[:id].present?
+    id = options[:id].to_i
+
+    errors = 0
+    count = 0
+
+    logger = Logger.new(STDOUT)
+
+    Doi.where(id: id..(id + 249)).find_in_batches(batch_size: 250) do |dois|
+      response = Doi.__elasticsearch__.client.bulk \
+        index: Doi.index_name,
+        type: Doi.document_type,
+        body: dois.map { |doi| { index: { _id: doi.id, data: doi.as_indexed_json } } }
+
+      # log every item the bulk response flags with an error
+      failed = response['items'].select { |item| item.values.first['error'].present? }
+      failed.each do |err|
+        logger.error "[Elasticsearch] " + err.inspect
+      end
+
+      errors += failed.length
+      count += dois.length
+    end
+
+    # report even a single error or a single indexed DOI
+    if errors > 0
+      logger.error "[Elasticsearch] #{errors} errors indexing #{count} DOIs with IDs #{id} - #{(id + 249)}."
+    elsif count > 0
+      logger.info "[Elasticsearch] Indexed #{count} DOIs with IDs #{id} - #{(id + 249)}."
+    end
+  rescue Elasticsearch::Transport::Transport::Errors::RequestEntityTooLarge, Faraday::ConnectionFailed, ActiveRecord::LockWaitTimeout => error
+    logger.info "[Elasticsearch] Error #{error.message} indexing DOIs with IDs #{id} - #{(id + 249)}."
+
+    # fall back to queueing one IndexJob per DOI in the range
+    count = 0
+
+    Doi.where(id: id..(id + 249)).find_each do |doi|
+      IndexJob.perform_later(doi)
+      count += 1
+    end
+
+    logger.info "[Elasticsearch] Queued indexing for #{count} DOIs with IDs #{id} - #{(id + 249)}."
+  end
+
   def uid
     doi.downcase
   end
diff --git a/config/initializers/_version.rb b/config/initializers/_version.rb
index 40e27d7d4..cb765f20b 100644
--- a/config/initializers/_version.rb
+++ b/config/initializers/_version.rb
@@ -1,5 +1,5 @@
 module Lupo
   class Application
-    VERSION = "2.2.3"
+    VERSION = "2.2.4"
   end
 end
\ No newline at end of file
diff --git a/lib/tasks/doi.rake b/lib/tasks/doi.rake
index 0982b7525..5e6d05698 100644
--- a/lib/tasks/doi.rake
+++ b/lib/tasks/doi.rake
@@ -74,6 +74,15 @@ namespace :doi do
     puts "DOIs created on #{from_date} indexed."
   end
 
+  # Usage: rake doi:index_by_ids FROM_ID=1 UNTIL_ID=1000
+  desc 'Index DOIs by ID'
+  task :index_by_ids => :environment do
+    from_id = (ENV['FROM_ID'] || 1).to_i
+    until_id = (ENV['UNTIL_ID'] || from_id + 249).to_i
+
+    Doi.index_by_ids(from_id: from_id, until_id: until_id)
+  end
+
   desc 'Set minted'
   task :set_minted => :environment do
     from_date = ENV['FROM_DATE'] || Time.zone.now - 1.day