From 7efed60046a13f2511d64a64a4a5973642350802 Mon Sep 17 00:00:00 2001 From: Martin Fenner Date: Sat, 1 Sep 2018 15:07:30 +0200 Subject: [PATCH] bulk updates of index. #98 --- app/jobs/doi_index_by_day_job.rb | 7 +++++++ app/jobs/doi_index_by_month_job.rb | 7 ------- app/models/doi.rb | 25 ++++++++++++++++++------- lib/tasks/doi.rake | 17 ++++++++--------- spec/lib/tasks/doi_rake_spec.rb | 10 +++++----- 5 files changed, 38 insertions(+), 28 deletions(-) create mode 100644 app/jobs/doi_index_by_day_job.rb delete mode 100644 app/jobs/doi_index_by_month_job.rb diff --git a/app/jobs/doi_index_by_day_job.rb b/app/jobs/doi_index_by_day_job.rb new file mode 100644 index 000000000..99e2dea93 --- /dev/null +++ b/app/jobs/doi_index_by_day_job.rb @@ -0,0 +1,7 @@ +class DoiIndexByDayJob < ActiveJob::Base + queue_as :lupo_background + + def perform(options={}) + Doi.index_by_day(options) + end +end \ No newline at end of file diff --git a/app/jobs/doi_index_by_month_job.rb b/app/jobs/doi_index_by_month_job.rb deleted file mode 100644 index babada36e..000000000 --- a/app/jobs/doi_index_by_month_job.rb +++ /dev/null @@ -1,7 +0,0 @@ -class DoiIndexByMonthJob < ActiveJob::Base - queue_as :lupo_background - - def perform(options={}) - Doi.index(options) - end -end \ No newline at end of file diff --git a/app/models/doi.rb b/app/models/doi.rb index e6b4f4b1c..188e9a93f 100644 --- a/app/models/doi.rb +++ b/app/models/doi.rb @@ -182,6 +182,7 @@ def as_indexed_json(options={}) "suffix" => suffix, "resource_type_id" => resource_type_id, "resource_type_subtype" => resource_type_subtype, + "alternate_identifier" => alternate_identifier, "b_version" => b_version, "is_active" => is_active, "last_landing_page_status" => last_landing_page_status, @@ -235,23 +236,33 @@ def self.find_by_id(id, options={}) }) end - def self.index_by_month(options={}) + def self.index(options={}) from_date = (options[:from_date].present? ? Date.parse(options[:from_date]) : Date.current).beginning_of_month until_date = (options[:until_date].present? ? Date.parse(options[:until_date]) : Date.current).end_of_month # get first day of every month between from_date and until_date - (from_date..until_date).select {|d| d.day == 1}.each do |m| - DoiIndexByMonthJob.perform_later(from_date: m.strftime("%F"), until_date: m.end_of_month.strftime("%F")) + (from_date..until_date).each do |d| + DoiIndexByDayJob.perform_later(from_date: d.strftime("%F")) end "Queued indexing for DOIs updated from #{from_date.strftime("%F")} until #{until_date.strftime("%F")}." end - def self.index(options={}) - from_date = options[:from_date].present? ? Date.parse(options[:from_date]) : Date.current - 1.day - until_date = options[:until_date].present? ? Date.parse(options[:until_date]) : Date.current + def self.index_by_day(options={}) + from_date = options[:from_date].present? ? Date.parse(options[:from_date]) : Date.current + until_date = from_date + 1.day + errors = 0 + + Doi.where("updated >= ?", from_date.strftime("%F") + " 00:00:00").where("updated <= ?", until_date.strftime("%F") + " 00:00:00").find_in_batches(batch_size: 1000) do |dois| + response = Doi.__elasticsearch__.client.bulk \ + index: Doi.index_name, + type: Doi.document_type, + body: dois.map { |doi| { index: { _id: doi.id, data: doi.as_indexed_json } } } + + errors += response['items'].map { |k, v| k.values.first['error'] }.compact.length + end - Doi.import query: -> { where("updated >= ?", from_date.strftime("%F")).where("updated <= ?", until_date.strftime("%F")) }, batch_size: 1000 + errors end def uid diff --git a/lib/tasks/doi.rake b/lib/tasks/doi.rake index 13ac5ecfe..94f6a8cbc 100644 --- a/lib/tasks/doi.rake +++ b/lib/tasks/doi.rake @@ -7,22 +7,21 @@ namespace :doi do end end - desc 'Index all DOIs by month' - task :index_by_month => :environment do + desc 'Index all DOIs' + task :index => :environment do from_date = ENV['FROM_DATE'] || Date.current.beginning_of_month.strftime("%F") until_date = ENV['UNTIL_DATE'] || Date.current.end_of_month.strftime("%F") - response = Doi.index_by_month(from_date: from_date, until_date: until_date) + response = Doi.index(from_date: from_date, until_date: until_date) puts response end - desc 'Index all DOIs' - task :index => :environment do - from_date = ENV['FROM_DATE'] || (Date.current - 1.day).strftime("%F") - until_date = ENV['UNTIL_DATE'] || Date.current.strftime("%F") + desc 'Index DOIs per day' + task :index_by_day => :environment do + from_date = ENV['FROM_DATE'] || Date.current.strftime("%F") - Doi.index(from_date: from_date, until_date: until_date) - puts "Queued indexing for DOIs updated from #{from_date} - #{until_date}." + count = Doi.index_by_day(from_date: from_date) + puts "DOIs updated on #{from_date} indexed with #{count} errors." end desc 'Set state' diff --git a/spec/lib/tasks/doi_rake_spec.rb b/spec/lib/tasks/doi_rake_spec.rb index 38c74e44e..ccf530228 100644 --- a/spec/lib/tasks/doi_rake_spec.rb +++ b/spec/lib/tasks/doi_rake_spec.rb @@ -1,6 +1,6 @@ require 'rails_helper' -describe "doi:index_by_month", elasticsearch: true do +describe "doi:index", elasticsearch: true do include ActiveJob::TestHelper include_context "rake" @@ -21,17 +21,17 @@ it "should enqueue an DoiIndexByMonthJob" do expect { capture_stdout { subject.invoke } - }.to change(enqueued_jobs, :size).by(8) - expect(enqueued_jobs.last[:job]).to be(DoiIndexByMonthJob) + }.to change(enqueued_jobs, :size).by(243) + expect(enqueued_jobs.last[:job]).to be(DoiIndexByDayJob) end end -describe "doi:index", elasticsearch: true do +describe "doi:index_by_day", elasticsearch: true do include ActiveJob::TestHelper include_context "rake" let!(:doi) { create_list(:doi, 10) } - let(:output) { "Queued indexing for DOIs updated from 2018-01-04 - 2018-08-05.\n" } + let(:output) { "DOIs updated on 2018-01-04 indexed with 0 errors.\n" } it "prerequisites should include environment" do expect(subject.prerequisites).to include("environment")