From 85bb9241cf27eba7d9baeee4a1fc7c8e2d19b22d Mon Sep 17 00:00:00 2001 From: kjgarza Date: Thu, 13 Feb 2020 17:03:32 +0100 Subject: [PATCH 1/2] rake task to label datacite-crossref events with wrong subj --- app/jobs/subj_check_job.rb | 15 +++++++++++++++ app/models/event.rb | 32 ++++++++------------------------ 2 files changed, 23 insertions(+), 24 deletions(-) create mode 100644 app/jobs/subj_check_job.rb diff --git a/app/jobs/subj_check_job.rb b/app/jobs/subj_check_job.rb new file mode 100644 index 000000000..24a960118 --- /dev/null +++ b/app/jobs/subj_check_job.rb @@ -0,0 +1,15 @@ +# Background job that labels events whose subj_id DOI prefix matches no Prefix record. +class SubjCheckJob < ActiveJob::Base + queue_as :lupo_background + + # events: hashes carrying :id and :subj_id; options is accepted but not used here. + def perform(events, options = {}) + events.lazy.each do |event| + subj_prefix = event[:subj_id][/(10\.\d{4,5})/, 1] + # A wrong subj is one whose prefix matches NO Prefix record, hence `unless`. + unless Prefix.where(prefix: subj_prefix).exists? + Event.find_by(id: event[:id]).update_attribute(:state_event, "subjId_error") + end + end + end +end diff --git a/app/models/event.rb b/app/models/event.rb index c6e164534..a8b22e16d 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -284,6 +284,11 @@ def self.advanced_aggregations } end + def self.state_aggregations + { states: { terms: { field: "state_event", size: 50, min_doc_count: 1 } }} + end + + # return results for one or more ids def self.find_by_id(ids, options = {}) ids = ids.split(",") if ids.is_a?(String) @@ -533,10 +538,8 @@ def access_method end def self.subj_id_check(options = {}) - file_name = "evens_with_double_crossref_dois.txt" size = (options[:size] || 1000).to_i cursor = [options[:from_id], options[:until_id]] - total_errors = 0 response = Event.query(nil, source_id: "datacite-crossref", page: { size: 1, cursor: [] }) Rails.logger.warn "[DoubleCheck] #{response.results.total} events for source datacite-crossref." @@ -549,29 +552,10 @@ def self.subj_id_check(options = {}) Rails.logger.warn "[DoubleCheck] DoubleCheck #{response.results.results.length} events starting with _id #{response.results.to_a.first[:_id]}." 
cursor = response.results.to_a.last[:sort] - Rails.logger.warn "[DoubleCheck] Cursor: #{cursor} " - - # dois = response.results.results.map(&:subj_id) - events = response.results.results - events.lazy.each do | event| - subj_prefix = event.subj_id[/(10\.\d{4,5})/, 1] - if Prefix.where(prefix: subj_prefix).empty? - File.open(file_name, "a+") do |f| - f.write(event.uuid, "\n") - total_errors = total_errors + 1 - end - end - end - end - end - file = File.open(file_name) - if file.present? - payload = { description: "events_with_errors_from_rake_task #{Time.now.getutc}", public: true,files: {uids_with_errors: {content: file.read} }} - ### max file size 1MB - response = Maremma.post("https://api.github.com/gists", data: payload.to_json, username: ENV["GIST_USERNAME"], password:ENV["GIST_PASSWORD"]) - Rails.logger.warn "[DoubleCheck] Total number of events with Errors: #{total_errors}" - Rails.logger.warn "[DoubleCheck] IDs saved: #{response.body.dig('data','url')}" if [200,201].include?(response.status) + events = response.results.results.map { |item| { id: item.id, subj_id: item.subj_id } } + SubjCheckJob.perform_later(events, options) + end end end From e42b388fa2971ff7e1e5b082ce17a0e64c055b25 Mon Sep 17 00:00:00 2001 From: kjgarza Date: Thu, 13 Feb 2020 17:03:55 +0100 Subject: [PATCH 2/2] query and agg to find datacite-crossref events with wrong subj --- app/controllers/events_controller.rb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/app/controllers/events_controller.rb b/app/controllers/events_controller.rb index 253c514c4..925169913 100644 --- a/app/controllers/events_controller.rb +++ b/app/controllers/events_controller.rb @@ -106,6 +106,7 @@ def index year_month: params[:year_month], aggregations: params[:aggregations], unique: params[:unique], + state_event: params[:state], scroll_id: params[:scroll_id], page: page, sort: sort) @@ -156,6 +157,8 @@ def index downloads_histogram = nil unique_obj_count = nil unique_subj_count = nil + states = nil + bm = 
Benchmark.ms { aggregations = params.fetch(:aggregations, "") || "" @@ -181,6 +184,8 @@ def index # downloads = total.positive? ? EventsQuery.new.downloads(params[:doi]) : nil unique_obj_count = total.positive? && aggregations.include?("advanced_aggregations") ? response.response.aggregations.unique_obj_count.value : nil unique_subj_count = total.positive? && aggregations.include?("advanced_aggregations") ? response.response.aggregations.unique_subj_count.value : nil + states = total.positive? && aggregations.include?("state_aggregations") ? facet_by_source(response.response.aggregations.states.buckets) : nil + } Rails.logger.warn method: "GET", path: "/events", message: "Aggregations /events", duration: bm @@ -204,6 +209,7 @@ def index "uniqueCitations": citations, "references": references, "relations": relations, + "states": states, "uniqueNodes": { "objCount": unique_obj_count, "subjCount": unique_subj_count