Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into schema-4.5-cleanup-r1
Browse files Browse the repository at this point in the history
  • Loading branch information
svogt0511 committed Feb 23, 2024
2 parents 4082749 + 7c7d715 commit a555b30
Show file tree
Hide file tree
Showing 14 changed files with 145 additions and 79 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/_update_terraform.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
GIT_TAG: ${{ inputs.image_tag }}
steps:
- name: Checkout terraform config repo
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
# public repo with terraform configuration
repository: 'datacite/mastino'
Expand Down Expand Up @@ -58,4 +58,3 @@ jobs:
repository: 'datacite/mastino'
branch: 'refs/heads/master'
tags: false

4 changes: 2 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Cache Docker layers
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: /tmp/.buildx-cache
key: ${{ runner.os }}-buildx-${{ github.sha }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/changelog.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
name: Generate changelog
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Generate changelog
uses: charmixer/[email protected]
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/parallel_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,12 @@ jobs:
MDS_PASSWORD: ${{ secrets.MDS_PASSWORD }}
AWS_REGION: ${{ secrets.AWS_REGION }}
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Ruby 2.7.8
uses: ruby/setup-ruby@v1
with:
ruby-version: 2.7.8
- uses: actions/cache@v3
- uses: actions/cache@v4
with:
path: vendor/bundle
key: ${{ runner.os }}-gems-${{ hashFiles('**/Gemfile.lock') }}
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/rubocop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Ruby 2.7
uses: ruby/setup-ruby@v1
with:
ruby-version: 2.7
- name: Cache gems
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: vendor/bundle
key: ${{ runner.os }}-rubocop-${{ hashFiles('**/Gemfile.lock') }}
Expand Down
3 changes: 3 additions & 0 deletions .rubocop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ Layout/EmptyLinesAroundAccessModifier:

Layout/EmptyLinesAroundBlockBody:
Enabled: true
Exclude:
- 'db/schema.rb'

# In a regular class definition, no empty lines around the body.
Layout/EmptyLinesAroundClassBody:
Expand Down Expand Up @@ -153,6 +155,7 @@ Style/FrozenStringLiteralComment:
- 'activestorage/db/update_migrate/**/*.rb'
- 'actionmailbox/db/migrate/**/*.rb'
- 'actiontext/db/migrate/**/*.rb'
- 'db/schema.rb'

Style/RedundantFreeze:
Enabled: true
Expand Down
4 changes: 2 additions & 2 deletions app/jobs/doi_import_by_client_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
class DoiImportByClientJob < ApplicationJob
queue_as :lupo_background

def perform(client_id, _options = {})
DataciteDoi.import_by_client(client_id)
def perform(client_id, **options)
DataciteDoi.import_by_client(client_id, options)
end
end
7 changes: 3 additions & 4 deletions app/models/client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ class Client < ApplicationRecord
after_update_commit :update_reference_repository
after_destroy_commit :destroy_reference_repository
after_commit on: %i[update] do
::Client.import_dois(self.symbol)
::Client.import_dois(self.id)
end

# use different index for testing
Expand Down Expand Up @@ -810,12 +810,11 @@ def self.export_doi_counts(query: nil)
csv.join("")
end

def self.import_dois(client_id)
def self.import_dois(client_id, options = {})
if client_id.blank?
Rails.logger.error "Repository not found for client ID #{client_id}."
Rails.logger.error "Missing client ID."
exit
end

DoiImportByClientJob.perform_later(client_id)
end

Expand Down
134 changes: 83 additions & 51 deletions app/models/datacite_doi.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,24 @@ class DataciteDoi < Doi
else
index_name "dois"
end
# TODO switch index
# if Rails.env.test?
# index_name "dois-datacite-test"
# elsif ENV["ES_PREFIX"].present?
# index_name"dois-datacite-#{ENV["ES_PREFIX"]}"
# else
# index_name "dois-datacite"
# end

# TODO remove query for type once STI is enabled

def self.index_all_by_client(options = {})
  # Enqueue one import job per repository, sized by how many DataciteDoi
  # rows each repository (datacentre) owns.
  # TODO remove query for type once STI is enabled
  doi_counts_by_client = DataciteDoi.where(type: "DataciteDoi").group(:datacentre).count

  # Drop the placeholder datacentre id 0 — it does not map to a real repository.
  doi_counts_by_client.delete(0)

  target_index = options[:index] || self.inactive_index
  per_batch = options[:batch_size] || 2000

  doi_counts_by_client.each_key do |client_id|
    DoiImportByClientJob.perform_later(client_id, index: target_index, batch_size: per_batch)
  end
end

def self.import_by_ids(options = {})
index =
if Rails.env.test?
Expand Down Expand Up @@ -55,56 +63,51 @@ def self.import_by_ids(options = {})
count
end

def self.import_by_client(client_id)
def self.import_by_client(client_id, options = {})
# Get optional parameters
import_index =
if Rails.env.test?
index_name
elsif options[:index].present?
options[:index]
else
active_index
end
batch_size = options[:batch_size] || 50

# Abort if client_id is blank
if client_id.blank?
Rails.logger.error "Missing client ID."
exit
end

client = ::Client.where(deleted_at: nil).where(symbol: client_id).first
# Search by proper ID
client = ::Client.find_by(id: client_id, deleted_at: nil)
if client.nil?
Rails.logger.error "Repository not found for client ID #{client_id}."
exit
# Search by symbol
client = ::Client.find_by(symbol: client_id, deleted_at: nil)
if client.nil?
Rails.logger.error "Repository not found for client ID #{client_id}."
exit
end
end

# import DOIs for client
Rails.logger.info "Started import of #{client.dois.count} DOIs for repository #{client_id}."
Rails.logger.info "Started import of #{client.dois.count} DOIs for repository #{client.symbol} into the index '#{import_index}'"

DataciteDoi.where(datacentre: client.id).
find_in_batches(batch_size: 50) do |dois|
client.dois.find_in_batches(batch_size: batch_size) do |dois|
ids = dois.pluck(:id)
DataciteDoiImportInBulkJob.perform_later(ids, index: self.active_index)
DataciteDoiImportInBulkJob.perform_later(ids, index: import_index)
end
end

def self.import_in_bulk(ids, options = {})
index =
if Rails.env.test?
index_name
elsif options[:index].present?
options[:index]
else
inactive_index
end
def self.upload_to_elasticsearch(index, bulk_body)
number_of_dois = bulk_body.length
errors = 0

# get database records from array of database ids
dois = DataciteDoi.where(id: ids)

response =
DataciteDoi.__elasticsearch__.client.bulk index: index,
type:
DataciteDoi.document_type,
body:
dois.map { |doi|
{
index: {
_id: doi.id,
data:
doi.as_indexed_json,
},
}
}
body: bulk_body

# report errors
if response["errors"]
Expand All @@ -118,22 +121,51 @@ def self.import_in_bulk(ids, options = {})

if errors > 1
Rails.logger.error "[Elasticsearch] #{errors} errors importing #{
dois.length
number_of_dois
} DataCite DOIs."
elsif dois.length > 0
Rails.logger.debug "[Elasticsearch] Imported #{
dois.length
} DataCite DOIs."
elsif number_of_dois > 0
Rails.logger.debug "[Elasticsearch] Imported #{number_of_dois} DataCite DOIs."
end

dois.length
number_of_dois
rescue Elasticsearch::Transport::Transport::Errors::RequestEntityTooLarge,
Aws::SQS::Errors::RequestEntityTooLarge,
Faraday::ConnectionFailed,
ActiveRecord::LockWaitTimeout => e

Rails.logger.error "[Elasticsearch] Error #{e.class} with message #{
e.message
} importing DataCite DOIs."
Rails.logger.error "[Elasticsearch] Error #{e.class} with message #{e.message} importing DataCite DOIs."
end


# Index the given DOI database ids into Elasticsearch in bulk.
def self.import_in_bulk(ids, options = {})
  # Batches default to 50 so a single Elasticsearch bulk request never
  # grows too large; the overall ids list may therefore be very long.
  per_request = options[:batch_size] || 50

  target_index =
    if Rails.env.test?
      index_name
    elsif options[:index].present?
      options[:index]
    else
      inactive_index
    end

  # Load the records (with their client association) from the id list,
  # then ship them to Elasticsearch one batch at a time.
  scope = DataciteDoi.where(id: ids).includes(:client)
  scope.find_in_batches(batch_size: per_request) do |batch|
    payload = batch.map do |doi|
      {
        index: {
          _id: doi.id,
          data: doi.as_indexed_json,
        },
      }
    end
    upload_to_elasticsearch(target_index, payload)
  end
end
end
7 changes: 7 additions & 0 deletions db/migrate/20240209113122_remove_globus_uuid_index.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# frozen_string_literal: true

# Drops the index on allocator.globus_uuid.
class RemoveGlobusUuidIndex < ActiveRecord::Migration[6.1]
  def change
    # Naming the column (rather than the index name) keeps this migration
    # reversible: Rails can re-create the index on rollback.
    remove_index :allocator, column: :globus_uuid
  end
end
9 changes: 9 additions & 0 deletions db/migrate/20240209120111_add_missing_indexes_to_allocator.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# frozen_string_literal: true

# Adds indexes on allocator columns: deleted_at, role_name, and ror_id.
class AddMissingIndexesToAllocator < ActiveRecord::Migration[6.1]
  def change
    # Explicit index names keep the schema stable across databases and
    # make the indexes easy to reference in later migrations.
    add_index :allocator, :deleted_at, name: "index_allocator_deleted_at"
    add_index :allocator, :role_name, name: "index_allocator_role_name"
    add_index :allocator, :ror_id, name: "index_allocator_ror_id"
  end
end
9 changes: 5 additions & 4 deletions db/schema.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# frozen_string_literal: true

# This file is auto-generated from the current state of the database. Instead
# of editing this file, please use the migrations feature of Active Record to
# incrementally modify your database, and then regenerate this schema definition.
Expand All @@ -12,7 +10,8 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 2023_10_04_145109) do
ActiveRecord::Schema.define(version: 2024_02_09_120111) do

create_table "active_storage_attachments", charset: "utf8mb4", force: :cascade do |t|
t.string "name", limit: 191, null: false
t.string "record_type", null: false
Expand Down Expand Up @@ -85,8 +84,10 @@
t.bigint "logo_file_size"
t.datetime "logo_updated_at"
t.integer "doi_estimate", default: 0, null: false
t.index ["globus_uuid"], name: "index_allocator_on_globus_uuid"
t.index ["deleted_at"], name: "index_allocator_deleted_at"
t.index ["organization_type"], name: "index_allocator_organization_type"
t.index ["role_name"], name: "index_allocator_role_name"
t.index ["ror_id"], name: "index_allocator_ror_id"
t.index ["symbol"], name: "symbol", unique: true
end

Expand Down
26 changes: 25 additions & 1 deletion lib/tasks/datacite_doi.rake
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,31 @@ namespace :datacite_doi do
puts DataciteDoi.delete_alias
end

desc "Index all datacite DOIs grouped by Client"
task index_all_by_client: :environment do
  # Allow INDEX and BATCH_SIZE to be overridden from the environment;
  # fall back to the inactive index and a batch of 2000.
  target_index = ENV["INDEX"] || DataciteDoi.inactive_index
  per_batch = ENV["BATCH_SIZE"].nil? ? 2000 : ENV["BATCH_SIZE"].to_i
  DataciteDoi.index_all_by_client(index: target_index, batch_size: per_batch)
end

desc "Import all datacite DOIs for a given Client(id)"
task import_by_client: :environment do
  # CLIENT_ID is mandatory; INDEX and BATCH_SIZE are optional overrides.
  if ENV["CLIENT_ID"].nil?
    puts "ENV variable CLIENT_ID is required"
    exit
  end
  import_index = ENV["INDEX"] || DataciteDoi.inactive_index
  batch_size = ENV["BATCH_SIZE"].nil? ? 2000 : ENV["BATCH_SIZE"].to_i
  # import_by_client takes the client id as a POSITIONAL argument plus an
  # options hash (def self.import_by_client(client_id, options = {})).
  # Passing client_id: as a keyword would collapse everything into one hash
  # bound to client_id, leaving options empty. The options hash is read via
  # options[:index], so the key must be :index, not :import_index.
  DataciteDoi.import_by_client(
    ENV["CLIENT_ID"],
    index: import_index,
    batch_size: batch_size,
  )
end

desc "Import all datacite DOIs"
task import: :environment do
from_id = (ENV["FROM_ID"] || DataciteDoi.minimum(:id)).to_i
Expand Down Expand Up @@ -91,7 +116,6 @@ namespace :datacite_doi do
puts "ENV['DOI'] is required"
exit
end

puts DataciteDoi.index_one(doi_id: ENV["DOI"])
end
end
8 changes: 0 additions & 8 deletions spec/models/datacite_doi_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,6 @@
describe "import_by_ids", elasticsearch: true do
let(:provider) { create(:provider) }
let(:client) { create(:client, provider: provider) }
let(:target) do
create(
:client,
provider: provider,
symbol: provider.symbol + ".TARGET",
name: "Target Client",
)
end
let!(:dois) do
create_list(
:doi,
Expand Down

0 comments on commit a555b30

Please sign in to comment.