Skip to content

Commit

Permalink
Merge pull request #791 from datacite/reference-repository
Browse files Browse the repository at this point in the history
Combined index for Client and Re3Data Repositories
  • Loading branch information
jrhoads authored Mar 16, 2022
2 parents bfda30b + 27fda49 commit d8eb93d
Show file tree
Hide file tree
Showing 8 changed files with 341 additions and 2 deletions.
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,5 @@ group :test do
gem "vcr", "~> 5.1"
gem "webmock", "~> 3.1"
end

gem "hashid-rails", "~> 1.4"
7 changes: 6 additions & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,10 @@ GEM
hamster (3.0.0)
concurrent-ruby (~> 1.0)
hashdiff (1.0.1)
hashid-rails (1.4.1)
activerecord (>= 4.0)
hashids (~> 1.0)
hashids (1.0.6)
hashie (4.1.0)
htmlentities (4.3.4)
http-accept (1.7.0)
Expand Down Expand Up @@ -669,6 +673,7 @@ DEPENDENCIES
graphql-cache (~> 0.6.0)
graphql-errors (~> 0.4.0)
hashdiff (>= 1.0.0.beta1, < 2.0.0)
hashid-rails
iso-639 (~> 0.3.5)
iso8601 (~> 0.9.0)
jsonlint (~> 0.3.0)
Expand Down Expand Up @@ -722,4 +727,4 @@ DEPENDENCIES
webmock (~> 3.1)

BUNDLED WITH
2.2.30
2.2.33
66 changes: 66 additions & 0 deletions app/models/reference_repository.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# frozen_string_literal: true

# Record that ties together a Client (looked up by +client_id+) and/or a
# re3data repository (looked up by +re3doi+) and indexes the combined view in
# Elasticsearch via ReferenceRepositoryDenormalizer.
class ReferenceRepository < ApplicationRecord
  include Indexable
  include Elasticsearch::Model
  include Elasticsearch::Model::Callbacks
  include Hashid::Rails

  before_save :force_index

  validates_uniqueness_of :re3doi, allow_nil: true

  # Returns the first non-deleted Client whose symbol equals +client_id+,
  # or nil when there is none.
  def self.find_client(client_id)
    ::Client.where(symbol: client_id, deleted_at: nil).first
  end

  # Returns the first entry under +:data+ of DataCatalog.find_by_id(doi),
  # or nil when that list is missing or empty.
  def self.find_re3(doi)
    DataCatalog.find_by_id(doi).fetch(:data, []).first
  end

  # Client belonging to this record. The cached client is reused only while
  # its symbol still matches the current +client_id+ attribute.
  def client_repo
    if @dsclient&.symbol == self[:client_id]
      @dsclient
    else
      @dsclient = ReferenceRepository.find_client(self[:client_id])
    end
  end

  # re3data record belonging to this record.
  #
  # The result is cached per +re3doi+ value, including a nil result. A plain
  # `||=` would repeat the DataCatalog lookup on every call whenever nothing
  # was found, and would keep returning a stale record after +re3doi+ changed.
  def re3_repo
    doi = self[:re3doi]
    return @re3repo if defined?(@re3repo_doi) && @re3repo_doi == doi

    # Look up first so a raised error leaves the previous cache untouched.
    repo = ReferenceRepository.find_re3(doi)
    @re3repo_doi = doi
    @re3repo = repo
  end

  # Document sent to Elasticsearch for this record.
  def as_indexed_json(_options = {})
    ReferenceRepositoryDenormalizer.new(self).to_hash
  end

  # Explicit index mapping. NOTE(review): +data_upload_licenses+ is mapped
  # here but ReferenceRepositoryDenormalizer#to_hash does not emit it.
  settings index: { number_of_shards: 1 } do
    mapping dynamic: "false" do
      indexes :id
      indexes :client_id
      indexes :re3doi
      indexes :re3data_url
      indexes :created_at, type: :date, format: :date_optional_time
      indexes :updated_at, type: :date, format: :date_optional_time
      indexes :name
      indexes :description
      indexes :pid_system, type: :keyword
      indexes :url
      indexes :keyword, type: :keyword
      indexes :subject
      indexes :contact
      indexes :language, type: :keyword
      indexes :certificate, type: :keyword
      indexes :data_access, type: :keyword
      indexes :data_upload, type: :keyword
      indexes :provider_type, type: :keyword
      indexes :repository_type, type: :keyword
      indexes :data_upload_licenses, type: :keyword
      indexes :software, type: :keyword
    end
  end

  # before_save hook: resets the +@__changed_model_attributes+ instance
  # variable on the Elasticsearch proxy to nil.
  # NOTE(review): presumably this makes the indexing callbacks send the whole
  # document instead of a partial update - confirm against elasticsearch-model.
  def force_index
    __elasticsearch__.instance_variable_set(:@__changed_model_attributes, nil)
  end
end
129 changes: 129 additions & 0 deletions app/models/reference_repository_denormalizer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# frozen_string_literal: true

# Flattens a repository record into a plain hash of index fields, combining
# values read from its client record (+client_repo+) and its re3data record
# (+re3_repo+). Either of those may be nil.
class ReferenceRepositoryDenormalizer
  attr_reader :repository

  # @param repository the record to denormalize; it must respond to
  #   hashid, client_id, re3doi, created_at, updated_at, client_repo and
  #   re3_repo.
  def initialize(repository)
    @repository = repository
  end

  # @return [String, nil] the lower-cased re3doi as a https://doi.org/ URL,
  #   or nil when re3doi is blank
  def doi_as_url
    doi = repository.re3doi
    "https://doi.org/#{doi.downcase}" unless doi.blank?
  end

  # @return [Hash{String => Object}] one entry per field listed below, in
  #   this order, each value produced by the method of the same name
  def to_hash
    field_names = %w[
      id
      client_id
      re3doi
      re3data_url
      created_at
      updated_at
      name
      description
      pid_system
      url
      keyword
      contact
      software
      language
      certificate
      data_access
      data_upload
      provider_type
      repository_type
      subject
    ]
    field_names.each_with_object({}) do |field, document|
      document[field] = send(field)
    end
  end

  # --- values copied straight from the repository record ---

  def id
    repository.hashid
  end

  def client_id
    repository.client_id
  end

  def re3doi
    repository.re3doi
  end

  def created_at
    repository.created_at
  end

  def updated_at
    repository.updated_at
  end

  # --- scalar values: the client record wins, re3data is the fallback ---

  def name
    from_client = repository.client_repo&.name
    from_client || repository.re3_repo&.name
  end

  def description
    from_client = repository.client_repo&.description
    from_client || repository.re3_repo&.description
  end

  def url
    from_client = repository.client_repo&.url
    from_client || repository.re3_repo&.url
  end

  def re3data_url
    doi_as_url
  end

  # --- list values ---

  # re3data PID systems, plus "DOI" whenever the record has a client_id.
  def pid_system
    systems = Array.wrap(repository.re3_repo&.pid_systems).map(&:text)
    systems += ["DOI"] unless repository.client_id.nil?
    systems.uniq
  end

  def keyword
    Array.wrap(repository.re3_repo&.keywords).map(&:text).uniq
  end

  def contact
    Array.wrap(repository.re3_repo&.contacts).map(&:text).uniq
  end

  # re3data languages followed by the client's language(s), de-duplicated.
  def language
    from_re3 = Array.wrap(repository.re3_repo&.repository_languages).map(&:text)
    from_client = Array.wrap(repository.client_repo&.language)
    (from_re3 + from_client).uniq
  end

  # re3data certificates followed by the client's certificate(s),
  # de-duplicated.
  def certificate
    from_re3 = Array.wrap(repository.re3_repo&.certificates).map(&:text)
    from_client = Array.wrap(repository.client_repo&.certificate)
    (from_re3 + from_client).uniq
  end

  def software
    Array.wrap(repository.re3_repo&.software).map(&:name).uniq
  end

  # The remaining lists are returned as-is (duplicates are kept).

  def data_access
    Array.wrap(repository.re3_repo&.data_accesses).map(&:type)
  end

  def data_upload
    Array.wrap(repository.re3_repo&.data_uploads).map(&:type)
  end

  def provider_type
    Array.wrap(repository.re3_repo&.provider_type).map(&:text)
  end

  def repository_type
    Array.wrap(repository.re3_repo&.types).map(&:text)
  end

  def subject
    Array.wrap(repository.re3_repo&.subjects).map(&:text)
  end
end
12 changes: 12 additions & 0 deletions db/migrate/20220218154500_create_reference_repositories.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# frozen_string_literal: true

# Creates the reference_repositories table: two nullable string columns
# (client_id, re3doi) followed by the created_at/updated_at timestamps.
class CreateReferenceRepositories < ActiveRecord::Migration[5.2]
  def change
    create_table(:reference_repositories) do |table|
      # Both lookup keys are optional; a row may carry either one or both.
      table.string :client_id, :re3doi, null: true

      table.timestamps
    end
  end
end
9 changes: 8 additions & 1 deletion db/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 2022_02_17_020855) do
ActiveRecord::Schema.define(version: 2022_02_18_154500) do
create_table "active_storage_attachments", options: "ENGINE=InnoDB DEFAULT CHARSET=latin1", force: :cascade do |t|
t.string "name", limit: 191, null: false
t.string "record_type", null: false
Expand Down Expand Up @@ -301,4 +301,11 @@
t.index ["provider_id"], name: "FKE7FBD67446EBD781"
t.index ["uid"], name: "index_provider_prefixes_on_uid", length: 128
end

create_table "reference_repositories", options: "ENGINE=InnoDB DEFAULT CHARSET=latin1", force: :cascade do |t|
t.string "client_id"
t.string "re3doi"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
end
end
111 changes: 111 additions & 0 deletions lib/tasks/repository.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# frozen_string_literal: true

# Rake tasks that populate the reference_repositories table and manage its
# Elasticsearch index. Tasks that take input read it from ENV (INDEX, ALIAS,
# QUERY) or, for load_re3data_repos, from the +pages+ task argument.
namespace :repository do
  desc "Load all Clients into Reference Repositories"
  task load_client_repos: :environment do
    puts "Processing Client Repositories"
    progressbar = ProgressBar.create(
      format: "%a %e %P% Processed: %c from %C %t",
      title: "Client Repositories",
      total: Client.count
    )
    # find_each walks the table in batches instead of loading every Client
    # into memory at once.
    Client.find_each do |client|
      progressbar.increment
      # NOTE(review): find_or_create_by does not raise when validation fails.
      # If a row with the same re3doi but a different client_id already
      # exists, the uniqueness validation on re3doi rejects the new row and
      # this client is skipped without any message - confirm that is intended.
      ReferenceRepository.find_or_create_by(
        client_id: client.symbol,
        re3doi: client.re3data_id
      )
    end
  end

  desc "Load all Re3data Repositories into Reference Repositories"
  task :load_re3data_repos, [:pages] => :environment do |_t, args|
    pages = (args[:pages] || 3).to_i
    re3repos = []
    (1..pages).each do |page|
      puts "Fetching Re3Data Repositories: Fetch Group #{page}"
      # NOTE(review): the loop counter is passed as +offset+; confirm that
      # DataCatalog.query treats it as a page number, not a record offset.
      re3repos += DataCatalog.query("", limit: 1000, offset: page).fetch(:data, [])
    end
    re3repos.uniq!
    puts "Processing Re3Data Repositories"
    progressbar = ProgressBar.create(
      format: "%a %e %P% Processed: %c from %C %t",
      title: "Re3data Repositories",
      total: re3repos.length
    )
    re3repos.each do |repo|
      progressbar.increment
      # Store the bare DOI, without the resolver prefix.
      doi = repo.id&.gsub("https://doi.org/", "")
      next if doi.blank?

      ReferenceRepository.find_or_create_by(re3doi: doi)
    end
  end

  desc "Create index for reference_repositories"
  task create_index: :environment do
    puts ReferenceRepository.create_index
  end

  desc "Delete index for reference_repositories"
  task delete_index: :environment do
    puts ReferenceRepository.delete_index(index: ENV["INDEX"])
  end

  desc "Upgrade index for reference_repositories"
  task upgrade_index: :environment do
    puts ReferenceRepository.upgrade_index
  end

  desc "Show index stats for reference_repositories"
  task index_stats: :environment do
    puts ReferenceRepository.index_stats
  end

  desc "Switch index for reference_repositories"
  task switch_index: :environment do
    puts ReferenceRepository.switch_index
  end

  desc "Return active index for reference_repositories"
  task active_index: :environment do
    puts ReferenceRepository.active_index + " is the active index."
  end

  desc "Monitor reindexing for reference_repositories"
  task monitor_reindex: :environment do
    puts ReferenceRepository.monitor_reindex
  end

  desc "Create alias for reference_repositories"
  task create_alias: :environment do
    puts ReferenceRepository.create_alias(index: ENV["INDEX"], alias: ENV["ALIAS"])
  end

  desc "List aliases for reference_repositories"
  task list_aliases: :environment do
    puts ReferenceRepository.list_aliases
  end

  desc "Delete alias for reference_repositories"
  task delete_alias: :environment do
    puts ReferenceRepository.delete_alias(index: ENV["INDEX"], alias: ENV["ALIAS"])
  end

  desc "Import all reference_repositories"
  task import: :environment do
    # Imports into the inactive index, not the one currently being served.
    ReferenceRepository.import(index: ReferenceRepository.inactive_index)
  end

  desc "Delete from index by query"
  task delete_by_query: :environment do
    # Refuse to run without an explicit query.
    if ENV["QUERY"].nil?
      puts "ENV['QUERY'] is required"
      exit
    end

    puts ReferenceRepository.delete_by_query(index: ENV["INDEX"], query: ENV["QUERY"])
  end
end
7 changes: 7 additions & 0 deletions spec/models/reference_repository_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# frozen_string_literal: true

require "rails_helper"

# Placeholder model spec: it defines no examples yet, only a pending reminder
# to add some (or to delete this file).
RSpec.describe ReferenceRepository, type: :model do
pending "add some examples to (or delete) #{__FILE__}"
end

0 comments on commit d8eb93d

Please sign in to comment.