# NOTE(review): this file is a whitespace-mangled git diff; the substantive
# change it carries is the new Doi.query class method, reconstructed below as
# properly formatted Ruby. The patch's other hunks are (a) a deletion-only
# cleanup removing ad-hoc LogStashLogger debug output from
# ProvidersController#show and (b) adding `require 'benchmark'` to
# app/models/doi.rb; the Doi.import_one definition at the end of the patch is
# truncated and is not reproduced here.

# Searches the DOI Elasticsearch index.
#
# @param query [String, nil] free-text query string; camelCase field names are
#   normalized to the underscored index fields and "/" is escaped so the
#   query_string query does not treat it as a regex delimiter.
# @param options [Hash] filters and paging, e.g. :provider_id, :client_id,
#   :prefix, :created, :registered, :state, :schema_version, :subject,
#   :affiliation_id, :consortium_id, :random, :sample_group, :unique,
#   :scroll_id and :page => { :number / :size / :cursor / :scroll }.
# @return [Hashie::Mash, Object] a Mash with :total/:results/:scroll_id for
#   scroll requests; otherwise the raw __elasticsearch__.search response.
def self.query(query, options={})
  logger = LogStashLogger.new(type: :stdout)

  # Support the scroll API: when the caller already holds a scroll_id,
  # continue that scroll instead of issuing a new search.
  # (The .map over hits is a small performance hit.)
  if options[:scroll_id].present? && options.dig(:page, :scroll)
    begin
      response = __elasticsearch__.client.scroll(body: {
        scroll_id: options[:scroll_id],
        scroll: options.dig(:page, :scroll)
      })
      return Hashie::Mash.new(
        total: response.dig("hits", "total", "value"),
        results: response.dig("hits", "hits").map { |r| r["_source"] },
        scroll_id: response["_scroll_id"]
      )
    # Handle an expired scroll_id (Elasticsearch returns 404 for it).
    rescue Elasticsearch::Transport::Transport::Errors::NotFound
      return Hashie::Mash.new(total: 0, results: [], scroll_id: nil)
    end
  end

  aggregations = get_aggregations_hash(options)

  options[:page] ||= {}
  options[:page][:number] ||= 1
  options[:page][:size] ||= 25

  if options.dig(:page, :cursor)
    # Cursor navigation uses search_after, which must always be an array of
    # values matching the sort below; fall back to a minimal valid cursor.
    from = 0
    search_after = options.dig(:page, :cursor).presence || [1, "1"]
    sort = [{ created: "asc", uid: "asc" }]
  else
    from = ((options.dig(:page, :number) || 1) - 1) * (options.dig(:page, :size) || 25)
    search_after = nil
    sort = options[:sort]
  end

  # Make sure field names use underscores, and escape forward slashes.
  if query.present?
    query = query.gsub(/publicationYear/, "publication_year")
    query = query.gsub(/relatedIdentifiers/, "related_identifiers")
    query = query.gsub(/rightsList/, "rights_list")
    query = query.gsub(/fundingReferences/, "funding_references")
    query = query.gsub(/geoLocations/, "geo_locations")
    query = query.gsub(/landingPage/, "landing_page")
    query = query.gsub(/contentUrl/, "content_url")
    query = query.gsub("/", '\/')
  end

  must = []
  must_not = []

  must << { query_string: { query: query, fields: query_fields }} if query.present?
  must << { term: { "types.resourceTypeGeneral": options[:resource_type_id].underscore.camelize }} if options[:resource_type_id].present?
  must << { terms: { provider_id: options[:provider_id].split(",") }} if options[:provider_id].present?
  must << { terms: { client_id: options[:client_id].to_s.split(",") }} if options[:client_id].present?
  must << { terms: { prefix: options[:prefix].to_s.split(",") }} if options[:prefix].present?
  must << { term: { uid: options[:uid] }} if options[:uid].present?
  must << { range: { created: { gte: "#{options[:created].split(",").min}||/y", lte: "#{options[:created].split(",").max}||/y", format: "yyyy" }}} if options[:created].present?
  must << { term: { schema_version: "http://datacite.org/schema/kernel-#{options[:schema_version]}" }} if options[:schema_version].present?
  must << { terms: { "subjects.subject": options[:subject].split(",") }} if options[:subject].present?
  must << { term: { source: options[:source] }} if options[:source].present?
  must << { term: { "landing_page.status": options[:link_check_status] }} if options[:link_check_status].present?
  must << { exists: { field: "landing_page.checked" }} if options[:link_checked].present?
  must << { term: { "landing_page.hasSchemaOrg": options[:link_check_has_schema_org] }} if options[:link_check_has_schema_org].present?
  must << { term: { "landing_page.bodyHasPid": options[:link_check_body_has_pid] }} if options[:link_check_body_has_pid].present?
  must << { exists: { field: "landing_page.schemaOrgId" }} if options[:link_check_found_schema_org_id].present?
  must << { exists: { field: "landing_page.dcIdentifier" }} if options[:link_check_found_dc_identifier].present?
  must << { exists: { field: "landing_page.citationDoi" }} if options[:link_check_found_citation_doi].present?
  must << { range: { "landing_page.redirectCount": { "gte": options[:link_check_redirect_count_gte] } } } if options[:link_check_redirect_count_gte].present?
  must << { terms: { aasm_state: options[:state].to_s.split(",") }} if options[:state].present?
  must << { range: { registered: { gte: "#{options[:registered].split(",").min}||/y", lte: "#{options[:registered].split(",").max}||/y", format: "yyyy" }}} if options[:registered].present?
  must << { term: { "creators.nameIdentifiers.nameIdentifier" => "https://orcid.org/#{options[:user_id]}" }} if options[:user_id].present?
  # FIX: URI.decode (alias of URI.unescape) was deprecated and removed in
  # Ruby 3.0. URI::DEFAULT_PARSER.unescape is the drop-in stdlib equivalent
  # (unlike CGI.unescape it does not turn "+" into a space).
  must << { term: { "creators.affiliation.affiliationIdentifier" => URI::DEFAULT_PARSER.unescape(options[:affiliation_id]) }} if options[:affiliation_id].present?
  must << { term: { consortium_id: options[:consortium_id] }} if options[:consortium_id].present?
  must << { term: { "client.re3data_id" => options[:re3data_id].gsub("/", '\/').upcase }} if options[:re3data_id].present?
  must << { term: { "client.opendoar_id" => options[:opendoar_id] }} if options[:opendoar_id].present?
  must << { terms: { "client.certificate" => options[:certificate].split(",") }} if options[:certificate].present?
  must_not << { terms: { provider_id: ["crossref", "medra", "op"] }} if options[:exclude_registration_agencies]

  # The ES query can be optionally defined in different ways, so it is built
  # differently based on options. This is mostly useful when wrapping it in a
  # function_score query.
  es_query = {}

  # The main bool query with filters.
  bool_query = {
    must: must,
    must_not: must_not
  }

  # function_score provides a varying score to return different values.
  # It wraps the bool query above and applies additional scoring.
  # Note this can be performance intensive.
  function_score = {
    query: {
      bool: bool_query
    },
    random_score: {
      # Fixed seed in test so results are reproducible.
      "seed": Rails.env.test? ? "random_1234" : "random_#{rand(1...100000)}"
    }
  }

  if options[:random].present?
    es_query['function_score'] = function_score
    # Don't do any sorting for random results.
    sort = nil
  else
    es_query['bool'] = bool_query
  end

  # Sample grouping is an optionally included aggregation.
  if options[:sample_group].present?
    aggregations[:samples] = {
      terms: {
        field: options[:sample_group],
        size: 10000
      },
      aggs: {
        "samples_hits": {
          top_hits: {
            size: options[:sample_size].present? ? options[:sample_size] : 1
          }
        }
      }
    }
  end

  # Collapse the results list by unique citations.
  unique = options[:unique].blank? ? nil : {
    field: "citation_id",
    inner_hits: {
      name: "first_unique_event",
      size: 1
    },
    "max_concurrent_group_searches": 1
  }

  # Three options for paging through results: scroll, cursor and pagination
  # (the default). Scroll is triggered by page[scroll], cursor by
  # page[cursor]. The search wrapper can't be used for the scroll API; the
  # .map over scroll hits is a small performance hit.
  response = nil
  bm = Benchmark.ms {
    if options.dig(:page, :scroll).present?
      response = __elasticsearch__.client.search(
        index: self.index_name,
        scroll: options.dig(:page, :scroll),
        body: {
          size: options.dig(:page, :size),
          sort: sort,
          query: es_query,
          collapse: unique,
          aggregations: aggregations,
          track_total_hits: true
        }.compact)
      response = Hashie::Mash.new(
        total: response.dig("hits", "total", "value"),
        results: response.dig("hits", "hits").map { |r| r["_source"] },
        scroll_id: response["_scroll_id"]
      )
    elsif options.dig(:page, :cursor).present?
      response = __elasticsearch__.search({
        size: options.dig(:page, :size),
        search_after: search_after,
        sort: sort,
        query: es_query,
        collapse: unique,
        aggregations: aggregations,
        track_total_hits: true
      }.compact)
    else
      # FIX: added the missing space in `response =__elasticsearch__`.
      response = __elasticsearch__.search({
        size: options.dig(:page, :size),
        from: from,
        sort: sort,
        query: es_query,
        collapse: unique,
        aggregations: aggregations,
        track_total_hits: true
      }.compact)
    end
  }
  logger.warn method: "GET", path: "/works", message: "Query /works", duration: bm
  response
end