From d6917d251702a59b432bca1d6c0656a76574f499 Mon Sep 17 00:00:00 2001 From: jrhoads Date: Fri, 11 Aug 2023 10:16:18 -0400 Subject: [PATCH 1/6] Remove print statement --- spec/graphql/types/work_type_spec.rb | 2 -- 1 file changed, 2 deletions(-) diff --git a/spec/graphql/types/work_type_spec.rb b/spec/graphql/types/work_type_spec.rb index 0722b37e8..7032469e3 100644 --- a/spec/graphql/types/work_type_spec.rb +++ b/spec/graphql/types/work_type_spec.rb @@ -1881,8 +1881,6 @@ it "returns the correct counts for funders" do response = LupoSchema.execute(query).as_json - pp(response) - expect(response.dig("data", "works", "funders").length()).to eq(1) end end From cc66e5fc554b2b7fd78892af268c85cd6552bb0f Mon Sep 17 00:00:00 2001 From: jrhoads Date: Fri, 11 Aug 2023 14:21:50 -0400 Subject: [PATCH 2/6] Refactor Doi.gql_query into a module using the builder pattern --- app/models/concerns/modelable.rb | 6 + app/models/doi.rb | 278 +--------------------------- app/models/doi/graphql_query.rb | 307 +++++++++++++++++++++++++++++++ 3 files changed, 317 insertions(+), 274 deletions(-) create mode 100644 app/models/doi/graphql_query.rb diff --git a/app/models/concerns/modelable.rb b/app/models/concerns/modelable.rb index 5bdbd774d..2f8d2b763 100644 --- a/app/models/concerns/modelable.rb +++ b/app/models/concerns/modelable.rb @@ -6,6 +6,7 @@ module Modelable delegate :doi_from_url, to: :class delegate :orcid_as_url, to: :class delegate :orcid_from_url, to: :class + delegate :ror_from_url, to: :class module ClassMethods def doi_from_url(url) @@ -28,5 +29,10 @@ def orcid_from_url(url) uri.path.gsub(%r{^/}, "").upcase end end + + def ror_from_url(url) + ror = Array(%r{\A(?:(http|https)://)?(ror\.org/)?(.+)}.match(url)).last + "ror.org/#{ror}" if ror.present? + end end end diff --git a/app/models/doi.rb b/app/models/doi.rb index c89de83c9..3634cb067 100644 --- a/app/models/doi.rb +++ b/app/models/doi.rb @@ -641,107 +641,6 @@ def as_indexed_json(_options = {}) } end - def self.gql_query_aggregations(facet_count: 10) - if facet_count.positive? - { - resource_types: { terms: { field: "resource_type_id_and_name", size: facet_count, min_doc_count: 1, missing: "__missing__" } }, - clients: { terms: { field: "client_id_and_name", size: facet_count, min_doc_count: 1 } }, - open_licenses: { - filter: { terms: { "rights_list.rightsIdentifier": ["cc-by-1.0", "cc-by-2.0", "cc-by-2.5", "cc-by-3.0", "cc-by-3.0-at", "cc-by-3.0-us", "cc-by-4.0", "cc-pddc", "cc0-1.0", "cc-pdm-1.0"] } }, - aggs: { - resource_types: { - terms: { field: "resource_type_id_and_name", size: facet_count, min_doc_count: 1 } - } - } - }, - published: { - date_histogram: { - field: "publication_year", - interval: "year", - format: "year", - order: { - _key: "desc", - }, - min_doc_count: 1, - }, - }, - registration_agencies: { terms: { field: "agency", size: facet_count, min_doc_count: 1 } }, - affiliations: { terms: { field: "affiliation_id_and_name", size: facet_count, min_doc_count: 1, missing: "__missing__" } }, - authors: { - terms: { field: "creators.nameIdentifiers.nameIdentifier", size: facet_count, min_doc_count: 1, include: "https?://orcid.org/.*" }, - aggs: { - authors: { - top_hits: { - _source: { - includes: [ "creators.name", "creators.nameIdentifiers.nameIdentifier"] - }, - size: 1 - } - } - } - }, - creators_and_contributors: { - terms: { field: "creators_and_contributors.nameIdentifiers.nameIdentifier", size: facet_count, min_doc_count: 1, include: "https?://orcid.org/.*" }, - aggs: { - creators_and_contributors: { - top_hits: { - _source: { - includes: [ "creators_and_contributors.name", "creators_and_contributors.nameIdentifiers.nameIdentifier"] - }, - size: 1 - } - } - } - }, - funders: { - terms: { - field: "funding_references.funderIdentifier", - size: facet_count, - min_doc_count: 1 - }, - aggs: { - funders: { - top_hits: { - _source: { - includes: [ - "funding_references.funderName", - "funding_references.funderIdentifier" - ] - }, - size: 1 - } - } - } - }, - pid_entities: { - filter: { term: { "subjects.subjectScheme": "PidEntity" } }, - aggs: { - subject: { terms: { field: "subjects.subject", size: facet_count, min_doc_count: 1, - include: %w(Dataset Publication Software Organization Funder Person Grant Sample Instrument Repository Project) } }, - }, - }, - fields_of_science: { - filter: { term: { "subjects.subjectScheme": "Fields of Science and Technology (FOS)" } }, - aggs: { - subject: { terms: { field: "subjects.subject", size: facet_count, min_doc_count: 1, - include: "FOS:.*" } }, - }, - }, - fields_of_science_combined: { - terms: { field: "fields_of_science_combined", size: facet_count, min_doc_count: 1 } - }, - fields_of_science_repository: { - terms: { field: "fields_of_science_repository", size: facet_count, min_doc_count: 1 } - }, - licenses: { terms: { field: "rights_list.rightsIdentifier", size: facet_count, min_doc_count: 1, missing: "__missing__" } }, - languages: { terms: { field: "language", size: facet_count, min_doc_count: 1 } }, - view_count: { sum: { field: "view_count" } }, - download_count: { sum: { field: "download_count" } }, - citation_count: { sum: { field: "citation_count" } }, - content_url_count: { value_count: { field: "content_url" } }, - } - end - end def self.query_aggregations(disable_facets: false) if !disable_facets @@ -928,179 +827,10 @@ def self.stats_query(options = {}) # query for graphql, removing options that are not needed def self.gql_query(query, options = {}) - options[:page] ||= {} - options[:facet_count] = (options[:facet_count] || 10).to_i - aggregations = gql_query_aggregations(facet_count: options[:facet_count]) - - # cursor nav uses search_after, this should always be an array of values that match the sort. - # make sure we have a valid cursor - cursor = [0, ""] - if options.dig(:page, :cursor).is_a?(Array) - timestamp, uid = options.dig(:page, :cursor) - cursor = [timestamp.to_i, uid.to_s] - elsif options.dig(:page, :cursor).is_a?(String) - timestamp, uid = options.dig(:page, :cursor).split(",") - cursor = [timestamp.to_i, uid.to_s] - end - - # from = 0 - search_after = cursor - sort = [{ created: "asc", uid: "asc" }] - - # make sure field name uses underscore - # escape forward slash, but not other Elasticsearch special characters - if query.present? - query = query.gsub(/publicationYear/, "publication_year") - query = query.gsub(/relatedIdentifiers/, "related_identifiers") - query = query.gsub(/relatedItems/, "related_items") - query = query.gsub(/rightsList/, "rights_list") - query = query.gsub(/fundingReferences/, "funding_references") - query = query.gsub(/geoLocations/, "geo_locations") - query = query.gsub(/landingPage/, "landing_page") - query = query.gsub(/contentUrl/, "content_url") - query = query.gsub(/citationCount/, "citation_count") - query = query.gsub(/viewCount/, "view_count") - query = query.gsub(/downloadCount/, "download_count") - query = query.gsub("/", "\\/") - end - - # turn ids into an array if provided as comma-separated string - options[:ids] = options[:ids].split(",") if options[:ids].is_a?(String) - - if query.present? - must = [{ query_string: { query: query, fields: query_fields, default_operator: "AND", phrase_slop: 1 } }] - else - must = [{ match_all: {} }] - end - - filter = [] - should = [] - minimum_should_match = 0 - - filter << { terms: { doi: options[:ids].map(&:upcase) } } if options[:ids].present? - filter << { term: { "types.resourceTypeGeneral": options[:resource_type_id].underscore.camelize } } if options[:resource_type_id].present? - filter << { terms: { "types.resourceType": options[:resource_type].split(",") } } if options[:resource_type].present? - filter << { terms: { provider_id: options[:provider_id].split(",") } } if options[:provider_id].present? - filter << { terms: { client_id: options[:client_id].to_s.split(",") } } if options[:client_id].present? - filter << { terms: { agency: options[:agency].split(",").map(&:downcase) } } if options[:agency].present? - filter << { terms: { prefix: options[:prefix].to_s.split(",") } } if options[:prefix].present? - filter << { terms: { language: options[:language].to_s.split(",").map(&:downcase) } } if options[:language].present? - filter << { term: { uid: options[:uid] } } if options[:uid].present? - filter << { range: { created: { gte: "#{options[:created].split(',').min}||/y", lte: "#{options[:created].split(',').max}||/y", format: "yyyy" } } } if options[:created].present? - filter << { range: { publication_year: { gte: "#{options[:published].split(',').min}||/y", lte: "#{options[:published].split(',').max}||/y", format: "yyyy" } } } if options[:published].present? - filter << { term: { schema_version: "http://datacite.org/schema/kernel-#{options[:schema_version]}" } } if options[:schema_version].present? - filter << { terms: { "subjects.subject": options[:subject].split(",") } } if options[:subject].present? - if options[:pid_entity].present? - filter << { term: { "subjects.subjectScheme": "PidEntity" } } - filter << { terms: { "subjects.subject": options[:pid_entity].split(",").map(&:humanize) } } - end - if options[:field_of_science].present? - filter << { term: { "subjects.subjectScheme": "Fields of Science and Technology (FOS)" } } - filter << { terms: { "subjects.subject": options[:field_of_science].split(",").map { |s| "FOS: " + s.humanize } } } - end - if options[:field_of_science_repository].present? - filter << { terms: { "fields_of_science_repository": options[:field_of_science_repository].split(",").map { |s| s.humanize } } } - end - if options[:field_of_science_combined].present? - filter << { terms: { "fields_of_science_combined": options[:field_of_science_combined].split(",").map { |s| s.humanize } } } - end - filter << { terms: { "rights_list.rightsIdentifier" => options[:license].split(",") } } if options[:license].present? - filter << { term: { source: options[:source] } } if options[:source].present? - filter << { range: { reference_count: { "gte": options[:has_references].to_i } } } if options[:has_references].present? - filter << { range: { citation_count: { "gte": options[:has_citations].to_i } } } if options[:has_citations].present? - filter << { range: { part_count: { "gte": options[:has_parts].to_i } } } if options[:has_parts].present? - filter << { range: { part_of_count: { "gte": options[:has_part_of].to_i } } } if options[:has_part_of].present? - filter << { range: { version_count: { "gte": options[:has_versions].to_i } } } if options[:has_versions].present? - filter << { range: { version_of_count: { "gte": options[:has_version_of].to_i } } } if options[:has_version_of].present? - filter << { range: { view_count: { "gte": options[:has_views].to_i } } } if options[:has_views].present? - filter << { range: { download_count: { "gte": options[:has_downloads].to_i } } } if options[:has_downloads].present? - filter << { term: { "landing_page.status": options[:link_check_status] } } if options[:link_check_status].present? - filter << { exists: { field: "landing_page.checked" } } if options[:link_checked].present? - filter << { term: { "landing_page.hasSchemaOrg": options[:link_check_has_schema_org] } } if options[:link_check_has_schema_org].present? - filter << { term: { "landing_page.bodyHasPid": options[:link_check_body_has_pid] } } if options[:link_check_body_has_pid].present? - filter << { exists: { field: "landing_page.schemaOrgId" } } if options[:link_check_found_schema_org_id].present? - filter << { exists: { field: "landing_page.dcIdentifier" } } if options[:link_check_found_dc_identifier].present? - filter << { exists: { field: "landing_page.citationDoi" } } if options[:link_check_found_citation_doi].present? - filter << { range: { "landing_page.redirectCount": { "gte": options[:link_check_redirect_count_gte] } } } if options[:link_check_redirect_count_gte].present? - filter << { terms: { aasm_state: options[:state].to_s.split(",") } } if options[:state].present? - filter << { range: { registered: { gte: "#{options[:registered].split(',').min}||/y", lte: "#{options[:registered].split(',').max}||/y", format: "yyyy" } } } if options[:registered].present? - filter << { term: { consortium_id: options[:consortium_id] } } if options[:consortium_id].present? - # TODO align PID parsing - filter << { term: { "client.re3data_id" => doi_from_url(options[:re3data_id]) } } if options[:re3data_id].present? - filter << { term: { "client.opendoar_id" => options[:opendoar_id] } } if options[:opendoar_id].present? - filter << { terms: { "client.certificate" => options[:certificate].split(",") } } if options[:certificate].present? - filter << { terms: { "creators.nameIdentifiers.nameIdentifier" => options[:user_id].split(",").collect { |id| "https://orcid.org/#{orcid_from_url(id)}" } } } if options[:user_id].present? - filter << { term: { "creators.nameIdentifiers.nameIdentifierScheme" => "ORCID" } } if options[:has_person].present? - - # match either one of has_affiliation, has_organization, has_funder or has_member - if options[:has_organization].present? - should << { term: { "creators.nameIdentifiers.nameIdentifierScheme" => "ROR" } } - should << { term: { "contributors.nameIdentifiers.nameIdentifierScheme" => "ROR" } } - minimum_should_match = 1 - end - if options[:has_affiliation].present? - should << { term: { "creators.affiliation.affiliationIdentifierScheme" => "ROR" } } - should << { term: { "contributors.affiliation.affiliationIdentifierScheme" => "ROR" } } - minimum_should_match = 1 - end - if options[:has_funder].present? - should << { term: { "funding_references.funderIdentifierType" => "Crossref Funder ID" } } - minimum_should_match = 1 - end - if options[:has_member].present? - should << { exists: { field: "provider.ror_id" } } - minimum_should_match = 1 - end - - # match either ROR ID or Crossref Funder ID if either organization_id, affiliation_id, - # funder_id or member_id is a query parameter - if options[:organization_id].present? - # TODO: remove after organization_id has been indexed - should << { term: { "creators.nameIdentifiers.nameIdentifier" => "https://#{ror_from_url(options[:organization_id])}" } } - # TODO: remove after organization_id has been indexed - should << { term: { "contributors.nameIdentifiers.nameIdentifier" => "https://#{ror_from_url(options[:organization_id])}" } } - should << { term: { "organization_id" => ror_from_url(options[:organization_id]) } } - minimum_should_match = 1 - end - - if options[:fair_organization_id].present? - _ror_id = ror_from_url(options[:fair_organization_id]) - should << { term: { "organization_id" => _ror_id } } - should << { term: { "affiliation_id" => _ror_id } } - should << { term: { "related_dmp_organization_id" => _ror_id } } - minimum_should_match = 1 - end - - if options[:affiliation_id].present? - should << { term: { "affiliation_id" => ror_from_url(options[:affiliation_id]) } } - minimum_should_match = 1 - end - if options[:funder_id].present? - should << { terms: { "funding_references.funderIdentifier" => options[:funder_id].split(",").map { |f| "https://doi.org/#{doi_from_url(f)}" } } } - minimum_should_match = 1 - end - if options[:member_id].present? - should << { term: { "provider.ror_id" => "https://#{ror_from_url(options[:member_id])}" } } - minimum_should_match = 1 - end - - es_query = { - bool: { - must: must, - filter: filter, - should: should, - minimum_should_match: minimum_should_match, - }, - } - - __elasticsearch__.search({ - size: options.dig(:page, :size), - search_after: search_after, - sort: sort, - query: es_query, - aggregations: aggregations, - track_total_hits: true, - }.compact) + builder = Doi::GraphqlQuery::Builder.new(query, options) + __elasticsearch__.search( + builder.search_query + ) end def self.query(query, options = {}) diff --git a/app/models/doi/graphql_query.rb b/app/models/doi/graphql_query.rb new file mode 100644 index 000000000..c447e3894 --- /dev/null +++ b/app/models/doi/graphql_query.rb @@ -0,0 +1,307 @@ +# frozen_string_literal: true + +module Doi::GraphqlQuery + class Builder + include Modelable + + def initialize(query, options) + @query = query + @options = options + end + + def search_query + inner_query(@query, @options) + end + + def query_fields + [ + "uid^50", + "related_identifiers.relatedIdentifier^3", + "titles.title^3", + "creator_names^3", + "creators.id^3", + "publisher^3", + "descriptions.description^3", + "subjects.subject^3" + ] + end + + def inner_query(query, options) + options[:page] ||= {} + options[:facet_count] = (options[:facet_count] || 10).to_i + aggregations = gql_query_aggregations(facet_count: options[:facet_count]) + + # cursor nav uses search_after, this should always be an array of values that match the sort. + # make sure we have a valid cursor + cursor = [0, ""] + if options.dig(:page, :cursor).is_a?(Array) + timestamp, uid = options.dig(:page, :cursor) + cursor = [timestamp.to_i, uid.to_s] + elsif options.dig(:page, :cursor).is_a?(String) + timestamp, uid = options.dig(:page, :cursor).split(",") + cursor = [timestamp.to_i, uid.to_s] + end + + # from = 0 + search_after = cursor + sort = [{ created: "asc", uid: "asc" }] + + # make sure field name uses underscore + # escape forward slash, but not other Elasticsearch special characters + if query.present? + query = query.gsub(/publicationYear/, "publication_year") + query = query.gsub(/relatedIdentifiers/, "related_identifiers") + query = query.gsub(/relatedItems/, "related_items") + query = query.gsub(/rightsList/, "rights_list") + query = query.gsub(/fundingReferences/, "funding_references") + query = query.gsub(/geoLocations/, "geo_locations") + query = query.gsub(/landingPage/, "landing_page") + query = query.gsub(/contentUrl/, "content_url") + query = query.gsub(/citationCount/, "citation_count") + query = query.gsub(/viewCount/, "view_count") + query = query.gsub(/downloadCount/, "download_count") + query = query.gsub("/", "\\/") + end + + # turn ids into an array if provided as comma-separated string + options[:ids] = options[:ids].split(",") if options[:ids].is_a?(String) + + if query.present? + must = [{ query_string: { query: query, fields: query_fields, default_operator: "AND", phrase_slop: 1 } }] + else + must = [{ match_all: {} }] + end + + filter = [] + should = [] + minimum_should_match = 0 + + filter << { terms: { doi: options[:ids].map(&:upcase) } } if options[:ids].present? + filter << { term: { "types.resourceTypeGeneral": options[:resource_type_id].underscore.camelize } } if options[:resource_type_id].present? + filter << { terms: { "types.resourceType": options[:resource_type].split(",") } } if options[:resource_type].present? + filter << { terms: { provider_id: options[:provider_id].split(",") } } if options[:provider_id].present? + filter << { terms: { client_id: options[:client_id].to_s.split(",") } } if options[:client_id].present? + filter << { terms: { agency: options[:agency].split(",").map(&:downcase) } } if options[:agency].present? + filter << { terms: { prefix: options[:prefix].to_s.split(",") } } if options[:prefix].present? + filter << { terms: { language: options[:language].to_s.split(",").map(&:downcase) } } if options[:language].present? + filter << { term: { uid: options[:uid] } } if options[:uid].present? + filter << { range: { created: { gte: "#{options[:created].split(',').min}||/y", lte: "#{options[:created].split(',').max}||/y", format: "yyyy" } } } if options[:created].present? + filter << { range: { publication_year: { gte: "#{options[:published].split(',').min}||/y", lte: "#{options[:published].split(',').max}||/y", format: "yyyy" } } } if options[:published].present? + filter << { term: { schema_version: "http://datacite.org/schema/kernel-#{options[:schema_version]}" } } if options[:schema_version].present? + filter << { terms: { "subjects.subject": options[:subject].split(",") } } if options[:subject].present? + if options[:pid_entity].present? + filter << { term: { "subjects.subjectScheme": "PidEntity" } } + filter << { terms: { "subjects.subject": options[:pid_entity].split(",").map(&:humanize) } } + end + if options[:field_of_science].present? + filter << { term: { "subjects.subjectScheme": "Fields of Science and Technology (FOS)" } } + filter << { terms: { "subjects.subject": options[:field_of_science].split(",").map { |s| "FOS: " + s.humanize } } } + end + if options[:field_of_science_repository].present? + filter << { terms: { "fields_of_science_repository": options[:field_of_science_repository].split(",").map { |s| s.humanize } } } + end + if options[:field_of_science_combined].present? + filter << { terms: { "fields_of_science_combined": options[:field_of_science_combined].split(",").map { |s| s.humanize } } } + end + filter << { terms: { "rights_list.rightsIdentifier" => options[:license].split(",") } } if options[:license].present? + filter << { term: { source: options[:source] } } if options[:source].present? + filter << { range: { reference_count: { "gte": options[:has_references].to_i } } } if options[:has_references].present? + filter << { range: { citation_count: { "gte": options[:has_citations].to_i } } } if options[:has_citations].present? + filter << { range: { part_count: { "gte": options[:has_parts].to_i } } } if options[:has_parts].present? + filter << { range: { part_of_count: { "gte": options[:has_part_of].to_i } } } if options[:has_part_of].present? + filter << { range: { version_count: { "gte": options[:has_versions].to_i } } } if options[:has_versions].present? + filter << { range: { version_of_count: { "gte": options[:has_version_of].to_i } } } if options[:has_version_of].present? + filter << { range: { view_count: { "gte": options[:has_views].to_i } } } if options[:has_views].present? + filter << { range: { download_count: { "gte": options[:has_downloads].to_i } } } if options[:has_downloads].present? + filter << { term: { "landing_page.status": options[:link_check_status] } } if options[:link_check_status].present? + filter << { exists: { field: "landing_page.checked" } } if options[:link_checked].present? + filter << { term: { "landing_page.hasSchemaOrg": options[:link_check_has_schema_org] } } if options[:link_check_has_schema_org].present? + filter << { term: { "landing_page.bodyHasPid": options[:link_check_body_has_pid] } } if options[:link_check_body_has_pid].present? + filter << { exists: { field: "landing_page.schemaOrgId" } } if options[:link_check_found_schema_org_id].present? + filter << { exists: { field: "landing_page.dcIdentifier" } } if options[:link_check_found_dc_identifier].present? + filter << { exists: { field: "landing_page.citationDoi" } } if options[:link_check_found_citation_doi].present? + filter << { range: { "landing_page.redirectCount": { "gte": options[:link_check_redirect_count_gte] } } } if options[:link_check_redirect_count_gte].present? + filter << { terms: { aasm_state: options[:state].to_s.split(",") } } if options[:state].present? + filter << { range: { registered: { gte: "#{options[:registered].split(',').min}||/y", lte: "#{options[:registered].split(',').max}||/y", format: "yyyy" } } } if options[:registered].present? + filter << { term: { consortium_id: options[:consortium_id] } } if options[:consortium_id].present? + # TODO align PID parsing + filter << { term: { "client.re3data_id" => doi_from_url(options[:re3data_id]) } } if options[:re3data_id].present? + filter << { term: { "client.opendoar_id" => options[:opendoar_id] } } if options[:opendoar_id].present? + filter << { terms: { "client.certificate" => options[:certificate].split(",") } } if options[:certificate].present? + filter << { terms: { "creators.nameIdentifiers.nameIdentifier" => options[:user_id].split(",").collect { |id| "https://orcid.org/#{orcid_from_url(id)}" } } } if options[:user_id].present? + filter << { term: { "creators.nameIdentifiers.nameIdentifierScheme" => "ORCID" } } if options[:has_person].present? + + # match either one of has_affiliation, has_organization, has_funder or has_member + if options[:has_organization].present? + should << { term: { "creators.nameIdentifiers.nameIdentifierScheme" => "ROR" } } + should << { term: { "contributors.nameIdentifiers.nameIdentifierScheme" => "ROR" } } + minimum_should_match = 1 + end + if options[:has_affiliation].present? + should << { term: { "creators.affiliation.affiliationIdentifierScheme" => "ROR" } } + should << { term: { "contributors.affiliation.affiliationIdentifierScheme" => "ROR" } } + minimum_should_match = 1 + end + if options[:has_funder].present? + should << { term: { "funding_references.funderIdentifierType" => "Crossref Funder ID" } } + minimum_should_match = 1 + end + if options[:has_member].present? + should << { exists: { field: "provider.ror_id" } } + minimum_should_match = 1 + end + + # match either ROR ID or Crossref Funder ID if either organization_id, affiliation_id, + # funder_id or member_id is a query parameter + if options[:organization_id].present? + # TODO: remove after organization_id has been indexed + should << { term: { "creators.nameIdentifiers.nameIdentifier" => "https://#{ror_from_url(options[:organization_id])}" } } + # TODO: remove after organization_id has been indexed + should << { term: { "contributors.nameIdentifiers.nameIdentifier" => "https://#{ror_from_url(options[:organization_id])}" } } + should << { term: { "organization_id" => ror_from_url(options[:organization_id]) } } + minimum_should_match = 1 + end + + if options[:fair_organization_id].present? + _ror_id = ror_from_url(options[:fair_organization_id]) + should << { term: { "organization_id" => _ror_id } } + should << { term: { "affiliation_id" => _ror_id } } + should << { term: { "related_dmp_organization_id" => _ror_id } } + minimum_should_match = 1 + end + + if options[:affiliation_id].present? + should << { term: { "affiliation_id" => ror_from_url(options[:affiliation_id]) } } + minimum_should_match = 1 + end + if options[:funder_id].present? + should << { terms: { "funding_references.funderIdentifier" => options[:funder_id].split(",").map { |f| "https://doi.org/#{doi_from_url(f)}" } } } + minimum_should_match = 1 + end + if options[:member_id].present? + should << { term: { "provider.ror_id" => "https://#{ror_from_url(options[:member_id])}" } } + minimum_should_match = 1 + end + + es_query = { + bool: { + must: must, + filter: filter, + should: should, + minimum_should_match: minimum_should_match, + }, + } + + { + size: options.dig(:page, :size), + search_after: search_after, + sort: sort, + query: es_query, + aggregations: aggregations, + track_total_hits: true, + }.compact + end + + def gql_query_aggregations(facet_count: 10) + if facet_count.positive? + { + resource_types: { terms: { field: "resource_type_id_and_name", size: facet_count, min_doc_count: 1, missing: "__missing__" } }, + clients: { terms: { field: "client_id_and_name", size: facet_count, min_doc_count: 1 } }, + open_licenses: { + filter: { terms: { "rights_list.rightsIdentifier": ["cc-by-1.0", "cc-by-2.0", "cc-by-2.5", "cc-by-3.0", "cc-by-3.0-at", "cc-by-3.0-us", "cc-by-4.0", "cc-pddc", "cc0-1.0", "cc-pdm-1.0"] } }, + aggs: { + resource_types: { + terms: { field: "resource_type_id_and_name", size: facet_count, min_doc_count: 1 } + } + } + }, + published: { + date_histogram: { + field: "publication_year", + interval: "year", + format: "year", + order: { + _key: "desc", + }, + min_doc_count: 1, + }, + }, + registration_agencies: { terms: { field: "agency", size: facet_count, min_doc_count: 1 } }, + affiliations: { terms: { field: "affiliation_id_and_name", size: facet_count, min_doc_count: 1, missing: "__missing__" } }, + authors: { + terms: { field: "creators.nameIdentifiers.nameIdentifier", size: facet_count, min_doc_count: 1, include: "https?://orcid.org/.*" }, + aggs: { + authors: { + top_hits: { + _source: { + includes: [ "creators.name", "creators.nameIdentifiers.nameIdentifier"] + }, + size: 1 + } + } + } + }, + creators_and_contributors: { + terms: { field: "creators_and_contributors.nameIdentifiers.nameIdentifier", size: facet_count, min_doc_count: 1, include: "https?://orcid.org/.*" }, + aggs: { + creators_and_contributors: { + top_hits: { + _source: { + includes: [ "creators_and_contributors.name", "creators_and_contributors.nameIdentifiers.nameIdentifier"] + }, + size: 1 + } + } + } + }, + funders: { + terms: { + field: "funding_references.funderIdentifier", + size: facet_count, + min_doc_count: 1 + }, + aggs: { + funders: { + top_hits: { + _source: { + includes: [ + "funding_references.funderName", + "funding_references.funderIdentifier" + ] + }, + size: 1 + } + } + } + }, + pid_entities: { + filter: { term: { "subjects.subjectScheme": "PidEntity" } }, + aggs: { + subject: { terms: { field: "subjects.subject", size: facet_count, min_doc_count: 1, + include: %w(Dataset Publication Software Organization Funder Person Grant Sample Instrument Repository Project) } }, + }, + }, + fields_of_science: { + filter: { term: { "subjects.subjectScheme": "Fields of Science and Technology (FOS)" } }, + aggs: { + subject: { terms: { field: "subjects.subject", size: facet_count, min_doc_count: 1, + include: "FOS:.*" } }, + }, + }, + fields_of_science_combined: { + terms: { field: "fields_of_science_combined", size: facet_count, min_doc_count: 1 } + }, + fields_of_science_repository: { + terms: { field: "fields_of_science_repository", size: facet_count, min_doc_count: 1 } + }, + licenses: { terms: { field: "rights_list.rightsIdentifier", size: facet_count, min_doc_count: 1, missing: "__missing__" } }, + languages: { terms: { field: "language", size: facet_count, min_doc_count: 1 } }, + view_count: { sum: { field: "view_count" } }, + download_count: { sum: { field: "download_count" } }, + citation_count: { sum: { field: "citation_count" } }, + content_url_count: { value_count: { field: "content_url" } }, + } + end + end + end +end From 7436d06abad40a805ed39c687375fb7deff97e8a Mon Sep 17 00:00:00 2001 From: jrhoads Date: Tue, 15 Aug 2023 16:46:24 -0400 Subject: [PATCH 3/6] Refactor the Doi:GraphqlQuery::Builder further into smaller functions --- app/models/doi/graphql_query.rb | 169 +++++++++++++++++++------------- 1 file changed, 101 insertions(+), 68 deletions(-) diff --git a/app/models/doi/graphql_query.rb b/app/models/doi/graphql_query.rb index c447e3894..e89abbc77 100644 --- a/app/models/doi/graphql_query.rb +++ b/app/models/doi/graphql_query.rb @@ -4,13 +4,32 @@ module Doi::GraphqlQuery class Builder include Modelable + DEFAULT_CURSOR = [0, ""] + DEFAULT_PAGE_SIZE = 0 + DEFAULT_FACET_COUNT = 10 + def initialize(query, options) @query = query @options = options end def search_query - inner_query(@query, @options) + { + size: size, + search_after: search_after, + sort: sort, + query: inner_query, + aggregations: aggregations, + track_total_hits: true, + }.compact + end + + def size + (@options.dig(:page, :size)|| DEFAULT_PAGE_SIZE).to_i + end + + def sort + [{ created: "asc", uid: "asc" }] end def query_fields @@ -26,56 +45,63 @@ def query_fields ] end - def inner_query(query, options) - options[:page] ||= {} - options[:facet_count] = (options[:facet_count] || 10).to_i - aggregations = gql_query_aggregations(facet_count: options[:facet_count]) + def cursor + tmp_cursor = @options.dig(:page,:cursor) + if tmp_cursor.nil? + return DEFAULT_CURSOR + end - # cursor nav uses search_after, this should always be an array of values that match the sort. - # make sure we have a valid cursor - cursor = [0, ""] - if options.dig(:page, :cursor).is_a?(Array) - timestamp, uid = options.dig(:page, :cursor) - cursor = [timestamp.to_i, uid.to_s] - elsif options.dig(:page, :cursor).is_a?(String) - timestamp, uid = options.dig(:page, :cursor).split(",") - cursor = [timestamp.to_i, uid.to_s] + if tmp_cursor.is_a?(Array) + timestamp, uid = tmp_cursor + elsif tmp_cursor.is_a?(String) + timestamp, uid = tmp_cursor.split(",") end + [timestamp.to_i, uid.to_s] + end - # from = 0 - search_after = cursor - sort = [{ created: "asc", uid: "asc" }] + def search_after + cursor + end + def clean_query # make sure field name uses underscore # escape forward slash, but not other Elasticsearch special characters - if query.present? - query = query.gsub(/publicationYear/, "publication_year") - query = query.gsub(/relatedIdentifiers/, "related_identifiers") - query = query.gsub(/relatedItems/, "related_items") - query = query.gsub(/rightsList/, "rights_list") - query = query.gsub(/fundingReferences/, "funding_references") - query = query.gsub(/geoLocations/, "geo_locations") - query = query.gsub(/landingPage/, "landing_page") - query = query.gsub(/contentUrl/, "content_url") - query = query.gsub(/citationCount/, "citation_count") - query = query.gsub(/viewCount/, "view_count") - query = query.gsub(/downloadCount/, "download_count") - query = query.gsub("/", "\\/") + if @query.present? + @query.gsub(/publicationYear/, "publication_year")\ + .gsub(/relatedIdentifiers/, "related_identifiers")\ + .gsub(/relatedItems/, "related_items")\ + .gsub(/rightsList/, "rights_list")\ + .gsub(/fundingReferences/, "funding_references")\ + .gsub(/geoLocations/, "geo_locations")\ + .gsub(/landingPage/, "landing_page")\ + .gsub(/contentUrl/, "content_url")\ + .gsub(/citationCount/, "citation_count")\ + .gsub(/viewCount/, "view_count")\ + .gsub(/downloadCount/, "download_count")\ + .gsub("/", "\\/") + else + @query end + end - # turn ids into an array if provided as comma-separated string - options[:ids] = options[:ids].split(",") if options[:ids].is_a?(String) - - if query.present? - must = [{ query_string: { query: query, fields: query_fields, default_operator: "AND", phrase_slop: 1 } }] + def must + if !@query.present? + [{ match_all: {} }] else - must = [{ match_all: {} }] + [{ + query_string: { + query: clean_query, + fields: query_fields, + default_operator: "AND", + phrase_slop: 1 + } + }] end + end + def filters + options = @options filter = [] - should = [] - minimum_should_match = 0 - filter << { terms: { doi: options[:ids].map(&:upcase) } } if options[:ids].present? filter << { term: { "types.resourceTypeGeneral": options[:resource_type_id].underscore.camelize } } if options[:resource_type_id].present? filter << { terms: { "types.resourceType": options[:resource_type].split(",") } } if options[:resource_type].present? @@ -131,23 +157,30 @@ def inner_query(query, options) filter << { terms: { "creators.nameIdentifiers.nameIdentifier" => options[:user_id].split(",").collect { |id| "https://orcid.org/#{orcid_from_url(id)}" } } } if options[:user_id].present? filter << { term: { "creators.nameIdentifiers.nameIdentifierScheme" => "ORCID" } } if options[:has_person].present? + filter + end + + def get_should_clause + options = @options + should_query = [] + minimum_should_match = 0 # match either one of has_affiliation, has_organization, has_funder or has_member if options[:has_organization].present? - should << { term: { "creators.nameIdentifiers.nameIdentifierScheme" => "ROR" } } - should << { term: { "contributors.nameIdentifiers.nameIdentifierScheme" => "ROR" } } + should_query << { term: { "creators.nameIdentifiers.nameIdentifierScheme" => "ROR" } } + should_query << { term: { "contributors.nameIdentifiers.nameIdentifierScheme" => "ROR" } } minimum_should_match = 1 end if options[:has_affiliation].present? - should << { term: { "creators.affiliation.affiliationIdentifierScheme" => "ROR" } } - should << { term: { "contributors.affiliation.affiliationIdentifierScheme" => "ROR" } } + should_query << { term: { "creators.affiliation.affiliationIdentifierScheme" => "ROR" } } + should_query << { term: { "contributors.affiliation.affiliationIdentifierScheme" => "ROR" } } minimum_should_match = 1 end if options[:has_funder].present? - should << { term: { "funding_references.funderIdentifierType" => "Crossref Funder ID" } } + should_query << { term: { "funding_references.funderIdentifierType" => "Crossref Funder ID" } } minimum_should_match = 1 end if options[:has_member].present? - should << { exists: { field: "provider.ror_id" } } + should_query << { exists: { field: "provider.ror_id" } } minimum_should_match = 1 end @@ -155,54 +188,54 @@ def inner_query(query, options) # funder_id or member_id is a query parameter if options[:organization_id].present? # TODO: remove after organization_id has been indexed - should << { term: { "creators.nameIdentifiers.nameIdentifier" => "https://#{ror_from_url(options[:organization_id])}" } } + should_query << { term: { "creators.nameIdentifiers.nameIdentifier" => "https://#{ror_from_url(options[:organization_id])}" } } # TODO: remove after organization_id has been indexed - should << { term: { "contributors.nameIdentifiers.nameIdentifier" => "https://#{ror_from_url(options[:organization_id])}" } } - should << { term: { "organization_id" => ror_from_url(options[:organization_id]) } } + should_query << { term: { "contributors.nameIdentifiers.nameIdentifier" => "https://#{ror_from_url(options[:organization_id])}" } } + should_query << { term: { "organization_id" => ror_from_url(options[:organization_id]) } } minimum_should_match = 1 end if options[:fair_organization_id].present? _ror_id = ror_from_url(options[:fair_organization_id]) - should << { term: { "organization_id" => _ror_id } } - should << { term: { "affiliation_id" => _ror_id } } - should << { term: { "related_dmp_organization_id" => _ror_id } } + should_query << { term: { "organization_id" => _ror_id } } + should_query << { term: { "affiliation_id" => _ror_id } } + should_query << { term: { "related_dmp_organization_id" => _ror_id } } minimum_should_match = 1 end if options[:affiliation_id].present? - should << { term: { "affiliation_id" => ror_from_url(options[:affiliation_id]) } } + should_query << { term: { "affiliation_id" => ror_from_url(options[:affiliation_id]) } } minimum_should_match = 1 end if options[:funder_id].present? - should << { terms: { "funding_references.funderIdentifier" => options[:funder_id].split(",").map { |f| "https://doi.org/#{doi_from_url(f)}" } } } + should_query << { terms: { "funding_references.funderIdentifier" => options[:funder_id].split(",").map { |f| "https://doi.org/#{doi_from_url(f)}" } } } minimum_should_match = 1 end if options[:member_id].present? - should << { term: { "provider.ror_id" => "https://#{ror_from_url(options[:member_id])}" } } + should_query << { term: { "provider.ror_id" => "https://#{ror_from_url(options[:member_id])}" } } minimum_should_match = 1 end - es_query = { + OpenStruct.new( + should_query: should_query, + minimum_should_match: minimum_should_match + ) + end + + def inner_query + should = get_should_clause + { bool: { must: must, - filter: filter, - should: should, - minimum_should_match: minimum_should_match, + filter: filters, + should: should.should_query, + minimum_should_match: should.minimum_should_match, }, - } - - { - size: options.dig(:page, :size), - search_after: search_after, - sort: sort, - query: es_query, - aggregations: aggregations, - track_total_hits: true, }.compact end - def gql_query_aggregations(facet_count: 10) + def aggregations + facet_count = (@options[:facet_count] || DEFAULT_FACET_COUNT).to_i if facet_count.positive? { resource_types: { terms: { field: "resource_type_id_and_name", size: facet_count, min_doc_count: 1, missing: "__missing__" } }, From 7c5928878ffbe26688bfae0cffef8439adec05c2 Mon Sep 17 00:00:00 2001 From: jrhoads Date: Wed, 16 Aug 2023 12:04:09 -0400 Subject: [PATCH 4/6] Add spec for GraphqlQuery::Builder --- app/models/doi/graphql_query.rb | 17 ++++- spec/models/doi/graphql_query_builder_spec.rb | 65 +++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 spec/models/doi/graphql_query_builder_spec.rb diff --git a/app/models/doi/graphql_query.rb b/app/models/doi/graphql_query.rb index e89abbc77..2cf2ce6aa 100644 --- a/app/models/doi/graphql_query.rb +++ b/app/models/doi/graphql_query.rb @@ -63,6 +63,21 @@ def search_after cursor end + QUERY_SUBSTITUTIONS = { + "publicationYear" => "publication_year", + "relatedIdentifiers" => "related_identifiers", + "relatedItems" => "related_items", + "rightsList" => "rights_list", + "fundingReferences" => "funding_references", + "geoLocations" => "geo_locations", + "landingPage" => "landing_page", + "contentUrl" => "content_url", + "citationCount" => "citation_count", + "viewCount" => "view_count", + "downloadCount" => "download_count" + } + + def clean_query # make sure field name uses underscore # escape forward slash, but not other Elasticsearch special characters @@ -80,7 +95,7 @@ def clean_query .gsub(/downloadCount/, "download_count")\ .gsub("/", "\\/") else - @query + "" end end diff --git a/spec/models/doi/graphql_query_builder_spec.rb b/spec/models/doi/graphql_query_builder_spec.rb new file mode 100644 index 000000000..70cd399ec --- /dev/null +++ b/spec/models/doi/graphql_query_builder_spec.rb @@ -0,0 +1,65 @@ + +# frozen_string_literal: true + +require "rails_helper" + +RSpec.describe Doi::GraphqlQuery::Builder do + + describe "page size" do + let(:query) { "" } + let(:options) { {} } + let(:builder) { described_class.new(query, options) } + + it "is DEFAULT_PAGE_SIZE with no options" do + expect(builder.size).to eq(described_class::DEFAULT_PAGE_SIZE) + end + + context "when set in options" do + let(:test_size) { 10 } + let(:options) { { page: { size: test_size } } } + it "will override DEFAULT_PAGE_SIZE" do + expect(builder.size).to eq(test_size) + end + end + end + + describe "cursor" do + let(:query) { "" } + let(:options) { {} } + let(:builder) { described_class.new(query, options) } + + it "is DEFAULT_CURSOR with no options" do + expect(builder.cursor).to eq(described_class::DEFAULT_CURSOR) + end + + context "when set in options" do + let(:test_cursor) { [1, "2"] } + let(:options) { { page: { cursor: test_cursor } } } + it "will override DEFAULT_CURSOR" do + expect(builder.cursor).to eq(test_cursor) + end + end + end + + describe "cleaned query" do + it "is an empty string if not set" do + expect(described_class.new("", {}).clean_query).to eq("") + expect(described_class.new(nil, {}).clean_query).to eq("") + end + + + it "replaces several camelcase words with underscores" do + described_class::QUERY_SUBSTITUTIONS.each do |key, value| + expect(described_class.new(key, {}).clean_query).to eq(value) + end + end + + it "escapses foward slashes" do + expect(described_class.new("foo/bar", {}).clean_query).to eq("foo\\/bar") + end + + end + + + +end From b056e95e97d7680470020b4a6afa62ffe699604c Mon Sep 17 00:00:00 2001 From: jrhoads Date: Wed, 16 Aug 2023 13:51:28 -0400 Subject: [PATCH 5/6] Appease rubocop --- app/models/doi/graphql_query.rb | 4 ++-- spec/models/doi/graphql_query_builder_spec.rb | 17 ++++++----------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/app/models/doi/graphql_query.rb b/app/models/doi/graphql_query.rb index 2cf2ce6aa..efc1ad3d1 100644 --- a/app/models/doi/graphql_query.rb +++ b/app/models/doi/graphql_query.rb @@ -25,7 +25,7 @@ def search_query end def size - (@options.dig(:page, :size)|| DEFAULT_PAGE_SIZE).to_i + (@options.dig(:page, :size) || DEFAULT_PAGE_SIZE).to_i end def sort @@ -46,7 +46,7 @@ def query_fields end def cursor - tmp_cursor = @options.dig(:page,:cursor) + tmp_cursor = @options.dig(:page, :cursor) if tmp_cursor.nil? return DEFAULT_CURSOR end diff --git a/spec/models/doi/graphql_query_builder_spec.rb b/spec/models/doi/graphql_query_builder_spec.rb index 70cd399ec..b6d5a125a 100644 --- a/spec/models/doi/graphql_query_builder_spec.rb +++ b/spec/models/doi/graphql_query_builder_spec.rb @@ -4,7 +4,6 @@ require "rails_helper" RSpec.describe Doi::GraphqlQuery::Builder do - describe "page size" do let(:query) { "" } let(:options) { {} } @@ -17,9 +16,9 @@ context "when set in options" do let(:test_size) { 10 } let(:options) { { page: { size: test_size } } } - it "will override DEFAULT_PAGE_SIZE" do - expect(builder.size).to eq(test_size) - end + it "will override DEFAULT_PAGE_SIZE" do + expect(builder.size).to eq(test_size) + end end end @@ -35,9 +34,9 @@ context "when set in options" do let(:test_cursor) { [1, "2"] } let(:options) { { page: { cursor: test_cursor } } } - it "will override DEFAULT_CURSOR" do - expect(builder.cursor).to eq(test_cursor) - end + it "will override DEFAULT_CURSOR" do + expect(builder.cursor).to eq(test_cursor) + end end end @@ -57,9 +56,5 @@ it "escapses foward slashes" do expect(described_class.new("foo/bar", {}).clean_query).to eq("foo\\/bar") end - end - - - end From 30ac1292eff62523f561ed10c5a3b2c4b3320983 Mon Sep 17 00:00:00 2001 From: jrhoads Date: Fri, 18 Aug 2023 10:46:32 -0400 Subject: [PATCH 6/6] Rename 'search_query' to 'build_full_search_query' --- app/models/doi.rb | 2 +- app/models/doi/graphql_query.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/models/doi.rb b/app/models/doi.rb index 3634cb067..fde297282 100644 --- a/app/models/doi.rb +++ b/app/models/doi.rb @@ -829,7 +829,7 @@ def self.stats_query(options = {}) def self.gql_query(query, options = {}) builder = Doi::GraphqlQuery::Builder.new(query, options) __elasticsearch__.search( - builder.search_query + builder.build_full_search_query ) end diff --git a/app/models/doi/graphql_query.rb b/app/models/doi/graphql_query.rb index efc1ad3d1..1da39ed92 100644 --- a/app/models/doi/graphql_query.rb +++ b/app/models/doi/graphql_query.rb @@ -13,7 +13,7 @@ def initialize(query, options) @options = options end - def search_query + def build_full_search_query { size: size, search_after: search_after,