From 7339e911f19c13ee8e33e45ccba7c0105300dd04 Mon Sep 17 00:00:00 2001 From: Martin Fenner Date: Fri, 8 May 2020 10:55:28 +0200 Subject: [PATCH] reduce number of fields used in queries --- app/models/doi.rb | 95 ++++++++++++++++++++++++--------------------- app/models/event.rb | 2 +- 2 files changed, 52 insertions(+), 45 deletions(-) diff --git a/app/models/doi.rb b/app/models/doi.rb index 5c6b88d8e..279906536 100644 --- a/app/models/doi.rb +++ b/app/models/doi.rb @@ -634,7 +634,7 @@ def self.export_sub_aggregations end def self.query_fields - ["uid^50", "related_identifiers.relatedIdentifier^3", "funding_references.relatedIdentifier^3", "container.identifier^3", 'titles.title^3', 'creator_names^3', 'creators.name^3', 'creators.id^3', 'publisher^3', 'descriptions.description^3', 'types.resourceTypeGeneral^3', 'subjects.subject^3', 'client.uid^3', 'provider.uid^3', '_all'] + ["uid^50", "related_identifiers.relatedIdentifier^3", 'titles.title^3', 'creator_names^3', 'creators.id^3', 'publisher^3', 'descriptions.description^3', 'subjects.subject^3'] end # return results for one or more ids @@ -744,50 +744,56 @@ def self.query(query, options={}) # turn ids into an array if provided as comma-separated string options[:ids] = options[:ids].split(",") if options[:ids].is_a?(String) - must = [] - must_not = [] + if query.present? + must = [{ query_string: { query: query, fields: query_fields, default_operator: "AND", phrase_slop: 1 } }] + else + must = [{ match_all: {} }] + end - must << { query_string: { query: query, fields: query_fields, default_operator: "AND", phrase_slop: 1 } } if query.present? - must << { terms: { doi: options[:ids].map(&:upcase) }} if options[:ids].present? - must << { term: { "types.resourceTypeGeneral": options[:resource_type_id].underscore.camelize }} if options[:resource_type_id].present? - must << { terms: { "types.resourceType": options[:resource_type].split(",") }} if options[:resource_type].present? - must << { terms: { provider_id: options[:provider_id].split(",") } } if options[:provider_id].present? - must << { terms: { client_id: options[:client_id].to_s.split(",") } } if options[:client_id].present? - must << { terms: { prefix: options[:prefix].to_s.split(",") } } if options[:prefix].present? - must << { term: { uid: options[:uid] }} if options[:uid].present? - must << { range: { created: { gte: "#{options[:created].split(",").min}||/y", lte: "#{options[:created].split(",").max}||/y", format: "yyyy" }}} if options[:created].present? - must << { term: { schema_version: "http://datacite.org/schema/kernel-#{options[:schema_version]}" }} if options[:schema_version].present? - must << { terms: { "subjects.subject": options[:subject].split(",") } } if options[:subject].present? - must << { term: { source: options[:source] } } if options[:source].present? - must << { range: { reference_count: { "gte": options[:has_references].to_i } } } if options[:has_references].present? - must << { range: { citation_count: { "gte": options[:has_citations].to_i } } } if options[:has_citations].present? - must << { range: { part_count: { "gte": options[:has_parts].to_i } } } if options[:has_parts].present? - must << { range: { part_of_count: { "gte": options[:has_part_of].to_i } } } if options[:has_part_of].present? - must << { range: { version_count: { "gte": options[:has_versions].to_i } } } if options[:has_versions].present? - must << { range: { version_of_count: { "gte": options[:has_version_of].to_i } } } if options[:has_version_of].present? - must << { range: { view_count: { "gte": options[:has_views].to_i } } } if options[:has_views].present? - must << { range: { download_count: { "gte": options[:has_downloads].to_i } } } if options[:has_downloads].present? - must << { term: { "landing_page.status": options[:link_check_status] } } if options[:link_check_status].present? - must << { exists: { field: "landing_page.checked" }} if options[:link_checked].present? - must << { term: { "landing_page.hasSchemaOrg": options[:link_check_has_schema_org] }} if options[:link_check_has_schema_org].present? - must << { term: { "landing_page.bodyHasPid": options[:link_check_body_has_pid] }} if options[:link_check_body_has_pid].present? - must << { exists: { field: "landing_page.schemaOrgId" }} if options[:link_check_found_schema_org_id].present? - must << { exists: { field: "landing_page.dcIdentifier" }} if options[:link_check_found_dc_identifier].present? - must << { exists: { field: "landing_page.citationDoi" }} if options[:link_check_found_citation_doi].present? - must << { range: { "landing_page.redirectCount": { "gte": options[:link_check_redirect_count_gte] } } } if options[:link_check_redirect_count_gte].present? - must << { terms: { aasm_state: options[:state].to_s.split(",") }} if options[:state].present? - must << { range: { registered: { gte: "#{options[:registered].split(",").min}||/y", lte: "#{options[:registered].split(",").max}||/y", format: "yyyy" }}} if options[:registered].present? - must << { term: { "creators.nameIdentifiers.nameIdentifier" => "https://orcid.org/#{orcid_from_url(options[:user_id])}" }} if options[:user_id].present? - must << { term: { "affiliation_id" => ror_from_url(options[:affiliation_id]) }} if options[:affiliation_id].present? - must << { term: { "funding_references.funderIdentifier" => "https://doi.org/#{doi_from_url(options[:funder_id])}" }} if options[:funder_id].present? - must << { term: { "creators.nameIdentifiers.nameIdentifierScheme" => "ORCID" }} if options[:has_person].present? - must << { term: { "creators.affiliation.affiliationIdentifierScheme" => "ROR" }} if options[:has_organization].present? - must << { term: { "funding_references.funderIdentifierType" => "Crossref Funder ID" }} if options[:has_funder].present? - must << { term: { consortium_id: options[:consortium_id] }} if options[:consortium_id].present? + must_not = [] + filter = [] + + filter << { terms: { doi: options[:ids].map(&:upcase) }} if options[:ids].present? + filter << { term: { "types.resourceTypeGeneral": options[:resource_type_id].underscore.camelize }} if options[:resource_type_id].present? + filter << { terms: { "types.resourceType": options[:resource_type].split(",") }} if options[:resource_type].present? + filter << { terms: { provider_id: options[:provider_id].split(",") } } if options[:provider_id].present? + filter << { terms: { client_id: options[:client_id].to_s.split(",") } } if options[:client_id].present? + filter << { terms: { prefix: options[:prefix].to_s.split(",") } } if options[:prefix].present? + filter << { term: { uid: options[:uid] }} if options[:uid].present? + filter << { range: { created: { gte: "#{options[:created].split(",").min}||/y", lte: "#{options[:created].split(",").max}||/y", format: "yyyy" }}} if options[:created].present? + filter << { term: { schema_version: "http://datacite.org/schema/kernel-#{options[:schema_version]}" }} if options[:schema_version].present? + filter << { terms: { "subjects.subject": options[:subject].split(",") } } if options[:subject].present? + filter << { term: { source: options[:source] } } if options[:source].present? + filter << { range: { reference_count: { "gte": options[:has_references].to_i } } } if options[:has_references].present? + filter << { range: { citation_count: { "gte": options[:has_citations].to_i } } } if options[:has_citations].present? + filter << { range: { part_count: { "gte": options[:has_parts].to_i } } } if options[:has_parts].present? + filter << { range: { part_of_count: { "gte": options[:has_part_of].to_i } } } if options[:has_part_of].present? + filter << { range: { version_count: { "gte": options[:has_versions].to_i } } } if options[:has_versions].present? + filter << { range: { version_of_count: { "gte": options[:has_version_of].to_i } } } if options[:has_version_of].present? + filter << { range: { view_count: { "gte": options[:has_views].to_i } } } if options[:has_views].present? + filter << { range: { download_count: { "gte": options[:has_downloads].to_i } } } if options[:has_downloads].present? + filter << { term: { "landing_page.status": options[:link_check_status] } } if options[:link_check_status].present? + filter << { exists: { field: "landing_page.checked" }} if options[:link_checked].present? + filter << { term: { "landing_page.hasSchemaOrg": options[:link_check_has_schema_org] }} if options[:link_check_has_schema_org].present? + filter << { term: { "landing_page.bodyHasPid": options[:link_check_body_has_pid] }} if options[:link_check_body_has_pid].present? + filter << { exists: { field: "landing_page.schemaOrgId" }} if options[:link_check_found_schema_org_id].present? + filter << { exists: { field: "landing_page.dcIdentifier" }} if options[:link_check_found_dc_identifier].present? + filter << { exists: { field: "landing_page.citationDoi" }} if options[:link_check_found_citation_doi].present? + filter << { range: { "landing_page.redirectCount": { "gte": options[:link_check_redirect_count_gte] } } } if options[:link_check_redirect_count_gte].present? + filter << { terms: { aasm_state: options[:state].to_s.split(",") }} if options[:state].present? + filter << { range: { registered: { gte: "#{options[:registered].split(",").min}||/y", lte: "#{options[:registered].split(",").max}||/y", format: "yyyy" }}} if options[:registered].present? + filter << { term: { "creators.nameIdentifiers.nameIdentifier" => "https://orcid.org/#{orcid_from_url(options[:user_id])}" }} if options[:user_id].present? + filter << { term: { "affiliation_id" => ror_from_url(options[:affiliation_id]) }} if options[:affiliation_id].present? + filter << { term: { "funding_references.funderIdentifier" => "https://doi.org/#{doi_from_url(options[:funder_id])}" }} if options[:funder_id].present? + filter << { term: { "creators.nameIdentifiers.nameIdentifierScheme" => "ORCID" }} if options[:has_person].present? + filter << { term: { "creators.affiliation.affiliationIdentifierScheme" => "ROR" }} if options[:has_organization].present? + filter << { term: { "funding_references.funderIdentifierType" => "Crossref Funder ID" }} if options[:has_funder].present? + filter << { term: { consortium_id: options[:consortium_id] }} if options[:consortium_id].present? # TODO align PID parsing - must << { term: { "client.re3data_id" => doi_from_url(options[:re3data_id]) }} if options[:re3data_id].present? - must << { term: { "client.opendoar_id" => options[:opendoar_id] }} if options[:opendoar_id].present? - must << { terms: { "client.certificate" => options[:certificate].split(",") }} if options[:certificate].present? + filter << { term: { "client.re3data_id" => doi_from_url(options[:re3data_id]) }} if options[:re3data_id].present? + filter << { term: { "client.opendoar_id" => options[:opendoar_id] }} if options[:opendoar_id].present? + filter << { terms: { "client.certificate" => options[:certificate].split(",") }} if options[:certificate].present? + must_not << { terms: { provider_id: ["crossref", "medra", "op"] }} if options[:exclude_registration_agencies] # ES query can be optionally defined in different ways @@ -798,7 +804,8 @@ def self.query(query, options={}) # The main bool query with filters bool_query = { must: must, - must_not: must_not + must_not: must_not, + filter: filter } # Function score is used to provide varying score to return different values diff --git a/app/models/event.rb b/app/models/event.rb index 565f09f00..6ea95c503 100644 --- a/app/models/event.rb +++ b/app/models/event.rb @@ -208,7 +208,7 @@ def citation_id end def self.query_fields - ["subj_id^10", "obj_id^10", "subj.name^5", "subj.author^5", "subj.periodical^5", "subj.publisher^5", "obj.name^5", "obj.author^5", "obj.periodical^5", "obj.publisher^5", "_all"] + ["subj_id^10", "obj_id^10", "source_id", "relation_type_id"] end def self.query_aggregations