Skip to content

Commit

Permalink
reduce number of fields used in queries
Browse files (browse the repository at this point in the history)
  • Loading branch information
Martin Fenner committed May 8, 2020
1 parent 91387a2 commit 7339e91
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 45 deletions.
95 changes: 51 additions & 44 deletions app/models/doi.rb
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ def self.export_sub_aggregations
end

# Returns the array of field-name strings that self.query passes as `fields:`
# to its query_string clause. A "^N" suffix on a name is presumably the
# Elasticsearch per-field boost -- confirm against the query_string docs.
def self.query_fields
# NOTE(review): this text is a rendered diff whose +/- markers were stripped.
# The literal below appears to be the pre-commit (removed) field list. Read as
# plain Ruby it is evaluated and discarded, since only the last expression of
# a method is returned.
["uid^50", "related_identifiers.relatedIdentifier^3", "funding_references.relatedIdentifier^3", "container.identifier^3", 'titles.title^3', 'creator_names^3', 'creators.name^3', 'creators.id^3', 'publisher^3', 'descriptions.description^3', 'types.resourceTypeGeneral^3', 'subjects.subject^3', 'client.uid^3', 'provider.uid^3', '_all']
# Effective return value: the shorter list (presumably the post-commit line),
# which no longer contains '_all' or the funder/container/client/provider entries.
["uid^50", "related_identifiers.relatedIdentifier^3", 'titles.title^3', 'creator_names^3', 'creators.id^3', 'publisher^3', 'descriptions.description^3', 'subjects.subject^3']
end

# return results for one or more ids
Expand Down Expand Up @@ -744,50 +744,56 @@ def self.query(query, options={})
# turn ids into an array if provided as comma-separated string
options[:ids] = options[:ids].split(",") if options[:ids].is_a?(String)

must = []
must_not = []
if query.present?
must = [{ query_string: { query: query, fields: query_fields, default_operator: "AND", phrase_slop: 1 } }]
else
must = [{ match_all: {} }]
end

must << { query_string: { query: query, fields: query_fields, default_operator: "AND", phrase_slop: 1 } } if query.present?
must << { terms: { doi: options[:ids].map(&:upcase) }} if options[:ids].present?
must << { term: { "types.resourceTypeGeneral": options[:resource_type_id].underscore.camelize }} if options[:resource_type_id].present?
must << { terms: { "types.resourceType": options[:resource_type].split(",") }} if options[:resource_type].present?
must << { terms: { provider_id: options[:provider_id].split(",") } } if options[:provider_id].present?
must << { terms: { client_id: options[:client_id].to_s.split(",") } } if options[:client_id].present?
must << { terms: { prefix: options[:prefix].to_s.split(",") } } if options[:prefix].present?
must << { term: { uid: options[:uid] }} if options[:uid].present?
must << { range: { created: { gte: "#{options[:created].split(",").min}||/y", lte: "#{options[:created].split(",").max}||/y", format: "yyyy" }}} if options[:created].present?
must << { term: { schema_version: "http://datacite.org/schema/kernel-#{options[:schema_version]}" }} if options[:schema_version].present?
must << { terms: { "subjects.subject": options[:subject].split(",") } } if options[:subject].present?
must << { term: { source: options[:source] } } if options[:source].present?
must << { range: { reference_count: { "gte": options[:has_references].to_i } } } if options[:has_references].present?
must << { range: { citation_count: { "gte": options[:has_citations].to_i } } } if options[:has_citations].present?
must << { range: { part_count: { "gte": options[:has_parts].to_i } } } if options[:has_parts].present?
must << { range: { part_of_count: { "gte": options[:has_part_of].to_i } } } if options[:has_part_of].present?
must << { range: { version_count: { "gte": options[:has_versions].to_i } } } if options[:has_versions].present?
must << { range: { version_of_count: { "gte": options[:has_version_of].to_i } } } if options[:has_version_of].present?
must << { range: { view_count: { "gte": options[:has_views].to_i } } } if options[:has_views].present?
must << { range: { download_count: { "gte": options[:has_downloads].to_i } } } if options[:has_downloads].present?
must << { term: { "landing_page.status": options[:link_check_status] } } if options[:link_check_status].present?
must << { exists: { field: "landing_page.checked" }} if options[:link_checked].present?
must << { term: { "landing_page.hasSchemaOrg": options[:link_check_has_schema_org] }} if options[:link_check_has_schema_org].present?
must << { term: { "landing_page.bodyHasPid": options[:link_check_body_has_pid] }} if options[:link_check_body_has_pid].present?
must << { exists: { field: "landing_page.schemaOrgId" }} if options[:link_check_found_schema_org_id].present?
must << { exists: { field: "landing_page.dcIdentifier" }} if options[:link_check_found_dc_identifier].present?
must << { exists: { field: "landing_page.citationDoi" }} if options[:link_check_found_citation_doi].present?
must << { range: { "landing_page.redirectCount": { "gte": options[:link_check_redirect_count_gte] } } } if options[:link_check_redirect_count_gte].present?
must << { terms: { aasm_state: options[:state].to_s.split(",") }} if options[:state].present?
must << { range: { registered: { gte: "#{options[:registered].split(",").min}||/y", lte: "#{options[:registered].split(",").max}||/y", format: "yyyy" }}} if options[:registered].present?
must << { term: { "creators.nameIdentifiers.nameIdentifier" => "https://orcid.org/#{orcid_from_url(options[:user_id])}" }} if options[:user_id].present?
must << { term: { "affiliation_id" => ror_from_url(options[:affiliation_id]) }} if options[:affiliation_id].present?
must << { term: { "funding_references.funderIdentifier" => "https://doi.org/#{doi_from_url(options[:funder_id])}" }} if options[:funder_id].present?
must << { term: { "creators.nameIdentifiers.nameIdentifierScheme" => "ORCID" }} if options[:has_person].present?
must << { term: { "creators.affiliation.affiliationIdentifierScheme" => "ROR" }} if options[:has_organization].present?
must << { term: { "funding_references.funderIdentifierType" => "Crossref Funder ID" }} if options[:has_funder].present?
must << { term: { consortium_id: options[:consortium_id] }} if options[:consortium_id].present?
must_not = []
filter = []

filter << { terms: { doi: options[:ids].map(&:upcase) }} if options[:ids].present?
filter << { term: { "types.resourceTypeGeneral": options[:resource_type_id].underscore.camelize }} if options[:resource_type_id].present?
filter << { terms: { "types.resourceType": options[:resource_type].split(",") }} if options[:resource_type].present?
filter << { terms: { provider_id: options[:provider_id].split(",") } } if options[:provider_id].present?
filter << { terms: { client_id: options[:client_id].to_s.split(",") } } if options[:client_id].present?
filter << { terms: { prefix: options[:prefix].to_s.split(",") } } if options[:prefix].present?
filter << { term: { uid: options[:uid] }} if options[:uid].present?
filter << { range: { created: { gte: "#{options[:created].split(",").min}||/y", lte: "#{options[:created].split(",").max}||/y", format: "yyyy" }}} if options[:created].present?
filter << { term: { schema_version: "http://datacite.org/schema/kernel-#{options[:schema_version]}" }} if options[:schema_version].present?
filter << { terms: { "subjects.subject": options[:subject].split(",") } } if options[:subject].present?
filter << { term: { source: options[:source] } } if options[:source].present?
filter << { range: { reference_count: { "gte": options[:has_references].to_i } } } if options[:has_references].present?
filter << { range: { citation_count: { "gte": options[:has_citations].to_i } } } if options[:has_citations].present?
filter << { range: { part_count: { "gte": options[:has_parts].to_i } } } if options[:has_parts].present?
filter << { range: { part_of_count: { "gte": options[:has_part_of].to_i } } } if options[:has_part_of].present?
filter << { range: { version_count: { "gte": options[:has_versions].to_i } } } if options[:has_versions].present?
filter << { range: { version_of_count: { "gte": options[:has_version_of].to_i } } } if options[:has_version_of].present?
filter << { range: { view_count: { "gte": options[:has_views].to_i } } } if options[:has_views].present?
filter << { range: { download_count: { "gte": options[:has_downloads].to_i } } } if options[:has_downloads].present?
filter << { term: { "landing_page.status": options[:link_check_status] } } if options[:link_check_status].present?
filter << { exists: { field: "landing_page.checked" }} if options[:link_checked].present?
filter << { term: { "landing_page.hasSchemaOrg": options[:link_check_has_schema_org] }} if options[:link_check_has_schema_org].present?
filter << { term: { "landing_page.bodyHasPid": options[:link_check_body_has_pid] }} if options[:link_check_body_has_pid].present?
filter << { exists: { field: "landing_page.schemaOrgId" }} if options[:link_check_found_schema_org_id].present?
filter << { exists: { field: "landing_page.dcIdentifier" }} if options[:link_check_found_dc_identifier].present?
filter << { exists: { field: "landing_page.citationDoi" }} if options[:link_check_found_citation_doi].present?
filter << { range: { "landing_page.redirectCount": { "gte": options[:link_check_redirect_count_gte] } } } if options[:link_check_redirect_count_gte].present?
filter << { terms: { aasm_state: options[:state].to_s.split(",") }} if options[:state].present?
filter << { range: { registered: { gte: "#{options[:registered].split(",").min}||/y", lte: "#{options[:registered].split(",").max}||/y", format: "yyyy" }}} if options[:registered].present?
filter << { term: { "creators.nameIdentifiers.nameIdentifier" => "https://orcid.org/#{orcid_from_url(options[:user_id])}" }} if options[:user_id].present?
filter << { term: { "affiliation_id" => ror_from_url(options[:affiliation_id]) }} if options[:affiliation_id].present?
filter << { term: { "funding_references.funderIdentifier" => "https://doi.org/#{doi_from_url(options[:funder_id])}" }} if options[:funder_id].present?
filter << { term: { "creators.nameIdentifiers.nameIdentifierScheme" => "ORCID" }} if options[:has_person].present?
filter << { term: { "creators.affiliation.affiliationIdentifierScheme" => "ROR" }} if options[:has_organization].present?
filter << { term: { "funding_references.funderIdentifierType" => "Crossref Funder ID" }} if options[:has_funder].present?
filter << { term: { consortium_id: options[:consortium_id] }} if options[:consortium_id].present?
# TODO align PID parsing
must << { term: { "client.re3data_id" => doi_from_url(options[:re3data_id]) }} if options[:re3data_id].present?
must << { term: { "client.opendoar_id" => options[:opendoar_id] }} if options[:opendoar_id].present?
must << { terms: { "client.certificate" => options[:certificate].split(",") }} if options[:certificate].present?
filter << { term: { "client.re3data_id" => doi_from_url(options[:re3data_id]) }} if options[:re3data_id].present?
filter << { term: { "client.opendoar_id" => options[:opendoar_id] }} if options[:opendoar_id].present?
filter << { terms: { "client.certificate" => options[:certificate].split(",") }} if options[:certificate].present?

must_not << { terms: { provider_id: ["crossref", "medra", "op"] }} if options[:exclude_registration_agencies]

# ES query can be optionally defined in different ways
Expand All @@ -798,7 +804,8 @@ def self.query(query, options={})
# The main bool query with filters
bool_query = {
must: must,
must_not: must_not
must_not: must_not,
filter: filter
}

# Function score is used to provide varying score to return different values
Expand Down
2 changes: 1 addition & 1 deletion app/models/event.rb
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def citation_id
end

# Returns an array of field-name strings for Event; presumably used as the
# `fields:` list of a query_string search -- the caller is not visible here.
def self.query_fields
# NOTE(review): rendered diff with +/- markers stripped. The literal below
# appears to be the pre-commit (removed) list. Read as plain Ruby it is
# evaluated and discarded, since only the last expression is returned.
["subj_id^10", "obj_id^10", "subj.name^5", "subj.author^5", "subj.periodical^5", "subj.publisher^5", "obj.name^5", "obj.author^5", "obj.periodical^5", "obj.publisher^5", "_all"]
# Effective return value: the shorter list (presumably the post-commit line),
# which drops the subj.*/obj.* entries and "_all" and adds "source_id" and
# "relation_type_id" without a "^N" suffix.
["subj_id^10", "obj_id^10", "source_id", "relation_type_id"]
end

def self.query_aggregations
Expand Down

0 comments on commit 7339e91

Please sign in to comment.