diff --git a/docs/sort_facet.md b/docs/sort_facet.md new file mode 100644 index 000000000..14ea5c6f4 --- /dev/null +++ b/docs/sort_facet.md @@ -0,0 +1,786 @@ +

Purpose of Docvalues

+ +

Background

+ +

What are docValues? In the index mapping, there is an option to enable or disable docValues for a specific field mapping. However, what does it actually mean to activate or deactivate docValues, and how does it impact the end user? This document aims to address these questions.

+
+	"default_mapping": {
+		"dynamic": true,
+		"enabled": true,
+		"properties": {
+			"loremIpsum": {
+				"enabled": true,
+				"dynamic": false,
+				"fields": [
+					{
+						"name": "loremIpsum",
+						"type": "text",
+						"store": false,
+						"index": true,
+						"include_term_vectors": false,
+						"include_in_all": false,
+						"docvalues": true
+					}
+				]
+			}
+		}
+	}
+
+

Enabling docValues will always result in an increase in the size of your Bleve index, leading to a corresponding increase in disk usage. But what advantages can you expect in return? This document also quantitatively assesses this trade-off with a test case.

+ +

In a more general sense, we recommend enabling docValues on a field mapping if you anticipate queries that involve sorting and/or facet operations on that field. It's important to note, though, that sorting and faceting will work irrespective of whether docValues are enabled or not. This may lead you to wonder if there's any real benefit to enabling docValues since you're allocating extra disk space without an apparent return. The real advantage, however, becomes evident in enhanced query response times and reduced memory consumption during active usage. By accepting a minor increase in the disk space used by your Full-Text Search (FTS) index, you can anticipate better performance in handling search requests that involve sorting and faceting.

+ +

Usage

+ +

The first use of docValues comes into play when sorting is involved. In the search request JSON, there is an optional field named "sort", whose value is a slice of JSON objects. Each JSON object must belong to one of the following types: SortDocID, SortScore, SortField, or SortGeoDistance. +

+

+

DocValues are relevant only when any of the JSON objects in the "sort" field are of type SortGeoDistance or SortField. This means that if you expect queries on a field F, where the queries either do not specify a value for the "sort" field or provide a JSON object of type SortDocID or SortScore, enabling docValues will not improve sorting operations, and as a result, query latency will remain unchanged. It's worth noting that the default sorting object, SortScore, does not require docValues to be enabled for any of the field mappings. Therefore, a search request without a sorting operation will not utilize docValues at all.

+
+ + + + + + + + + + + + + + + + + + + + + + +
No Sort ObjectsSortDocIDSortScoreSortFieldSortGeoDistance
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "lorem ipsum",
+    "field":"dolor"
+  },
+  "size": 10,
+  "from": 0
+}
+			
+
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "lorem ipsum",
+    "field":"sit_amet"
+  },
+  "sort":[
+    {
+     "by":"id",
+     "desc":true
+    }
+    ],
+  "size": 10,
+  "from": 0
+}
+			
+
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "lorem ipsum",
+    "field":"sit_amet"
+  },
+  "sort":[
+    {
+     "by":"score"
+    }
+    ],
+  "size": 10,
+  "from": 0
+}
+			
+
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "lorem ipsum",
+    "field":"sit_amet"
+  },
+  "sort":[
+    {
+     "by":"field",
+     "field":"dolor",
+     "type":"auto",
+     "mode":"min",
+     "missing":"last"
+    }
+    ],
+  "size": 10,
+  "from": 0
+}
+			
+
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "lorem ipsum",
+    "field": "dolor"
+  },
+  "sort": [
+    {
+      "by": "geo_distance",
+      "field": "sit_amet",
+      "location": [
+        123.223,
+        34.33
+      ],
+      "unit": "km"
+    }
+  ],
+  "size": 10,
+  "from": 0
+}
+			
+
No DocValues usedNo DocValues usedNo DocValues usedDocValues used for field "dolor". Field Mapping for "dolor" may enable docValues.DocValues used, for field "sit_amet". +Field Mapping for "sit_amet" may enable docValues.
+
+

Now, let's consider faceting. The search request object also includes another field called "facets," where you can specify a collection of facet requests, with each request being associated with a unique name. Each of these facet requests can fall into one of three types: Date Range Facet, Numeric Range Facet, or Term Facet. +

+Enabling docValues for the fields associated with such facet requests might provide benefits in this context.

+
+ + + + + + + + + + + + + + + + + +
No Facet RequestDate Range FacetNumeric Range FacetTerm Facet
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "lorem ipsum",
+    "field": "dolor"
+  },
+  "size": 10,
+  "from": 0
+}
+			
+
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "lorem ipsum",
+    "field": "sit_amet"
+  },
+  "facets": {
+    "facetA": {
+      "size": 1,
+      "field": "dolor",
+      "date_ranges": [
+        {
+          "name": "lorem",
+          "start": "20/August/2001",
+          "end": "22/August/2002",
+          "datetime_parser": "custDT"
+        }
+      ]
+    }
+  },
+  "size": 10,
+  "from": 0
+}
+			
+
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "lorem ipsum",
+    "field": "sit_amet"
+  },
+  "facets": {
+    "facetA": {
+      "size": 1,
+      "field": "dolor",
+      "numeric_ranges":[
+          { 
+            "name":"lorem",
+            "min":22,
+            "max":34
+          }
+        ]
+    }
+  },
+  "size": 10,
+  "from": 0
+}
+			
+
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "lorem ipsum",
+    "field": "sit_amet"
+  },
+  "facets": {
+    "facetA": {
+      "size": 1,
+      "field": "dolor"
+    }
+  },
+  "size": 10,
+  "from": 0
+}
+			
+
No DocValues usedDocValues used for field "dolor". Field Mapping for "dolor" may enable docValues.
+
+ +

In summary, when a search request is received by the Bleve index, it extracts all the fields from the sort objects and facet objects. To potentially benefit from docValues, you should consider enabling docValues for the fields mentioned in SortField and SortGeoDistance sort objects, as well as the fields associated with all the facet objects. By doing so, you can optimize sorting and faceting operations in your search queries.

+ +
+ + + + + + + + + + + + + +
Combo ACombo B
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "lorem ipsum",
+    "field": "sit_amet"
+  },
+  "facets": {
+    "facetA": {
+      "size": 1,
+      "field": "dolor",
+      "date_ranges": [
+        {
+          "name": "lorem",
+          "start": "20/August/2001",
+          "end": "22/August/2002",
+          "datetime_parser": "custDT"
+        }
+      ]
+    }
+  },
+  "sort":[
+    {
+     "by":"field",
+     "field":"lorem",
+     "type":"auto",
+     "mode":"min",
+     "missing":"last"
+    }
+    ],
+  "size": 10,
+  "from": 0
+}
+			
+
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "lorem ipsum",
+    "field": "sit_amet"
+  },
+  "facets": {
+    "facetA": {
+      "size": 1,
+      "field": "dolor",
+      "numeric_ranges":[
+          { 
+            "name":"lorem",
+            "min":22,
+            "max":34
+          }
+        ]
+    }
+  },
+  "sort": [
+    {
+      "by": "geo_distance",
+      "field": "ipsum",
+      "location": [
+        123.223,
+        34.33
+      ],
+      "unit": "km"
+    }
+  ],
+  "size": 10,
+  "from": 0
+}
+			
+
DocValues used for field "dolor" and "lorem". Field Mapping for "dolor" and "lorem" may enable docValues.DocValues used for field "dolor" and "ipsum". Field Mapping for "dolor" and "ipsum" may enable docValues.
+
+ +

Empirical Analysis

+ +

To evaluate our hypothesis, I set up a sample dataset on my personal computer and created two Bleve indexes: one with docValues enabled for three fields (dummyDate, dummyNumber, and dummyTerm), and another with docValues disabled for the same three fields. These field mappings were incorporated into the Default Mapping. It's important to mention that for both indexes, docValues for dynamic fields were enabled, as the default mapping is dynamic.

+ +

The values for dummyDate and dummyNumber were configured to increase monotonically, with dummyDate representing a date value and `dummyNumber` representing a numeric value. This setup was intentional to ensure that facet aggregation would consistently result in cache hits and misses, providing a useful testing scenario.

+ +
+ + + + + + + + + + + + + +
Index AIndex B
+
+   "default_mapping": {
+    "dynamic": true,
+    "enabled": true,
+    "properties": {
+     "dummyNumber": {
+      "enabled": true,
+      "dynamic": false,
+      "fields": [
+       {
+        "name": "dummyNumber",
+        "type": "text",
+        "store": false,
+        "index": true,
+        "include_term_vectors": false,
+        "include_in_all": false,
+        "docvalues": true
+       }
+      ]
+     },
+     "dummyTerm": {
+      "enabled": true,
+      "dynamic": false,
+      "fields": [
+       {
+        "name": "dummyTerm",
+        "type": "text",
+        "store": false,
+        "index": true,
+        "include_term_vectors": false,
+        "include_in_all": false,
+        "docvalues": true
+       }
+      ]
+     },
+     "dummyDate": {
+      "enabled": true,
+      "dynamic": false,
+      "fields": [
+       {
+        "name": "dummyDate",
+        "type": "text",
+        "store": false,
+        "index": true,
+        "include_term_vectors": false,
+        "include_in_all": false,
+        "docvalues": true
+       }
+      ]
+     }
+    }
+   }
+			
+
+
+   "default_mapping": {
+    "dynamic": true,
+    "enabled": true,
+    "properties": {
+     "dummyNumber": {
+      "enabled": true,
+      "dynamic": false,
+      "fields": [
+       {
+        "name": "dummyNumber",
+        "type": "text",
+        "store": false,
+        "index": true,
+        "include_term_vectors": false,
+        "include_in_all": false,
+        "docvalues": false
+       }
+      ]
+     },
+     "dummyTerm": {
+      "enabled": true,
+      "dynamic": false,
+      "fields": [
+       {
+        "name": "dummyTerm",
+        "type": "text",
+        "store": false,
+        "index": true,
+        "include_term_vectors": false,
+        "include_in_all": false,
+        "docvalues": false
+       }
+      ]
+     },
+     "dummyDate": {
+      "enabled": true,
+      "dynamic": false,
+      "fields": [
+       {
+        "name": "dummyDate",
+        "type": "text",
+        "store": false,
+        "index": true,
+        "include_term_vectors": false,
+        "include_in_all": false,
+        "docvalues": false
+       }
+      ]
+     }
+    }
+   }
+			
+
Docvalues enabled across all three field mappingsDocvalues disabled across all three field mappings
+
+ +Document Format used for the test scenario: + +
+ + + + + + + + + + + + +
Document 1Document 2... Document iDocument 5000
+
+{
+	"dummyTerm":"Term",
+	"dummyDate":"2000-01-01T00:00:00",
+	"dummyNumber":1
+}
+			
+
+
+{
+	"dummyTerm":"Term",
+	"dummyDate":"2000-01-01T01:00:00",
+	"dummyNumber":2
+}
+			
+
+
+{
+	"dummyTerm":"Term",
+	"dummyDate":"2000-01-01T01:00:00"+(i hours),
+	"dummyNumber":i
+}
+			
+
+
+{
+	"dummyTerm":"Term",
+	"dummyDate":"2000-01-01T01:00:00" + (5000 hours),
+	"dummyNumber":5000
+}
+			
+
+
+ +

I then ran the following set of search requests against both indexes, while increasing the number of indexed documents from 2000 to 4000.

+ +
+ + + + + + + + + + + + +
Request 1Request 2... Request iRequest 1000
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "term",
+    "field":"dummyTerm"
+  },
+  "facets":{
+    "myDate":{
+      "field":"dummyDate",
+      "size":100000,
+      "date_ranges":[
+        {
+          "start":"2000-01-01T00:00:00",
+          "end":"2000-01-01T01:00:00"
+        }
+      ]
+    },
+    "myNum":{
+      "field":"dummyNumber",
+      "size":100000,
+      "numeric_ranges":[
+        {
+          "min": 1000,
+          "max": 1001
+        }
+      ]
+    }
+  },
+  "size": 10,
+  "from": 0
+}
+			
+
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "term",
+    "field":"dummyTerm"
+  },
+  "facets":{
+    "myDate":{
+      "field":"dummyDate",
+      "size":100000,
+      "date_ranges":[
+        {
+          "start":"2000-01-01T01:00:00",
+          "end":"2000-01-01T02:00:00"
+        }
+      ]
+    },
+    "myNum":{
+      "field":"dummyNumber",
+      "size":100000,
+      "numeric_ranges":[
+        {
+          "min": 999,
+          "max": 1000
+        }
+      ]
+    }
+  },
+  "size": 10,
+  "from": 0
+}
+			
+
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "term",
+    "field":"dummyTerm"
+  },
+  "facets":{
+    "myDate":{
+      "field":"dummyDate",
+      "size":100000,
+      "date_ranges":[
+        {
+          "start":"2000-01-01T00:00:00" + i hour,
+          "end":"2000-01-01T00:00:00" + (i+1) hour
+        }
+      ]
+    },
+    "myNum":{
+      "field":"dummyNumber",
+      "size":100000,
+      "numeric_ranges":[
+        {
+          "min": 1000-i,
+          "max": 1000-i+1
+        }
+      ]
+    }
+  },
+  "size": 10,
+  "from": 0
+}
+			
+
+
+{
+  "explain": true,
+  "fields": [
+    "*"
+  ],
+  "highlight": {},
+  "query": {
+    "match": "term",
+    "field":"dummyTerm"
+  },
+  "facets":{
+    "myDate":{
+      "field":"dummyDate",
+      "size":100000,
+      "date_ranges":[
+        {
+          "start":"2000-01-01T01:00:00" + 1000 hour,
+          "end":"2000-01-01T02:00:00" + 1001 hour
+        }
+      ]
+    },
+    "myNum":{
+      "field":"dummyNumber",
+      "size":100000,
+      "numeric_ranges":[
+        {
+          "min": 0,
+          "max": 1
+        }
+      ]
+    }
+  },
+  "size": 10,
+  "from": 0
+}
+			
+
+
+ + +
+ + + + + + + + +
Bleve index size growth with increase in indexed documentsTotal query time for 1000 queries with increase in number of indexed documents
indexSizeVsNumDocs.pngqueryTimevsNumDocs.png
+
+ +
+ + + + + + + + + +
Average increase in index size (in bytes) by enabling DocValuesAverage reduction in time taken to perform 1000 queries (in milliseconds) by enabling DocValues
7762.4727.034
+Even at this small scale, with a small document size and a very limited number of indexed documents, we still observe a noticeable tradeoff. With just a slight increase in the index size (an average of 7KB) we obtain a 20ms reduction in the total execution time, on average, for only 1000 queries. + +

Technical Information

+ +

When a search request involves facet or sorting operations on a field F, these operations occur after the main search query is executed. For instance, if the main query yields a result of 200 documents, the sorting and faceting processes will be applied to these 200 documents. However, the main query result only provides a set of document IDs, not the actual document contents.

+ +

Here's where docValues become essential. If the field mapping for F has docValues enabled, the system can directly access the values for the field from the stored docValues section of the index file. This means that for each document ID returned in the search result, the field values are readily available.

+ +

However, if docValues are not enabled for field F, the system must take a different approach. It needs to "fetch the document" from the index file, read the value for field F, and cache this field-document pair in memory for further processing. The issue becomes apparent in the latter scenario. By not enabling docValues for field F, you essentially retrieve all the documents in the search result (in the worst case), which can be a substantial amount of data. Moreover, you have to cache this information in memory, leading to increased memory usage. As a result, query latency significantly suffers because you're essentially fetching and processing all documents, which can be both time-consuming and resource-intensive. Enabling docValues for the relevant fields is, therefore, a crucial optimization to enhance query performance and reduce memory overhead in such situations.

diff --git a/docs/sort_facet_supporting_docs/indexSizeVsNumDocs.png b/docs/sort_facet_supporting_docs/indexSizeVsNumDocs.png new file mode 100644 index 000000000..11211709d Binary files /dev/null and b/docs/sort_facet_supporting_docs/indexSizeVsNumDocs.png differ diff --git a/docs/sort_facet_supporting_docs/queryTimevsNumDocs.png b/docs/sort_facet_supporting_docs/queryTimevsNumDocs.png new file mode 100644 index 000000000..151b422aa Binary files /dev/null and b/docs/sort_facet_supporting_docs/queryTimevsNumDocs.png differ diff --git a/http/search.go b/http/search.go index 186d3d2c6..37a33f031 100644 --- a/http/search.go +++ b/http/search.go @@ -15,10 +15,12 @@ package http import ( + "context" "encoding/json" "fmt" "io" "net/http" + "time" "github.com/blevesearch/bleve/v2" "github.com/blevesearch/bleve/v2/search/query" @@ -80,8 +82,22 @@ func (h *SearchHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) { } } + // check for timeout and create context + var ctx context.Context + timeoutStr := req.FormValue("timeout") + if timeoutStr == "" { + ctx = context.Background() + } else { + timeout, err := time.ParseDuration(timeoutStr) + if err != nil { + showError(w, req, fmt.Sprintf("error parsing timeout value: %v", err), 400) + return + } + ctx, _ = context.WithTimeout(context.Background(), timeout) + } + // execute the query - searchResponse, err := index.Search(&searchRequest) + searchResponse, err := index.SearchInContext(ctx, &searchRequest) if err != nil { showError(w, req, fmt.Sprintf("error executing query: %v", err), 500) return diff --git a/index_impl.go b/index_impl.go index fe3a62e9e..5c9538822 100644 --- a/index_impl.go +++ b/index_impl.go @@ -496,7 +496,6 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr ctx = context.WithValue(ctx, search.GeoBufferPoolCallbackKey, search.GeoBufferPoolCallbackFunc(getBufferPool)) - // Using a disjunction query to get union of results from KNN query // and the original query 
searchQuery := disjunctQueryWithKNN(req) @@ -663,9 +662,9 @@ func LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest, var totalStoredFieldsBytes uint64 if len(req.Fields) > 0 || highlighter != nil { doc, err := r.Document(hit.ID) - totalStoredFieldsBytes = doc.StoredFieldsBytes() if err == nil && doc != nil { if len(req.Fields) > 0 { + totalStoredFieldsBytes = doc.StoredFieldsBytes() fieldsToLoad := deDuplicate(req.Fields) for _, f := range fieldsToLoad { doc.VisitFields(func(docF index.Field) {