diff --git a/opensearch/ltr_toy_model.png b/opensearch/ltr_toy_model.png index 1a56ed06..14d54f4b 100644 Binary files a/opensearch/ltr_toy_model.png and b/opensearch/ltr_toy_model.png differ diff --git a/week1/conf/ltr_featureset.json b/week1/conf/ltr_featureset.json index 029265d4..807716c9 100644 --- a/week1/conf/ltr_featureset.json +++ b/week1/conf/ltr_featureset.json @@ -12,6 +12,286 @@ "name": "{{keywords}}" } } + }, + { + "name": "manufacturer_phrase_match", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match_phrase": { + "manufacturer": { + "query": "{{keywords}}", + "slop": 6 + } + } + } + }, + { + "name": "dept_phrase_match", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match_phrase": { + "department": { + "query": "{{keywords}}", + "slop": 6 + } + } + } + }, + { + "name": "category_path_phrase_match", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match_phrase": { + "categoryPath": { + "query": "{{keywords}}", + "slop": 6 + } + } + } + }, + { + "name": "class_phrase_match", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match_phrase": { + "class": { + "query": "{{keywords}}", + "slop": 6 + } + } + } + }, + { + "name": "name_phrase_match", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match_phrase": { + "name": { + "query": "{{keywords}}", + "slop": 6 + } + } + } + }, + { + "name": "customer_review_avg", + "template_language": "mustache", + "template": { + "function_score": { + "functions": [ + { + "field_value_factor": { + "field": "customerReviewAverage", + "missing": 4.0 + } + } + ], + "query": { + "match_all": {} + } + } + } + }, + { + "name": "customer_review_count", + "template_language": "mustache", + "template": { + "function_score": { + "functions": [ + { + "field_value_factor": { + "field": "customerReviewCount", + "missing": 9 + } + } + ], + "query": { + "match_all": {} + } + } + } + }, + { + "name": "artist_name_phrase_match", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match_phrase": { + "artistName": { + "query": "{{keywords}}", + "slop": 6 + } + } + } + }, + { + "name": "short_description_phrase_match", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match_phrase": { + "shortDescription": { + "query": "{{keywords}}", + "slop": 6 + } + } + } + }, + { + "name": "long_description_phrase_match", + "params": [ + "keywords" + ], + "template_language": "mustache", + "template": { + "match_phrase": { + "longDescription": { + "query": "{{keywords}}", + "slop": 6 + } + } + } + }, + { + "name": "salesRankShortTerm", + "template_language": "mustache", + "template": { + "function_score": { + "functions": [ + { + "filter": { + "exists": { + "field": "salesRankShortTerm" + } + }, + "gauss": { + "salesRankShortTerm": { + "origin": "1.0", + "offset": "100", + "scale": "10000", + "decay": "0.1" + } + } + }, + { + "filter": { + "bool": { + "must_not": { + "exists": { + "field": "salesRankShortTerm" + } + } + } + }, + "script_score": { + "script": "if (doc['salesRankShortTerm'] == null || doc['salesRankShortTerm'].size() == 0 ){return 0.0}" + } + } + ], + "query": { + "match_all": {} + } + } + } + }, + { + "name": "in_store_availability", + "template_language": "mustache", + "template": { + "function_score": { + "query": { "match_all": {} }, + "functions": [ + { + "filter": { "match": { "inStoreAvailability": "true" } }, + "weight": 1 + }, + { + "weight": 0 + } + ], + "score_mode": "max", + "min_score": 0 + } + } + }, + { + "name": "online_availability", + "template_language": "mustache", + "template": { + "function_score": { + "query": { "match_all": {} }, + "functions": [ + { + "filter": { "match": { "onlineAvailability": "true" } }, + "weight": 1 + }, + { + "weight": 0 + } + ], + "score_mode": "max", + "min_score": 0 + } + } + }, + { + "name": "on_sale", + "template_language": "mustache", + "template": { + "function_score": { + "query": { "match_all": {} }, + "functions": [ + { + "filter": { "match": { "onSale": "true" } }, + "weight": 1 + }, + { + "weight": 0 + } + ], + "score_mode": "max", + "min_score": 0 + } + } + }, + { + "name": "digital", + "template_language": "mustache", + "template": { + "function_score": { + "query": { "match_all": {} }, + "functions": [ + { + "filter": { "match": { "digital": "true" } }, + "weight": 1 + }, + { + "weight": 0 + } + ], + "score_mode": "max", + "min_score": 0 + } + } } ] } diff --git a/week1/test/week1test.py b/week1/test/week1test.py new file mode 100644 index 00000000..86203854 --- /dev/null +++ b/week1/test/week1test.py @@ -0,0 +1,37 @@ + +hits=[ + { + '_index': 'searchml_ltr', + '_id': 'doc_e', + '_score': 0.0, + '_source': { + 'id': 'doc_e', + 'title': 'Pigs in a Blanket and Other Recipes', + 'price': '27.50', + 'in_stock': True, + 'body': "Pigs in a blanket aren't as cute as you would think given it's a food and not actual pigs wrapped in blankets.", + 'category': 'instructional' + }, + 'fields': { + '_ltrlog': [ + { + 'log_entry': [ + {'name': 'title_query', 'value': 1.1272218}, + {'name': 'body_query', 'value': 2.2908108}, + {'name': 'price_func', 'value': 27.5} + ] + + } + ] + }, + 'matched_queries': ['logged_featureset'] + } + ] + +log_entries = hits[0]['fields']['_ltrlog'][0]['log_entry'] + +print(log_entries) + +for entry in log_entries: + if entry['name'] == "title_query" : + print(entry['value']) diff --git a/week1/utilities/build_ltr.py b/week1/utilities/build_ltr.py index 95ae76e0..13e02ca9 100644 --- a/week1/utilities/build_ltr.py +++ b/week1/utilities/build_ltr.py @@ -327,7 +327,6 @@ ############# if args.xgb: # Defaults - bst, xgb_params = xgbu.train(args.xgb, args.xgb_rounds, args.xgb_conf) print("Dumping out model using feature map: %s" % args.xgb_feat_map) model = bst.get_dump(fmap=("%s/%s" % (output_dir, args.xgb_feat_map)), dump_format='json') diff --git a/week1/utilities/student_ltr.py b/week1/utilities/student_ltr.py index 2dda769d..7d5ecdfa 100644 --- a/week1/utilities/student_ltr.py +++ b/week1/utilities/student_ltr.py @@ -13,7 +13,8 @@ :param dictionary xgb_params The XGBoost configuration parameters, such as the objective function, e.g. {'objective': 'reg:logistic'} ''' def train(xgb_train_data, num_rounds=5, xgb_params=None ): - print("IMPLEMENT ME: xgb train") + dtrain = xgb.DMatrix(f'{xgb_train_data}?format=libsvm') + return xgb.train(xgb_params, dtrain, num_rounds) ##### Step 3.b: ''' @@ -35,7 +36,6 @@ def train(xgb_train_data, num_rounds=5, xgb_params=None ): :param string terms_field: The name of the field to filter our doc_ids on ''' def create_feature_log_query(query, doc_ids, click_prior_query, featureset_name, ltr_store_name, size=200, terms_field="_id"): - print("IMPLEMENT ME: create_feature_log_query with proper LTR syntax") return { 'size': size, 'query': { @@ -45,9 +45,27 @@ def create_feature_log_query(query, doc_ids, click_prior_query, featureset_name, "terms": { terms_field: doc_ids } - } + }, + { # use the LTR query bring in the LTR feature set + "sltr": { + "_name": "logged_featureset", + "featureset": featureset_name, + "store": ltr_store_name, + "params": { + "keywords": query + } + } + } ] } + }, + "ext": { + "ltr_log": { + "log_specs": { + "name": "log_entry", + "named_query": "logged_featureset" + } + } } } @@ -65,13 +83,29 @@ def create_feature_log_query(query, doc_ids, click_prior_query, featureset_name, :param str ltr_store_name: The name of the LTR store we are using to extract features from :param int rescore_size: The number of results to rescore :param float main_query_weight: A float indicating how much weight to give results that match in the original query -:param float rewcore_query_weight: A float indicating how much weight to give results that match in the rescored query +:param float rescore_query_weight: A float indicating how much weight to give results that match in the rescored query ''' def create_rescore_ltr_query(user_query: str, query_obj, click_prior_query: str, ltr_model_name: str, ltr_store_name: str, rescore_size=500, main_query_weight=1, rescore_query_weight=2): - print("IMPLEMENT ME: create_rescore_ltr_query") - # query_obj["rescore"] = { ... } + query_obj["rescore"] = { + "window_size": rescore_size, + "query": { + "rescore_query": { + "sltr": { + "params": { + "keywords": user_query + }, + "model": ltr_model_name, + "store": ltr_store_name, + "active_features": ["name_match","name_phrase_match","customer_review_avg","customer_review_count"] + } + }, + "score_mode": "total", + "query_weight": main_query_weight, + "rescore_query_weight": rescore_query_weight + } + } ##### Step Extract LTR Logged Features: @@ -85,17 +119,27 @@ def create_rescore_ltr_query(user_query: str, query_obj, click_prior_query: str, def extract_logged_features(hits, query_id): import numpy as np import pandas as pd - print("IMPLEMENT ME: __log_ltr_query_features: Extract log features out of the LTR:EXT response and place in a data frame") + # print("IMPLEMENT ME: __log_ltr_query_features: Extract log features out of the LTR:EXT response and place in a data frame") feature_results = {} feature_results["doc_id"] = [] # capture the doc id so we can join later feature_results["query_id"] = [] # ^^^ feature_results["sku"] = [] feature_results["name_match"] = [] + feature_results["name_phrase_match"] = [] + feature_results["customer_review_avg"] = [] + feature_results["customer_review_count"] = [] rng = np.random.default_rng(12345) for (idx, hit) in enumerate(hits): feature_results["doc_id"].append(int(hit['_id'])) # capture the doc id so we can join later feature_results["query_id"].append(query_id) # super redundant, but it will make it easier to join later feature_results["sku"].append(int(hit['_id'])) - feature_results["name_match"].append(rng.random()) + log_entries = hit['fields']['_ltrlog'][0]['log_entry'] + for entry in log_entries: + feature_name = entry.get('name', '') + feature_value = entry.get('value', 0) + if not feature_results.get(feature_name, None): + feature_results[feature_name] = [] + feature_results[feature_name].append(feature_value) frame = pd.DataFrame(feature_results) - return frame.astype({'doc_id': 'int64', 'query_id': 'int64', 'sku': 'int64'}) \ No newline at end of file + return frame.astype({'doc_id': 'int64', 'query_id': 'int64', 'sku': 'int64'}) +