Week 3 - query classification #1

Open — wants to merge 3 commits into base: `main`
34 changes: 32 additions & 2 deletions utilities/query.py
@@ -11,7 +11,7 @@
import pandas as pd
import fileinput
import logging

import fasttext

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@@ -190,7 +190,37 @@ def search(client, user_query, index="bbuy_products", sort="_score", sortDir="de
#### W3: classify the query
#### W3: create filters and boosts
# Note: you may also want to modify the `create_query` method above
# Classify the query with the trained fastText model and keep only the
# labels that clear a confidence threshold. (Loading the model once at
# module scope would avoid re-loading it on every search call.)
query_classifier = fasttext.load_model('/workspace/search_with_machine_learning_course/week3/query_classifier_mq_10k_lr_1_wng_2_e3.bin')

predicted, score = query_classifier.predict(user_query, 5)
print(list(predicted))
print(score)

threshold = 0.5
predictions = [(label.removeprefix('__label__'), prob) for label, prob in zip(predicted, score) if prob >= threshold]
cumulative_prob = sum(float(prob) for _, prob in predictions)
#print(f"Cumulative probability of {cumulative_prob}")

if len(predictions) > 0:
    print("Appending the following predicted categories to the query:")
    print(predictions)

    # Restrict results to the predicted category path(s).
    _filters = [{
        "terms": {
            "categoryPathIds": [label for label, _ in predictions]
        }
    }]
else:
    _filters = None

print(_filters)

query_obj = create_query(user_query, click_prior_query=None, filters=_filters, sort=sort, sortDir=sortDir, source=["name", "shortDescription"])
logging.info(query_obj)
response = client.search(query_obj, index=index)
if response and response['hits']['hits'] and len(response['hits']['hits']) > 0:
48 changes: 47 additions & 1 deletion week3/create_labeled_queries.py
@@ -4,6 +4,7 @@
import pandas as pd
import numpy as np
import csv
import re

# Useful if you want to perform stemming.
import nltk
@@ -16,12 +17,15 @@

parser = argparse.ArgumentParser(description='Process arguments.')
general = parser.add_argument_group("general")
general.add_argument("--min_queries", default=1, help="The minimum number of queries per category label (default is 1)")
general.add_argument("--min_queries", default=1000, help="The minimum number of queries per category label (default is 1)")
general.add_argument("--output", default=output_file_name, help="the file to output to")

args = parser.parse_args()
output_file_name = args.output

pd.set_option('display.max_rows', None)


if args.min_queries:
min_queries = int(args.min_queries)

@@ -50,8 +54,50 @@

# IMPLEMENT ME: Convert queries to lowercase, and optionally implement other normalization, like stemming.

# Normalize: lowercase, strip non-alphanumerics, collapse whitespace, stem each token.
queries_df['query'] = queries_df['query'].apply(lambda x: re.sub(r'[^a-zA-Z\d\s:]', ' ', str(x).lower()))
queries_df['query'] = queries_df['query'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
queries_df['query'] = queries_df['query'].apply(lambda x: ' '.join(stemmer.stem(token) for token in x.split()))

# Roll up categories to ancestors to satisfy the minimum number of queries per category.
print("rolling up categories")
while True:
    cat_count = queries_df.groupby(['category']).size().reset_index(name="total_queries")
    below_minimum = cat_count[cat_count["total_queries"] < min_queries]["category"].unique()
    if below_minimum.size == 0:
        break
    # Attach each category's parent and its query count, then replace
    # under-represented categories with their parents and re-count.
    parent_cat = queries_df.merge(parents_df, on="category", how="left")
    parent_cat["parent"] = parent_cat["parent"].fillna(root_category_id)  # the root has no parent row
    parent_df = parent_cat.merge(cat_count, on="category", how="left")
    parent_df.loc[parent_df["total_queries"] < min_queries, "category"] = parent_df["parent"]
    queries_df = parent_df.drop(["parent", "total_queries"], axis=1)





print(f"Categories rolled up; number of distinct categories: {queries_df['category'].nunique()}")
# Create labels in fastText format.
queries_df['label'] = '__label__' + queries_df['category']

45 changes: 45 additions & 0 deletions week3/results.md
@@ -0,0 +1,45 @@
### For query classification:

1. How many unique categories did you see in your rolled up training data when you set the minimum number of queries per category to 1000? To 10000?

_I saw 388 unique categories with the minimum number of queries per category set to 1000, and only 70 (!) with it set to 10K._
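As a sanity check, here is a minimal sketch for counting the distinct labels in the rolled-up output; the file path is an assumption, matching the course's usual output location, not something taken from this PR:

```python
# Each line of the labeled output is "__label__<category> <query>";
# the path below is assumed.
with open('/workspace/datasets/fasttext/labeled_queries.txt') as f:
    distinct_labels = {line.split(' ', 1)[0] for line in f}
print(len(distinct_labels))  # 388 with min_queries=1000, 70 with 10000
```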

2. What were the best values you achieved for R@1, R@3, and R@5? You should have tried at least a few different models, varying the minimum number of queries per category, as well as trying different fastText parameters or query normalization. Report at least 2 of your runs.

- With a minimum of 1K queries per category, I reached
  - R@1: …
  - R@3: …
  - R@5: …
- Changing only the minimum number of queries to 10K, with epochs and learning rate unchanged, I pushed this to
  - R@1: …
  - R@3: …
  - R@5: …
- Finally, pushing the fastText training parameters (wordNgrams = 2, epoch = 3, lr = 1), I achieved
  - R@1: …
  - R@3: …
  - R@5: …
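
For context, a minimal sketch of the kind of training and evaluation run behind that last set of numbers; the file paths and the train/test split are assumptions, not taken from this PR:

```python
import fasttext

# Assumed train/test files in fastText format ("__label__<cat> <query>").
model = fasttext.train_supervised(
    input="/workspace/datasets/fasttext/labeled_queries_train.txt",
    lr=1.0,        # learning rate at 1
    epoch=3,       # three epochs
    wordNgrams=2,  # word bigrams
)
for k in (1, 3, 5):
    n, precision, recall = model.test("/workspace/datasets/fasttext/labeled_queries_test.txt", k=k)
    print(f"R@{k}: {recall:.3f} over {n} test queries")
model.save_model("week3/query_classifier_mq_10k_lr_1_wng_2_e3.bin")
```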


### For integrating query classification with search:
1. Give 2 or 3 examples of queries where you saw a dramatic positive change in the results because of filtering. Make sure to include the classifier output for those queries.

- Using the best classifier above with a probability threshold of 0.5:
  - **xbox**: the result count went from 3563 all the way down to just 69! The change was clearly positive: the query was filtered to the "videogames and consoles" category, so the results were Xbox consoles and console bundles, with games and peripherals removed (see the filter sketch below). Classifier output, categories and scores:
    `['__label__abcat0701001', '__label__abcat0700000', '__label__abcat0715001', '__label__abcat0715002', '__label__cat02724']`
    `[0.51762199 0.14868429 0.09926192 0.02738438 0.02623567]`

  - **ps4**: the same effect as above, filtering down to the consoles category, from 1170 results to just 15. Classifier output 👇🏻
    `['__label__cat02015', '__label__cat02009', '__label__cat09000', '__label__cat02001', '__label__abcat0900000']`
    `[0.69825941 0.04239719 0.03412125 0.02841064 0.02170236]`
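
For concreteness, the filter that the `search` change in `utilities/query.py` builds for **xbox**: only `abcat0701001` clears the 0.5 threshold, so it is the only category passed to `create_query`:

```python
# Only labels with probability >= 0.5 survive the cut; for "xbox"
# that is abcat0701001 (score 0.5176), so the appended filter is:
_filters = [{
    "terms": {
        "categoryPathIds": ["abcat0701001"]
    }
}]
```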


2. Give 2 or 3 examples of queries where filtering hurt the results, either because the classifier was wrong or for some other reason:
- With the configuration above (probability threshold of 0.5), the queries that were hurt include:
  - **iphone**: went from 3241 results to just 776, but the results were not iPhone devices themselves, rather accessories for them.
    `['__label__abcat0811002', '__label__pcmcat209400050001', '__label__abcat0208011', '__label__pcmcat201900050009', '__label__abcat0208007']`
    `[0.58636183 0.23716259 0.02968958 0.02421849 0.01961429]`


  - **dress**: from 633 results to 69, but again the filter did not land on the correct category; the results were mostly movies in various formats.
    `['__label__cat02015', '__label__cat02009', '__label__cat09000', '__label__cat02001', '__label__abcat0900000']`
    `[0.6951161 0.04240536 0.03508785 0.0285547 0.02147051]`
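
A possible mitigation for queries like these, in the spirit of the `W3: create filters and boosts` TODO in `utilities/query.py`, is to boost the predicted categories instead of hard-filtering on them, so off-category results are demoted rather than removed. A minimal sketch, reusing the thresholded `predictions` list from the `search` change; the boost value is an assumption:

```python
# Hedged sketch: demote off-category results instead of removing them.
# "predictions" is the (label, probability) list built in search().
boost_clause = {
    "terms": {
        "categoryPathIds": [label for label, _ in predictions],
        "boost": 10.0  # assumed value; tune against relevance judgments
    }
}
# This clause would be appended to the query's "should" clauses instead
# of being passed to create_query as a hard filter.
```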