diff --git a/utilities/query.py b/utilities/query.py
index da9228fe..b5ce2885 100644
--- a/utilities/query.py
+++ b/utilities/query.py
@@ -11,7 +11,7 @@
 import pandas as pd
 import fileinput
 import logging
-
+import fasttext
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -190,7 +190,37 @@ def search(client, user_query, index="bbuy_products", sort="_score", sortDir="desc"):
     #### W3: classify the query
     #### W3: create filters and boosts
     # Note: you may also want to modify the `create_query` method above
-    query_obj = create_query(user_query, click_prior_query=None, filters=None, sort=sort, sortDir=sortDir, source=["name", "shortDescription"])
+    query_classifier = fasttext.load_model('/workspace/search_with_machine_learning_course/week3/query_classifier_mq_10k_lr_1_wng_2_e3.bin')
+
+    predicted, scores = query_classifier.predict(user_query, 5)
+    print(list(predicted))
+    print(scores)
+
+    # Keep only the predicted categories whose probability clears the threshold.
+    threshold = 0.5
+    predictions = [(label.removeprefix('__label__'), score)
+                   for label, score in zip(predicted, scores) if score >= threshold]
+    c_prob = sum(score for _, score in predictions)
+    # print(f"Cumulative probability of {c_prob}")
+
+    if len(predictions) > 0:
+        print("Appending the following predicted categories to the query:")
+        print(predictions)
+        _filters = [{
+            "terms": {
+                "categoryPathIds": [label for label, _ in predictions]
+            }
+        }]
+    else:
+        _filters = None
+
+    print(_filters)
+
+    query_obj = create_query(user_query, click_prior_query=None, filters=_filters, sort=sort, sortDir=sortDir, source=["name", "shortDescription"])
     logging.info(query_obj)
     response = client.search(query_obj, index=index)
     if response and response['hits']['hits'] and len(response['hits']['hits']) > 0:
diff --git a/week3/create_labeled_queries.py b/week3/create_labeled_queries.py
index 802e3044..370339c9 100644
--- a/week3/create_labeled_queries.py
+++ b/week3/create_labeled_queries.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import numpy as np
 import csv
+import re
 
 # Useful if you want to perform stemming.
 import nltk
@@ -16,12 +17,15 @@
 parser = argparse.ArgumentParser(description='Process arguments.')
 general = parser.add_argument_group("general")
-general.add_argument("--min_queries", default=1, help="The minimum number of queries per category label (default is 1)")
+general.add_argument("--min_queries", default=1000, help="The minimum number of queries per category label (default is 1000)")
 general.add_argument("--output", default=output_file_name, help="the file to output to")
 args = parser.parse_args()
 output_file_name = args.output
 
+pd.set_option('display.max_rows', None)
+
 if args.min_queries:
     min_queries = int(args.min_queries)
@@ -50,8 +54,50 @@
 # IMPLEMENT ME: Convert queries to lowercase, and optionally implement other normalization, like stemming.
+# Lowercase, strip everything except letters, digits, whitespace and colons,
+# collapse runs of whitespace, then stem each token individually.
+queries_df['query'] = queries_df['query'].apply(lambda x: re.sub(r'[^a-zA-Z\d\s:]', ' ', str(x).lower()))
+queries_df['query'] = queries_df['query'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
+queries_df['query'] = queries_df['query'].apply(lambda x: ' '.join(stemmer.stem(token) for token in x.split()))
+
+cat_counts_df = queries_df.groupby('category').count()
+cat_counts_df.rename(columns={'query': 'query_count'}, inplace=True)
+
 # IMPLEMENT ME: Roll up categories to ancestors to satisfy the minimum number of queries per category.
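+# Rollup approach (descriptive note): repeatedly re-count the queries per
+# category and, wherever a category is still below min_queries, replace it
+# with its parent from parents_df; a category with no parent rolls up to the
+# root. Each pass moves under-sized categories one level up the taxonomy, so
+# the loop terminates once every remaining category meets the minimum.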
+print("Rolling up categories")
+while True:
+    cat_counts = queries_df.groupby(['category']).size().reset_index(name="total_queries")
+    below_minimum = cat_counts[cat_counts["total_queries"] < min_queries]["category"].unique()
+    if below_minimum.size == 0:
+        break
+    merged_df = queries_df.merge(parents_df, on="category", how="left")
+    # The root category has no parent; let it roll up to itself.
+    merged_df["parent"] = merged_df["parent"].fillna(root_category_id)
+    merged_df = merged_df.merge(cat_counts, on="category", how="left")
+    below = merged_df["total_queries"] < min_queries
+    merged_df.loc[below, "category"] = merged_df.loc[below, "parent"]
+    queries_df = merged_df.drop(["parent", "total_queries"], axis=1)
+
+print('Categories rolled up; number of rolled-up categories:')
+print(queries_df['category'].unique().size)
 
 # Create labels in fastText format.
 queries_df['label'] = '__label__' + queries_df['category']
diff --git a/week3/results.md b/week3/results.md
new file mode 100644
index 00000000..d20a1393
--- /dev/null
+++ b/week3/results.md
@@ -0,0 +1,45 @@
+### For query classification:
+
+1. How many unique categories did you see in your rolled up training data when you set the minimum number of queries per category to 1000? To 10000?
+
+_I saw 388 unique categories with the minimum number of queries per category set to 1000, and only 70 (!) with the minimum set to 10,000._
+
+2. What were the best values you achieved for R@1, R@3, and R@5? You should have tried at least a few different models, varying the minimum number of queries per category, as well as trying different fastText parameters or query normalization. Report at least 2 of your runs.
+
+- With a minimum of 1,000 queries per category, I reached
+  - R@1: 0.3
+  - R@3: 0.399
+  - R@5: 0.4
+- Only raising the minimum number of queries to 10,000, without changing epochs or the learning rate, pushed this to
+  - R@1: 0.399
+  - R@3: 0.518
+  - R@5: 0.58
+- Finally, tuning the fastText training parameters as well (wordNgrams of 2, 3 epochs, and a learning rate of 1) achieved
+  - R@1: 0.419
+  - R@3: 0.607
+  - R@5: 0.674
+
+### For integrating query classification with search:
+
+1. Give 2 or 3 examples of queries where you saw a dramatic positive change in the results because of filtering. Make sure to include the classifier output for those queries.
+
+Using the best classifier above with a probability threshold of 0.5:
+
+- **xbox**: the result count dropped from 3563 to just 69! The change was very positive: the query was filtered to the "videogames and consoles" category, so the results were just Xbox consoles and console bundles, with games and peripherals excluded. Classifier output (categories and scores):
+  `['__label__abcat0701001', '__label__abcat0700000', '__label__abcat0715001', '__label__abcat0715002', '__label__cat02724']`
+  `[0.51762199 0.14868429 0.09926192 0.02738438 0.02623567]`
+- **ps4**: the same effect as above, filtering down to the consoles category, from 1170 results to just 15. Classifier output (categories and scores):
+  `['__label__cat02015', '__label__cat02009', '__label__cat09000', '__label__cat02001', '__label__abcat0900000']`
+  `[0.69825941 0.04239719 0.03412125 0.02841064 0.02170236]`
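+To sanity-check these outputs, here is a minimal sketch of how the top-5 predictions can be reproduced (assuming the same model file that `utilities/query.py` loads and the 0.5 threshold used above; the local path is an assumption):
+
+```python
+import fasttext
+
+# Assumed local copy of the classifier loaded in utilities/query.py.
+model = fasttext.load_model('query_classifier_mq_10k_lr_1_wng_2_e3.bin')
+
+for query in ['xbox', 'ps4', 'iphone', 'dress']:
+    labels, scores = model.predict(query, 5)  # top-5 labels and their probabilities
+    kept = [(label.removeprefix('__label__'), score)  # same filtering rule as query.py
+            for label, score in zip(labels, scores) if score >= 0.5]
+    print(query, kept if kept else '-> no filter (all predictions below threshold)')
+```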
+2. Give 2 or 3 examples of queries where filtering hurt the results, either because the classifier was wrong or for some other reason:
+
+With the same configuration (probability threshold of 0.5), the harmed queries included:
+
+- **iphone**: went from 3241 results to just 776, but the remaining results were not actual iPhones but accessories for them. Classifier output (categories and scores):
+  `['__label__abcat0811002', '__label__pcmcat209400050001', '__label__abcat0208011', '__label__pcmcat201900050009', '__label__abcat0208007']`
+  `[0.58636183 0.23716259 0.02968958 0.02421849 0.01961429]`
+- **dress**: went from 633 results to 69, but again the filter did not land on the correct category: the results were mostly movies in various formats. Classifier output (categories and scores):
+  `['__label__cat02015', '__label__cat02009', '__label__cat09000', '__label__cat02001', '__label__abcat0900000']`
+  `[0.6951161 0.04240536 0.03508785 0.0285547 0.02147051]`
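+The R@k figures in the first section can be measured with fastText's `test` call; a minimal sketch, assuming hypothetical train/test file names in the `__label__` format that `create_labeled_queries.py` produces:
+
+```python
+import fasttext
+
+# Train with the best-performing parameters reported above (file names are assumptions).
+model = fasttext.train_supervised(input='queries_train.txt', lr=1.0, epoch=3, wordNgrams=2)
+
+# model.test returns (number of examples, precision@k, recall@k).
+for k in (1, 3, 5):
+    n, p_at_k, r_at_k = model.test('queries_test.txt', k=k)
+    print(f'N={n}  P@{k}={p_at_k:.3f}  R@{k}={r_at_k:.3f}')
+```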