import argparse
import json
import os
import re
import time
import lotus
import pandas as pd
from lotus.models import OpenAIModel
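# NOTE: the pipelines below rely on a global `lm` (a chat-style language model that is
# called with a list of message dicts and returns a list of completions) and on lotus's
# semantic DataFrame operators (sem_filter, sem_map, sem_topk, sem_agg). Both are
# configured elsewhere in this file, outside this excerpt. A minimal sketch of the
# assumed setup, following the usual lotus pattern (the model choice here is hypothetical):
#
#   lm = OpenAIModel()
#   lotus.settings.configure(lm=lm)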
############################################################################################################
###################################### Match based queries #################################################
############################################################################################################
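# Pattern: ordinary pandas filtering and joins, plus a single semantic step per query.
# sem_filter keeps rows that satisfy a natural-language predicate over the named column,
# and sem_map adds an LLM-generated column (stored as `_map` unless a suffix is given).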
def pipeline_0():
query = "Among the schools with the average score in Math over 560 in the SAT test, how many schools are in the bay area?"
answer = 71
scores_df = pd.read_csv("../pandas_dfs/california_schools/satscores.csv")
scores_df = scores_df[scores_df["AvgScrMath"] > 560]
scores_df = scores_df.sem_filter("{cname} is a county in the Bay Area")
prediction = len(scores_df)
return prediction, answer
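# Example usage (assuming lotus has been configured as sketched above):
#   prediction, answer = pipeline_0()
#   print(prediction, answer)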
def pipeline_1():
query = (
"What is the telephone number for the school with the lowest average score in reading in Southern California?"
)
answer = "(562) 944-0033"
scores_df = pd.read_csv("../pandas_dfs/california_schools/satscores.csv")
schools_df = pd.read_csv("../pandas_dfs/california_schools/schools.csv")
scores_df = scores_df.sem_filter("{cname} is a county name in Southern California")
scores_df = scores_df.loc[[scores_df["AvgScrRead"].idxmin()]]
merged_df = pd.merge(scores_df, schools_df, left_on="cds", right_on="CDSCode")
prediction = merged_df.Phone.values[0]
return prediction, answer
def pipeline_3():
query = "How many test takers are there at the school/s in a county with population over 2 million?"
answer = 244742
scores_df = pd.read_csv("../pandas_dfs/california_schools/satscores.csv")
schools_df = pd.read_csv("../pandas_dfs/california_schools/schools.csv")
unique_counties = pd.DataFrame(schools_df["County"].unique(), columns=["County"])
unique_counties = unique_counties.sem_map(
"What is the population of {County} in California? Answer with only the number without commas. Respond with your best guess."
)
counties_over_2m = set()
for _, row in unique_counties.iterrows():
try:
if int(re.findall(r"\d+", row._map)[-1]) > 2000000:
counties_over_2m.add(row.County)
        except (IndexError, ValueError):
            # Skip counties whose population response contained no parsable number.
            pass
schools_df = schools_df[schools_df["County"].isin(counties_over_2m)]
merged_df = pd.merge(scores_df, schools_df, left_on="cds", right_on="CDSCode")
prediction = int(merged_df["NumTstTakr"].sum())
return prediction, answer
def pipeline_4():
query = "What is the grade span offered in the school with the highest longitude in cities in that are part of the 'Silicon Valley' region?"
answer = "K-5"
schools_df = pd.read_csv("../pandas_dfs/california_schools/schools.csv")
unique_cities = pd.DataFrame(schools_df["City"].unique(), columns=["City"])
unique_cities = unique_cities.sem_filter("{City} is a city in the Silicon Valley region")
schools_df = schools_df[schools_df["City"].isin(unique_cities["City"])]
schools_df = schools_df.sort_values(by=["Longitude"], key=abs, ascending=False).head(1)
prediction = schools_df["GSoffered"].tolist()[0]
return prediction, answer
def pipeline_5():
query = "What are the two most common first names among the female school administrators?"
answer = ["Jennifer", "Lisa"]
schools_df = pd.read_csv("../pandas_dfs/california_schools/schools.csv")
schools_df = (
schools_df.groupby("AdmFName1").size().reset_index(name="count").sort_values("count", ascending=False).head(20)
)
schools_df = schools_df.sem_filter("{AdmFName1} is a female first name")
prediction = schools_df["AdmFName1"].tolist()[:2]
return prediction, answer
def pipeline_6():
query = "Among the posts owned by csgillespie, how many of them are root posts and mention academic papers?"
answer = 4
users_df = pd.read_csv("../pandas_dfs/codebase_community/users.csv")
posts_df = pd.read_csv("../pandas_dfs/codebase_community/posts.csv")
users_df = users_df[users_df["DisplayName"] == "csgillespie"]
posts_df = posts_df[posts_df["ParentId"].isna()]
merged_df = pd.merge(users_df, posts_df, left_on="Id", right_on="OwnerUserId")
merged_df = merged_df.sem_filter("{Body} mentions academic papers")
prediction = len(merged_df)
return prediction, answer
def pipeline_8():
query = "How many of the comments with a score of 17 are about statistics?"
answer = 4
comments_df = pd.read_csv("../pandas_dfs/codebase_community/comments.csv")
comments_df = comments_df[comments_df["Score"] == 17]
comments_df = comments_df.sem_filter("{Text} is about statistics")
prediction = len(comments_df)
return prediction, answer
def pipeline_10():
query = "Of the posts with views above 80000, how many discuss the R programming language?"
answer = 3
posts_df = pd.read_csv("../pandas_dfs/codebase_community/posts.csv")
posts_df = posts_df[posts_df["ViewCount"] > 80000]
posts_df = posts_df.sem_filter("{Body} discusses the R programming language")
prediction = len(posts_df)
return prediction, answer
def pipeline_11():
query = "Please give the names of the races held on the circuits in the middle east."
answer = [
"Bahrain Grand Prix",
"Turkish Grand Prix",
"Abu Dhabi Grand Prix",
"Azerbaijan Grand Prix",
"European Grand Prix",
]
circuits_df = pd.read_csv("../pandas_dfs/formula_1/circuits.csv")
races_df = pd.read_csv("../pandas_dfs/formula_1/races.csv")
circuits_df = circuits_df.sem_filter("{country} is in the Middle East")
merged_df = pd.merge(circuits_df, races_df, on="circuitId", suffixes=["_circuit", "_race"]).drop_duplicates(
subset="name_race"
)
prediction = merged_df["name_race"].tolist()
return prediction, answer
def pipeline_13():
query = "How many Asian drivers competed in the 2008 Australian Grand Prix?"
answer = 2
drivers_df = pd.read_csv("../pandas_dfs/formula_1/drivers.csv")
races_df = pd.read_csv("../pandas_dfs/formula_1/races.csv")
results_df = pd.read_csv("../pandas_dfs/formula_1/results.csv")
drivers_df = drivers_df.sem_filter("{nationality} is Asian")
races_df = races_df[(races_df["name"] == "Australian Grand Prix") & (races_df["year"] == 2008)]
merged_df = pd.merge(pd.merge(races_df, results_df, on="raceId"), drivers_df, on="driverId")
prediction = len(merged_df)
return prediction, answer
def pipeline_16():
query = "What is the preferred foot when attacking of the player with the most Ballon d'Or awards of all time?"
answer = "left"
players_df = pd.read_csv("../pandas_dfs/european_football_2/Player.csv")
attributes_df = pd.read_csv("../pandas_dfs/european_football_2/Player_Attributes.csv")
key_player = lm(
[
{
"role": "user",
"content": "What is the first and last name of the player who has won the most Ballon d'Or awards of all time? Respond with only the name and no other words.",
}
]
)[0]
players_df = players_df[players_df["player_name"] == key_player]
merged_df = pd.merge(players_df, attributes_df, on="player_api_id")
merged_df = merged_df[["player_name", "preferred_foot"]]
prediction = merged_df["preferred_foot"].values[0]
return prediction, answer
def pipeline_18():
query = "List the football player with a birthyear of 1970 who is an Aquarius"
answer = "Hans Vonk"
players_df = pd.read_csv("../pandas_dfs/european_football_2/Player.csv")
players_df = players_df[players_df["birthday"].str.startswith("1970")]
players_df = players_df.sem_filter("Someone born on {birthday} would be an Aquarius")
prediction = players_df["player_name"].values[0]
return prediction, answer
def pipeline_19():
query = "Please list the league from the country which is landlocked."
answer = "Switzerland Super League"
leagues_df = pd.read_csv("../pandas_dfs/european_football_2/League.csv")
countries_df = pd.read_csv("../pandas_dfs/european_football_2/Country.csv")
countries_df = countries_df.sem_filter("{name} is landlocked")
merged_df = pd.merge(
leagues_df, countries_df, left_on="country_id", right_on="id", suffixes=["_league", "_country"]
)
prediction = merged_df["name_league"].values[0]
return prediction, answer
def pipeline_20():
query = "How many matches in the 2008/2009 season were held in countries where French is an official language?"
answer = 866
matches_df = pd.read_csv("../pandas_dfs/european_football_2/Match.csv")
countries_df = pd.read_csv("../pandas_dfs/european_football_2/Country.csv")
matches_df = matches_df[matches_df["season"] == "2008/2009"]
countries_df = countries_df.sem_filter("{name} has French as an official language")
merged_df = pd.merge(matches_df, countries_df, left_on="country_id", right_on="id")
prediction = len(merged_df)
return prediction, answer
def pipeline_21():
query = "Of the top three away teams that scored the most goals, which one has the most fans?"
answer = "FC Barcelona"
teams_df = pd.read_csv("../pandas_dfs/european_football_2/Team.csv")
matches_df = pd.read_csv("../pandas_dfs/european_football_2/Match.csv")
merged_df = pd.merge(matches_df, teams_df, left_on="away_team_api_id", right_on="team_api_id")
merged_df = (
merged_df.sort_values("away_team_goal", ascending=False).drop_duplicates(subset="team_long_name").head(3)
)
prediction = merged_df.sem_topk("What {team_long_name} has the most fans?", 1).team_long_name.values[0]
return prediction, answer
def pipeline_24():
query = "Which year recorded the most gas use paid in the higher value currency?"
answer = 2013
customers_df = pd.read_csv("../pandas_dfs/debit_card_specializing/customers.csv")
yearmonth_df = pd.read_csv("../pandas_dfs/debit_card_specializing/yearmonth.csv")
unique_currencies = customers_df["Currency"].unique()
most_value = (
pd.DataFrame(unique_currencies, columns=["Currency"])
.sem_topk("What {Currency} is the highest value currency?", 1)
.Currency.values[0]
)
customers_df = customers_df[customers_df["Currency"] == most_value]
yearmonth_df["year"] = yearmonth_df["Date"] // 100
merged_df = pd.merge(customers_df, yearmonth_df, on="CustomerID")
merged_df = merged_df.groupby("year")["Consumption"].sum().reset_index()
merged_df = merged_df.sort_values("Consumption", ascending=False)
prediction = int(merged_df["year"].values[0])
return prediction, answer
def pipeline_108():
query = "Among the posts that were voted by user 1465, determine if the post is relevant to machine learning. Respond with YES if it is and NO if it is not."
answer = ["YES", "YES", "YES"]
posts_df = pd.read_csv("../pandas_dfs/codebase_community/posts.csv")
votes_df = pd.read_csv("../pandas_dfs/codebase_community/votes.csv")
votes_df = votes_df[votes_df["UserId"] == 1465]
merged_df = pd.merge(posts_df, votes_df, left_on="Id", right_on="PostId")
merged_df = merged_df.sem_map(
"{Body} is relevant to machine learning. Answer with YES if it is and NO if it is not."
)
prediction = merged_df._map.tolist()
return prediction, answer
def pipeline_109():
query = "Extract the statistical term from the post titles which were edited by Vebjorn Ljosa."
answer = ["beta-binomial distribution", "AdaBoost", "SVM", "Kolmogorov-Smirnov statistic"]
posts_df = pd.read_csv("../pandas_dfs/codebase_community/posts.csv")
users_df = pd.read_csv("../pandas_dfs/codebase_community/users.csv")
merged_df = pd.merge(posts_df, users_df, left_on="OwnerUserId", right_on="Id")
merged_df = merged_df[merged_df["DisplayName"] == "Vebjorn Ljosa"]
merged_df = merged_df.sem_map("Extract the statistical term from {Title}. Respond with only the statistical term.")
prediction = merged_df._map.tolist()
return prediction, answer
def pipeline_110():
query = "List the Comment Ids of the positive comments made by the top 5 newest users on the post with the title 'Analysing wind data with R'"
answer = [11449]
comments_df = pd.read_csv("../pandas_dfs/codebase_community/comments.csv")
posts_df = pd.read_csv("../pandas_dfs/codebase_community/posts.csv")
users_df = pd.read_csv("../pandas_dfs/codebase_community/users.csv")
posts_df = posts_df[posts_df["Title"] == "Analysing wind data with R"]
merged_df = pd.merge(comments_df, posts_df, left_on="PostId", right_on="Id", suffixes=["_comment", "_post"])
merged_df = pd.merge(merged_df, users_df, left_on="UserId", right_on="Id", suffixes=["_merged", "_user"])
merged_df = merged_df.sort_values(by=["CreationDate_user"], ascending=False).head(5)
merged_df = merged_df.sem_filter("The sentiment of {Text} is positive")
prediction = merged_df.Id_comment.tolist()
return prediction, answer
def pipeline_111():
query = 'For the post from which the tag "bayesian" is excerpted from, identify whether the body of the post is True or False. Answer with True or False ONLY.'
answer = "TRUE"
posts_df = pd.read_csv("../pandas_dfs/codebase_community/posts.csv")
tags_df = pd.read_csv("../pandas_dfs/codebase_community/tags.csv")
tags_df = tags_df[tags_df["TagName"] == "bayesian"]
merged_df = pd.merge(tags_df, posts_df, left_on="ExcerptPostId", right_on="Id")
prediction = merged_df.sem_map("Determine whether the content in {Body} is true. Respond with only TRUE or FALSE.")[
"_map"
].values[0]
return prediction, answer
############################################################################################################
###################################### Comparison based queries ############################################
############################################################################################################
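# Pattern: each pipeline first obtains an external constant (a currency code, capital
# city, unit conversion, etc.), either by calling `lm` directly with a chat-style
# message list or by running sem_map over a one-row DataFrame, then applies it as an
# ordinary pandas filter. Several prompts ask for the number inside \boxed{...} so it
# can be extracted with a regex.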
def pipeline_25():
query = "What is the average total price of the transactions taken place in gas stations in the country which is historically known as Bohemia, to the nearest integer?"
answer = "453"
transactions_df = pd.read_csv("../pandas_dfs/debit_card_specializing/transactions_1k.csv")
gasstations_df = pd.read_csv("../pandas_dfs/debit_card_specializing/gasstations.csv")
countries = lm(
[
{
"role": "user",
"content": "What are abbreviations for the country historically known as Bohemia? If there are multiple possible abbreivations list them as a python list with quotes around each abbreviation.",
}
]
)[0]
try:
countries = eval(countries)
    except Exception:
        # If the response is not a parsable Python list, treat it as a single country code.
        countries = [countries]
gasstations_df = gasstations_df[gasstations_df["Country"].isin(countries)]
merged_df = pd.merge(transactions_df, gasstations_df, on="GasStationID")
prediction = round(merged_df["Price"].mean())
return prediction, answer
def pipeline_27():
query = (
"List the username of the oldest user located in the capital city of Austria who obtained the Supporter badge."
)
answer = "ymihere"
users_df = pd.read_csv("../pandas_dfs/codebase_community/users.csv")
badges_df = pd.read_csv("../pandas_dfs/codebase_community/badges.csv")
merged_df = pd.merge(users_df, badges_df, left_on="Id", right_on="UserId")
merged_df = merged_df[merged_df["Name"] == "Supporter"]
location = lm(
[
{
"role": "user",
"content": "What is the capital city of Austria? Respond with only the city name and no other words.",
}
]
)[0]
location = f"{location}, Austria"
merged_df = merged_df[merged_df["Location"] == location]
prediction = merged_df.sort_values(by=["Age"], ascending=False).DisplayName.values.tolist()[0]
return prediction, answer
def pipeline_29():
query = "What is the difference in gas consumption between customers who pay using the currency of the Czech Republic and who pay the currency of European Union in 2012, to the nearest integer?"
answer = 402524570
customers_df = pd.read_csv("../pandas_dfs/debit_card_specializing/customers.csv")
yearmonth_df = pd.read_csv("../pandas_dfs/debit_card_specializing/yearmonth.csv")
countries = {"Area": ["Czech Republic", "European Union"]}
countries_df = pd.DataFrame(countries)
currency_df = countries_df.sem_map(
"Given {Area}, return the 3 letter currency code for the area. Answer with the code ONLY.", suffix="currency"
)
currencies = currency_df["currency"].values.tolist()
yearmonth_df = yearmonth_df[yearmonth_df["Date"] // 100 == 2012]
merged_df = pd.merge(customers_df, yearmonth_df, on="CustomerID")
first_df = merged_df[merged_df["Currency"] == currencies[0]]
second_df = merged_df[merged_df["Currency"] == currencies[1]]
prediction = round(first_df["Consumption"].sum() - second_df["Consumption"].sum())
return prediction, answer
def pipeline_30():
query = "Is it true that more SMEs pay in Czech koruna than in the second-largest reserved currency in the world?"
answer = "Yes"
customers_df = pd.read_csv("../pandas_dfs/debit_card_specializing/customers.csv")
customers_df = customers_df[customers_df["Segment"] == "SME"]
first_df = customers_df[customers_df["Currency"] == "CZK"]
currency = lm(
[
{
"role": "user",
"content": "What is the 3 letter code for the second-largest reserved currency in the world? Respond with only the 3 letter code and no other words.",
}
]
)[0]
second_df = customers_df[customers_df["Currency"] == currency]
if len(first_df) > len(second_df):
prediction = "Yes"
else:
prediction = "No"
return prediction, answer
def pipeline_33():
query = "What is the total number of schools whose total SAT scores are greater or equal to 1500 whose mailing city is the county seat of Lake County, California?"
answer = 2
schools_df = pd.read_csv("../pandas_dfs/california_schools/schools.csv")
scores_df = pd.read_csv("../pandas_dfs/california_schools/satscores.csv")
city = lm(
[
{
"role": "user",
"content": "What is the name of the city that is the county seat of Lake County, California? Respond with only the city name and no other words.",
}
]
)[0]
schools_df = schools_df[schools_df["City"] == city]
scores_df["total"] = scores_df["AvgScrRead"] + scores_df["AvgScrMath"] + scores_df["AvgScrWrite"]
scores_df = scores_df[scores_df["total"] >= 1500]
merged_df = pd.merge(scores_df, schools_df, left_on="cds", right_on="CDSCode")
prediction = len(merged_df)
return prediction, answer
def pipeline_35():
query = "How many drivers born after the end of the Vietnam War have been ranked 2?"
answer = 27
drivers_df = pd.read_csv("../pandas_dfs/formula_1/drivers.csv")
year_df = drivers_df.head(1).sem_map(
"Given {driverId}, return the year of the end of the Vietnam War. Answer with the year ONLY.", suffix="year"
)
vietnamyear = int(year_df["year"].values.tolist()[0])
drivers_df["birthyear"] = drivers_df.dropna(subset=["dob"])["dob"].str[:4].astype(int)
drivers_df = drivers_df[drivers_df["birthyear"] > vietnamyear]
prediction = len(drivers_df)
return prediction, answer
def pipeline_36():
query = "Among all European Grand Prix races, what is the percentage of the races were hosted in the country where the Bundesliga happens, to the nearest whole number?"
answer = 52
circuits_df = pd.read_csv("../pandas_dfs/formula_1/circuits.csv")
races_df = pd.read_csv("../pandas_dfs/formula_1/races.csv")
races_df = races_df[races_df["name"] == "European Grand Prix"]
merged_df = pd.merge(circuits_df, races_df, on="circuitId")
denom = len(merged_df)
merged_df = merged_df.sem_filter("{country} is where the Bundesliga happens")
numer = len(merged_df)
prediction = int(numer * 100 / denom)
return prediction, answer
def pipeline_37():
query = "From 2010 to 2015, what was the average overall rating, rounded to the nearest integer, of players who are higher than 170 and shorter than Michael Jordan?"
answer = 69
jordan_df = pd.DataFrame({"Name": ["Michael Jordan"]})
jordan_df = jordan_df.sem_map(
"Given {Name}, provide the height in cm. Answer with ONLY the number to one decimal place.", suffix="height"
)
jordan_height = float(jordan_df["height"].values.tolist()[0])
players_df = pd.read_csv("../pandas_dfs/european_football_2/Player.csv")
players_df = players_df[players_df["height"] > 170]
players_df = players_df[players_df["height"] < jordan_height]
attributes_df = pd.read_csv("../pandas_dfs/european_football_2/Player_Attributes.csv")
attributes_df["year"] = attributes_df["date"].str[:4].astype(int)
attributes_df = attributes_df[attributes_df["year"] >= 2010]
attributes_df = attributes_df[attributes_df["year"] <= 2015]
merged_df = pd.merge(players_df, attributes_df, on="player_api_id")
prediction = round(merged_df["overall_rating"].mean())
return prediction, answer
def pipeline_38():
query = "Among the drivers that finished the race in the 2008 Australian Grand Prix, how many debuted earlier than Lewis Hamilton?"
answer = 3
drivers_df = pd.read_csv("../pandas_dfs/formula_1/drivers.csv")
races_df = pd.read_csv("../pandas_dfs/formula_1/races.csv")
results_df = pd.read_csv("../pandas_dfs/formula_1/results.csv")
races_df = races_df[races_df["name"] == "Australian Grand Prix"]
races_df = races_df[races_df["year"] == 2008]
results_df = results_df[results_df["time"].notnull()]
drivers_df = drivers_df.sem_map(
"What year did driver {forename} {surname} debut in Formula 1? Answer with the year ONLY.", suffix="debut"
)
drivers_df["debut"] = pd.to_numeric(drivers_df["debut"], errors="coerce")
drivers_df = drivers_df.dropna(subset=["debut"])
merged_df = pd.merge(results_df, races_df, on="raceId").merge(drivers_df, on="driverId")
merged_df = merged_df[merged_df["year"] > merged_df["debut"]]
prediction = len(merged_df)
return prediction, answer
def pipeline_39():
query = "How many players were born after the year of the 14th FIFA World Cup?"
answer = 3028
players_df = pd.read_csv("../pandas_dfs/european_football_2/Player.csv")
year_df = players_df.head(1).sem_map(
"Given {id}, return the year of the 14th FIFA World Cup. Answer with the year ONLY.", suffix="year"
)
wcyear = int(year_df["year"].values.tolist()[0])
players_df["birthyear"] = players_df["birthday"].str[:4].astype(int)
players_df = players_df[players_df["birthyear"] > wcyear]
prediction = len(players_df)
return prediction, answer
def pipeline_40():
query = "Among the players whose height is over 180, how many of them have a volley score of over 70 and are taller than Stephen Curry?"
answer = 88
players_df = pd.read_csv("../pandas_dfs/european_football_2/Player.csv")
attributes_df = pd.read_csv("../pandas_dfs/european_football_2/Player_Attributes.csv")
players_df = players_df[players_df["height"] >= 180]
attributes_df = attributes_df[attributes_df["volleys"] > 70]
merged_df = pd.merge(players_df, attributes_df, on="player_api_id").drop_duplicates(subset="player_api_id")
steph_height = lm(
[
{
"role": "user",
"content": "How tall is Stephen Curry in centimeters? Box your final answer with \\boxed{just a number}.",
}
]
)[0]
steph_height = float(re.search(r"\\boxed\{(\d+(\.\d+)?)\}", steph_height).group(1))
merged_df = merged_df[merged_df["height"] > int(steph_height)]
prediction = len(merged_df)
return prediction, answer
def pipeline_41():
query = "Give the number of schools with the percent eligible for free meals in K-12 is more than 0.1 and test takers whose test score is greater than or equal to the score one hundred points less than the maximum."
answer = 1
scores_df = pd.read_csv("../pandas_dfs/california_schools/satscores.csv")
frpm_df = pd.read_csv("../pandas_dfs/california_schools/frpm.csv")
frpm_df = frpm_df[(frpm_df["Free Meal Count (K-12)"] / frpm_df["Enrollment (K-12)"]) > 0.1]
max_score = lm(
[
{
"role": "user",
"content": "What is the maximum SAT score?. Box your final answer with \\boxed{just a number}.",
}
]
)[0]
max_score = float(re.search(r"\\boxed\{(\d+(\.\d+)?)\}", max_score).group(1))
scores_df = scores_df[scores_df["AvgScrRead"] + scores_df["AvgScrMath"] >= max_score - 300]
merged_df = pd.merge(scores_df, frpm_df, left_on="cds", right_on="CDSCode").drop_duplicates(subset="cds")
prediction = len(merged_df)
return prediction, answer
def pipeline_42():
query = "How many schools have the difference in enrollements between K-12 and ages 5-17 as more than average high school class size?"
answer = 1236
schools_df = pd.read_csv("../pandas_dfs/california_schools/schools.csv")
frpm_df = pd.read_csv("../pandas_dfs/california_schools/frpm.csv")
avg_class_size = lm(
[
{
"role": "user",
"content": "What is the average high school class size? Box your final answer with \\boxed{just a number}.",
}
]
)[0]
avg_class_size = float(re.search(r"\\boxed\{(\d+(\.\d+)?)\}", avg_class_size).group(1))
frpm_df = frpm_df[frpm_df["Enrollment (K-12)"] - frpm_df["Enrollment (Ages 5-17)"] > avg_class_size]
merged_df = pd.merge(schools_df, frpm_df, on="CDSCode").drop_duplicates(subset="CDSCode")
prediction = len(merged_df)
return prediction, answer
def pipeline_43():
query = "Among the users who have more than 100 upvotes, how many of them are older than the median age in America?"
answer = 32
users_df = pd.read_csv("../pandas_dfs/codebase_community/users.csv")
users_df = users_df[users_df["UpVotes"] > 100]
median_age = lm(
[
{
"role": "user",
"content": "What is the median age in America? Box your final answer with \\boxed{just a number}.",
}
]
)[0]
median_age = float(re.search(r"\\boxed\{(\d+(\.\d+)?)\}", median_age).group(1))
users_df = users_df[users_df["Age"] > median_age]
prediction = len(users_df)
return prediction, answer
def pipeline_44():
query = "Please list the player names taller than 6 foot 8?"
answer = ["Kristof van Hout"]
players_df = pd.read_csv("../pandas_dfs/european_football_2/Player.csv")
height = lm(
[
{
"role": "user",
"content": "What is 6 foot 8 in centimeters? Box your final answer with \\boxed{just a number}.",
}
]
)[0]
height = float(re.search(r"\\boxed\{(\d+(\.\d+)?)\}", height).group(1))
players_df = players_df[players_df["height"] > height]
prediction = players_df["player_name"].values.tolist()
return prediction, answer
def pipeline_45():
query = "How many players whose first names are Adam and weigh more than 77.1kg?"
answer = 24
players_df = pd.read_csv("../pandas_dfs/european_football_2/Player.csv")
players_df = players_df[players_df["player_name"].str.startswith("Adam")]
pounds = lm(
[{"role": "user", "content": "What is 77.1kg in pounds? Box your final answer with \\boxed{just a number}."}]
)[0]
pounds = float(re.search(r"\\boxed\{(\d+(\.\d+)?)\}", pounds).group(1))
players_df = players_df[players_df["weight"] > pounds]
prediction = len(players_df)
return prediction, answer
def pipeline_46():
query = "Please provide the names of top three football players who are over 5 foot 11 tall in alphabetical order."
answer = ["Aaron Appindangoye", "Aaron Galindo", "Aaron Hughes"]
players_df = pd.read_csv("../pandas_dfs/european_football_2/Player.csv")
height = lm(
[
{
"role": "user",
"content": "What is 5 foot 11 in centimeters? Box your final answer with \\boxed{just a number}.",
}
]
)[0]
height = float(re.search(r"\\boxed\{(\d+(\.\d+)?)\}", height).group(1))
players_df = players_df[players_df["height"] > height]
players_df = players_df.sort_values("player_name")
prediction = players_df["player_name"].head(3).values.tolist()
return prediction, answer
def pipeline_47():
query = "How many transactions taken place in the gas station in the Czech Republic are with a price of over 42.74 US dollars?"
answer = 56
transactions_df = pd.read_csv("../pandas_dfs/debit_card_specializing/transactions_1k.csv")
gasstations_df = pd.read_csv("../pandas_dfs/debit_card_specializing/gasstations.csv")
gasstations_df = gasstations_df[gasstations_df["Country"] == "CZE"]
merged_df = pd.merge(transactions_df, gasstations_df, on="GasStationID")
price = lm(
[{"role": "user", "content": "What is 42.74 USD in CZK? Box your final answer with \\boxed{just a number}."}]
)[0]
price = float(re.search(r"\\boxed\{(\d+(\.\d+)?)\}", price).group(1))
merged_df = merged_df[merged_df["Price"] > price]
prediction = len(merged_df)
return prediction, answer
def pipeline_48():
query = "Which of these circuits is located closer to a capital city, Silverstone Circuit, Hockenheimring or Hungaroring?"
answer = "Hungaroring"
circuits_df = pd.read_csv("../pandas_dfs/formula_1/circuits.csv")
circuits_df = circuits_df[circuits_df["name"].isin(["Silverstone Circuit", "Hockenheimring", "Hungaroring"])]
prediction = circuits_df.sem_topk("What circuit, named {name} is located closer to a capital city?", 1).name.values[
0
]
return prediction, answer
def pipeline_49():
query = "Which race was Alex Yoong in when he was in the top half of finishers?"
answer = "Australian Grand Prix"
drivers_df = pd.read_csv("../pandas_dfs/formula_1/drivers.csv")
drivers_df = drivers_df[(drivers_df["forename"] == "Alex") & (drivers_df["surname"] == "Yoong")]
driverStandings_df = pd.read_csv("../pandas_dfs/formula_1/driverStandings.csv")
races_df = pd.read_csv("../pandas_dfs/formula_1/races.csv")
merged_df = pd.merge(drivers_df, driverStandings_df, on="driverId").merge(races_df, on="raceId")
prediction = merged_df.sem_filter(
"The {position} is in the top half of number racers in a formula 1 race."
).name.values[0]
return prediction, answer
############################################################################################################
###################################### Ranking based queries ###############################################
############################################################################################################
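# Pattern: rows are narrowed with pandas, then sem_topk(instruction, k) asks the LLM to
# rank them and returns the top k as a DataFrame, from which the prediction column is read.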
def pipeline_50():
query = "Among the magnet schools with SAT test takers of over 500, which school name sounds most futuristic?"
answer = "Polytechnic High"
schools_df = pd.read_csv("../pandas_dfs/california_schools/schools.csv")
schools_df = schools_df[schools_df["Magnet"] == 1]
satscores_df = pd.read_csv("../pandas_dfs/california_schools/satscores.csv")
satscores_df = satscores_df[satscores_df["NumTstTakr"] > 500]
merged_df = pd.merge(schools_df, satscores_df, left_on="CDSCode", right_on="cds")
prediction = merged_df.sem_topk("What {School} sounds most futuristic?", 1).School.values[0]
return prediction, answer
def pipeline_51():
query = "Of the 5 posts wih highest popularity, list their titles in order of most technical to least technical."
answer = [
"How to interpret and report eta squared / partial eta squared in statistically significant and non-significant analyses?",
"How to interpret F- and p-value in ANOVA?",
"What is the meaning of p values and t values in statistical tests?",
"How to choose between Pearson and Spearman correlation?",
"How do I get the number of rows of a data.frame in R?",
]
posts_df = (
pd.read_csv("../pandas_dfs/codebase_community/posts.csv").sort_values(by=["ViewCount"], ascending=False).head(5)
)
prediction = posts_df.sem_topk("What {Title} is most technical?", 5).Title.values.tolist()
return prediction, answer
def pipeline_52():
query = "What are the Post Ids of the top 2 posts in order of most grateful comments received on 9-14-2014"
answer = [115372, 115254]
posts_df = pd.read_csv("../pandas_dfs/codebase_community/posts.csv")
comments_df = pd.read_csv("../pandas_dfs/codebase_community/comments.csv")
comments_df = comments_df[comments_df["CreationDate"].str.startswith("2014-09-14")]
merged_df = pd.merge(posts_df, comments_df, left_on="Id", right_on="PostId")
merged_df = merged_df.sem_filter("The sentiment on {Text} is that of someone being grateful.")
merged_df = merged_df.groupby("Id_x").size().sort_values(ascending=False)
prediction = list(merged_df.index[:2])
return prediction, answer
def pipeline_53():
query = "For the post owned by csgillespie with the highest popularity, what is the most sarcastic comment?"
answer = "That pirates / global warming chart is clearly cooked up by conspiracy theorists - anyone can see they have deliberately plotted even spacing for unequal time periods to avoid showing the recent sharp increase in temperature as pirates are almost entirely wiped out. We all know that as temperatures rise it makes the rum evaporate and pirates cannot survive those conditions."
posts_df = pd.read_csv("../pandas_dfs/codebase_community/posts.csv")
users_df = pd.read_csv("../pandas_dfs/codebase_community/users.csv")
users_df = users_df[users_df["DisplayName"] == "csgillespie"]
merged_df = (
pd.merge(posts_df, users_df, left_on="OwnerUserId", right_on="Id")
.sort_values(by=["ViewCount"], ascending=False)
.head(1)
)
comments_df = pd.read_csv("../pandas_dfs/codebase_community/comments.csv")
merged_df_with_comments = pd.merge(merged_df, comments_df, left_on="Id_x", right_on="PostId")
prediction = merged_df_with_comments.sem_topk("What {Text} is most sarcastic?", 1).Text.values[0]
return prediction, answer
def pipeline_54():
query = "Among the top 10 most popular tags, which is the least related to statistics?"
answer = "self-study"
tags_df = pd.read_csv("../pandas_dfs/codebase_community/tags.csv")
tags_df = tags_df.sort_values("Count", ascending=False)
tags_df = tags_df.head(10)
prediction = tags_df.sem_topk("{TagName} is the least related to statistics?", 1).TagName.values[0]
return prediction, answer
def pipeline_55():
query = "Of the top 10 most favorited posts, what is the Id of the most lighthearted post?"
answer = 423
posts_df = pd.read_csv("../pandas_dfs/codebase_community/posts.csv")
posts_df = posts_df.sort_values("FavoriteCount", ascending=False).head(10)
prediction = posts_df.sem_topk("What {Body} is most lighthearted?", 1).Id.values[0]
prediction = int(prediction)
return prediction, answer
def pipeline_56():
query = "Among the posts owned by a user over 65 with a score of over 10, what are the post id's of the top 2 posts made with the least expertise?"
answer = [8485, 15670]
posts_df = pd.read_csv("../pandas_dfs/codebase_community/posts.csv")
users_df = pd.read_csv("../pandas_dfs/codebase_community/users.csv")
users_df = users_df[users_df["Age"] > 65]
posts_df = posts_df[posts_df["Score"] > 10]
merged_df = pd.merge(users_df, posts_df, left_on="Id", right_on="OwnerUserId", suffixes=["_users", "_posts"])
prediction = merged_df.sem_topk("What {Body} is made with the least expertise?", 2).Id_posts.values.tolist()
return prediction, answer
def pipeline_57():
query = "Among the badges obtained by csgillespie in 2011, which is the most creatively named?"
answer = "Strunk & White"
users_df = pd.read_csv("../pandas_dfs/codebase_community/users.csv")
badges_df = pd.read_csv("../pandas_dfs/codebase_community/badges.csv")
users_df = users_df[users_df["DisplayName"] == "csgillespie"]
merged_df = pd.merge(users_df, badges_df, left_on="Id", right_on="UserId").drop_duplicates("Name")
prediction = merged_df.sem_topk("What {Name} is most creative?", 1).Name.values[0]
return prediction, answer
def pipeline_58():
query = "Of the posts owned by Yevgeny, what are the id's of the top 3 most pessimistic?"
answer = [23819, 24216, 35748]
users_df = pd.read_csv("../pandas_dfs/codebase_community/users.csv")
posts_df = pd.read_csv("../pandas_dfs/codebase_community/posts.csv")
users_df = users_df[users_df["DisplayName"] == "Yevgeny"]
merged_df = pd.merge(users_df, posts_df, left_on="Id", right_on="OwnerUserId", suffixes=["_users", "_posts"])
prediction = merged_df.sem_topk("What {Body} is most pessimistic?", 3).Id_posts.values.tolist()
return prediction, answer
def pipeline_59():
query = "Of the top 10 players taller than 180 ordered by average heading accuracy, what are the top 3 most unique sounding names?"
answer = ["Naldo", "Per Mertesacker", "Didier Drogba"]
players_df = pd.read_csv("../pandas_dfs/european_football_2/Player.csv")
attributes_df = pd.read_csv("../pandas_dfs/european_football_2/Player_Attributes.csv")
players_df = players_df[players_df["height"] > 180]
merged_df = pd.merge(players_df, attributes_df, on="player_api_id", suffixes=["_players", "_attributes"])
merged_df = merged_df.groupby("player_api_id")
merged_df = merged_df.agg({"heading_accuracy": "mean", "player_name": "first"}).reset_index()
merged_df = merged_df.sort_values("heading_accuracy", ascending=False).head(10)
prediction = merged_df.sem_topk("What {player_name} is most unique sounding?", 3).player_name.values.tolist()
return prediction, answer
def pipeline_60():
query = "Out of users that have obtained at least 200 badges, what are the top 2 display names that seem most based off a real name?"
answer = ["Jeromy Anglim", "Glen_b"]
users_df = pd.read_csv("../pandas_dfs/codebase_community/users.csv")
badges_df = pd.read_csv("../pandas_dfs/codebase_community/badges.csv")
merged_df = pd.merge(users_df, badges_df, left_on="Id", right_on="UserId")
merged_df = merged_df.groupby("DisplayName").filter(lambda x: len(x) >= 200).drop_duplicates(subset="DisplayName")
prediction = merged_df.sem_topk(
"What {DisplayName} seems most based off a real name?", 2
).DisplayName.values.tolist()
return prediction, answer
def pipeline_61():
query = "Of the cities containing exclusively virtual schools which are the top 3 safest places to live?"
answer = ["Thousand Oaks", "Simi Valley", "Westlake Village"]
schools_df = pd.read_csv("../pandas_dfs/california_schools/schools.csv")
schools_df = schools_df[schools_df["Virtual"] == "F"].drop_duplicates(subset="City")
prediction = schools_df.sem_topk("What {City} is the safest place to live?", 3).City.values.tolist()
return prediction, answer
def pipeline_62():
query = "List the cities containing the top 5 most enrolled schools in order from most diverse to least diverse. "
answer = ["Long Beach", "Paramount", "Granada Hills", "Temecula", "Carmichael"]
schools_df = pd.read_csv("../pandas_dfs/california_schools/schools.csv")
frpm_df = pd.read_csv("../pandas_dfs/california_schools/frpm.csv")
merged_df = pd.merge(schools_df, frpm_df, on="CDSCode")
merged_df = merged_df.sort_values("Enrollment (K-12)", ascending=False).head(5)
prediction = merged_df.sem_topk("What {City} is the most diverse?", 5).City.values.tolist()
return prediction, answer
def pipeline_63():
query = "Please list the top three continuation schools with the lowest eligible free rates for students aged 5-17 and rank them based on the overall affordability of their respective cities."
answer = ["Del Amigo High (Continuation)", "Rancho del Mar High (Continuation)", "Millennium High Alternative"]
schools_df = pd.read_csv("../pandas_dfs/california_schools/schools.csv")
frpm_df = pd.read_csv("../pandas_dfs/california_schools/frpm.csv")
frpm_df = frpm_df[frpm_df["Educational Option Type"] == "Continuation School"]
frpm_df["frpm_rate"] = frpm_df["Free Meal Count (Ages 5-17)"] / frpm_df["Enrollment (Ages 5-17)"]
frpm_df = frpm_df.sort_values("frpm_rate", ascending=True).head(3)
merged_df = pd.merge(schools_df, frpm_df, on="CDSCode")
prediction = merged_df.sem_topk("What {City} is the most affordable?", 3).School.values.tolist()
return prediction, answer
def pipeline_64():
query = "Of the schools with the top 3 SAT excellence rate, order their counties by academic reputation from strongest to weakest."
answer = "Santa Clara County"
schools_df = pd.read_csv("../pandas_dfs/california_schools/schools.csv")
satscores_df = pd.read_csv("../pandas_dfs/california_schools/satscores.csv")
satscores_df["excellence_rate"] = satscores_df["NumGE1500"] / satscores_df["NumTstTakr"]
satscores_df = satscores_df.sort_values("excellence_rate", ascending=False).head(3)
merged_df = pd.merge(schools_df, satscores_df, left_on="CDSCode", right_on="cds")
prediction = merged_df.sem_topk("What {County} has the strongest academic reputation?", 3).County.values[0]
return prediction, answer
def pipeline_65():
query = "Among the cities with the top 10 lowest enrollment for students in grades 1 through 12, which are the top 2 most popular cities to visit?"
answer = ["Death Valley", "Shaver Lake"]
schools_df = pd.read_csv("../pandas_dfs/california_schools/schools.csv")
frpm_df = pd.read_csv("../pandas_dfs/california_schools/frpm.csv")
merged_df = pd.merge(schools_df, frpm_df, on="CDSCode")
merged_df = merged_df.groupby("City").agg({"Enrollment (K-12)": "sum"}).reset_index()
merged_df = merged_df.sort_values("Enrollment (K-12)", ascending=True).head(10)
prediction = merged_df.sem_topk(
"What {City}/location in California is the most popular to visit?", 2
).City.values.tolist()
return prediction, answer
def pipeline_106():
query = "Of the top 5 users with the most views, who has their social media linked in their AboutMe section?"
answer = "whuber"
users_df = pd.read_csv("../pandas_dfs/codebase_community/users.csv")
users_df = users_df.sort_values("Views", ascending=False).head(5)
prediction = users_df.sem_filter("The {AboutMe} contains a link to social media.").DisplayName.values[0]
return prediction, answer
def pipeline_107():
query = "Of all the comments commented by the user with a username of Harvey Motulsky and with a score of 5, rank the post ids in order of most helpful to least helpful."
answer = [89457, 64710, 4945]
users_df = pd.read_csv("../pandas_dfs/codebase_community/users.csv")
comments_df = pd.read_csv("../pandas_dfs/codebase_community/comments.csv")
users_df = users_df[users_df["DisplayName"] == "Harvey Motulsky"]
comments_df = comments_df[comments_df["Score"] == 5]
merged_df = pd.merge(users_df, comments_df, left_on="Id", right_on="UserId")
prediction = merged_df.sem_topk("What {Text} is most helpful?", 3).PostId.values.tolist()
return prediction, answer
def pipeline_952():
query = "Of the constructors that have been ranked 1 in 2014, which has the most prestige"
answer = "Ferrari"
constructors_df = pd.read_csv("../pandas_dfs/formula_1/constructors.csv")
results_df = pd.read_csv("../pandas_dfs/formula_1/results.csv")
races_df = pd.read_csv("../pandas_dfs/formula_1/races.csv")
merged_df = pd.merge(results_df, constructors_df, on="constructorId", suffixes=["_results", "_constructors"])
merged_df = pd.merge(merged_df, races_df, on="raceId", suffixes=["_merged", "_races"])
merged_df = merged_df[(merged_df["rank"] == 1) & (merged_df["year"] == 2014)].drop_duplicates(
subset="constructorId"
)
merged_df = merged_df.rename(columns={"name_merged": "name"})
prediction = merged_df.sem_topk("What {name} is most prestigious?", 1).name.values[0]
return prediction, answer
def pipeline_1000():
query = "Of the 5 racetracks that hosted the most recent races, rank the locations by distance to the equator."
answer = ["Mexico City", "Sao Paulo", "Abu Dhabi", "Austin", "Suzuka"]
circuits_df = pd.read_csv("../pandas_dfs/formula_1/circuits.csv")
races_df = pd.read_csv("../pandas_dfs/formula_1/races.csv")
merged_df = pd.merge(circuits_df, races_df, on="circuitId")
merged_df = merged_df.sort_values("date", ascending=False).head(5)
prediction = merged_df.sem_topk("What {location} is closest to the equator?", 5).location.values.tolist()
return prediction, answer
############################################################################################################
###################################### Aggregation based queries ###########################################
############################################################################################################
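# Pattern: rows are filtered with pandas, then sem_agg produces a free-form LLM summary
# over them (all_cols=True passes every column); the text is read from the `_output` column.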
def pipeline_81():
query = "Summarize the qualities of the schools with an average score in Math under 600 in the SAT test and are exclusively virtual."
answer = ""
scores_df = pd.read_csv("../pandas_dfs/california_schools/satscores.csv")
schools_df = pd.read_csv("../pandas_dfs/california_schools/schools.csv")
scores_df = scores_df[scores_df["AvgScrMath"] < 600]
schools_df = schools_df[schools_df["Virtual"] == "F"]
merged_df = pd.merge(scores_df, schools_df, left_on="cds", right_on="CDSCode")
prediction = merged_df.sem_agg("Summarize the qualities of the schools", all_cols=True)._output[0]
return prediction, answer
def pipeline_82():