diff --git a/README.md b/README.md index ed8806c..cd4b413 100644 --- a/README.md +++ b/README.md @@ -431,6 +431,11 @@ CodeWhisper's performance has been evaluated across different models using the E | -------------------------- | ------------ | -------- | -------- | ------------------------------------------------------------------------------ | | claude-3-5-sonnet-20240620 | 80.27% | 1619.49 | 3.4000 | `./benchmark/run_benchmark.sh --workers 5 --no-plan` | | gpt-4o-2024-08-06 | 81.51% | 986.68 | 1.6800 | `./benchmark/run_benchmark.sh --workers 5 --no-plan --model gpt-4o-2024-08-06` | +| deepseek-coder | 76.89% | 5850.58 | 0.0000\* | `./benchmark/run_benchmark.sh --workers 5 --no-plan --model deepseek-coder` | + +\*The cost calculation was not working properly for this benchmark run. + +> **Note:** All benchmarks are one-shot only, unlike other benchmarks which use multiple generations that depend on the results of the test run. The full reports used to generate these results are available in the `benchmark/reports/` directory. diff --git a/benchmark/README.md b/benchmark/README.md index 595be4a..78f74ba 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -15,6 +15,9 @@ CodeWhisper's performance has been evaluated across different models using the E | -------------------------- | ------------ | -------- | -------- | ------------------------------------------------------------------------------ | | claude-3-5-sonnet-20240620 | 80.27% | 1619.49 | 3.4000 | `./benchmark/run_benchmark.sh --workers 5 --no-plan` | | gpt-4o-2024-08-06 | 81.51% | 986.68 | 1.6800 | `./benchmark/run_benchmark.sh --workers 5 --no-plan --model gpt-4o-2024-08-06` | +| deepseek-coder | 76.89% | 5850.58 | 0.0000\* | `./benchmark/run_benchmark.sh --workers 5 --no-plan --model deepseek-coder` | + +\*The cost calculation was not working properly for this benchmark run. 
> **Note:** All benchmarks are one-shot only, unlike other benchmarks which use multiple generations that depend on the results of the test run. diff --git a/benchmark/reports/benchmark_report_deepseek-coder_diff_reference.md b/benchmark/reports/benchmark_report_deepseek-coder_diff_reference.md new file mode 100644 index 0000000..8800102 --- /dev/null +++ b/benchmark/reports/benchmark_report_deepseek-coder_diff_reference.md @@ -0,0 +1,1280 @@ +# CodeWhisper Benchmark Report + +## Summary + +- **Total time:** 5850.58 seconds +- **Total cost:** $0.0000 +- **Passed exercises:** 88/133 (66.17%) +- **Total tests passed:** 4032/5244 (76.89%) + +### 2. acronym + +- **Time taken:** 20.59 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/17 (76.47%) +- **Failed tests:** + - test_apostrophes + +# CodeWhisper Benchmark Report + +## Detailed Results + +### 1. accumulate + +- **Time taken:** 21.38 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 4. all-your-base + +- **Time taken:** 27.47 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 22/22 (100.00%) + +### 5. allergies + +- **Time taken:** 30.32 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 51/51 (100.00%) + +### 7. anagram + +- **Time taken:** 20.89 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 19/19 (100.00%) + +### 3. affine-cipher + +- **Time taken:** 42.63 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 17/17 (100.00%) + +### 8. armstrong-numbers + +- **Time taken:** 22.29 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 10/10 (100.00%) + +### 6. 
alphametics + +- **Time taken:** 25.86 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 10/10 (100.00%) + +### 9. atbash-cipher + +- **Time taken:** 26.69 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) + +### 12. binary + +- **Time taken:** 20.81 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 12/12 (100.00%) + +### 11. beer-song + +- **Time taken:** 30.09 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 9/9 (100.00%) + +### 13. binary-search + +- **Time taken:** 22.48 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 12/12 (100.00%) + +### 10. bank-account + +- **Time taken:** 36.56 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 17/17 (100.00%) + +### 14. binary-search-tree + +- **Time taken:** 34.44 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 11/11 (100.00%) + +### 15. bob + +- **Time taken:** 23.77 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 26/26 (100.00%) + +### 16. 
book-store + +- **Time taken:** 29.92 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 49/71 (69.01%) +- **Failed tests:** + - test_check_that_groups_of_four_are_created_properly_even_when_there_are_more_groups_of_three_than_groups_of_five + - test_four_groups_of_four_are_cheaper_than_two_groups_each_of_five_and_three + - test_one_group_of_one_and_two_plus_three_groups_of_four_is_cheaper_than_one_group_of_each_size + - test_shuffled_book_order + - test_two_groups_of_four_and_a_group_of_five + - test_two_groups_of_four_is_cheaper_than_group_of_five_plus_group_of_three + - test_two_groups_of_four_is_cheaper_than_groups_of_five_and_three + +### 17. bottle-song + +- **Time taken:** 27.97 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 59/88 (67.05%) +- **Failed tests:** + - test_all_verses + - test_first_generic_verse + - test_first_two_verses + - test_last_generic_verse + - test_last_three_verses + - test_verse_with_1_bottle + - test_verse_with_2_bottles + +### 19. change + +- **Time taken:** 26.76 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 29/50 (58.00%) +- **Failed tests:** + - test_another_possible_change_without_unit_coins_available + - test_change_with_lilliputian_coins + - test_large_target_values + - test_multiple_coin_change + - test_possible_change_without_unit_coins_available + +### 22. collatz-conjecture + +- **Time taken:** 21.39 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 18. bowling + +- **Time taken:** 49.52 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 270/357 (75.63%) +- **Failed tests:** + - test_two_rolls_in_a_frame_cannot_score_more_than_10_points + +### 21. 
clock + +- **Time taken:** 32.59 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 26/42 (61.90%) + +### 20. circular-buffer + +- **Time taken:** 49.51 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) + +### 25. crypto-square + +- **Time taken:** 24.30 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 45/58 (77.59%) + +### 24. connect + +- **Time taken:** 38.97 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 20/30 (66.67%) +- **Failed tests:** + - test_illegal_diagonal_does_not_make_a_winner + - test_o_wins_crossing_from_top_to_bottom + - test_only_edges_does_not_make_a_winner + +### 27. darts + +- **Time taken:** 19.66 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 14/14 (100.00%) + +### 23. complex-numbers + +- **Time taken:** 50.18 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 58/63 (92.06%) + +### 26. custom-set + +- **Time taken:** 42.11 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 41/41 (100.00%) + +### 28. diamond + +- **Time taken:** 27.74 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 6/6 (100.00%) + +### 30. diffie-hellman + +- **Time taken:** 26.22 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 29. difference-of-squares + +- **Time taken:** 29.80 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 10/10 (100.00%) + +### 31. 
dnd-character + +- **Time taken:** 28.32 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 20/20 (100.00%) + +### 32. dominoes + +- **Time taken:** 32.88 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 67/96 (69.79%) +- **Failed tests:** + - test_empty_input_empty_output + +### 34. eliuds-eggs + +- **Time taken:** 21.65 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 5/5 (100.00%) + +### 33. dot-dsl + +- **Time taken:** 32.03 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/13 (100.00%) + +### 36. etl + +- **Time taken:** 19.65 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 5/5 (100.00%) + +### 35. error-handling + +- **Time taken:** 37.27 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 6/6 (100.00%) + +### 37. flatten-array + +- **Time taken:** 21.13 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 12/12 (100.00%) + +### 40. gigasecond + +- **Time taken:** 15.36 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 6/6 (100.00%) + +### 38. food-chain + +- **Time taken:** 41.20 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 52/70 (74.29%) +- **Failed tests:** + - test_fly + - test_full_song + - test_multiple_verses + +### 43. grains + +- **Time taken:** 24.32 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 12/12 (100.00%) + +### 39. 
forth + +- **Time taken:** 50.52 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 62/75 (82.67%) +- **Failed tests:** + - test_user_defined_words_can_use_different_words_with_the_same_name + +### 42. grade-school + +- **Time taken:** 37.40 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 21/21 (100.00%) + +### 45. hamming + +- **Time taken:** 21.42 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 10/10 (100.00%) + +### 47. hello-world + +- **Time taken:** 14.43 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 2/2 (100.00%) + +### 41. go-counting + +- **Time taken:** 53.15 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/14 (57.14%) + +### 44. grep + +- **Time taken:** 37.59 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 66/83 (79.52%) +- **Failed tests:** + - test_multiple_files_one_match_print_file_names_flag + - test_multiple_files_several_matches_file_flag_takes_precedence_over_line_number_flag + - test_one_file_one_match_file_flag_takes_precedence_over_line_flag + - test_one_file_one_match_print_file_names_flag + +### 48. hexadecimal + +- **Time taken:** 23.06 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/18 (83.33%) + +### 46. hangman + +- **Time taken:** 37.73 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/8 (100.00%) + +### 49. high-scores + +- **Time taken:** 20.52 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/13 (100.00%) + +### 50. 
house + +- **Time taken:** 28.69 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) + +### 51. isbn-verifier + +- **Time taken:** 32.19 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 20/20 (100.00%) + +### 52. isogram + +- **Time taken:** 25.12 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) + +### 53. killer-sudoku-helper + +- **Time taken:** 27.20 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 14/14 (100.00%) + +### 54. kindergarten-garden + +- **Time taken:** 31.55 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 82/103 (79.61%) + +### 55. knapsack + +- **Time taken:** 25.27 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/8 (100.00%) + +### 57. leap + +- **Time taken:** 21.87 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 10/10 (100.00%) + +### 56. largest-series-product + +- **Time taken:** 27.49 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) + +### 60. list-ops + +- **Time taken:** 40.33 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 32/39 (82.05%) +- **Failed tests:** + - test_foldr_direction_dependent_function_applied_to_non_empty_list + - test_foldr_foldr_add_string + +### 61. luhn + +- **Time taken:** 31.26 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 24/24 (100.00%) + +### 59. 
linked-list + +- **Time taken:** 46.48 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 25/25 (100.00%) + +### 58. ledger + +- **Time taken:** 600.00 seconds +- **Cost:** $0.0000 +- **Mode used:** whole +- **Model used:** deepseek-coder +- **Tests passed:** 0/0 (0.00%) +- **Errors:** + - CodeWhisper execution timed out after 10 minutes + +### 64. matrix + +- **Time taken:** 22.56 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 9/9 (100.00%) + +### 63. matching-brackets + +- **Time taken:** 26.14 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 21/21 (100.00%) + +### 62. markdown + +- **Time taken:** 600.00 seconds +- **Cost:** $0.0000 +- **Mode used:** whole +- **Model used:** deepseek-coder +- **Tests passed:** 0/0 (0.00%) +- **Errors:** + - CodeWhisper execution timed out after 10 minutes + +### 66. minesweeper + +- **Time taken:** 32.49 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 16/16 (100.00%) + +### 67. nth-prime + +- **Time taken:** 27.93 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 65. meetup + +- **Time taken:** 51.63 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 251/305 (82.30%) +- **Failed tests:** + - test_nonexistent_fifth_friday_of_august_2022 + - test_nonexistent_fifth_monday_of_february_2022 + - test_nonexistent_fifth_thursday_of_may_2023 + +### 69. octal + +- **Time taken:** 25.77 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 11/11 (100.00%) + +### 68. 
ocr-numbers + +- **Time taken:** 34.65 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 60/103 (58.25%) +- **Failed tests:** + - test_garbled_numbers_in_a_string_are_replaced_with + - test_numbers_separated_by_empty_lines_are_recognized_lines_are_joined_by_commas + - test_recognizes_0 + - test_recognizes_1 + - test_recognizes_110101100 + - test_recognizes_2 + - test_recognizes_3 + - test_recognizes_4 + - test_recognizes_5 + - test_recognizes_6 + - test_recognizes_7 + - test_recognizes_8 + - test_recognizes_9 + - test_recognizes_string_of_decimal_numbers + +### 72. pangram + +- **Time taken:** 21.02 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/13 (100.00%) + +### 73. pascals-triangle + +- **Time taken:** 26.14 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 11/11 (100.00%) + +### 74. perfect-numbers + +- **Time taken:** 28.31 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 14/14 (100.00%) + +### 70. paasio + +- **Time taken:** 600.00 seconds +- **Cost:** $0.0000 +- **Mode used:** whole +- **Model used:** deepseek-coder +- **Tests passed:** 0/0 (0.00%) +- **Errors:** + - CodeWhisper execution timed out after 10 minutes + +### 77. point-mutations + +- **Time taken:** 17.56 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 9/9 (100.00%) + +### 71. palindrome-products + +- **Time taken:** 60.00 seconds +- **Cost:** $0.0000 +- **Mode used:** whole +- **Model used:** deepseek-coder +- **Tests passed:** 0/0 (0.00%) +- **Errors:** + - Exercise execution timed out + +### 75. 
phone-number + +- **Time taken:** 37.53 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 27/31 (87.10%) +- **Failed tests:** + - test_invalid_with_punctuations + +### 76. pig-latin + +- **Time taken:** 30.32 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 26/30 (86.67%) +- **Failed tests:** + - test_word_beginning_with_th + +### 80. prime-factors + +- **Time taken:** 23.19 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/13 (100.00%) + +### 82. proverb + +- **Time taken:** 23.05 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 9/9 (100.00%) + +### 78. poker + +- **Time taken:** 41.50 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 179/334 (53.59%) +- **Failed tests:** + - test_a_straight_beats_three_of_a_kind + - test_aces_can_start_a_straight_a_2_3_4_5 + - test_aces_can_start_a_straight_flush_a_2_3_4_5 + - test_aces_cannot_be_in_the_middle_of_a_straight_q_k_a_2_3 + - test_both_hands_have_a_flush_tie_goes_to_high_card_down_to_the_last_one_if_necessary + - test_both_hands_have_a_full_house_tie_goes_to_highest_ranked_triplet + - test_both_hands_have_a_straight_flush_tie_goes_to_highest_ranked_card + - test_both_hands_have_four_of_a_kind_tie_goes_to_high_quad + - test_both_hands_have_the_same_pair_high_card_wins + - test_both_hands_have_three_of_a_kind_tie_goes_to_highest_ranked_triplet + - test_both_hands_have_two_identically_ranked_pairs_tie_goes_to_remaining_card_kicker + - test_both_hands_have_two_pairs_highest_ranked_pair_wins + - test_both_hands_have_two_pairs_that_add_to_the_same_value_win_goes_to_highest_pair + - test_both_hands_have_two_pairs_with_the_same_highest_ranked_pair_tie_goes_to_low_pair + - test_both_hands_with_a_straight_tie_goes_to_highest_ranked_card + - 
test_even_though_an_ace_is_usually_high_a_5_high_straight_flush_is_the_lowest_scoring_straight_flush + - test_even_though_an_ace_is_usually_high_a_5_high_straight_is_the_lowest_scoring_straight + - test_flush_beats_a_straight + - test_four_of_a_kind_beats_a_full_house + - test_full_house_beats_a_flush + - test_highest_pair_wins + - test_multiple_hands_with_the_same_high_cards_tie_compares_next_highest_ranked_down_to_last_card + - test_one_pair_beats_high_card + - test_single_hand_always_wins + - test_three_of_a_kind_beats_two_pair + - test_two_pairs_beats_one_pair + - test_two_pairs_first_ranked_by_largest_pair + - test_winning_high_card_hand_also_has_the_lowest_card + - test_with_multiple_decks_both_hands_have_a_full_house_with_the_same_triplet_tie_goes_to_the_pair + - test_with_multiple_decks_both_hands_with_identical_four_of_a_kind_tie_determined_by_kicker + - test_with_multiple_decks_two_players_can_have_same_three_of_a_kind_ties_go_to_highest_remaining_cards + +### 81. protein-translation + +- **Time taken:** 32.12 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 27/27 (100.00%) + +### 79. pov + +- **Time taken:** 38.88 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 16/16 (100.00%) + +### 83. pythagorean-triplet + +- **Time taken:** 24.80 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/8 (100.00%) + +### 84. queen-attack + +- **Time taken:** 29.95 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) + +### 86. raindrops + +- **Time taken:** 21.19 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 19/19 (100.00%) + +### 85. 
rail-fence-cipher + +- **Time taken:** 36.04 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 90. resistor-color + +- **Time taken:** 25.60 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 5/5 (100.00%) + +### 87. rational-numbers + +- **Time taken:** 46.94 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 458/597 (76.72%) + +### 89. rectangles + +- **Time taken:** 30.42 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 21/28 (75.00%) +- **Failed tests:** + - test_corner_is_required_for_a_rectangle_to_be_complete + - test_large_input_with_many_rectangles + +### 88. react + +- **Time taken:** 39.28 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 51/64 (79.69%) + +### 91. resistor-color-duo + +- **Time taken:** 26.03 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/8 (100.00%) + +### 95. reverse-string + +- **Time taken:** 18.39 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/8 (100.00%) + +### 93. resistor-color-trio + +- **Time taken:** 30.05 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 11/11 (100.00%) + +### 96. rna-transcription + +- **Time taken:** 18.56 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 92. resistor-color-expert + +- **Time taken:** 40.71 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 30/37 (81.08%) +- **Failed tests:** + - test_green_brown_orange_and_grey + - test_red_black_red_and_green + +### 94. 
rest-api + +- **Time taken:** 42.74 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 55/74 (74.32%) + +### 97. robot-name + +- **Time taken:** 24.78 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 53/70 (75.71%) + +### 99. roman-numerals + +- **Time taken:** 27.36 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 28/28 (100.00%) + +### 100. rotational-cipher + +- **Time taken:** 23.00 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 11/11 (100.00%) + +### 98. robot-simulator + +- **Time taken:** 36.61 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 19/19 (100.00%) + +### 102. saddle-points + +- **Time taken:** 25.09 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/18 (83.33%) + +### 101. run-length-encoding + +- **Time taken:** 32.75 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 14/14 (100.00%) + +### 103. satellite + +- **Time taken:** 34.80 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 107. secret-handshake + +- **Time taken:** 24.07 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 44/83 (53.01%) +- **Failed tests:** + - test_all_possible_actions + - test_close_your_eyes_for_100 + - test_combine_two_actions + - test_double_blink_for_10 + - test_jump_for_1000 + - test_reverse_two_actions + - test_reversing_no_actions_still_gives_no_actions + - test_reversing_one_action_gives_the_same_action + - test_wink_for_1 + +### 105. 
scale-generator + +- **Time taken:** 39.67 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 26/43 (60.47%) +- **Failed tests:** + - test_harmonic_minor + - test_locrian_mode + +### 106. scrabble-score + +- **Time taken:** 28.56 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 12/12 (100.00%) + +### 104. say + +- **Time taken:** 52.08 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 39/58 (67.24%) +- **Failed tests:** + - test_a_big_number + - test_one_billion + - test_one_million + - test_one_million_two_thousand_three_hundred_forty_five + - test_one_thousand + - test_one_thousand_two_hundred_thirty_four + +### 108. series + +- **Time taken:** 24.54 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 12/12 (100.00%) + +### 110. sieve + +- **Time taken:** 25.41 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 6/6 (100.00%) + +### 111. simple-cipher + +- **Time taken:** 32.45 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/13 (100.00%) + +### 109. 
sgf-parsing + +- **Time taken:** 46.63 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 112/167 (67.07%) +- **Failed tests:** + - test_escaped_backslash_in_property_value_becomes_just_a_backslash + - test_escaped_closing_bracket_within_property_value_becomes_just_a_closing_bracket + - test_escaped_newline_in_property_value_is_converted_to_nothing_at_all + - test_escaped_t_and_n_in_property_value_are_just_letters_not_whitespace + - test_escaped_tab_in_property_value_is_converted_to_space + - test_mixing_various_kinds_of_whitespace_and_escaped_characters_in_property_value + - test_multiple_properties + - test_multiple_property_values + - test_node_without_properties + - test_opening_bracket_within_property_value_doesn_t_need_to_be_escaped + - test_parentheses_in_property_value_don_t_need_to_be_escaped + - test_properties_without_delimiter + - test_semicolon_in_property_value_doesn_t_need_to_be_escaped + - test_single_node_tree + - test_two_child_trees + - test_two_nodes + - test_within_property_values_newlines_remain_as_newlines + - test_within_property_values_whitespace_characters_such_as_tab_are_converted_to_spaces + +### 112. simple-linked-list + +- **Time taken:** 41.64 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 42/49 (85.71%) + +### 113. space-age + +- **Time taken:** 40.87 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 9/9 (100.00%) + +### 114. spiral-matrix + +- **Time taken:** 26.99 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 115. square-root + +- **Time taken:** 23.20 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 116. 
strain + +- **Time taken:** 24.09 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 9/9 (100.00%) + +### 118. sum-of-multiples + +- **Time taken:** 23.94 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 17/17 (100.00%) + +### 117. sublist + +- **Time taken:** 27.67 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 56/90 (62.22%) +- **Failed tests:** + - test_at_end_of_superlist + - test_at_start_of_superlist + - test_consecutive + - test_empty_list_within_non_empty_list + - test_false_start + - test_in_middle_of_superlist + - test_large_lists + - test_non_empty_list_contains_empty_list + - test_sublist_at_end + - test_sublist_at_start + - test_sublist_in_middle + +### 120. transpose + +- **Time taken:** 24.31 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 50/60 (83.33%) +- **Failed tests:** + - test_first_line_longer_than_second_line + - test_jagged_triangle + - test_mixed_line_length + +### 119. tournament + +- **Time taken:** 39.91 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/13 (100.00%) + +### 123. trinary + +- **Time taken:** 24.34 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/8 (100.00%) + +### 122. triangle + +- **Time taken:** 27.26 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 87/121 (71.90%) + +### 121. tree-building + +- **Time taken:** 45.29 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 43/56 (76.79%) +- **Failed tests:** + - test_cycle_directly + - test_no_root_node + - test_non_continuous + - test_root_node_has_parent + +### 124. 
twelve-days + +- **Time taken:** 38.52 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 16/16 (100.00%) + +### 126. two-fer + +- **Time taken:** 17.10 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 4/4 (100.00%) + +### 127. variable-length-quantity + +- **Time taken:** 33.48 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 89/247 (36.03%) +- **Failed tests:** + - test_arbitrary_double_byte + - test_arbitrary_quadruple_byte + - test_arbitrary_quintuple_byte + - test_arbitrary_triple_byte + - test_four_bytes + - test_largest_double_byte + - test_largest_quadruple_byte + - test_largest_triple_byte + - test_many_multi_byte_values + - test_maximum_32_bit_integer + - test_maximum_32_bit_integer_input + - test_multiple_values + - test_smallest_double_byte + - test_smallest_quadruple_byte + - test_smallest_quintuple_byte + - test_smallest_triple_byte + - test_three_bytes + - test_two_bytes + - test_two_multi_byte_values + +### 125. two-bucket + +- **Time taken:** 48.48 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 25/46 (54.35%) +- **Failed tests:** + - test_measure_using_bucket_one_of_size_3_and_bucket_two_of_size_5_start_with_bucket_one + - test_measure_using_bucket_one_of_size_3_and_bucket_two_of_size_5_start_with_bucket_two + - test_measure_using_bucket_one_of_size_7_and_bucket_two_of_size_11_start_with_bucket_one + - test_measure_using_bucket_one_of_size_7_and_bucket_two_of_size_11_start_with_bucket_two + - test_with_the_same_buckets_but_a_different_goal_then_it_is_possible + +### 128. word-count + +- **Time taken:** 30.81 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 18/18 (100.00%) + +### 129. 
word-search + +- **Time taken:** 36.59 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 25/25 (100.00%) + +### 130. wordy + +- **Time taken:** 40.62 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 56/75 (74.67%) +- **Failed tests:** + - test_unknown_operation + +### 132. zebra-puzzle + +- **Time taken:** 20.37 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 3/3 (100.00%) + +### 131. yacht + +- **Time taken:** 48.20 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 30/30 (100.00%) + +### 133. zipper + +- **Time taken:** 47.62 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) + diff --git a/benchmark/utils.ts b/benchmark/utils.ts index 533a4e5..a5cacf0 100644 --- a/benchmark/utils.ts +++ b/benchmark/utils.ts @@ -49,7 +49,7 @@ export async function runCodeWhisper( const relInstructionsFile = path.relative(exerciseDir, instructionsFile); const planFlag = noPlan ? '--no-plan' : '--accept-plan'; - const cmd = `node /app/dist/cli/index.js task -t "Complete the following task" --description "Complete the task described in the instructions.md file by modifying the file ${relSolutionFile}. Ensure the solution passes the tests in ${relTestFile}." -i "Don't change the names of existing functions or classes, as they may be referenced from other code like unit tests, etc. Only use standard python libraries, don't suggest installing any packages. The test file that is provided is 100% correct and will pass if the solution is correct." 
--skip-files ${planFlag} --model "${model}" --path "${exerciseDir}" ${diffMode} -f "${relSolutionFile}" "${relTestFile}" "${relInstructionsFile}" --log-ai-interactions`; + const cmd = `node /app/dist/cli/index.js task -t "Complete the following task" --description "Complete the task described in the instructions.md file by modifying the file ${relSolutionFile}. Ensure the solution passes the tests in ${relTestFile}. These files are found directly in the project directory and are not in subdirectories. They are not in the src directory." -i "Don't change the names of existing functions or classes, as they may be referenced from other code like unit tests, etc. Only use standard python libraries, don't suggest installing any packages. The test file that is provided is 100% correct and will pass if the solution is correct." --skip-files ${planFlag} --model "${model}" --path "${exerciseDir}" ${diffMode} -f "${relSolutionFile}" "${relTestFile}" "${relInstructionsFile}" --log-ai-interactions`; const startTime = Date.now(); const { stdout } = await execAsync(cmd);