From a82619ed48a200ed56b27b71df10c857aa17c486 Mon Sep 17 00:00:00 2001 From: Gordon Mickel Date: Fri, 16 Aug 2024 10:57:08 +0200 Subject: [PATCH] feat: add benchmark tool (#95) * wip * chore: make files executable * feat: finalize benchmark implementation * add debug mode * docs: update documentation * fix benchmarking, add results * add reports for transparency * docs: update readme * docs: stress how good our results are * docs: stress how good our results are * docs: add deepseek-coder benchmark * docs: update README * docs: fix link * remove wrong report files * rerun benchmarks after fix and update results --- .dockerignore | 169 +++ .gitignore | 4 + README.md | 103 +- USAGE.md | 1 + benchmark/Dockerfile | 48 + benchmark/README.md | 82 ++ benchmark/benchmark.ts | 258 ++++ benchmark/docker_build.sh | 16 + benchmark/package.json | 37 + ...ark_report_claude_sonnet_diff_reference.md | 1176 +++++++++++++++ ...rk_report_deepseek-coder_diff_reference.md | 1277 +++++++++++++++++ ...report_gpt-4o-2024-08-06_diff_reference.md | 1226 ++++++++++++++++ benchmark/run_benchmark.sh | 95 ++ benchmark/tsconfig.json | 20 + benchmark/tsup.config.ts | 16 + benchmark/types.ts | 28 + benchmark/utils.ts | 250 ++++ package.json | 1 - pnpm-lock.yaml | 33 + pnpm-workspace.yaml | 4 +- src/ai/task-workflow.ts | 10 +- src/cli/index.ts | 5 + src/types/index.ts | 1 + 23 files changed, 4834 insertions(+), 26 deletions(-) create mode 100644 .dockerignore create mode 100644 benchmark/Dockerfile create mode 100644 benchmark/README.md create mode 100644 benchmark/benchmark.ts create mode 100755 benchmark/docker_build.sh create mode 100644 benchmark/package.json create mode 100644 benchmark/reports/benchmark_report_claude_sonnet_diff_reference.md create mode 100644 benchmark/reports/benchmark_report_deepseek-coder_diff_reference.md create mode 100644 benchmark/reports/benchmark_report_gpt-4o-2024-08-06_diff_reference.md create mode 100755 benchmark/run_benchmark.sh create mode 100644 
benchmark/tsconfig.json create mode 100644 benchmark/tsup.config.ts create mode 100644 benchmark/types.ts create mode 100644 benchmark/utils.ts diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..1004d41 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,169 @@ +# Test-related files +tests/fixtures/**/.gitignore +tests/**/*.log + +# Temporary files +*.tmp +*.temp + +### Node ### +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +lerna-debug.log* +.pnpm-debug.log* + + +# Diagnostic reports (https://nodejs.org/api/report.html) +report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json + +# Runtime data +pids +*.pid +*.seed +*.pid.lock + +# Directory for instrumented libs generated by jscoverage/JSCover +lib-cov + +# Coverage directory used by tools like istanbul +coverage +*.lcov + +# nyc test coverage +.nyc_output + +# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) +.grunt + +# Bower dependency directory (https://bower.io/) +bower_components + +# node-waf configuration +.lock-wscript + +# Compiled binary addons (https://nodejs.org/api/addons.html) +build/Release + +# Dependency directories +node_modules/ +jspm_packages/ + +# Snowpack dependency directory (https://snowpack.dev/) +web_modules/ + +# TypeScript cache +*.tsbuildinfo + +# Optional npm cache directory +.npm + +# Optional eslint cache +.eslintcache + +# Optional stylelint cache +.stylelintcache + +# Microbundle cache +.rpt2_cache/ +.rts2_cache_cjs/ +.rts2_cache_es/ +.rts2_cache_umd/ + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variable files +.env +.env.development.local +.env.test.local +.env.production.local +.env.local + +# parcel-bundler cache (https://parceljs.org/) +.cache +.parcel-cache + +# Next.js build output +.next +out + +# Nuxt.js build / generate output +.nuxt +dist + +# Gatsby files +.cache/ +# Comment in the public line in if your 
project uses Gatsby and not Next.js +# https://nextjs.org/blog/next-9-1#public-directory-support +# public + +# vuepress build output +.vuepress/dist + +# vuepress v2.x temp and cache directory +.temp + +# Docusaurus cache and generated files +.docusaurus + +# Serverless directories +.serverless/ + +# FuseBox cache +.fusebox/ + +# DynamoDB Local files +.dynamodb/ + +# TernJS port file +.tern-port + +# Stores VSCode versions used for testing VSCode extensions +.vscode-test + +# yarn v2 +.yarn/cache +.yarn/unplugged +.yarn/build-state.yml +.yarn/install-state.gz +.pnp.* + +### Node Patch ### +# Serverless Webpack directories +.webpack/ + +# Optional stylelint cache + +# SvelteKit build / generate output +.svelte-kit + +# OS generated files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Vim configurations +.vim + +todos.md +codewhisper.md +testing +ElPlan.md +ElPlanFilter.md +codewhisper-task-output.json +demotask.md +.codewhisper-task-cache.json diff --git a/.gitignore b/.gitignore index 1004d41..58b99ed 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,7 @@ ElPlanFilter.md codewhisper-task-output.json demotask.md .codewhisper-task-cache.json + +# benchmark reports +benchmark/reports/ +!benchmark/reports/*_reference.md diff --git a/README.md b/README.md index 09dac11..d16081c 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ AI-Powered End-to-End Task Implementation & blazingly fast Codebase-to-LLM Conte [Templates](#-templates) • [Configuration](#-configuration) • [API](#-api) • +[Benchmarking](#benchmarking) • [Contributing](#-contributing) • [Roadmap](#-roadmap) • [FAQ](#-faq) @@ -27,7 +28,7 @@ AI-Powered End-to-End Task Implementation & blazingly fast Codebase-to-LLM Conte CodeWhisper is a powerful tool that bridges the gap between your codebase and Large Language Models (LLMs). It serves two primary functions: -1. **AI-Powered End-to-End Task Implementation**: Tackle complex, codebase-spanning tasks with ease. 
CodeWhisper doesn't just suggest snippets; it plans, generates, and applies comprehensive code changes across your entire project, from backend logic to frontend integration. +1. **AI-Powered End-to-End Task Implementation**: Tackle complex, codebase-spanning tasks with ease. CodeWhisper doesn't just suggest snippets; it plans, generates, and applies comprehensive code changes across your entire project, from backend logic to frontend integration. CodeWhisper's generations are SOTA and outperform other AI-code generation tools in benchmarks. See [Benchmarking](#benchmarking) for more details. 2. **Precision-Guided Context Curation for LLMs**: Harness the power of human insight to feed AI exactly what it needs. Quickly transform carefully selected parts of your codebase into rich, relevant context for LLMs, ensuring more accurate and project-aligned results. @@ -111,26 +112,27 @@ While CodeWhisper excels at performing individual coding tasks and even large fe ## ✨ Key Features -| Feature | Description | -| ----------------------------------------------- | ----------------------------------------------------------------- | -| 🧠 AI-powered task planning and code generation | Leverage AI to plan and implement complex coding tasks | -| 🔄 Full git integration | Version control of AI-generated changes | -| 🔄 Diff-based code modifications | Handle larger edits within output token limits | -| 🌍 Support for various LLM providers | Compatible with Anthropic, OpenAI, Ollama and Groq | -| 🔐 Support for local models | Use local models via Ollama | -| 🚀 Blazingly fast code processing | Concurrent workers for improved performance | -| 🎯 Customizable file filtering and exclusion | Fine-tune which files to include in the context | -| 📊 Intelligent caching | Improved performance through smart caching | -| 🔧 Extensible template system | Interactive variable prompts for flexible output | -| 🖊️ Custom variables in templates | Support for single-line and multi-line custom variables | -| 
💾 Value caching | Quick template reuse with cached values | -| 🖥️ CLI and programmatic API | Use CodeWhisper in scripts or as a library | -| 🔒 Respect for .gitignore | Option to use custom include and exclude globs | -| 🌈 Full language support | Compatible with all text-based file types | -| 🤖 Interactive mode | Granular file selection and template customization | -| ⚡ Optimized for large repositories | Efficient processing of extensive codebases | -| 📝 Detailed logging | Log AI prompts, responses, and parsing results | -| 🔗 GitHub integration | Fetch and work with issues (see [Configuration](#-configuration)) | +| Feature | Description | +| ----------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| 🧠 AI-powered task planning and code generation | Leverage AI to plan and implement complex coding tasks | +| 🚀 SOTA generations | CodeWhisper's generations are SOTA and outperform other AI-code generation tools in benchmarks, even though it uses one-shot generation. See [Benchmarking](#benchmarking) for more details. 
| +| 🔄 Full git integration | Version control of AI-generated changes | +| 🔄 Diff-based code modifications | Handle larger edits within output token limits | +| 🌍 Support for various LLM providers | Compatible with Anthropic, OpenAI, Ollama and Groq | +| 🔐 Support for local models | Use local models via Ollama | +| 🚀 Blazingly fast code processing | Concurrent workers for improved performance | +| 🎯 Customizable file filtering and exclusion | Fine-tune which files to include in the context | +| 📊 Intelligent caching | Improved performance through smart caching | +| 🔧 Extensible template system | Interactive variable prompts for flexible output | +| 🖊️ Custom variables in templates | Support for single-line and multi-line custom variables | +| 💾 Value caching | Quick template reuse with cached values | +| 🖥️ CLI and programmatic API | Use CodeWhisper in scripts or as a library | +| 🔒 Respect for .gitignore | Option to use custom include and exclude globs | +| 🌈 Full language support | Compatible with all text-based file types | +| 🤖 Interactive mode | Granular file selection and template customization | +| ⚡ Optimized for large repositories | Efficient processing of extensive codebases | +| 📝 Detailed logging | Log AI prompts, responses, and parsing results | +| 🔗 GitHub integration | Fetch and work with issues (see [Configuration](#-configuration)) | ## 📺 Video @@ -220,6 +222,8 @@ This section is still under development. We are actively testing and evaluating \* Whole-file edit mode is generally more precise but may lead to issues with maximum output token length, potentially limiting the ability to process larger files or multiple files simultaneously. It can also result in incomplete outputs for very large files, with the model resorting to placeholders like "// other functions here" instead of providing full implementations. +For more details, see the [Benchmarking](#benchmarking) section. 
+ + #### Experimental Support - **Groq as a provider** @@ -386,6 +390,63 @@ For more detailed instructions on using the GitHub integration and other CodeWhi CodeWhisper can be used programmatically in your Node.js projects. For detailed API documentation and examples, please refer to [USAGE.md](USAGE.md). +## Benchmarking + +CodeWhisper includes a benchmarking tool to evaluate its performance on Exercism Python exercises. This tool allows you to assess the capabilities of different AI models and configurations. + +### Key Features + +- Docker-based execution for consistent environments +- Concurrent worker support for faster benchmarking +- Detailed Markdown reports with performance metrics +- Options to customize test runs (number of tests, planning mode, diff mode) + +### Usage + +1. Build the Docker image: + + ``` + ./benchmark/docker_build.sh + ``` + +2. Set up the appropriate API key as an environment variable. + +3. Run the benchmark: + ``` + ./benchmark/run_benchmark.sh --model <model> --workers <num_workers> --tests <num_tests> [options] + ``` + +### Output + +The benchmark generates a detailed Markdown report including: + +- Summary statistics (total time, cost, pass percentage) +- Per-exercise results (time, cost, mode, model, tests passed) + +Reports are saved in `benchmark/reports/` with timestamped filenames. + +### Results + +CodeWhisper's performance has been evaluated across different models using the Exercism Python exercises. 
Below is a summary of the benchmark results: + +| Model | Tests Passed | Time (s) | Cost ($) | Command | +| -------------------------- | ------------ | -------- | -------- | ------------------------------------------------------------------------------ | +| claude-3-5-sonnet-20240620 | 80.26% | 1619.49 | 3.4000 | `./benchmark/run_benchmark.sh --workers 5 --no-plan` | +| gpt-4o-2024-08-06 | 81.51% | 986.68 | 1.6800 | `./benchmark/run_benchmark.sh --workers 5 --no-plan --model gpt-4o-2024-08-06` | +| deepseek-coder | 76.98% | 5850.58 | 0.0000\* | `./benchmark/run_benchmark.sh --workers 5 --no-plan --model deepseek-coder` | + +\*The cost calculation was not working properly for this benchmark run. + +> **Note:** All benchmarks are one-shot only, unlike other benchmarks which use multiple generations that depend on the results of the test run. + +The full reports used to generate these results are available in the `benchmark/reports/` directory. + +These results provide insights into the efficiency and accuracy of different models when used with CodeWhisper. The "Tests Passed" percentage indicates the proportion of Exercism tests successfully completed, while the time and cost metrics offer a view of the resource requirements for each model. + +As we continue to run benchmarks with various models and configurations, this table will be updated to provide a comprehensive comparison, helping users make informed decisions about which model might best suit their needs. + +For full details on running benchmarks, interpreting results, and available options, please refer to the [Benchmark README](./benchmark/README.md). + ## 🤝 Contributing We welcome contributions to CodeWhisper! Please read our [CONTRIBUTING.md](CONTRIBUTING.md) for details on our code of conduct and the process for submitting pull requests. 
diff --git a/USAGE.md b/USAGE.md index 22ba699..719d8b1 100644 --- a/USAGE.md +++ b/USAGE.md @@ -61,6 +61,7 @@ codewhisper task [options] | `-g, --gitignore ` | Path to .gitignore file (default: .gitignore) | | `-f, --filter ` | File patterns to include (use glob patterns, e.g., "src/\*_/_.js") | | `-e, --exclude ` | File patterns to exclude (use glob patterns, e.g., "\*_/_.test.js") | +| `--skip-files` | Skip the file selection step and use the files provided by the --filter and --exclude options | | `-s, --suppress-comments` | Strip comments from the code | | `-l, --line-numbers` | Add line numbers to code blocks | | `-cw, --context-window ` | Specify the context window for the AI model. Only applicable for Ollama models. | diff --git a/benchmark/Dockerfile b/benchmark/Dockerfile new file mode 100644 index 0000000..b088165 --- /dev/null +++ b/benchmark/Dockerfile @@ -0,0 +1,48 @@ +FROM node:20 + +# Enable corepack for pnpm support +RUN corepack enable + +# Install Python, pip, and build essentials +RUN apt-get update && apt-get install -y \ + python3 \ + python3-pip \ + build-essential \ + git \ + && rm -rf /var/lib/apt/lists/* + +# Set up pnpm to use a specific store directory in the container +RUN echo "store-dir=/root/.pnpm-store" > /root/.npmrc + +# Set up working directory for the main project +WORKDIR /app + +# Copy the entire CodeWhisper project +COPY .. . 
+ +# Install dependencies for the main project +RUN pnpm install + +# Set NODE_ENV to development for the build process +ENV NODE_ENV=development + +# Build the main project +RUN pnpm run build + +# Change to the benchmark directory +WORKDIR /app/benchmark + +# Install dependencies for the benchmark +RUN pnpm install + +# Build the benchmark +RUN pnpm run build + +# Set environment variables back to production +ENV NODE_ENV=production + +# Set PATH to include CodeWhisper's dist directory +ENV PATH="/app/dist/cli:${PATH}" + +# Run benchmark +CMD ["node", "--unhandled-rejections=strict", "/app/benchmark/dist/benchmark.js"] diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..953e6f4 --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,82 @@ +# CodeWhisper Benchmark + +This benchmark tool is designed to evaluate the performance of CodeWhisper on Exercism Python exercises. + +## Please note + +- Running the full benchmark will use a significant amount of tokens. +- Too many concurrent workers is likely to cause rate limiting issues. + +## Results + +CodeWhisper's performance has been evaluated across different models using the Exercism Python exercises. Below is a summary of the benchmark results: + +| Model | Tests Passed | Time (s) | Cost ($) | Command | +| -------------------------- | ------------ | -------- | -------- | ------------------------------------------------------------------------------ | +| claude-3-5-sonnet-20240620 | 80.26% | 1619.49 | 3.4000 | `./benchmark/run_benchmark.sh --workers 5 --no-plan` | +| gpt-4o-2024-08-06 | 81.51% | 986.68 | 1.6800 | `./benchmark/run_benchmark.sh --workers 5 --no-plan --model gpt-4o-2024-08-06` | +| deepseek-coder | 76.98% | 5850.58 | 0.0000\* | `./benchmark/run_benchmark.sh --workers 5 --no-plan --model deepseek-coder` | + +\*The cost calculation was not working properly for this benchmark run. 
+ +> **Note:** All benchmarks are one-shot only, unlike other benchmarks which use multiple generations that depend on the results of the test run. + +The full reports used to generate these results are available in the `benchmark/reports/` directory. + +These results provide insights into the efficiency and accuracy of different models when used with CodeWhisper. The "Tests Passed" percentage indicates the proportion of Exercism tests successfully completed, while the time and cost metrics offer a view of the resource requirements for each model. + +As we continue to run benchmarks with various models and configurations, this table will be updated to provide a comprehensive comparison, helping users make informed decisions about which model might best suit their needs. + +## Usage + +1. Build the Docker image: + + ``` + ./benchmark/docker_build.sh + ``` + +2. Set up the appropriate API key as an environment variable based on the model you intend to use: + + - For Claude models: `export ANTHROPIC_API_KEY=your_anthropic_api_key` + - For GPT models: `export OPENAI_API_KEY=your_openai_api_key` + - For Groq models: `export GROQ_API_KEY=your_groq_api_key` + - For DeepSeek models: `export DEEPSEEK_API_KEY=your_deepseek_api_key` + +3. 
Run the benchmark: + + ``` + ./benchmark/run_benchmark.sh --model <model> --workers <num_workers> --tests <num_tests> [--no-plan] [--diff | --no-diff] + ``` + + Options: + + - `--model`: The AI model to use (default: claude-3-5-sonnet-20240620) + - `--workers`: Number of concurrent workers (default: 4) + - `--tests`: Number of tests to run (default: all tests) + - `--no-plan`: Disable the planning mode (default: false) + - `--diff`: Use the diff mode for AI-generated code modifications (overrides the model's default setting) + - `--no-diff`: Use the whole file edit mode for AI-generated code modifications (overrides the model's default setting) + +## Output + +The benchmark will generate a detailed Markdown report for each run, including: + +- Summary statistics (total time, total cost, percentage of passed tests) +- Detailed results for each exercise: + - Time taken + - Total cost (LLM API costs) + - Mode used (diff/whole) + - Model used + - Number of tests passed + - Any failed tests or errors encountered + +The report will be saved in the `benchmark/reports/` directory with a timestamp in the filename. + +A brief summary will also be displayed in the console after the benchmark completes. + +## Notes + +- Ensure that you've set the appropriate API key as an environment variable for the model you intend to use before running the benchmark. +- The benchmark runs in a Docker container to sandbox the execution and prevent changes to the host filesystem. +- Each model has a default setting for diff/whole file edit mode. The `--diff` and `--no-diff` options allow you to override this default behavior for the benchmark. +- You can run multiple benchmarks without overwriting previous results. Each run generates a new report file with a unique timestamp. 
diff --git a/benchmark/benchmark.ts b/benchmark/benchmark.ts
new file mode 100644
index 0000000..016cbb4
--- /dev/null
+++ b/benchmark/benchmark.ts
@@ -0,0 +1,319 @@
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import { setTimeout } from 'node:timers/promises';
+import ora from 'ora';
+import pLimit from 'p-limit';
+import type { BenchmarkResult, SummaryStats } from './types';
+import { cloneRepo, runExercise } from './utils';
+
+const DEBUG_MODE = process.env.DEBUG_MODE === 'true';
+const DEBUG_SKIP = DEBUG_MODE ? 113 : 0; // Skip the first 113 exercises in debug mode
+
+const EXERCISM_REPO = 'https://github.com/exercism/python.git';
+const REPO_DIR = '/tmp/exercism-python';
+const EXERCISES_DIR = path.join(REPO_DIR, 'exercises', 'practice');
+const REPORT_DIR = '/app/benchmark/reports';
+
+// Wall-clock budget for a single exercise before it is recorded as failed.
+const EXERCISE_TIMEOUT_MS = 60000;
+const TIMEOUT_MESSAGE = 'Exercise execution timed out';
+
+/**
+ * Formats `part / total` as a percentage with two decimals.
+ * Returns '0.00' when `total` is not positive so reports never show "NaN%".
+ */
+function formatPercentage(part: number, total: number): string {
+  return total > 0 ? ((part / total) * 100).toFixed(2) : '0.00';
+}
+
+/**
+ * Returns a uniformly shuffled copy of `items`; the input is not mutated.
+ * Used instead of `sort(() => Math.random() - 0.5)`, whose inconsistent
+ * comparator produces a biased, engine-dependent order.
+ */
+function shuffle<T>(items: readonly T[]): T[] {
+  const pool = [...items];
+  const shuffled: T[] = [];
+  while (pool.length > 0) {
+    const pick = Math.floor(Math.random() * pool.length);
+    shuffled.push(...pool.splice(pick, 1));
+  }
+  return shuffled;
+}
+
+/**
+ * Runs one exercise, bounded by EXERCISE_TIMEOUT_MS.
+ * Never rejects: a timeout or an error is converted into a failed
+ * BenchmarkResult so one exercise cannot abort the whole run.
+ */
+async function runExerciseWithTimeout(
+  exerciseDir: string,
+  model: string,
+  noPlan: boolean,
+  diffMode: string,
+): Promise<BenchmarkResult> {
+  const exerciseName = path.basename(exerciseDir);
+  const startedAt = Date.now();
+  const timeoutController = new AbortController();
+
+  try {
+    const raceResult = await Promise.race([
+      runExercise(exerciseDir, model, noPlan, diffMode),
+      setTimeout(EXERCISE_TIMEOUT_MS, TIMEOUT_MESSAGE, {
+        signal: timeoutController.signal,
+      }),
+    ]);
+    // The timer resolves with a string; anything else is the exercise result.
+    if (typeof raceResult === 'string') {
+      throw new Error(raceResult);
+    }
+    return raceResult;
+  } catch (error) {
+    console.error(`Error in exercise ${exerciseName}:`, error);
+    return {
+      exercise: exerciseName,
+      // Real elapsed time: about 60s for a timeout, less for an early error.
+      time_taken: (Date.now() - startedAt) / 1000,
+      total_cost: 0,
+      // NOTE(review): any non-empty DIFF_MODE is reported as 'diff'; confirm
+      // against the values run_benchmark.sh actually passes.
+      mode_used: diffMode ? 'diff' : 'whole',
+      model_used: model,
+      test_passed: false,
+      test_output: 'Exercise execution timed out or errored',
+      total_tests: 0,
+      passed_tests: 0,
+      failed_tests: [],
+      errors: [
+        error instanceof Error ? error.message : 'Unknown error occurred',
+      ],
+    };
+  } finally {
+    // Cancel the pending timer so finished exercises leave no 60s timers behind.
+    timeoutController.abort();
+  }
+}
+
+/**
+ * Benchmark entry point. Configuration is read from the environment:
+ * MODEL, CONCURRENT_WORKERS, NUM_TESTS, NO_PLAN, DIFF_MODE and DEBUG_MODE.
+ * Clones the Exercism Python track, runs the selected exercises with bounded
+ * concurrency and writes a Markdown report into REPORT_DIR.
+ * Always terminates the process: exit code 0 on success, 1 on failure.
+ */
+async function main(): Promise<void> {
+  console.log('Main function started');
+  console.log('Process ID:', process.pid);
+  console.log('Node version:', process.version);
+  console.log('Current working directory:', process.cwd());
+  console.log('Debug mode:', DEBUG_MODE ? 'ON' : 'OFF');
+
+  const spinner = ora('Starting benchmark');
+  let exitCode = 0;
+
+  try {
+    // `||` rather than `??` on purpose: an empty variable also gets the default.
+    const model = process.env.MODEL || 'claude-3-5-sonnet-20240620';
+    const concurrentWorkers = Number.parseInt(
+      process.env.CONCURRENT_WORKERS || '4',
+      10,
+    );
+    if (Number.isNaN(concurrentWorkers) || concurrentWorkers <= 0) {
+      throw new Error('Invalid number of concurrent workers specified');
+    }
+    const numTests = process.env.NUM_TESTS || 'all';
+    const noPlan = process.env.NO_PLAN === 'true';
+    const diffMode = process.env.DIFF_MODE || '';
+
+    console.log(
+      `Running benchmark with model: ${model}, workers: ${concurrentWorkers}, tests: ${numTests}, no-plan: ${noPlan}, diff-mode: ${diffMode || 'default'}`,
+    );
+
+    spinner.start();
+
+    // Clone Exercism repo
+    spinner.text = 'Cloning Exercism repository';
+    await cloneRepo(EXERCISM_REPO, REPO_DIR);
+
+    // Get list of exercises
+    let exercises = fs
+      .readdirSync(EXERCISES_DIR)
+      .map((dir) => path.join(EXERCISES_DIR, dir))
+      .sort();
+
+    if (DEBUG_MODE) {
+      console.log(`DEBUG: Skipping the first ${DEBUG_SKIP} exercises`);
+      exercises = exercises.slice(DEBUG_SKIP);
+    } else if (numTests !== 'all') {
+      const numTestsInt = Number.parseInt(numTests, 10);
+      if (Number.isNaN(numTestsInt) || numTestsInt <= 0) {
+        throw new Error('Invalid number of tests specified');
+      }
+      // Random sample of the requested size.
+      exercises = shuffle(exercises).slice(0, numTestsInt);
+    }
+
+    console.log(`Total exercises to run: ${exercises.length}`);
+
+    fs.mkdirSync(REPORT_DIR, { recursive: true });
+
+    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
+    const reportFileName = `benchmark_report_${timestamp}.md`;
+    const reportPath = path.join(REPORT_DIR, reportFileName);
+
+    console.log(`Benchmark report will be saved as ${reportFileName}`);
+
+    // Set up concurrent limit
+    const limit = pLimit(concurrentWorkers);
+
+    // Run exercises concurrently and write results incrementally
+    spinner.text = 'Running exercises';
+    const results = await Promise.all(
+      exercises.map((exerciseDir, index) =>
+        limit(async () => {
+          const exerciseName = path.basename(exerciseDir);
+          spinner.text = `Running exercise: ${exerciseName}`;
+          console.log(
+            `Starting exercise ${index + 1}/${exercises.length}: ${exerciseName}`,
+          );
+
+          const result = await runExerciseWithTimeout(
+            exerciseDir,
+            model,
+            noPlan,
+            diffMode,
+          );
+
+          // appendFileSync is synchronous, so entries written by concurrent
+          // workers cannot interleave; they appear in completion order.
+          writeResultToReport(result, reportPath, index);
+
+          console.log(
+            `Completed ${index + 1}/${exercises.length}: ${exerciseName}`,
+          );
+          return result;
+        }),
+      ),
+    );
+
+    spinner.succeed('Benchmark completed');
+
+    // Calculate summary
+    const summary: SummaryStats = {
+      totalTime: 0,
+      totalCost: 0,
+      passedTests: 0,
+      totalTests: 0,
+      totalPassedTests: 0,
+    };
+    for (const result of results) {
+      summary.totalTime += result.time_taken;
+      summary.totalCost += result.total_cost;
+      summary.passedTests += result.test_passed ? 1 : 0;
+      summary.totalTests += result.total_tests;
+      summary.totalPassedTests += result.passed_tests;
+    }
+
+    // Prepend the summary. The report file does not exist if nothing was run.
+    const summaryMarkdown = generateSummaryMarkdown(results, summary);
+    const detailedResults = fs.existsSync(reportPath)
+      ? fs.readFileSync(reportPath, 'utf8')
+      : '';
+    fs.writeFileSync(reportPath, summaryMarkdown + detailedResults);
+
+    console.log(`Benchmark report saved as ${reportFileName}`);
+
+    // Output summary to console
+    console.log('\nSummary:');
+    console.log(`Total time: ${summary.totalTime.toFixed(2)} seconds`);
+    console.log(`Total cost: $${summary.totalCost.toFixed(4)}`);
+    console.log(
+      `Passed exercises: ${summary.passedTests}/${results.length} (${formatPercentage(summary.passedTests, results.length)}%)`,
+    );
+    console.log(
+      `Total tests passed: ${summary.totalPassedTests}/${summary.totalTests} (${formatPercentage(summary.totalPassedTests, summary.totalTests)}%)`,
+    );
+
+    console.log('Benchmark process finished. Exiting.');
+  } catch (error) {
+    if (spinner.isSpinning) {
+      spinner.fail('Benchmark failed');
+    }
+    console.error('An error occurred during the benchmark:', error);
+    exitCode = 1;
+  } finally {
+    // Ensure the process exits, reporting failure through the exit code.
+    process.exit(exitCode);
+  }
+}
+
+/**
+ * Appends one exercise's result to the report file as a Markdown section.
+ * `index` is the exercise's zero-based position in the run list and is only
+ * used for numbering; sections are appended in completion order.
+ */
+function writeResultToReport(
+  result: BenchmarkResult,
+  reportPath: string,
+  index: number,
+): void {
+  let markdown = `### ${index + 1}. ${result.exercise}\n\n`;
+  markdown += `- **Time taken:** ${result.time_taken.toFixed(2)} seconds\n`;
+  markdown += `- **Cost:** $${result.total_cost.toFixed(4)}\n`;
+  markdown += `- **Mode used:** ${result.mode_used}\n`;
+  markdown += `- **Model used:** ${result.model_used}\n`;
+  markdown += `- **Tests passed:** ${result.passed_tests}/${result.total_tests} (${formatPercentage(result.passed_tests, result.total_tests)}%)\n`;
+
+  if (result.failed_tests.length > 0) {
+    markdown += '- **Failed tests:**\n';
+    for (const test of result.failed_tests) {
+      markdown += `  - ${test}\n`;
+    }
+  }
+
+  if (result.errors.length > 0) {
+    markdown += '- **Errors:**\n';
+    for (const error of result.errors) {
+      markdown += `  - ${error}\n`;
+    }
+  }
+
+  markdown += '\n';
+
+  fs.appendFileSync(reportPath, markdown);
+}
+
+/**
+ * Builds the head of the report: title, summary statistics and the
+ * "Detailed Results" heading under which the per-exercise sections follow.
+ * `results` supplies the exercise count; `summary` the aggregated totals.
+ */
+function generateSummaryMarkdown(
+  results: BenchmarkResult[],
+  summary: SummaryStats,
+): string {
+  let markdown = '# CodeWhisper Benchmark Report\n\n';
+
+  markdown += '## Summary\n\n';
+  markdown += `- **Total time:** ${summary.totalTime.toFixed(2)} seconds\n`;
+  markdown += `- **Total cost:** $${summary.totalCost.toFixed(4)}\n`;
+  markdown += `- **Passed exercises:** ${summary.passedTests}/${results.length} (${formatPercentage(summary.passedTests, results.length)}%)\n`;
+  markdown += `- **Total tests passed:** ${summary.totalPassedTests}/${summary.totalTests} (${formatPercentage(summary.totalPassedTests, summary.totalTests)}%)\n\n`;
+  // Emitted here, exactly once, because per-exercise sections are appended in
+  // completion order and no single section is guaranteed to be written first.
+  markdown += '## Detailed Results\n\n';
+
+  return markdown;
+}
+
+// Call main and handle any uncaught errors
+main().catch((error) => {
+  console.error('Unhandled error in main:', error);
+  process.exit(1);
+});
+
+process.on('unhandledRejection', (reason, promise) => {
+  console.error('Unhandled Rejection at:', promise, 'reason:', reason);
+  // Optionally exit the process
+  // process.exit(1);
+});
diff --git a/benchmark/docker_build.sh b/benchmark/docker_build.sh new file mode 100755 index 0000000..f1e4af3 --- /dev/null +++ b/benchmark/docker_build.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -e + +# Get the directory of the script +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +# Navigate to the parent directory of the script (main CodeWhisper project root) +cd "$SCRIPT_DIR/.." || exit + +echo "Building Docker image from $(pwd)" + +# Build the Docker image with verbose output +docker build --progress=plain -t codewhisper-benchmark -f "$SCRIPT_DIR/Dockerfile" . 2>&1 | tee build.log + +echo "Docker build completed. Check build.log for details." diff --git a/benchmark/package.json b/benchmark/package.json new file mode 100644 index 0000000..28ea77a --- /dev/null +++ b/benchmark/package.json @@ -0,0 +1,37 @@ +{ + "name": "codewhisper-benchmark", + "version": "1.0.0", + "description": "Benchmark tool for CodeWhisper", + "main": "dist/benchmark.js", + "type": "module", + "private": true, + "module": "./dist/benchmark.js", + "types": "./dist/benchmark.d.ts", + "typesVersions": { + "*": { + "*": ["./dist/*", "./dist/benchmark.d.ts"] + } + }, + "scripts": { + "prebuild": "pnpm run typecheck", + "build": "tsup", + "lint": "biome check .", + "lint:fix": "biome check . 
--write", + "start": "node dist/benchmark.js", + "typecheck": "tsc --noEmit" + }, + "dependencies": { + "ora": "8.0.1", + "p-limit": "^6.1.0", + "typescript": "5.5.4" + }, + "devDependencies": { + "tsup": "8.2.4", + "@types/node": "20.14.15" + }, + "packageManager": "pnpm@9.6.0", + "trustedDependencies": ["@biomejs/biome", "lefthook"], + "engines": { + "node": ">=20.0.0" + } +} diff --git a/benchmark/reports/benchmark_report_claude_sonnet_diff_reference.md b/benchmark/reports/benchmark_report_claude_sonnet_diff_reference.md new file mode 100644 index 0000000..4b24025 --- /dev/null +++ b/benchmark/reports/benchmark_report_claude_sonnet_diff_reference.md @@ -0,0 +1,1176 @@ +# CodeWhisper Benchmark Report + +## Summary + +- **Total time:** 1619.49 seconds +- **Total cost:** $3.4000 +- **Passed exercises:** 104/133 (78.20%) +- **Total tests passed:** 3595/4479 (80.26%) + +## Detailed Results + +### 2. acronym + +- **Time taken:** 9.26 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 13/17 (76.47%) +- **Failed tests:** + - test_underscore_emphasis + +### 1. accumulate + +- **Time taken:** 9.86 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 7/7 (100.00%) + +### 5. allergies + +- **Time taken:** 10.31 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 51/51 (100.00%) + +### 4. all-your-base + +- **Time taken:** 11.26 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 22/22 (100.00%) + +### 3. affine-cipher + +- **Time taken:** 12.68 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 17/17 (100.00%) + +### 7. 
anagram + +- **Time taken:** 9.14 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 19/19 (100.00%) + +### 8. armstrong-numbers + +- **Time taken:** 8.81 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 10/10 (100.00%) + +### 9. atbash-cipher + +- **Time taken:** 13.13 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 15/15 (100.00%) + +### 6. alphametics + +- **Time taken:** 12.43 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 10/10 (100.00%) + +### 12. binary + +- **Time taken:** 9.58 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 12/12 (100.00%) + +### 11. beer-song + +- **Time taken:** 11.63 seconds +- **Cost:** $0.0500 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 9/9 (100.00%) + +### 10. bank-account + +- **Time taken:** 18.61 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 17/17 (100.00%) + +### 13. binary-search + +- **Time taken:** 11.16 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 12/12 (100.00%) + +### 15. bob + +- **Time taken:** 9.38 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 26/26 (100.00%) + +### 14. binary-search-tree + +- **Time taken:** 14.94 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 11/11 (100.00%) + +### 16. 
book-store + +- **Time taken:** 14.12 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 170/282 (60.28%) + +### 17. bottle-song + +- **Time taken:** 14.65 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 8/8 (100.00%) + +### 18. bowling + +- **Time taken:** 14.42 seconds +- **Cost:** $0.0400 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 116/188 (61.70%) +- **Failed tests:** + - test_cannot_roll_after_bonus_roll_for_spare + - test_cannot_roll_if_game_already_has_ten_frames + +### 19. change + +- **Time taken:** 13.96 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 14/14 (100.00%) + +### 20. circular-buffer + +- **Time taken:** 13.31 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 31/46 (67.39%) +- **Failed tests:** + - test_each_item_may_only_be_read_once + - test_initial_clear_does_not_affect_wrapping_around + +### 21. clock + +- **Time taken:** 12.01 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 56/56 (100.00%) + +### 22. collatz-conjecture + +- **Time taken:** 12.65 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 7/7 (100.00%) + +### 23. complex-numbers + +- **Time taken:** 15.64 seconds +- **Cost:** $0.0400 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 44/44 (100.00%) + +### 24. connect + +- **Time taken:** 15.96 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 11/11 (100.00%) + +### 25. 
crypto-square + +- **Time taken:** 11.50 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 15/22 (68.18%) +- **Failed tests:** + - test_54_character_plaintext_results_in_7_chunks_the_last_two_with_trailing_spaces + - test_8_character_plaintext_results_in_3_chunks_the_last_one_with_a_trailing_space + +### 27. darts + +- **Time taken:** 11.48 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 14/14 (100.00%) + +### 26. custom-set + +- **Time taken:** 15.46 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 41/41 (100.00%) + +### 28. diamond + +- **Time taken:** 11.99 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 21/38 (55.26%) +- **Failed tests:** + - test_degenerate_case_with_no_row_containing_3_distinct_groups_of_spaces + - test_largest_possible_diamond + - test_smallest_non_degenerate_case_with_even_diamond_side_length + - test_smallest_non_degenerate_case_with_odd_diamond_side_length + +### 29. difference-of-squares + +- **Time taken:** 10.59 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 10/10 (100.00%) + +### 30. diffie-hellman + +- **Time taken:** 10.44 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 7/7 (100.00%) + +### 31. dnd-character + +- **Time taken:** 12.29 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 20/20 (100.00%) + +### 32. dominoes + +- **Time taken:** 14.33 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 14/14 (100.00%) + +### 34. 
eliuds-eggs + +- **Time taken:** 8.93 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 5/5 (100.00%) + +### 33. dot-dsl + +- **Time taken:** 12.94 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 13/13 (100.00%) + +### 36. etl + +- **Time taken:** 8.23 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 5/5 (100.00%) + +### 35. error-handling + +- **Time taken:** 12.34 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 6/6 (100.00%) + +### 37. flatten-array + +- **Time taken:** 8.52 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 12/12 (100.00%) + +### 40. gigasecond + +- **Time taken:** 7.77 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 6/6 (100.00%) + +### 38. food-chain + +- **Time taken:** 15.86 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 53/74 (71.62%) +- **Failed tests:** + - test_fly + - test_full_song + - test_multiple_verses + +### 42. grade-school + +- **Time taken:** 11.80 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 21/21 (100.00%) + +### 41. go-counting + +- **Time taken:** 16.82 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 12/12 (100.00%) + +### 39. 
forth + +- **Time taken:** 17.32 seconds +- **Cost:** $0.0400 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 102/149 (68.46%) +- **Failed tests:** + - test_user_defined_words_cannot_redefine_negative_numbers + - test_user_defined_words_cannot_redefine_non_negative_numbers + +### 43. grains + +- **Time taken:** 9.68 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 12/12 (100.00%) + +### 44. grep + +- **Time taken:** 11.10 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 26/26 (100.00%) + +### 45. hamming + +- **Time taken:** 8.89 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 10/10 (100.00%) + +### 47. hello-world + +- **Time taken:** 8.60 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 2/2 (100.00%) + +### 48. hexadecimal + +- **Time taken:** 10.49 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 11/11 (100.00%) + +### 46. hangman + +- **Time taken:** 14.29 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 8/8 (100.00%) + +### 49. high-scores + +- **Time taken:** 8.98 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 13/13 (100.00%) + +### 52. isogram + +- **Time taken:** 8.75 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 15/15 (100.00%) + +### 50. 
house + +- **Time taken:** 11.55 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 112/165 (67.88%) +- **Failed tests:** + - test_full_rhyme + - test_multiple_verses + - test_verse_10_the_rooster_that_crowed_in_the_morn + - test_verse_11_the_farmer_sowing_his_corn + - test_verse_12_the_horse_and_the_hound_and_the_horn + - test_verse_eight_the_man_all_tattered_and_torn + - test_verse_five_the_dog_that_worried + - test_verse_four_the_cat_that_killed + - test_verse_nine_the_priest_all_shaven_and_shorn + - test_verse_seven_the_maiden_all_forlorn + - test_verse_six_the_cow_with_the_crumpled_horn + - test_verse_three_the_rat_that_ate + - test_verse_two_the_malt_that_lay + +### 51. isbn-verifier + +- **Time taken:** 11.47 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 20/20 (100.00%) + +### 53. killer-sudoku-helper + +- **Time taken:** 10.98 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 14/14 (100.00%) + +### 54. kindergarten-garden + +- **Time taken:** 11.69 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 20/20 (100.00%) + +### 57. leap + +- **Time taken:** 9.59 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 10/10 (100.00%) + +### 56. largest-series-product + +- **Time taken:** 11.33 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 15/15 (100.00%) + +### 55. knapsack + +- **Time taken:** 13.48 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 8/8 (100.00%) + +### 60. 
list-ops + +- **Time taken:** 10.68 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 32/39 (82.05%) +- **Failed tests:** + - test_foldr_direction_dependent_function_applied_to_non_empty_list + - test_foldr_foldr_add_string + +### 61. luhn + +- **Time taken:** 10.16 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 24/24 (100.00%) + +### 59. linked-list + +- **Time taken:** 15.54 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 25/25 (100.00%) + +### 63. matching-brackets + +- **Time taken:** 10.14 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 21/21 (100.00%) + +### 64. matrix + +- **Time taken:** 9.32 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 9/9 (100.00%) + +### 62. markdown + +- **Time taken:** 21.03 seconds +- **Cost:** $0.0400 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 27/37 (72.97%) +- **Failed tests:** + - test_mixed_normal_italics_and_bold_text + - test_parsing_bold_text + - test_with_a_little_bit_of_everything + +### 65. meetup + +- **Time taken:** 16.66 seconds +- **Cost:** $0.0500 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 104/104 (100.00%) + +### 66. minesweeper + +- **Time taken:** 10.99 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 16/16 (100.00%) + +### 67. nth-prime + +- **Time taken:** 11.44 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 7/7 (100.00%) + +### 68. 
ocr-numbers + +- **Time taken:** 12.49 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 18/18 (100.00%) + +### 58. ledger + +- **Time taken:** 43.36 seconds +- **Cost:** $0.0800 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 110/145 (75.86%) +- **Failed tests:** + - test_american_negative_number_with_3_digits_before_decimal_point + - test_credit_and_debit + - test_dutch_locale + - test_dutch_locale_and_euros + - test_dutch_negative_number_with_3_digits_before_decimal_point + - test_euros + - test_final_order_tie_breaker_is_change + - test_multiple_entries_on_same_date_ordered_by_description + - test_one_entry + - test_overlong_description_is_truncated + +### 69. octal + +- **Time taken:** 10.85 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 11/11 (100.00%) + +### 72. pangram + +- **Time taken:** 8.02 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 13/13 (100.00%) + +### 71. palindrome-products + +- **Time taken:** 15.92 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 14/14 (100.00%) + +### 70. paasio + +- **Time taken:** 17.62 seconds +- **Cost:** $0.0500 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 32/36 (88.89%) + +### 73. pascals-triangle + +- **Time taken:** 11.53 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 15/17 (88.24%) + +### 74. perfect-numbers + +- **Time taken:** 9.06 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 14/14 (100.00%) + +### 75. 
phone-number + +- **Time taken:** 11.27 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 32/39 (82.05%) +- **Failed tests:** + - test_invalid_with_letters + - test_invalid_with_punctuations + +### 76. pig-latin + +- **Time taken:** 11.04 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 26/30 (86.67%) +- **Failed tests:** + - test_word_beginning_with_th + +### 77. point-mutations + +- **Time taken:** 10.67 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 9/9 (100.00%) + +### 80. prime-factors + +- **Time taken:** 10.13 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 13/13 (100.00%) + +### 79. pov + +- **Time taken:** 16.63 seconds +- **Cost:** $0.0400 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 58/73 (79.45%) +- **Failed tests:** + - test_can_find_path_from_nodes_other_than_x + - test_can_find_path_not_involving_root + - test_can_find_path_to_sibling + +### 82. proverb + +- **Time taken:** 9.18 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 9/9 (100.00%) + +### 81. protein-translation + +- **Time taken:** 11.32 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 27/27 (100.00%) + +### 78. poker + +- **Time taken:** 20.38 seconds +- **Cost:** $0.0400 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 128/173 (73.99%) +- **Failed tests:** + - test_aces_can_start_a_straight_a_2_3_4_5 + - test_aces_can_start_a_straight_flush_a_2_3_4_5 + +### 83. 
pythagorean-triplet + +- **Time taken:** 9.16 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 8/8 (100.00%) + +### 86. raindrops + +- **Time taken:** 7.79 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 19/19 (100.00%) + +### 84. queen-attack + +- **Time taken:** 12.10 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 15/15 (100.00%) + +### 85. rail-fence-cipher + +- **Time taken:** 12.77 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 7/7 (100.00%) + +### 87. rational-numbers + +- **Time taken:** 16.66 seconds +- **Cost:** $0.0400 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 44/44 (100.00%) + +### 90. resistor-color + +- **Time taken:** 7.35 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 5/5 (100.00%) + +### 88. react + +- **Time taken:** 13.96 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 26/34 (76.47%) +- **Failed tests:** + - test_callbacks_should_not_be_called_if_dependencies_change_but_output_value_doesn_t_change + +### 89. rectangles + +- **Time taken:** 12.13 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 15/15 (100.00%) + +### 93. resistor-color-trio + +- **Time taken:** 8.64 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 11/11 (100.00%) + +### 95. reverse-string + +- **Time taken:** 6.13 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 8/8 (100.00%) + +### 91. 
resistor-color-duo + +- **Time taken:** 16.32 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 8/8 (100.00%) + +### 92. resistor-color-expert + +- **Time taken:** 12.70 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 26/33 (78.79%) + +### 96. rna-transcription + +- **Time taken:** 8.45 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 7/7 (100.00%) + +### 94. rest-api + +- **Time taken:** 16.24 seconds +- **Cost:** $0.0400 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 10/10 (100.00%) + +### 99. roman-numerals + +- **Time taken:** 9.35 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 28/28 (100.00%) + +### 97. robot-name + +- **Time taken:** 13.42 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 5/5 (100.00%) + +### 98. robot-simulator + +- **Time taken:** 13.20 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 19/19 (100.00%) + +### 100. rotational-cipher + +- **Time taken:** 9.03 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 11/11 (100.00%) + +### 101. run-length-encoding + +- **Time taken:** 9.93 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 14/14 (100.00%) + +### 102. saddle-points + +- **Time taken:** 9.32 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 11/11 (100.00%) + +### 103. 
satellite + +- **Time taken:** 11.13 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 7/7 (100.00%) + +### 104. say + +- **Time taken:** 11.22 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 21/21 (100.00%) + +### 106. scrabble-score + +- **Time taken:** 10.57 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 12/12 (100.00%) + +### 105. scale-generator + +- **Time taken:** 13.65 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 34/72 (47.22%) +- **Failed tests:** + - test_chromatic_scale_with_sharps + - test_enigmatic + - test_major_scale_with_sharps + - test_octatonic + +### 107. secret-handshake + +- **Time taken:** 9.57 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 12/12 (100.00%) + +### 108. series + +- **Time taken:** 8.37 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 12/12 (100.00%) + +### 110. sieve + +- **Time taken:** 10.52 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 6/6 (100.00%) + +### 112. simple-linked-list + +- **Time taken:** 10.76 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 41/58 (70.69%) +- **Failed tests:** + - test_can_pop_from_non_empty_list + - test_non_empty_linked_list_to_list_is_list_with_all_elements + - test_non_empty_list_has_correct_head + - test_non_empty_list_traverse + - test_push_and_pop + +### 113. 
space-age + +- **Time taken:** 9.50 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 9/9 (100.00%) + +### 111. simple-cipher + +- **Time taken:** 12.41 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 13/13 (100.00%) + +### 109. sgf-parsing + +- **Time taken:** 18.99 seconds +- **Cost:** $0.0400 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 64/93 (68.82%) +- **Failed tests:** + - test_within_property_values_newlines_remain_as_newlines + +### 116. strain + +- **Time taken:** 7.95 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 9/9 (100.00%) + +### 118. sum-of-multiples + +- **Time taken:** 6.74 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 17/17 (100.00%) + +### 115. square-root + +- **Time taken:** 9.32 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 7/7 (100.00%) + +### 114. spiral-matrix + +- **Time taken:** 13.66 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 7/7 (100.00%) + +### 117. sublist + +- **Time taken:** 13.82 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 26/30 (86.67%) +- **Failed tests:** + - test_unique_return_values + +### 122. triangle + +- **Time taken:** 9.85 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 22/22 (100.00%) + +### 120. 
transpose + +- **Time taken:** 11.52 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 56/69 (81.16%) +- **Failed tests:** + - test_first_line_longer_than_second_line + - test_mixed_line_length + - test_second_line_longer_than_first_line + - test_single_line + +### 119. tournament + +- **Time taken:** 14.31 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 13/13 (100.00%) + +### 123. trinary + +- **Time taken:** 8.96 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 8/8 (100.00%) + +### 121. tree-building + +- **Time taken:** 17.73 seconds +- **Cost:** $0.0400 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 14/14 (100.00%) + +### 126. two-fer + +- **Time taken:** 7.06 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 4/4 (100.00%) + +### 124. twelve-days + +- **Time taken:** 11.38 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 16/16 (100.00%) + +### 127. variable-length-quantity + +- **Time taken:** 9.84 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 46/83 (55.42%) + +### 125. two-bucket + +- **Time taken:** 15.05 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 19/32 (59.38%) +- **Failed tests:** + - test_measure_using_bucket_one_of_size_3_and_bucket_two_of_size_5_start_with_bucket_two + - test_measure_using_bucket_one_of_size_7_and_bucket_two_of_size_11_start_with_bucket_two + - test_with_the_same_buckets_but_a_different_goal_then_it_is_possible + +### 128. 
word-count + +- **Time taken:** 10.04 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 18/18 (100.00%) + +### 129. word-search + +- **Time taken:** 16.07 seconds +- **Cost:** $0.0400 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 25/25 (100.00%) + +### 131. yacht + +- **Time taken:** 12.64 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 30/30 (100.00%) + +### 130. wordy + +- **Time taken:** 14.94 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 51/67 (76.12%) +- **Failed tests:** + - test_missing_operation + - test_non_math_question + - test_reject_postfix_notation + - test_reject_prefix_notation + - test_unknown_operation + +### 133. zipper + +- **Time taken:** 14.06 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 15/15 (100.00%) + +### 132. zebra-puzzle + +- **Time taken:** 19.41 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** claude-3-5-sonnet-20240620 +- **Tests passed:** 9/16 (56.25%) +- **Failed tests:** + - test_resident_who_drinks_water + - test_resident_who_owns_zebra diff --git a/benchmark/reports/benchmark_report_deepseek-coder_diff_reference.md b/benchmark/reports/benchmark_report_deepseek-coder_diff_reference.md new file mode 100644 index 0000000..fb521f4 --- /dev/null +++ b/benchmark/reports/benchmark_report_deepseek-coder_diff_reference.md @@ -0,0 +1,1277 @@ +# CodeWhisper Benchmark Report + +## Summary + +- **Total time:** 5850.58 seconds +- **Total cost:** $0.0000 +- **Passed exercises:** 88/133 (66.17%) +- **Total tests passed:** 3448/4479 (76.98%) + +## Detailed Results + +### 2. 
acronym + +- **Time taken:** 20.59 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/17 (76.47%) +- **Failed tests:** + - test_apostrophes + +### 1. accumulate + +- **Time taken:** 21.38 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 4. all-your-base + +- **Time taken:** 27.47 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 22/22 (100.00%) + +### 5. allergies + +- **Time taken:** 30.32 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 51/51 (100.00%) + +### 7. anagram + +- **Time taken:** 20.89 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 19/19 (100.00%) + +### 3. affine-cipher + +- **Time taken:** 42.63 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 17/17 (100.00%) + +### 8. armstrong-numbers + +- **Time taken:** 22.29 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 10/10 (100.00%) + +### 6. alphametics + +- **Time taken:** 25.86 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 10/10 (100.00%) + +### 9. atbash-cipher + +- **Time taken:** 26.69 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) + +### 12. binary + +- **Time taken:** 20.81 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 12/12 (100.00%) + +### 11. beer-song + +- **Time taken:** 30.09 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 9/9 (100.00%) + +### 13. 
binary-search + +- **Time taken:** 22.48 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 12/12 (100.00%) + +### 10. bank-account + +- **Time taken:** 36.56 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 17/17 (100.00%) + +### 14. binary-search-tree + +- **Time taken:** 34.44 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 11/11 (100.00%) + +### 15. bob + +- **Time taken:** 23.77 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 26/26 (100.00%) + +### 16. book-store + +- **Time taken:** 29.92 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 49/71 (69.01%) +- **Failed tests:** + - test_check_that_groups_of_four_are_created_properly_even_when_there_are_more_groups_of_three_than_groups_of_five + - test_four_groups_of_four_are_cheaper_than_two_groups_each_of_five_and_three + - test_one_group_of_one_and_two_plus_three_groups_of_four_is_cheaper_than_one_group_of_each_size + - test_shuffled_book_order + - test_two_groups_of_four_and_a_group_of_five + - test_two_groups_of_four_is_cheaper_than_group_of_five_plus_group_of_three + - test_two_groups_of_four_is_cheaper_than_groups_of_five_and_three + +### 17. bottle-song + +- **Time taken:** 27.97 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 59/88 (67.05%) +- **Failed tests:** + - test_all_verses + - test_first_generic_verse + - test_first_two_verses + - test_last_generic_verse + - test_last_three_verses + - test_verse_with_1_bottle + - test_verse_with_2_bottles + +### 19. 
change + +- **Time taken:** 26.76 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 29/50 (58.00%) +- **Failed tests:** + - test_another_possible_change_without_unit_coins_available + - test_change_with_lilliputian_coins + - test_large_target_values + - test_multiple_coin_change + - test_possible_change_without_unit_coins_available + +### 22. collatz-conjecture + +- **Time taken:** 21.39 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 18. bowling + +- **Time taken:** 49.52 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 270/357 (75.63%) +- **Failed tests:** + - test_two_rolls_in_a_frame_cannot_score_more_than_10_points + +### 21. clock + +- **Time taken:** 32.59 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 26/42 (61.90%) + +### 20. circular-buffer + +- **Time taken:** 49.51 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) + +### 25. crypto-square + +- **Time taken:** 24.30 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 45/58 (77.59%) + +### 24. connect + +- **Time taken:** 38.97 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 20/30 (66.67%) +- **Failed tests:** + - test_illegal_diagonal_does_not_make_a_winner + - test_o_wins_crossing_from_top_to_bottom + - test_only_edges_does_not_make_a_winner + +### 27. darts + +- **Time taken:** 19.66 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 14/14 (100.00%) + +### 23. 
complex-numbers + +- **Time taken:** 50.18 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 58/63 (92.06%) + +### 26. custom-set + +- **Time taken:** 42.11 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 41/41 (100.00%) + +### 28. diamond + +- **Time taken:** 27.74 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 6/6 (100.00%) + +### 30. diffie-hellman + +- **Time taken:** 26.22 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 29. difference-of-squares + +- **Time taken:** 29.80 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 10/10 (100.00%) + +### 31. dnd-character + +- **Time taken:** 28.32 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 20/20 (100.00%) + +### 32. dominoes + +- **Time taken:** 32.88 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 67/96 (69.79%) +- **Failed tests:** + - test_empty_input_empty_output + +### 34. eliuds-eggs + +- **Time taken:** 21.65 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 5/5 (100.00%) + +### 33. dot-dsl + +- **Time taken:** 32.03 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/13 (100.00%) + +### 36. etl + +- **Time taken:** 19.65 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 5/5 (100.00%) + +### 35. error-handling + +- **Time taken:** 37.27 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 6/6 (100.00%) + +### 37. 
flatten-array + +- **Time taken:** 21.13 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 12/12 (100.00%) + +### 40. gigasecond + +- **Time taken:** 15.36 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 6/6 (100.00%) + +### 38. food-chain + +- **Time taken:** 41.20 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 52/70 (74.29%) +- **Failed tests:** + - test_fly + - test_full_song + - test_multiple_verses + +### 43. grains + +- **Time taken:** 24.32 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 12/12 (100.00%) + +### 39. forth + +- **Time taken:** 50.52 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 62/75 (82.67%) +- **Failed tests:** + - test_user_defined_words_can_use_different_words_with_the_same_name + +### 42. grade-school + +- **Time taken:** 37.40 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 21/21 (100.00%) + +### 45. hamming + +- **Time taken:** 21.42 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 10/10 (100.00%) + +### 47. hello-world + +- **Time taken:** 14.43 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 2/2 (100.00%) + +### 41. go-counting + +- **Time taken:** 53.15 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/14 (57.14%) + +### 44. 
grep + +- **Time taken:** 37.59 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 66/83 (79.52%) +- **Failed tests:** + - test_multiple_files_one_match_print_file_names_flag + - test_multiple_files_several_matches_file_flag_takes_precedence_over_line_number_flag + - test_one_file_one_match_file_flag_takes_precedence_over_line_flag + - test_one_file_one_match_print_file_names_flag + +### 48. hexadecimal + +- **Time taken:** 23.06 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/18 (83.33%) + +### 46. hangman + +- **Time taken:** 37.73 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/8 (100.00%) + +### 49. high-scores + +- **Time taken:** 20.52 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/13 (100.00%) + +### 50. house + +- **Time taken:** 28.69 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) + +### 51. isbn-verifier + +- **Time taken:** 32.19 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 20/20 (100.00%) + +### 52. isogram + +- **Time taken:** 25.12 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) + +### 53. killer-sudoku-helper + +- **Time taken:** 27.20 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 14/14 (100.00%) + +### 54. kindergarten-garden + +- **Time taken:** 31.55 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 82/103 (79.61%) + +### 55. 
knapsack + +- **Time taken:** 25.27 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/8 (100.00%) + +### 57. leap + +- **Time taken:** 21.87 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 10/10 (100.00%) + +### 56. largest-series-product + +- **Time taken:** 27.49 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) + +### 60. list-ops + +- **Time taken:** 40.33 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 32/39 (82.05%) +- **Failed tests:** + - test_foldr_direction_dependent_function_applied_to_non_empty_list + - test_foldr_foldr_add_string + +### 61. luhn + +- **Time taken:** 31.26 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 24/24 (100.00%) + +### 59. linked-list + +- **Time taken:** 46.48 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 25/25 (100.00%) + +### 58. ledger + +- **Time taken:** 600.00 seconds +- **Cost:** $0.0000 +- **Mode used:** whole +- **Model used:** deepseek-coder +- **Tests passed:** 0/0 (0.00%) +- **Errors:** + - CodeWhisper execution timed out after 1 minutes + +### 64. matrix + +- **Time taken:** 22.56 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 9/9 (100.00%) + +### 63. matching-brackets + +- **Time taken:** 26.14 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 21/21 (100.00%) + +### 62. markdown + +- **Time taken:** 600.00 seconds +- **Cost:** $0.0000 +- **Mode used:** whole +- **Model used:** deepseek-coder +- **Tests passed:** 0/0 (0.00%) +- **Errors:** + - CodeWhisper execution timed out after 1 minutes + +### 66. 
minesweeper + +- **Time taken:** 32.49 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 16/16 (100.00%) + +### 67. nth-prime + +- **Time taken:** 27.93 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 65. meetup + +- **Time taken:** 51.63 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 251/305 (82.30%) +- **Failed tests:** + - test_nonexistent_fifth_friday_of_august_2022 + - test_nonexistent_fifth_monday_of_february_2022 + - test_nonexistent_fifth_thursday_of_may_2023 + +### 69. octal + +- **Time taken:** 25.77 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 11/11 (100.00%) + +### 68. ocr-numbers + +- **Time taken:** 34.65 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 60/103 (58.25%) +- **Failed tests:** + - test_garbled_numbers_in_a_string_are_replaced_with + - test_numbers_separated_by_empty_lines_are_recognized_lines_are_joined_by_commas + - test_recognizes_0 + - test_recognizes_1 + - test_recognizes_110101100 + - test_recognizes_2 + - test_recognizes_3 + - test_recognizes_4 + - test_recognizes_5 + - test_recognizes_6 + - test_recognizes_7 + - test_recognizes_8 + - test_recognizes_9 + - test_recognizes_string_of_decimal_numbers + +### 72. pangram + +- **Time taken:** 21.02 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/13 (100.00%) + +### 73. pascals-triangle + +- **Time taken:** 26.14 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 11/11 (100.00%) + +### 74. perfect-numbers + +- **Time taken:** 28.31 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 14/14 (100.00%) + +### 70. 
paasio + +- **Time taken:** 600.00 seconds +- **Cost:** $0.0000 +- **Mode used:** whole +- **Model used:** deepseek-coder +- **Tests passed:** 0/0 (0.00%) +- **Errors:** + - CodeWhisper execution timed out after 1 minutes + +### 77. point-mutations + +- **Time taken:** 17.56 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 9/9 (100.00%) + +### 71. palindrome-products + +- **Time taken:** 60.00 seconds +- **Cost:** $0.0000 +- **Mode used:** whole +- **Model used:** deepseek-coder +- **Tests passed:** 0/0 (0.00%) +- **Errors:** + - Exercise execution timed out + +### 75. phone-number + +- **Time taken:** 37.53 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 27/31 (87.10%) +- **Failed tests:** + - test_invalid_with_punctuations + +### 76. pig-latin + +- **Time taken:** 30.32 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 26/30 (86.67%) +- **Failed tests:** + - test_word_beginning_with_th + +### 80. prime-factors + +- **Time taken:** 23.19 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/13 (100.00%) + +### 82. proverb + +- **Time taken:** 23.05 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 9/9 (100.00%) + +### 78. 
poker + +- **Time taken:** 41.50 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 179/334 (53.59%) +- **Failed tests:** + - test_a_straight_beats_three_of_a_kind + - test_aces_can_start_a_straight_a_2_3_4_5 + - test_aces_can_start_a_straight_flush_a_2_3_4_5 + - test_aces_cannot_be_in_the_middle_of_a_straight_q_k_a_2_3 + - test_both_hands_have_a_flush_tie_goes_to_high_card_down_to_the_last_one_if_necessary + - test_both_hands_have_a_full_house_tie_goes_to_highest_ranked_triplet + - test_both_hands_have_a_straight_flush_tie_goes_to_highest_ranked_card + - test_both_hands_have_four_of_a_kind_tie_goes_to_high_quad + - test_both_hands_have_the_same_pair_high_card_wins + - test_both_hands_have_three_of_a_kind_tie_goes_to_highest_ranked_triplet + - test_both_hands_have_two_identically_ranked_pairs_tie_goes_to_remaining_card_kicker + - test_both_hands_have_two_pairs_highest_ranked_pair_wins + - test_both_hands_have_two_pairs_that_add_to_the_same_value_win_goes_to_highest_pair + - test_both_hands_have_two_pairs_with_the_same_highest_ranked_pair_tie_goes_to_low_pair + - test_both_hands_with_a_straight_tie_goes_to_highest_ranked_card + - test_even_though_an_ace_is_usually_high_a_5_high_straight_flush_is_the_lowest_scoring_straight_flush + - test_even_though_an_ace_is_usually_high_a_5_high_straight_is_the_lowest_scoring_straight + - test_flush_beats_a_straight + - test_four_of_a_kind_beats_a_full_house + - test_full_house_beats_a_flush + - test_highest_pair_wins + - test_multiple_hands_with_the_same_high_cards_tie_compares_next_highest_ranked_down_to_last_card + - test_one_pair_beats_high_card + - test_single_hand_always_wins + - test_three_of_a_kind_beats_two_pair + - test_two_pairs_beats_one_pair + - test_two_pairs_first_ranked_by_largest_pair + - test_winning_high_card_hand_also_has_the_lowest_card + - test_with_multiple_decks_both_hands_have_a_full_house_with_the_same_triplet_tie_goes_to_the_pair + - 
test_with_multiple_decks_both_hands_with_identical_four_of_a_kind_tie_determined_by_kicker + - test_with_multiple_decks_two_players_can_have_same_three_of_a_kind_ties_go_to_highest_remaining_cards + +### 81. protein-translation + +- **Time taken:** 32.12 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 27/27 (100.00%) + +### 79. pov + +- **Time taken:** 38.88 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 16/16 (100.00%) + +### 83. pythagorean-triplet + +- **Time taken:** 24.80 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/8 (100.00%) + +### 84. queen-attack + +- **Time taken:** 29.95 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) + +### 86. raindrops + +- **Time taken:** 21.19 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 19/19 (100.00%) + +### 85. rail-fence-cipher + +- **Time taken:** 36.04 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 90. resistor-color + +- **Time taken:** 25.60 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 5/5 (100.00%) + +### 87. rational-numbers + +- **Time taken:** 46.94 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 458/597 (76.72%) + +### 89. rectangles + +- **Time taken:** 30.42 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 21/28 (75.00%) +- **Failed tests:** + - test_corner_is_required_for_a_rectangle_to_be_complete + - test_large_input_with_many_rectangles + +### 88. 
react + +- **Time taken:** 39.28 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 51/64 (79.69%) + +### 91. resistor-color-duo + +- **Time taken:** 26.03 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/8 (100.00%) + +### 95. reverse-string + +- **Time taken:** 18.39 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/8 (100.00%) + +### 93. resistor-color-trio + +- **Time taken:** 30.05 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 11/11 (100.00%) + +### 96. rna-transcription + +- **Time taken:** 18.56 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 92. resistor-color-expert + +- **Time taken:** 40.71 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 30/37 (81.08%) +- **Failed tests:** + - test_green_brown_orange_and_grey + - test_red_black_red_and_green + +### 94. rest-api + +- **Time taken:** 42.74 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 55/74 (74.32%) + +### 97. robot-name + +- **Time taken:** 24.78 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 53/70 (75.71%) + +### 99. roman-numerals + +- **Time taken:** 27.36 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 28/28 (100.00%) + +### 100. rotational-cipher + +- **Time taken:** 23.00 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 11/11 (100.00%) + +### 98. 
robot-simulator + +- **Time taken:** 36.61 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 19/19 (100.00%) + +### 102. saddle-points + +- **Time taken:** 25.09 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/18 (83.33%) + +### 101. run-length-encoding + +- **Time taken:** 32.75 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 14/14 (100.00%) + +### 103. satellite + +- **Time taken:** 34.80 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 107. secret-handshake + +- **Time taken:** 24.07 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 44/83 (53.01%) +- **Failed tests:** + - test_all_possible_actions + - test_close_your_eyes_for_100 + - test_combine_two_actions + - test_double_blink_for_10 + - test_jump_for_1000 + - test_reverse_two_actions + - test_reversing_no_actions_still_gives_no_actions + - test_reversing_one_action_gives_the_same_action + - test_wink_for_1 + +### 105. scale-generator + +- **Time taken:** 39.67 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 26/43 (60.47%) +- **Failed tests:** + - test_harmonic_minor + - test_locrian_mode + +### 106. scrabble-score + +- **Time taken:** 28.56 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 12/12 (100.00%) + +### 104. say + +- **Time taken:** 52.08 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 39/58 (67.24%) +- **Failed tests:** + - test_a_big_number + - test_one_billion + - test_one_million + - test_one_million_two_thousand_three_hundred_forty_five + - test_one_thousand + - test_one_thousand_two_hundred_thirty_four + +### 108. 
series + +- **Time taken:** 24.54 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 12/12 (100.00%) + +### 110. sieve + +- **Time taken:** 25.41 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 6/6 (100.00%) + +### 111. simple-cipher + +- **Time taken:** 32.45 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/13 (100.00%) + +### 109. sgf-parsing + +- **Time taken:** 46.63 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 112/167 (67.07%) +- **Failed tests:** + - test_escaped_backslash_in_property_value_becomes_just_a_backslash + - test_escaped_closing_bracket_within_property_value_becomes_just_a_closing_bracket + - test_escaped_newline_in_property_value_is_converted_to_nothing_at_all + - test_escaped_t_and_n_in_property_value_are_just_letters_not_whitespace + - test_escaped_tab_in_property_value_is_converted_to_space + - test_mixing_various_kinds_of_whitespace_and_escaped_characters_in_property_value + - test_multiple_properties + - test_multiple_property_values + - test_node_without_properties + - test_opening_bracket_within_property_value_doesn_t_need_to_be_escaped + - test_parentheses_in_property_value_don_t_need_to_be_escaped + - test_properties_without_delimiter + - test_semicolon_in_property_value_doesn_t_need_to_be_escaped + - test_single_node_tree + - test_two_child_trees + - test_two_nodes + - test_within_property_values_newlines_remain_as_newlines + - test_within_property_values_whitespace_characters_such_as_tab_are_converted_to_spaces + +### 112. simple-linked-list + +- **Time taken:** 41.64 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 42/49 (85.71%) + +### 113. 
space-age + +- **Time taken:** 40.87 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 9/9 (100.00%) + +### 114. spiral-matrix + +- **Time taken:** 26.99 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 115. square-root + +- **Time taken:** 23.20 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 7/7 (100.00%) + +### 116. strain + +- **Time taken:** 24.09 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 9/9 (100.00%) + +### 118. sum-of-multiples + +- **Time taken:** 23.94 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 17/17 (100.00%) + +### 117. sublist + +- **Time taken:** 27.67 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 56/90 (62.22%) +- **Failed tests:** + - test_at_end_of_superlist + - test_at_start_of_superlist + - test_consecutive + - test_empty_list_within_non_empty_list + - test_false_start + - test_in_middle_of_superlist + - test_large_lists + - test_non_empty_list_contains_empty_list + - test_sublist_at_end + - test_sublist_at_start + - test_sublist_in_middle + +### 120. transpose + +- **Time taken:** 24.31 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 50/60 (83.33%) +- **Failed tests:** + - test_first_line_longer_than_second_line + - test_jagged_triangle + - test_mixed_line_length + +### 119. tournament + +- **Time taken:** 39.91 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 13/13 (100.00%) + +### 123. trinary + +- **Time taken:** 24.34 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 8/8 (100.00%) + +### 122. 
triangle + +- **Time taken:** 27.26 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 87/121 (71.90%) + +### 121. tree-building + +- **Time taken:** 45.29 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 43/56 (76.79%) +- **Failed tests:** + - test_cycle_directly + - test_no_root_node + - test_non_continuous + - test_root_node_has_parent + +### 124. twelve-days + +- **Time taken:** 38.52 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 16/16 (100.00%) + +### 126. two-fer + +- **Time taken:** 17.10 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 4/4 (100.00%) + +### 127. variable-length-quantity + +- **Time taken:** 33.48 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 89/247 (36.03%) +- **Failed tests:** + - test_arbitrary_double_byte + - test_arbitrary_quadruple_byte + - test_arbitrary_quintuple_byte + - test_arbitrary_triple_byte + - test_four_bytes + - test_largest_double_byte + - test_largest_quadruple_byte + - test_largest_triple_byte + - test_many_multi_byte_values + - test_maximum_32_bit_integer + - test_maximum_32_bit_integer_input + - test_multiple_values + - test_smallest_double_byte + - test_smallest_quadruple_byte + - test_smallest_quintuple_byte + - test_smallest_triple_byte + - test_three_bytes + - test_two_bytes + - test_two_multi_byte_values + +### 125. 
two-bucket + +- **Time taken:** 48.48 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 25/46 (54.35%) +- **Failed tests:** + - test_measure_using_bucket_one_of_size_3_and_bucket_two_of_size_5_start_with_bucket_one + - test_measure_using_bucket_one_of_size_3_and_bucket_two_of_size_5_start_with_bucket_two + - test_measure_using_bucket_one_of_size_7_and_bucket_two_of_size_11_start_with_bucket_one + - test_measure_using_bucket_one_of_size_7_and_bucket_two_of_size_11_start_with_bucket_two + - test_with_the_same_buckets_but_a_different_goal_then_it_is_possible + +### 128. word-count + +- **Time taken:** 30.81 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 18/18 (100.00%) + +### 129. word-search + +- **Time taken:** 36.59 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 25/25 (100.00%) + +### 130. wordy + +- **Time taken:** 40.62 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 56/75 (74.67%) +- **Failed tests:** + - test_unknown_operation + +### 132. zebra-puzzle + +- **Time taken:** 20.37 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 3/3 (100.00%) + +### 131. yacht + +- **Time taken:** 48.20 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 30/30 (100.00%) + +### 133. 
zipper + +- **Time taken:** 47.62 seconds +- **Cost:** $0.0000 +- **Mode used:** diff +- **Model used:** deepseek-coder +- **Tests passed:** 15/15 (100.00%) diff --git a/benchmark/reports/benchmark_report_gpt-4o-2024-08-06_diff_reference.md b/benchmark/reports/benchmark_report_gpt-4o-2024-08-06_diff_reference.md new file mode 100644 index 0000000..c7d953b --- /dev/null +++ b/benchmark/reports/benchmark_report_gpt-4o-2024-08-06_diff_reference.md @@ -0,0 +1,1226 @@ +# CodeWhisper Benchmark Report + +## Summary + +- **Total time:** 986.68 seconds +- **Total cost:** $1.6800 +- **Passed exercises:** 96/133 (72.18%) +- **Total tests passed:** 3651/4479 (81.51%) + +### 2. acronym + +- **Time taken:** 4.13 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 13/17 (76.47%) +- **Failed tests:** + - test_underscore_emphasis + +# CodeWhisper Benchmark Report + +## Detailed Results + +### 1. accumulate + +- **Time taken:** 5.38 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 7/7 (100.00%) + +### 5. allergies + +- **Time taken:** 7.10 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 51/51 (100.00%) + +### 3. affine-cipher + +- **Time taken:** 8.05 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 17/17 (100.00%) + +### 4. all-your-base + +- **Time taken:** 8.73 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 22/22 (100.00%) + +### 7. anagram + +- **Time taken:** 4.49 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 19/19 (100.00%) + +### 8. armstrong-numbers + +- **Time taken:** 4.72 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 10/10 (100.00%) + +### 9. 
atbash-cipher + +- **Time taken:** 5.68 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 15/15 (100.00%) + +### 10. bank-account + +- **Time taken:** 7.33 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 17/17 (100.00%) + +### 12. binary + +- **Time taken:** 5.76 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 12/12 (100.00%) + +### 11. beer-song + +- **Time taken:** 7.72 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 9/9 (100.00%) + +### 6. alphametics + +- **Time taken:** 6.35 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 10/10 (100.00%) + +### 13. binary-search + +- **Time taken:** 6.32 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 12/12 (100.00%) + +### 14. binary-search-tree + +- **Time taken:** 6.19 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 11/11 (100.00%) + +### 15. bob + +- **Time taken:** 4.88 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 26/26 (100.00%) + +### 16. book-store + +- **Time taken:** 5.93 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 29/36 (80.56%) +- **Failed tests:** + - test_one_group_of_one_and_two_plus_three_groups_of_four_is_cheaper_than_one_group_of_each_size + - test_two_groups_of_four_is_cheaper_than_groups_of_five_and_three + +### 17. 
bottle-song + +- **Time taken:** 7.89 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 59/88 (67.05%) +- **Failed tests:** + - test_all_verses + - test_first_generic_verse + - test_first_two_verses + - test_last_generic_verse + - test_last_three_verses + - test_verse_with_1_bottle + - test_verse_with_2_bottles + +### 19. change + +- **Time taken:** 7.50 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 29/50 (58.00%) +- **Failed tests:** + - test_another_possible_change_without_unit_coins_available + - test_change_with_lilliputian_coins + - test_large_target_values + - test_multiple_coin_change + - test_possible_change_without_unit_coins_available + +### 22. collatz-conjecture + +- **Time taken:** 4.60 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 7/7 (100.00%) + +### 21. clock + +- **Time taken:** 7.13 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 56/56 (100.00%) + +### 18. bowling + +- **Time taken:** 15.07 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 91/121 (75.21%) +- **Failed tests:** + - test_the_second_bonus_rolls_after_a_strike_in_the_last_frame_cannot_be_a_strike_if_the_first_one_is_not_a_strike + - test_two_bonus_rolls_after_a_strike_in_the_last_frame_cannot_score_more_than_10_points + - test_two_rolls_in_a_frame_cannot_score_more_than_10_points + +### 25. crypto-square + +- **Time taken:** 5.37 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 9/9 (100.00%) + +### 20. circular-buffer + +- **Time taken:** 16.31 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 15/15 (100.00%) + +### 24. 
connect + +- **Time taken:** 8.50 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 23/36 (63.89%) +- **Failed tests:** + - test_o_wins_crossing_from_top_to_bottom + - test_x_wins_crossing_from_left_to_right + - test_x_wins_using_a_convoluted_path + - test_x_wins_using_a_spiral_path + +### 23. complex-numbers + +- **Time taken:** 12.64 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 58/63 (92.06%) + +### 27. darts + +- **Time taken:** 4.51 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 14/14 (100.00%) + +### 26. custom-set + +- **Time taken:** 9.42 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 41/41 (100.00%) + +### 28. diamond + +- **Time taken:** 6.00 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 6/6 (100.00%) + +### 29. difference-of-squares + +- **Time taken:** 5.80 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 10/10 (100.00%) + +### 30. diffie-hellman + +- **Time taken:** 5.38 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 7/7 (100.00%) + +### 31. dnd-character + +- **Time taken:** 6.46 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 36/43 (83.72%) + +### 34. eliuds-eggs + +- **Time taken:** 3.91 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 5/5 (100.00%) + +### 32. dominoes + +- **Time taken:** 5.88 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 14/14 (100.00%) + +### 33. 
dot-dsl + +- **Time taken:** 8.89 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 13/13 (100.00%) + +### 37. flatten-array + +- **Time taken:** 4.24 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 12/12 (100.00%) + +### 36. etl + +- **Time taken:** 4.87 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 5/5 (100.00%) + +### 35. error-handling + +- **Time taken:** 6.94 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 6/6 (100.00%) + +### 40. gigasecond + +- **Time taken:** 4.02 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 6/6 (100.00%) + +### 38. food-chain + +- **Time taken:** 9.06 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 84/120 (70.00%) +- **Failed tests:** + - test_bird + - test_cat + - test_cow + - test_dog + - test_full_song + - test_goat + - test_multiple_verses + +### 42. grade-school + +- **Time taken:** 6.89 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 21/21 (100.00%) + +### 43. grains + +- **Time taken:** 4.47 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 12/12 (100.00%) + +### 39. forth + +- **Time taken:** 9.91 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 59/71 (83.10%) +- **Failed tests:** + - test_user_defined_words_can_use_different_words_with_the_same_name + +### 45. hamming + +- **Time taken:** 3.71 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 10/10 (100.00%) + +### 44. 
grep + +- **Time taken:** 7.34 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 26/26 (100.00%) + +### 47. hello-world + +- **Time taken:** 4.05 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 2/2 (100.00%) + +### 46. hangman + +- **Time taken:** 6.86 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 8/8 (100.00%) + +### 48. hexadecimal + +- **Time taken:** 4.49 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 11/11 (100.00%) + +### 41. go-counting + +- **Time taken:** 17.63 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 8/14 (57.14%) + +### 49. high-scores + +- **Time taken:** 6.09 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 53/65 (81.54%) + +### 52. isogram + +- **Time taken:** 5.26 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 15/15 (100.00%) + +### 53. killer-sudoku-helper + +- **Time taken:** 4.27 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 14/14 (100.00%) + +### 51. isbn-verifier + +- **Time taken:** 7.38 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 20/20 (100.00%) + +### 50. 
house + +- **Time taken:** 10.02 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 184/237 (77.64%) +- **Failed tests:** + - test_full_rhyme + - test_multiple_verses + - test_verse_10_the_rooster_that_crowed_in_the_morn + - test_verse_11_the_farmer_sowing_his_corn + - test_verse_12_the_horse_and_the_hound_and_the_horn + - test_verse_eight_the_man_all_tattered_and_torn + - test_verse_five_the_dog_that_worried + - test_verse_four_the_cat_that_killed + - test_verse_nine_the_priest_all_shaven_and_shorn + - test_verse_seven_the_maiden_all_forlorn + - test_verse_six_the_cow_with_the_crumpled_horn + - test_verse_three_the_rat_that_ate + - test_verse_two_the_malt_that_lay + +### 54. kindergarten-garden + +- **Time taken:** 4.99 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 20/20 (100.00%) + +### 55. knapsack + +- **Time taken:** 4.98 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 8/8 (100.00%) + +### 57. leap + +- **Time taken:** 3.75 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 10/10 (100.00%) + +### 56. largest-series-product + +- **Time taken:** 5.43 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 15/15 (100.00%) + +### 61. luhn + +- **Time taken:** 5.59 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 24/24 (100.00%) + +### 60. list-ops + +- **Time taken:** 6.33 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 25/25 (100.00%) + +### 59. linked-list + +- **Time taken:** 10.01 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 97/122 (79.51%) + +### 63. 
matching-brackets + +- **Time taken:** 4.48 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 21/21 (100.00%) + +### 64. matrix + +- **Time taken:** 4.91 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 9/9 (100.00%) + +### 67. nth-prime + +- **Time taken:** 4.77 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 7/7 (100.00%) + +### 66. minesweeper + +- **Time taken:** 6.79 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 16/16 (100.00%) + +### 58. ledger + +- **Time taken:** 20.05 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 24/40 (60.00%) + +### 62. markdown + +- **Time taken:** 17.12 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 18/18 (100.00%) + +### 65. meetup + +- **Time taken:** 11.83 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 212/251 (84.46%) + +### 69. octal + +- **Time taken:** 5.37 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 11/11 (100.00%) + +### 68. ocr-numbers + +- **Time taken:** 6.43 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 18/18 (100.00%) + +### 72. pangram + +- **Time taken:** 4.55 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 13/13 (100.00%) + +### 70. paasio + +- **Time taken:** 10.25 seconds +- **Cost:** $0.0300 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 227/304 (74.67%) + +### 73. 
pascals-triangle + +- **Time taken:** 5.81 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 15/17 (88.24%) + +### 74. perfect-numbers + +- **Time taken:** 5.52 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 14/14 (100.00%) + +### 75. phone-number + +- **Time taken:** 8.38 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 77/111 (69.37%) +- **Failed tests:** + - test_invalid_if_area_code_starts_with_0 + - test_invalid_if_area_code_starts_with_0_on_valid_11_digit_number + - test_invalid_if_area_code_starts_with_1 + - test_invalid_if_area_code_starts_with_1_on_valid_11_digit_number + - test_invalid_if_exchange_code_starts_with_0 + - test_invalid_if_exchange_code_starts_with_0_on_valid_11_digit_number + - test_invalid_if_exchange_code_starts_with_1 + - test_invalid_if_exchange_code_starts_with_1_on_valid_11_digit_number + - test_invalid_when_11_digits_does_not_start_with_a_1 + - test_invalid_with_letters + - test_invalid_with_punctuations + +### 77. point-mutations + +- **Time taken:** 4.72 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 9/9 (100.00%) + +### 76. pig-latin + +- **Time taken:** 5.21 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 23/23 (100.00%) + +### 71. palindrome-products + +- **Time taken:** 9.70 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 14/14 (100.00%) + +### 80. prime-factors + +- **Time taken:** 4.93 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 13/13 (100.00%) + +### 81. 
protein-translation + +- **Time taken:** 7.36 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 27/27 (100.00%) + +### 78. poker + +- **Time taken:** 10.97 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 134/229 (58.52%) +- **Failed tests:** + - test_aces_can_start_a_straight_a_2_3_4_5 + - test_both_hands_have_a_flush_tie_goes_to_high_card_down_to_the_last_one_if_necessary + - test_both_hands_have_a_full_house_tie_goes_to_highest_ranked_triplet + - test_both_hands_have_four_of_a_kind_tie_goes_to_high_quad + - test_both_hands_have_the_same_pair_high_card_wins + - test_both_hands_have_three_of_a_kind_tie_goes_to_highest_ranked_triplet + - test_both_hands_have_two_pairs_highest_ranked_pair_wins + - test_both_hands_have_two_pairs_that_add_to_the_same_value_win_goes_to_highest_pair + - test_both_hands_have_two_pairs_with_the_same_highest_ranked_pair_tie_goes_to_low_pair + - test_four_of_a_kind_beats_a_full_house + - test_full_house_beats_a_flush + - test_highest_pair_wins + - test_one_pair_beats_high_card + - test_three_of_a_kind_beats_two_pair + - test_winning_high_card_hand_also_has_the_lowest_card + - test_with_multiple_decks_two_players_can_have_same_three_of_a_kind_ties_go_to_highest_remaining_cards + +### 79. pov + +- **Time taken:** 8.62 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 45/73 (61.64%) +- **Failed tests:** + - test_can_find_path_from_nodes_other_than_x + - test_can_find_path_not_involving_root + - test_can_find_path_to_cousin + - test_can_find_path_to_parent + - test_can_find_path_to_sibling + - test_errors_if_source_does_not_exist + +### 82. proverb + +- **Time taken:** 5.37 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 9/9 (100.00%) + +### 86. 
raindrops + +- **Time taken:** 4.09 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 19/19 (100.00%) + +### 85. rail-fence-cipher + +- **Time taken:** 6.99 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 17/27 (62.96%) + +### 83. pythagorean-triplet + +- **Time taken:** 5.28 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 8/8 (100.00%) + +### 84. queen-attack + +- **Time taken:** 8.21 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 15/15 (100.00%) + +### 87. rational-numbers + +- **Time taken:** 11.40 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 44/44 (100.00%) + +### 90. resistor-color + +- **Time taken:** 5.13 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 5/5 (100.00%) + +### 88. react + +- **Time taken:** 8.44 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 30/37 (81.08%) + +### 91. resistor-color-duo + +- **Time taken:** 5.25 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 8/8 (100.00%) + +### 89. rectangles + +- **Time taken:** 7.30 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 15/15 (100.00%) + +### 95. reverse-string + +- **Time taken:** 3.14 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 8/8 (100.00%) + +### 96. rna-transcription + +- **Time taken:** 4.96 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 7/7 (100.00%) + +### 93. 
resistor-color-trio + +- **Time taken:** 6.78 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 11/11 (100.00%) + +### 94. rest-api + +- **Time taken:** 7.90 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 63/73 (86.30%) +- **Failed tests:** + - test_lender_owes_borrower + - test_lender_owes_borrower_less_than_new_loan + - test_lender_owes_borrower_same_as_new_loan + +### 92. resistor-color-expert + +- **Time taken:** 10.30 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 52/65 (80.00%) +- **Failed tests:** + - test_brown_red_orange_green_and_blue + - test_green_brown_orange_and_grey + - test_red_black_red_and_green + - test_violet_orange_red_and_grey + +### 97. robot-name + +- **Time taken:** 6.13 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 5/5 (100.00%) + +### 99. roman-numerals + +- **Time taken:** 4.98 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 28/28 (100.00%) + +### 98. robot-simulator + +- **Time taken:** 6.43 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 19/19 (100.00%) + +### 100. rotational-cipher + +- **Time taken:** 5.47 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 11/11 (100.00%) + +### 102. saddle-points + +- **Time taken:** 6.11 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 11/11 (100.00%) + +### 103. satellite + +- **Time taken:** 5.15 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 7/7 (100.00%) + +### 101. 
run-length-encoding + +- **Time taken:** 8.58 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 14/14 (100.00%) + +### 107. secret-handshake + +- **Time taken:** 4.29 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 12/12 (100.00%) + +### 105. scale-generator + +- **Time taken:** 7.41 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 34/69 (49.28%) +- **Failed tests:** + - test_enigmatic + - test_harmonic_minor + - test_locrian_mode + - test_pentatonic + +### 106. scrabble-score + +- **Time taken:** 5.24 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 12/12 (100.00%) + +### 104. say + +- **Time taken:** 8.74 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 21/21 (100.00%) + +### 108. series + +- **Time taken:** 6.27 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 17/21 (80.95%) +- **Failed tests:** + - test_empty_series_is_invalid + +### 111. simple-cipher + +- **Time taken:** 5.58 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 13/13 (100.00%) + +### 112. simple-linked-list + +- **Time taken:** 7.97 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 59/78 (75.64%) +- **Failed tests:** + - test_can_pop_from_non_empty_list + - test_non_empty_list_has_correct_head + - test_non_empty_list_traverse + - test_push_and_pop + +### 110. sieve + +- **Time taken:** 8.81 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 6/6 (100.00%) + +### 109. 
sgf-parsing + +- **Time taken:** 11.15 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 71/104 (68.27%) +- **Failed tests:** + - test_all_lowercase_property + - test_escaped_closing_bracket_within_property_value_becomes_just_a_closing_bracket + - test_mixing_various_kinds_of_whitespace_and_escaped_characters_in_property_value + - test_opening_bracket_within_property_value_doesn_t_need_to_be_escaped + - test_properties_without_delimiter + - test_semicolon_in_property_value_doesn_t_need_to_be_escaped + - test_two_nodes + - test_upper_and_lowercase_property + - test_within_property_values_whitespace_characters_such_as_tab_are_converted_to_spaces + +### 113. space-age + +- **Time taken:** 8.02 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 9/9 (100.00%) + +### 114. spiral-matrix + +- **Time taken:** 6.30 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 7/7 (100.00%) + +### 115. square-root + +- **Time taken:** 4.57 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 7/7 (100.00%) + +### 116. strain + +- **Time taken:** 4.90 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 9/9 (100.00%) + +### 118. sum-of-multiples + +- **Time taken:** 4.19 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 17/17 (100.00%) + +### 117. sublist + +- **Time taken:** 5.74 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 23/23 (100.00%) + +### 120. 
transpose + +- **Time taken:** 5.15 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 56/69 (81.16%) +- **Failed tests:** + - test_first_line_longer_than_second_line + - test_mixed_line_length + - test_second_line_longer_than_first_line + - test_single_line + +### 122. triangle + +- **Time taken:** 6.29 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 22/22 (100.00%) + +### 119. tournament + +- **Time taken:** 10.52 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 13/13 (100.00%) + +### 123. trinary + +- **Time taken:** 6.51 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 8/8 (100.00%) + +### 121. tree-building + +- **Time taken:** 10.11 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 27/34 (79.41%) +- **Failed tests:** + - test_no_root_node + - test_non_continuous + +### 124. twelve-days + +- **Time taken:** 5.98 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 112/195 (57.44%) +- **Failed tests:** + - test_eighth_day_eight_maids_a_milking + - test_eleventh_day_eleven_pipers_piping + - test_fifth_day_five_gold_rings + - test_fourth_day_four_calling_birds + - test_ninth_day_nine_ladies_dancing + - test_second_day_two_turtle_doves + - test_seventh_day_seven_swans_a_swimming + - test_sixth_day_six_geese_a_laying + - test_tenth_day_ten_lords_a_leaping + - test_third_day_three_french_hens + - test_twelfth_day_twelve_drummers_drumming + +### 126. two-fer + +- **Time taken:** 4.19 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 4/4 (100.00%) + +### 128. 
word-count + +- **Time taken:** 4.57 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 21/25 (84.00%) +- **Failed tests:** + - test_non_alphanumeric + +### 127. variable-length-quantity + +- **Time taken:** 7.02 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 30/34 (88.24%) +- **Failed tests:** + - test_incomplete_sequence_causes_error_even_if_value_is_zero + +### 129. word-search + +- **Time taken:** 6.72 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 25/25 (100.00%) + +### 130. wordy + +- **Time taken:** 6.54 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 84/115 (73.04%) +- **Failed tests:** + - test_reject_problem_with_no_operands_or_operators + - test_unknown_operation + +### 131. yacht + +- **Time taken:** 10.06 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 30/30 (100.00%) + +### 133. zipper + +- **Time taken:** 10.58 seconds +- **Cost:** $0.0200 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 15/15 (100.00%) + +### 132. zebra-puzzle + +- **Time taken:** 13.48 seconds +- **Cost:** $0.0100 +- **Mode used:** diff +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 6/10 (60.00%) +- **Failed tests:** + - test_resident_who_drinks_water + +### 125. 
two-bucket + +- **Time taken:** 60.00 seconds +- **Cost:** $0.0000 +- **Mode used:** whole +- **Model used:** gpt-4o-2024-08-06 +- **Tests passed:** 0/0 (0.00%) +- **Errors:** + - Exercise execution timed out diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh new file mode 100755 index 0000000..9166336 --- /dev/null +++ b/benchmark/run_benchmark.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# Get the directory where the script is located +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +# Default values +MODEL="claude-3-5-sonnet-20240620" +CONCURRENT_WORKERS=4 +NUM_TESTS="all" # Changed default to "all" +NO_PLAN=false +DIFF_MODE="" + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + --model) + MODEL="$2" + shift + shift + ;; + --workers) + CONCURRENT_WORKERS="$2" + shift + shift + ;; + --tests) + NUM_TESTS="$2" + shift + shift + ;; + --no-plan) + NO_PLAN=true + shift + ;; + --diff) + DIFF_MODE="--diff" + shift + ;; + --no-diff) + DIFF_MODE="--no-diff" + shift + ;; + --debug) + DEBUG_MODE=true + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Determine which API key to use based on the model +if [[ $MODEL == claude* ]]; then + API_KEY=$ANTHROPIC_API_KEY +elif [[ $MODEL == gpt* ]]; then + API_KEY=$OPENAI_API_KEY +elif [[ $MODEL == groq* ]]; then + API_KEY=$GROQ_API_KEY +elif [[ $MODEL == deepseek* ]]; then + API_KEY=$DEEPSEEK_API_KEY +else + echo "Unknown model type. Please ensure you've set the correct API key." + exit 1 +fi + +# Check if the required API key is set +if [ -z "$API_KEY" ]; then + echo "Error: API key for the selected model is not set." 
+ exit 1 +fi + +# Get the directory where the script is located +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + + +# Create a directory for reports if it doesn't exist +mkdir -p "$SCRIPT_DIR/reports" + +# Run the Docker container +docker run -it --rm \ + -e ANTHROPIC_API_KEY=$API_KEY \ + -e OPENAI_API_KEY=$API_KEY \ + -e GROQ_API_KEY=$API_KEY \ + -e DEEPSEEK_API_KEY=$API_KEY \ + -e MODEL=$MODEL \ + -e CONCURRENT_WORKERS=$CONCURRENT_WORKERS \ + -e NUM_TESTS=$NUM_TESTS \ + -e NO_PLAN=$NO_PLAN \ + -e DIFF_MODE=$DIFF_MODE \ + -e DEBUG_MODE=$DEBUG_MODE \ + -v "$SCRIPT_DIR/reports":/app/benchmark/reports \ + codewhisper-benchmark diff --git a/benchmark/tsconfig.json b/benchmark/tsconfig.json new file mode 100644 index 0000000..0c139fe --- /dev/null +++ b/benchmark/tsconfig.json @@ -0,0 +1,20 @@ +{ + "compilerOptions": { + "allowJs": true, + "target": "ESNext", + "lib": ["ESNext"], + "module": "ESNext", + "moduleResolution": "Bundler", + "resolveJsonModule": true, + "strict": true, + "strictNullChecks": true, + "noEmit": true, + "esModuleInterop": true, + "skipDefaultLibCheck": true, + "skipLibCheck": true, + "sourceMap": true, + "forceConsistentCasingInFileNames": true + }, + "include": ["./*.ts"], + "exclude": ["node_modules"] +} diff --git a/benchmark/tsup.config.ts b/benchmark/tsup.config.ts new file mode 100644 index 0000000..1949f16 --- /dev/null +++ b/benchmark/tsup.config.ts @@ -0,0 +1,16 @@ +import { defineConfig } from 'tsup'; + +export default defineConfig({ + entry: { + benchmark: 'benchmark.ts', + }, + format: ['esm'], + splitting: false, + clean: true, + shims: true, + name: 'codewhisper-benchmark', + dts: true, + esbuildOptions: (options) => { + options.platform = 'node'; + }, +}); diff --git a/benchmark/types.ts b/benchmark/types.ts new file mode 100644 index 0000000..e1350ea --- /dev/null +++ b/benchmark/types.ts @@ -0,0 +1,28 @@ +export interface CodeWhisperResult { + output: string; + time: number; + totalCost: number; + 
modeUsed: 'diff' | 'whole'; +} + +export interface BenchmarkResult { + exercise: string; + time_taken: number; + total_cost: number; + mode_used: 'diff' | 'whole'; + model_used: string; + test_passed: boolean; + test_output: string; + total_tests: number; + passed_tests: number; + failed_tests: string[]; + errors: string[]; +} + +export type SummaryStats = { + totalTime: number; + totalCost: number; + passedTests: number; + totalTests: number; + totalPassedTests: number; +}; diff --git a/benchmark/utils.ts b/benchmark/utils.ts new file mode 100644 index 0000000..a5cacf0 --- /dev/null +++ b/benchmark/utils.ts @@ -0,0 +1,250 @@ +import { exec } from 'node:child_process'; +import * as fs from 'node:fs'; +import * as path from 'node:path'; +import { setTimeout } from 'node:timers/promises'; +import { promisify } from 'node:util'; +import type { BenchmarkResult, CodeWhisperResult } from './types'; + +const execAsync = promisify(exec); + +export async function cloneRepo( + repoUrl: string, + targetDir: string, +): Promise { + console.log(`Cloning repository into ${targetDir}`); + try { + // Remove the directory if it exists + if (fs.existsSync(targetDir)) { + fs.rmSync(targetDir, { recursive: true, force: true }); + console.log(`Removed existing directory: ${targetDir}`); + } + + // Clone the repository + await execAsync(`git clone --depth 1 ${repoUrl} ${targetDir}`); + console.log('Repository cloned successfully.'); + } catch (error) { + console.error('Error cloning repository:', error); + throw error; + } +} + +export async function runCodeWhisper( + exerciseDir: string, + model: string, + noPlan: boolean, + diffMode: string, +): Promise { + const configFile = path.join(exerciseDir, '.meta', 'config.json'); + const config = JSON.parse(fs.readFileSync(configFile, 'utf-8')); + + const solutionFile = path.join(exerciseDir, config.files.solution[0]); + const testFile = path.join(exerciseDir, config.files.test[0]); + // const introductionFile = path.join(exerciseDir, 
'.docs', 'introduction.md'); + const instructionsFile = path.join(exerciseDir, '.docs', 'instructions.md'); + + // Use relative paths for the -f option + const relSolutionFile = path.relative(exerciseDir, solutionFile); + const relTestFile = path.relative(exerciseDir, testFile); + // const relIntroductionFile = path.relative(exerciseDir, introductionFile); + const relInstructionsFile = path.relative(exerciseDir, instructionsFile); + + const planFlag = noPlan ? '--no-plan' : '--accept-plan'; + const cmd = `node /app/dist/cli/index.js task -t "Complete the following task" --description "Complete the task described in the instructions.md file by modifying the file ${relSolutionFile}. Ensure the solution passes the tests in ${relTestFile}. These files are found directly in the project directory and are not in subdirectories. They are not in the src directory." -i "Don't change the names of existing functions or classes, as they may be referenced from other code like unit tests, etc. Only use standard python libraries, don't suggest installing any packages. The test file that is provided is 100% correct and will pass if the solution is correct." 
--skip-files ${planFlag} --model "${model}" --path "${exerciseDir}" ${diffMode} -f "${relSolutionFile}" "${relTestFile}" "${relInstructionsFile}" --log-ai-interactions`; + + const startTime = Date.now(); + const { stdout } = await execAsync(cmd); + const endTime = Date.now(); + + // Check if the solution file exists and has content + if ( + !fs.existsSync(solutionFile) || + fs.readFileSync(solutionFile, 'utf-8').trim() === '' + ) { + throw new Error( + `Solution file ${solutionFile} was not created or is empty`, + ); + } + + // Parse the total cost from the output + const costMatches = stdout.match(/Total cost so far: \$(\d+\.\d{2}) USD/g); + let totalCost = 0; + if (costMatches && costMatches.length > 0) { + const lastCostMatch = costMatches[costMatches.length - 1]; + totalCost = Number.parseFloat( + lastCostMatch.match(/\$(\d+\.\d{2})/)?.[1] ?? '0', + ); + } + + // Determine the mode used + const modeUsed = stdout.includes('diff') ? 'diff' : 'whole'; + + return { + output: stdout, + time: endTime - startTime, + totalCost, + modeUsed, + }; +} + +export async function runTests(testFile: string): Promise<{ + passed: boolean; + output: string; + total_tests: number; + passed_tests: number; + failed_tests: string[]; +}> { + const testFileName = path.basename(testFile); + console.log(`Running tests for ${testFileName}`); + + try { + const testDir = path.dirname(testFile); + const { stdout, stderr } = await execAsync( + `python3 -m unittest ${testFileName}`, + { cwd: testDir }, + ); + const output = stdout + stderr; + return parseTestOutput(output, testFileName); + } catch (error) { + console.error(`Error running tests for ${testFileName}:`, error); + if (error instanceof Error && 'stdout' in error && 'stderr' in error) { + const output = (error.stdout as string) + (error.stderr as string); + return parseTestOutput(output, testFileName); + } + return { + passed: false, + output: error instanceof Error ? 
error.message : 'Unknown error occurred', + total_tests: 0, + passed_tests: 0, + failed_tests: ['Error running tests'], + }; + } +} + +function parseTestOutput( + output: string, + testFileName: string, +): { + passed: boolean; + output: string; + total_tests: number; + passed_tests: number; + failed_tests: string[]; +} { + // Count dots for passed tests and F's for failed tests + const passedCount = (output.match(/\./g) || []).length; + const failedCount = (output.match(/F/g) || []).length; + const totalTests = passedCount + failedCount; + + // Extract failed test names + const failedTests = (output.match(/FAIL: (test_\w+)/g) || []).map( + (match) => match.split(': ')[1], + ); + + const passed = failedCount === 0; + + console.log( + `Tests completed for ${testFileName}. Total: ${totalTests}, Passed: ${passedCount}, Failed: ${failedCount}`, + ); + + return { + passed, + output, + total_tests: totalTests, + passed_tests: passedCount, + failed_tests: failedTests, + }; +} + +export async function runExercise( + exerciseDir: string, + model: string, + noPlan: boolean, + diffMode: string, +): Promise { + const exerciseName = path.basename(exerciseDir); + console.log(`Starting exercise: ${exerciseName}`); + + try { + // Read the config file to get the correct test file name + const configFile = path.join(exerciseDir, '.meta', 'config.json'); + const config = JSON.parse(fs.readFileSync(configFile, 'utf-8')); + const testFileName = config.files.test[0]; // Get the first test file name + + // Run CodeWhisper with a timeout + const codewhisperPromise = runCodeWhisper( + exerciseDir, + model, + noPlan, + diffMode, + ); + const timeoutPromise = setTimeout(60000, 'CodeWhisper execution timed out'); + + const codewhisperResult = await Promise.race([ + codewhisperPromise, + timeoutPromise, + ]); + + if (codewhisperResult === 'CodeWhisper execution timed out') { + console.log( + `CodeWhisper execution for ${exerciseName} timed out after 1 minute`, + ); + return { + exercise: 
/**
 * Runs one benchmark exercise end to end: lets CodeWhisper attempt the
 * solution under a time limit, runs the exercise's tests and reports the
 * metrics.
 *
 * This function does not throw; any failure is reported through the `errors`
 * field of the returned record with `test_passed: false`.
 *
 * @param exerciseDir - Exercise directory; must contain `.meta/config.json`
 *   whose `files.test[0]` names the test file relative to this directory.
 * @param model - Model identifier, forwarded to `runCodeWhisper` and echoed
 *   back as `model_used`.
 * @param noPlan - Forwarded unchanged to `runCodeWhisper`.
 * @param diffMode - Forwarded unchanged to `runCodeWhisper`; when
 *   CodeWhisper's own result is unavailable, any non-empty value is reported
 *   as mode `'diff'` and an empty one as `'whole'`.
 * @returns The result record for this exercise; `time_taken` is in seconds.
 */
// NOTE(review): the return type's generic argument is not visible in this
// view of the file (it reads `Promise`), so the signature is left as seen.
export async function runExercise(
  exerciseDir: string,
  model: string,
  noPlan: boolean,
  diffMode: string,
): Promise {
  const exerciseName = path.basename(exerciseDir);
  console.log(`Starting exercise: ${exerciseName}`);

  const timeoutMs = 60000;
  const timeoutMessage = 'CodeWhisper execution timed out';
  // Lets the `finally` below cancel the timer once the race is decided.
  const timeoutController = new AbortController();

  try {
    // Read the config file to get the correct test file name
    const configFile = path.join(exerciseDir, '.meta', 'config.json');
    const config = JSON.parse(fs.readFileSync(configFile, 'utf-8'));
    const testFileName = config.files.test[0]; // Get the first test file name

    // Run CodeWhisper with a timeout. The race only stops *waiting* for the
    // run; nothing here terminates it once the timeout wins.
    const codewhisperPromise = runCodeWhisper(
      exerciseDir,
      model,
      noPlan,
      diffMode,
    );
    const timeoutPromise = setTimeout(timeoutMs, timeoutMessage, {
      signal: timeoutController.signal,
    });

    const codewhisperResult = await Promise.race([
      codewhisperPromise,
      timeoutPromise,
    ]);

    if (codewhisperResult === timeoutMessage) {
      console.log(
        `CodeWhisper execution for ${exerciseName} timed out after 1 minute`,
      );
      return {
        exercise: exerciseName,
        // Seconds, like every other path; this is the time actually waited.
        time_taken: timeoutMs / 1000,
        total_cost: 0,
        mode_used: diffMode ? 'diff' : 'whole',
        model_used: model,
        test_passed: false,
        test_output: 'CodeWhisper execution timed out',
        total_tests: 0,
        passed_tests: 0,
        failed_tests: [],
        errors: ['CodeWhisper execution timed out after 1 minutes'],
      };
    }

    console.log(
      `CodeWhisper execution for ${exerciseName} completed. Running tests.`,
    );

    // Run tests
    const testFile = path.join(exerciseDir, testFileName);
    const testResult = await runTests(testFile);

    console.log(
      `Tests for ${exerciseName} completed. Result: ${testResult.passed ? 'PASSED' : 'FAILED'}`,
    );

    // Calculate metrics
    return {
      exercise: exerciseName,
      time_taken: (codewhisperResult as CodeWhisperResult).time / 1000, // Convert to seconds
      total_cost: (codewhisperResult as CodeWhisperResult).totalCost,
      mode_used: (codewhisperResult as CodeWhisperResult).modeUsed,
      model_used: model,
      test_passed: testResult.passed,
      test_output: testResult.output,
      total_tests: testResult.total_tests,
      passed_tests: testResult.passed_tests,
      failed_tests: testResult.failed_tests,
      errors: [],
    };
  } catch (error) {
    console.error(`Error in exercise ${exerciseName}:`, error);
    return {
      exercise: exerciseName,
      time_taken: 0,
      total_cost: 0,
      mode_used: diffMode ? 'diff' : 'whole',
      model_used: model,
      test_passed: false,
      test_output: '',
      total_tests: 0,
      passed_tests: 0,
      failed_tests: [],
      errors: [
        error instanceof Error ? error.message : 'Unknown error occurred',
      ],
    };
  } finally {
    // Cancel a still-pending timer so it does not keep the event loop alive
    // for the rest of the timeout. The resulting rejection is already
    // observed by the Promise.race above; this is a no-op if the timer fired.
    timeoutController.abort();
  }
}
error.message : 'Unknown error occurred', + ], + }; + } +} diff --git a/package.json b/package.json index 0d9b20a..ae8af15 100644 --- a/package.json +++ b/package.json @@ -139,7 +139,6 @@ "vitest": "2.0.5" }, "packageManager": "pnpm@9.6.0", - "workspaces": ["apps/*", "packages/*"], "trustedDependencies": ["@biomejs/biome", "lefthook"], "publishConfig": { "registry": "https://registry.npmjs.org/", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index fdf566f..9424286 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -139,6 +139,25 @@ importers: specifier: 2.0.5 version: 2.0.5(@types/node@20.14.15)(@vitest/ui@2.0.5) + benchmark: + dependencies: + ora: + specifier: 8.0.1 + version: 8.0.1 + p-limit: + specifier: ^6.1.0 + version: 6.1.0 + typescript: + specifier: 5.5.4 + version: 5.5.4 + devDependencies: + '@types/node': + specifier: 20.14.15 + version: 20.14.15 + tsup: + specifier: 8.2.4 + version: 8.2.4(jiti@1.21.6)(postcss@8.4.41)(tsx@4.16.2)(typescript@5.5.4) + packages: '@ai-sdk/anthropic@0.0.39': @@ -2332,6 +2351,10 @@ packages: resolution: {integrity: sha512-vvcXsLAJ9Dr5rQOPk7toZQZJApBl2K4J6dANSsEuh6QI41JYcsS/qhTGa9ErIUUgK3WNQoJYvylxvjqmiqEA9Q==} engines: {node: '>=4'} + p-limit@6.1.0: + resolution: {integrity: sha512-H0jc0q1vOzlEk0TqAKXKZxdl7kX3OFUzCnNVUnq5Pc3DGo0kpeaMuPqxQn235HibwBEb0/pm9dgKTjXy66fBkg==} + engines: {node: '>=18'} + p-locate@2.0.0: resolution: {integrity: sha512-nQja7m7gSKuewoVRen45CtVfODR3crN3goVQ0DDZ9N3yHxgpkuBhZqsaiotSQRrADUrne346peY7kT3TSACykg==} engines: {node: '>=4'} @@ -3102,6 +3125,10 @@ packages: resolution: {integrity: sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==} engines: {node: '>=12'} + yocto-queue@1.1.1: + resolution: {integrity: sha512-b4JR1PFR10y1mKjhHY9LaGo6tmrgjit7hxVIeAmyMw3jegXR4dhYqLaQF5zMXZxY7tLpMyJeLjr1C4rLmkVe8g==} + engines: {node: '>=12.20'} + yoctocolors-cjs@2.1.2: resolution: {integrity: 
sha512-cYVsTjKl8b+FrnidjibDWskAv7UKOfcwaVZdp/it9n1s9fU3IkgDbhdIRKCW4JDsAlECJY0ytoVPT3sK6kideA==} engines: {node: '>=18'} @@ -5219,6 +5246,10 @@ snapshots: dependencies: p-try: 1.0.0 + p-limit@6.1.0: + dependencies: + yocto-queue: 1.1.1 + p-locate@2.0.0: dependencies: p-limit: 1.3.0 @@ -6010,6 +6041,8 @@ snapshots: y18n: 5.0.8 yargs-parser: 21.1.1 + yocto-queue@1.1.1: {} + yoctocolors-cjs@2.1.2: {} yoctocolors@2.1.1: {} diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index cfd3c81..ad5cea0 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -1,5 +1,3 @@ packages: - - playground + - benchmark - docs - - packages/* - - examples/* diff --git a/src/ai/task-workflow.ts b/src/ai/task-workflow.ts index 547e5da..a84a2a0 100644 --- a/src/ai/task-workflow.ts +++ b/src/ai/task-workflow.ts @@ -57,7 +57,15 @@ export async function runAIAssistedTask(options: AiAssistedTaskOptions) { basePath, filters, ); - const selectedFiles = await selectFiles(options, basePath); + + let selectedFiles: string[] = []; + + if (!options.skipFiles) { + selectedFiles = await selectFiles(options, basePath); + selectedFiles.push(...(options.filter ?? [])); + } else { + selectedFiles = options.filter ?? 
[]; + } spinner.start('Processing files...'); const processedFiles = await processFiles( diff --git a/src/cli/index.ts b/src/cli/index.ts index f458ad7..479dc5f 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -103,6 +103,11 @@ export function cli(_args: string[]) { '-e, --exclude ', 'File patterns to exclude (use glob patterns, e.g., "**/*.test.js")', ) + .option( + '--skip-files', + 'Optionally skip the file selection step and use the files provided by the --filter and --exclude options', + false, + ) .option('-s, --suppress-comments', 'Strip comments from the code', false) .option('-l, --line-numbers', 'Add line numbers to code blocks', false) .option('--case-sensitive', 'Use case-sensitive pattern matching', false) diff --git a/src/types/index.ts b/src/types/index.ts index 1904b99..b1f20d4 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -65,6 +65,7 @@ export type AiAssistedTaskOptions = Pick< plan?: boolean; context?: string[]; acceptPlan?: boolean; + skipFiles?: boolean; }; export type ProcessOptions = Pick<