Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Whisper: added support for custom models #221

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# A collection of example gp.nvim configurations

## Content:

- [Groq based config](./groq_gp_config.lua)
90 changes: 90 additions & 0 deletions examples/groq_gp_config.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
-- Example gp.nvim configuration: OpenAI and Groq chat providers, with Groq
-- used for whisper (speech-to-text) transcription.
--
-- Gp (GPT prompt) lua plugin for Neovim
-- https://github.com/Robitx/gp.nvim/

-- API keys. The environment is consulted first so that real keys do not have
-- to be written into this file; replace the placeholders only if you do not
-- export the variables.
local OPENAI_KEY = os.getenv("OPENAI_API_KEY") or "<your_key>"
local GROQ_KEY = os.getenv("GROQ_API_KEY") or "<your_key>"

-- Chat completion endpoints (Groq exposes an OpenAI-style path).
local OPENAI_HOST = "https://api.openai.com/v1/chat/completions"
local GROQ_HOST = "https://api.groq.com/openai/v1/chat/completions"

-- Audio transcription endpoint and the model requested from it.
local GROQ_AUDIO = "https://api.groq.com/openai/v1/audio/transcriptions"
local GROQ_WHISPER_MODEL = "distil-whisper-large-v3-en"

--------------------------------------------------------------------------------
-- Example config
--------------------------------------------------------------------------------
---@class GpConfig
-- README_REFERENCE_MARKER_START
local config = {
	-- named providers; each agent below selects one through its `provider` field
	providers = {
		openai = {
			endpoint = OPENAI_HOST,
			secret = OPENAI_KEY,
		},
		groq = {
			endpoint = GROQ_HOST,
			secret = GROQ_KEY,
		},
	},

	chat_shortcut_respond = { modes = { "n", "i", "v", "x" }, shortcut = "<C-g><cr>" },
	chat_confirm_delete = false,

	-- prefix for all commands
	cmd_prefix = "Gp",

	-- must be the `name` of exactly one chat-enabled agent defined in `agents`
	default_chat_agent = "GroqLLAMA_70B",

	-- speech-to-text through Groq's transcription endpoint
	-- NOTE(review): `endpoint`, `secret` and `model` presumably need a gp.nvim
	-- version that supports custom whisper settings -- confirm before use
	whisper = {
		endpoint = GROQ_AUDIO,
		secret = GROQ_KEY,
		model = GROQ_WHISPER_MODEL,
		store_dir = "/tmp/gp_whisper",
	},

	-- agent names are unique so that every agent can be selected unambiguously
	agents = {
		{
			provider = "openai",
			name = "ChatGPT4o",
			chat = false,
			command = true,
			-- string with model name or table with model name and parameters
			model = { model = "gpt-4o", temperature = 0.8, top_p = 1 },
			-- system prompt (use this to specify the persona/role of the AI)
			system_prompt = "You are an AI working as a code editor.\n\n"
				.. "Please AVOID COMMENTARY OUTSIDE OF THE SNIPPET RESPONSE.\n"
				.. "START AND END YOUR ANSWER WITH:\n\n```",
		},
		{
			provider = "openai",
			name = "ChatGPT4o-mini",
			chat = true,
			command = true,
			-- string with model name or table with model name and parameters
			model = { model = "gpt-4o-mini", temperature = 0.8, top_p = 1 },
			-- system prompt (use this to specify the persona/role of the AI)
			system_prompt = "You are an AI working as a code editor.\n\n"
				.. "Please AVOID COMMENTARY OUTSIDE OF THE SNIPPET RESPONSE.\n"
				.. "START AND END YOUR ANSWER WITH:\n\n```",
		},
		{
			provider = "groq",
			-- named after the model it actually runs (llama-3.1-70b)
			name = "GroqLLAMA_70B",
			chat = true,
			command = true,
			-- string with model name or table with model name and parameters
			model = { model = "llama-3.1-70b-versatile", temperature = 0.8, top_p = 1 },
			system_prompt = "You are an AI helping the user with code and other tasks\n\n"
				.. "Please AVOID COMMENTARY OUTSIDE OF THE SNIPPET RESPONSE.\n",
		},
		{
			provider = "groq",
			-- named after the model it actually runs (llama-3.2-11b)
			name = "GroqLLAMA_11B",
			chat = true,
			command = true,
			-- string with model name or table with model name and parameters
			model = { model = "llama-3.2-11b-text-preview", temperature = 0.8, top_p = 1 },
			system_prompt = "Given a task or problem, please provide a concise and well-formatted solution or answer.\n\n"
				.. "Please keep your response within a code snippet, and avoid unnecessary commentary.\n",
		},
	},
}

-- Hand the table to the plugin from your Neovim setup, e.g.:
--   require("gp").setup(config)
10 changes: 9 additions & 1 deletion lua/gp/config.lua
Original file line number Diff line number Diff line change
Expand Up @@ -379,8 +379,12 @@ local config = {
-- you can disable whisper completely by whisper = {disable = true}
disable = false,

-- OpenAI audio/transcriptions api endpoint to transcribe audio to text
-- OpenAI compatible audio/transcriptions api endpoint to transcribe audio to text
endpoint = "https://api.openai.com/v1/audio/transcriptions",

-- secret sent to the whisper endpoint; it may be an empty string, e.g. when running a local whisper server
secret = os.getenv("OPENAI_API_KEY"),

-- directory for storing whisper files
store_dir = (os.getenv("TMPDIR") or os.getenv("TEMP") or "/tmp") .. "/gp_whisper",
-- multiplier of RMS level dB for threshold used by sox to detect silence vs speech
Expand All @@ -404,6 +408,10 @@ local config = {
-- whisper_rec_cmd = {"arecord", "-c", "1", "-f", "S16_LE", "-r", "48000", "-d", "3600", "rec.wav"},
-- whisper_rec_cmd = {"ffmpeg", "-y", "-f", "avfoundation", "-i", ":0", "-t", "3600", "rec.wav"},
rec_cmd = nil,

-- Whisper model to use. See https://platform.openai.com/docs/models/whisper for the OpenAI models.
-- A custom model name can be set as well, e.g. for a local server or a Groq-hosted model.
model = "whisper-1",
},

-- image generation settings
Expand Down
63 changes: 38 additions & 25 deletions lua/gp/whisper.lua
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ local vault = require("gp.vault")
local default_config = require("gp.config")

local W = {
--@class GpConfig
config = {},
cmd = {},
disabled = false,
Expand Down Expand Up @@ -52,11 +53,11 @@ local whisper = function(callback, language)
return
end

local bearer = vault.get_secret("openai_api_key")
if not bearer then
logger.error("OpenAI API key not found")
return
end
-- get the secret for whisper. It can be empty as well, in case of running local whisper server
local bearer = W.config.secret or vault.get_secret("openai_api_key")

-- get the model.
local model = W.config.model or "whisper-1"

local rec_file = W.config.store_dir .. "/rec.wav"
local rec_options = {
Expand Down Expand Up @@ -180,34 +181,46 @@ local whisper = function(callback, language)

-- transcribe the recording
local transcribe = function()
local cmd = "cd "
.. W.config.store_dir
.. " && "
.. "export LC_NUMERIC='C' && "
-- normalize volume to -3dB
.. "sox --norm=-3 rec.wav norm.wav && "
-- get RMS level dB * silence threshold
.. "t=$(sox 'norm.wav' -n channels 1 stats 2>&1 | grep 'RMS lev dB' "
local cd_cmd = "cd " .. W.config.store_dir
local export_lc_numeric_cmd = "export LC_NUMERIC='C'"
local sox_norm_cmd = "sox --norm=-3 rec.wav norm.wav"
local sox_silence_t = "t=$(sox 'norm.wav' -n channels 1 stats 2>&1 | grep 'RMS lev dB' "
.. " | sed -e 's/.* //' | awk '{print $1*"
.. W.config.silence
.. "}') && "
-- remove silence, speed up, pad and convert to mp3
.. "sox -q norm.wav -C 196.5 final.mp3 silence -l 1 0.05 $t'dB' -1 1.0 $t'dB'"
.. " pad 0.1 0.1 tempo "
.. "}')"
local remove_silence_cmd = "sox -q norm.wav -C 196.5 final.mp3 silence -l 1 0.05 $t'dB' -1 1.0 $t'dB' pad 0.1 0.1 tempo "
.. W.config.tempo
.. " && "
-- call openai
.. curl

local curl_bearer_header = ""
if bearer ~= "" then
curl_bearer_header = '-H "Authorization: Bearer ' .. bearer .. '" '
end

local curl_cmd = curl
.. " --max-time 20 "
.. W.config.endpoint
.. ' -s -H "Authorization: Bearer '
.. bearer
.. '" -H "Content-Type: multipart/form-data" '
.. '-F model="whisper-1" -F language="'
.. " -s "
.. curl_bearer_header
.. '-H "Content-Type: multipart/form-data" '
.. '-F model="'
.. model
.. '" -F "language='
.. language
.. '" -F file="@final.mp3" '
.. '" -F "file=@final.mp3" '
.. '-F response_format="json"'

local cmd = cd_cmd
.. " && "
.. export_lc_numeric_cmd
.. " && "
.. sox_norm_cmd
.. " && "
.. sox_silence_t
.. " && "
.. remove_silence_cmd
.. " && "
.. curl_cmd

tasker.run(nil, "bash", { "-c", cmd }, function(code, signal, stdout, _)
if code ~= 0 then
logger.error(string.format("Whisper query exited: %d, %d", code, signal))
Expand Down