# .Net: Audio-to-Text abstraction and OpenAI implementation (#4932)
### Motivation and Context

This PR contains a new interface for audio-to-text services and an OpenAI connector implementation that uses the Azure/OpenAI Whisper model. The implementation relies on the Azure .NET SDK.

### Description

1. Added a new `IAudioToTextService` interface.
2. Added new `AzureOpenAIAudioToTextService` and `OpenAIAudioToTextService` implementations.
3. Added new `OpenAIAudioToTextExecutionSettings` to define audio-to-text configuration (e.g. filename, language, etc.).
4. Added unit and integration tests.

### Contribution Checklist

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄
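For illustration, here is a minimal usage sketch of the new abstraction (not part of the diff). It assumes `AudioContent` can be constructed from `BinaryData` and that a Whisper deployment exists; the endpoint, key, deployment name, and file name are placeholders. The new services are marked `[Experimental("SKEXP0005")]`, so consumers may need to suppress that diagnostic.

```csharp
#pragma warning disable SKEXP0005 // audio-to-text services are experimental

// Minimal sketch (assumptions noted above): transcribe a local audio file with the new
// Azure OpenAI audio-to-text service via the IAudioToTextService abstraction.
using System;
using System.IO;
using Microsoft.SemanticKernel.AudioToText;
using Microsoft.SemanticKernel.Connectors.OpenAI;
using Microsoft.SemanticKernel.Contents;

IAudioToTextService service = new AzureOpenAIAudioToTextService(
    deploymentName: "whisper-1",                       // placeholder deployment
    endpoint: "https://contoso.openai.azure.com/",     // placeholder endpoint
    apiKey: "<azure-openai-api-key>");                 // placeholder key

// Audio-to-text specific settings; the filename tells the service the audio format.
var settings = new OpenAIAudioToTextExecutionSettings("input.wav")
{
    Language = "en",
    ResponseFormat = "json",
};

// Assumption: AudioContent wraps the raw audio bytes as BinaryData.
var audio = new AudioContent(BinaryData.FromBytes(await File.ReadAllBytesAsync("input.wav")));

TextContent transcript = await service.GetTextContentAsync(audio, settings);
Console.WriteLine(transcript.Text);
```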
1 parent b9abe0a · commit 1490eb6 · Showing 18 changed files with 1,171 additions and 3 deletions.
### dotnet/src/Connectors/Connectors.OpenAI/AudioToText/AzureOpenAIAudioToTextService.cs (95 additions, 0 deletions)
// Copyright (c) Microsoft. All rights reserved.

using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using Azure.AI.OpenAI;
using Azure.Core;
using Microsoft.Extensions.Logging;
using Microsoft.SemanticKernel.AudioToText;
using Microsoft.SemanticKernel.Contents;
using Microsoft.SemanticKernel.Services;

namespace Microsoft.SemanticKernel.Connectors.OpenAI;

/// <summary>
/// Azure OpenAI audio-to-text service.
/// </summary>
[Experimental("SKEXP0005")]
public sealed class AzureOpenAIAudioToTextService : IAudioToTextService
{
    /// <summary>Core implementation shared by Azure OpenAI services.</summary>
    private readonly AzureOpenAIClientCore _core;

    /// <inheritdoc/>
    public IReadOnlyDictionary<string, object?> Attributes => this._core.Attributes;

    /// <summary>
    /// Creates an instance of the <see cref="AzureOpenAIAudioToTextService"/> with API key auth.
    /// </summary>
    /// <param name="deploymentName">Azure OpenAI deployment name, see https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource</param>
    /// <param name="endpoint">Azure OpenAI deployment URL, see https://learn.microsoft.com/azure/cognitive-services/openai/quickstart</param>
    /// <param name="apiKey">Azure OpenAI API key, see https://learn.microsoft.com/azure/cognitive-services/openai/quickstart</param>
    /// <param name="modelId">Azure OpenAI model id, see https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource</param>
    /// <param name="httpClient">Custom <see cref="HttpClient"/> for HTTP requests.</param>
    /// <param name="loggerFactory">The <see cref="ILoggerFactory"/> to use for logging. If null, no logging will be performed.</param>
    public AzureOpenAIAudioToTextService(
        string deploymentName,
        string endpoint,
        string apiKey,
        string? modelId = null,
        HttpClient? httpClient = null,
        ILoggerFactory? loggerFactory = null)
    {
        this._core = new(deploymentName, endpoint, apiKey, httpClient, loggerFactory?.CreateLogger(typeof(AzureOpenAIAudioToTextService)));
        this._core.AddAttribute(AIServiceExtensions.ModelIdKey, modelId);
    }

    /// <summary>
    /// Creates an instance of the <see cref="AzureOpenAIAudioToTextService"/> with AAD auth.
    /// </summary>
    /// <param name="deploymentName">Azure OpenAI deployment name, see https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource</param>
    /// <param name="endpoint">Azure OpenAI deployment URL, see https://learn.microsoft.com/azure/cognitive-services/openai/quickstart</param>
    /// <param name="credentials">Token credentials, e.g. DefaultAzureCredential, ManagedIdentityCredential, EnvironmentCredential, etc.</param>
    /// <param name="modelId">Azure OpenAI model id, see https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource</param>
    /// <param name="httpClient">Custom <see cref="HttpClient"/> for HTTP requests.</param>
    /// <param name="loggerFactory">The <see cref="ILoggerFactory"/> to use for logging. If null, no logging will be performed.</param>
    public AzureOpenAIAudioToTextService(
        string deploymentName,
        string endpoint,
        TokenCredential credentials,
        string? modelId = null,
        HttpClient? httpClient = null,
        ILoggerFactory? loggerFactory = null)
    {
        this._core = new(deploymentName, endpoint, credentials, httpClient, loggerFactory?.CreateLogger(typeof(AzureOpenAIAudioToTextService)));
        this._core.AddAttribute(AIServiceExtensions.ModelIdKey, modelId);
    }

    /// <summary>
    /// Creates an instance of the <see cref="AzureOpenAIAudioToTextService"/> using the specified <see cref="OpenAIClient"/>.
    /// </summary>
    /// <param name="deploymentName">Azure OpenAI deployment name, see https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource</param>
    /// <param name="openAIClient">Custom <see cref="OpenAIClient"/>.</param>
    /// <param name="modelId">Azure OpenAI model id, see https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource</param>
    /// <param name="loggerFactory">The <see cref="ILoggerFactory"/> to use for logging. If null, no logging will be performed.</param>
    public AzureOpenAIAudioToTextService(
        string deploymentName,
        OpenAIClient openAIClient,
        string? modelId = null,
        ILoggerFactory? loggerFactory = null)
    {
        this._core = new(deploymentName, openAIClient, loggerFactory?.CreateLogger(typeof(AzureOpenAIAudioToTextService)));
        this._core.AddAttribute(AIServiceExtensions.ModelIdKey, modelId);
    }

    /// <inheritdoc/>
    public Task<TextContent> GetTextContentAsync(
        AudioContent content,
        PromptExecutionSettings? executionSettings = null,
        Kernel? kernel = null,
        CancellationToken cancellationToken = default)
        => this._core.GetTextContentFromAudioAsync(content, executionSettings, cancellationToken);
}
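For reference, a sketch of using the AAD constructor above, assuming the `Azure.Identity` package; the deployment name and endpoint are placeholders.

```csharp
#pragma warning disable SKEXP0005 // audio-to-text services are experimental
using Azure.Identity;
using Microsoft.SemanticKernel.Connectors.OpenAI;

// Token-based auth instead of an API key; DefaultAzureCredential walks the usual
// credential chain (environment, managed identity, developer sign-in, ...).
var service = new AzureOpenAIAudioToTextService(
    deploymentName: "whisper-1",                       // placeholder
    endpoint: "https://contoso.openai.azure.com/",     // placeholder
    credentials: new DefaultAzureCredential());
```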
### dotnet/src/Connectors/Connectors.OpenAI/AudioToText/OpenAIAudioToTextExecutionSettings.cs (86 additions, 0 deletions)
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Text.Json;
using System.Text.Json.Serialization;
using Microsoft.SemanticKernel.Text;

namespace Microsoft.SemanticKernel.Connectors.OpenAI;

/// <summary>
/// Execution settings for OpenAI audio-to-text request.
/// </summary>
public sealed class OpenAIAudioToTextExecutionSettings : PromptExecutionSettings
{
    /// <summary>
    /// Filename or identifier associated with audio data.
    /// Should be in format {filename}.{extension}
    /// </summary>
    [JsonPropertyName("filename")]
    public string Filename { get; set; }

    /// <summary>
    /// An optional language of the audio data as two-letter ISO-639-1 language code (e.g. 'en' or 'es').
    /// </summary>
    [JsonPropertyName("language")]
    public string? Language { get; set; }

    /// <summary>
    /// An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.
    /// </summary>
    [JsonPropertyName("prompt")]
    public string? Prompt { get; set; }

    /// <summary>
    /// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt. Default is 'json'.
    /// </summary>
    [JsonPropertyName("response_format")]
    public string ResponseFormat { get; set; } = "json";

    /// <summary>
    /// The sampling temperature, between 0 and 1.
    /// Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
    /// If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.
    /// Default is 0.
    /// </summary>
    [JsonPropertyName("temperature")]
    public float Temperature { get; set; } = 0;

    /// <summary>
    /// Creates an instance of <see cref="OpenAIAudioToTextExecutionSettings"/> class.
    /// </summary>
    /// <param name="filename">Filename or identifier associated with audio data. Should be in format {filename}.{extension}</param>
    public OpenAIAudioToTextExecutionSettings(string filename)
    {
        this.Filename = filename;
    }

    /// <summary>
    /// Converts <see cref="PromptExecutionSettings"/> to derived <see cref="OpenAIAudioToTextExecutionSettings"/> type.
    /// </summary>
    /// <param name="executionSettings">Instance of <see cref="PromptExecutionSettings"/>.</param>
    /// <returns>Instance of <see cref="OpenAIAudioToTextExecutionSettings"/>.</returns>
    public static OpenAIAudioToTextExecutionSettings? FromExecutionSettings(PromptExecutionSettings? executionSettings)
    {
        if (executionSettings is null)
        {
            return null;
        }

        if (executionSettings is OpenAIAudioToTextExecutionSettings settings)
        {
            return settings;
        }

        var json = JsonSerializer.Serialize(executionSettings);

        var openAIExecutionSettings = JsonSerializer.Deserialize<OpenAIAudioToTextExecutionSettings>(json, JsonOptionsCache.ReadPermissive);

        if (openAIExecutionSettings is not null)
        {
            return openAIExecutionSettings;
        }

        throw new ArgumentException($"Invalid execution settings, cannot convert to {nameof(OpenAIAudioToTextExecutionSettings)}", nameof(executionSettings));
    }
}
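A short sketch of how these settings are meant to be built and normalized; the file name and prompt below are placeholders.

```csharp
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Connectors.OpenAI;

// Build OpenAI-specific transcription settings; the constructor requires the filename
// so the service can infer the audio format from its extension.
PromptExecutionSettings settings = new OpenAIAudioToTextExecutionSettings("meeting.mp3")
{
    Language = "en",
    Prompt = "Transcript of a weekly team meeting.",
    ResponseFormat = "verbose_json",
    Temperature = 0.2f,
};

// Connectors normalize whatever PromptExecutionSettings they receive through this helper:
// an OpenAIAudioToTextExecutionSettings instance is returned as-is, anything else is
// round-tripped through JSON (see FromExecutionSettings above).
var typed = OpenAIAudioToTextExecutionSettings.FromExecutionSettings(settings);
```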
### dotnet/src/Connectors/Connectors.OpenAI/AudioToText/OpenAIAudioToTextService.cs (72 additions, 0 deletions)
// Copyright (c) Microsoft. All rights reserved.

using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using Azure.AI.OpenAI;
using Microsoft.Extensions.Logging;
using Microsoft.SemanticKernel.AudioToText;
using Microsoft.SemanticKernel.Contents;
using Microsoft.SemanticKernel.Services;

namespace Microsoft.SemanticKernel.Connectors.OpenAI;

/// <summary>
/// OpenAI audio-to-text service.
/// </summary>
[Experimental("SKEXP0005")]
public sealed class OpenAIAudioToTextService : IAudioToTextService
{
    /// <summary>Core implementation shared by OpenAI services.</summary>
    private readonly OpenAIClientCore _core;

    /// <inheritdoc/>
    public IReadOnlyDictionary<string, object?> Attributes => this._core.Attributes;

    /// <summary>
    /// Creates an instance of the <see cref="OpenAIAudioToTextService"/> with API key auth.
    /// </summary>
    /// <param name="modelId">Model name</param>
    /// <param name="apiKey">OpenAI API Key</param>
    /// <param name="organization">OpenAI Organization Id (usually optional)</param>
    /// <param name="httpClient">Custom <see cref="HttpClient"/> for HTTP requests.</param>
    /// <param name="loggerFactory">The <see cref="ILoggerFactory"/> to use for logging. If null, no logging will be performed.</param>
    public OpenAIAudioToTextService(
        string modelId,
        string apiKey,
        string? organization = null,
        HttpClient? httpClient = null,
        ILoggerFactory? loggerFactory = null)
    {
        this._core = new(modelId, apiKey, organization, httpClient, loggerFactory?.CreateLogger(typeof(OpenAIAudioToTextService)));

        this._core.AddAttribute(AIServiceExtensions.ModelIdKey, modelId);
        this._core.AddAttribute(OpenAIClientCore.OrganizationKey, organization);
    }

    /// <summary>
    /// Creates an instance of the <see cref="OpenAIAudioToTextService"/> using the specified <see cref="OpenAIClient"/>.
    /// </summary>
    /// <param name="modelId">Model name</param>
    /// <param name="openAIClient">Custom <see cref="OpenAIClient"/> for HTTP requests.</param>
    /// <param name="loggerFactory">The <see cref="ILoggerFactory"/> to use for logging. If null, no logging will be performed.</param>
    public OpenAIAudioToTextService(
        string modelId,
        OpenAIClient openAIClient,
        ILoggerFactory? loggerFactory = null)
    {
        this._core = new(modelId, openAIClient, loggerFactory?.CreateLogger(typeof(OpenAIAudioToTextService)));

        this._core.AddAttribute(AIServiceExtensions.ModelIdKey, modelId);
    }

    /// <inheritdoc/>
    public Task<TextContent> GetTextContentAsync(
        AudioContent content,
        PromptExecutionSettings? executionSettings = null,
        Kernel? kernel = null,
        CancellationToken cancellationToken = default)
        => this._core.GetTextContentFromAudioAsync(content, executionSettings, cancellationToken);
}
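Finally, a sketch of the `OpenAIClient`-based constructor, useful when a single Azure SDK client should be shared across services; the API key is a placeholder.

```csharp
#pragma warning disable SKEXP0005 // audio-to-text services are experimental
using Azure.AI.OpenAI;
using Microsoft.SemanticKernel.Connectors.OpenAI;

// OpenAIClient's string-only constructor targets the public (non-Azure) OpenAI endpoint.
var client = new OpenAIClient("<openai-api-key>");

// Reuse the same client for the audio-to-text service (and any other OpenAI-backed service).
var audioToText = new OpenAIAudioToTextService("whisper-1", client);
```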