.Net: Audio-to-Text abstraction and OpenAI implementation (#4932)
### Motivation and Context

This PR adds a new interface for audio-to-text services, together with
OpenAI connector implementations that target the Azure OpenAI and OpenAI
Whisper models. The implementation relies on the Azure .NET SDK.

### Description

1. Added new `IAudioToTextService` interface.
2. Added new `AzureOpenAIAudioToTextService` and
`OpenAIAudioToTextService` implementations.
3. Added new `OpenAIAudioToTextExecutionSettings` to define
audio-to-text configuration (e.g. filename, language, etc.); a usage
sketch follows this list.
4. Added unit and integration tests.
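
A minimal usage sketch of the new surface. The model id, API key, and file
path are placeholders, and the `AudioContent` constructor is assumed to
accept the raw audio as `BinaryData`:

```csharp
using System;
using System.IO;
using Microsoft.SemanticKernel.AudioToText;
using Microsoft.SemanticKernel.Connectors.OpenAI;
using Microsoft.SemanticKernel.Contents;

// Placeholder model id and API key.
IAudioToTextService service = new OpenAIAudioToTextService("whisper-1", "...your API key...");

// Assumption: AudioContent wraps the audio payload as BinaryData.
var audio = new AudioContent(BinaryData.FromBytes(File.ReadAllBytes("input.wav")));

var settings = new OpenAIAudioToTextExecutionSettings("input.wav")
{
    Language = "en",          // optional ISO-639-1 code of the audio
    ResponseFormat = "json",  // json, text, srt, verbose_json, or vtt
    Temperature = 0f          // deterministic output
};

TextContent transcript = await service.GetTextContentAsync(audio, settings);
Console.WriteLine(transcript.Text);
```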

### Contribution Checklist

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄
dmytrostruk authored Feb 14, 2024
1 parent b9abe0a commit 1490eb6
Showing 18 changed files with 1,171 additions and 3 deletions.
1 change: 1 addition & 0 deletions dotnet/docs/EXPERIMENTS.md
@@ -16,6 +16,7 @@ You can use the following diagnostic IDs to ignore warnings or errors for a part
- SKEXP0002: Image services
- SKEXP0003: Memory connectors
- SKEXP0004: Kernel Filters
- SKEXP0005: Audio services

## OpenAI and Azure OpenAI services

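As a usage note: the new services below are marked `[Experimental("SKEXP0005")]`, so consuming code has to opt in by suppressing this diagnostic, e.g. with a pragma (or via `<NoWarn>`, as in the project-file change later in this commit). A minimal sketch:

```csharp
// Opt in to the experimental audio API at the call site
// (model id and API key are placeholders).
#pragma warning disable SKEXP0005
var service = new OpenAIAudioToTextService("whisper-1", "...your API key...");
#pragma warning restore SKEXP0005
```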
@@ -0,0 +1,95 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using Azure.AI.OpenAI;
using Azure.Core;
using Microsoft.Extensions.Logging;
using Microsoft.SemanticKernel.AudioToText;
using Microsoft.SemanticKernel.Contents;
using Microsoft.SemanticKernel.Services;

namespace Microsoft.SemanticKernel.Connectors.OpenAI;

/// <summary>
/// Azure OpenAI audio-to-text service.
/// </summary>
[Experimental("SKEXP0005")]
public sealed class AzureOpenAIAudioToTextService : IAudioToTextService
{
/// <summary>Core implementation shared by Azure OpenAI services.</summary>
private readonly AzureOpenAIClientCore _core;

/// <inheritdoc/>
public IReadOnlyDictionary<string, object?> Attributes => this._core.Attributes;

/// <summary>
/// Creates an instance of the <see cref="AzureOpenAIAudioToTextService"/> with API key auth.
/// </summary>
/// <param name="deploymentName">Azure OpenAI deployment name, see https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource</param>
/// <param name="endpoint">Azure OpenAI deployment URL, see https://learn.microsoft.com/azure/cognitive-services/openai/quickstart</param>
/// <param name="apiKey">Azure OpenAI API key, see https://learn.microsoft.com/azure/cognitive-services/openai/quickstart</param>
/// <param name="modelId">Azure OpenAI model id, see https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource</param>
/// <param name="httpClient">Custom <see cref="HttpClient"/> for HTTP requests.</param>
/// <param name="loggerFactory">The <see cref="ILoggerFactory"/> to use for logging. If null, no logging will be performed.</param>
public AzureOpenAIAudioToTextService(
string deploymentName,
string endpoint,
string apiKey,
string? modelId = null,
HttpClient? httpClient = null,
ILoggerFactory? loggerFactory = null)
{
this._core = new(deploymentName, endpoint, apiKey, httpClient, loggerFactory?.CreateLogger(typeof(AzureOpenAIAudioToTextService)));
this._core.AddAttribute(AIServiceExtensions.ModelIdKey, modelId);
}

/// <summary>
/// Creates an instance of the <see cref="AzureOpenAIAudioToTextService"/> with AAD auth.
/// </summary>
/// <param name="deploymentName">Azure OpenAI deployment name, see https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource</param>
/// <param name="endpoint">Azure OpenAI deployment URL, see https://learn.microsoft.com/azure/cognitive-services/openai/quickstart</param>
/// <param name="credentials">Token credentials, e.g. DefaultAzureCredential, ManagedIdentityCredential, EnvironmentCredential, etc.</param>
/// <param name="modelId">Azure OpenAI model id, see https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource</param>
/// <param name="httpClient">Custom <see cref="HttpClient"/> for HTTP requests.</param>
/// <param name="loggerFactory">The <see cref="ILoggerFactory"/> to use for logging. If null, no logging will be performed.</param>
public AzureOpenAIAudioToTextService(
string deploymentName,
string endpoint,
TokenCredential credentials,
string? modelId = null,
HttpClient? httpClient = null,
ILoggerFactory? loggerFactory = null)
{
this._core = new(deploymentName, endpoint, credentials, httpClient, loggerFactory?.CreateLogger(typeof(AzureOpenAIAudioToTextService)));
this._core.AddAttribute(AIServiceExtensions.ModelIdKey, modelId);
}

/// <summary>
/// Creates an instance of the <see cref="AzureOpenAIAudioToTextService"/> using the specified <see cref="OpenAIClient"/>.
/// </summary>
/// <param name="deploymentName">Azure OpenAI deployment name, see https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource</param>
/// <param name="openAIClient">Custom <see cref="OpenAIClient"/>.</param>
/// <param name="modelId">Azure OpenAI model id, see https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource</param>
/// <param name="loggerFactory">The <see cref="ILoggerFactory"/> to use for logging. If null, no logging will be performed.</param>
public AzureOpenAIAudioToTextService(
string deploymentName,
OpenAIClient openAIClient,
string? modelId = null,
ILoggerFactory? loggerFactory = null)
{
this._core = new(deploymentName, openAIClient, loggerFactory?.CreateLogger(typeof(AzureOpenAIAudioToTextService)));
this._core.AddAttribute(AIServiceExtensions.ModelIdKey, modelId);
}

/// <inheritdoc/>
public Task<TextContent> GetTextContentAsync(
AudioContent content,
PromptExecutionSettings? executionSettings = null,
Kernel? kernel = null,
CancellationToken cancellationToken = default)
=> this._core.GetTextContentFromAudioAsync(content, executionSettings, cancellationToken);
}
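
A construction sketch for the AAD overload above; the deployment name and endpoint are placeholders, and `DefaultAzureCredential` comes from the Azure.Identity package:

```csharp
using Azure.Identity;
using Microsoft.SemanticKernel.Connectors.OpenAI;

// Keyless auth: DefaultAzureCredential resolves the identity from the
// environment, a managed identity, or a developer sign-in.
var service = new AzureOpenAIAudioToTextService(
    deploymentName: "my-whisper-deployment",            // placeholder
    endpoint: "https://my-resource.openai.azure.com/",  // placeholder
    credentials: new DefaultAzureCredential(),
    modelId: "whisper");
```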
@@ -0,0 +1,86 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Text.Json;
using System.Text.Json.Serialization;
using Microsoft.SemanticKernel.Text;

namespace Microsoft.SemanticKernel.Connectors.OpenAI;

/// <summary>
/// Execution settings for OpenAI audio-to-text request.
/// </summary>
public sealed class OpenAIAudioToTextExecutionSettings : PromptExecutionSettings
{
/// <summary>
/// Filename or identifier associated with audio data.
/// Should be in format {filename}.{extension}
/// </summary>
[JsonPropertyName("filename")]
public string Filename { get; set; }

/// <summary>
/// An optional language of the audio data as two-letter ISO-639-1 language code (e.g. 'en' or 'es').
/// </summary>
[JsonPropertyName("language")]
public string? Language { get; set; }

/// <summary>
/// An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.
/// </summary>
[JsonPropertyName("prompt")]
public string? Prompt { get; set; }

/// <summary>
/// The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt. Default is 'json'.
/// </summary>
[JsonPropertyName("response_format")]
public string ResponseFormat { get; set; } = "json";

/// <summary>
/// The sampling temperature, between 0 and 1.
/// Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
/// If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.
/// Default is 0.
/// </summary>
[JsonPropertyName("temperature")]
public float Temperature { get; set; } = 0;

/// <summary>
/// Creates an instance of <see cref="OpenAIAudioToTextExecutionSettings"/> class.
/// </summary>
/// <param name="filename">Filename or identifier associated with audio data. Should be in format {filename}.{extension}</param>
public OpenAIAudioToTextExecutionSettings(string filename)
{
this.Filename = filename;
}

/// <summary>
/// Converts <see cref="PromptExecutionSettings"/> to derived <see cref="OpenAIAudioToTextExecutionSettings"/> type.
/// </summary>
/// <param name="executionSettings">Instance of <see cref="PromptExecutionSettings"/>.</param>
/// <returns>Instance of <see cref="OpenAIAudioToTextExecutionSettings"/>.</returns>
public static OpenAIAudioToTextExecutionSettings? FromExecutionSettings(PromptExecutionSettings? executionSettings)
{
if (executionSettings is null)
{
return null;
}

if (executionSettings is OpenAIAudioToTextExecutionSettings settings)
{
return settings;
}

var json = JsonSerializer.Serialize(executionSettings);

var openAIExecutionSettings = JsonSerializer.Deserialize<OpenAIAudioToTextExecutionSettings>(json, JsonOptionsCache.ReadPermissive);

if (openAIExecutionSettings is not null)
{
return openAIExecutionSettings;
}

throw new ArgumentException($"Invalid execution settings, cannot convert to {nameof(OpenAIAudioToTextExecutionSettings)}", nameof(executionSettings));
}
}
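
The JSON round-trip in `FromExecutionSettings` means a caller can start from a base `PromptExecutionSettings`. A sketch, assuming its `ExtensionData` entries serialize under their dictionary keys:

```csharp
using System.Collections.Generic;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Connectors.OpenAI;

var generic = new PromptExecutionSettings
{
    ExtensionData = new Dictionary<string, object>
    {
        ["filename"] = "input.wav",
        ["language"] = "en",
        ["temperature"] = 0.2f
    }
};

// Serialized to JSON, then deserialized into the derived type.
OpenAIAudioToTextExecutionSettings? typed =
    OpenAIAudioToTextExecutionSettings.FromExecutionSettings(generic);
// typed.Filename == "input.wav"; typed.Language == "en"; typed.Temperature == 0.2f
```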
@@ -0,0 +1,72 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using Azure.AI.OpenAI;
using Microsoft.Extensions.Logging;
using Microsoft.SemanticKernel.AudioToText;
using Microsoft.SemanticKernel.Contents;
using Microsoft.SemanticKernel.Services;

namespace Microsoft.SemanticKernel.Connectors.OpenAI;

/// <summary>
/// OpenAI audio-to-text service.
/// </summary>
[Experimental("SKEXP0005")]
public sealed class OpenAIAudioToTextService : IAudioToTextService
{
/// <summary>Core implementation shared by OpenAI services.</summary>
private readonly OpenAIClientCore _core;

/// <inheritdoc/>
public IReadOnlyDictionary<string, object?> Attributes => this._core.Attributes;

/// <summary>
/// Creates an instance of the <see cref="OpenAIAudioToTextService"/> with API key auth.
/// </summary>
/// <param name="modelId">Model name</param>
/// <param name="apiKey">OpenAI API Key</param>
/// <param name="organization">OpenAI Organization Id (usually optional)</param>
/// <param name="httpClient">Custom <see cref="HttpClient"/> for HTTP requests.</param>
/// <param name="loggerFactory">The <see cref="ILoggerFactory"/> to use for logging. If null, no logging will be performed.</param>
public OpenAIAudioToTextService(
string modelId,
string apiKey,
string? organization = null,
HttpClient? httpClient = null,
ILoggerFactory? loggerFactory = null)
{
this._core = new(modelId, apiKey, organization, httpClient, loggerFactory?.CreateLogger(typeof(OpenAIAudioToTextService)));

this._core.AddAttribute(AIServiceExtensions.ModelIdKey, modelId);
this._core.AddAttribute(OpenAIClientCore.OrganizationKey, organization);
}

/// <summary>
/// Creates an instance of the <see cref="OpenAIAudioToTextService"/> using the specified <see cref="OpenAIClient"/>.
/// </summary>
/// <param name="modelId">Model name</param>
/// <param name="openAIClient">Custom <see cref="OpenAIClient"/> for HTTP requests.</param>
/// <param name="loggerFactory">The <see cref="ILoggerFactory"/> to use for logging. If null, no logging will be performed.</param>
public OpenAIAudioToTextService(
string modelId,
OpenAIClient openAIClient,
ILoggerFactory? loggerFactory = null)
{
this._core = new(modelId, openAIClient, loggerFactory?.CreateLogger(typeof(OpenAIAudioToTextService)));

this._core.AddAttribute(AIServiceExtensions.ModelIdKey, modelId);
}

/// <inheritdoc/>
public Task<TextContent> GetTextContentAsync(
AudioContent content,
PromptExecutionSettings? executionSettings = null,
Kernel? kernel = null,
CancellationToken cancellationToken = default)
=> this._core.GetTextContentFromAudioAsync(content, executionSettings, cancellationToken);
}
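
A dependency-injection sketch using manual registration (builder extension methods for audio services, if this PR adds any, are not shown in the visible diff):

```csharp
using Microsoft.Extensions.DependencyInjection;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.AudioToText;
using Microsoft.SemanticKernel.Connectors.OpenAI;

var builder = Kernel.CreateBuilder();
builder.Services.AddSingleton<IAudioToTextService>(
    _ => new OpenAIAudioToTextService("whisper-1", "...your API key..."));
Kernel kernel = builder.Build();

var audioService = kernel.GetRequiredService<IAudioToTextService>();
```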
38 changes: 38 additions & 0 deletions dotnet/src/Connectors/Connectors.OpenAI/AzureSdk/ClientCore.cs
@@ -17,6 +17,7 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.SemanticKernel.ChatCompletion;
using Microsoft.SemanticKernel.Contents;
using Microsoft.SemanticKernel.Http;

#pragma warning disable CA2208 // Instantiate argument exceptions correctly
@@ -197,6 +198,16 @@ internal async IAsyncEnumerable<StreamingTextContent> GetStreamingTextContentsAsync(
};
}

private static Dictionary<string, object?> GetResponseMetadata(AudioTranscription audioTranscription)
{
return new Dictionary<string, object?>(3)
{
{ nameof(audioTranscription.Language), audioTranscription.Language },
{ nameof(audioTranscription.Duration), audioTranscription.Duration },
{ nameof(audioTranscription.Segments), audioTranscription.Segments }
};
}

/// <summary>
/// Generates an embedding from the given <paramref name="data"/>.
/// </summary>
@@ -230,6 +241,33 @@ internal async Task<IList<ReadOnlyMemory<float>>> GetEmbeddingsAsync(
return result;
}

internal async Task<TextContent> GetTextContentFromAudioAsync(
AudioContent content,
PromptExecutionSettings? executionSettings,
CancellationToken cancellationToken)
{
Verify.NotNull(content.Data);

OpenAIAudioToTextExecutionSettings? audioExecutionSettings = OpenAIAudioToTextExecutionSettings.FromExecutionSettings(executionSettings);

Verify.ValidFilename(audioExecutionSettings?.Filename);

var audioOptions = new AudioTranscriptionOptions
{
AudioData = content.Data,
DeploymentName = this.DeploymentOrModelName,
Filename = audioExecutionSettings.Filename,
Language = audioExecutionSettings.Language,
Prompt = audioExecutionSettings.Prompt,
ResponseFormat = audioExecutionSettings.ResponseFormat,
Temperature = audioExecutionSettings.Temperature
};

AudioTranscription responseData = (await RunRequestAsync(() => this.Client.GetAudioTranscriptionAsync(audioOptions, cancellationToken)).ConfigureAwait(false)).Value;

return new TextContent(responseData.Text, this.DeploymentOrModelName, metadata: GetResponseMetadata(responseData));
}

/// <summary>
/// Generate a new chat message
/// </summary>
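Because `GetResponseMetadata` above copies `Language`, `Duration`, and `Segments` into the result, a caller can read them back from the returned `TextContent`. A sketch, reusing the `service`, `audio`, and `settings` from the earlier example:

```csharp
TextContent result = await service.GetTextContentAsync(audio, settings);

Console.WriteLine(result.Text);

// Metadata keys match the nameof(...) entries in GetResponseMetadata.
if (result.Metadata is not null)
{
    Console.WriteLine($"Language: {result.Metadata["Language"]}");
    Console.WriteLine($"Duration: {result.Metadata["Duration"]}");
}
```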
@@ -6,7 +6,7 @@
<RootNamespace>$(AssemblyName)</RootNamespace>
<TargetFramework>netstandard2.0</TargetFramework>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<NoWarn>$(NoWarn);NU5104;SKEXP0013,SKEXP0014</NoWarn>
<NoWarn>$(NoWarn);NU5104;SKEXP0005,SKEXP0013,SKEXP0014</NoWarn>
<EnablePackageValidation>true</EnablePackageValidation>
</PropertyGroup>
