Python: #6761 Onnx Connector #8106

Merged
merged 67 commits on Oct 10, 2024
The changes shown below are from all 67 commits.

Commits
ff979ba
setup for onnx connector
nmoeller Aug 14, 2024
49a2a72
initial implementation commit
nmoeller Aug 16, 2024
b2c5a70
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Aug 16, 2024
342db1d
initial unit tests for onnx text completion
nmoeller Aug 19, 2024
9c371de
Merge branch 'issue-6761-ONNX-gen-ai-Connector' of https://github.com…
nmoeller Aug 19, 2024
adf262b
added chat completion support
nmoeller Aug 21, 2024
a40a6cb
added small comment regarding Image Opening
nmoeller Aug 22, 2024
fd6d9b4
Merge remote-tracking branch 'origin/main' into issue-6761-ONNX-gen-a…
nmoeller Sep 2, 2024
b118396
migrated to uv
nmoeller Sep 2, 2024
0da7615
Merge remote-tracking branch 'origin/main' into issue-6761-ONNX-gen-a…
nmoeller Sep 12, 2024
0b6df05
Merge remote-tracking branch 'origin/main' into issue-6761-ONNX-gen-a…
nmoeller Sep 12, 2024
3c41141
added unit tests and integration tests
nmoeller Sep 17, 2024
81bf663
added unit tests and integration tests
nmoeller Sep 18, 2024
bd21157
Merge remote-tracking branch 'origin/main' into issue-6761-ONNX-gen-a…
nmoeller Sep 18, 2024
21f585f
Update python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai…
nmoeller Sep 19, 2024
58702f3
Update python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai…
nmoeller Sep 19, 2024
8908cb9
Update python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai…
nmoeller Sep 19, 2024
af15de6
Update python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai…
nmoeller Sep 19, 2024
14128e2
integrated pr feedback
nmoeller Sep 19, 2024
352dede
Merge branch 'issue-6761-ONNX-gen-ai-Connector' of https://github.com…
nmoeller Sep 19, 2024
9471cbb
integrated pr feedback
nmoeller Sep 19, 2024
49514d6
Merge remote-tracking branch 'origin/main' into issue-6761-ONNX-gen-a…
nmoeller Sep 20, 2024
322afd8
integrated new template architecture
nmoeller Sep 20, 2024
7c31645
adjusted default max length for int test with image
nmoeller Sep 20, 2024
477db50
fixed documentation in samples
nmoeller Sep 20, 2024
030254a
fixed docstring for chat completion
nmoeller Sep 20, 2024
627f0b7
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Sep 20, 2024
e86db81
Update python/semantic_kernel/connectors/ai/onnx/onnx_utils.py
nmoeller Sep 21, 2024
cdb18ba
implemented pr feedback
nmoeller Sep 21, 2024
3098efb
Update python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai…
nmoeller Sep 24, 2024
c5d6412
Update python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai…
nmoeller Sep 24, 2024
3f6a0ca
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Sep 24, 2024
434f63f
Implemented PR Feedback
nmoeller Sep 24, 2024
10eb683
Merge branch 'issue-6761-ONNX-gen-ai-Connector' of https://github.com…
nmoeller Sep 24, 2024
c071761
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Sep 24, 2024
6f2a480
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Sep 25, 2024
59550a2
implemented multiple choices and PR Feedback
nmoeller Sep 26, 2024
8d02aee
Merge branch 'issue-6761-ONNX-gen-ai-Connector' of https://github.com…
nmoeller Sep 26, 2024
593d757
implemented chat_model and text_model env vars
nmoeller Sep 27, 2024
86f5ed2
Merge remote-tracking branch 'origin/main' into issue-6761-ONNX-gen-a…
nmoeller Sep 27, 2024
4c0c5c5
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Sep 27, 2024
8ec2fe4
Update python/samples/concepts/README.md
nmoeller Sep 28, 2024
673d446
simplified test setup
nmoeller Sep 28, 2024
0b7d49e
Merge branch 'issue-6761-ONNX-gen-ai-Connector' of https://github.com…
nmoeller Sep 28, 2024
ed82d0c
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Sep 30, 2024
9db1f41
Adjusted Samples to new Settings
nmoeller Sep 30, 2024
86fc385
Merge branch 'issue-6761-ONNX-gen-ai-Connector' of https://github.com…
nmoeller Sep 30, 2024
1affb38
prevent onnx runtime to install on mac, fix ai_model_id
nmoeller Oct 2, 2024
8438a27
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Oct 2, 2024
14259b1
skipping tests for mac os with pytest.importorskip
nmoeller Oct 9, 2024
c0cf251
Merge branch 'issue-6761-ONNX-gen-ai-Connector' of https://github.com…
nmoeller Oct 9, 2024
c41759a
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Oct 9, 2024
54134cc
skipping unit tests on mac os
nmoeller Oct 9, 2024
b299ff9
Merge branch 'issue-6761-ONNX-gen-ai-Connector' of https://github.com…
nmoeller Oct 9, 2024
9a3f038
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Oct 9, 2024
9dae3f4
added mac os skip to onnx prompt settings
nmoeller Oct 9, 2024
50813fc
Merge branch 'issue-6761-ONNX-gen-ai-Connector' of https://github.com…
nmoeller Oct 9, 2024
04545bb
fixed missing constructure variables in tests
nmoeller Oct 9, 2024
eaad5b5
added FilePath option to Binary Content
nmoeller Oct 9, 2024
7eea22d
skipping tests on class level
nmoeller Oct 9, 2024
751f07f
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Oct 9, 2024
d7b6ccf
skip int tests for mac
nmoeller Oct 10, 2024
25afc3e
Merge branch 'issue-6761-ONNX-gen-ai-Connector' of https://github.com…
nmoeller Oct 10, 2024
2b73d34
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
nmoeller Oct 10, 2024
a55ff90
downgraded PyMongo
nmoeller Oct 10, 2024
3cb4a2d
Merge branch 'issue-6761-ONNX-gen-ai-Connector' of https://github.com…
nmoeller Oct 10, 2024
3303f93
Merge branch 'main' into issue-6761-ONNX-gen-ai-Connector
moonbox3 Oct 10, 2024
6 changes: 5 additions & 1 deletion python/pyproject.toml
@@ -73,7 +73,8 @@ hugging_face = [
     "torch == 2.4.1"
 ]
 mongo = [
-    "motor >= 3.3.2,< 3.7.0"
+    "pymongo >= 4.8.0, < 4.9",
+    "motor >= 3.3.2,< 3.6.0"
 ]
 notebooks = [
     "ipykernel ~= 6.29"
 ]
@@ -88,6 +89,9 @@ mistralai = [
 ollama = [
     "ollama ~= 0.2"
 ]
+onnx = [
+    "onnxruntime-genai ~= 0.4; platform_system != 'Darwin'"
+]
 anthropic = [
     "anthropic ~= 0.32"
 ]
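The new `onnx` extra keeps the runtime dependency opt-in and excludes macOS via the environment marker. A minimal sketch of how a consumer might install and guard for this; the `pip install "semantic-kernel[onnx]"` invocation and the guard pattern are illustrative, not part of this PR, and the module name `onnxruntime_genai` is assumed from the package name:

# Illustrative guard for the optional dependency added above.
# Install with: pip install "semantic-kernel[onnx]"
# The marker "platform_system != 'Darwin'" means the package never installs on macOS.
import platform

try:
    import onnxruntime_genai  # noqa: F401  (module name assumed from the package name)

    ONNX_AVAILABLE = True
except ImportError:
    ONNX_AVAILABLE = False

if platform.system() == "Darwin" or not ONNX_AVAILABLE:
    print("onnxruntime-genai is unavailable; skip the Onnx connector on this platform.")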
2 changes: 1 addition & 1 deletion python/samples/concepts/README.md
@@ -10,7 +10,7 @@ This section contains code snippets that demonstrate the usage of Semantic Kerne
 | Filtering | Creating and using Filters |
 | Functions | Invoking [`Method`](https://github.com/microsoft/semantic-kernel/blob/main/python/semantic_kernel/functions/kernel_function_from_method.py) or [`Prompt`](https://github.com/microsoft/semantic-kernel/blob/main/python/semantic_kernel/functions/kernel_function_from_prompt.py) functions with [`Kernel`](https://github.com/microsoft/semantic-kernel/blob/main/python/semantic_kernel/kernel.py) |
 | Grounding | An example of how to perform LLM grounding |
-| Local Models | Using the [`OpenAI connector`](https://github.com/microsoft/semantic-kernel/blob/main/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_chat_completion.py) to talk to models hosted locally in Ollama and LM Studio |
+| Local Models | Using the [`OpenAI connector`](https://github.com/microsoft/semantic-kernel/blob/main/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_chat_completion.py) and [`OnnxGenAI connector`](https://github.com/microsoft/semantic-kernel/blob/main/python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai_chat_completion.py) to talk to models hosted locally in Ollama, OnnxGenAI and LM Studio |
 | Logging | Showing how to set up logging |
 | Memory | Using [`Memory`](https://github.com/microsoft/semantic-kernel/tree/main/dotnet/src/SemanticKernel.Abstractions/Memory) AI concepts |
 | Model-as-a-Service | Using models deployed as [`serverless APIs on Azure AI Studio`](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-serverless?tabs=azure-ai-studio) to benchmark model performance against open-source datasets |
75 changes: 75 additions & 0 deletions python/samples/concepts/local_models/onnx_chat_completion.py
@@ -0,0 +1,75 @@
# Copyright (c) Microsoft. All rights reserved.


import asyncio

from semantic_kernel.connectors.ai.onnx import OnnxGenAIChatCompletion, OnnxGenAIPromptExecutionSettings
from semantic_kernel.contents.chat_history import ChatHistory
from semantic_kernel.kernel import Kernel

# This concept sample shows how to use the Onnx connector
# with a local model running on the ONNX Runtime.

kernel = Kernel()

service_id = "phi3"
#############################################
# Make sure to download an ONNX model
# (https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx)
# If onnxruntime-genai is used:
# use the model stored in /cpu folder
# If onnxruntime-genai-cuda is installed for gpu use:
# use the model stored in /cuda folder
# Then set ONNX_GEN_AI_CHAT_MODEL_FOLDER environment variable to the path to the model folder
#############################################
streaming = True

chat_completion = OnnxGenAIChatCompletion(ai_model_id=service_id, template="phi3")
settings = OnnxGenAIPromptExecutionSettings()

system_message = """You are a helpful assistant."""
chat_history = ChatHistory(system_message=system_message)


async def chat() -> bool:
    try:
        user_input = input("User:> ")
    except KeyboardInterrupt:
        print("\n\nExiting chat...")
        return False
    except EOFError:
        print("\n\nExiting chat...")
        return False

    if user_input == "exit":
        print("\n\nExiting chat...")
        return False
    chat_history.add_user_message(user_input)
    if streaming:
        print("Mosscap:> ", end="")
        message = ""
        async for chunk in chat_completion.get_streaming_chat_message_content(
            chat_history=chat_history, settings=settings, kernel=kernel
        ):
            if chunk:
                print(str(chunk), end="")
                message += str(chunk)
        chat_history.add_assistant_message(message)
        print("")
    else:
        answer = await chat_completion.get_chat_message_content(
            chat_history=chat_history, settings=settings, kernel=kernel
        )
        print(f"Mosscap:> {answer}")
        chat_history.add_message(answer)
    return True


async def main() -> None:
    chatting = True
    while chatting:
        chatting = await chat()


if __name__ == "__main__":
    asyncio.run(main())
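For readers who want the connector's API surface without the interactive loop above, here is a minimal one-shot sketch. The model name, template, and question are illustrative, and it assumes the ONNX_GEN_AI_CHAT_MODEL_FOLDER environment variable points at a downloaded Phi-3 ONNX model, as in the sample above:

import asyncio

from semantic_kernel.connectors.ai.onnx import OnnxGenAIChatCompletion, OnnxGenAIPromptExecutionSettings
from semantic_kernel.contents.chat_history import ChatHistory


async def one_shot() -> None:
    # Same constructor arguments as the sample above; the model folder comes from the env variable.
    chat_completion = OnnxGenAIChatCompletion(ai_model_id="phi3", template="phi3")
    history = ChatHistory(system_message="You are a helpful assistant.")
    history.add_user_message("Summarize what ONNX Runtime GenAI does in one sentence.")
    answer = await chat_completion.get_chat_message_content(
        chat_history=history, settings=OnnxGenAIPromptExecutionSettings()
    )
    print(answer)


if __name__ == "__main__":
    asyncio.run(one_shot())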
@@ -0,0 +1,91 @@
# Copyright (c) Microsoft. All rights reserved.


import asyncio

from semantic_kernel.connectors.ai.onnx import OnnxGenAIChatCompletion, OnnxGenAIPromptExecutionSettings
from semantic_kernel.contents import AuthorRole, ChatHistory, ChatMessageContent, ImageContent
from semantic_kernel.kernel import Kernel

# This concept sample shows how to use the Onnx connector
# with a local vision model running on the ONNX Runtime.

kernel = Kernel()

service_id = "phi3"
#############################################
# Make sure to download an ONNX model
# If onnxruntime-genai is used:
# (https://huggingface.co/microsoft/Phi-3-vision-128k-instruct-onnx-cpu)
# If onnxruntime-genai-cuda is installed for gpu use:
# (https://huggingface.co/microsoft/Phi-3-vision-128k-instruct-onnx-gpu)
# Then set ONNX_GEN_AI_CHAT_MODEL_FOLDER environment variable to the path to the model folder
#############################################
streaming = True

chat_completion = OnnxGenAIChatCompletion(ai_model_id=service_id, template="phi3v")

# The max_length property is important for RAM allocation:
# if the value is too big, you may run out of memory;
# if the value is too small, your input length is limited.
settings = OnnxGenAIPromptExecutionSettings(max_length=4096)

system_message = """
You are a helpful assistant.
You know about provided images and the history of the conversation.
"""
chat_history = ChatHistory(system_message=system_message)


async def chat() -> bool:
    try:
        user_input = input("User:> ")
    except KeyboardInterrupt:
        print("\n\nExiting chat...")
        return False
    except EOFError:
        print("\n\nExiting chat...")
        return False

    if user_input == "exit":
        print("\n\nExiting chat...")
        return False
    chat_history.add_user_message(user_input)
    if streaming:
        print("Mosscap:> ", end="")
        message = ""
        async for chunk in chat_completion.get_streaming_chat_message_content(
            chat_history=chat_history, settings=settings, kernel=kernel
        ):
            print(chunk.content, end="")
            if chunk.content:
                message += chunk.content
        chat_history.add_assistant_message(message)
        print("")
    else:
        answer = await chat_completion.get_chat_message_content(
            chat_history=chat_history, settings=settings, kernel=kernel
        )
        print(f"Mosscap:> {answer}")
        chat_history.add_message(answer)
    return True


async def main() -> None:
    chatting = True
    image_path = input("Image Path (leave empty if no image): ")
    if image_path:
        chat_history.add_message(
            ChatMessageContent(
                role=AuthorRole.USER,
                items=[
                    ImageContent.from_image_path(image_path=image_path),
                ],
            ),
        )
    while chatting:
        chatting = await chat()


if __name__ == "__main__":
    asyncio.run(main())
76 changes: 76 additions & 0 deletions python/samples/concepts/local_models/onnx_text_completion.py
@@ -0,0 +1,76 @@
# Copyright (c) Microsoft. All rights reserved.


import asyncio

from semantic_kernel.connectors.ai.onnx import OnnxGenAITextCompletion
from semantic_kernel.functions.kernel_arguments import KernelArguments
from semantic_kernel.kernel import Kernel

# This concept sample shows how to use the Onnx connector
# with a local text completion model running on the ONNX Runtime.

kernel = Kernel()

service_id = "phi3"
#############################################
# Make sure to download an ONNX model
# (https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx)
# If onnxruntime-genai is used:
# use the model stored in /cpu folder
# If onnxruntime-genai-cuda is installed for gpu use:
# use the model stored in /cuda folder
# Then set ONNX_GEN_AI_TEXT_MODEL_FOLDER environment variable to the path to the model folder
#############################################
streaming = True

kernel.add_service(OnnxGenAITextCompletion(ai_model_id=service_id))

settings = kernel.get_prompt_execution_settings_from_service_id(service_id)

# The Phi-3 model uses a chat template to generate responses;
# with the chat template, the model better understands
# the context and the roles of the conversation.
# https://huggingface.co/microsoft/Phi-3-mini-4k-instruct#chat-format
chat_function = kernel.add_function(
    plugin_name="ChatBot",
    function_name="Chat",
    prompt="<|user|>{{$user_input}}<|end|><|assistant|>",
    template_format="semantic-kernel",
    prompt_execution_settings=settings,
)


async def chat() -> bool:
    try:
        user_input = input("User:> ")
    except KeyboardInterrupt:
        print("\n\nExiting chat...")
        return False
    except EOFError:
        print("\n\nExiting chat...")
        return False

    if user_input == "exit":
        print("\n\nExiting chat...")
        return False

    if streaming:
        print("Mosscap:> ", end="")
        async for chunk in kernel.invoke_stream(chat_function, KernelArguments(user_input=user_input)):
            print(chunk[0].text, end="")
        print("\n")
    else:
        answer = await kernel.invoke(chat_function, KernelArguments(user_input=user_input))
        print(f"Mosscap:> {answer}")
    return True


async def main() -> None:
    chatting = True
    while chatting:
        chatting = await chat()


if __name__ == "__main__":
    asyncio.run(main())
@@ -68,6 +68,9 @@ def __init__(
         except ValidationError as ex:
             raise ServiceInitializationError("Failed to create Ollama settings.", ex) from ex

+        if not ollama_settings.model:
+            raise ServiceInitializationError("Please provide an ai_model_id or set the OLLAMA_MODEL env variable.")
+
         super().__init__(
             service_id=service_id or ollama_settings.model,
             ai_model_id=ollama_settings.model,
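The added guard turns a missing model configuration into an immediate, descriptive failure. A hypothetical repro, assuming the service in this diff is one of the Ollama services such as OllamaChatCompletion (the filename was not captured on this page):

from semantic_kernel.connectors.ai.ollama import OllamaChatCompletion
from semantic_kernel.exceptions import ServiceInitializationError

try:
    # With no ai_model_id argument and no OLLAMA_MODEL env variable set,
    # initialization now fails fast instead of surfacing a confusing error later.
    service = OllamaChatCompletion()
except ServiceInitializationError as ex:
    print(f"Initialization failed: {ex}")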
9 changes: 9 additions & 0 deletions python/semantic_kernel/connectors/ai/onnx/__init__.py
@@ -0,0 +1,9 @@
# Copyright (c) Microsoft. All rights reserved.

from semantic_kernel.connectors.ai.onnx.onnx_gen_ai_prompt_execution_settings import (
    OnnxGenAIPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.onnx.services.onnx_gen_ai_chat_completion import OnnxGenAIChatCompletion
from semantic_kernel.connectors.ai.onnx.services.onnx_gen_ai_text_completion import OnnxGenAITextCompletion

__all__ = ['OnnxGenAIChatCompletion', 'OnnxGenAIPromptExecutionSettings', 'OnnxGenAITextCompletion']
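Given the `__all__` above, downstream code can import the whole public surface from the package root; a small sketch:

from semantic_kernel.connectors.ai.onnx import (
    OnnxGenAIChatCompletion,
    OnnxGenAIPromptExecutionSettings,
    OnnxGenAITextCompletion,
)

# All three names resolve from the package root rather than from the services submodule.
print(OnnxGenAIChatCompletion, OnnxGenAITextCompletion, OnnxGenAIPromptExecutionSettings)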
@@ -0,0 +1,25 @@
# Copyright (c) Microsoft. All rights reserved.


from pydantic import Field

from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings


class OnnxGenAIPromptExecutionSettings(PromptExecutionSettings):
    """OnnxGenAI prompt execution settings."""

    diversity_penalty: float | None = Field(None, ge=0.0, le=1.0)
    do_sample: bool = False
    early_stopping: bool = True
    length_penalty: float | None = Field(None, ge=0.0, le=1.0)
    max_length: int = Field(3072, gt=0)
    min_length: int | None = Field(None, gt=0)
    no_repeat_ngram_size: int = 0
    num_beams: int | None = Field(None, gt=0)
    num_return_sequences: int | None = Field(None, gt=0)
    past_present_share_buffer: bool = True
    repetition_penalty: float | None = Field(None, ge=0.0, le=1.0)
    temperature: float | None = Field(None, ge=0.0, le=2.0)
    top_k: int | None = Field(None, gt=0)
    top_p: float | None = Field(None, ge=0.0, le=1.0)
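The Field constraints above are enforced by pydantic at construction time; a short sketch with illustrative values:

from pydantic import ValidationError

from semantic_kernel.connectors.ai.onnx import OnnxGenAIPromptExecutionSettings

# Sampling-style settings; the values are illustrative.
settings = OnnxGenAIPromptExecutionSettings(
    do_sample=True,
    temperature=0.7,  # allowed range per the Field above: 0.0 <= temperature <= 2.0
    top_p=0.9,        # allowed range: 0.0 <= top_p <= 1.0
    max_length=2048,  # must be > 0; bounds the token budget (see the vision sample's note on RAM)
)

# Out-of-range values are rejected at construction time.
try:
    OnnxGenAIPromptExecutionSettings(temperature=3.0)
except ValidationError as ex:
    print(f"Rejected: {ex.errors()[0]['msg']}")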
@@ -0,0 +1,23 @@
# Copyright (c) Microsoft. All rights reserved.

from typing import ClassVar

from semantic_kernel.kernel_pydantic import KernelBaseSettings


class OnnxGenAISettings(KernelBaseSettings):
    """Onnx Gen AI model settings.

    The settings are first loaded from environment variables with the prefix 'ONNX_GEN_AI_'. If the
    environment variables are not found, the settings can be loaded from a .env file with the
    encoding 'utf-8'. If the settings are not found in the .env file, they are ignored;
    however, validation will fail, alerting you that the settings are missing.

    Optional settings for prefix 'ONNX_GEN_AI_' are:
    - chat_model_folder: Path to the Onnx chat model folder (ENV: ONNX_GEN_AI_CHAT_MODEL_FOLDER).
    - text_model_folder: Path to the Onnx text model folder (ENV: ONNX_GEN_AI_TEXT_MODEL_FOLDER).
    """

    env_prefix: ClassVar[str] = "ONNX_GEN_AI_"
    chat_model_folder: str | None = None
    text_model_folder: str | None = None
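A hedged sketch of the intended configuration flow, tying these settings back to the samples above. The paths are placeholders; in practice you would set the variables in the shell or in a .env file rather than in code:

import os

# Placeholder paths; point these at the folder produced by downloading an ONNX model
# (for example the /cpu variant of Phi-3-mini-4k-instruct-onnx).
os.environ["ONNX_GEN_AI_CHAT_MODEL_FOLDER"] = "/models/phi3/cpu"
os.environ["ONNX_GEN_AI_TEXT_MODEL_FOLDER"] = "/models/phi3/cpu"

# Imported after setting the env variables so the settings class picks them up.
from semantic_kernel.connectors.ai.onnx import OnnxGenAIChatCompletion

# With ONNX_GEN_AI_CHAT_MODEL_FOLDER set, no explicit model folder argument is needed.
chat_service = OnnxGenAIChatCompletion(ai_model_id="phi3", template="phi3")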
Empty file.