From 69c7a6a61bb41d7bae29a0c6276717638d00a4bc Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Wed, 18 Dec 2024 19:39:16 +0900 Subject: [PATCH 1/7] Include a function_invoke_attempt index with Streaming CMC. Update tests and samples. --- .../anthropic_api_function_calling.py | 206 -------------- .../chat_completion_with_function_calling.py | 257 ++++++++++++++++++ .../chat_gpt_api_function_calling.py | 207 -------------- .../chat_mistral_ai_api_function_calling.py | 215 --------------- .../services/anthropic_chat_completion.py | 13 +- .../azure_ai_inference_chat_completion.py | 7 +- .../services/bedrock_chat_completion.py | 8 +- .../ai/chat_completion_client_base.py | 13 +- .../connectors/ai/function_calling_utils.py | 12 +- .../services/google_ai_chat_completion.py | 9 +- .../services/vertex_ai_chat_completion.py | 9 +- .../services/mistral_ai_chat_completion.py | 7 +- .../ollama/services/ollama_chat_completion.py | 15 +- .../services/onnx_gen_ai_chat_completion.py | 9 +- .../open_ai/services/azure_chat_completion.py | 3 +- .../services/open_ai_chat_completion_base.py | 6 +- .../streaming_chat_message_content.py | 36 ++- python/tests/samples/test_concepts.py | 8 +- 18 files changed, 377 insertions(+), 663 deletions(-) delete mode 100644 python/samples/concepts/auto_function_calling/anthropic_api_function_calling.py create mode 100644 python/samples/concepts/auto_function_calling/chat_completion_with_function_calling.py delete mode 100644 python/samples/concepts/auto_function_calling/chat_gpt_api_function_calling.py delete mode 100644 python/samples/concepts/auto_function_calling/chat_mistral_ai_api_function_calling.py diff --git a/python/samples/concepts/auto_function_calling/anthropic_api_function_calling.py b/python/samples/concepts/auto_function_calling/anthropic_api_function_calling.py deleted file mode 100644 index 5769943157db..000000000000 --- a/python/samples/concepts/auto_function_calling/anthropic_api_function_calling.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. - -import asyncio -import os -from functools import reduce -from typing import TYPE_CHECKING - -from semantic_kernel import Kernel -from semantic_kernel.connectors.ai.anthropic import AnthropicChatCompletion, AnthropicChatPromptExecutionSettings -from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior -from semantic_kernel.contents import ChatHistory -from semantic_kernel.contents.chat_message_content import ChatMessageContent -from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent -from semantic_kernel.contents.utils.author_role import AuthorRole -from semantic_kernel.core_plugins.math_plugin import MathPlugin -from semantic_kernel.core_plugins.time_plugin import TimePlugin -from semantic_kernel.functions import KernelArguments - -if TYPE_CHECKING: - from semantic_kernel.functions import KernelFunction - - -system_message = """ -You are a chat bot. Your name is Mosscap and -you have one goal: figure out what people need. -Your full name, should you need to know it, is -Splendid Speckled Mosscap. You communicate -effectively, but you tend to answer with long -flowery prose. You are also a math wizard, -especially for adding and subtracting. -You also excel at joke telling, where your tone is often sarcastic. -Once you have the answer I am looking for, -you will return a full answer to me as soon as possible. 
-""" - -# This concept example shows how to handle both streaming and non-streaming responses -# To toggle the behavior, set the following flag accordingly: -stream = False - -kernel = Kernel() - -# Note: the underlying model needs to support function calling. -# https://docs.anthropic.com/en/docs/build-with-claude/tool-use#choosing-a-model -kernel.add_service(AnthropicChatCompletion(service_id="chat", ai_model_id="claude-3-opus-20240229")) - -plugins_directory = os.path.join(__file__, "../../../../../prompt_template_samples/") -# adding plugins to the kernel -kernel.add_plugin(MathPlugin(), plugin_name="math") -kernel.add_plugin(TimePlugin(), plugin_name="time") - -chat_function = kernel.add_function( - prompt="{{$chat_history}}{{$user_input}}", - plugin_name="ChatBot", - function_name="Chat", -) - -# Enabling or disabling function calling is done by setting the `function_choice_behavior` attribute for the -# prompt execution settings. When the function_call parameter is set to "auto" the model will decide which -# function to use, if any. -# -# There are two ways to define the `function_choice_behavior` parameter: -# 1. Using the type string as `"auto"` or `"required"`. For example: -# configure `function_choice_behavior="auto"` parameter directly in the execution settings. -# 2. Using the FunctionChoiceBehavior class. For example: -# `function_choice_behavior=FunctionChoiceBehavior.Auto()`. -# Both of these configure the `auto` tool_choice and all of the available plugins/functions -# registered on the kernel. If you want to limit the available plugins/functions, you must -# configure the `filters` dictionary attribute for each type of function choice behavior. -# For example: -# -# from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior - -# function_choice_behavior = FunctionChoiceBehavior.Auto( -# filters={"included_functions": ["time-date", "time-time", "math-Add"]} -# ) -# -# The filters attribute allows you to specify either: `included_functions`, `excluded_functions`, -# `included_plugins`, or `excluded_plugins`. - -execution_settings = AnthropicChatPromptExecutionSettings( - service_id="chat", - max_tokens=2000, - temperature=0.7, - top_p=0.8, - function_choice_behavior=FunctionChoiceBehavior.Auto(auto_invoke=True), -) - -history = ChatHistory() - -history.add_system_message(system_message) -history.add_user_message("Hi there, who are you?") -history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") - -arguments = KernelArguments(settings=execution_settings) - - -def print_tool_calls(message: ChatMessageContent) -> None: - # A helper method to pretty print the tool calls from the message. - # This is only triggered if auto invoke tool calls is disabled. 
- items = message.items - formatted_tool_calls = [] - for i, item in enumerate(items, start=1): - if isinstance(item, FunctionCallContent): - tool_call_id = item.id - function_name = item.name - function_arguments = item.arguments - formatted_str = ( - f"tool_call {i} id: {tool_call_id}\n" - f"tool_call {i} function name: {function_name}\n" - f"tool_call {i} arguments: {function_arguments}" - ) - formatted_tool_calls.append(formatted_str) - if len(formatted_tool_calls) > 0: - print("Tool calls:\n" + "\n\n".join(formatted_tool_calls)) - else: - print("The model used its own knowledge and didn't return any tool calls.") - - -async def handle_streaming( - kernel: Kernel, - chat_function: "KernelFunction", - arguments: KernelArguments, -) -> str | None: - response = kernel.invoke_stream( - chat_function, - return_function_results=False, - arguments=arguments, - ) - - print("Mosscap:> ", end="") - streamed_chunks: list[StreamingChatMessageContent] = [] - result_content = [] - async for message in response: - if ( - not execution_settings.function_choice_behavior.auto_invoke_kernel_functions - and isinstance(message[0], StreamingChatMessageContent) - and message[0].role == AuthorRole.ASSISTANT - ): - streamed_chunks.append(message[0]) - elif isinstance(message[0], StreamingChatMessageContent) and message[0].role == AuthorRole.ASSISTANT: - result_content.append(message[0]) - print(str(message[0]), end="") - - if streamed_chunks: - streaming_chat_message = reduce(lambda first, second: first + second, streamed_chunks) - if hasattr(streaming_chat_message, "content"): - print(streaming_chat_message.content) - print("Auto tool calls is disabled, printing returned tool calls...") - print_tool_calls(streaming_chat_message) - - print("\n") - if result_content: - return "".join([str(content) for content in result_content]) - return None - - -async def chat() -> bool: - try: - user_input = input("User:> ") - except KeyboardInterrupt: - print("\n\nExiting chat...") - return False - except EOFError: - print("\n\nExiting chat...") - return False - - if user_input == "exit": - print("\n\nExiting chat...") - return False - arguments["user_input"] = user_input - arguments["chat_history"] = history - - if stream: - result = await handle_streaming(kernel, chat_function, arguments=arguments) - else: - result = await kernel.invoke(chat_function, arguments=arguments) - - # If tools are used, and auto invoke tool calls is False, the response will be of type - # ChatMessageContent with information about the tool calls, which need to be sent - # back to the model to get the final response. - function_calls = [item for item in result.value[-1].items if isinstance(item, FunctionCallContent)] - if not execution_settings.function_choice_behavior.auto_invoke_kernel_functions and len(function_calls) > 0: - print_tool_calls(result.value[0]) - return True - - print(f"Mosscap:> {result}") - - history.add_user_message(user_input) - history.add_assistant_message(str(result)) - return True - - -async def main() -> None: - chatting = True - print( - "Welcome to the chat bot!\ - \n Type 'exit' to exit.\ - \n Try a math question to see the function calling in action (i.e. what is 3+3?)." 
- ) - while chatting: - chatting = await chat() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling.py b/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling.py new file mode 100644 index 000000000000..445106e222cb --- /dev/null +++ b/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling.py @@ -0,0 +1,257 @@ +# Copyright (c) Microsoft. All rights reserved. + +import asyncio +from functools import reduce +from typing import TYPE_CHECKING + +from samples.concepts.setup.chat_completion_services import Services, get_chat_completion_service_and_request_settings +from semantic_kernel import Kernel +from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior +from semantic_kernel.contents import ChatHistory +from semantic_kernel.contents.chat_message_content import ChatMessageContent +from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent +from semantic_kernel.contents.utils.author_role import AuthorRole +from semantic_kernel.core_plugins.math_plugin import MathPlugin +from semantic_kernel.core_plugins.time_plugin import TimePlugin +from semantic_kernel.functions import KernelArguments + +if TYPE_CHECKING: + from semantic_kernel.functions import KernelFunction + +# System message defining the behavior and persona of the chat bot. +system_message = """ +You are a chat bot. Your name is Mosscap and +you have one goal: figure out what people need. +Your full name, should you need to know it, is +Splendid Speckled Mosscap. You communicate +effectively, but you tend to answer with long +flowery prose. You are also a math wizard, +especially for adding and subtracting. +You also excel at joke telling, where your tone is often sarcastic. +Once you have the answer I am looking for, +you will return a full answer to me as soon as possible. +""" + +# Toggle this flag to switch between streaming and non-streaming modes. +stream = True + +# Create and configure the kernel. +kernel = Kernel() + +# Load some sample plugins (for demonstration of function calling). +kernel.add_plugin(MathPlugin(), plugin_name="math") +kernel.add_plugin(TimePlugin(), plugin_name="time") + +# Define a chat function (a template for how to handle user input). +chat_function = kernel.add_function( + prompt="{{$chat_history}}{{$user_input}}", + plugin_name="ChatBot", + function_name="Chat", +) + +# You can select from the following chat completion services that support function calling: +# - Services.OPENAI +# - Services.AZURE_OPENAI +# - Services.AZURE_AI_INFERENCE +# - Services.ANTHROPIC +# - Services.BEDROCK +# - Services.GOOGLE_AI +# - Services.MISTRAL_AI +# - Services.OLLAMA +# - Services.ONNX +# - Services.VERTEX_AI +# Please make sure you have configured your environment correctly for the selected chat completion service. +chat_completion_service, request_settings = get_chat_completion_service_and_request_settings(Services.AZURE_OPENAI) + +# Configure the function choice behavior. Here, we set it to Auto with auto_invoke=True. +# - If `auto_invoke=True`, the model will automatically choose and call functions as needed. +# - If `auto_invoke=False`, the model may return tool call instructions that you must handle and call manually. 
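# If you want to limit which plugins/functions the model may call, the behavior also accepts a
# `filters` dictionary with `included_functions`, `excluded_functions`, `included_plugins`, or
# `excluded_plugins`. An illustrative sketch (the function names assume the math and time plugins
# registered above):
#
#   request_settings.function_choice_behavior = FunctionChoiceBehavior.Auto(
#       filters={"included_functions": ["time-date", "time-time", "math-Add"]}
#   )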
+request_settings.function_choice_behavior = FunctionChoiceBehavior.Auto(auto_invoke=True) + +kernel.add_service(chat_completion_service) + +# Pass the request settings to the kernel arguments. +arguments = KernelArguments(settings=request_settings) + +# Create a chat history to store the system message, initial messages, and the conversation. +history = ChatHistory() +history.add_system_message(system_message) +history.add_user_message("Hi there, who are you?") +history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") + + +def print_tool_calls(message: ChatMessageContent) -> None: + """ + A helper function to pretty print the tool calls found in a ChatMessageContent message. + This is useful when auto tool invocation is disabled and the model returns calls that you must handle. + """ + items = message.items + formatted_tool_calls = [] + for i, item in enumerate(items, start=1): + if isinstance(item, FunctionCallContent): + tool_call_id = item.id + function_name = item.name + function_arguments = item.arguments + formatted_str = ( + f"tool_call {i} id: {tool_call_id}\n" + f"tool_call {i} function name: {function_name}\n" + f"tool_call {i} arguments: {function_arguments}" + ) + formatted_tool_calls.append(formatted_str) + if len(formatted_tool_calls) > 0: + print("\n[Tool calls returned by the model]:\n" + "\n\n".join(formatted_tool_calls)) + else: + print("\n[No tool calls returned by the model]") + + +async def handle_streaming( + kernel: Kernel, + chat_function: "KernelFunction", + arguments: KernelArguments, +) -> str | None: + """ + Handle the streaming response from the model. + This function demonstrates two possible paths: + + 1. When auto function calling is ON (auto_invoke=True): + - The model may call tools automatically and produce a continuous + stream of assistant messages. We can simply print these as they come in. + + 2. When auto function calling is OFF (auto_invoke=False): + - The model may instead return tool call instructions embedded in the stream. + We can track these calls using `function_invoke_attempt` attributes and print + them for the user. The user can then manually invoke the tools and return the results + to the model for further completion. + """ + + response = kernel.invoke_stream( + chat_function, + return_function_results=False, + arguments=arguments, + ) + + # We will differentiate behavior based on whether auto invoking kernel functions is enabled. + auto_invoking = request_settings.function_choice_behavior.auto_invoke_kernel_functions + + print("Mosscap:> ", end="", flush=True) + + # If auto_invoking is False, the model may return separate streaming chunks containing tool instructions. + # We'll store them here. + streamed_tool_chunks: list[StreamingChatMessageContent] = [] + + # For content messages (the final assistant's response text), store them here. + streamed_response_chunks: list[StreamingChatMessageContent] = [] + + async for message in response: + msg = message[0] + + # We only expect assistant messages here. + if not isinstance(msg, StreamingChatMessageContent) or msg.role != AuthorRole.ASSISTANT: + continue + + if auto_invoking: + # When auto invocation is ON, no special handling is needed. Just print out messages as they arrive. + streamed_response_chunks.append(msg) + print(str(msg), end="", flush=True) + else: + # When auto invocation is OFF, the model may send chunks that represent tool calls. + # Chunks that contain function call instructions will have a function_invoke_attempt attribute. 
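                # (`function_invoke_attempt` starts at 0 and is incremented by the base chat
                # completion client each time it automatically re-invokes the model after running
                # tools, so chunks that belong to the same round share the same value.)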
+ if hasattr(msg, "function_invoke_attempt"): + # This chunk is part of a tool call instruction sequence + streamed_tool_chunks.append(msg) + else: + # This chunk is normal assistant response text + streamed_response_chunks.append(msg) + print(str(msg), end="", flush=True) + + print("\n", flush=True) + + # If auto function calling was OFF, handle any tool call instructions we captured. + if not auto_invoking and streamed_tool_chunks: + # Group streamed chunks by `function_invoke_attempt` to handle each invocation attempt separately. + grouped_chunks = {} + for chunk in streamed_tool_chunks: + key = getattr(chunk, "function_invoke_attempt", None) + if key is not None: + grouped_chunks.setdefault(key, []).append(chunk) + + # Process each group of chunks + for attempt, chunks in grouped_chunks.items(): + try: + # Combine all chunks for a given attempt into one message. + combined_content = reduce(lambda first, second: first + second, chunks) + if hasattr(combined_content, "content"): + print(f"[function_invoke_attempt {attempt} content]:\n{combined_content.content}") + + print("[Auto function calling is OFF] Here are the returned tool calls:") + print_tool_calls(combined_content) + except Exception as e: + print(f"Error processing chunks for function_invoke_attempt {attempt}: {e}") + + # Return the final concatenated assistant response (if any). + if streamed_response_chunks: + return "".join([str(content) for content in streamed_response_chunks]) + return None + + +async def chat() -> bool: + """ + Continuously prompt the user for input and show the assistant's response. + Type 'exit' to exit. + """ + try: + user_input = input("User:> ") + except (KeyboardInterrupt, EOFError): + print("\n\nExiting chat...") + return False + + if user_input.lower().strip() == "exit": + print("\n\nExiting chat...") + return False + + arguments["user_input"] = user_input + arguments["chat_history"] = history + + if stream: + # Handle streaming responses + result = await handle_streaming(kernel, chat_function, arguments=arguments) + else: + # Handle non-streaming responses + result = await kernel.invoke(chat_function, arguments=arguments) + + # If function calls are returned and auto invoking is off, we must show them. + if not request_settings.function_choice_behavior.auto_invoke_kernel_functions and result and result.value: + # Extract function calls from the returned content + function_calls = [item for item in result.value[-1].items if isinstance(item, FunctionCallContent)] + if len(function_calls) > 0: + print_tool_calls(result.value[0]) + # At this point, you'd handle these calls manually if desired. + # For now, we just print them. + return True + + # If no function calls to handle, just print the assistant's response + if result: + print(f"Mosscap:> {result}") + + # Update the chat history with the user's input and the assistant's response + if result: + history.add_user_message(user_input) + history.add_assistant_message(str(result)) + + return True + + +async def main() -> None: + print( + "Welcome to the chat bot!\n" + " Type 'exit' to exit.\n" + " Try a math question to see function calling in action (e.g. 'what is 3+3?')." 
+ ) + chatting = True + while chatting: + chatting = await chat() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/concepts/auto_function_calling/chat_gpt_api_function_calling.py b/python/samples/concepts/auto_function_calling/chat_gpt_api_function_calling.py deleted file mode 100644 index 2ced79d2f8be..000000000000 --- a/python/samples/concepts/auto_function_calling/chat_gpt_api_function_calling.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. - -import asyncio -import os -from functools import reduce -from typing import TYPE_CHECKING - -from semantic_kernel import Kernel -from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior -from semantic_kernel.connectors.ai.open_ai import OpenAIChatCompletion, OpenAIChatPromptExecutionSettings -from semantic_kernel.contents import ChatHistory -from semantic_kernel.contents.chat_message_content import ChatMessageContent -from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent -from semantic_kernel.contents.utils.author_role import AuthorRole -from semantic_kernel.core_plugins.math_plugin import MathPlugin -from semantic_kernel.core_plugins.time_plugin import TimePlugin -from semantic_kernel.functions import KernelArguments - -if TYPE_CHECKING: - from semantic_kernel.functions import KernelFunction - - -system_message = """ -You are a chat bot. Your name is Mosscap and -you have one goal: figure out what people need. -Your full name, should you need to know it, is -Splendid Speckled Mosscap. You communicate -effectively, but you tend to answer with long -flowery prose. You are also a math wizard, -especially for adding and subtracting. -You also excel at joke telling, where your tone is often sarcastic. -Once you have the answer I am looking for, -you will return a full answer to me as soon as possible. -""" - -# This concept example shows how to handle both streaming and non-streaming responses -# To toggle the behavior, set the following flag accordingly: -stream = True - -kernel = Kernel() - -# Note: the underlying gpt-35/gpt-4 model version needs to be at least version 0613 to support tools. -kernel.add_service(OpenAIChatCompletion(service_id="chat")) - -plugins_directory = os.path.join(__file__, "../../../../../prompt_template_samples/") -# adding plugins to the kernel -kernel.add_plugin(MathPlugin(), plugin_name="math") -kernel.add_plugin(TimePlugin(), plugin_name="time") - -chat_function = kernel.add_function( - prompt="{{$chat_history}}{{$user_input}}", - plugin_name="ChatBot", - function_name="Chat", -) - -# Enabling or disabling function calling is done by setting the `function_choice_behavior` attribute for the -# prompt execution settings. When the function_call parameter is set to "auto" the model will decide which -# function to use, if any. -# -# There are two ways to define the `function_choice_behavior` parameter: -# 1. Using the type string as `"auto"`, `"required"`, or `"none"`. For example: -# configure `function_choice_behavior="auto"` parameter directly in the execution settings. -# 2. Using the FunctionChoiceBehavior class. For example: -# `function_choice_behavior=FunctionChoiceBehavior.Auto()`. -# Both of these configure the `auto` tool_choice and all of the available plugins/functions -# registered on the kernel. 
If you want to limit the available plugins/functions, you must -# configure the `filters` dictionary attribute for each type of function choice behavior. -# For example: -# -# from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior - -# function_choice_behavior = FunctionChoiceBehavior.Auto( -# filters={"included_functions": ["time-date", "time-time", "math-Add"]} -# ) -# -# The filters attribute allows you to specify either: `included_functions`, `excluded_functions`, -# `included_plugins`, or `excluded_plugins`. - -# Note: the number of responses for auto invoking tool calls is limited to 1. -# If configured to be greater than one, this value will be overridden to 1. -execution_settings = OpenAIChatPromptExecutionSettings( - service_id="chat", - max_tokens=2000, - temperature=0.7, - top_p=0.8, - function_choice_behavior=FunctionChoiceBehavior.Auto(auto_invoke=True), -) - -history = ChatHistory() - -history.add_system_message(system_message) -history.add_user_message("Hi there, who are you?") -history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") - -arguments = KernelArguments(settings=execution_settings) - - -def print_tool_calls(message: ChatMessageContent) -> None: - # A helper method to pretty print the tool calls from the message. - # This is only triggered if auto invoke tool calls is disabled. - items = message.items - formatted_tool_calls = [] - for i, item in enumerate(items, start=1): - if isinstance(item, FunctionCallContent): - tool_call_id = item.id - function_name = item.name - function_arguments = item.arguments - formatted_str = ( - f"tool_call {i} id: {tool_call_id}\n" - f"tool_call {i} function name: {function_name}\n" - f"tool_call {i} arguments: {function_arguments}" - ) - formatted_tool_calls.append(formatted_str) - if len(formatted_tool_calls) > 0: - print("Tool calls:\n" + "\n\n".join(formatted_tool_calls)) - else: - print("The model used its own knowledge and didn't return any tool calls.") - - -async def handle_streaming( - kernel: Kernel, - chat_function: "KernelFunction", - arguments: KernelArguments, -) -> str | None: - response = kernel.invoke_stream( - chat_function, - return_function_results=False, - arguments=arguments, - ) - - print("Mosscap:> ", end="") - streamed_chunks: list[StreamingChatMessageContent] = [] - result_content: list[StreamingChatMessageContent] = [] - async for message in response: - if ( - not execution_settings.function_choice_behavior.auto_invoke_kernel_functions - and isinstance(message[0], StreamingChatMessageContent) - and message[0].role == AuthorRole.ASSISTANT - ): - streamed_chunks.append(message[0]) - elif isinstance(message[0], StreamingChatMessageContent) and message[0].role == AuthorRole.ASSISTANT: - result_content.append(message[0]) - print(str(message[0]), end="") - - if streamed_chunks: - streaming_chat_message = reduce(lambda first, second: first + second, streamed_chunks) - if hasattr(streaming_chat_message, "content"): - print(streaming_chat_message.content) - print("Auto tool calls is disabled, printing returned tool calls...") - print_tool_calls(streaming_chat_message) - - print("\n") - if result_content: - return "".join([str(content) for content in result_content]) - return None - - -async def chat() -> bool: - try: - user_input = input("User:> ") - except KeyboardInterrupt: - print("\n\nExiting chat...") - return False - except EOFError: - print("\n\nExiting chat...") - return False - - if user_input == "exit": - print("\n\nExiting 
chat...") - return False - arguments["user_input"] = user_input - arguments["chat_history"] = history - - if stream: - result = await handle_streaming(kernel, chat_function, arguments=arguments) - else: - result = await kernel.invoke(chat_function, arguments=arguments) - - # If tools are used, and auto invoke tool calls is False, the response will be of type - # ChatMessageContent with information about the tool calls, which need to be sent - # back to the model to get the final response. - function_calls = [item for item in result.value[-1].items if isinstance(item, FunctionCallContent)] - if not execution_settings.function_choice_behavior.auto_invoke_kernel_functions and len(function_calls) > 0: - print_tool_calls(result.value[0]) - return True - - print(f"Mosscap:> {result}") - - history.add_user_message(user_input) - history.add_assistant_message(str(result)) - return True - - -async def main() -> None: - chatting = True - print( - "Welcome to the chat bot!\ - \n Type 'exit' to exit.\ - \n Try a math question to see the function calling in action (i.e. what is 3+3?)." - ) - while chatting: - chatting = await chat() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/python/samples/concepts/auto_function_calling/chat_mistral_ai_api_function_calling.py b/python/samples/concepts/auto_function_calling/chat_mistral_ai_api_function_calling.py deleted file mode 100644 index 5ee05a835e2a..000000000000 --- a/python/samples/concepts/auto_function_calling/chat_mistral_ai_api_function_calling.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved. - -import asyncio -import os -from functools import reduce -from typing import TYPE_CHECKING - -from semantic_kernel import Kernel -from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior -from semantic_kernel.connectors.ai.mistral_ai import MistralAIChatCompletion, MistralAIChatPromptExecutionSettings -from semantic_kernel.contents import ChatHistory -from semantic_kernel.contents.chat_message_content import ChatMessageContent -from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent -from semantic_kernel.contents.utils.author_role import AuthorRole -from semantic_kernel.core_plugins.math_plugin import MathPlugin -from semantic_kernel.core_plugins.time_plugin import TimePlugin -from semantic_kernel.functions import KernelArguments - -if TYPE_CHECKING: - from semantic_kernel.functions import KernelFunction - - -system_message = """ -You are a chat bot. Your name is Mosscap and -you have one goal: figure out what people need. -Your full name, should you need to know it, is -Splendid Speckled Mosscap. You communicate -effectively, but you tend to answer with long -flowery prose. You are also a math wizard, -especially for adding and subtracting. -You also excel at joke telling, where your tone is often sarcastic. -Once you have the answer I am looking for, -you will return a full answer to me as soon as possible. -""" - -# This concept example shows how to handle both streaming and non-streaming responses -# To toggle the behavior, set the following flag accordingly: -stream = True - -kernel = Kernel() - -# Note: the underlying Model must be Mistral Small, Mistral Large, Mixtral 8x22B, Mistral Nemo. -# You can use MISTRALAI_API_KEY and MISTRALAI_CHAT_MODEL_ID environment variables to set the API key and model ID. 
-# Or just set it here in the Constructor for testing -kernel.add_service( - MistralAIChatCompletion( - service_id="chat", - # api_key=XXXXXXX, - # ai_model_id="mistral-large", - ) -) - -plugins_directory = os.path.join(__file__, "../../../../../prompt_template_samples/") -# adding plugins to the kernel -kernel.add_plugin(MathPlugin(), plugin_name="math") -kernel.add_plugin(TimePlugin(), plugin_name="time") - -chat_function = kernel.add_function( - prompt="{{$chat_history}}{{$user_input}}", - plugin_name="ChatBot", - function_name="Chat", -) - -# Enabling or disabling function calling is done by setting the `function_choice_behavior` attribute for the -# prompt execution settings. When the function_call parameter is set to "auto" the model will decide which -# function to use, if any. -# -# There are two ways to define the `function_choice_behavior` parameter: -# 1. Using the type string as `"auto"`, `"required"`, or `"none"`. For example: -# configure `function_choice_behavior="auto"` parameter directly in the execution settings. -# 2. Using the FunctionChoiceBehavior class. For example: -# `function_choice_behavior=FunctionChoiceBehavior.Auto()`. -# Both of these configure the `auto` tool_choice and all of the available plugins/functions -# registered on the kernel. If you want to limit the available plugins/functions, you must -# configure the `filters` dictionary attribute for each type of function choice behavior. -# For example: -# -# from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior - -# function_choice_behavior = FunctionChoiceBehavior.Auto( -# filters={"included_functions": ["time-date", "time-time", "math-Add"]} -# ) -# -# The filters attribute allows you to specify either: `included_functions`, `excluded_functions`, -# `included_plugins`, or `excluded_plugins`. - -# Note: the number of responses for auto invoking tool calls is limited to 1. -# If configured to be greater than one, this value will be overridden to 1. -execution_settings = MistralAIChatPromptExecutionSettings( - service_id="chat", - max_tokens=2000, - temperature=0.7, - top_p=0.8, - function_choice_behavior=FunctionChoiceBehavior.Auto(auto_invoke=True), -) - -history = ChatHistory() - -history.add_system_message(system_message) -history.add_user_message("Hi there, who are you?") -history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") - -arguments = KernelArguments(settings=execution_settings) - - -def print_tool_calls(message: ChatMessageContent) -> None: - # A helper method to pretty print the tool calls from the message. - # This is only triggered if auto invoke tool calls is disabled. 
- items = message.items - formatted_tool_calls = [] - for i, item in enumerate(items, start=1): - if isinstance(item, FunctionCallContent): - tool_call_id = item.id - function_name = item.name - function_arguments = item.arguments - formatted_str = ( - f"tool_call {i} id: {tool_call_id}\n" - f"tool_call {i} function name: {function_name}\n" - f"tool_call {i} arguments: {function_arguments}" - ) - formatted_tool_calls.append(formatted_str) - if len(formatted_tool_calls) > 0: - print("Tool calls:\n" + "\n\n".join(formatted_tool_calls)) - else: - print("The model used its own knowledge and didn't return any tool calls.") - - -async def handle_streaming( - kernel: Kernel, - chat_function: "KernelFunction", - arguments: KernelArguments, -) -> str | None: - response = kernel.invoke_stream( - chat_function, - return_function_results=False, - arguments=arguments, - ) - - print("Mosscap:> ", end="") - streamed_chunks: list[StreamingChatMessageContent] = [] - result_content = [] - async for message in response: - if ( - not execution_settings.function_choice_behavior.auto_invoke_kernel_functions - and isinstance(message[0], StreamingChatMessageContent) - and message[0].role == AuthorRole.ASSISTANT - ): - streamed_chunks.append(message[0]) - elif isinstance(message[0], StreamingChatMessageContent) and message[0].role == AuthorRole.ASSISTANT: - result_content.append(message[0]) - print(str(message[0]), end="") - - if streamed_chunks: - streaming_chat_message = reduce(lambda first, second: first + second, streamed_chunks) - if hasattr(streaming_chat_message, "content"): - print(streaming_chat_message.content) - print("Auto tool calls is disabled, printing returned tool calls...") - print_tool_calls(streaming_chat_message) - - print("\n") - if result_content: - return "".join([str(content) for content in result_content]) - return None - - -async def chat() -> bool: - try: - user_input = input("User:> ") - except KeyboardInterrupt: - print("\n\nExiting chat...") - return False - except EOFError: - print("\n\nExiting chat...") - return False - - if user_input == "exit": - print("\n\nExiting chat...") - return False - arguments["user_input"] = user_input - arguments["chat_history"] = history - - if stream: - result = await handle_streaming(kernel, chat_function, arguments=arguments) - else: - result = await kernel.invoke(chat_function, arguments=arguments) - - # If tools are used, and auto invoke tool calls is False, the response will be of type - # ChatMessageContent with information about the tool calls, which need to be sent - # back to the model to get the final response. - function_calls = [item for item in result.value[-1].items if isinstance(item, FunctionCallContent)] - if not execution_settings.function_choice_behavior.auto_invoke_kernel_functions and len(function_calls) > 0: - print_tool_calls(result.value[0]) - return True - - print(f"Mosscap:> {result}") - - history.add_user_message(user_input) - history.add_assistant_message(str(result)) - return True - - -async def main() -> None: - chatting = True - print( - "Welcome to the chat bot!\ - \n Type 'exit' to exit.\ - \n Try a math question to see the function calling in action (i.e. what is 3+3?)." 
- ) - while chatting: - chatting = await chat() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/python/semantic_kernel/connectors/ai/anthropic/services/anthropic_chat_completion.py b/python/semantic_kernel/connectors/ai/anthropic/services/anthropic_chat_completion.py index ed2616ba71aa..f8490edba2cd 100644 --- a/python/semantic_kernel/connectors/ai/anthropic/services/anthropic_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/anthropic/services/anthropic_chat_completion.py @@ -154,6 +154,7 @@ async def _inner_get_streaming_chat_message_contents( self, chat_history: "ChatHistory", settings: "PromptExecutionSettings", + function_invoke_attempt: int = 0, ) -> AsyncGenerator[list["StreamingChatMessageContent"], Any]: if not isinstance(settings, AnthropicChatPromptExecutionSettings): settings = self.get_prompt_execution_settings_from_settings(settings) @@ -164,7 +165,7 @@ async def _inner_get_streaming_chat_message_contents( if settings.system is None and parsed_system_message is not None: settings.system = parsed_system_message - response = self._send_chat_stream_request(settings) + response = self._send_chat_stream_request(settings, function_invoke_attempt) if not isinstance(response, AsyncGenerator): raise ServiceInvalidResponseError("Expected an AsyncGenerator response.") @@ -242,6 +243,7 @@ def _create_streaming_chat_message_content( self, stream_event: TextEvent | ContentBlockStopEvent | RawMessageDeltaEvent, metadata: dict[str, Any] = {}, + function_invoke_attempt: int = 0, ) -> StreamingChatMessageContent: """Create a streaming chat message content object from a content block.""" items: list[STREAMING_ITEM_TYPES] = [] @@ -275,6 +277,7 @@ def _create_streaming_chat_message_content( role=AuthorRole.ASSISTANT, finish_reason=finish_reason, items=items, + function_invoke_attempt=function_invoke_attempt, ) def update_settings_from_function_call_configuration_anthropic( @@ -338,7 +341,9 @@ async def _send_chat_request(self, settings: AnthropicChatPromptExecutionSetting return [self._create_chat_message_content(response, response_metadata)] async def _send_chat_stream_request( - self, settings: AnthropicChatPromptExecutionSettings + self, + settings: AnthropicChatPromptExecutionSettings, + function_invoke_attempt: int = 0, ) -> AsyncGenerator[list["StreamingChatMessageContent"], None]: """Send the chat stream request. 
@@ -359,7 +364,9 @@ async def _send_chat_stream_request( isinstance(stream_event, ContentBlockStopEvent) and stream_event.content_block.type == "tool_use" ): - yield [self._create_streaming_chat_message_content(stream_event, metadata)] + yield [ + self._create_streaming_chat_message_content(stream_event, metadata, function_invoke_attempt) + ] except Exception as ex: raise ServiceResponseException( f"{type(self)} service failed to complete the request", diff --git a/python/semantic_kernel/connectors/ai/azure_ai_inference/services/azure_ai_inference_chat_completion.py b/python/semantic_kernel/connectors/ai/azure_ai_inference/services/azure_ai_inference_chat_completion.py index e48268d223bb..8ac10561f142 100644 --- a/python/semantic_kernel/connectors/ai/azure_ai_inference/services/azure_ai_inference_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/azure_ai_inference/services/azure_ai_inference_chat_completion.py @@ -138,6 +138,7 @@ async def _inner_get_streaming_chat_message_contents( self, chat_history: "ChatHistory", settings: "PromptExecutionSettings", + function_invoke_attempt: int = 0, ) -> AsyncGenerator[list["StreamingChatMessageContent"], Any]: if not isinstance(settings, AzureAIInferenceChatPromptExecutionSettings): settings = self.get_prompt_execution_settings_from_settings(settings) @@ -157,7 +158,8 @@ async def _inner_get_streaming_chat_message_contents( continue chunk_metadata = self._get_metadata_from_response(chunk) yield [ - self._create_streaming_chat_message_content(chunk, choice, chunk_metadata) for choice in chunk.choices + self._create_streaming_chat_message_content(chunk, choice, chunk_metadata, function_invoke_attempt) + for choice in chunk.choices ] @override @@ -255,6 +257,7 @@ def _create_streaming_chat_message_content( chunk: AsyncStreamingChatCompletions, choice: StreamingChatChoiceUpdate, metadata: dict[str, Any], + function_invoke_attempt: int, ) -> StreamingChatMessageContent: """Create a streaming chat message content object. @@ -262,6 +265,7 @@ def _create_streaming_chat_message_content( chunk: The chunk from the response. choice: The choice from the response. metadata: The metadata from the response. + function_invoke_attempt: The function invoke attempt. Returns: A streaming chat message content object. 
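# Taken together, these connector changes mean every streamed chunk reports which automatic
# function-invocation round produced it. A minimal, generic sketch of how a caller might bucket
# chunks by that index (the helper name and the argument objects are illustrative, not part of
# this change):

from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent


async def collect_chunks_by_attempt(service, chat_history, settings, kernel):
    """Group streamed chunks by the automatic function-invocation round that produced them."""
    by_attempt: dict[int, list[StreamingChatMessageContent]] = {}
    async for messages in service.get_streaming_chat_message_contents(
        chat_history=chat_history, settings=settings, kernel=kernel
    ):
        for msg in messages:
            by_attempt.setdefault(msg.function_invoke_attempt or 0, []).append(msg)
    return by_attempt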
@@ -295,6 +299,7 @@ def _create_streaming_chat_message_content( inner_content=chunk, finish_reason=FinishReason(choice.finish_reason) if choice.finish_reason else None, metadata=metadata, + function_invoke_attempt=function_invoke_attempt, ) # endregion diff --git a/python/semantic_kernel/connectors/ai/bedrock/services/bedrock_chat_completion.py b/python/semantic_kernel/connectors/ai/bedrock/services/bedrock_chat_completion.py index 8827b310ac0d..c163b6ffda74 100644 --- a/python/semantic_kernel/connectors/ai/bedrock/services/bedrock_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/bedrock/services/bedrock_chat_completion.py @@ -128,6 +128,7 @@ async def _inner_get_streaming_chat_message_contents( self, chat_history: "ChatHistory", settings: "PromptExecutionSettings", + function_invoke_attempt: int = 0, ) -> AsyncGenerator[list["StreamingChatMessageContent"], Any]: # Not all models support streaming: check if the model supports streaming before proceeding model_info = await self.get_foundation_model_info(self.ai_model_id) @@ -146,7 +147,7 @@ async def _inner_get_streaming_chat_message_contents( elif "contentBlockStart" in event: yield [self._parse_content_block_start_event(event)] elif "contentBlockDelta" in event: - yield [self._parse_content_block_delta_event(event)] + yield [self._parse_content_block_delta_event(event, function_invoke_attempt)] elif "contentBlockStop" in event: continue elif "messageStop" in event: @@ -338,7 +339,9 @@ def _parse_content_block_start_event(self, event: dict[str, Any]) -> StreamingCh inner_content=event, ) - def _parse_content_block_delta_event(self, event: dict[str, Any]) -> StreamingChatMessageContent: + def _parse_content_block_delta_event( + self, event: dict[str, Any], function_invoke_attempt: int + ) -> StreamingChatMessageContent: """Parse the content block delta event. The content block delta event contains the completion. @@ -363,6 +366,7 @@ def _parse_content_block_delta_event(self, event: dict[str, Any]) -> StreamingCh items=items, choice_index=0, inner_content=event, + function_invoke_attempt=function_invoke_attempt, ) def _parse_message_stop_event(self, event: dict[str, Any]) -> StreamingChatMessageContent: diff --git a/python/semantic_kernel/connectors/ai/chat_completion_client_base.py b/python/semantic_kernel/connectors/ai/chat_completion_client_base.py index 6a673dccd5eb..de9edf36c268 100644 --- a/python/semantic_kernel/connectors/ai/chat_completion_client_base.py +++ b/python/semantic_kernel/connectors/ai/chat_completion_client_base.py @@ -64,15 +64,17 @@ async def _inner_get_streaming_chat_message_contents( self, chat_history: "ChatHistory", settings: "PromptExecutionSettings", + function_invoke_attempt: int = 0, ) -> AsyncGenerator[list["StreamingChatMessageContent"], Any]: """Send a streaming chat request to the AI service. Args: - chat_history (ChatHistory): The chat history to send. - settings (PromptExecutionSettings): The settings for the request. + chat_history: The chat history to send. + settings: The settings for the request. + function_invoke_attempt: The current attempt count for automatically invoking functions. Yields: - streaming_chat_message_contents (list[StreamingChatMessageContent]): The streaming chat message contents. + streaming_chat_message_contents: The streaming chat message contents. 
""" raise NotImplementedError("The _inner_get_streaming_chat_message_contents method is not implemented.") # Below is needed for mypy: https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators @@ -268,7 +270,9 @@ async def get_streaming_chat_message_contents( # Hold the messages, if there are more than one response, it will not be used, so we flatten all_messages: list["StreamingChatMessageContent"] = [] function_call_returned = False - async for messages in self._inner_get_streaming_chat_message_contents(chat_history, settings): + async for messages in self._inner_get_streaming_chat_message_contents( + chat_history, settings, request_index + ): for msg in messages: if msg is not None: all_messages.append(msg) @@ -313,6 +317,7 @@ async def get_streaming_chat_message_contents( function_result_messages = merge_streaming_function_results( messages=chat_history.messages[-len(results) :], ai_model_id=ai_model_id, # type: ignore + function_invoke_attempt=request_index, ) if self._yield_function_result_messages(function_result_messages): yield function_result_messages diff --git a/python/semantic_kernel/connectors/ai/function_calling_utils.py b/python/semantic_kernel/connectors/ai/function_calling_utils.py index 365d43565ed9..c7ab3dba6b39 100644 --- a/python/semantic_kernel/connectors/ai/function_calling_utils.py +++ b/python/semantic_kernel/connectors/ai/function_calling_utils.py @@ -101,6 +101,7 @@ def merge_function_results( def merge_streaming_function_results( messages: list[ChatMessageContent | StreamingChatMessageContent], ai_model_id: str, + function_invoke_attempt: int, ) -> list[StreamingChatMessageContent]: """Combine multiple streaming function result content types to one streaming chat message content type. @@ -110,6 +111,7 @@ def merge_streaming_function_results( Args: messages: The list of streaming chat message content types. ai_model_id: The AI model ID. + function_invoke_attempt: The function invoke attempt. Returns: The combined streaming chat message content type. 
@@ -118,4 +120,12 @@ def merge_streaming_function_results( for message in messages: items.extend([item for item in message.items if isinstance(item, FunctionResultContent)]) - return [StreamingChatMessageContent(role=AuthorRole.TOOL, items=items, choice_index=0, ai_model_id=ai_model_id)] + return [ + StreamingChatMessageContent( + role=AuthorRole.TOOL, + items=items, + choice_index=0, + ai_model_id=ai_model_id, + function_invoke_attempt=function_invoke_attempt, + ) + ] diff --git a/python/semantic_kernel/connectors/ai/google/google_ai/services/google_ai_chat_completion.py b/python/semantic_kernel/connectors/ai/google/google_ai/services/google_ai_chat_completion.py index ca4ca998a122..df8f64cf4c6c 100644 --- a/python/semantic_kernel/connectors/ai/google/google_ai/services/google_ai_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/google/google_ai/services/google_ai_chat_completion.py @@ -147,6 +147,7 @@ async def _inner_get_streaming_chat_message_contents( self, chat_history: "ChatHistory", settings: "PromptExecutionSettings", + function_invoke_attempt: int = 0, ) -> AsyncGenerator[list["StreamingChatMessageContent"], Any]: if not isinstance(settings, GoogleAIChatPromptExecutionSettings): settings = self.get_prompt_execution_settings_from_settings(settings) @@ -167,7 +168,10 @@ async def _inner_get_streaming_chat_message_contents( ) async for chunk in response: - yield [self._create_streaming_chat_message_content(chunk, candidate) for candidate in chunk.candidates] + yield [ + self._create_streaming_chat_message_content(chunk, candidate, function_invoke_attempt) + for candidate in chunk.candidates + ] @override def _verify_function_choice_settings(self, settings: "PromptExecutionSettings") -> None: @@ -268,12 +272,14 @@ def _create_streaming_chat_message_content( self, chunk: GenerateContentResponse, candidate: Candidate, + function_invoke_attempt: int = 0, ) -> StreamingChatMessageContent: """Create a streaming chat message content object. Args: chunk: The response from the service. candidate: The candidate from the response. + function_invoke_attempt: The function invoke attempt. Returns: A streaming chat message content object. 
@@ -313,6 +319,7 @@ def _create_streaming_chat_message_content( inner_content=chunk, finish_reason=finish_reason, metadata=response_metadata, + function_invoke_attempt=function_invoke_attempt, ) # endregion diff --git a/python/semantic_kernel/connectors/ai/google/vertex_ai/services/vertex_ai_chat_completion.py b/python/semantic_kernel/connectors/ai/google/vertex_ai/services/vertex_ai_chat_completion.py index 45d66396ff34..6372c71c5b1c 100644 --- a/python/semantic_kernel/connectors/ai/google/vertex_ai/services/vertex_ai_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/google/vertex_ai/services/vertex_ai_chat_completion.py @@ -142,6 +142,7 @@ async def _inner_get_streaming_chat_message_contents( self, chat_history: "ChatHistory", settings: "PromptExecutionSettings", + function_invoke_attempt: int = 0, ) -> AsyncGenerator[list["StreamingChatMessageContent"], Any]: if not isinstance(settings, VertexAIChatPromptExecutionSettings): settings = self.get_prompt_execution_settings_from_settings(settings) @@ -162,7 +163,10 @@ async def _inner_get_streaming_chat_message_contents( ) async for chunk in response: - yield [self._create_streaming_chat_message_content(chunk, candidate) for candidate in chunk.candidates] + yield [ + self._create_streaming_chat_message_content(chunk, candidate, function_invoke_attempt) + for candidate in chunk.candidates + ] @override def _verify_function_choice_settings(self, settings: "PromptExecutionSettings") -> None: @@ -262,12 +266,14 @@ def _create_streaming_chat_message_content( self, chunk: GenerationResponse, candidate: Candidate, + function_invoke_attempt: int, ) -> StreamingChatMessageContent: """Create a streaming chat message content object. Args: chunk: The response from the service. candidate: The candidate from the response. + function_invoke_attempt: The function invoke attempt. Returns: A streaming chat message content object. 
@@ -308,6 +314,7 @@ def _create_streaming_chat_message_content( inner_content=chunk, finish_reason=finish_reason, metadata=response_metadata, + function_invoke_attempt=function_invoke_attempt, ) # endregion diff --git a/python/semantic_kernel/connectors/ai/mistral_ai/services/mistral_ai_chat_completion.py b/python/semantic_kernel/connectors/ai/mistral_ai/services/mistral_ai_chat_completion.py index 46f0c9f64a2b..b374235225a4 100644 --- a/python/semantic_kernel/connectors/ai/mistral_ai/services/mistral_ai_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/mistral_ai/services/mistral_ai_chat_completion.py @@ -159,6 +159,7 @@ async def _inner_get_streaming_chat_message_contents( self, chat_history: "ChatHistory", settings: "PromptExecutionSettings", + function_invoke_attempt: int = 0, ) -> AsyncGenerator[list["StreamingChatMessageContent"], Any]: if not isinstance(settings, MistralAIChatPromptExecutionSettings): settings = self.get_prompt_execution_settings_from_settings(settings) @@ -182,7 +183,9 @@ async def _inner_get_streaming_chat_message_contents( continue chunk_metadata = self._get_metadata_from_response(chunk.data) yield [ - self._create_streaming_chat_message_content(chunk.data, choice, chunk_metadata) + self._create_streaming_chat_message_content( + chunk.data, choice, chunk_metadata, function_invoke_attempt + ) for choice in chunk.data.choices ] @@ -216,6 +219,7 @@ def _create_streaming_chat_message_content( chunk: CompletionChunk, choice: CompletionResponseStreamChoice, chunk_metadata: dict[str, Any], + function_invoke_attempt: int, ) -> StreamingChatMessageContent: """Create a streaming chat message content object from a choice.""" metadata = self._get_metadata_from_chat_choice(choice) @@ -234,6 +238,7 @@ def _create_streaming_chat_message_content( role=AuthorRole(choice.delta.role) if choice.delta.role else AuthorRole.ASSISTANT, finish_reason=FinishReason(choice.finish_reason) if choice.finish_reason else None, items=items, + function_invoke_attempt=function_invoke_attempt, ) def _get_metadata_from_response(self, response: ChatCompletionResponse | CompletionChunk) -> dict[str, Any]: diff --git a/python/semantic_kernel/connectors/ai/ollama/services/ollama_chat_completion.py b/python/semantic_kernel/connectors/ai/ollama/services/ollama_chat_completion.py index bfb452d9fc2d..baf2d04f2914 100644 --- a/python/semantic_kernel/connectors/ai/ollama/services/ollama_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/ollama/services/ollama_chat_completion.py @@ -180,6 +180,7 @@ async def _inner_get_streaming_chat_message_contents( self, chat_history: "ChatHistory", settings: "PromptExecutionSettings", + function_invoke_attempt: int = 0, ) -> AsyncGenerator[list["StreamingChatMessageContent"], Any]: if not isinstance(settings, OllamaChatPromptExecutionSettings): settings = self.get_prompt_execution_settings_from_settings(settings) @@ -202,10 +203,10 @@ async def _inner_get_streaming_chat_message_contents( async for part in response_object: if isinstance(part, ChatResponse): - yield [self._create_streaming_chat_message_content_from_chat_response(part)] + yield [self._create_streaming_chat_message_content_from_chat_response(part, function_invoke_attempt)] continue if isinstance(part, Mapping): - yield [self._create_streaming_chat_message_content(part)] + yield [self._create_streaming_chat_message_content(part, function_invoke_attempt)] continue raise ServiceInvalidResponseError( "Invalid response type from Ollama streaming chat completion. 
" @@ -215,7 +216,9 @@ async def _inner_get_streaming_chat_message_contents( # endregion def _create_streaming_chat_message_content_from_chat_response( - self, response: ChatResponse + self, + response: ChatResponse, + function_invoke_attempt: int, ) -> StreamingChatMessageContent: """Create a chat message content from the response.""" items: list[STREAMING_ITEM_TYPES] = [] @@ -235,6 +238,7 @@ def _create_streaming_chat_message_content_from_chat_response( inner_content=response, ai_model_id=self.ai_model_id, metadata=self._get_metadata_from_chat_response(response), + function_invoke_attempt=function_invoke_attempt, ) def _parse_tool_calls(self, tool_calls: Sequence[Message.ToolCall] | None, items: list[Any]): @@ -299,7 +303,9 @@ def _create_chat_message_content(self, response: Mapping[str, Any]) -> ChatMessa metadata=self._get_metadata_from_response(response), ) - def _create_streaming_chat_message_content(self, part: Mapping[str, Any]) -> StreamingChatMessageContent: + def _create_streaming_chat_message_content( + self, part: Mapping[str, Any], function_invoke_attempt: int + ) -> StreamingChatMessageContent: """Create a streaming chat message content from the response part.""" items: list[STREAMING_ITEM_TYPES] = [] if not (message := part.get("message", None)): @@ -331,6 +337,7 @@ def _create_streaming_chat_message_content(self, part: Mapping[str, Any]) -> Str inner_content=part, ai_model_id=self.ai_model_id, metadata=self._get_metadata_from_response(part), + function_invoke_attempt=function_invoke_attempt, ) def _get_metadata_from_response(self, response: Mapping[str, Any]) -> dict[str, Any]: diff --git a/python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai_chat_completion.py b/python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai_chat_completion.py index bb247cb55e43..28521975e366 100644 --- a/python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai_chat_completion.py @@ -109,6 +109,7 @@ async def _inner_get_streaming_chat_message_contents( self, chat_history: "ChatHistory", settings: "PromptExecutionSettings", + function_invoke_attempt: int = 0, ) -> AsyncGenerator[list["StreamingChatMessageContent"], Any]: """Create streaming chat message contents, in the number specified by the settings. @@ -116,6 +117,7 @@ async def _inner_get_streaming_chat_message_contents( chat_history : A list of chat chat_history, that can be rendered into a set of chat_history, from system, user, assistant and function. settings : Settings for the request. + function_invoke_attempt : The function invoke attempt. Yields: A stream representing the response(s) from the LLM. 
@@ -127,7 +129,7 @@ async def _inner_get_streaming_chat_message_contents( images = self._get_images_from_history(chat_history) async for chunk in self._generate_next_token_async(prompt, settings, images): yield [ - self._create_streaming_chat_message_content(choice_index, new_token) + self._create_streaming_chat_message_content(choice_index, new_token, function_invoke_attempt) for choice_index, new_token in enumerate(chunk) ] @@ -142,12 +144,15 @@ def _create_chat_message_content(self, choice: str) -> ChatMessageContent: ], ) - def _create_streaming_chat_message_content(self, choice_index: int, choice: str) -> StreamingChatMessageContent: + def _create_streaming_chat_message_content( + self, choice_index: int, choice: str, function_invoke_attempt: int + ) -> StreamingChatMessageContent: return StreamingChatMessageContent( role=AuthorRole.ASSISTANT, choice_index=choice_index, content=choice, ai_model_id=self.ai_model_id, + function_invoke_attempt=function_invoke_attempt, ) @override diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/azure_chat_completion.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_chat_completion.py index 73e1a8fe62b7..03289fd45d58 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/azure_chat_completion.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_chat_completion.py @@ -148,9 +148,10 @@ def _create_streaming_chat_message_content( chunk: ChatCompletionChunk, choice: ChunkChoice, chunk_metadata: dict[str, Any], + function_invoke_attempt: int = 0, ) -> "StreamingChatMessageContent": """Create an Azure streaming chat message content object from a choice.""" - content = super()._create_streaming_chat_message_content(chunk, choice, chunk_metadata) + content = super()._create_streaming_chat_message_content(chunk, choice, chunk_metadata, function_invoke_attempt) assert isinstance(content, StreamingChatMessageContent) and isinstance(choice, ChunkChoice) # nosec return self._add_tool_message_to_chat_message_content(content, choice) diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_chat_completion_base.py b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_chat_completion_base.py index ec918dee605d..0c1e843c5d47 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_chat_completion_base.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/open_ai_chat_completion_base.py @@ -96,6 +96,7 @@ async def _inner_get_streaming_chat_message_contents( self, chat_history: "ChatHistory", settings: "PromptExecutionSettings", + function_invoke_attempt: int = 0, ) -> AsyncGenerator[list["StreamingChatMessageContent"], Any]: if not isinstance(settings, OpenAIChatPromptExecutionSettings): settings = self.get_prompt_execution_settings_from_settings(settings) @@ -126,12 +127,13 @@ async def _inner_get_streaming_chat_message_contents( inner_content=chunk, ai_model_id=settings.ai_model_id, metadata=chunk_metadata, + function_invoke_attempt=function_invoke_attempt, ) for i in range(settings.number_of_responses or 1) ] else: yield [ - self._create_streaming_chat_message_content(chunk, choice, chunk_metadata) + self._create_streaming_chat_message_content(chunk, choice, chunk_metadata, function_invoke_attempt) for choice in chunk.choices ] @@ -190,6 +192,7 @@ def _create_streaming_chat_message_content( chunk: ChatCompletionChunk, choice: ChunkChoice, chunk_metadata: dict[str, Any], + function_invoke_attempt: int, ) -> StreamingChatMessageContent: """Create a streaming 
chat message content object from a choice.""" metadata = self._get_metadata_from_chat_choice(choice) @@ -207,6 +210,7 @@ def _create_streaming_chat_message_content( role=(AuthorRole(choice.delta.role) if choice.delta and choice.delta.role else AuthorRole.ASSISTANT), finish_reason=(FinishReason(choice.finish_reason) if choice.finish_reason else None), items=items, + function_invoke_attempt=function_invoke_attempt, ) def _get_metadata_from_chat_response(self, response: ChatCompletion) -> dict[str, Any]: diff --git a/python/semantic_kernel/contents/streaming_chat_message_content.py b/python/semantic_kernel/contents/streaming_chat_message_content.py index 51110b43ea5c..683b498d0c69 100644 --- a/python/semantic_kernel/contents/streaming_chat_message_content.py +++ b/python/semantic_kernel/contents/streaming_chat_message_content.py @@ -4,6 +4,8 @@ from typing import Any, Union, overload from xml.etree.ElementTree import Element # nosec +from pydantic import Field + from semantic_kernel.contents.chat_message_content import ChatMessageContent from semantic_kernel.contents.function_call_content import FunctionCallContent from semantic_kernel.contents.function_result_content import FunctionResultContent @@ -51,6 +53,12 @@ class StreamingChatMessageContent(ChatMessageContent, StreamingContentMixin): __add__: Combines two StreamingChatMessageContent instances. """ + function_invoke_attempt: int | None = Field( + default=0, + description="Tracks the current attempt count for automatically invoking functions. " + "This value increments with each subsequent automatic invocation attempt.", + ) + @overload def __init__( self, @@ -63,6 +71,7 @@ def __init__( finish_reason: FinishReason | None = None, ai_model_id: str | None = None, metadata: dict[str, Any] | None = None, + function_invoke_attempt: int | None = None, ) -> None: ... @overload @@ -77,6 +86,7 @@ def __init__( finish_reason: FinishReason | None = None, ai_model_id: str | None = None, metadata: dict[str, Any] | None = None, + function_invoke_attempt: int | None = None, ) -> None: ... def __init__( # type: ignore @@ -91,26 +101,30 @@ def __init__( # type: ignore finish_reason: FinishReason | None = None, ai_model_id: str | None = None, metadata: dict[str, Any] | None = None, + function_invoke_attempt: int | None = None, ): """Create a new instance of StreamingChatMessageContent. Args: - role: ChatRole - The role of the chat message. - choice_index: int - The index of the choice that generated this response. - items: list[TextContent, FunctionCallContent, FunctionResultContent, ImageContent] - The content. - content: str - The text of the response. - inner_content: Optional[Any] - The inner content of the response, + role: The role of the chat message. + choice_index: The index of the choice that generated this response. + items: The content. + content: The text of the response. + inner_content: The inner content of the response, this should hold all the information from the response so even when not creating a subclass a developer can leverage the full thing. - name: Optional[str] - The name of the response. - encoding: Optional[str] - The encoding of the text. - finish_reason: Optional[FinishReason] - The reason the response was finished. - metadata: Dict[str, Any] - Any metadata that should be attached to the response. - ai_model_id: Optional[str] - The id of the AI model that generated this response. + name: The name of the response. + encoding: The encoding of the text. + finish_reason: The reason the response was finished. 
+ metadata: Any metadata that should be attached to the response. + ai_model_id: The id of the AI model that generated this response. + function_invoke_attempt: Tracks the current attempt count for automatically + invoking functions. This value increments with each subsequent automatic invocation attempt. """ kwargs: dict[str, Any] = { "role": role, "choice_index": choice_index, + "function_invoke_attempt": function_invoke_attempt, } if encoding: kwargs["encoding"] = encoding @@ -180,6 +194,7 @@ def __add__(self, other: "StreamingChatMessageContent") -> "StreamingChatMessage metadata=self.metadata | other.metadata, encoding=self.encoding, finish_reason=self.finish_reason or other.finish_reason, + function_invoke_attempt=self.function_invoke_attempt, ) def to_element(self) -> "Element": @@ -214,5 +229,6 @@ def __hash__(self) -> int: self.encoding, self.finish_reason, self.choice_index, + self.function_invoke_attempt, *self.items, )) diff --git a/python/tests/samples/test_concepts.py b/python/tests/samples/test_concepts.py index bf3ff42ede2c..6e8d4ad1a9e0 100644 --- a/python/tests/samples/test_concepts.py +++ b/python/tests/samples/test_concepts.py @@ -8,7 +8,9 @@ import pytest from pytest import mark, param -from samples.concepts.auto_function_calling.chat_gpt_api_function_calling import main as chat_gpt_api_function_calling +from samples.concepts.auto_function_calling.chat_completion_with_function_calling import ( + main as chat_completion_with_function_calling, +) from samples.concepts.auto_function_calling.functions_defined_in_json_prompt import ( main as function_defined_in_json_prompt, ) @@ -106,9 +108,9 @@ ), ), param( - chat_gpt_api_function_calling, + chat_completion_with_function_calling, ["What is 3+3?", "exit"], - id="chat_gpt_api_function_calling", + id="chat_completion_with_function_calling", marks=pytest.mark.skipif( os.getenv(COMPLETIONS_CONCEPT_SAMPLE, None) is None, reason="Not running completion samples." ), From ac5a1057a4b9ddab29bb22a06961a9fa8bc1bdc4 Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Thu, 19 Dec 2024 08:47:21 +0900 Subject: [PATCH 2/7] Break the auto function calling samples into two: streaming and non-streaming --- .../chat_completion_with_function_calling.py | 141 ++-------- ...pletion_with_function_calling_streaming.py | 244 ++++++++++++++++++ 2 files changed, 269 insertions(+), 116 deletions(-) create mode 100644 python/samples/concepts/auto_function_calling/chat_completion_with_function_calling_streaming.py diff --git a/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling.py b/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling.py index 445106e222cb..92d92f17db49 100644 --- a/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling.py +++ b/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft. All rights reserved. 
import asyncio -from functools import reduce from typing import TYPE_CHECKING from samples.concepts.setup.chat_completion_services import Services, get_chat_completion_service_and_request_settings @@ -10,14 +9,21 @@ from semantic_kernel.contents import ChatHistory from semantic_kernel.contents.chat_message_content import ChatMessageContent from semantic_kernel.contents.function_call_content import FunctionCallContent -from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent -from semantic_kernel.contents.utils.author_role import AuthorRole from semantic_kernel.core_plugins.math_plugin import MathPlugin from semantic_kernel.core_plugins.time_plugin import TimePlugin from semantic_kernel.functions import KernelArguments if TYPE_CHECKING: - from semantic_kernel.functions import KernelFunction + pass + +##################################################################### +# This sample demonstrates how to build a conversational chatbot # +# using Semantic Kernel, featuring dynamic function calling, # +# non-streaming responses, and support for math and time plugins. # +# The chatbot is designed to interact with the user, call functions # +# as needed, and return responses. If auto function calling is # +# disabled, then the tool calls will be printed to the console. # +##################################################################### # System message defining the behavior and persona of the chat bot. system_message = """ @@ -33,9 +39,6 @@ you will return a full answer to me as soon as possible. """ -# Toggle this flag to switch between streaming and non-streaming modes. -stream = True - # Create and configure the kernel. kernel = Kernel() @@ -62,7 +65,7 @@ # - Services.ONNX # - Services.VERTEX_AI # Please make sure you have configured your environment correctly for the selected chat completion service. -chat_completion_service, request_settings = get_chat_completion_service_and_request_settings(Services.AZURE_OPENAI) +chat_completion_service, request_settings = get_chat_completion_service_and_request_settings(Services.OPENAI) # Configure the function choice behavior. Here, we set it to Auto with auto_invoke=True. # - If `auto_invoke=True`, the model will automatically choose and call functions as needed. @@ -105,96 +108,6 @@ def print_tool_calls(message: ChatMessageContent) -> None: print("\n[No tool calls returned by the model]") -async def handle_streaming( - kernel: Kernel, - chat_function: "KernelFunction", - arguments: KernelArguments, -) -> str | None: - """ - Handle the streaming response from the model. - This function demonstrates two possible paths: - - 1. When auto function calling is ON (auto_invoke=True): - - The model may call tools automatically and produce a continuous - stream of assistant messages. We can simply print these as they come in. - - 2. When auto function calling is OFF (auto_invoke=False): - - The model may instead return tool call instructions embedded in the stream. - We can track these calls using `function_invoke_attempt` attributes and print - them for the user. The user can then manually invoke the tools and return the results - to the model for further completion. - """ - - response = kernel.invoke_stream( - chat_function, - return_function_results=False, - arguments=arguments, - ) - - # We will differentiate behavior based on whether auto invoking kernel functions is enabled. 
- auto_invoking = request_settings.function_choice_behavior.auto_invoke_kernel_functions - - print("Mosscap:> ", end="", flush=True) - - # If auto_invoking is False, the model may return separate streaming chunks containing tool instructions. - # We'll store them here. - streamed_tool_chunks: list[StreamingChatMessageContent] = [] - - # For content messages (the final assistant's response text), store them here. - streamed_response_chunks: list[StreamingChatMessageContent] = [] - - async for message in response: - msg = message[0] - - # We only expect assistant messages here. - if not isinstance(msg, StreamingChatMessageContent) or msg.role != AuthorRole.ASSISTANT: - continue - - if auto_invoking: - # When auto invocation is ON, no special handling is needed. Just print out messages as they arrive. - streamed_response_chunks.append(msg) - print(str(msg), end="", flush=True) - else: - # When auto invocation is OFF, the model may send chunks that represent tool calls. - # Chunks that contain function call instructions will have a function_invoke_attempt attribute. - if hasattr(msg, "function_invoke_attempt"): - # This chunk is part of a tool call instruction sequence - streamed_tool_chunks.append(msg) - else: - # This chunk is normal assistant response text - streamed_response_chunks.append(msg) - print(str(msg), end="", flush=True) - - print("\n", flush=True) - - # If auto function calling was OFF, handle any tool call instructions we captured. - if not auto_invoking and streamed_tool_chunks: - # Group streamed chunks by `function_invoke_attempt` to handle each invocation attempt separately. - grouped_chunks = {} - for chunk in streamed_tool_chunks: - key = getattr(chunk, "function_invoke_attempt", None) - if key is not None: - grouped_chunks.setdefault(key, []).append(chunk) - - # Process each group of chunks - for attempt, chunks in grouped_chunks.items(): - try: - # Combine all chunks for a given attempt into one message. - combined_content = reduce(lambda first, second: first + second, chunks) - if hasattr(combined_content, "content"): - print(f"[function_invoke_attempt {attempt} content]:\n{combined_content.content}") - - print("[Auto function calling is OFF] Here are the returned tool calls:") - print_tool_calls(combined_content) - except Exception as e: - print(f"Error processing chunks for function_invoke_attempt {attempt}: {e}") - - # Return the final concatenated assistant response (if any). - if streamed_response_chunks: - return "".join([str(content) for content in streamed_response_chunks]) - return None - - async def chat() -> bool: """ Continuously prompt the user for input and show the assistant's response. @@ -213,26 +126,22 @@ async def chat() -> bool: arguments["user_input"] = user_input arguments["chat_history"] = history - if stream: - # Handle streaming responses - result = await handle_streaming(kernel, chat_function, arguments=arguments) - else: - # Handle non-streaming responses - result = await kernel.invoke(chat_function, arguments=arguments) + # Handle non-streaming responses + result = await kernel.invoke(chat_function, arguments=arguments) - # If function calls are returned and auto invoking is off, we must show them. 
- if not request_settings.function_choice_behavior.auto_invoke_kernel_functions and result and result.value: - # Extract function calls from the returned content - function_calls = [item for item in result.value[-1].items if isinstance(item, FunctionCallContent)] - if len(function_calls) > 0: - print_tool_calls(result.value[0]) - # At this point, you'd handle these calls manually if desired. - # For now, we just print them. - return True + # If function calls are returned and auto invoking is off, we must show them. + if not request_settings.function_choice_behavior.auto_invoke_kernel_functions and result and result.value: + # Extract function calls from the returned content + function_calls = [item for item in result.value[-1].items if isinstance(item, FunctionCallContent)] + if len(function_calls) > 0: + print_tool_calls(result.value[0]) + # At this point, you'd handle these calls manually if desired. + # For now, we just print them. + return True - # If no function calls to handle, just print the assistant's response - if result: - print(f"Mosscap:> {result}") + # If no function calls to handle, just print the assistant's response + if result: + print(f"Mosscap:> {result}") # Update the chat history with the user's input and the assistant's response if result: diff --git a/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling_streaming.py b/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling_streaming.py new file mode 100644 index 000000000000..e8c6412263cd --- /dev/null +++ b/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling_streaming.py @@ -0,0 +1,244 @@ +# Copyright (c) Microsoft. All rights reserved. + +import asyncio +from functools import reduce +from typing import TYPE_CHECKING + +from samples.concepts.setup.chat_completion_services import Services, get_chat_completion_service_and_request_settings +from semantic_kernel import Kernel +from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior +from semantic_kernel.contents import ChatHistory +from semantic_kernel.contents.chat_message_content import ChatMessageContent +from semantic_kernel.contents.function_call_content import FunctionCallContent +from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent +from semantic_kernel.contents.utils.author_role import AuthorRole +from semantic_kernel.core_plugins.math_plugin import MathPlugin +from semantic_kernel.core_plugins.time_plugin import TimePlugin +from semantic_kernel.functions import KernelArguments + +if TYPE_CHECKING: + from semantic_kernel.functions import KernelFunction + +##################################################################### +# This sample demonstrates how to build a conversational chatbot # +# using Semantic Kernel, featuring dynamic function calling, # +# streaming responses, and support for math and time plugins. # +# The chatbot is designed to interact with the user, call functions # +# as needed, and return responses. If auto function calling is # +# disabled, then the tool calls will be printed to the console. # +##################################################################### + +# System message defining the behavior and persona of the chat bot. +system_message = """ +You are a chat bot. Your name is Mosscap and +you have one goal: figure out what people need. +Your full name, should you need to know it, is +Splendid Speckled Mosscap. 
You communicate +effectively, but you tend to answer with long +flowery prose. You are also a math wizard, +especially for adding and subtracting. +You also excel at joke telling, where your tone is often sarcastic. +Once you have the answer I am looking for, +you will return a full answer to me as soon as possible. +""" + +# Create and configure the kernel. +kernel = Kernel() + +# Load some sample plugins (for demonstration of function calling). +kernel.add_plugin(MathPlugin(), plugin_name="math") +kernel.add_plugin(TimePlugin(), plugin_name="time") + +# Define a chat function (a template for how to handle user input). +chat_function = kernel.add_function( + prompt="{{$chat_history}}{{$user_input}}", + plugin_name="ChatBot", + function_name="Chat", +) + +# You can select from the following chat completion services that support function calling: +# - Services.OPENAI +# - Services.AZURE_OPENAI +# - Services.AZURE_AI_INFERENCE +# - Services.ANTHROPIC +# - Services.BEDROCK +# - Services.GOOGLE_AI +# - Services.MISTRAL_AI +# - Services.OLLAMA +# - Services.ONNX +# - Services.VERTEX_AI +# Please make sure you have configured your environment correctly for the selected chat completion service. +chat_completion_service, request_settings = get_chat_completion_service_and_request_settings(Services.AZURE_OPENAI) + +# Configure the function choice behavior. Here, we set it to Auto with auto_invoke=True. +# - If `auto_invoke=True`, the model will automatically choose and call functions as needed. +# - If `auto_invoke=False`, the model may return tool call instructions that you must handle and call manually. +request_settings.function_choice_behavior = FunctionChoiceBehavior.Auto(auto_invoke=True) + +kernel.add_service(chat_completion_service) + +# Pass the request settings to the kernel arguments. +arguments = KernelArguments(settings=request_settings) + +# Create a chat history to store the system message, initial messages, and the conversation. +history = ChatHistory() +history.add_system_message(system_message) +history.add_user_message("Hi there, who are you?") +history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") + + +def print_tool_calls(message: ChatMessageContent) -> None: + """ + A helper function to pretty print the tool calls found in a ChatMessageContent message. + This is useful when auto tool invocation is disabled and the model returns calls that you must handle. + """ + items = message.items + formatted_tool_calls = [] + for i, item in enumerate(items, start=1): + if isinstance(item, FunctionCallContent): + tool_call_id = item.id + function_name = item.name + function_arguments = item.arguments + formatted_str = ( + f"tool_call {i} id: {tool_call_id}\n" + f"tool_call {i} function name: {function_name}\n" + f"tool_call {i} arguments: {function_arguments}" + ) + formatted_tool_calls.append(formatted_str) + if len(formatted_tool_calls) > 0: + print("\n[Tool calls returned by the model]:\n" + "\n\n".join(formatted_tool_calls)) + else: + print("\n[No tool calls returned by the model]") + + +async def handle_streaming( + kernel: Kernel, + chat_function: "KernelFunction", + arguments: KernelArguments, +) -> str | None: + """ + Handle the streaming response from the model. + This function demonstrates two possible paths: + + 1. When auto function calling is ON (auto_invoke=True): + - The model may call tools automatically and produce a continuous + stream of assistant messages. We can simply print these as they come in. + + 2. 
When auto function calling is OFF (auto_invoke=False): + - The model may instead return tool call instructions embedded in the stream. + We can track these calls using `function_invoke_attempt` attributes and print + them for the user. The user can then manually invoke the tools and return the results + to the model for further completion. + """ + + response = kernel.invoke_stream( + chat_function, + return_function_results=False, + arguments=arguments, + ) + + # We will differentiate behavior based on whether auto invoking kernel functions is enabled. + auto_invoking = request_settings.function_choice_behavior.auto_invoke_kernel_functions + + print("Mosscap:> ", end="", flush=True) + + # If auto_invoking is False, the model may return separate streaming chunks containing tool instructions. + # We'll store them here. + streamed_tool_chunks: list[StreamingChatMessageContent] = [] + + # For content messages (the final assistant's response text), store them here. + streamed_response_chunks: list[StreamingChatMessageContent] = [] + + async for message in response: + msg = message[0] + + # We only expect assistant messages here. + if not isinstance(msg, StreamingChatMessageContent) or msg.role != AuthorRole.ASSISTANT: + continue + + if auto_invoking: + # When auto invocation is ON, no special handling is needed. Just print out messages as they arrive. + streamed_response_chunks.append(msg) + print(str(msg), end="", flush=True) + else: + # When auto invocation is OFF, the model may send chunks that represent tool calls. + # Chunks that contain function call instructions will have a function_invoke_attempt attribute. + if hasattr(msg, "function_invoke_attempt"): + # This chunk is part of a tool call instruction sequence + streamed_tool_chunks.append(msg) + else: + # This chunk is normal assistant response text + streamed_response_chunks.append(msg) + print(str(msg), end="", flush=True) + + print("\n", flush=True) + + # If auto function calling was OFF, handle any tool call instructions we captured. + if not auto_invoking and streamed_tool_chunks: + # Group streamed chunks by `function_invoke_attempt` to handle each invocation attempt separately. + grouped_chunks = {} + for chunk in streamed_tool_chunks: + key = getattr(chunk, "function_invoke_attempt", None) + if key is not None: + grouped_chunks.setdefault(key, []).append(chunk) + + # Process each group of chunks + for attempt, chunks in grouped_chunks.items(): + try: + # Combine all chunks for a given attempt into one message. + combined_content = reduce(lambda first, second: first + second, chunks) + if hasattr(combined_content, "content"): + print(f"[function_invoke_attempt {attempt} content]:\n{combined_content.content}") + + print("[Auto function calling is OFF] Here are the returned tool calls:") + print_tool_calls(combined_content) + except Exception as e: + print(f"Error processing chunks for function_invoke_attempt {attempt}: {e}") + + # Return the final concatenated assistant response (if any). + if streamed_response_chunks: + return "".join([str(content) for content in streamed_response_chunks]) + return None + + +async def chat() -> bool: + """ + Continuously prompt the user for input and show the assistant's response. + Type 'exit' to exit. 
+ """ + try: + user_input = input("User:> ") + except (KeyboardInterrupt, EOFError): + print("\n\nExiting chat...") + return False + + if user_input.lower().strip() == "exit": + print("\n\nExiting chat...") + return False + + arguments["user_input"] = user_input + arguments["chat_history"] = history + + result = await handle_streaming(kernel, chat_function, arguments=arguments) + + # Update the chat history with the user's input and the assistant's response + if result: + history.add_user_message(user_input) + history.add_assistant_message(str(result)) + + return True + + +async def main() -> None: + print( + "Welcome to the chat bot!\n" + " Type 'exit' to exit.\n" + " Try a math question to see function calling in action (e.g. 'what is 3+3?')." + ) + chatting = True + while chatting: + chatting = await chat() + + +if __name__ == "__main__": + asyncio.run(main()) From 8464649f4375d60e2a606bfb13b9d2f72578a484 Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Thu, 19 Dec 2024 10:01:16 +0900 Subject: [PATCH 3/7] Break samples down further - streaming and non streaming auto invoke and manual invoke --- ...t_completion_with_auto_function_calling.py | 125 +++++++++++++ ...on_with_auto_function_calling_streaming.py | 169 ++++++++++++++++++ ...ompletion_with_manual_function_calling.py} | 14 +- ...with_manual_function_calling_streaming.py} | 32 ++-- 4 files changed, 313 insertions(+), 27 deletions(-) create mode 100644 python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling.py create mode 100644 python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling_streaming.py rename python/samples/concepts/auto_function_calling/{chat_completion_with_function_calling.py => chat_completion_with_manual_function_calling.py} (93%) rename python/samples/concepts/auto_function_calling/{chat_completion_with_function_calling_streaming.py => chat_completion_with_manual_function_calling_streaming.py} (88%) diff --git a/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling.py b/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling.py new file mode 100644 index 000000000000..2dad6517e31c --- /dev/null +++ b/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling.py @@ -0,0 +1,125 @@ +# Copyright (c) Microsoft. All rights reserved. + +import asyncio +from typing import TYPE_CHECKING + +from samples.concepts.setup.chat_completion_services import Services, get_chat_completion_service_and_request_settings +from semantic_kernel import Kernel +from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior +from semantic_kernel.contents import ChatHistory +from semantic_kernel.core_plugins.math_plugin import MathPlugin +from semantic_kernel.core_plugins.time_plugin import TimePlugin +from semantic_kernel.functions import KernelArguments + +if TYPE_CHECKING: + pass + +##################################################################### +# This sample demonstrates how to build a conversational chatbot # +# using Semantic Kernel, featuring auto function calling, # +# non-streaming responses, and support for math and time plugins. # +# The chatbot is designed to interact with the user, call functions # +# as needed, and return responses. # +##################################################################### + +# System message defining the behavior and persona of the chat bot. +system_message = """ +You are a chat bot. 
Your name is Mosscap and +you have one goal: figure out what people need. +Your full name, should you need to know it, is +Splendid Speckled Mosscap. You communicate +effectively, but you tend to answer with long +flowery prose. You are also a math wizard, +especially for adding and subtracting. +You also excel at joke telling, where your tone is often sarcastic. +Once you have the answer I am looking for, +you will return a full answer to me as soon as possible. +""" + +# Create and configure the kernel. +kernel = Kernel() + +# Load some sample plugins (for demonstration of function calling). +kernel.add_plugin(MathPlugin(), plugin_name="math") +kernel.add_plugin(TimePlugin(), plugin_name="time") + +# Define a chat function (a template for how to handle user input). +chat_function = kernel.add_function( + prompt="{{$chat_history}}{{$user_input}}", + plugin_name="ChatBot", + function_name="Chat", +) + +# You can select from the following chat completion services that support function calling: +# - Services.OPENAI +# - Services.AZURE_OPENAI +# - Services.AZURE_AI_INFERENCE +# - Services.ANTHROPIC +# - Services.BEDROCK +# - Services.GOOGLE_AI +# - Services.MISTRAL_AI +# - Services.OLLAMA +# - Services.ONNX +# - Services.VERTEX_AI +# Please make sure you have configured your environment correctly for the selected chat completion service. +chat_completion_service, request_settings = get_chat_completion_service_and_request_settings(Services.AZURE_OPENAI) + +# Configure the function choice behavior. Here, we set it to Auto, where auto_invoke=True by default. +# With `auto_invoke=True`, the model will automatically choose and call functions as needed. +request_settings.function_choice_behavior = FunctionChoiceBehavior.Auto() + +kernel.add_service(chat_completion_service) + +# Pass the request settings to the kernel arguments. +arguments = KernelArguments(settings=request_settings) + +# Create a chat history to store the system message, initial messages, and the conversation. +history = ChatHistory() +history.add_system_message(system_message) +history.add_user_message("Hi there, who are you?") +history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") + + +async def chat() -> bool: + """ + Continuously prompt the user for input and show the assistant's response. + Type 'exit' to exit. + """ + try: + user_input = input("User:> ") + except (KeyboardInterrupt, EOFError): + print("\n\nExiting chat...") + return False + + if user_input.lower().strip() == "exit": + print("\n\nExiting chat...") + return False + + arguments["user_input"] = user_input + arguments["chat_history"] = history + + # Handle non-streaming responses + result = await kernel.invoke(chat_function, arguments=arguments) + + # Update the chat history with the user's input and the assistant's response + if result: + print(f"Mosscap:> {result}") + history.add_user_message(user_input) + history.add_assistant_message(str(result)) + + return True + + +async def main() -> None: + print( + "Welcome to the chat bot!\n" + " Type 'exit' to exit.\n" + " Try a math question to see function calling in action (e.g. 'what is 3+3?')." 
+ ) + chatting = True + while chatting: + chatting = await chat() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling_streaming.py b/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling_streaming.py new file mode 100644 index 000000000000..86435032ba4d --- /dev/null +++ b/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling_streaming.py @@ -0,0 +1,169 @@ +# Copyright (c) Microsoft. All rights reserved. + +import asyncio +from typing import TYPE_CHECKING + +from samples.concepts.setup.chat_completion_services import Services, get_chat_completion_service_and_request_settings +from semantic_kernel import Kernel +from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior +from semantic_kernel.contents import ChatHistory +from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent +from semantic_kernel.contents.utils.author_role import AuthorRole +from semantic_kernel.core_plugins.math_plugin import MathPlugin +from semantic_kernel.core_plugins.time_plugin import TimePlugin +from semantic_kernel.functions import KernelArguments + +if TYPE_CHECKING: + from semantic_kernel.functions import KernelFunction + +##################################################################### +# This sample demonstrates how to build a conversational chatbot # +# using Semantic Kernel, featuring auto function calling, # +# streaming responses, and support for math and time plugins. # +# The chatbot is designed to interact with the user, call functions # +# as needed, and return responses. # +##################################################################### + +# System message defining the behavior and persona of the chat bot. +system_message = """ +You are a chat bot. Your name is Mosscap and +you have one goal: figure out what people need. +Your full name, should you need to know it, is +Splendid Speckled Mosscap. You communicate +effectively, but you tend to answer with long +flowery prose. You are also a math wizard, +especially for adding and subtracting. +You also excel at joke telling, where your tone is often sarcastic. +Once you have the answer I am looking for, +you will return a full answer to me as soon as possible. +""" + +# Create and configure the kernel. +kernel = Kernel() + +# Load some sample plugins (for demonstration of function calling). +kernel.add_plugin(MathPlugin(), plugin_name="math") +kernel.add_plugin(TimePlugin(), plugin_name="time") + +# Define a chat function (a template for how to handle user input). +chat_function = kernel.add_function( + prompt="{{$chat_history}}{{$user_input}}", + plugin_name="ChatBot", + function_name="Chat", +) + +# You can select from the following chat completion services that support function calling: +# - Services.OPENAI +# - Services.AZURE_OPENAI +# - Services.AZURE_AI_INFERENCE +# - Services.ANTHROPIC +# - Services.BEDROCK +# - Services.GOOGLE_AI +# - Services.MISTRAL_AI +# - Services.OLLAMA +# - Services.ONNX +# - Services.VERTEX_AI +# Please make sure you have configured your environment correctly for the selected chat completion service. +chat_completion_service, request_settings = get_chat_completion_service_and_request_settings(Services.AZURE_OPENAI) + +# Configure the function choice behavior. Here, we set it to Auto, where auto_invoke=True by default. 
+# With `auto_invoke=True`, the model will automatically choose and call functions as needed. +request_settings.function_choice_behavior = FunctionChoiceBehavior.Auto() + +kernel.add_service(chat_completion_service) + +# Pass the request settings to the kernel arguments. +arguments = KernelArguments(settings=request_settings) + +# Create a chat history to store the system message, initial messages, and the conversation. +history = ChatHistory() +history.add_system_message(system_message) +history.add_user_message("Hi there, who are you?") +history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") + + +async def handle_streaming( + kernel: Kernel, + chat_function: "KernelFunction", + arguments: KernelArguments, +) -> str | None: + """ + Handle the streaming response from the model. + This function demonstrates two possible paths: + + When auto function calling is ON (auto_invoke=True): + - The model may call tools automatically and produce a continuous + stream of assistant messages. We can simply print these as they come in. + """ + + response = kernel.invoke_stream( + chat_function, + return_function_results=False, + arguments=arguments, + ) + + print("Mosscap:> ", end="", flush=True) + + # For content messages (the final assistant's response text), store them here. + streamed_response_chunks: list[StreamingChatMessageContent] = [] + + async for message in response: + msg = message[0] + + # We only expect assistant messages here. + if not isinstance(msg, StreamingChatMessageContent) or msg.role != AuthorRole.ASSISTANT: + continue + + # When auto invocation is ON, no special handling is needed. Just print out messages as they arrive. + streamed_response_chunks.append(msg) + print(str(msg), end="", flush=True) + + print("\n", flush=True) + + # Return the final concatenated assistant response (if any). + if streamed_response_chunks: + return "".join([str(content) for content in streamed_response_chunks]) + return None + + +async def chat() -> bool: + """ + Continuously prompt the user for input and show the assistant's response. + Type 'exit' to exit. + """ + try: + user_input = input("User:> ") + except (KeyboardInterrupt, EOFError): + print("\n\nExiting chat...") + return False + + if user_input.lower().strip() == "exit": + print("\n\nExiting chat...") + return False + + arguments["user_input"] = user_input + arguments["chat_history"] = history + + result = await handle_streaming(kernel, chat_function, arguments=arguments) + + # Update the chat history with the user's input and the assistant's response + if result: + history.add_user_message(user_input) + history.add_assistant_message(str(result)) + + return True + + +async def main() -> None: + print( + "Welcome to the chat bot!\n" + " Type 'exit' to exit.\n" + " Try a math question to see function calling in action (e.g. 'what is 3+3?')." 
+ ) + chatting = True + while chatting: + chatting = await chat() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling.py b/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling.py similarity index 93% rename from python/samples/concepts/auto_function_calling/chat_completion_with_function_calling.py rename to python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling.py index 92d92f17db49..978fa3ea24ec 100644 --- a/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling.py +++ b/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling.py @@ -18,11 +18,11 @@ ##################################################################### # This sample demonstrates how to build a conversational chatbot # -# using Semantic Kernel, featuring dynamic function calling, # +# using Semantic Kernel, featuring manual function calling, # # non-streaming responses, and support for math and time plugins. # # The chatbot is designed to interact with the user, call functions # -# as needed, and return responses. If auto function calling is # -# disabled, then the tool calls will be printed to the console. # +# as needed, and return responses. With auto function calling # +# disabled, the tool calls will be printed to the console. # ##################################################################### # System message defining the behavior and persona of the chat bot. @@ -67,10 +67,10 @@ # Please make sure you have configured your environment correctly for the selected chat completion service. chat_completion_service, request_settings = get_chat_completion_service_and_request_settings(Services.OPENAI) -# Configure the function choice behavior. Here, we set it to Auto with auto_invoke=True. -# - If `auto_invoke=True`, the model will automatically choose and call functions as needed. -# - If `auto_invoke=False`, the model may return tool call instructions that you must handle and call manually. -request_settings.function_choice_behavior = FunctionChoiceBehavior.Auto(auto_invoke=True) +# Configure the function choice behavior. Here, we set it to Auto, where auto_invoke=False. +# With `FunctionChoiceBehavior(auto_invoke=False)`, the model may return tool call instructions +# that you must handle and call manually. We will only print the tool calls in this sample. +request_settings.function_choice_behavior = FunctionChoiceBehavior.Auto(auto_invoke=False) kernel.add_service(chat_completion_service) diff --git a/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling_streaming.py b/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling_streaming.py similarity index 88% rename from python/samples/concepts/auto_function_calling/chat_completion_with_function_calling_streaming.py rename to python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling_streaming.py index e8c6412263cd..b749ff2a87a0 100644 --- a/python/samples/concepts/auto_function_calling/chat_completion_with_function_calling_streaming.py +++ b/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling_streaming.py @@ -70,10 +70,10 @@ # Please make sure you have configured your environment correctly for the selected chat completion service. 
chat_completion_service, request_settings = get_chat_completion_service_and_request_settings(Services.AZURE_OPENAI) -# Configure the function choice behavior. Here, we set it to Auto with auto_invoke=True. -# - If `auto_invoke=True`, the model will automatically choose and call functions as needed. -# - If `auto_invoke=False`, the model may return tool call instructions that you must handle and call manually. -request_settings.function_choice_behavior = FunctionChoiceBehavior.Auto(auto_invoke=True) +# Configure the function choice behavior. Here, we set it to Auto, where auto_invoke=False. +# With `FunctionChoiceBehavior(auto_invoke=False)`, the model may return tool call instructions +# that you must handle and call manually. We will only print the tool calls in this sample. +request_settings.function_choice_behavior = FunctionChoiceBehavior.Auto(auto_invoke=False) kernel.add_service(chat_completion_service) @@ -137,9 +137,6 @@ async def handle_streaming( arguments=arguments, ) - # We will differentiate behavior based on whether auto invoking kernel functions is enabled. - auto_invoking = request_settings.function_choice_behavior.auto_invoke_kernel_functions - print("Mosscap:> ", end="", flush=True) # If auto_invoking is False, the model may return separate streaming chunks containing tool instructions. @@ -156,25 +153,20 @@ async def handle_streaming( if not isinstance(msg, StreamingChatMessageContent) or msg.role != AuthorRole.ASSISTANT: continue - if auto_invoking: - # When auto invocation is ON, no special handling is needed. Just print out messages as they arrive. + # When auto invocation is OFF, the model may send chunks that represent tool calls. + # Chunks that contain function call instructions will have a function_invoke_attempt attribute. + if hasattr(msg, "function_invoke_attempt"): + # This chunk is part of a tool call instruction sequence + streamed_tool_chunks.append(msg) + else: + # This chunk is normal assistant response text streamed_response_chunks.append(msg) print(str(msg), end="", flush=True) - else: - # When auto invocation is OFF, the model may send chunks that represent tool calls. - # Chunks that contain function call instructions will have a function_invoke_attempt attribute. - if hasattr(msg, "function_invoke_attempt"): - # This chunk is part of a tool call instruction sequence - streamed_tool_chunks.append(msg) - else: - # This chunk is normal assistant response text - streamed_response_chunks.append(msg) - print(str(msg), end="", flush=True) print("\n", flush=True) # If auto function calling was OFF, handle any tool call instructions we captured. - if not auto_invoking and streamed_tool_chunks: + if streamed_tool_chunks: # Group streamed chunks by `function_invoke_attempt` to handle each invocation attempt separately. grouped_chunks = {} for chunk in streamed_tool_chunks: From e1e9e74567c28e0f4bda67d5d05297e377370af0 Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Thu, 19 Dec 2024 12:57:27 +0900 Subject: [PATCH 4/7] Model rebuild for openai plugin predicate context. 
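
The rebuild is needed, presumably, because OpenAIFunctionExecutionParameters forward-references
the OperationSelectionPredicateContext type, so pydantic leaves the model partially built until
that name can actually be imported and resolved. A minimal, self-contained sketch of the same
pydantic v2 pattern (the class and field names below are illustrative only, not taken from this
change):

    from pydantic import BaseModel

    class ExecutionParameters(BaseModel):
        # "PredicateContext" is only a string annotation at this point, so pydantic
        # defers resolution and marks the model as not fully defined.
        context: "PredicateContext | None" = None

    class PredicateContext(BaseModel):
        operation: str = ""

    # Once the referenced type exists in scope, model_rebuild() resolves the
    # annotation and the model becomes usable.
    ExecutionParameters.model_rebuild()
    print(ExecutionParameters(context=PredicateContext(operation="getSecret")))

The new test fixture follows the same order: import the context type first, then call
model_rebuild() on the parameters model.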
--- .../chat_completion_with_manual_function_calling.py | 2 +- python/tests/conftest.py | 10 ++++++++++ python/tests/samples/test_concepts.py | 2 +- .../unit/connectors/openapi_plugin/test_sk_openapi.py | 2 +- python/tests/unit/functions/test_kernel_plugins.py | 8 ++++---- python/tests/unit/kernel/test_kernel.py | 2 +- 6 files changed, 18 insertions(+), 8 deletions(-) diff --git a/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling.py b/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling.py index 978fa3ea24ec..0d91f410adff 100644 --- a/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling.py +++ b/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling.py @@ -65,7 +65,7 @@ # - Services.ONNX # - Services.VERTEX_AI # Please make sure you have configured your environment correctly for the selected chat completion service. -chat_completion_service, request_settings = get_chat_completion_service_and_request_settings(Services.OPENAI) +chat_completion_service, request_settings = get_chat_completion_service_and_request_settings(Services.AZURE_OPENAI) # Configure the function choice behavior. Here, we set it to Auto, where auto_invoke=False. # With `FunctionChoiceBehavior(auto_invoke=False)`, the model may return tool call instructions diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 697cce70712e..e6a01549f020 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -15,6 +15,9 @@ from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import ( OpenAIEmbeddingPromptExecutionSettings, ) +from semantic_kernel.connectors.openai_plugin.openai_function_execution_parameters import ( + OpenAIFunctionExecutionParameters, +) from semantic_kernel.data.record_definition.vector_store_model_decorator import vectorstoremodel from semantic_kernel.data.record_definition.vector_store_model_definition import VectorStoreRecordDefinition from semantic_kernel.data.record_definition.vector_store_record_fields import ( @@ -686,3 +689,10 @@ class DataModelClass(BaseModel): key: Annotated[str, VectorStoreRecordKeyField()] return DataModelClass + + +@fixture +def define_openai_predicate_context(): + from semantic_kernel.connectors.openapi_plugin import OperationSelectionPredicateContext # noqa: F401 + + OpenAIFunctionExecutionParameters.model_rebuild() diff --git a/python/tests/samples/test_concepts.py b/python/tests/samples/test_concepts.py index 6e8d4ad1a9e0..e108221d6217 100644 --- a/python/tests/samples/test_concepts.py +++ b/python/tests/samples/test_concepts.py @@ -8,7 +8,7 @@ import pytest from pytest import mark, param -from samples.concepts.auto_function_calling.chat_completion_with_function_calling import ( +from samples.concepts.auto_function_calling.chat_completion_with_auto_function_calling import ( main as chat_completion_with_function_calling, ) from samples.concepts.auto_function_calling.functions_defined_in_json_prompt import ( diff --git a/python/tests/unit/connectors/openapi_plugin/test_sk_openapi.py b/python/tests/unit/connectors/openapi_plugin/test_sk_openapi.py index 1d25486b5a86..0df4bc63b0f6 100644 --- a/python/tests/unit/connectors/openapi_plugin/test_sk_openapi.py +++ b/python/tests/unit/connectors/openapi_plugin/test_sk_openapi.py @@ -749,7 +749,7 @@ def predicate_callback(context): return runner, operations, exec_settings -def 
test_predicate_callback_applied(openapi_runner_with_predicate_callback): +def test_predicate_callback_applied(openapi_runner_with_predicate_callback, define_openai_predicate_context): _, operations, exec_settings = openapi_runner_with_predicate_callback skipped_operations = [] diff --git a/python/tests/unit/functions/test_kernel_plugins.py b/python/tests/unit/functions/test_kernel_plugins.py index fd9102f7a5c9..8e487e7022cd 100644 --- a/python/tests/unit/functions/test_kernel_plugins.py +++ b/python/tests/unit/functions/test_kernel_plugins.py @@ -498,7 +498,7 @@ def test_from_object_class(custom_plugin_class): @patch("semantic_kernel.connectors.openai_plugin.openai_utils.OpenAIUtils.parse_openai_manifest_for_openapi_spec_url") -async def test_from_openai_from_file(mock_parse_openai_manifest): +async def test_from_openai_from_file(mock_parse_openai_manifest, define_openai_predicate_context): openai_spec_file = os.path.join(os.path.dirname(__file__), "../../assets/test_plugins") with open(os.path.join(openai_spec_file, "TestOpenAIPlugin", "akv-openai.json")) as file: openai_spec = file.read() @@ -526,7 +526,7 @@ async def test_from_openai_from_file(mock_parse_openai_manifest): @patch("httpx.AsyncClient.get") @patch("semantic_kernel.connectors.openai_plugin.openai_utils.OpenAIUtils.parse_openai_manifest_for_openapi_spec_url") -async def test_from_openai_plugin_from_url(mock_parse_openai_manifest, mock_get): +async def test_from_openai_plugin_from_url(mock_parse_openai_manifest, mock_get, define_openai_predicate_context): openai_spec_file_path = os.path.join( os.path.dirname(__file__), "../../assets/test_plugins", "TestOpenAIPlugin", "akv-openai.json" ) @@ -561,12 +561,12 @@ async def test_from_openai_plugin_from_url(mock_parse_openai_manifest, mock_get) mock_get.assert_awaited_once_with(fake_plugin_url, headers={"User-Agent": HTTP_USER_AGENT}) -async def test_from_openai_fail(): +async def test_from_openai_fail(define_openai_predicate_context): with raises(PluginInitializationError): await KernelPlugin.from_openai(plugin_name="TestOpenAIPlugin") -async def test_from_openai_fail_json_parsing(): +async def test_from_openai_fail_json_parsing(define_openai_predicate_context): with raises(PluginInitializationError): await KernelPlugin.from_openai(plugin_name="TestOpenAIPlugin", plugin_str="test") diff --git a/python/tests/unit/kernel/test_kernel.py b/python/tests/unit/kernel/test_kernel.py index 808c69d4fc6e..ef935c030b57 100644 --- a/python/tests/unit/kernel/test_kernel.py +++ b/python/tests/unit/kernel/test_kernel.py @@ -589,7 +589,7 @@ def func2(arg1: str) -> str: @patch("semantic_kernel.connectors.openai_plugin.openai_utils.OpenAIUtils.parse_openai_manifest_for_openapi_spec_url") -async def test_add_plugin_from_openai(mock_parse_openai_manifest, kernel: Kernel): +async def test_add_plugin_from_openai(mock_parse_openai_manifest, kernel: Kernel, define_openai_predicate_context): base_folder = os.path.join(os.path.dirname(__file__), "../../assets/test_plugins") with open(os.path.join(base_folder, "TestOpenAIPlugin", "akv-openai.json")) as file: openai_spec = file.read() From b6f216ed25277dd197cd38791f219a64c791eb7f Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Thu, 19 Dec 2024 13:06:14 +0900 Subject: [PATCH 5/7] model rebuild for openapi tests --- .../unit/connectors/openapi_plugin/test_sk_openapi.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/tests/unit/connectors/openapi_plugin/test_sk_openapi.py b/python/tests/unit/connectors/openapi_plugin/test_sk_openapi.py index 
0df4bc63b0f6..094d57619c53 100644 --- a/python/tests/unit/connectors/openapi_plugin/test_sk_openapi.py +++ b/python/tests/unit/connectors/openapi_plugin/test_sk_openapi.py @@ -733,6 +733,10 @@ async def dummy_auth_callback(**kwargs): @pytest.fixture def openapi_runner_with_predicate_callback(): + from semantic_kernel.connectors.openapi_plugin import OperationSelectionPredicateContext # noqa: F401 + + OpenAPIFunctionExecutionParameters.model_rebuild() + # Define a dummy predicate callback def predicate_callback(context): # Skip operations with DELETE method or containing 'internal' in the path @@ -809,6 +813,10 @@ async def test_run_operation_with_error(mock_request, openapi_runner): def test_invalid_server_url_override(): + from semantic_kernel.connectors.openapi_plugin import OperationSelectionPredicateContext # noqa: F401 + + OpenAPIFunctionExecutionParameters.model_rebuild() + with pytest.raises(ValueError, match="Invalid server_url_override: invalid_url"): params = OpenAPIFunctionExecutionParameters(server_url_override="invalid_url") params.model_post_init(None) From 852d6d923e3483a59b0ba982e543d52bb129f7ac Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Thu, 19 Dec 2024 16:37:10 +0900 Subject: [PATCH 6/7] clean up conditional check --- .../chat_completion_with_manual_function_calling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling.py b/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling.py index 0d91f410adff..162c415c4a64 100644 --- a/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling.py +++ b/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling.py @@ -129,8 +129,8 @@ async def chat() -> bool: # Handle non-streaming responses result = await kernel.invoke(chat_function, arguments=arguments) - # If function calls are returned and auto invoking is off, we must show them. - if not request_settings.function_choice_behavior.auto_invoke_kernel_functions and result and result.value: + # If function calls are returned, we show them on the console. 
+ if result and result.value: # Extract function calls from the returned content function_calls = [item for item in result.value[-1].items if isinstance(item, FunctionCallContent)] if len(function_calls) > 0: From 087ff2678b185d9ef1697dce0d4e0ceda8653fa2 Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Thu, 19 Dec 2024 16:53:26 +0900 Subject: [PATCH 7/7] Clean up samples --- ...t_completion_with_auto_function_calling.py | 2 +- ...on_with_auto_function_calling_streaming.py | 112 +++------- ..._with_manual_function_calling_streaming.py | 205 +++++++----------- 3 files changed, 112 insertions(+), 207 deletions(-) diff --git a/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling.py b/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling.py index 2dad6517e31c..c74ebc322489 100644 --- a/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling.py +++ b/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling.py @@ -105,7 +105,7 @@ async def chat() -> bool: if result: print(f"Mosscap:> {result}") history.add_user_message(user_input) - history.add_assistant_message(str(result)) + history.add_message(result.value[0]) # Capture the full context of the response return True diff --git a/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling_streaming.py b/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling_streaming.py index 86435032ba4d..f7aa767ffa23 100644 --- a/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling_streaming.py +++ b/python/samples/concepts/auto_function_calling/chat_completion_with_auto_function_calling_streaming.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft. All rights reserved. import asyncio -from typing import TYPE_CHECKING from samples.concepts.setup.chat_completion_services import Services, get_chat_completion_service_and_request_settings from semantic_kernel import Kernel @@ -13,9 +12,6 @@ from semantic_kernel.core_plugins.time_plugin import TimePlugin from semantic_kernel.functions import KernelArguments -if TYPE_CHECKING: - from semantic_kernel.functions import KernelFunction - ##################################################################### # This sample demonstrates how to build a conversational chatbot # # using Semantic Kernel, featuring auto function calling, # @@ -66,8 +62,7 @@ # Please make sure you have configured your environment correctly for the selected chat completion service. chat_completion_service, request_settings = get_chat_completion_service_and_request_settings(Services.AZURE_OPENAI) -# Configure the function choice behavior. Here, we set it to Auto, where auto_invoke=True by default. -# With `auto_invoke=True`, the model will automatically choose and call functions as needed. +# Configure the function choice behavior. Here, we set it to Auto. request_settings.function_choice_behavior = FunctionChoiceBehavior.Auto() kernel.add_service(chat_completion_service) @@ -82,87 +77,52 @@ history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.") -async def handle_streaming( - kernel: Kernel, - chat_function: "KernelFunction", - arguments: KernelArguments, -) -> str | None: - """ - Handle the streaming response from the model. 
- This function demonstrates two possible paths: - - When auto function calling is ON (auto_invoke=True): - - The model may call tools automatically and produce a continuous - stream of assistant messages. We can simply print these as they come in. - """ - - response = kernel.invoke_stream( - chat_function, - return_function_results=False, - arguments=arguments, +async def main() -> None: + print( + "Welcome to the chat bot!\n" + " Type 'exit' to exit.\n" + " Try a math question to see function calling in action (e.g. 'what is 3+3?')." ) - print("Mosscap:> ", end="", flush=True) - - # For content messages (the final assistant's response text), store them here. - streamed_response_chunks: list[StreamingChatMessageContent] = [] - - async for message in response: - msg = message[0] - - # We only expect assistant messages here. - if not isinstance(msg, StreamingChatMessageContent) or msg.role != AuthorRole.ASSISTANT: - continue + while True: + try: + user_input = input("User:> ") + except (KeyboardInterrupt, EOFError): + print("\n\nExiting chat...") + break - # When auto invocation is ON, no special handling is needed. Just print out messages as they arrive. - streamed_response_chunks.append(msg) - print(str(msg), end="", flush=True) + if user_input.lower().strip() == "exit": + print("\n\nExiting chat...") + break - print("\n", flush=True) + arguments["user_input"] = user_input + arguments["chat_history"] = history - # Return the final concatenated assistant response (if any). - if streamed_response_chunks: - return "".join([str(content) for content in streamed_response_chunks]) - return None + # Directly handle streaming of the assistant's response here + print("Mosscap:> ", end="", flush=True) + streamed_response_chunks: list[StreamingChatMessageContent] = [] -async def chat() -> bool: - """ - Continuously prompt the user for input and show the assistant's response. - Type 'exit' to exit. - """ - try: - user_input = input("User:> ") - except (KeyboardInterrupt, EOFError): - print("\n\nExiting chat...") - return False + async for message in kernel.invoke_stream( + chat_function, + return_function_results=False, + arguments=arguments, + ): + msg = message[0] - if user_input.lower().strip() == "exit": - print("\n\nExiting chat...") - return False + # We only expect assistant messages here. + if not isinstance(msg, StreamingChatMessageContent) or msg.role != AuthorRole.ASSISTANT: + continue - arguments["user_input"] = user_input - arguments["chat_history"] = history + streamed_response_chunks.append(msg) + print(str(msg), end="", flush=True) - result = await handle_streaming(kernel, chat_function, arguments=arguments) + print("\n", flush=True) - # Update the chat history with the user's input and the assistant's response - if result: - history.add_user_message(user_input) - history.add_assistant_message(str(result)) - - return True - - -async def main() -> None: - print( - "Welcome to the chat bot!\n" - " Type 'exit' to exit.\n" - " Try a math question to see function calling in action (e.g. 'what is 3+3?')." 
- ) - chatting = True - while chatting: - chatting = await chat() + if streamed_response_chunks: + result = "".join([str(content) for content in streamed_response_chunks]) + history.add_user_message(user_input) + history.add_assistant_message(result) if __name__ == "__main__": diff --git a/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling_streaming.py b/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling_streaming.py index b749ff2a87a0..360c0d670f45 100644 --- a/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling_streaming.py +++ b/python/samples/concepts/auto_function_calling/chat_completion_with_manual_function_calling_streaming.py @@ -17,7 +17,7 @@ from semantic_kernel.functions import KernelArguments if TYPE_CHECKING: - from semantic_kernel.functions import KernelFunction + pass ##################################################################### # This sample demonstrates how to build a conversational chatbot # @@ -56,25 +56,12 @@ function_name="Chat", ) -# You can select from the following chat completion services that support function calling: -# - Services.OPENAI -# - Services.AZURE_OPENAI -# - Services.AZURE_AI_INFERENCE -# - Services.ANTHROPIC -# - Services.BEDROCK -# - Services.GOOGLE_AI -# - Services.MISTRAL_AI -# - Services.OLLAMA -# - Services.ONNX -# - Services.VERTEX_AI -# Please make sure you have configured your environment correctly for the selected chat completion service. +# Configure the chat completion service and request settings. chat_completion_service, request_settings = get_chat_completion_service_and_request_settings(Services.AZURE_OPENAI) -# Configure the function choice behavior. Here, we set it to Auto, where auto_invoke=False. -# With `FunctionChoiceBehavior(auto_invoke=False)`, the model may return tool call instructions -# that you must handle and call manually. We will only print the tool calls in this sample. +# Configure the function choice behavior to Auto with auto_invoke=False. +# This means the model may return tool calls that must be manually handled. request_settings.function_choice_behavior = FunctionChoiceBehavior.Auto(auto_invoke=False) - kernel.add_service(chat_completion_service) # Pass the request settings to the kernel arguments. @@ -111,125 +98,83 @@ def print_tool_calls(message: ChatMessageContent) -> None: print("\n[No tool calls returned by the model]") -async def handle_streaming( - kernel: Kernel, - chat_function: "KernelFunction", - arguments: KernelArguments, -) -> str | None: - """ - Handle the streaming response from the model. - This function demonstrates two possible paths: - - 1. When auto function calling is ON (auto_invoke=True): - - The model may call tools automatically and produce a continuous - stream of assistant messages. We can simply print these as they come in. - - 2. When auto function calling is OFF (auto_invoke=False): - - The model may instead return tool call instructions embedded in the stream. - We can track these calls using `function_invoke_attempt` attributes and print - them for the user. The user can then manually invoke the tools and return the results - to the model for further completion. - """ - - response = kernel.invoke_stream( - chat_function, - return_function_results=False, - arguments=arguments, - ) - - print("Mosscap:> ", end="", flush=True) - - # If auto_invoking is False, the model may return separate streaming chunks containing tool instructions. - # We'll store them here. 
- streamed_tool_chunks: list[StreamingChatMessageContent] = [] - - # For content messages (the final assistant's response text), store them here. - streamed_response_chunks: list[StreamingChatMessageContent] = [] - - async for message in response: - msg = message[0] - - # We only expect assistant messages here. - if not isinstance(msg, StreamingChatMessageContent) or msg.role != AuthorRole.ASSISTANT: - continue - - # When auto invocation is OFF, the model may send chunks that represent tool calls. - # Chunks that contain function call instructions will have a function_invoke_attempt attribute. - if hasattr(msg, "function_invoke_attempt"): - # This chunk is part of a tool call instruction sequence - streamed_tool_chunks.append(msg) - else: - # This chunk is normal assistant response text - streamed_response_chunks.append(msg) - print(str(msg), end="", flush=True) - - print("\n", flush=True) - - # If auto function calling was OFF, handle any tool call instructions we captured. - if streamed_tool_chunks: - # Group streamed chunks by `function_invoke_attempt` to handle each invocation attempt separately. - grouped_chunks = {} - for chunk in streamed_tool_chunks: - key = getattr(chunk, "function_invoke_attempt", None) - if key is not None: - grouped_chunks.setdefault(key, []).append(chunk) - - # Process each group of chunks - for attempt, chunks in grouped_chunks.items(): - try: - # Combine all chunks for a given attempt into one message. - combined_content = reduce(lambda first, second: first + second, chunks) - if hasattr(combined_content, "content"): - print(f"[function_invoke_attempt {attempt} content]:\n{combined_content.content}") - - print("[Auto function calling is OFF] Here are the returned tool calls:") - print_tool_calls(combined_content) - except Exception as e: - print(f"Error processing chunks for function_invoke_attempt {attempt}: {e}") - - # Return the final concatenated assistant response (if any). - if streamed_response_chunks: - return "".join([str(content) for content in streamed_response_chunks]) - return None - - -async def chat() -> bool: - """ - Continuously prompt the user for input and show the assistant's response. - Type 'exit' to exit. - """ - try: - user_input = input("User:> ") - except (KeyboardInterrupt, EOFError): - print("\n\nExiting chat...") - return False - - if user_input.lower().strip() == "exit": - print("\n\nExiting chat...") - return False - - arguments["user_input"] = user_input - arguments["chat_history"] = history - - result = await handle_streaming(kernel, chat_function, arguments=arguments) - - # Update the chat history with the user's input and the assistant's response - if result: - history.add_user_message(user_input) - history.add_assistant_message(str(result)) - - return True - - async def main() -> None: print( "Welcome to the chat bot!\n" " Type 'exit' to exit.\n" " Try a math question to see function calling in action (e.g. 'what is 3+3?')." 
) - chatting = True - while chatting: - chatting = await chat() + + while True: + # Get user input + try: + user_input = input("User:> ") + except (KeyboardInterrupt, EOFError): + print("\n\nExiting chat...") + break + + if user_input.lower().strip() == "exit": + print("\n\nExiting chat...") + break + + # Prepare arguments for the model invocation + arguments["user_input"] = user_input + arguments["chat_history"] = history + + print("Mosscap:> ", end="", flush=True) + + # Lists to store streamed chunks + streamed_tool_chunks: list[StreamingChatMessageContent] = [] + streamed_response_chunks: list[StreamingChatMessageContent] = [] + + async for message in kernel.invoke_stream( + chat_function, + return_function_results=False, + arguments=arguments, + ): + msg = message[0] + + # Expecting assistant messages only + if not isinstance(msg, StreamingChatMessageContent) or msg.role != AuthorRole.ASSISTANT: + continue + + # If auto_invoking is False, the model may send tool calls in separate chunks. + if hasattr(msg, "function_invoke_attempt"): + # This chunk is part of a tool call instruction + streamed_tool_chunks.append(msg) + else: + # Normal assistant response text + streamed_response_chunks.append(msg) + print(str(msg), end="", flush=True) + + print("\n", flush=True) + + # If we have tool call instructions + if streamed_tool_chunks: + # Group streamed tool chunks by `function_invoke_attempt` + grouped_chunks = {} + for chunk in streamed_tool_chunks: + key = getattr(chunk, "function_invoke_attempt", None) + if key is not None: + grouped_chunks.setdefault(key, []).append(chunk) + + # Process each group of chunks + for attempt, chunks in grouped_chunks.items(): + try: + combined_content = reduce(lambda first, second: first + second, chunks) + if hasattr(combined_content, "content"): + print(f"[function_invoke_attempt {attempt} content]:\n{combined_content.content}") + + print("[Auto function calling is OFF] Here are the returned tool calls:") + print_tool_calls(combined_content) + except Exception as e: + print(f"Error processing chunks for function_invoke_attempt {attempt}: {e}") + + # Update the chat history with user input and assistant response, if any + if streamed_response_chunks: + result = "".join([str(content) for content in streamed_response_chunks]) + history.add_user_message(user_input) + history.add_assistant_message(str(result)) if __name__ == "__main__":
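
Editorial note (not part of the patch): the key pattern the updated manual function calling sample relies on is grouping streamed tool-call chunks by the new `function_invoke_attempt` index and merging each group into a single message. The following is a minimal standalone sketch of that pattern as it appears in the diff above; it assumes each chunk exposes a `function_invoke_attempt` attribute and supports `+` concatenation, as `StreamingChatMessageContent` does in this PR.

```python
# Minimal sketch of the chunk-grouping pattern used in the updated sample.
# Assumes chunks carry a `function_invoke_attempt` attribute and can be
# combined with `+`, as StreamingChatMessageContent does in this PR.
from functools import reduce

from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent


def combine_tool_chunks_by_attempt(
    chunks: list[StreamingChatMessageContent],
) -> dict[int, StreamingChatMessageContent]:
    """Group streamed tool-call chunks by function_invoke_attempt and merge each group."""
    grouped: dict[int, list[StreamingChatMessageContent]] = {}
    for chunk in chunks:
        key = getattr(chunk, "function_invoke_attempt", None)
        if key is not None:
            grouped.setdefault(key, []).append(chunk)

    # Combine all chunks for a given attempt into one message via `+`.
    return {attempt: reduce(lambda first, second: first + second, group) for attempt, group in grouped.items()}
```

The combined message per attempt can then be passed to a helper such as the sample's `print_tool_calls` to inspect the tool call instructions before invoking them manually.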