Litellm dev 12 24 2024 p4 (#7407)
* fix(invoke_handler.py): fix mock response iterator to handle tool calling

returns the tool call if one is returned by the model response

* fix(prometheus.py): add new 'tokens_by_tag' metric on prometheus

allows tracking 'token usage' by task

* feat(prometheus.py): add input + output token tracking by tag

* feat(prometheus.py): add tag-based deployment failure tracking

allows admins to track failures by use case
krrishdholakia authored Dec 25, 2024
1 parent 81be0b4 commit 39dabb2
Showing 5 changed files with 209 additions and 12 deletions.
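
All of the new *_by_tag metrics below follow the same pattern: a prometheus_client Counter labelled by tag, incremented once per tag attached to the request. A minimal standalone sketch of that pattern (illustrative metric and helper names, not the ones added in prometheus.py):

# Minimal sketch of the tag-labelled counter pattern used by this commit
# (illustrative names; not the LiteLLM implementation itself).
from typing import List

from prometheus_client import Counter

example_tokens_by_tag = Counter(
    "example_total_tokens_by_tag",  # hypothetical metric name
    "Total input + output tokens from LLM requests, by request tag",
    labelnames=["tag"],
)


def record_total_tokens(total_tokens: int, request_tags: List[str]) -> None:
    # One increment per tag, mirroring how the new metrics loop over request_tags.
    for tag in request_tags:
        example_tokens_by_tag.labels(tag=tag).inc(total_tokens)


record_total_tokens(2040, ["summarization", "prod"])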
101 changes: 91 additions & 10 deletions litellm/integrations/prometheus.py
@@ -76,7 +76,7 @@ def __init__(
UserAPIKeyLabelNames.TEAM.value,
UserAPIKeyLabelNames.TEAM_ALIAS.value,
UserAPIKeyLabelNames.USER.value,
UserAPIKeyLabelNames.LITELLM_MODEL.value,
UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value,
],
buckets=LATENCY_BUCKETS,
)
@@ -85,7 +85,7 @@ def __init__(
"litellm_llm_api_latency_metric",
"Total latency (seconds) for a models LLM API call",
labelnames=[
UserAPIKeyLabelNames.LITELLM_MODEL.value,
UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value,
UserAPIKeyLabelNames.API_KEY_HASH.value,
UserAPIKeyLabelNames.API_KEY_ALIAS.value,
UserAPIKeyLabelNames.TEAM.value,
@@ -140,6 +140,14 @@ def __init__(
],
)

# Counter for tokens by tag
self.litellm_tokens_by_tag_metric = Counter(
"litellm_total_tokens_by_tag",
"Total number of input + output tokens from LLM requests by custom metadata tags",
labelnames=[
UserAPIKeyLabelNames.TAG.value,
],
)
self.litellm_input_tokens_metric = Counter(
"litellm_input_tokens",
"Total number of input tokens from LLM requests",
@@ -153,6 +161,16 @@ def __init__(
"user",
],
)

# Counter for input tokens by tag
self.litellm_input_tokens_by_tag_metric = Counter(
"litellm_input_tokens_by_tag",
"Total number of input tokens from LLM requests by custom metadata tags",
labelnames=[
UserAPIKeyLabelNames.TAG.value,
],
)

self.litellm_output_tokens_metric = Counter(
"litellm_output_tokens",
"Total number of output tokens from LLM requests",
@@ -167,6 +185,15 @@ def __init__(
],
)

# Counter for output tokens by tag
self.litellm_output_tokens_by_tag_metric = Counter(
"litellm_output_tokens_by_tag",
"Total number of output tokens from LLM requests by custom metadata tags",
labelnames=[
UserAPIKeyLabelNames.TAG.value,
],
)

# Remaining Budget for Team
self.litellm_remaining_team_budget_metric = Gauge(
"litellm_remaining_team_budget_metric",
@@ -237,10 +264,10 @@ def __init__(

# Get all keys
_logged_llm_labels = [
"litellm_model_name",
"model_id",
"api_base",
"api_provider",
UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value,
UserAPIKeyLabelNames.MODEL_ID.value,
UserAPIKeyLabelNames.API_BASE.value,
UserAPIKeyLabelNames.API_PROVIDER.value,
]
team_and_key_labels = [
"hashed_api_key",
@@ -275,6 +302,16 @@ def __init__(
+ EXCEPTION_LABELS
+ team_and_key_labels,
)
self.litellm_deployment_failure_by_tag_responses = Counter(
"litellm_deployment_failure_by_tag_responses",
"Total number of failed LLM API calls for a specific LLM deploymeny by custom metadata tags",
labelnames=[
UserAPIKeyLabelNames.REQUESTED_MODEL.value,
UserAPIKeyLabelNames.TAG.value,
]
+ _logged_llm_labels
+ EXCEPTION_LABELS,
)
self.litellm_deployment_total_requests = Counter(
name="litellm_deployment_total_requests",
documentation="LLM Deployment Analytics - Total number of LLM API calls via litellm - success + failure",
@@ -490,6 +527,14 @@ def _increment_token_metrics(
user_id,
).inc(standard_logging_payload["total_tokens"])

_tags = standard_logging_payload["request_tags"]
for tag in _tags:
self.litellm_tokens_by_tag_metric.labels(
**{
UserAPIKeyLabelNames.TAG.value: tag,
}
).inc(standard_logging_payload["total_tokens"])

self.litellm_input_tokens_metric.labels(
end_user_id,
user_api_key,
@@ -500,6 +545,13 @@ def _increment_token_metrics(
user_id,
).inc(standard_logging_payload["prompt_tokens"])

for tag in _tags:
self.litellm_input_tokens_by_tag_metric.labels(
**{
UserAPIKeyLabelNames.TAG.value: tag,
}
).inc(standard_logging_payload["prompt_tokens"])

self.litellm_output_tokens_metric.labels(
end_user_id,
user_api_key,
@@ -510,6 +562,13 @@ def _increment_token_metrics(
user_id,
).inc(standard_logging_payload["completion_tokens"])

for tag in _tags:
self.litellm_output_tokens_by_tag_metric.labels(
**{
UserAPIKeyLabelNames.TAG.value: tag,
}
).inc(standard_logging_payload["completion_tokens"])

def _increment_remaining_budget_metrics(
self,
user_api_team: Optional[str],
@@ -651,7 +710,7 @@ def _set_latency_metrics(
api_call_total_time_seconds = api_call_total_time.total_seconds()
self.litellm_llm_api_latency_metric.labels(
**{
UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value: model,
UserAPIKeyLabelNames.API_KEY_HASH.value: user_api_key,
UserAPIKeyLabelNames.API_KEY_ALIAS.value: user_api_key_alias,
UserAPIKeyLabelNames.TEAM.value: user_api_team,
@@ -686,7 +745,7 @@ def _set_latency_metrics(
UserAPIKeyLabelNames.USER.value: standard_logging_payload[
"metadata"
]["user_api_key_user_id"],
UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value: model,
}
).observe(total_time_seconds)

@@ -862,6 +921,24 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict):
],
).inc()

# tag based tracking
_tags = standard_logging_payload["request_tags"]
for tag in _tags:
self.litellm_deployment_failure_by_tag_responses.labels(
**{
UserAPIKeyLabelNames.REQUESTED_MODEL.value: model_group,
UserAPIKeyLabelNames.TAG.value: tag,
UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value: litellm_model_name,
UserAPIKeyLabelNames.MODEL_ID.value: model_id,
UserAPIKeyLabelNames.API_BASE.value: api_base,
UserAPIKeyLabelNames.API_PROVIDER.value: llm_provider,
UserAPIKeyLabelNames.EXCEPTION_CLASS.value: exception.__class__.__name__,
UserAPIKeyLabelNames.EXCEPTION_STATUS.value: str(
getattr(exception, "status_code", None)
),
}
).inc()

self.litellm_deployment_total_requests.labels(
litellm_model_name=litellm_model_name,
model_id=model_id,
@@ -881,8 +958,12 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict):
).inc()

pass
except Exception:
pass
except Exception as e:
verbose_logger.debug(
"Prometheus Error: set_llm_deployment_failure_metrics. Exception occured - {}".format(
str(e)
)
)

def set_llm_deployment_success_metrics(
self,
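
For these tag metrics to record anything, the incoming request needs tags that the logging payload exposes as request_tags. A hedged usage sketch, assuming the metadata={"tags": [...]} convention is how tags reach the logger in your setup (the model name here is only an example):

# Hedged sketch: attach request tags so they surface in
# standard_logging_payload["request_tags"] and, with this commit, in the
# *_by_tag Prometheus metrics. The metadata={"tags": [...]} convention is an
# assumption; verify it against your proxy/SDK configuration.
import litellm

response = litellm.completion(
    model="gpt-4o-mini",  # any configured model/deployment
    messages=[{"role": "user", "content": "summarize this ticket"}],
    metadata={"tags": ["summarization", "prod"]},
)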
28 changes: 27 additions & 1 deletion litellm/llms/bedrock/chat/invoke_handler.py
@@ -9,7 +9,17 @@
import urllib.parse
import uuid
from functools import partial
from typing import Any, AsyncIterator, Callable, Iterator, List, Optional, Tuple, Union
from typing import (
Any,
AsyncIterator,
Callable,
Iterator,
List,
Optional,
Tuple,
Union,
cast,
)

import httpx # type: ignore

@@ -36,8 +46,10 @@
from litellm.types.llms.bedrock import *
from litellm.types.llms.openai import (
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionUsageBlock,
)
from litellm.types.utils import ChatCompletionMessageToolCall, Choices
from litellm.types.utils import GenericStreamingChunk as GChunk
from litellm.types.utils import ModelResponse, Usage
from litellm.utils import CustomStreamWrapper, get_secret
@@ -1294,11 +1306,25 @@ def _chunk_parser(self, chunk_data: ModelResponse) -> GChunk:
chunk_usage: Usage = getattr(chunk_data, "usage")
text = chunk_data.choices[0].message.content or "" # type: ignore
tool_use = None
_model_response_tool_call = cast(
Optional[List[ChatCompletionMessageToolCall]],
cast(Choices, chunk_data.choices[0]).message.tool_calls,
)
if self.json_mode is True:
text, tool_use = self._handle_json_mode_chunk(
text=text,
tool_calls=chunk_data.choices[0].message.tool_calls, # type: ignore
)
elif _model_response_tool_call is not None:
tool_use = ChatCompletionToolCallChunk(
id=_model_response_tool_call[0].id,
type="function",
function=ChatCompletionToolCallFunctionChunk(
name=_model_response_tool_call[0].function.name,
arguments=_model_response_tool_call[0].function.arguments,
),
index=0,
)
processed_chunk = GChunk(
text=text,
tool_use=tool_use,
9 changes: 8 additions & 1 deletion litellm/types/integrations/prometheus.py
@@ -53,4 +53,11 @@ class UserAPIKeyLabelNames(Enum):
TEAM = "team"
TEAM_ALIAS = "team_alias"
REQUESTED_MODEL = REQUESTED_MODEL
LITELLM_MODEL = "model"
v1_LITELLM_MODEL_NAME = "model"
v2_LITELLM_MODEL_NAME = "litellm_model_name"
TAG = "tag"
MODEL_ID = "model_id"
API_BASE = "api_base"
API_PROVIDER = "api_provider"
EXCEPTION_STATUS = EXCEPTION_STATUS
EXCEPTION_CLASS = EXCEPTION_CLASS
17 changes: 17 additions & 0 deletions tests/local_testing/test_stream_chunk_builder.py
@@ -745,3 +745,20 @@ def test_stream_chunk_builder_empty_initial_chunk():

id = ChunkProcessor._get_chunk_id(chunks)
assert id == "1"


import json


def get_current_weather(location, unit="fahrenheit"):
"""Get the current weather in a given location"""
if "tokyo" in location.lower():
return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"})
elif "san francisco" in location.lower():
return json.dumps(
{"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}
)
elif "paris" in location.lower():
return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
else:
return json.dumps({"location": location, "temperature": "unknown"})
66 changes: 66 additions & 0 deletions tests/local_testing/test_streaming.py
@@ -3990,3 +3990,69 @@ def test_streaming_api_base():
stream=True,
)
assert "https://api.openai.com" in stream._hidden_params["api_base"]


def test_mock_response_iterator_tool_use():
"""
Relevant Issue: https://github.com/BerriAI/litellm/issues/7364
"""
from litellm.llms.bedrock.chat.invoke_handler import MockResponseIterator
from litellm.types.utils import (
ChatCompletionMessageToolCall,
Function,
Message,
Usage,
CompletionTokensDetailsWrapper,
PromptTokensDetailsWrapper,
Choices,
)

litellm.set_verbose = False
response = ModelResponse(
id="chatcmpl-Ai8KRI5vJPZXQ9SQvEJfTVuVqkyEZ",
created=1735081811,
model="o1-2024-12-17",
object="chat.completion",
system_fingerprint="fp_e6d02d4a78",
choices=[
Choices(
finish_reason="tool_calls",
index=0,
message=Message(
content=None,
role="assistant",
tool_calls=[
ChatCompletionMessageToolCall(
function=Function(
arguments='{"location":"San Francisco, CA","unit":"fahrenheit"}',
name="get_current_weather",
),
id="call_BfRX2S7YCKL0BtxbWMl89ZNk",
type="function",
)
],
function_call=None,
),
)
],
usage=Usage(
completion_tokens=1955,
prompt_tokens=85,
total_tokens=2040,
completion_tokens_details=CompletionTokensDetailsWrapper(
accepted_prediction_tokens=0,
audio_tokens=0,
reasoning_tokens=1920,
rejected_prediction_tokens=0,
text_tokens=None,
),
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None
),
),
service_tier=None,
)
completion_stream = MockResponseIterator(model_response=response)
response_chunk = completion_stream._chunk_parser(chunk_data=response)

assert response_chunk["tool_use"] is not None
