Litellm dev 12 24 2024 p4 (#7407)
* fix(invoke_handler.py): fix mock response iterator to handle tool calling

returns the tool call if one is returned by the model response

* fix(prometheus.py): add new 'tokens_by_tag' metric on prometheus

allows tracking 'token usage' by task

* feat(prometheus.py): add input + output token tracking by tag

* feat(prometheus.py): add tag-based deployment failure tracking

allows admins to track failures by use case
krrishdholakia authored Dec 25, 2024
1 parent 81be0b4 commit 39dabb2
Showing 5 changed files with 209 additions and 12 deletions.
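
All of the new *_by_tag metrics below follow the same pattern: a prometheus_client Counter labelled by tag, incremented once per tag attached to the request. A minimal standalone sketch of that pattern (illustrative metric and helper names, not the ones added in prometheus.py):

# Minimal sketch of the tag-labelled counter pattern used by this commit
# (illustrative names; not the LiteLLM implementation itself).
from typing import List

from prometheus_client import Counter

example_tokens_by_tag = Counter(
    "example_total_tokens_by_tag",  # hypothetical metric name
    "Total input + output tokens from LLM requests, by request tag",
    labelnames=["tag"],
)


def record_total_tokens(total_tokens: int, request_tags: List[str]) -> None:
    # One increment per tag, mirroring how the new metrics loop over request_tags.
    for tag in request_tags:
        example_tokens_by_tag.labels(tag=tag).inc(total_tokens)


record_total_tokens(2040, ["summarization", "prod"])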
101 changes: 91 additions & 10 deletions litellm/integrations/prometheus.py
@@ -76,7 +76,7 @@ def __init__(
UserAPIKeyLabelNames.TEAM.value,
UserAPIKeyLabelNames.TEAM_ALIAS.value,
UserAPIKeyLabelNames.USER.value,
UserAPIKeyLabelNames.LITELLM_MODEL.value,
UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value,
],
buckets=LATENCY_BUCKETS,
)
@@ -85,7 +85,7 @@ def __init__(
"litellm_llm_api_latency_metric",
"Total latency (seconds) for a models LLM API call",
labelnames=[
UserAPIKeyLabelNames.LITELLM_MODEL.value,
UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value,
UserAPIKeyLabelNames.API_KEY_HASH.value,
UserAPIKeyLabelNames.API_KEY_ALIAS.value,
UserAPIKeyLabelNames.TEAM.value,
@@ -140,6 +140,14 @@ def __init__(
],
)

# Counter for tokens by tag
self.litellm_tokens_by_tag_metric = Counter(
"litellm_total_tokens_by_tag",
"Total number of input + output tokens from LLM requests by custom metadata tags",
labelnames=[
UserAPIKeyLabelNames.TAG.value,
],
)
self.litellm_input_tokens_metric = Counter(
"litellm_input_tokens",
"Total number of input tokens from LLM requests",
@@ -153,6 +161,16 @@ def __init__(
"user",
],
)

# Counter for input tokens by tag
self.litellm_input_tokens_by_tag_metric = Counter(
"litellm_input_tokens_by_tag",
"Total number of input tokens from LLM requests by custom metadata tags",
labelnames=[
UserAPIKeyLabelNames.TAG.value,
],
)

self.litellm_output_tokens_metric = Counter(
"litellm_output_tokens",
"Total number of output tokens from LLM requests",
@@ -167,6 +185,15 @@ def __init__(
],
)

# Counter for output tokens by tag
self.litellm_output_tokens_by_tag_metric = Counter(
"litellm_output_tokens_by_tag",
"Total number of output tokens from LLM requests by custom metadata tags",
labelnames=[
UserAPIKeyLabelNames.TAG.value,
],
)

# Remaining Budget for Team
self.litellm_remaining_team_budget_metric = Gauge(
"litellm_remaining_team_budget_metric",
@@ -237,10 +264,10 @@ def __init__(

# Get all keys
_logged_llm_labels = [
"litellm_model_name",
"model_id",
"api_base",
"api_provider",
UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value,
UserAPIKeyLabelNames.MODEL_ID.value,
UserAPIKeyLabelNames.API_BASE.value,
UserAPIKeyLabelNames.API_PROVIDER.value,
]
team_and_key_labels = [
"hashed_api_key",
@@ -275,6 +302,16 @@ def __init__(
+ EXCEPTION_LABELS
+ team_and_key_labels,
)
self.litellm_deployment_failure_by_tag_responses = Counter(
"litellm_deployment_failure_by_tag_responses",
"Total number of failed LLM API calls for a specific LLM deploymeny by custom metadata tags",
labelnames=[
UserAPIKeyLabelNames.REQUESTED_MODEL.value,
UserAPIKeyLabelNames.TAG.value,
]
+ _logged_llm_labels
+ EXCEPTION_LABELS,
)
self.litellm_deployment_total_requests = Counter(
name="litellm_deployment_total_requests",
documentation="LLM Deployment Analytics - Total number of LLM API calls via litellm - success + failure",
@@ -490,6 +527,14 @@ def _increment_token_metrics(
user_id,
).inc(standard_logging_payload["total_tokens"])

_tags = standard_logging_payload["request_tags"]
for tag in _tags:
self.litellm_tokens_by_tag_metric.labels(
**{
UserAPIKeyLabelNames.TAG.value: tag,
}
).inc(standard_logging_payload["total_tokens"])

self.litellm_input_tokens_metric.labels(
end_user_id,
user_api_key,
@@ -500,6 +545,13 @@ def _increment_token_metrics(
user_id,
).inc(standard_logging_payload["prompt_tokens"])

for tag in _tags:
self.litellm_input_tokens_by_tag_metric.labels(
**{
UserAPIKeyLabelNames.TAG.value: tag,
}
).inc(standard_logging_payload["prompt_tokens"])

self.litellm_output_tokens_metric.labels(
end_user_id,
user_api_key,
@@ -510,6 +562,13 @@ def _increment_token_metrics(
user_id,
).inc(standard_logging_payload["completion_tokens"])

for tag in _tags:
self.litellm_output_tokens_by_tag_metric.labels(
**{
UserAPIKeyLabelNames.TAG.value: tag,
}
).inc(standard_logging_payload["completion_tokens"])

def _increment_remaining_budget_metrics(
self,
user_api_team: Optional[str],
@@ -651,7 +710,7 @@ def _set_latency_metrics(
api_call_total_time_seconds = api_call_total_time.total_seconds()
self.litellm_llm_api_latency_metric.labels(
**{
UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value: model,
UserAPIKeyLabelNames.API_KEY_HASH.value: user_api_key,
UserAPIKeyLabelNames.API_KEY_ALIAS.value: user_api_key_alias,
UserAPIKeyLabelNames.TEAM.value: user_api_team,
@@ -686,7 +745,7 @@ def _set_latency_metrics(
UserAPIKeyLabelNames.USER.value: standard_logging_payload[
"metadata"
]["user_api_key_user_id"],
UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value: model,
}
).observe(total_time_seconds)

@@ -862,6 +921,24 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict):
],
).inc()

# tag based tracking
_tags = standard_logging_payload["request_tags"]
for tag in _tags:
self.litellm_deployment_failure_by_tag_responses.labels(
**{
UserAPIKeyLabelNames.REQUESTED_MODEL.value: model_group,
UserAPIKeyLabelNames.TAG.value: tag,
UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value: litellm_model_name,
UserAPIKeyLabelNames.MODEL_ID.value: model_id,
UserAPIKeyLabelNames.API_BASE.value: api_base,
UserAPIKeyLabelNames.API_PROVIDER.value: llm_provider,
UserAPIKeyLabelNames.EXCEPTION_CLASS.value: exception.__class__.__name__,
UserAPIKeyLabelNames.EXCEPTION_STATUS.value: str(
getattr(exception, "status_code", None)
),
}
).inc()

self.litellm_deployment_total_requests.labels(
litellm_model_name=litellm_model_name,
model_id=model_id,
@@ -881,8 +958,12 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict):
).inc()

pass
except Exception:
pass
except Exception as e:
verbose_logger.debug(
"Prometheus Error: set_llm_deployment_failure_metrics. Exception occured - {}".format(
str(e)
)
)

def set_llm_deployment_success_metrics(
self,
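
For these tag metrics to record anything, the incoming request needs tags that the logging payload exposes as request_tags. A hedged usage sketch, assuming the metadata={"tags": [...]} convention is how tags reach the logger in your setup (the model name here is only an example):

# Hedged sketch: attach request tags so they surface in
# standard_logging_payload["request_tags"] and, with this commit, in the
# *_by_tag Prometheus metrics. The metadata={"tags": [...]} convention is an
# assumption; verify it against your proxy/SDK configuration.
import litellm

response = litellm.completion(
    model="gpt-4o-mini",  # any configured model/deployment
    messages=[{"role": "user", "content": "summarize this ticket"}],
    metadata={"tags": ["summarization", "prod"]},
)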
28 changes: 27 additions & 1 deletion litellm/llms/bedrock/chat/invoke_handler.py
@@ -9,7 +9,17 @@
import urllib.parse
import uuid
from functools import partial
from typing import Any, AsyncIterator, Callable, Iterator, List, Optional, Tuple, Union
from typing import (
Any,
AsyncIterator,
Callable,
Iterator,
List,
Optional,
Tuple,
Union,
cast,
)

import httpx # type: ignore

@@ -36,8 +46,10 @@
from litellm.types.llms.bedrock import *
from litellm.types.llms.openai import (
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionUsageBlock,
)
from litellm.types.utils import ChatCompletionMessageToolCall, Choices
from litellm.types.utils import GenericStreamingChunk as GChunk
from litellm.types.utils import ModelResponse, Usage
from litellm.utils import CustomStreamWrapper, get_secret
@@ -1294,11 +1306,25 @@ def _chunk_parser(self, chunk_data: ModelResponse) -> GChunk:
chunk_usage: Usage = getattr(chunk_data, "usage")
text = chunk_data.choices[0].message.content or "" # type: ignore
tool_use = None
_model_response_tool_call = cast(
Optional[List[ChatCompletionMessageToolCall]],
cast(Choices, chunk_data.choices[0]).message.tool_calls,
)
if self.json_mode is True:
text, tool_use = self._handle_json_mode_chunk(
text=text,
tool_calls=chunk_data.choices[0].message.tool_calls, # type: ignore
)
elif _model_response_tool_call is not None:
tool_use = ChatCompletionToolCallChunk(
id=_model_response_tool_call[0].id,
type="function",
function=ChatCompletionToolCallFunctionChunk(
name=_model_response_tool_call[0].function.name,
arguments=_model_response_tool_call[0].function.arguments,
),
index=0,
)
processed_chunk = GChunk(
text=text,
tool_use=tool_use,
9 changes: 8 additions & 1 deletion litellm/types/integrations/prometheus.py
@@ -53,4 +53,11 @@ class UserAPIKeyLabelNames(Enum):
TEAM = "team"
TEAM_ALIAS = "team_alias"
REQUESTED_MODEL = REQUESTED_MODEL
LITELLM_MODEL = "model"
v1_LITELLM_MODEL_NAME = "model"
v2_LITELLM_MODEL_NAME = "litellm_model_name"
TAG = "tag"
MODEL_ID = "model_id"
API_BASE = "api_base"
API_PROVIDER = "api_provider"
EXCEPTION_STATUS = EXCEPTION_STATUS
EXCEPTION_CLASS = EXCEPTION_CLASS
17 changes: 17 additions & 0 deletions tests/local_testing/test_stream_chunk_builder.py
@@ -745,3 +745,20 @@ def test_stream_chunk_builder_empty_initial_chunk():

id = ChunkProcessor._get_chunk_id(chunks)
assert id == "1"


import json


def get_current_weather(location, unit="fahrenheit"):
"""Get the current weather in a given location"""
if "tokyo" in location.lower():
return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"})
elif "san francisco" in location.lower():
return json.dumps(
{"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}
)
elif "paris" in location.lower():
return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
else:
return json.dumps({"location": location, "temperature": "unknown"})
66 changes: 66 additions & 0 deletions tests/local_testing/test_streaming.py
@@ -3990,3 +3990,69 @@ def test_streaming_api_base():
stream=True,
)
assert "https://api.openai.com" in stream._hidden_params["api_base"]


def test_mock_response_iterator_tool_use():
"""
Relevant Issue: https://github.com/BerriAI/litellm/issues/7364
"""
from litellm.llms.bedrock.chat.invoke_handler import MockResponseIterator
from litellm.types.utils import (
ChatCompletionMessageToolCall,
Function,
Message,
Usage,
CompletionTokensDetailsWrapper,
PromptTokensDetailsWrapper,
Choices,
)

litellm.set_verbose = False
response = ModelResponse(
id="chatcmpl-Ai8KRI5vJPZXQ9SQvEJfTVuVqkyEZ",
created=1735081811,
model="o1-2024-12-17",
object="chat.completion",
system_fingerprint="fp_e6d02d4a78",
choices=[
Choices(
finish_reason="tool_calls",
index=0,
message=Message(
content=None,
role="assistant",
tool_calls=[
ChatCompletionMessageToolCall(
function=Function(
arguments='{"location":"San Francisco, CA","unit":"fahrenheit"}',
name="get_current_weather",
),
id="call_BfRX2S7YCKL0BtxbWMl89ZNk",
type="function",
)
],
function_call=None,
),
)
],
usage=Usage(
completion_tokens=1955,
prompt_tokens=85,
total_tokens=2040,
completion_tokens_details=CompletionTokensDetailsWrapper(
accepted_prediction_tokens=0,
audio_tokens=0,
reasoning_tokens=1920,
rejected_prediction_tokens=0,
text_tokens=None,
),
prompt_tokens_details=PromptTokensDetailsWrapper(
audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None
),
),
service_tier=None,
)
completion_stream = MockResponseIterator(model_response=response)
response_chunk = completion_stream._chunk_parser(chunk_data=response)

assert response_chunk["tool_use"] is not None
