import json
import os
import sys
import time
from datetime import datetime
from unittest.mock import AsyncMock, patch, MagicMock

import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path

import litellm


# NOTE: the parametrized model list is illustrative; "openai/self_hosted" is kept
# because the test body branches on it to supply a custom api_base.
@pytest.mark.parametrize(
    "model",
    [
        "openai/self_hosted",
        "openai/gpt-4o",
    ],
)
@pytest.mark.asyncio
async def test_litellm_overhead(model):
    litellm._turn_on_debug()
    start_time = datetime.now()
    if model == "openai/self_hosted":
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            api_base="https://exampleopenaiendpoint-production.up.railway.app/",
        )
    else:
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
        )
    end_time = datetime.now()
    total_time_ms = (end_time - start_time).total_seconds() * 1000
    print(response)
    print(response._hidden_params)
    litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"]
    # calculate percent of overhead caused by litellm
    overhead_percent = litellm_overhead_ms * 100 / total_time_ms
    print("##########################\n")
    print("total_time_ms", total_time_ms)
    print("response litellm_overhead_ms", litellm_overhead_ms)
    print("litellm overhead_percent {}%".format(overhead_percent))
    print("##########################\n")
    assert litellm_overhead_ms > 0
    assert litellm_overhead_ms < 1000
    # latency overhead should be less than total request time
    assert litellm_overhead_ms < (end_time - start_time).total_seconds() * 1000
    # latency overhead should be under 40% of total request time
    assert overhead_percent < 40

    pass
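

# Worked example of the overhead check above (illustrative numbers, not measured):
# if a request takes total_time_ms = 800 and the SDK reports
# litellm_overhead_ms = 40, then overhead_percent = 40 * 100 / 800 = 5.0,
# which satisfies both the < 1000 ms and the < 40% assertions.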


# Same check as above, but exercising the streaming code path.
@pytest.mark.parametrize(
    "model",
    [
        "openai/self_hosted",
        "openai/gpt-4o",
    ],
)
@pytest.mark.asyncio
async def test_litellm_overhead_stream(model):
    litellm._turn_on_debug()
    start_time = datetime.now()
    if model == "openai/self_hosted":
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            api_base="https://exampleopenaiendpoint-production.up.railway.app/",
            stream=True,
        )
    else:
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            stream=True,
        )

    # consume the stream so timing covers the full request
    async for chunk in response:
        print(chunk)

    end_time = datetime.now()
    total_time_ms = (end_time - start_time).total_seconds() * 1000
    print(response)
    print(response._hidden_params)
    litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"]
    # calculate percent of overhead caused by litellm
    overhead_percent = litellm_overhead_ms * 100 / total_time_ms
    print("##########################\n")
    print("total_time_ms", total_time_ms)
    print("response litellm_overhead_ms", litellm_overhead_ms)
    print("litellm overhead_percent {}%".format(overhead_percent))
    print("##########################\n")
    assert litellm_overhead_ms > 0
    assert litellm_overhead_ms < 1000
    # latency overhead should be less than total request time
    assert litellm_overhead_ms < (end_time - start_time).total_seconds() * 1000
    # latency overhead should be under 40% of total request time
    assert overhead_percent < 40

    pass
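

# Running these checks locally is a sketch under the assumption that this file is
# saved as test_litellm_overhead.py inside the litellm test tree and that the
# pytest-asyncio plugin is installed:
#   pytest test_litellm_overhead.py -s -k "overhead"
# -s surfaces the print() diagnostics; -k filters to the overhead tests.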