MemGPT/letta/llm_api/openai.py

import warnings
from typing import Generator, List, Optional, Union

import httpx
import requests
from openai import OpenAI

from letta.constants import LETTA_MODEL_ENDPOINT
from letta.errors import ErrorCode, LLMAuthenticationError, LLMError
from letta.helpers.datetime_helpers import timestamp_to_datetime
from letta.llm_api.helpers import add_inner_thoughts_to_functions, convert_to_structured_output, make_post_request
from letta.llm_api.openai_client import (
    accepts_developer_role,
    requires_auto_tool_choice,
    supports_parallel_tool_calling,
    supports_structured_output,
    supports_temperature_param,
)
from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION, INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST
from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
from letta.log import get_logger
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message as _Message
from letta.schemas.message import MessageRole as _MessageRole
from letta.schemas.openai.chat_completion_request import ChatCompletionRequest
from letta.schemas.openai.chat_completion_request import FunctionCall as ToolFunctionChoiceFunctionCall
from letta.schemas.openai.chat_completion_request import FunctionSchema, Tool, ToolFunctionChoice, cast_message_to_subtype
from letta.schemas.openai.chat_completion_response import (
    ChatCompletionChunkResponse,
    ChatCompletionResponse,
    Choice,
    FunctionCall,
    Message,
    ToolCall,
    UsageStatistics,
)
from letta.schemas.openai.embedding_response import EmbeddingResponse
from letta.streaming_interface import AgentChunkStreamingInterface, AgentRefreshStreamingInterface
from letta.tracing import log_event
from letta.utils import get_tool_call_id, smart_urljoin

logger = get_logger(__name__)


def openai_check_valid_api_key(base_url: str, api_key: Union[str, None]) -> None:
    if api_key:
        try:
            # just get model list to check if the api key is valid until we find a cheaper / quicker endpoint
            openai_get_model_list(url=base_url, api_key=api_key)
        except requests.HTTPError as e:
            if e.response.status_code == 401:
                raise LLMAuthenticationError(message=f"Failed to authenticate with OpenAI: {e}", code=ErrorCode.UNAUTHENTICATED)
            raise e
        except Exception as e:
            raise LLMError(message=f"{e}", code=ErrorCode.INTERNAL_SERVER_ERROR)
    else:
        raise ValueError("No API key provided")

def openai_get_model_list(url: str, api_key: Optional[str] = None, fix_url: bool = False, extra_params: Optional[dict] = None) -> dict:
    """https://platform.openai.com/docs/api-reference/models/list"""
    from letta.utils import printd

    # In some cases we may want to double-check the URL and do basic correction, eg:
    # In Letta config the address for vLLM is w/o a /v1 suffix for simplicity
    # However if we're treating the server as an OpenAI proxy we want the /v1 suffix on our model hit
    if fix_url:
        if not url.endswith("/v1"):
            url = smart_urljoin(url, "v1")

    url = smart_urljoin(url, "models")

    headers = {"Content-Type": "application/json"}
    if api_key is not None:
        headers["Authorization"] = f"Bearer {api_key}"

    printd(f"Sending request to {url}")
    response = None
    try:
        # TODO add query param "tool" to be true
        response = requests.get(url, headers=headers, params=extra_params)
        response.raise_for_status()  # Raises HTTPError for 4XX/5XX status
        response = response.json()  # convert to dict from string
        printd(f"response = {response}")
        return response
    except requests.exceptions.HTTPError as http_err:
        # Handle HTTP errors (e.g., response 4XX, 5XX)
        try:
            if response:
                response = response.json()
        except:
            pass
        printd(f"Got HTTPError, exception={http_err}, response={response}")
        raise http_err
    except requests.exceptions.RequestException as req_err:
        # Handle other requests-related errors (e.g., connection error)
        try:
            if response:
                response = response.json()
        except:
            pass
        printd(f"Got RequestException, exception={req_err}, response={response}")
        raise req_err
    except Exception as e:
        # Handle other potential errors
        try:
            if response:
                response = response.json()
        except:
            pass
        printd(f"Got unknown Exception, exception={e}, response={response}")
        raise e


async def openai_get_model_list_async(
    url: str,
    api_key: Optional[str] = None,
    fix_url: bool = False,
    extra_params: Optional[dict] = None,
    client: Optional["httpx.AsyncClient"] = None,
) -> dict:
    """https://platform.openai.com/docs/api-reference/models/list"""
    from letta.utils import printd

    # In some cases we may want to double-check the URL and do basic correction
    if fix_url and not url.endswith("/v1"):
        url = smart_urljoin(url, "v1")

    url = smart_urljoin(url, "models")

    headers = {"Content-Type": "application/json"}
    if api_key is not None:
        headers["Authorization"] = f"Bearer {api_key}"

    printd(f"Sending request to {url}")

    # Use provided client or create a new one
    close_client = False
    if client is None:
        client = httpx.AsyncClient()
        close_client = True

    try:
        response = await client.get(url, headers=headers, params=extra_params)
        response.raise_for_status()
        result = response.json()
        printd(f"response = {result}")
        return result
    except httpx.HTTPStatusError as http_err:
        # Handle HTTP errors (e.g., response 4XX, 5XX)
        error_response = None
        try:
            error_response = http_err.response.json()
        except:
            error_response = {"status_code": http_err.response.status_code, "text": http_err.response.text}
        printd(f"Got HTTPError, exception={http_err}, response={error_response}")
        raise http_err
    except httpx.RequestError as req_err:
        # Handle other httpx-related errors (e.g., connection error)
        printd(f"Got RequestException, exception={req_err}")
        raise req_err
    except Exception as e:
        # Handle other potential errors
        printd(f"Got unknown Exception, exception={e}")
        raise e
    finally:
        if close_client:
            await client.aclose()


def build_openai_chat_completions_request(
    llm_config: LLMConfig,
    messages: List[_Message],
    user_id: Optional[str],
    functions: Optional[list],
    function_call: Optional[str],
    use_tool_naming: bool,
    put_inner_thoughts_first: bool = True,
    use_structured_output: bool = True,
) -> ChatCompletionRequest:
    if functions and llm_config.put_inner_thoughts_in_kwargs:
        # Special case for LM Studio backend since it needs extra guidance to force out the thoughts first
        # TODO(fix)
        inner_thoughts_desc = (
            INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST if ":1234" in llm_config.model_endpoint else INNER_THOUGHTS_KWARG_DESCRIPTION
        )
        functions = add_inner_thoughts_to_functions(
            functions=functions,
            inner_thoughts_key=INNER_THOUGHTS_KWARG,
            inner_thoughts_description=inner_thoughts_desc,
            put_inner_thoughts_first=put_inner_thoughts_first,
        )

    use_developer_message = accepts_developer_role(llm_config.model)

    openai_message_list = [
        cast_message_to_subtype(
            m.to_openai_dict(
                put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
                use_developer_message=use_developer_message,
            )
        )
        for m in messages
    ]

    if llm_config.model:
        model = llm_config.model
    else:
        warnings.warn(f"Model type not set in llm_config: {llm_config.model_dump_json(indent=4)}")
        model = None

    if use_tool_naming:
        if function_call is None:
            tool_choice = None
        elif function_call not in ["none", "auto", "required"]:
            tool_choice = ToolFunctionChoice(type="function", function=ToolFunctionChoiceFunctionCall(name=function_call))
        else:
            if requires_auto_tool_choice(llm_config):
                tool_choice = "auto"
            else:
                tool_choice = function_call
        data = ChatCompletionRequest(
            model=model,
            messages=openai_message_list,
            tools=[Tool(type="function", function=f) for f in functions] if functions else None,
            tool_choice=tool_choice,
            user=str(user_id),
            max_completion_tokens=llm_config.max_tokens,
            temperature=llm_config.temperature if supports_temperature_param(model) else None,
            reasoning_effort=llm_config.reasoning_effort,
        )
    else:
        data = ChatCompletionRequest(
            model=model,
            messages=openai_message_list,
            functions=functions,
            function_call=function_call,
            user=str(user_id),
            max_completion_tokens=llm_config.max_tokens,
            temperature=1.0 if llm_config.enable_reasoner else llm_config.temperature,
            reasoning_effort=llm_config.reasoning_effort,
        )
        # https://platform.openai.com/docs/guides/text-generation/json-mode
        # only supported by gpt-4o, gpt-4-turbo, or gpt-3.5-turbo
        # if "gpt-4o" in llm_config.model or "gpt-4-turbo" in llm_config.model or "gpt-3.5-turbo" in llm_config.model:
        # data.response_format = {"type": "json_object"}

    # always set user id for openai requests
    if user_id:
        data.user = str(user_id)

    if llm_config.model_endpoint == LETTA_MODEL_ENDPOINT:
        if not user_id:
            # override user id for inference.letta.com
            import uuid

            data.user = str(uuid.UUID(int=0))

        data.model = "memgpt-openai"

    if use_structured_output and data.tools is not None and len(data.tools) > 0:
        # Convert to structured output style (which has 'strict' and no optionals)
        for tool in data.tools:
            if supports_structured_output(llm_config):
                try:
                    # tool["function"] = convert_to_structured_output(tool["function"])
                    structured_output_version = convert_to_structured_output(tool.function.model_dump())
                    tool.function = FunctionSchema(**structured_output_version)
                except ValueError as e:
                    warnings.warn(f"Failed to convert tool function to structured output, tool={tool}, error={e}")
    return data


def openai_chat_completions_process_stream(
    url: str,
    api_key: str,
    chat_completion_request: ChatCompletionRequest,
    stream_interface: Optional[Union[AgentChunkStreamingInterface, AgentRefreshStreamingInterface]] = None,
    create_message_id: bool = True,
    create_message_datetime: bool = True,
    override_tool_call_id: bool = True,
    # if we expect reasoning content in the response,
    # then we should emit reasoning_content as "inner_thoughts"
    # however, we don't necessarily want to put these
    # expect_reasoning_content: bool = False,
    expect_reasoning_content: bool = True,
    name: Optional[str] = None,
) -> ChatCompletionResponse:
    """Process a streaming completion response, and return a ChatCompletionResponse at the end.

    To "stream" the response in Letta, we want to call a streaming-compatible interface function
    on the chunks received from the OpenAI-compatible server POST SSE response.
    """
    assert chat_completion_request.stream == True
    assert stream_interface is not None, "Required"

    # Count the prompt tokens
    # TODO move to post-request?
    chat_history = [m.model_dump(exclude_none=True) for m in chat_completion_request.messages]
    # print(chat_history)

    prompt_tokens = num_tokens_from_messages(
        messages=chat_history,
        model=chat_completion_request.model,
    )
    # We also need to add the cost of including the functions list to the input prompt
    if chat_completion_request.tools is not None:
        assert chat_completion_request.functions is None
        prompt_tokens += num_tokens_from_functions(
            functions=[t.function.model_dump() for t in chat_completion_request.tools],
            model=chat_completion_request.model,
        )
    elif chat_completion_request.functions is not None:
        assert chat_completion_request.tools is None
        prompt_tokens += num_tokens_from_functions(
            functions=[f.model_dump() for f in chat_completion_request.functions],
            model=chat_completion_request.model,
        )

    # Create a dummy Message object to get an ID and date
    # TODO(sarah): add message ID generation function
    dummy_message = _Message(
        role=_MessageRole.assistant,
        content=[],
        agent_id="",
        model="",
        name=None,
        tool_calls=None,
        tool_call_id=None,
    )

    TEMP_STREAM_RESPONSE_ID = "temp_id"
    TEMP_STREAM_FINISH_REASON = "temp_null"
    TEMP_STREAM_TOOL_CALL_ID = "temp_id"
    chat_completion_response = ChatCompletionResponse(
        id=dummy_message.id if create_message_id else TEMP_STREAM_RESPONSE_ID,
        choices=[],
        created=int(dummy_message.created_at.timestamp()),  # NOTE: doesn't matter since both will do get_utc_time()
        model=chat_completion_request.model,
        usage=UsageStatistics(
            completion_tokens=0,
            prompt_tokens=prompt_tokens,
            total_tokens=prompt_tokens,
        ),
    )

    log_event(name="llm_request_sent", attributes=chat_completion_request.model_dump())

    if stream_interface:
        stream_interface.stream_start()

    n_chunks = 0  # approx == n_tokens
    chunk_idx = 0
    prev_message_type = None
    message_idx = 0
    try:
        for chat_completion_chunk in openai_chat_completions_request_stream(
            url=url, api_key=api_key, chat_completion_request=chat_completion_request
        ):
            assert isinstance(chat_completion_chunk, ChatCompletionChunkResponse), type(chat_completion_chunk)
            if chat_completion_chunk.choices is None or len(chat_completion_chunk.choices) == 0:
                warnings.warn(f"No choices in chunk: {chat_completion_chunk}")
                continue

            # NOTE: this assumes that the tool call ID will only appear in one of the chunks during the stream
            if override_tool_call_id:
                for choice in chat_completion_chunk.choices:
                    if choice.delta.tool_calls and len(choice.delta.tool_calls) > 0:
                        for tool_call in choice.delta.tool_calls:
                            if tool_call.id is not None:
                                tool_call.id = get_tool_call_id()

            if stream_interface:
                if isinstance(stream_interface, AgentChunkStreamingInterface):
                    message_type = stream_interface.process_chunk(
                        chat_completion_chunk,
                        message_id=chat_completion_response.id if create_message_id else chat_completion_chunk.id,
                        message_date=(
                            timestamp_to_datetime(chat_completion_response.created)
                            if create_message_datetime
                            else timestamp_to_datetime(chat_completion_chunk.created)
                        ),
                        expect_reasoning_content=expect_reasoning_content,
                        name=name,
                        message_index=message_idx,
                        prev_message_type=prev_message_type,
                    )
                    if message_type != prev_message_type and message_type is not None and prev_message_type is not None:
                        message_idx += 1
                    if message_type is not None:
                        prev_message_type = message_type
                elif isinstance(stream_interface, AgentRefreshStreamingInterface):
                    stream_interface.process_refresh(chat_completion_response)
                else:
                    raise TypeError(stream_interface)

            if chunk_idx == 0:
                # initialize the choice objects which we will increment with the deltas
                num_choices = len(chat_completion_chunk.choices)
                assert num_choices > 0
                chat_completion_response.choices = [
                    Choice(
                        finish_reason=TEMP_STREAM_FINISH_REASON,  # NOTE: needs to be ovrerwritten
                        index=i,
                        message=Message(
                            role="assistant",
                        ),
                    )
                    for i in range(len(chat_completion_chunk.choices))
                ]

            # add the choice delta
            assert len(chat_completion_chunk.choices) == len(chat_completion_response.choices), chat_completion_chunk
            for chunk_choice in chat_completion_chunk.choices:
                if chunk_choice.finish_reason is not None:
                    chat_completion_response.choices[chunk_choice.index].finish_reason = chunk_choice.finish_reason

                if chunk_choice.logprobs is not None:
                    chat_completion_response.choices[chunk_choice.index].logprobs = chunk_choice.logprobs

                accum_message = chat_completion_response.choices[chunk_choice.index].message
                message_delta = chunk_choice.delta

                if message_delta.content is not None:
                    content_delta = message_delta.content
                    if accum_message.content is None:
                        accum_message.content = content_delta
                    else:
                        accum_message.content += content_delta

                if expect_reasoning_content and message_delta.reasoning_content is not None:
                    reasoning_content_delta = message_delta.reasoning_content
                    if accum_message.reasoning_content is None:
                        accum_message.reasoning_content = reasoning_content_delta
                    else:
                        accum_message.reasoning_content += reasoning_content_delta

                # TODO(charles) make sure this works for parallel tool calling?
                if message_delta.tool_calls is not None:
                    tool_calls_delta = message_delta.tool_calls

                    # If this is the first tool call showing up in a chunk, initialize the list with it
                    if accum_message.tool_calls is None:
                        accum_message.tool_calls = [
                            ToolCall(id=TEMP_STREAM_TOOL_CALL_ID, function=FunctionCall(name="", arguments=""))
                            for _ in range(len(tool_calls_delta))
                        ]

                    # There may be many tool calls in a tool calls delta (e.g. parallel tool calls)
                    for tool_call_delta in tool_calls_delta:
                        if tool_call_delta.id is not None:
                            # TODO assert that we're not overwriting?
                            # TODO += instead of =?
                            try:
                                accum_message.tool_calls[tool_call_delta.index].id = tool_call_delta.id
                            except IndexError:
                                warnings.warn(
                                    f"Tool call index out of range ({tool_call_delta.index})\ncurrent tool calls: {accum_message.tool_calls}\ncurrent delta: {tool_call_delta}"
                                )
                                # force index 0
                                # accum_message.tool_calls[0].id = tool_call_delta.id
                            else:
                                accum_message.tool_calls[tool_call_delta.index].id = tool_call_delta.id
                        if tool_call_delta.function is not None:
                            if tool_call_delta.function.name is not None:
                                try:
                                    accum_message.tool_calls[
                                        tool_call_delta.index
                                    ].function.name += tool_call_delta.function.name  # TODO check for parallel tool calls
                                except IndexError:
                                    warnings.warn(
                                        f"Tool call index out of range ({tool_call_delta.index})\ncurrent tool calls: {accum_message.tool_calls}\ncurrent delta: {tool_call_delta}"
                                    )
                            if tool_call_delta.function.arguments is not None:
                                try:
                                    accum_message.tool_calls[tool_call_delta.index].function.arguments += tool_call_delta.function.arguments
                                except IndexError:
                                    warnings.warn(
                                        f"Tool call index out of range ({tool_call_delta.index})\ncurrent tool calls: {accum_message.tool_calls}\ncurrent delta: {tool_call_delta}"
                                    )

                if message_delta.function_call is not None:
                    raise NotImplementedError(f"Old function_call style not support with stream=True")

            # overwrite response fields based on latest chunk
            if not create_message_id:
                chat_completion_response.id = chat_completion_chunk.id
            if not create_message_datetime:
                chat_completion_response.created = chat_completion_chunk.created
            chat_completion_response.model = chat_completion_chunk.model
            chat_completion_response.system_fingerprint = chat_completion_chunk.system_fingerprint

            # increment chunk counter
            n_chunks += 1
            chunk_idx += 1

    except Exception as e:
        if stream_interface:
            stream_interface.stream_end()
        import traceback

        traceback.print_exc()
        logger.error(f"Parsing ChatCompletion stream failed with error:\n{str(e)}")
        raise e
    finally:
        logger.info(f"Finally ending streaming interface.")
        if stream_interface:
            stream_interface.stream_end()

    # make sure we didn't leave temp stuff in
    assert all([c.finish_reason != TEMP_STREAM_FINISH_REASON for c in chat_completion_response.choices])
    assert all(
        [
            all([tc.id != TEMP_STREAM_TOOL_CALL_ID for tc in c.message.tool_calls]) if c.message.tool_calls else True
            for c in chat_completion_response.choices
        ]
    )
    if not create_message_id:
        assert chat_completion_response.id != dummy_message.id

    # compute token usage before returning
    # TODO try actually computing the #tokens instead of assuming the chunks is the same
    chat_completion_response.usage.completion_tokens = n_chunks
    chat_completion_response.usage.total_tokens = prompt_tokens + n_chunks

    assert len(chat_completion_response.choices) > 0, f"No response from provider {chat_completion_response}"

    # printd(chat_completion_response)
    log_event(name="llm_response_received", attributes=chat_completion_response.model_dump())
    return chat_completion_response


def openai_chat_completions_request_stream(
    url: str,
    api_key: str,
    chat_completion_request: ChatCompletionRequest,
    fix_url: bool = False,
) -> Generator[ChatCompletionChunkResponse, None, None]:

    # In some cases we may want to double-check the URL and do basic correction, eg:
    # In Letta config the address for vLLM is w/o a /v1 suffix for simplicity
    # However if we're treating the server as an OpenAI proxy we want the /v1 suffix on our model hit
    if fix_url:
        if not url.endswith("/v1"):
            url = smart_urljoin(url, "v1")

    data = prepare_openai_payload(chat_completion_request)
    data["stream"] = True
    client = OpenAI(api_key=api_key, base_url=url, max_retries=0)
    try:
        stream = client.chat.completions.create(**data)
        for chunk in stream:
            # TODO: Use the native OpenAI objects here?
            yield ChatCompletionChunkResponse(**chunk.model_dump(exclude_none=True))
    except Exception as e:
        print(f"Error request stream from /v1/chat/completions, url={url}, data={data}:\n{e}")
        raise e


def openai_chat_completions_request(
    url: str,
    api_key: str,
    chat_completion_request: ChatCompletionRequest,
) -> ChatCompletionResponse:
    """Send a ChatCompletion request to an OpenAI-compatible server

    If request.stream == True, will yield ChatCompletionChunkResponses
    If request.stream == False, will return a ChatCompletionResponse

    https://platform.openai.com/docs/guides/text-generation?lang=curl
    """
    data = prepare_openai_payload(chat_completion_request)
    client = OpenAI(api_key=api_key, base_url=url, max_retries=0)
    log_event(name="llm_request_sent", attributes=data)
    chat_completion = client.chat.completions.create(**data)
    log_event(name="llm_response_received", attributes=chat_completion.model_dump())
    return ChatCompletionResponse(**chat_completion.model_dump())


def openai_embeddings_request(url: str, api_key: str, data: dict) -> EmbeddingResponse:
    """https://platform.openai.com/docs/api-reference/embeddings/create"""

    url = smart_urljoin(url, "embeddings")
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    response_json = make_post_request(url, headers, data)
    return EmbeddingResponse(**response_json)


def prepare_openai_payload(chat_completion_request: ChatCompletionRequest):
    data = chat_completion_request.model_dump(exclude_none=True)

    # add check otherwise will cause error: "Invalid value for 'parallel_tool_calls': 'parallel_tool_calls' is only allowed when 'tools' are specified."
    if chat_completion_request.tools is not None:
        data["parallel_tool_calls"] = False

    # If functions == None, strip from the payload
    if "functions" in data and data["functions"] is None:
        data.pop("functions")
        data.pop("function_call", None)  # extra safe,  should exist always (default="auto")

    if "tools" in data and data["tools"] is None:
        data.pop("tools")
        data.pop("tool_choice", None)  # extra safe,  should exist always (default="auto")

    # # NOTE: move this out to wherever the ChatCompletionRequest is created
    # if "tools" in data:
    #     for tool in data["tools"]:
    #         try:
    #             tool["function"] = convert_to_structured_output(tool["function"])
    #         except ValueError as e:
    #             warnings.warn(f"Failed to convert tool function to structured output, tool={tool}, error={e}")

    if not supports_parallel_tool_calling(chat_completion_request.model):
        data.pop("parallel_tool_calls", None)

    return data