feat: add sonnet 3.7 support (#1302)

cthomas 2025-03-24 16:36:16 -10:00 committed by GitHub
parent de16a17f65
commit 100431dce8
14 changed files with 1743 additions and 535 deletions


@@ -424,7 +424,7 @@ class Agent(BaseAgent):
self.logger.debug(f"Function call message: {messages[-1]}")
nonnull_content = False
if response_message.content:
if response_message.content or response_message.reasoning_content or response_message.redacted_reasoning_content:
# The content is then internal monologue, not chat
self.interface.internal_monologue(response_message.content, msg_obj=messages[-1])
# Flag to avoid printing a duplicate if inner thoughts get popped from the function call


@@ -9,7 +9,7 @@ from letta.constants import OPENAI_CONTEXT_WINDOW_ERROR_SUBSTRING
from letta.errors import LLMError
from letta.log import get_logger
from letta.schemas.enums import MessageStreamStatus
from letta.schemas.letta_message import AssistantMessage, ReasoningMessage, ToolCallMessage, ToolReturnMessage
from letta.schemas.letta_message import AssistantMessage, HiddenReasoningMessage, ReasoningMessage, ToolCallMessage, ToolReturnMessage
from letta.schemas.letta_response import LettaStreamingResponse
from letta.schemas.usage import LettaUsageStatistics
@@ -57,6 +57,8 @@ def _sse_post(url: str, data: dict, headers: dict) -> Generator[Union[LettaStrea
yield ReasoningMessage(**chunk_data)
elif chunk_data.get("message_type") == "assistant_message":
yield AssistantMessage(**chunk_data)
elif "hidden_reasoning" in chunk_data:
yield HiddenReasoningMessage(**chunk_data)
elif "tool_call" in chunk_data:
yield ToolCallMessage(**chunk_data)
elif "tool_return" in chunk_data:


@@ -13,7 +13,9 @@ from anthropic.types.beta import (
BetaRawMessageDeltaEvent,
BetaRawMessageStartEvent,
BetaRawMessageStopEvent,
BetaRedactedThinkingBlock,
BetaTextBlock,
BetaThinkingBlock,
BetaToolUseBlock,
)
@@ -345,43 +347,32 @@ def convert_anthropic_response_to_chatcompletion(
finish_reason = remap_finish_reason(response.stop_reason)
content = None
reasoning_content = None
reasoning_content_signature = None
redacted_reasoning_content = None
tool_calls = None
if len(response.content) > 1:
# inner mono + function call
assert len(response.content) == 2
text_block = response.content[0]
tool_block = response.content[1]
assert text_block.type == "text"
assert tool_block.type == "tool_use"
content = strip_xml_tags(string=text_block.text, tag=inner_thoughts_xml_tag)
tool_calls = [
ToolCall(
id=tool_block.id,
type="function",
function=FunctionCall(
name=tool_block.name,
arguments=json.dumps(tool_block.input, indent=2),
),
)
]
elif len(response.content) == 1:
block = response.content[0]
if block.type == "tool_use":
# function call only
tool_calls = [
ToolCall(
id=block.id,
type="function",
function=FunctionCall(
name=block.name,
arguments=json.dumps(block.input, indent=2),
),
)
]
else:
# inner mono only
content = strip_xml_tags(string=block.text, tag=inner_thoughts_xml_tag)
for content_part in response.content:
if content_part.type == "text":
content = strip_xml_tags(string=content_part.text, tag=inner_thoughts_xml_tag)
if content_part.type == "tool_use":
tool_calls = [
ToolCall(
id=content_part.id,
type="function",
function=FunctionCall(
name=content_part.name,
arguments=json.dumps(content_part.input, indent=2),
),
)
]
if content_part.type == "thinking":
reasoning_content = content_part.thinking
reasoning_content_signature = content_part.signature
if content_part.type == "redacted_thinking":
redacted_reasoning_content = content_part.data
else:
raise RuntimeError("Unexpected empty content in response")
@@ -392,6 +383,9 @@ def convert_anthropic_response_to_chatcompletion(
message=ChoiceMessage(
role=response.role,
content=content,
reasoning_content=reasoning_content,
reasoning_content_signature=reasoning_content_signature,
redacted_reasoning_content=redacted_reasoning_content,
tool_calls=tool_calls,
),
)
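
The old two-branch parse (inner monologue vs. function call) is replaced by a single pass over the content blocks. A self-contained sketch of that flattening, with a dataclass standing in for the anthropic SDK's typed blocks (tool_use omitted for brevity; this is not the shipped code):

from dataclasses import dataclass
from typing import Optional

@dataclass
class Block:  # stand-in for BetaTextBlock / BetaThinkingBlock / BetaRedactedThinkingBlock
    type: str
    text: Optional[str] = None       # set when type == "text"
    thinking: Optional[str] = None   # set when type == "thinking"
    signature: Optional[str] = None  # set when type == "thinking"
    data: Optional[str] = None       # set when type == "redacted_thinking"

def flatten_blocks(blocks):
    content = reasoning = signature = redacted = None
    for block in blocks:
        if block.type == "text":
            content = block.text
        elif block.type == "thinking":
            reasoning, signature = block.thinking, block.signature
        elif block.type == "redacted_thinking":
            redacted = block.data
    return content, reasoning, signature, redacted

# An extended-thinking response puts a signed thinking block before the text.
print(flatten_blocks([
    Block(type="thinking", thinking="2 + 2 = 4", signature="sig_abc123"),
    Block(type="text", text="The answer is 4."),
]))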
@@ -462,7 +456,31 @@ def convert_anthropic_stream_event_to_chatcompletion(
"""
# Get finish reason
finish_reason = None
if isinstance(event, BetaRawMessageDeltaEvent):
completion_chunk_tokens = 0
# Get content and tool calls
content = None
reasoning_content = None
reasoning_content_signature = None
redacted_reasoning_content = None # NOTE called "data" in the stream
tool_calls = None
if isinstance(event, BetaRawMessageStartEvent):
"""
BetaRawMessageStartEvent(
message=BetaMessage(
content=[],
usage=BetaUsage(
input_tokens=3086,
output_tokens=1,
),
...,
),
type='message_start'
)
"""
completion_chunk_tokens += event.message.usage.output_tokens
elif isinstance(event, BetaRawMessageDeltaEvent):
"""
BetaRawMessageDeltaEvent(
delta=Delta(
@@ -474,11 +492,9 @@ def convert_anthropic_stream_event_to_chatcompletion(
)
"""
finish_reason = remap_finish_reason(event.delta.stop_reason)
completion_chunk_tokens += event.usage.output_tokens
# Get content and tool calls
content = None
tool_calls = None
if isinstance(event, BetaRawContentBlockDeltaEvent):
elif isinstance(event, BetaRawContentBlockDeltaEvent):
"""
BetaRawContentBlockDeltaEvent(
delta=BetaInputJSONDelta(
@@ -501,9 +517,24 @@ def convert_anthropic_stream_event_to_chatcompletion(
)
"""
# ReACT COT
if event.delta.type == "text_delta":
content = strip_xml_tags_streaming(string=event.delta.text, tag=inner_thoughts_xml_tag)
# Extended thought COT
elif event.delta.type == "thinking_delta":
# Redacted doesn't come in the delta chunks, comes all at once
# "redacted_thinking blocks will not have any deltas associated and will be sent as a single event."
# The thinking stream might start with an empty string ("")
if len(event.delta.thinking) > 0:
reasoning_content = event.delta.thinking
# Extended thought COT signature
elif event.delta.type == "signature_delta":
if len(event.delta.signature) > 0:
reasoning_content_signature = event.delta.signature
# Tool calling
elif event.delta.type == "input_json_delta":
tool_calls = [
ToolCallDelta(
@@ -514,6 +545,9 @@ def convert_anthropic_stream_event_to_chatcompletion(
),
)
]
else:
warnings.warn("Unexpected delta type: " + event.delta.type)
elif isinstance(event, BetaRawContentBlockStartEvent):
"""
BetaRawContentBlockStartEvent(
@@ -551,6 +585,15 @@ def convert_anthropic_stream_event_to_chatcompletion(
]
elif isinstance(event.content_block, BetaTextBlock):
content = event.content_block.text
elif isinstance(event.content_block, BetaThinkingBlock):
reasoning_content = event.content_block.thinking
elif isinstance(event.content_block, BetaRedactedThinkingBlock):
redacted_reasoning_content = event.content_block.data
else:
warnings.warn("Unexpected content start type: " + str(type(event.content_block)))
else:
warnings.warn("Unexpected event type: " + event.type)
# Initialize base response
choice = ChunkChoice(
@@ -558,6 +601,9 @@ def convert_anthropic_stream_event_to_chatcompletion(
finish_reason=finish_reason,
delta=MessageDelta(
content=content,
reasoning_content=reasoning_content,
reasoning_content_signature=reasoning_content_signature,
redacted_reasoning_content=redacted_reasoning_content,
tool_calls=tool_calls,
),
)
@@ -566,6 +612,7 @@ def convert_anthropic_stream_event_to_chatcompletion(
choices=[choice],
created=get_utc_time(),
model=model,
output_tokens=completion_chunk_tokens,
)
@@ -577,8 +624,20 @@ def _prepare_anthropic_request(
# if true, put COT inside the tool calls instead of inside the content
put_inner_thoughts_in_kwargs: bool = False,
bedrock: bool = False,
# extended thinking related fields
# https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking
extended_thinking: bool = False,
max_reasoning_tokens: Optional[int] = None,
) -> dict:
"""Prepare the request data for Anthropic API format."""
if extended_thinking:
assert (
max_reasoning_tokens is not None and max_reasoning_tokens < data.max_tokens
), "max tokens must be greater than thinking budget"
assert not put_inner_thoughts_in_kwargs, "extended thinking not compatible with put_inner_thoughts_in_kwargs"
# assert not prefix_fill, "extended thinking not compatible with prefix_fill"
# Silently disable prefix_fill for now
prefix_fill = False
# if needed, put inner thoughts as a kwarg for all tools
if data.tools and put_inner_thoughts_in_kwargs:
@@ -595,6 +654,14 @@ def _prepare_anthropic_request(
# pydantic -> dict
data = data.model_dump(exclude_none=True)
if extended_thinking:
data["thinking"] = {
"type": "enabled",
"budget_tokens": max_reasoning_tokens,
}
# `temperature` may only be set to 1 when thinking is enabled. Please consult our documentation at https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#important-considerations-when-using-extended-thinking
data["temperature"] = 1.0
if "functions" in data:
raise ValueError(f"'functions' unexpected in Anthropic API payload")
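
With extended thinking enabled, the serialized request gains a thinking block and a pinned temperature. An illustrative payload (model id and token counts are example values, not taken from this diff):

data = {
    "model": "claude-3-7-sonnet-20250219",  # example model id
    "max_tokens": 8192,                     # must exceed budget_tokens (asserted above)
    "temperature": 1.0,                     # Anthropic requires temperature=1 with thinking on
    "thinking": {"type": "enabled", "budget_tokens": 1024},  # 1024 is the documented minimum
    "messages": [{"role": "user", "content": "What is 2 + 2?"}],
}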
@@ -665,6 +732,8 @@ def anthropic_chat_completions_request(
data: ChatCompletionRequest,
inner_thoughts_xml_tag: Optional[str] = "thinking",
put_inner_thoughts_in_kwargs: bool = False,
extended_thinking: bool = False,
max_reasoning_tokens: Optional[int] = None,
betas: List[str] = ["tools-2024-04-04"],
) -> ChatCompletionResponse:
"""https://docs.anthropic.com/claude/docs/tool-use"""
@@ -678,6 +747,8 @@ def anthropic_chat_completions_request(
data=data,
inner_thoughts_xml_tag=inner_thoughts_xml_tag,
put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
extended_thinking=extended_thinking,
max_reasoning_tokens=max_reasoning_tokens,
)
log_event(name="llm_request_sent", attributes=data)
response = anthropic_client.beta.messages.create(
@@ -717,6 +788,8 @@ def anthropic_chat_completions_request_stream(
data: ChatCompletionRequest,
inner_thoughts_xml_tag: Optional[str] = "thinking",
put_inner_thoughts_in_kwargs: bool = False,
extended_thinking: bool = False,
max_reasoning_tokens: Optional[int] = None,
betas: List[str] = ["tools-2024-04-04"],
) -> Generator[ChatCompletionChunkResponse, None, None]:
"""Stream chat completions from Anthropic API.
@@ -728,6 +801,8 @@ def anthropic_chat_completions_request_stream(
data=data,
inner_thoughts_xml_tag=inner_thoughts_xml_tag,
put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
extended_thinking=extended_thinking,
max_reasoning_tokens=max_reasoning_tokens,
)
anthropic_override_key = ProviderManager().get_anthropic_override_key()
@@ -777,6 +852,8 @@ def anthropic_chat_completions_process_stream(
stream_interface: Optional[Union[AgentChunkStreamingInterface, AgentRefreshStreamingInterface]] = None,
inner_thoughts_xml_tag: Optional[str] = "thinking",
put_inner_thoughts_in_kwargs: bool = False,
extended_thinking: bool = False,
max_reasoning_tokens: Optional[int] = None,
create_message_id: bool = True,
create_message_datetime: bool = True,
betas: List[str] = ["tools-2024-04-04"],
@@ -839,7 +916,6 @@ def anthropic_chat_completions_process_stream(
created=dummy_message.created_at,
model=chat_completion_request.model,
usage=UsageStatistics(
completion_tokens=0,
prompt_tokens=prompt_tokens,
total_tokens=prompt_tokens,
),
@@ -850,13 +926,15 @@ def anthropic_chat_completions_process_stream(
if stream_interface:
stream_interface.stream_start()
n_chunks = 0
completion_tokens = 0
try:
for chunk_idx, chat_completion_chunk in enumerate(
anthropic_chat_completions_request_stream(
data=chat_completion_request,
inner_thoughts_xml_tag=inner_thoughts_xml_tag,
put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
extended_thinking=extended_thinking,
max_reasoning_tokens=max_reasoning_tokens,
betas=betas,
)
):
@@ -868,6 +946,9 @@ def anthropic_chat_completions_process_stream(
chat_completion_chunk,
message_id=chat_completion_response.id if create_message_id else chat_completion_chunk.id,
message_date=chat_completion_response.created if create_message_datetime else chat_completion_chunk.created,
# if extended_thinking is on, then reasoning_content will be flowing as chunks
# TODO handle emitting redacted reasoning content (e.g. as concat?)
expect_reasoning_content=extended_thinking,
)
elif isinstance(stream_interface, AgentRefreshStreamingInterface):
stream_interface.process_refresh(chat_completion_response)
@@ -908,6 +989,30 @@ def anthropic_chat_completions_process_stream(
else:
accum_message.content += content_delta
# NOTE: for extended_thinking mode
if extended_thinking and message_delta.reasoning_content is not None:
reasoning_content_delta = message_delta.reasoning_content
if accum_message.reasoning_content is None:
accum_message.reasoning_content = reasoning_content_delta
else:
accum_message.reasoning_content += reasoning_content_delta
# NOTE: extended_thinking sends a signature
if extended_thinking and message_delta.reasoning_content_signature is not None:
reasoning_content_signature_delta = message_delta.reasoning_content_signature
if accum_message.reasoning_content_signature is None:
accum_message.reasoning_content_signature = reasoning_content_signature_delta
else:
accum_message.reasoning_content_signature += reasoning_content_signature_delta
# NOTE: extended_thinking also has the potential for redacted_reasoning_content
if extended_thinking and message_delta.redacted_reasoning_content is not None:
redacted_reasoning_content_delta = message_delta.redacted_reasoning_content
if accum_message.redacted_reasoning_content is None:
accum_message.redacted_reasoning_content = redacted_reasoning_content_delta
else:
accum_message.redacted_reasoning_content += redacted_reasoning_content_delta
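
The three accumulation blocks above share one pattern; factored out as a sketch for clarity (letta keeps the cases inline):

def accumulate(existing, delta):
    # Append a streamed delta to what has been gathered so far,
    # treating the first delta as the initial value.
    if delta is None:
        return existing
    return delta if existing is None else existing + delta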
# TODO(charles) make sure this works for parallel tool calling?
if message_delta.tool_calls is not None:
tool_calls_delta = message_delta.tool_calls
@@ -966,7 +1071,8 @@ def anthropic_chat_completions_process_stream(
chat_completion_response.system_fingerprint = chat_completion_chunk.system_fingerprint
# increment chunk counter
n_chunks += 1
if chat_completion_chunk.output_tokens is not None:
completion_tokens += chat_completion_chunk.output_tokens
except Exception as e:
if stream_interface:
@@ -990,8 +1096,8 @@ def anthropic_chat_completions_process_stream(
# compute token usage before returning
# TODO try actually computing the #tokens instead of assuming the chunks is the same
chat_completion_response.usage.completion_tokens = n_chunks
chat_completion_response.usage.total_tokens = prompt_tokens + n_chunks
chat_completion_response.usage.completion_tokens = completion_tokens
chat_completion_response.usage.total_tokens = prompt_tokens + completion_tokens
assert len(chat_completion_response.choices) > 0, chat_completion_response
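
The usage fix replaces the old one-token-per-chunk guess with the counts Anthropic reports per stream event. A toy illustration of the accounting (numbers invented):

# message_start reports the first output token; each message_delta reports its own count.
chunk_output_tokens = [1, 0, 0, 57]           # ChatCompletionChunkResponse.output_tokens per chunk
completion_tokens = sum(chunk_output_tokens)  # 58, instead of n_chunks == 4
prompt_tokens = 3086
assert prompt_tokens + completion_tokens == 3144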


@@ -406,6 +406,8 @@ def create(
chat_completion_request=chat_completion_request,
put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
stream_interface=stream_interface,
extended_thinking=llm_config.enable_reasoner,
max_reasoning_tokens=llm_config.max_reasoning_tokens,
)
else:
@@ -413,6 +415,8 @@ def create(
response = anthropic_chat_completions_request(
data=chat_completion_request,
put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
extended_thinking=llm_config.enable_reasoner,
max_reasoning_tokens=llm_config.max_reasoning_tokens,
)
if llm_config.put_inner_thoughts_in_kwargs:

View File

@@ -147,6 +147,14 @@ class CreateAgent(BaseModel, validate_assignment=True): #
)
context_window_limit: Optional[int] = Field(None, description="The context window limit used by the agent.")
embedding_chunk_size: Optional[int] = Field(DEFAULT_EMBEDDING_CHUNK_SIZE, description="The embedding chunk size used by the agent.")
max_tokens: Optional[int] = Field(
None,
description="The maximum number of tokens to generate, including the reasoning step. If not set, the model will use its default value.",
)
max_reasoning_tokens: Optional[int] = Field(
None, description="The maximum number of tokens to generate for the reasoning step. If not set, the model will use its default value."
)
enable_reasoner: Optional[bool] = Field(False, description="Whether to enable the internal extended thinking step for a reasoner model.")
from_template: Optional[str] = Field(None, description="The template id used to configure the agent")
template: bool = Field(False, description="Whether the agent is a template")
project: Optional[str] = Field(
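
A hedged sketch of an agent-creation request body using the new fields (the handle format and the other required CreateAgent fields are assumed, since they are not shown in this diff):

payload = {
    "model": "anthropic/claude-3-7-sonnet-20250219",  # example provider/model handle
    "enable_reasoner": True,
    "max_tokens": 8192,
    "max_reasoning_tokens": 1024,  # must not exceed max_tokens (validated server-side)
    # ...plus whatever other CreateAgent fields your deployment requires
}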


@@ -88,11 +88,13 @@ class ReasoningMessage(LettaMessage):
source (Literal["reasoner_model", "non_reasoner_model"]): Whether the reasoning
content was generated natively by a reasoner model or derived via prompting
reasoning (str): The internal reasoning of the agent
signature (Optional[str]): The model-generated signature of the reasoning step
"""
message_type: Literal["reasoning_message"] = "reasoning_message"
source: Literal["reasoner_model", "non_reasoner_model"] = "non_reasoner_model"
reasoning: str
signature: Optional[str] = None
class HiddenReasoningMessage(LettaMessage):
@@ -106,12 +108,12 @@ class HiddenReasoningMessage(LettaMessage):
name (Optional[str]): The name of the sender of the message
state (Literal["redacted", "omitted"]): Whether the reasoning
content was redacted by the provider or simply omitted by the API
reasoning (str): The internal reasoning of the agent
hidden_reasoning (Optional[str]): The internal reasoning of the agent
"""
message_type: Literal["reasoning_message"] = "reasoning_message"
message_type: Literal["hidden_reasoning_message"] = "hidden_reasoning_message"
state: Literal["redacted", "omitted"]
reasoning: str
hidden_reasoning: Optional[str] = None
class ToolCall(BaseModel):
@@ -229,7 +231,7 @@ class AssistantMessage(LettaMessage):
# NOTE: use Pydantic's discriminated unions feature: https://docs.pydantic.dev/latest/concepts/unions/#discriminated-unions
LettaMessageUnion = Annotated[
Union[SystemMessage, UserMessage, ReasoningMessage, ToolCallMessage, ToolReturnMessage, AssistantMessage],
Union[SystemMessage, UserMessage, ReasoningMessage, HiddenReasoningMessage, ToolCallMessage, ToolReturnMessage, AssistantMessage],
Field(discriminator="message_type"),
]
@@ -240,6 +242,7 @@ def create_letta_message_union_schema():
{"$ref": "#/components/schemas/SystemMessage"},
{"$ref": "#/components/schemas/UserMessage"},
{"$ref": "#/components/schemas/ReasoningMessage"},
{"$ref": "#/components/schemas/HiddenReasoningMessage"},
{"$ref": "#/components/schemas/ToolCallMessage"},
{"$ref": "#/components/schemas/ToolReturnMessage"},
{"$ref": "#/components/schemas/AssistantMessage"},
@@ -250,6 +253,7 @@ def create_letta_message_union_schema():
"system_message": "#/components/schemas/SystemMessage",
"user_message": "#/components/schemas/UserMessage",
"reasoning_message": "#/components/schemas/ReasoningMessage",
"hidden_reasoning_message": "#/components/schemas/HiddenReasoningMessage",
"tool_call_message": "#/components/schemas/ToolCallMessage",
"tool_return_message": "#/components/schemas/ToolReturnMessage",
"assistant_message": "#/components/schemas/AssistantMessage",


@@ -60,6 +60,12 @@ class LLMConfig(BaseModel):
4096,
description="The maximum number of tokens to generate. If not set, the model will use its default value.",
)
enable_reasoner: bool = Field(
False, description="Whether or not the model should use extended thinking if it is a 'reasoning' style model"
)
max_reasoning_tokens: int = Field(
0, description="Configurable thinking budget for extended thinking, only used if enable_reasoner is True. Minimum value is 1024."
)
# FIXME hack to silence pydantic protected namespace warning
model_config = ConfigDict(protected_namespaces=())
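
A minimal sketch of an LLMConfig with the reasoner knobs enabled, assuming model, model_endpoint_type, and context_window are the other required fields (values illustrative):

from letta.schemas.llm_config import LLMConfig

config = LLMConfig(
    model="claude-3-7-sonnet-20250219",
    model_endpoint_type="anthropic",
    context_window=200000,
    max_tokens=8192,
    enable_reasoner=True,
    max_reasoning_tokens=1024,  # the documented minimum thinking budget
)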


@@ -19,6 +19,7 @@ from letta.schemas.enums import MessageRole
from letta.schemas.letta_base import OrmMetadataBase
from letta.schemas.letta_message import (
AssistantMessage,
HiddenReasoningMessage,
LettaMessage,
ReasoningMessage,
SystemMessage,
@@ -27,7 +28,13 @@ from letta.schemas.letta_message import (
ToolReturnMessage,
UserMessage,
)
from letta.schemas.letta_message_content import LettaMessageContentUnion, TextContent, get_letta_message_content_union_str_json_schema
from letta.schemas.letta_message_content import (
LettaMessageContentUnion,
ReasoningContent,
RedactedReasoningContent,
TextContent,
get_letta_message_content_union_str_json_schema,
)
from letta.system import unpack_message
@@ -206,23 +213,58 @@ class Message(BaseMessage):
assistant_message_tool_kwarg: str = DEFAULT_MESSAGE_TOOL_KWARG,
) -> List[LettaMessage]:
"""Convert message object (in DB format) to the style used by the original Letta API"""
if self.content and len(self.content) == 1 and isinstance(self.content[0], TextContent):
text_content = self.content[0].text
else:
text_content = None
messages = []
if self.role == MessageRole.assistant:
if text_content is not None:
# This is type InnerThoughts
messages.append(
ReasoningMessage(
id=self.id,
date=self.created_at,
reasoning=text_content,
# Handle reasoning
if self.content:
# Check for ReACT-style COT inside of TextContent
if len(self.content) == 1 and isinstance(self.content[0], TextContent):
messages.append(
ReasoningMessage(
id=self.id,
date=self.created_at,
reasoning=self.content[0].text,
)
)
)
# Otherwise, we may have a list of multiple types
else:
# TODO we can probably collapse these two cases into a single loop
for content_part in self.content:
if isinstance(content_part, TextContent):
# COT
messages.append(
ReasoningMessage(
id=self.id,
date=self.created_at,
reasoning=content_part.text,
)
)
elif isinstance(content_part, ReasoningContent):
# "native" COT
messages.append(
ReasoningMessage(
id=self.id,
date=self.created_at,
reasoning=content_part.reasoning,
source="reasoner_model", # TODO do we want to tag like this?
signature=content_part.signature,
)
)
elif isinstance(content_part, RedactedReasoningContent):
# "native" redacted/hidden COT
messages.append(
HiddenReasoningMessage(
id=self.id,
date=self.created_at,
state="redacted",
hidden_reasoning=content_part.data,
)
)
else:
warnings.warn(f"Unrecognized content part in assistant message: {content_part}")
if self.tool_calls is not None:
# This is type FunctionCall
for tool_call in self.tool_calls:
@@ -264,7 +306,11 @@
# "message": response_string,
# "time": formatted_time,
# }
assert text_content is not None, self
if self.content and len(self.content) == 1 and isinstance(self.content[0], TextContent):
text_content = self.content[0].text
else:
raise ValueError(f"Invalid tool return (no text object on message): {self.content}")
try:
function_return = json.loads(text_content)
status = function_return["status"]
@@ -292,7 +338,11 @@
)
elif self.role == MessageRole.user:
# This is type UserMessage
assert text_content is not None, self
if self.content and len(self.content) == 1 and isinstance(self.content[0], TextContent):
text_content = self.content[0].text
else:
raise ValueError(f"Invalid user message (no text object on message): {self.content}")
message_str = unpack_message(text_content)
messages.append(
UserMessage(
@@ -303,7 +353,11 @@
)
elif self.role == MessageRole.system:
# This is type SystemMessage
assert text_content is not None, self
if self.content and len(self.content) == 1 and isinstance(self.content[0], TextContent):
text_content = self.content[0].text
else:
raise ValueError(f"Invalid system message (no text object on system): {self.content}")
messages.append(
SystemMessage(
id=self.id,
@@ -335,6 +389,29 @@
assert "role" in openai_message_dict, openai_message_dict
assert "content" in openai_message_dict, openai_message_dict
# TODO(caren) implicit support for only non-parts/list content types
if openai_message_dict["content"] is not None and type(openai_message_dict["content"]) is not str:
raise ValueError(f"Invalid content type: {type(openai_message_dict['content'])}")
content = [TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else []
# TODO(caren) bad assumption here that "reasoning_content" always comes before "redacted_reasoning_content"
if "reasoning_content" in openai_message_dict and openai_message_dict["reasoning_content"]:
content.append(
ReasoningContent(
reasoning=openai_message_dict["reasoning_content"],
is_native=True,
signature=(
openai_message_dict["reasoning_content_signature"] if openai_message_dict["reasoning_content_signature"] else None
),
),
)
if "redacted_reasoning_content" in openai_message_dict and openai_message_dict["redacted_reasoning_content"]:
content.append(
RedactedReasoningContent(
data=openai_message_dict["redacted_reasoning_content"] if "redacted_reasoning_content" in openai_message_dict else None,
),
)
# If we're going from deprecated function form
if openai_message_dict["role"] == "function":
if not allow_functions_style:
@@ -348,7 +425,7 @@ class Message(BaseMessage):
model=model,
# standard fields expected in an OpenAI ChatCompletion message object
role=MessageRole.tool, # NOTE
content=[TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else [],
content=content,
name=openai_message_dict["name"] if "name" in openai_message_dict else None,
tool_calls=openai_message_dict["tool_calls"] if "tool_calls" in openai_message_dict else None,
tool_call_id=openai_message_dict["tool_call_id"] if "tool_call_id" in openai_message_dict else None,
@@ -362,7 +439,7 @@ class Message(BaseMessage):
model=model,
# standard fields expected in an OpenAI ChatCompletion message object
role=MessageRole.tool, # NOTE
content=[TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else [],
content=content,
name=openai_message_dict["name"] if "name" in openai_message_dict else None,
tool_calls=openai_message_dict["tool_calls"] if "tool_calls" in openai_message_dict else None,
tool_call_id=openai_message_dict["tool_call_id"] if "tool_call_id" in openai_message_dict else None,
@@ -395,7 +472,7 @@ class Message(BaseMessage):
model=model,
# standard fields expected in an OpenAI ChatCompletion message object
role=MessageRole(openai_message_dict["role"]),
content=[TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else [],
content=content,
name=openai_message_dict["name"] if "name" in openai_message_dict else None,
tool_calls=tool_calls,
tool_call_id=None, # NOTE: None, since this field is only non-null for role=='tool'
@@ -409,7 +486,7 @@ class Message(BaseMessage):
model=model,
# standard fields expected in an OpenAI ChatCompletion message object
role=MessageRole(openai_message_dict["role"]),
content=[TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else [],
content=content,
name=openai_message_dict["name"] if "name" in openai_message_dict else None,
tool_calls=tool_calls,
tool_call_id=None, # NOTE: None, since this field is only non-null for role=='tool'
@@ -442,7 +519,7 @@ class Message(BaseMessage):
model=model,
# standard fields expected in an OpenAI ChatCompletion message object
role=MessageRole(openai_message_dict["role"]),
content=[TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else [],
content=content,
name=openai_message_dict["name"] if "name" in openai_message_dict else None,
tool_calls=tool_calls,
tool_call_id=openai_message_dict["tool_call_id"] if "tool_call_id" in openai_message_dict else None,
@@ -456,7 +533,7 @@ class Message(BaseMessage):
model=model,
# standard fields expected in an OpenAI ChatCompletion message object
role=MessageRole(openai_message_dict["role"]),
content=[TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else [],
content=content,
name=openai_message_dict["name"] if "name" in openai_message_dict else None,
tool_calls=tool_calls,
tool_call_id=openai_message_dict["tool_call_id"] if "tool_call_id" in openai_message_dict else None,
@@ -477,11 +554,25 @@ class Message(BaseMessage):
"""Go from Message class to ChatCompletion message object"""
# TODO change to pydantic casting, eg `return SystemMessageModel(self)`
# If we only have one content part and it's text, treat it as COT
parse_content_parts = False
if self.content and len(self.content) == 1 and isinstance(self.content[0], TextContent):
text_content = self.content[0].text
# Otherwise, check if we have TextContent and multiple other parts
elif self.content and len(self.content) > 1:
text = [content for content in self.content if isinstance(content, TextContent)]
assert len(text) <= 1, f"multiple text content parts found in a single message: {self.content}"
text_content = text[0].text if text else None
parse_content_parts = True
else:
text_content = None
# TODO(caren) we should eventually support multiple content parts here?
# ie, actually make dict['content'] type list
# But for now, it's OK until we support multi-modal,
# since the only "parts" we have are for supporting various COT
if self.role == "system":
assert all([v is not None for v in [self.role]]), vars(self)
openai_message = {
@@ -539,6 +630,15 @@ class Message(BaseMessage):
else:
raise ValueError(self.role)
if parse_content_parts:
for content in self.content:
if isinstance(content, ReasoningContent):
openai_message["reasoning_content"] = content.reasoning
if content.signature:
openai_message["reasoning_content_signature"] = content.signature
if isinstance(content, RedactedReasoningContent):
openai_message["redacted_reasoning_content"] = content.data
return openai_message
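
Illustrative output of to_openai_dict() for an assistant turn that carried native reasoning parts (values invented; the redacted key appears only when the provider actually redacted the thinking):

openai_message = {
    "role": "assistant",
    "content": "The answer is 4.",
    "reasoning_content": "2 + 2 = 4",
    "reasoning_content_signature": "sig_abc123",
}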
def to_anthropic_dict(
@@ -552,6 +652,8 @@
Args:
inner_thoughts_xml_tag (str): The XML tag to wrap around inner thoughts
"""
# Check for COT
if self.content and len(self.content) == 1 and isinstance(self.content[0], TextContent):
text_content = self.content[0].text
else:
@@ -587,7 +689,24 @@
}
content = []
# COT / reasoning / thinking
if text_content is not None and not put_inner_thoughts_in_kwargs:
if len(self.content) > 1:
for content_part in self.content:
if isinstance(content_part, ReasoningContent):
content.append(
{
"type": "thinking",
"thinking": content_part.reasoning,
"signature": content_part.signature,
}
)
if isinstance(content_part, RedactedReasoningContent):
content.append(
{
"type": "redacted_thinking",
"data": content_part.data,
}
)
elif text_content is not None:
content.append(
{
"type": "text",


@@ -40,6 +40,8 @@ class Message(BaseModel):
role: str
function_call: Optional[FunctionCall] = None # Deprecated
reasoning_content: Optional[str] = None # Used in newer reasoning APIs
reasoning_content_signature: Optional[str] = None # NOTE: for Anthropic
redacted_reasoning_content: Optional[str] = None # NOTE: for Anthropic
class Choice(BaseModel):
@@ -117,6 +119,8 @@ class MessageDelta(BaseModel):
content: Optional[str] = None
reasoning_content: Optional[str] = None
reasoning_content_signature: Optional[str] = None # NOTE: for Anthropic
redacted_reasoning_content: Optional[str] = None # NOTE: for Anthropic
tool_calls: Optional[List[ToolCallDelta]] = None
role: Optional[str] = None
function_call: Optional[FunctionCallDelta] = None # Deprecated
@@ -140,3 +144,4 @@ class ChatCompletionChunkResponse(BaseModel):
system_fingerprint: Optional[str] = None
# object: str = Field(default="chat.completion")
object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
output_tokens: int = 0


@@ -13,6 +13,7 @@ from letta.local_llm.constants import INNER_THOUGHTS_KWARG
from letta.schemas.enums import MessageStreamStatus
from letta.schemas.letta_message import (
AssistantMessage,
HiddenReasoningMessage,
LegacyFunctionCallMessage,
LegacyLettaMessage,
LettaMessage,
@@ -22,6 +23,7 @@ from letta.schemas.letta_message import (
ToolCallMessage,
ToolReturnMessage,
)
from letta.schemas.letta_message_content import ReasoningContent, RedactedReasoningContent, TextContent
from letta.schemas.message import Message
from letta.schemas.openai.chat_completion_response import ChatCompletionChunkResponse
from letta.server.rest_api.optimistic_json_parser import OptimisticJSONParser
@@ -478,7 +480,7 @@ class StreamingServerInterface(AgentChunkStreamingInterface):
if (
message_delta.content is None
and (expect_reasoning_content and message_delta.reasoning_content is None)
and (expect_reasoning_content and message_delta.reasoning_content is None and message_delta.redacted_reasoning_content is None)
and message_delta.tool_calls is None
and message_delta.function_call is None
and choice.finish_reason is None
@@ -493,6 +495,15 @@ class StreamingServerInterface(AgentChunkStreamingInterface):
id=message_id,
date=message_date,
reasoning=message_delta.reasoning_content,
signature=message_delta.reasoning_content_signature,
source="reasoner_model" if message_delta.reasoning_content_signature else "non_reasoner_model",
)
elif expect_reasoning_content and message_delta.redacted_reasoning_content is not None:
processed_chunk = HiddenReasoningMessage(
id=message_id,
date=message_date,
hidden_reasoning=message_delta.redacted_reasoning_content,
state="redacted",
)
elif expect_reasoning_content and message_delta.content is not None:
# "ignore" content if we expect reasoning content
@@ -1071,13 +1082,39 @@ class StreamingServerInterface(AgentChunkStreamingInterface):
# "id": str(msg_obj.id) if msg_obj is not None else None,
# }
assert msg_obj is not None, "Internal monologue requires msg_obj references for metadata"
processed_chunk = ReasoningMessage(
id=msg_obj.id,
date=msg_obj.created_at,
reasoning=msg,
)
if msg_obj.content and len(msg_obj.content) == 1 and isinstance(msg_obj.content[0], TextContent):
processed_chunk = ReasoningMessage(
id=msg_obj.id,
date=msg_obj.created_at,
reasoning=msg,
)
self._push_to_buffer(processed_chunk)
self._push_to_buffer(processed_chunk)
else:
for content in msg_obj.content:
if isinstance(content, TextContent):
processed_chunk = ReasoningMessage(
id=msg_obj.id,
date=msg_obj.created_at,
reasoning=content.text,
)
elif isinstance(content, ReasoningContent):
processed_chunk = ReasoningMessage(
id=msg_obj.id,
date=msg_obj.created_at,
source="reasoner_model",
reasoning=content.reasoning,
signature=content.signature,
)
elif isinstance(content, RedactedReasoningContent):
processed_chunk = HiddenReasoningMessage(
id=msg_obj.id,
date=msg_obj.created_at,
state="redacted",
hidden_reasoning=content.data,
)
self._push_to_buffer(processed_chunk)
return


@@ -746,7 +746,13 @@ class SyncServer(Server):
if request.llm_config is None:
if request.model is None:
raise ValueError("Must specify either model or llm_config in request")
request.llm_config = self.get_llm_config_from_handle(handle=request.model, context_window_limit=request.context_window_limit)
request.llm_config = self.get_llm_config_from_handle(
handle=request.model,
context_window_limit=request.context_window_limit,
max_tokens=request.max_tokens,
max_reasoning_tokens=request.max_reasoning_tokens,
enable_reasoner=request.enable_reasoner,
)
if request.embedding_config is None:
if request.embedding is None:
@@ -1056,7 +1062,14 @@ class SyncServer(Server):
# Merge the two dictionaries, keeping the values from providers_from_db where conflicts occur
return {**providers_from_env, **providers_from_db}.values()
def get_llm_config_from_handle(self, handle: str, context_window_limit: Optional[int] = None) -> LLMConfig:
def get_llm_config_from_handle(
self,
handle: str,
context_window_limit: Optional[int] = None,
max_tokens: Optional[int] = None,
max_reasoning_tokens: Optional[int] = None,
enable_reasoner: Optional[bool] = None,
) -> LLMConfig:
try:
provider_name, model_name = handle.split("/", 1)
provider = self.get_provider_from_name(provider_name)
@@ -1078,13 +1091,22 @@ class SyncServer(Server):
else:
llm_config = llm_configs[0]
if context_window_limit:
if context_window_limit is not None:
if context_window_limit > llm_config.context_window:
raise ValueError(f"Context window limit ({context_window_limit}) is greater than maximum of ({llm_config.context_window})")
llm_config.context_window = context_window_limit
else:
llm_config.context_window = min(llm_config.context_window, model_settings.global_max_context_window_limit)
if max_tokens is not None:
llm_config.max_tokens = max_tokens
if max_reasoning_tokens is not None:
if not max_tokens or max_reasoning_tokens > max_tokens:
raise ValueError(f"Max reasoning tokens ({max_reasoning_tokens}) must not exceed max tokens ({max_tokens})")
llm_config.max_reasoning_tokens = max_reasoning_tokens
if enable_reasoner is not None:
llm_config.enable_reasoner = enable_reasoner
return llm_config
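
A hedged usage sketch of the extended handle resolution ("provider/model" format as parsed above; values illustrative):

llm_config = server.get_llm_config_from_handle(
    handle="anthropic/claude-3-7-sonnet-20250219",  # example handle
    max_tokens=8192,
    max_reasoning_tokens=1024,  # must not exceed max_tokens, per the check above
    enable_reasoner=True,
)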
def get_embedding_config_from_handle(

package-lock.json (generated, 1294 lines): diff suppressed because it is too large
poetry.lock (generated, 497 lines): diff suppressed because it is too large


@@ -56,8 +56,8 @@ nltk = "^3.8.1"
jinja2 = "^3.1.5"
locust = {version = "^2.31.5", optional = true}
wikipedia = {version = "^1.4.0", optional = true}
composio-langchain = "^0.7.10"
composio-core = "^0.7.10"
composio-langchain = "^0.7.7"
composio-core = "^0.7.7"
alembic = "^1.13.3"
pyhumps = "^3.8.0"
psycopg2 = {version = "^2.9.10", optional = true}
@@ -73,7 +73,7 @@ grpcio-tools = "^1.68.1"
llama-index = "^0.12.2"
llama-index-embeddings-openai = "^0.3.1"
e2b-code-interpreter = {version = "^1.0.3", optional = true}
anthropic = "^0.43.0"
anthropic = "^0.49.0"
letta_client = "^0.1.65"
openai = "^1.60.0"
opentelemetry-api = "1.30.0"