feat: add sonnet 3.7 support (#1302)

cthomas 2025-03-24 16:36:16 -10:00 committed by GitHub
parent de16a17f65
commit 100431dce8
14 changed files with 1743 additions and 535 deletions


@@ -424,7 +424,7 @@ class Agent(BaseAgent):
self.logger.debug(f"Function call message: {messages[-1]}")
nonnull_content = False
if response_message.content:
if response_message.content or response_message.reasoning_content or response_message.redacted_reasoning_content:
# The content is then internal monologue, not chat
self.interface.internal_monologue(response_message.content, msg_obj=messages[-1])
# Flag to avoid printing a duplicate if inner thoughts get popped from the function call


@@ -9,7 +9,7 @@ from letta.constants import OPENAI_CONTEXT_WINDOW_ERROR_SUBSTRING
from letta.errors import LLMError
from letta.log import get_logger
from letta.schemas.enums import MessageStreamStatus
from letta.schemas.letta_message import AssistantMessage, ReasoningMessage, ToolCallMessage, ToolReturnMessage
from letta.schemas.letta_message import AssistantMessage, HiddenReasoningMessage, ReasoningMessage, ToolCallMessage, ToolReturnMessage
from letta.schemas.letta_response import LettaStreamingResponse
from letta.schemas.usage import LettaUsageStatistics
@@ -57,6 +57,8 @@ def _sse_post(url: str, data: dict, headers: dict) -> Generator[Union[LettaStrea
yield ReasoningMessage(**chunk_data)
elif chunk_data.get("message_type") == "assistant_message":
yield AssistantMessage(**chunk_data)
elif "hidden_reasoning" in chunk_data:
yield HiddenReasoningMessage(**chunk_data)
elif "tool_call" in chunk_data:
yield ToolCallMessage(**chunk_data)
elif "tool_return" in chunk_data:


@@ -13,7 +13,9 @@ from anthropic.types.beta import (
BetaRawMessageDeltaEvent,
BetaRawMessageStartEvent,
BetaRawMessageStopEvent,
BetaRedactedThinkingBlock,
BetaTextBlock,
BetaThinkingBlock,
BetaToolUseBlock,
)
@@ -345,43 +347,32 @@ def convert_anthropic_response_to_chatcompletion(
finish_reason = remap_finish_reason(response.stop_reason)
content = None
reasoning_content = None
reasoning_content_signature = None
redacted_reasoning_content = None
tool_calls = None
if len(response.content) > 1:
# inner mono + function call
assert len(response.content) == 2
text_block = response.content[0]
tool_block = response.content[1]
assert text_block.type == "text"
assert tool_block.type == "tool_use"
content = strip_xml_tags(string=text_block.text, tag=inner_thoughts_xml_tag)
tool_calls = [
ToolCall(
id=tool_block.id,
type="function",
function=FunctionCall(
name=tool_block.name,
arguments=json.dumps(tool_block.input, indent=2),
),
)
]
elif len(response.content) == 1:
block = response.content[0]
if block.type == "tool_use":
# function call only
tool_calls = [
ToolCall(
id=block.id,
type="function",
function=FunctionCall(
name=block.name,
arguments=json.dumps(block.input, indent=2),
),
)
]
else:
# inner mono only
content = strip_xml_tags(string=block.text, tag=inner_thoughts_xml_tag)
for content_part in response.content:
if content_part.type == "text":
content = strip_xml_tags(string=content_part.text, tag=inner_thoughts_xml_tag)
if content_part.type == "tool_use":
tool_calls = [
ToolCall(
id=content_part.id,
type="function",
function=FunctionCall(
name=content_part.name,
arguments=json.dumps(content_part.input, indent=2),
),
)
]
if content_part.type == "thinking":
reasoning_content = content_part.thinking
reasoning_content_signature = content_part.signature
if content_part.type == "redacted_thinking":
redacted_reasoning_content = content_part.data
else:
raise RuntimeError("Unexpected empty content in response")
@@ -392,6 +383,9 @@ def convert_anthropic_response_to_chatcompletion(
message=ChoiceMessage(
role=response.role,
content=content,
reasoning_content=reasoning_content,
reasoning_content_signature=reasoning_content_signature,
redacted_reasoning_content=redacted_reasoning_content,
tool_calls=tool_calls,
),
)
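
The old two-branch parse (inner monologue vs. function call) is replaced by a single pass over the content blocks. A self-contained sketch of that flattening, with a dataclass standing in for the anthropic SDK's typed blocks (tool_use omitted for brevity; this is not the shipped code):

from dataclasses import dataclass
from typing import Optional

@dataclass
class Block:  # stand-in for BetaTextBlock / BetaThinkingBlock / BetaRedactedThinkingBlock
    type: str
    text: Optional[str] = None       # set when type == "text"
    thinking: Optional[str] = None   # set when type == "thinking"
    signature: Optional[str] = None  # set when type == "thinking"
    data: Optional[str] = None       # set when type == "redacted_thinking"

def flatten_blocks(blocks):
    content = reasoning = signature = redacted = None
    for block in blocks:
        if block.type == "text":
            content = block.text
        elif block.type == "thinking":
            reasoning, signature = block.thinking, block.signature
        elif block.type == "redacted_thinking":
            redacted = block.data
    return content, reasoning, signature, redacted

# An extended-thinking response puts a signed thinking block before the text.
print(flatten_blocks([
    Block(type="thinking", thinking="2 + 2 = 4", signature="sig_abc123"),
    Block(type="text", text="The answer is 4."),
]))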
@@ -462,7 +456,31 @@ def convert_anthropic_stream_event_to_chatcompletion(
"""
# Get finish reason
finish_reason = None
if isinstance(event, BetaRawMessageDeltaEvent):
completion_chunk_tokens = 0
# Get content and tool calls
content = None
reasoning_content = None
reasoning_content_signature = None
redacted_reasoning_content = None # NOTE called "data" in the stream
tool_calls = None
if isinstance(event, BetaRawMessageStartEvent):
"""
BetaRawMessageStartEvent(
message=BetaMessage(
content=[],
usage=BetaUsage(
input_tokens=3086,
output_tokens=1,
),
...,
),
type='message_start'
)
"""
completion_chunk_tokens += event.message.usage.output_tokens
elif isinstance(event, BetaRawMessageDeltaEvent):
"""
BetaRawMessageDeltaEvent(
delta=Delta(
@@ -474,11 +492,9 @@ def convert_anthropic_stream_event_to_chatcompletion(
)
"""
finish_reason = remap_finish_reason(event.delta.stop_reason)
completion_chunk_tokens += event.usage.output_tokens
# Get content and tool calls
content = None
tool_calls = None
if isinstance(event, BetaRawContentBlockDeltaEvent):
elif isinstance(event, BetaRawContentBlockDeltaEvent):
"""
BetaRawContentBlockDeltaEvent(
delta=BetaInputJSONDelta(
@@ -501,9 +517,24 @@ def convert_anthropic_stream_event_to_chatcompletion(
)
"""
# ReACT COT
if event.delta.type == "text_delta":
content = strip_xml_tags_streaming(string=event.delta.text, tag=inner_thoughts_xml_tag)
# Extended thought COT
elif event.delta.type == "thinking_delta":
# Redacted doesn't come in the delta chunks, comes all at once
# "redacted_thinking blocks will not have any deltas associated and will be sent as a single event."
# The thinking stream might start with an empty string ("")
if len(event.delta.thinking) > 0:
reasoning_content = event.delta.thinking
# Extended thought COT signature
elif event.delta.type == "signature_delta":
if len(event.delta.signature) > 0:
reasoning_content_signature = event.delta.signature
# Tool calling
elif event.delta.type == "input_json_delta":
tool_calls = [
ToolCallDelta(
@@ -514,6 +545,9 @@ def convert_anthropic_stream_event_to_chatcompletion(
),
)
]
else:
warnings.warn("Unexpected delta type: " + event.delta.type)
elif isinstance(event, BetaRawContentBlockStartEvent):
"""
BetaRawContentBlockStartEvent(
@@ -551,6 +585,15 @@ def convert_anthropic_stream_event_to_chatcompletion(
]
elif isinstance(event.content_block, BetaTextBlock):
content = event.content_block.text
elif isinstance(event.content_block, BetaThinkingBlock):
reasoning_content = event.content_block.thinking
elif isinstance(event.content_block, BetaRedactedThinkingBlock):
redacted_reasoning_content = event.content_block.data
else:
warnings.warn("Unexpected content start type: " + str(type(event.content_block)))
else:
warnings.warn("Unexpected event type: " + event.type)
# Initialize base response
choice = ChunkChoice(
@@ -558,6 +601,9 @@ def convert_anthropic_stream_event_to_chatcompletion(
finish_reason=finish_reason,
delta=MessageDelta(
content=content,
reasoning_content=reasoning_content,
reasoning_content_signature=reasoning_content_signature,
redacted_reasoning_content=redacted_reasoning_content,
tool_calls=tool_calls,
),
)
@@ -566,6 +612,7 @@ def convert_anthropic_stream_event_to_chatcompletion(
choices=[choice],
created=get_utc_time(),
model=model,
output_tokens=completion_chunk_tokens,
)
@@ -577,8 +624,20 @@ def _prepare_anthropic_request(
# if true, put COT inside the tool calls instead of inside the content
put_inner_thoughts_in_kwargs: bool = False,
bedrock: bool = False,
# extended thinking related fields
# https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking
extended_thinking: bool = False,
max_reasoning_tokens: Optional[int] = None,
) -> dict:
"""Prepare the request data for Anthropic API format."""
if extended_thinking:
assert (
max_reasoning_tokens is not None and max_reasoning_tokens < data.max_tokens
), "max tokens must be greater than thinking budget"
assert not put_inner_thoughts_in_kwargs, "extended thinking not compatible with put_inner_thoughts_in_kwargs"
# assert not prefix_fill, "extended thinking not compatible with prefix_fill"
# Silently disable prefix_fill for now
prefix_fill = False
# if needed, put inner thoughts as a kwarg for all tools
if data.tools and put_inner_thoughts_in_kwargs:
@@ -595,6 +654,14 @@ def _prepare_anthropic_request(
# pydantic -> dict
data = data.model_dump(exclude_none=True)
if extended_thinking:
data["thinking"] = {
"type": "enabled",
"budget_tokens": max_reasoning_tokens,
}
# `temperature` may only be set to 1 when thinking is enabled. Please consult our documentation at https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#important-considerations-when-using-extended-thinking
data["temperature"] = 1.0
if "functions" in data:
raise ValueError(f"'functions' unexpected in Anthropic API payload")
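
With extended thinking enabled, the serialized request gains a thinking block and a pinned temperature. An illustrative payload (model id and token counts are example values, not taken from this diff):

data = {
    "model": "claude-3-7-sonnet-20250219",  # example model id
    "max_tokens": 8192,                     # must exceed budget_tokens (asserted above)
    "temperature": 1.0,                     # Anthropic requires temperature=1 with thinking on
    "thinking": {"type": "enabled", "budget_tokens": 1024},  # 1024 is the documented minimum
    "messages": [{"role": "user", "content": "What is 2 + 2?"}],
}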
@@ -665,6 +732,8 @@ def anthropic_chat_completions_request(
data: ChatCompletionRequest,
inner_thoughts_xml_tag: Optional[str] = "thinking",
put_inner_thoughts_in_kwargs: bool = False,
extended_thinking: bool = False,
max_reasoning_tokens: Optional[int] = None,
betas: List[str] = ["tools-2024-04-04"],
) -> ChatCompletionResponse:
"""https://docs.anthropic.com/claude/docs/tool-use"""
@@ -678,6 +747,8 @@ def anthropic_chat_completions_request(
data=data,
inner_thoughts_xml_tag=inner_thoughts_xml_tag,
put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
extended_thinking=extended_thinking,
max_reasoning_tokens=max_reasoning_tokens,
)
log_event(name="llm_request_sent", attributes=data)
response = anthropic_client.beta.messages.create(
@@ -717,6 +788,8 @@ def anthropic_chat_completions_request_stream(
data: ChatCompletionRequest,
inner_thoughts_xml_tag: Optional[str] = "thinking",
put_inner_thoughts_in_kwargs: bool = False,
extended_thinking: bool = False,
max_reasoning_tokens: Optional[int] = None,
betas: List[str] = ["tools-2024-04-04"],
) -> Generator[ChatCompletionChunkResponse, None, None]:
"""Stream chat completions from Anthropic API.
@@ -728,6 +801,8 @@ def anthropic_chat_completions_request_stream(
data=data,
inner_thoughts_xml_tag=inner_thoughts_xml_tag,
put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
extended_thinking=extended_thinking,
max_reasoning_tokens=max_reasoning_tokens,
)
anthropic_override_key = ProviderManager().get_anthropic_override_key()
@@ -777,6 +852,8 @@ def anthropic_chat_completions_process_stream(
stream_interface: Optional[Union[AgentChunkStreamingInterface, AgentRefreshStreamingInterface]] = None,
inner_thoughts_xml_tag: Optional[str] = "thinking",
put_inner_thoughts_in_kwargs: bool = False,
extended_thinking: bool = False,
max_reasoning_tokens: Optional[int] = None,
create_message_id: bool = True,
create_message_datetime: bool = True,
betas: List[str] = ["tools-2024-04-04"],
@@ -839,7 +916,6 @@ def anthropic_chat_completions_process_stream(
created=dummy_message.created_at,
model=chat_completion_request.model,
usage=UsageStatistics(
completion_tokens=0,
prompt_tokens=prompt_tokens,
total_tokens=prompt_tokens,
),
@@ -850,13 +926,15 @@ def anthropic_chat_completions_process_stream(
if stream_interface:
stream_interface.stream_start()
n_chunks = 0
completion_tokens = 0
try:
for chunk_idx, chat_completion_chunk in enumerate(
anthropic_chat_completions_request_stream(
data=chat_completion_request,
inner_thoughts_xml_tag=inner_thoughts_xml_tag,
put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs,
extended_thinking=extended_thinking,
max_reasoning_tokens=max_reasoning_tokens,
betas=betas,
)
):
@@ -868,6 +946,9 @@ def anthropic_chat_completions_process_stream(
chat_completion_chunk,
message_id=chat_completion_response.id if create_message_id else chat_completion_chunk.id,
message_date=chat_completion_response.created if create_message_datetime else chat_completion_chunk.created,
# if extended_thinking is on, then reasoning_content will be flowing as chunks
# TODO handle emitting redacted reasoning content (e.g. as concat?)
expect_reasoning_content=extended_thinking,
)
elif isinstance(stream_interface, AgentRefreshStreamingInterface):
stream_interface.process_refresh(chat_completion_response)
@@ -908,6 +989,30 @@ def anthropic_chat_completions_process_stream(
else:
accum_message.content += content_delta
# NOTE: for extended_thinking mode
if extended_thinking and message_delta.reasoning_content is not None:
reasoning_content_delta = message_delta.reasoning_content
if accum_message.reasoning_content is None:
accum_message.reasoning_content = reasoning_content_delta
else:
accum_message.reasoning_content += reasoning_content_delta
# NOTE: extended_thinking sends a signature
if extended_thinking and message_delta.reasoning_content_signature is not None:
reasoning_content_signature_delta = message_delta.reasoning_content_signature
if accum_message.reasoning_content_signature is None:
accum_message.reasoning_content_signature = reasoning_content_signature_delta
else:
accum_message.reasoning_content_signature += reasoning_content_signature_delta
# NOTE: extended_thinking also has the potential for redacted_reasoning_content
if extended_thinking and message_delta.redacted_reasoning_content is not None:
redacted_reasoning_content_delta = message_delta.redacted_reasoning_content
if accum_message.redacted_reasoning_content is None:
accum_message.redacted_reasoning_content = redacted_reasoning_content_delta
else:
accum_message.redacted_reasoning_content += redacted_reasoning_content_delta
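
The three accumulation blocks above share one pattern; factored out as a sketch for clarity (letta keeps the cases inline):

def accumulate(existing, delta):
    # Append a streamed delta to what has been gathered so far,
    # treating the first delta as the initial value.
    if delta is None:
        return existing
    return delta if existing is None else existing + delta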
# TODO(charles) make sure this works for parallel tool calling?
if message_delta.tool_calls is not None:
tool_calls_delta = message_delta.tool_calls
@@ -966,7 +1071,8 @@ def anthropic_chat_completions_process_stream(
chat_completion_response.system_fingerprint = chat_completion_chunk.system_fingerprint
# increment chunk counter
n_chunks += 1
if chat_completion_chunk.output_tokens is not None:
completion_tokens += chat_completion_chunk.output_tokens
except Exception as e:
if stream_interface:
@@ -990,8 +1096,8 @@ def anthropic_chat_completions_process_stream(
# compute token usage before returning
# TODO try actually computing the #tokens instead of assuming the chunks is the same
chat_completion_response.usage.completion_tokens = n_chunks
chat_completion_response.usage.total_tokens = prompt_tokens + n_chunks
chat_completion_response.usage.completion_tokens = completion_tokens
chat_completion_response.usage.total_tokens = prompt_tokens + completion_tokens
assert len(chat_completion_response.choices) > 0, chat_completion_response
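
The usage fix replaces the old one-token-per-chunk guess with the counts Anthropic reports per stream event. A toy illustration of the accounting (numbers invented):

# message_start reports the first output token; each message_delta reports its own count.
chunk_output_tokens = [1, 0, 0, 57]           # ChatCompletionChunkResponse.output_tokens per chunk
completion_tokens = sum(chunk_output_tokens)  # 58, instead of n_chunks == 4
prompt_tokens = 3086
assert prompt_tokens + completion_tokens == 3144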


@@ -406,6 +406,8 @@ def create(
chat_completion_request=chat_completion_request,
put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
stream_interface=stream_interface,
extended_thinking=llm_config.enable_reasoner,
max_reasoning_tokens=llm_config.max_reasoning_tokens,
)
else:
@@ -413,6 +415,8 @@ def create(
response = anthropic_chat_completions_request(
data=chat_completion_request,
put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
extended_thinking=llm_config.enable_reasoner,
max_reasoning_tokens=llm_config.max_reasoning_tokens,
)
if llm_config.put_inner_thoughts_in_kwargs:

View File

@@ -147,6 +147,14 @@ class CreateAgent(BaseModel, validate_assignment=True): #
)
context_window_limit: Optional[int] = Field(None, description="The context window limit used by the agent.")
embedding_chunk_size: Optional[int] = Field(DEFAULT_EMBEDDING_CHUNK_SIZE, description="The embedding chunk size used by the agent.")
max_tokens: Optional[int] = Field(
None,
description="The maximum number of tokens to generate, including the reasoning step. If not set, the model will use its default value.",
)
max_reasoning_tokens: Optional[int] = Field(
None, description="The maximum number of tokens to generate for the reasoning step. If not set, the model will use its default value."
)
enable_reasoner: Optional[bool] = Field(False, description="Whether to enable the internal extended thinking step for a reasoner model.")
from_template: Optional[str] = Field(None, description="The template id used to configure the agent")
template: bool = Field(False, description="Whether the agent is a template")
project: Optional[str] = Field(
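
A hedged sketch of an agent-creation request body using the new fields (the handle format and the other required CreateAgent fields are assumed, since they are not shown in this diff):

payload = {
    "model": "anthropic/claude-3-7-sonnet-20250219",  # example provider/model handle
    "enable_reasoner": True,
    "max_tokens": 8192,
    "max_reasoning_tokens": 1024,  # must not exceed max_tokens (validated server-side)
    # ...plus whatever other CreateAgent fields your deployment requires
}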


@@ -88,11 +88,13 @@ class ReasoningMessage(LettaMessage):
source (Literal["reasoner_model", "non_reasoner_model"]): Whether the reasoning
content was generated natively by a reasoner model or derived via prompting
reasoning (str): The internal reasoning of the agent
signature (Optional[str]): The model-generated signature of the reasoning step
"""
message_type: Literal["reasoning_message"] = "reasoning_message"
source: Literal["reasoner_model", "non_reasoner_model"] = "non_reasoner_model"
reasoning: str
signature: Optional[str] = None
class HiddenReasoningMessage(LettaMessage):
@@ -106,12 +108,12 @@ class HiddenReasoningMessage(LettaMessage):
name (Optional[str]): The name of the sender of the message
state (Literal["redacted", "omitted"]): Whether the reasoning
content was redacted by the provider or simply omitted by the API
reasoning (str): The internal reasoning of the agent
hidden_reasoning (Optional[str]): The internal reasoning of the agent
"""
message_type: Literal["reasoning_message"] = "reasoning_message"
message_type: Literal["hidden_reasoning_message"] = "hidden_reasoning_message"
state: Literal["redacted", "omitted"]
reasoning: str
hidden_reasoning: Optional[str] = None
class ToolCall(BaseModel):
@@ -229,7 +231,7 @@ class AssistantMessage(LettaMessage):
# NOTE: use Pydantic's discriminated unions feature: https://docs.pydantic.dev/latest/concepts/unions/#discriminated-unions
LettaMessageUnion = Annotated[
Union[SystemMessage, UserMessage, ReasoningMessage, ToolCallMessage, ToolReturnMessage, AssistantMessage],
Union[SystemMessage, UserMessage, ReasoningMessage, HiddenReasoningMessage, ToolCallMessage, ToolReturnMessage, AssistantMessage],
Field(discriminator="message_type"),
]
@@ -240,6 +242,7 @@ def create_letta_message_union_schema():
{"$ref": "#/components/schemas/SystemMessage"},
{"$ref": "#/components/schemas/UserMessage"},
{"$ref": "#/components/schemas/ReasoningMessage"},
{"$ref": "#/components/schemas/HiddenReasoningMessage"},
{"$ref": "#/components/schemas/ToolCallMessage"},
{"$ref": "#/components/schemas/ToolReturnMessage"},
{"$ref": "#/components/schemas/AssistantMessage"},
@@ -250,6 +253,7 @@ def create_letta_message_union_schema():
"system_message": "#/components/schemas/SystemMessage",
"user_message": "#/components/schemas/UserMessage",
"reasoning_message": "#/components/schemas/ReasoningMessage",
"hidden_reasoning_message": "#/components/schemas/HiddenReasoningMessage",
"tool_call_message": "#/components/schemas/ToolCallMessage",
"tool_return_message": "#/components/schemas/ToolReturnMessage",
"assistant_message": "#/components/schemas/AssistantMessage",


@@ -60,6 +60,12 @@ class LLMConfig(BaseModel):
4096,
description="The maximum number of tokens to generate. If not set, the model will use its default value.",
)
enable_reasoner: bool = Field(
False, description="Whether or not the model should use extended thinking if it is a 'reasoning' style model"
)
max_reasoning_tokens: int = Field(
0, description="Configurable thinking budget for extended thinking, only used if enable_reasoner is True. Minimum value is 1024."
)
# FIXME hack to silence pydantic protected namespace warning
model_config = ConfigDict(protected_namespaces=())
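
A minimal sketch of an LLMConfig with the reasoner knobs enabled, assuming model, model_endpoint_type, and context_window are the other required fields (values illustrative):

from letta.schemas.llm_config import LLMConfig

config = LLMConfig(
    model="claude-3-7-sonnet-20250219",
    model_endpoint_type="anthropic",
    context_window=200000,
    max_tokens=8192,
    enable_reasoner=True,
    max_reasoning_tokens=1024,  # the documented minimum thinking budget
)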


@@ -19,6 +19,7 @@ from letta.schemas.enums import MessageRole
from letta.schemas.letta_base import OrmMetadataBase
from letta.schemas.letta_message import (
AssistantMessage,
HiddenReasoningMessage,
LettaMessage,
ReasoningMessage,
SystemMessage,
@@ -27,7 +28,13 @@ from letta.schemas.letta_message import (
ToolReturnMessage,
UserMessage,
)
from letta.schemas.letta_message_content import LettaMessageContentUnion, TextContent, get_letta_message_content_union_str_json_schema
from letta.schemas.letta_message_content import (
LettaMessageContentUnion,
ReasoningContent,
RedactedReasoningContent,
TextContent,
get_letta_message_content_union_str_json_schema,
)
from letta.system import unpack_message
@@ -206,23 +213,58 @@ class Message(BaseMessage):
assistant_message_tool_kwarg: str = DEFAULT_MESSAGE_TOOL_KWARG,
) -> List[LettaMessage]:
"""Convert message object (in DB format) to the style used by the original Letta API"""
if self.content and len(self.content) == 1 and isinstance(self.content[0], TextContent):
text_content = self.content[0].text
else:
text_content = None
messages = []
if self.role == MessageRole.assistant:
if text_content is not None:
# This is type InnerThoughts
messages.append(
ReasoningMessage(
id=self.id,
date=self.created_at,
reasoning=text_content,
# Handle reasoning
if self.content:
# Check for ReACT-style COT inside of TextContent
if len(self.content) == 1 and isinstance(self.content[0], TextContent):
messages.append(
ReasoningMessage(
id=self.id,
date=self.created_at,
reasoning=self.content[0].text,
)
)
)
# Otherwise, we may have a list of multiple types
else:
# TODO we can probably collapse these two cases into a single loop
for content_part in self.content:
if isinstance(content_part, TextContent):
# COT
messages.append(
ReasoningMessage(
id=self.id,
date=self.created_at,
reasoning=content_part.text,
)
)
elif isinstance(content_part, ReasoningContent):
# "native" COT
messages.append(
ReasoningMessage(
id=self.id,
date=self.created_at,
reasoning=content_part.reasoning,
source="reasoner_model", # TODO do we want to tag like this?
signature=content_part.signature,
)
)
elif isinstance(content_part, RedactedReasoningContent):
# "native" redacted/hidden COT
messages.append(
HiddenReasoningMessage(
id=self.id,
date=self.created_at,
state="redacted",
hidden_reasoning=content_part.data,
)
)
else:
warnings.warn(f"Unrecognized content part in assistant message: {content_part}")
if self.tool_calls is not None:
# This is type FunctionCall
for tool_call in self.tool_calls:
@@ -264,7 +306,11 @@
# "message": response_string,
# "time": formatted_time,
# }
assert text_content is not None, self
if self.content and len(self.content) == 1 and isinstance(self.content[0], TextContent):
text_content = self.content[0].text
else:
raise ValueError(f"Invalid tool return (no text object on message): {self.content}")
try:
function_return = json.loads(text_content)
status = function_return["status"]
@@ -292,7 +338,11 @@
)
elif self.role == MessageRole.user:
# This is type UserMessage
assert text_content is not None, self
if self.content and len(self.content) == 1 and isinstance(self.content[0], TextContent):
text_content = self.content[0].text
else:
raise ValueError(f"Invalid user message (no text object on message): {self.content}")
message_str = unpack_message(text_content)
messages.append(
UserMessage(
@@ -303,7 +353,11 @@
)
elif self.role == MessageRole.system:
# This is type SystemMessage
assert text_content is not None, self
if self.content and len(self.content) == 1 and isinstance(self.content[0], TextContent):
text_content = self.content[0].text
else:
raise ValueError(f"Invalid system message (no text object on system): {self.content}")
messages.append(
SystemMessage(
id=self.id,
@@ -335,6 +389,29 @@
assert "role" in openai_message_dict, openai_message_dict
assert "content" in openai_message_dict, openai_message_dict
# TODO(caren) implicit support for only non-parts/list content types
if openai_message_dict["content"] is not None and type(openai_message_dict["content"]) is not str:
raise ValueError(f"Invalid content type: {type(openai_message_dict['content'])}")
content = [TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else []
# TODO(caren) bad assumption here that "reasoning_content" always comes before "redacted_reasoning_content"
if "reasoning_content" in openai_message_dict and openai_message_dict["reasoning_content"]:
content.append(
ReasoningContent(
reasoning=openai_message_dict["reasoning_content"],
is_native=True,
signature=(
openai_message_dict["reasoning_content_signature"] if openai_message_dict["reasoning_content_signature"] else None
),
),
)
if "redacted_reasoning_content" in openai_message_dict and openai_message_dict["redacted_reasoning_content"]:
content.append(
RedactedReasoningContent(
data=openai_message_dict["redacted_reasoning_content"] if "redacted_reasoning_content" in openai_message_dict else None,
),
)
# If we're going from deprecated function form
if openai_message_dict["role"] == "function":
if not allow_functions_style:
@@ -348,7 +425,7 @@ class Message(BaseMessage):
model=model,
# standard fields expected in an OpenAI ChatCompletion message object
role=MessageRole.tool, # NOTE
content=[TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else [],
content=content,
name=openai_message_dict["name"] if "name" in openai_message_dict else None,
tool_calls=openai_message_dict["tool_calls"] if "tool_calls" in openai_message_dict else None,
tool_call_id=openai_message_dict["tool_call_id"] if "tool_call_id" in openai_message_dict else None,
@@ -362,7 +439,7 @@ class Message(BaseMessage):
model=model,
# standard fields expected in an OpenAI ChatCompletion message object
role=MessageRole.tool, # NOTE
content=[TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else [],
content=content,
name=openai_message_dict["name"] if "name" in openai_message_dict else None,
tool_calls=openai_message_dict["tool_calls"] if "tool_calls" in openai_message_dict else None,
tool_call_id=openai_message_dict["tool_call_id"] if "tool_call_id" in openai_message_dict else None,
@@ -395,7 +472,7 @@ class Message(BaseMessage):
model=model,
# standard fields expected in an OpenAI ChatCompletion message object
role=MessageRole(openai_message_dict["role"]),
content=[TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else [],
content=content,
name=openai_message_dict["name"] if "name" in openai_message_dict else None,
tool_calls=tool_calls,
tool_call_id=None, # NOTE: None, since this field is only non-null for role=='tool'
@@ -409,7 +486,7 @@ class Message(BaseMessage):
model=model,
# standard fields expected in an OpenAI ChatCompletion message object
role=MessageRole(openai_message_dict["role"]),
content=[TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else [],
content=content,
name=openai_message_dict["name"] if "name" in openai_message_dict else None,
tool_calls=tool_calls,
tool_call_id=None, # NOTE: None, since this field is only non-null for role=='tool'
@@ -442,7 +519,7 @@ class Message(BaseMessage):
model=model,
# standard fields expected in an OpenAI ChatCompletion message object
role=MessageRole(openai_message_dict["role"]),
content=[TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else [],
content=content,
name=openai_message_dict["name"] if "name" in openai_message_dict else None,
tool_calls=tool_calls,
tool_call_id=openai_message_dict["tool_call_id"] if "tool_call_id" in openai_message_dict else None,
@@ -456,7 +533,7 @@ class Message(BaseMessage):
model=model,
# standard fields expected in an OpenAI ChatCompletion message object
role=MessageRole(openai_message_dict["role"]),
content=[TextContent(text=openai_message_dict["content"])] if openai_message_dict["content"] else [],
content=content,
name=openai_message_dict["name"] if "name" in openai_message_dict else None,
tool_calls=tool_calls,
tool_call_id=openai_message_dict["tool_call_id"] if "tool_call_id" in openai_message_dict else None,
@@ -477,11 +554,25 @@ class Message(BaseMessage):
"""Go from Message class to ChatCompletion message object"""
# TODO change to pydantic casting, eg `return SystemMessageModel(self)`
# If we only have one content part and it's text, treat it as COT
parse_content_parts = False
if self.content and len(self.content) == 1 and isinstance(self.content[0], TextContent):
text_content = self.content[0].text
# Otherwise, check if we have TextContent and multiple other parts
elif self.content and len(self.content) > 1:
text = [content for content in self.content if isinstance(content, TextContent)]
assert len(text) <= 1, f"multiple text content parts found in a single message: {self.content}"
text_content = text[0].text if text else None
parse_content_parts = True
else:
text_content = None
# TODO(caren) we should eventually support multiple content parts here?
# ie, actually make dict['content'] type list
# But for now, it's OK until we support multi-modal,
# since the only "parts" we have are for supporting various COT
if self.role == "system":
assert all([v is not None for v in [self.role]]), vars(self)
openai_message = {
@@ -539,6 +630,15 @@ class Message(BaseMessage):
else:
raise ValueError(self.role)
if parse_content_parts:
for content in self.content:
if isinstance(content, ReasoningContent):
openai_message["reasoning_content"] = content.reasoning
if content.signature:
openai_message["reasoning_content_signature"] = content.signature
if isinstance(content, RedactedReasoningContent):
openai_message["redacted_reasoning_content"] = content.data
return openai_message
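
Illustrative output of to_openai_dict() for an assistant turn that carried native reasoning parts (values invented; the redacted key appears only when the provider actually redacted the thinking):

openai_message = {
    "role": "assistant",
    "content": "The answer is 4.",
    "reasoning_content": "2 + 2 = 4",
    "reasoning_content_signature": "sig_abc123",
}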
def to_anthropic_dict(
@@ -552,6 +652,8 @@
Args:
inner_thoughts_xml_tag (str): The XML tag to wrap around inner thoughts
"""
# Check for COT
if self.content and len(self.content) == 1 and isinstance(self.content[0], TextContent):
text_content = self.content[0].text
else:
@@ -587,7 +689,24 @@
}
content = []
# COT / reasoning / thinking
if text_content is not None and not put_inner_thoughts_in_kwargs:
if len(self.content) > 1:
for content_part in self.content:
if isinstance(content_part, ReasoningContent):
content.append(
{
"type": "thinking",
"thinking": content_part.reasoning,
"signature": content_part.signature,
}
)
if isinstance(content_part, RedactedReasoningContent):
content.append(
{
"type": "redacted_thinking",
"data": content_part.data,
}
)
elif text_content is not None:
content.append(
{
"type": "text",


@@ -40,6 +40,8 @@ class Message(BaseModel):
role: str
function_call: Optional[FunctionCall] = None # Deprecated
reasoning_content: Optional[str] = None # Used in newer reasoning APIs
reasoning_content_signature: Optional[str] = None # NOTE: for Anthropic
redacted_reasoning_content: Optional[str] = None # NOTE: for Anthropic
class Choice(BaseModel):
@@ -117,6 +119,8 @@ class MessageDelta(BaseModel):
content: Optional[str] = None
reasoning_content: Optional[str] = None
reasoning_content_signature: Optional[str] = None # NOTE: for Anthropic
redacted_reasoning_content: Optional[str] = None # NOTE: for Anthropic
tool_calls: Optional[List[ToolCallDelta]] = None
role: Optional[str] = None
function_call: Optional[FunctionCallDelta] = None # Deprecated
@@ -140,3 +144,4 @@ class ChatCompletionChunkResponse(BaseModel):
system_fingerprint: Optional[str] = None
# object: str = Field(default="chat.completion")
object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
output_tokens: int = 0


@@ -13,6 +13,7 @@ from letta.local_llm.constants import INNER_THOUGHTS_KWARG
from letta.schemas.enums import MessageStreamStatus
from letta.schemas.letta_message import (
AssistantMessage,
HiddenReasoningMessage,
LegacyFunctionCallMessage,
LegacyLettaMessage,
LettaMessage,
@@ -22,6 +23,7 @@ from letta.schemas.letta_message import (
ToolCallMessage,
ToolReturnMessage,
)
from letta.schemas.letta_message_content import ReasoningContent, RedactedReasoningContent, TextContent
from letta.schemas.message import Message
from letta.schemas.openai.chat_completion_response import ChatCompletionChunkResponse
from letta.server.rest_api.optimistic_json_parser import OptimisticJSONParser
@@ -478,7 +480,7 @@ class StreamingServerInterface(AgentChunkStreamingInterface):
if (
message_delta.content is None
and (expect_reasoning_content and message_delta.reasoning_content is None)
and (expect_reasoning_content and message_delta.reasoning_content is None and message_delta.redacted_reasoning_content is None)
and message_delta.tool_calls is None
and message_delta.function_call is None
and choice.finish_reason is None
@@ -493,6 +495,15 @@ class StreamingServerInterface(AgentChunkStreamingInterface):
id=message_id,
date=message_date,
reasoning=message_delta.reasoning_content,
signature=message_delta.reasoning_content_signature,
source="reasoner_model" if message_delta.reasoning_content_signature else "non_reasoner_model",
)
elif expect_reasoning_content and message_delta.redacted_reasoning_content is not None:
processed_chunk = HiddenReasoningMessage(
id=message_id,
date=message_date,
hidden_reasoning=message_delta.redacted_reasoning_content,
state="redacted",
)
elif expect_reasoning_content and message_delta.content is not None:
# "ignore" content if we expect reasoning content
@@ -1071,13 +1082,39 @@ class StreamingServerInterface(AgentChunkStreamingInterface):
# "id": str(msg_obj.id) if msg_obj is not None else None,
# }
assert msg_obj is not None, "Internal monologue requires msg_obj references for metadata"
processed_chunk = ReasoningMessage(
id=msg_obj.id,
date=msg_obj.created_at,
reasoning=msg,
)
if msg_obj.content and len(msg_obj.content) == 1 and isinstance(msg_obj.content[0], TextContent):
processed_chunk = ReasoningMessage(
id=msg_obj.id,
date=msg_obj.created_at,
reasoning=msg,
)
self._push_to_buffer(processed_chunk)
self._push_to_buffer(processed_chunk)
else:
for content in msg_obj.content:
if isinstance(content, TextContent):
processed_chunk = ReasoningMessage(
id=msg_obj.id,
date=msg_obj.created_at,
reasoning=content.text,
)
elif isinstance(content, ReasoningContent):
processed_chunk = ReasoningMessage(
id=msg_obj.id,
date=msg_obj.created_at,
source="reasoner_model",
reasoning=content.reasoning,
signature=content.signature,
)
elif isinstance(content, RedactedReasoningContent):
processed_chunk = HiddenReasoningMessage(
id=msg_obj.id,
date=msg_obj.created_at,
state="redacted",
hidden_reasoning=content.data,
)
self._push_to_buffer(processed_chunk)
return


@@ -746,7 +746,13 @@ class SyncServer(Server):
if request.llm_config is None:
if request.model is None:
raise ValueError("Must specify either model or llm_config in request")
request.llm_config = self.get_llm_config_from_handle(handle=request.model, context_window_limit=request.context_window_limit)
request.llm_config = self.get_llm_config_from_handle(
handle=request.model,
context_window_limit=request.context_window_limit,
max_tokens=request.max_tokens,
max_reasoning_tokens=request.max_reasoning_tokens,
enable_reasoner=request.enable_reasoner,
)
if request.embedding_config is None:
if request.embedding is None:
@@ -1056,7 +1062,14 @@ class SyncServer(Server):
# Merge the two dictionaries, keeping the values from providers_from_db where conflicts occur
return {**providers_from_env, **providers_from_db}.values()
def get_llm_config_from_handle(self, handle: str, context_window_limit: Optional[int] = None) -> LLMConfig:
def get_llm_config_from_handle(
self,
handle: str,
context_window_limit: Optional[int] = None,
max_tokens: Optional[int] = None,
max_reasoning_tokens: Optional[int] = None,
enable_reasoner: Optional[bool] = None,
) -> LLMConfig:
try:
provider_name, model_name = handle.split("/", 1)
provider = self.get_provider_from_name(provider_name)
@@ -1078,13 +1091,22 @@ class SyncServer(Server):
else:
llm_config = llm_configs[0]
if context_window_limit:
if context_window_limit is not None:
if context_window_limit > llm_config.context_window:
raise ValueError(f"Context window limit ({context_window_limit}) is greater than maximum of ({llm_config.context_window})")
llm_config.context_window = context_window_limit
else:
llm_config.context_window = min(llm_config.context_window, model_settings.global_max_context_window_limit)
if max_tokens is not None:
llm_config.max_tokens = max_tokens
if max_reasoning_tokens is not None:
if not max_tokens or max_reasoning_tokens > max_tokens:
raise ValueError(f"Max reasoning tokens ({max_reasoning_tokens}) must not exceed max tokens ({max_tokens})")
llm_config.max_reasoning_tokens = max_reasoning_tokens
if enable_reasoner is not None:
llm_config.enable_reasoner = enable_reasoner
return llm_config
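
A hedged usage sketch of the extended handle resolution ("provider/model" format as parsed above; values illustrative):

llm_config = server.get_llm_config_from_handle(
    handle="anthropic/claude-3-7-sonnet-20250219",  # example handle
    max_tokens=8192,
    max_reasoning_tokens=1024,  # must not exceed max_tokens, per the check above
    enable_reasoner=True,
)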
def get_embedding_config_from_handle(

package-lock.json (generated, 1294 lines): diff suppressed because it is too large
poetry.lock (generated, 497 lines): diff suppressed because it is too large


@@ -56,8 +56,8 @@ nltk = "^3.8.1"
jinja2 = "^3.1.5"
locust = {version = "^2.31.5", optional = true}
wikipedia = {version = "^1.4.0", optional = true}
composio-langchain = "^0.7.10"
composio-core = "^0.7.10"
composio-langchain = "^0.7.7"
composio-core = "^0.7.7"
alembic = "^1.13.3"
pyhumps = "^3.8.0"
psycopg2 = {version = "^2.9.10", optional = true}
@@ -73,7 +73,7 @@ grpcio-tools = "^1.68.1"
llama-index = "^0.12.2"
llama-index-embeddings-openai = "^0.3.1"
e2b-code-interpreter = {version = "^1.0.3", optional = true}
anthropic = "^0.43.0"
anthropic = "^0.49.0"
letta_client = "^0.1.65"
openai = "^1.60.0"
opentelemetry-api = "1.30.0"