MemGPT/tests/test_tool_schema_parsing.py

import json
import os

import pytest

from letta.functions.functions import derive_openai_json_schema
from letta.llm_api.helpers import convert_to_structured_output, make_post_request
from letta.schemas.tool import ToolCreate


def _clean_diff(d1, d2):
    """Utility function to clean up the diff between two dictionaries."""

    # Keys in d1 but not in d2
    removed = {k: d1[k] for k in d1.keys() - d2.keys()}

    # Keys in d2 but not in d1
    added = {k: d2[k] for k in d2.keys() - d1.keys()}

    # Keys in both but values changed
    changed = {k: (d1[k], d2[k]) for k in d1.keys() & d2.keys() if d1[k] != d2[k]}

    return {k: v for k, v in {"removed": removed, "added": added, "changed": changed}.items() if v}  # Only include non-empty differences


def _compare_schemas(generated_schema: dict, expected_schema: dict, strip_heartbeat: bool = True):
    """Compare an autogenerated schema to an expected schema."""

    if strip_heartbeat:
        # Pop out the heartbeat parameter
        del generated_schema["parameters"]["properties"]["request_heartbeat"]
        # Remove from the required list
        generated_schema["parameters"]["required"].remove("request_heartbeat")

    # Check that the two schemas are equal
    # If not, pretty print the difference by dumping with indent=4
    if generated_schema != expected_schema:
        print("==== GENERATED SCHEMA ====")
        print(json.dumps(generated_schema, indent=4))
        print("==== EXPECTED SCHEMA ====")
        print(json.dumps(expected_schema, indent=4))
        print("==== DIFF ====")
        print(json.dumps(_clean_diff(generated_schema, expected_schema), indent=4))
        raise AssertionError("Schemas are not equal")
    else:
        print("Schemas are equal")


def _run_schema_test(schema_name: str, desired_function_name: str, expect_structured_output_fail: bool = False):
    """Derive a schema from a fixture source file and check it against the stored expectations.

    Three fixture files are involved, all living in ``test_tool_schema_parsing_files``
    next to this module:

    * ``<schema_name>.py``      -- the tool source code to derive the schema from
    * ``<schema_name>.json``    -- the expected derived schema
    * ``<schema_name>_so.json`` -- the expected structured-output version

    Args:
        schema_name: Base name of the fixture files (no extension).
        desired_function_name: Name passed to ``derive_openai_json_schema``.
        expect_structured_output_fail: When true, the structured-output conversion
            is expected to raise ``ValueError`` and no ``_so.json`` file is read.
    """

    def _fixture(suffix: str) -> str:
        # Anchor on this file's directory so the test can be run from the root directory
        return os.path.join(os.path.dirname(__file__), f"test_tool_schema_parsing_files/{schema_name}{suffix}")

    # Read the tool's python source as plain text and derive its schema
    with open(_fixture(".py"), "r") as source_file:
        source_code = source_file.read()
    schema = derive_openai_json_schema(source_code, name=desired_function_name)

    # The derived schema must match the stored expectation
    with open(_fixture(".json"), "r") as expected_file:
        expected_schema = json.load(expected_file)
    _compare_schemas(schema, expected_schema)

    # NOTE: the comparison above ran with the default strip_heartbeat=True, so from
    # here on `schema` no longer contains the request_heartbeat parameter.
    if expect_structured_output_fail:
        with pytest.raises(ValueError):
            convert_to_structured_output(schema)
        return

    # Conversion is expected to work: compare its result as-is (no heartbeat stripping)
    structured_output = convert_to_structured_output(schema)
    with open(_fixture("_so.json"), "r") as expected_so_file:
        expected_structured_output = json.load(expected_so_file)
    _compare_schemas(structured_output, expected_structured_output, strip_heartbeat=False)


def test_derive_openai_json_schema():
    """Run the schema generator against each example source file and check the result."""

    # Each entry: (banner to print, fixture base name, function name, structured-output conversion should fail)
    cases = [
        (
            "==== TESTING basic example where the arg is a pydantic model ====",
            "pydantic_as_single_arg_example",
            "create_step",
            False,
        ),
        (
            "==== TESTING basic example where the arg is a list of pydantic models ====",
            "list_of_pydantic_example",
            "create_task_plan",
            False,
        ),
        (
            "==== TESTING more complex example where the arg is a nested pydantic model ====",
            "nested_pydantic_as_arg_example",
            "create_task_plan",
            False,
        ),
        (
            "==== TESTING simple function with no args ====",
            "simple_d20",
            "roll_d20",
            False,
        ),
        (
            "==== TESTING complex function with many args ====",
            "all_python_complex",
            "check_order_status",
            True,
        ),
        # TODO we should properly cast Optionals into union nulls
        # Currently, we just disregard all Optional types on the conversion path
        (
            "==== TESTING complex function with many args and no dict ====",
            "all_python_complex_nodict",
            "check_order_status",
            False,
        ),
    ]

    for banner, fixture_name, function_name, conversion_should_fail in cases:
        print(banner)
        _run_schema_test(fixture_name, function_name, expect_structured_output_fail=conversion_should_fail)


def _openai_payload(model: str, schema: dict, structured_output: bool):
    """Send a chat-completions request to OpenAI that offers `schema` as its only tool.

    Raw version of openai_chat_completions_request w/o pydantic models.

    Args:
        model: Name of the OpenAI model to call.
        schema: Function schema to expose as a tool.
        structured_output: When true, the schema is first passed through
            ``convert_to_structured_output``; otherwise it is sent unchanged.

    Raises:
        AssertionError: If the ``OPENAI_API_KEY`` environment variable is not set.
        Exception: Anything raised by the schema conversion or by the request
            itself; request failures are printed with the offending schema and
            then re-raised.
    """
    # The conversion runs before the API key check, so a conversion error
    # surfaces regardless of whether a key is configured
    tool_schema = convert_to_structured_output(schema) if structured_output else schema

    api_key = os.getenv("OPENAI_API_KEY")
    assert api_key is not None, "OPENAI_API_KEY must be set"

    # Simple system prompt to encourage the LLM to jump directly to a tool call
    system_prompt = "You job is to test the tool that you've been provided. Don't ask for any clarification on the args, just come up with some dummy data and try executing the tool."

    endpoint = "https://api.openai.com/v1/chat/completions"
    request_headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    tool_entry = {
        "type": "function",
        "function": tool_schema,
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
        ],
        "tools": [tool_entry],
        "tool_choice": "auto",  # TODO force the tool call on the one we want
        # NOTE: disabled for simplicity
        "parallel_tool_calls": False,
    }

    print("Request:\n", json.dumps(payload, indent=2), "\n\n")

    try:
        make_post_request(endpoint, request_headers, payload)
    except Exception as err:
        # Show which schema the API rejected before propagating the failure
        print(f"Request failed, tool_schema=\n{json.dumps(tool_schema, indent=2)}")
        print(f"Error: {err}")
        raise err


def _load_schema_from_source_filename(filename: str) -> dict:
    """Derive the OpenAI JSON schema for a fixture source file.

    ``filename`` is the fixture's base name without the ``.py`` extension; the file
    is looked up in ``test_tool_schema_parsing_files`` next to this test module.
    """
    tests_dir = os.path.dirname(__file__)
    source_path = os.path.join(tests_dir, f"test_tool_schema_parsing_files/{filename}.py")

    with open(source_path, "r") as source_file:
        source_code = source_file.read()

    schema = derive_openai_json_schema(source_code)
    return schema


# @pytest.mark.parametrize("openai_model", ["gpt-4o-mini"])
# @pytest.mark.parametrize("structured_output", [True])
@pytest.mark.parametrize("openai_model", ["gpt-4", "gpt-4o"])
@pytest.mark.parametrize("structured_output", [True, False])
def test_valid_schemas_via_openai(openai_model: str, structured_output: bool):
    """Send each fixture schema to OpenAI and check that the request goes through.

    The one exception is ``all_python_complex`` with ``structured_output=True``,
    which is expected to raise ``ValueError`` instead.
    """
    fixture_names = (
        "pydantic_as_single_arg_example",
        "list_of_pydantic_example",
        "nested_pydantic_as_arg_example",
        "simple_d20",
        "all_python_complex",
        "all_python_complex_nodict",
    )

    for filename in fixture_names:
        print(f"==== TESTING OPENAI PAYLOAD FOR {openai_model} + {filename} ====")
        schema = _load_schema_from_source_filename(filename)

        # We should expect the all_python_complex one to fail when structured_output=True
        failure_expected = structured_output and filename == "all_python_complex"
        if not failure_expected:
            _openai_payload(openai_model, schema, structured_output)
            continue

        with pytest.raises(ValueError):
            _openai_payload(openai_model, schema, structured_output)


@pytest.mark.parametrize("openai_model", ["gpt-4o-mini"])
@pytest.mark.parametrize("structured_output", [True])
def test_composio_tool_schema_generation(openai_model: str, structured_output: bool):
    """Generate schemas for a few Composio actions and send each one to OpenAI.

    Skipped when the ``COMPOSIO_API_KEY`` environment variable is not set.
    """
    composio_key = os.getenv("COMPOSIO_API_KEY")
    if not composio_key:
        pytest.skip("COMPOSIO_API_KEY not set")

    action_names = (
        "GITHUB_STAR_A_REPOSITORY_FOR_THE_AUTHENTICATED_USER",  # Simple
        "CAL_GET_AVAILABLE_SLOTS_INFO",  # has an array arg, needs to be converted properly
        "SALESFORCE_RETRIEVE_LEAD_DETAILS_BY_ID_WITH_CONDITIONAL_SUPPORT",  # has an array arg, needs to be converted properly
    )

    for action_name in action_names:
        composio_tool = ToolCreate.from_composio(action_name=action_name)

        # A schema must have been generated for the action
        schema = composio_tool.json_schema
        assert schema
        print(f"The schema for {action_name}: {json.dumps(schema, indent=4)}\n\n")

        try:
            _openai_payload(openai_model, schema, structured_output)
            print(f"Successfully called OpenAI using schema {schema} generated from {action_name}\n\n")
        except BaseException:
            # Report which action failed, then let whatever was raised propagate unchanged
            print(f"Failed to call OpenAI using schema {schema} generated from {action_name}\n\n")

            raise


@pytest.mark.parametrize("openai_model", ["gpt-4o-mini"])
@pytest.mark.parametrize("structured_output", [True])
def test_langchain_tool_schema_generation(openai_model: str, structured_output: bool):
    """Generate the schema for a Langchain tool (Wikipedia query) and send it to OpenAI."""
    # Imported here rather than at module level, so langchain_community is only
    # needed when this test actually runs
    from langchain_community.tools import WikipediaQueryRun
    from langchain_community.utilities import WikipediaAPIWrapper

    langchain_tool = WikipediaQueryRun(
        api_wrapper=WikipediaAPIWrapper(top_k_results=1, doc_content_chars_max=500),
    )

    extra_imports = {"langchain_community.utilities": "WikipediaAPIWrapper"}
    wrapped_tool = ToolCreate.from_langchain(
        langchain_tool=langchain_tool,
        additional_imports_module_attr_map=extra_imports,
    )

    # A schema must have been generated for the tool
    schema = wrapped_tool.json_schema
    assert schema
    print(f"The schema for {langchain_tool.name}: {json.dumps(schema, indent=4)}\n\n")

    try:
        _openai_payload(openai_model, schema, structured_output)
        print(f"Successfully called OpenAI using schema {schema} generated from {langchain_tool.name}\n\n")
    except BaseException:
        # Report which tool failed, then let whatever was raised propagate unchanged
        print(f"Failed to call OpenAI using schema {schema} generated from {langchain_tool.name}\n\n")

        raise