Skip to content

Pydantic Evals

Here's an example showing how to use batchling with Pydantic Evals:

pydantic_evals_example.py
import asyncio

from dotenv import load_dotenv
from pydantic_ai import Agent
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Contains, EqualsExpected

from batchling import batchify

# Load variables from a .env file into the process environment at import time,
# i.e. before any agent is constructed.
# NOTE(review): presumably this is where the model provider's API key comes
# from -- confirm against the project's setup instructions.
load_dotenv()


async def run_agent(text: str) -> str:
    """Ask a pydantic-ai agent to uppercase ``text`` and return its answer.

    A new ``Agent`` is built on every call, configured with the
    ``openai:gpt-5-nano`` model and an "uppercase the input" system prompt.

    Args:
        text: The user prompt forwarded to the agent.

    Returns:
        The ``output`` attribute of the agent's run result.
    """
    uppercaser = Agent(
        model="openai:gpt-5-nano",
        system_prompt="Convert the input text to uppercase.",
    )
    return (await uppercaser.run(user_prompt=text)).output


async def evaluate() -> None:
    """Evaluate ``run_agent`` against a two-case uppercase dataset and print the report.

    Each case pairs an input string with the exact uppercase string expected
    back. Two evaluators are applied across the dataset: an exact match
    against the expected output, and a case-sensitive check that the output
    contains ``"HELLO"``.
    """
    # (case name, input text, expected output)
    case_table = (
        ("uppercase_basic", "hello world", "HELLO WORLD"),
        ("uppercase_with_numbers", "hello 123", "HELLO 123"),
    )
    test_cases = [
        Case(name=case_name, inputs=prompt, expected_output=wanted)
        for case_name, prompt, wanted in case_table
    ]

    checks = [
        # Output must equal the case's expected_output exactly.
        EqualsExpected(),
        # Output must contain "HELLO" (case-sensitive).
        Contains(value="HELLO", case_sensitive=True),
    ]

    dataset = Dataset(cases=test_cases, evaluators=checks)

    # Run every case through run_agent, then print the report with outputs shown.
    report = await dataset.evaluate(run_agent)
    report.print(include_output=True)


async def run_with_batchify() -> None:
    """Run `evaluate` inside a `batchify` context for direct script execution.

    The evaluation is awaited while the `batchify` async context manager is
    active; the context is exited once `evaluate` returns or raises.
    """
    # NOTE(review): presumably `batchify` reroutes the model requests issued
    # during `evaluate` through provider batch APIs -- confirm in batchling docs.
    async with batchify():
        await evaluate()


# Script entry point: only when executed directly (not on import), drive the
# batchified evaluation to completion on a fresh asyncio event loop.
if __name__ == "__main__":
    asyncio.run(run_with_batchify())

Output:

                 Evaluation Summary: run_agent
┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ Case ID                ┃ Outputs     ┃ Assertions ┃ Duration ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
│ uppercase_basic        │ HELLO WORLD │ ✔✔         │   148.7s │
├────────────────────────┼─────────────┼────────────┼──────────┤
│ uppercase_with_numbers │ HELLO 123   │ ✔✔         │   148.6s │
├────────────────────────┼─────────────┼────────────┼──────────┤
│ Averages               │             │ 100.0% ✔   │   148.7s │
└────────────────────────┴─────────────┴────────────┴──────────┘