
The agent/solver

The pydantic-ai Agent

Now, let’s dive into the implementation of our Lean agent using pydantic-ai. The code is structured into several modules, each with a specific responsibility.

The Agent (agent.py)

The core of our agent is in agent.py. We use pydantic-ai’s Agent class to create an agent that can use tools. The @agent.tool decorator makes it easy to define tools that the agent can call.

agent.py
"""Pydantic-AI agent for FVAPPS evaluation."""

import logging

from evals.fvapps.pydantic_ai.prompt import LEAN_FVAPPS_SYSTEM_PROMPT
from evals.fvapps.pydantic_ai.tools import verify_lean
from evals.fvapps.pydantic_ai.types import AgentResult, FVAPPSSample
from evals.fvapps.pydantic_ai.utils import categorize_error, extract_code

from pydantic_ai import Agent, RunContext


class AgentDeps:
    """Dependencies for agent tools."""

    def __init__(self, sample: FVAPPSSample):
        """Initialize agent dependencies.

        Args:
            sample: FVAPPS sample to evaluate
        """
        self.sample = sample
        self.attempts = 0


def create_fvapps_agent(model: str) -> Agent[AgentDeps, str]:
    """Create pydantic-ai agent with verify_lean tool.

    Args:
        model: Model identifier (e.g., "anthropic/claude-sonnet-4-5")

    Returns:
        Agent configured for FVAPPS task
    """
    # Strip "anthropic/" prefix if present (inspect-ai format compatibility)
    if model.startswith("anthropic/"):
        model = model.replace("anthropic/", "")

    agent = Agent(
        model,
        deps_type=AgentDeps,
        system_prompt=LEAN_FVAPPS_SYSTEM_PROMPT,
    )

    @agent.tool
    async def verify_lean_tool(ctx: RunContext[AgentDeps], code: str) -> str:
        """Verify Lean code with lake build.

        Args:
            ctx: Pydantic-AI run context with agent dependencies
            code: Complete Lean program with sorry replaced

        Returns:
            Success message or error details
        """
        ctx.deps.attempts += 1

        logger = logging.getLogger(__name__)
        logger.info(f"Attempt {ctx.deps.attempts}: Verifying code ({len(code)} chars)")

        # Run verification
        result = verify_lean(code, ctx.deps.sample.units)

        # Return message (pydantic-ai pattern: return string, not raise error)
        msg = result.get("message", "")
        return msg if isinstance(msg, str) else str(msg)

    return agent  # type: ignore


def run_agent_on_sample(
    sample: FVAPPSSample,
    model: str,
) -> AgentResult:
    """Run pydantic-ai agent on single FVAPPS sample.

    Args:
        sample: FVAPPS sample with Lean code and tests
        model: Model identifier

    Returns:
        AgentResult with success status, attempts, code, and error
    """
    logger = logging.getLogger(__name__)
    logger.info(f"Starting evaluation for sample {sample.apps_id}")

    # Create agent
    agent = create_fvapps_agent(model)

    # Create dependencies
    deps = AgentDeps(sample=sample)

    # Construct user prompt
    user_prompt = f"""Problem: {sample.apps_question}

Specification (replace all sorry placeholders):
```lean
{sample.spec}
```

Unit tests (must pass):
```lean
{sample.units}
```

Write the complete implementation and proofs, then call verify_lean tool to check your work.
"""

    try:
        # Run agent (pydantic-ai handles tool iteration automatically in run_sync)
        result = agent.run_sync(user_prompt, deps=deps)

        # Extract code from result (pydantic-ai uses .data attribute on RunResult)
        # Get the output text from the result
        output_text = str(result.data) if hasattr(result, "data") else str(result)
        final_code = extract_code(output_text)

        # Check if verification succeeded (based on success message in output)
        success = "✓ Verification succeeded" in output_text

        if success:
            logger.info(f"Success after {deps.attempts} attempts")
            error_type = None
        else:
            # Run final verification to get error details
            verify_result = verify_lean(final_code, sample.units)
            stderr = verify_result.get("stderr", "")
            error_type = categorize_error(stderr if isinstance(stderr, str) else "")
            logger.warning(f"Failed after {deps.attempts} attempts: {error_type}")

        return AgentResult(
            sample_id=sample.apps_id,
            success=success,
            attempts=deps.attempts,
            final_code=final_code,
            error_type=error_type,
        )

    except Exception as e:
        logger.error(f"Error running agent: {e}")
        return AgentResult(
            sample_id=sample.apps_id,
            success=False,
            attempts=deps.attempts,
            final_code=None,
            error_type="agent_error",
        )

evals/src/evals/fvapps/pydantic_ai/agent.py

The create_fvapps_agent function builds a pydantic-ai agent and registers a verify_lean_tool that the model can call. The AgentDeps class carries state between tool calls, in this case the number of verification attempts.

The run_agent_on_sample function then uses this agent to solve a single FVAPPS problem. It constructs a user prompt from the problem description, the Lean specification, and the unit tests, then calls agent.run_sync. pydantic-ai automatically handles the loop of calling the model, executing tools, and feeding the results back to the model.
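
For reference, a minimal driver for a single sample might look like the sketch below. The sample values are invented for illustration, and running it assumes model credentials (for example an Anthropic API key) and a working lake toolchain are available.

```python
from evals.fvapps.pydantic_ai.agent import run_agent_on_sample
from evals.fvapps.pydantic_ai.types import FVAPPSSample

# A toy sample; real samples come from load_fvapps_dataset().
sample = FVAPPSSample(
    apps_id="demo-0001",
    apps_question="Compute the factorial of a natural number.",
    spec="def factorial (n : Nat) : Nat := sorry",
    units="/-- info: 120 -/\n#guard_msgs in #eval factorial 5",
    sorries=1,
)

result = run_agent_on_sample(sample, model="anthropic/claude-sonnet-4-5")
print(result.success, result.attempts, result.error_type)
```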


The Tool (tools.py)

The verify_lean function in tools.py is the heart of our evaluation. It takes a string of Lean code, creates a temporary Lean project, and runs lake build to check for errors.

tools.py
"""Tools for FVAPPS evaluation - Lean verification."""

import subprocess
import tempfile
from pathlib import Path


def verify_lean(code: str, units: str, timeout: int = 60) -> dict[str, str | bool]:
    """Verify Lean code using lake build.

    Creates a temporary Lean project, writes code + units, runs lake build.

    Args:
        code: Lean code with sorry replaced
        units: Unit test code (#guard_msgs, #eval)
        timeout: Timeout in seconds (default: 60)

    Returns:
        dict with keys:
        - success: bool
        - message: str (for agent feedback)
        - stdout: str
        - stderr: str
    """
    # Check for verification bypass attempts
    if "axiom" in code.lower() and "sorry" not in code.lower():
        return {
            "success": False,
            "message": "Invalid code: contains axiom declarations which bypass verification.",
            "stdout": "",
            "stderr": "axiom declarations not allowed",
        }

    with tempfile.TemporaryDirectory() as tmpdir:
        project_dir = Path(tmpdir) / "fvapps_verify"
        project_dir.mkdir()

        # Create minimal lakefile.lean
        lakefile_content = """import Lake
open Lake DSL

package fvapps_verify

@[default_target]
lean_lib FVAPPSVerify
"""
        (project_dir / "lakefile.lean").write_text(lakefile_content.strip())

        # Create FVAPPSVerify directory
        lib_dir = project_dir / "FVAPPSVerify"
        lib_dir.mkdir()

        # Write code + unit tests
        full_code = f"{code}\n\n{units}"
        (lib_dir / "Main.lean").write_text(full_code)

        try:
            # Run lake build with timeout
            result = subprocess.run(
                ["lake", "build"],
                cwd=project_dir,
                capture_output=True,
                text=True,
                timeout=timeout,
            )

            # Check for success (returncode 0 and no "error:" in stderr)
            success = result.returncode == 0 and "error:" not in result.stderr.lower()

            if success:
                message = "✓ Verification succeeded! All checks passed."
            else:
                message = f"Verification failed:\n\n{result.stderr}"

            return {
                "success": success,
                "message": message,
                "stdout": result.stdout,
                "stderr": result.stderr,
            }

        except subprocess.TimeoutExpired:
            return {
                "success": False,
                "message": f"Verification timed out after {timeout} seconds. The program may be too complex or non-terminating.",
                "stdout": "",
                "stderr": "timeout",
            }
        except Exception as e:
            return {
                "success": False,
                "message": f"Error during verification: {str(e)}",
                "stdout": "",
                "stderr": str(e),
            }

evals/src/evals/fvapps/pydantic_ai/tools.py

This function is a good example of how to integrate a command-line tool into an agent. It handles creating the necessary files, running the tool, and parsing the output to determine success or failure.
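
The tool can also be exercised on its own, which is handy when debugging the sandbox setup. A small sketch (it assumes lake and a Lean 4 toolchain are on the PATH):

```python
from evals.fvapps.pydantic_ai.tools import verify_lean

code = "def double (n : Nat) : Nat := 2 * n"
units = "/-- info: 42 -/\n#guard_msgs in #eval double 21"

result = verify_lean(code, units, timeout=60)
if result["success"]:
    print(result["message"])  # "✓ Verification succeeded! All checks passed."
else:
    print(result["stderr"])   # raw lake build output for debugging
```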

The Prompt (prompt.py)

The system prompt in prompt.py is crucial for guiding the model. It explains the task, the format of the expected output, and how to use the verify_lean tool.

prompt.py
"""System prompt for FVAPPS Lean verification task."""

LEAN_FVAPPS_SYSTEM_PROMPT = """You are an expert in formal verification using Lean 4.

Your task is to write terminating functional programs that satisfy specifications and pass unit tests.
You must replace all `sorry` placeholders with implementations and proofs.

## What You Need to Provide

1. **Function Implementations**: Complete implementations for functions marked with sorry
2. **Proofs**: Proofs that your implementations satisfy the specifications
3. **Termination**: For recursive functions, provide termination_by clauses

## Using the verify_lean Tool

Once you've written your code, use the `verify_lean` tool to check your work.
Pass your complete Lean code to the tool. If verification fails, analyze the error
messages carefully and adjust your code. Continue refining until verification succeeds.

**CRITICAL**: If the verification succeeds, STOP IMMEDIATELY. Do not add any commentary,
celebration, explanation, or additional text after receiving a success message from the tool.
The conversation should end silently upon success.

## Format

You may discuss your reasoning, but ensure somewhere in your output is a triple backtick code block.

### Example Workflow

**Step 1**: Add implementations/proofs and call tool
```lean
def factorial (n : Nat) : Nat :=
  match n with
  | 0 => 1
  | n + 1 => (n + 1) * factorial n
termination_by n

theorem factorial_positive (n : Nat) : factorial n > 0 := by
  induction n with
  | zero => simp [factorial]
  | succ n ih => simp [factorial]; omega
```

**Step 2**: If tool returns "✓ Verification succeeded! All checks passed."
→ STOP. Do not respond further.

### Lean 4 Tactic Examples

```lean
-- Simple proof with simp
theorem add_zero (n : Nat) : n + 0 = n := by
  simp

-- Proof with induction
theorem add_comm (n m : Nat) : n + m = m + n := by
  induction n with
  | zero => simp
  | succ n ih => simp [Nat.add_succ, ih]

-- Terminating recursive function
def sum_to (n : Nat) : Nat :=
  match n with
  | 0 => 0
  | n + 1 => n + 1 + sum_to n
termination_by n

-- Proof by cases
theorem even_or_odd (n : Nat) : Even n ∨ Odd n := by
  cases n
  · left; constructor
  · right; constructor
```

Remember: All sorry placeholders must be replaced, all specs must be proven, all unit tests must pass.
"""

evals/src/evals/fvapps/pydantic_ai/prompt.py

Data and Types (types.py and dataset.py)

We use Pydantic models in types.py to define the structure of our data. This helps with type checking and makes the code easier to read and maintain.

types.py
"""Type definitions and utilities for FVAPPS pydantic-ai implementation."""

import logging
from datetime import datetime
from pathlib import Path

from pydantic import BaseModel


def get_workspace_root() -> Path:
    """Find the workspace root by looking for the root pyproject.toml.

    Returns:
        Path to the workspace root directory

    Raises:
        RuntimeError: If workspace root cannot be found
    """
    current = Path(__file__).resolve()

    # Walk up the directory tree looking for workspace root
    for parent in current.parents:
        pyproject = parent / "pyproject.toml"
        if pyproject.exists():
            # Check if this is the root (has [tool.uv.workspace])
            content = pyproject.read_text()
            if "[tool.uv.workspace]" in content:
                return parent

    raise RuntimeError(
        "Could not find workspace root (pyproject.toml with [tool.uv.workspace])"
    )


class FVAPPSSample(BaseModel):
    """Single sample from quinn-dougherty/fvapps dataset."""

    apps_id: str
    apps_question: str  # Natural language description
    spec: str  # Lean code with sorry placeholders
    units: str  # Unit tests (#guard_msgs, #eval)
    sorries: int  # Number of sorry placeholders
    apps_difficulty: str | None = None
    assurance_level: str | None = None


class AgentResult(BaseModel):
    """Result for a single sample evaluation."""

    sample_id: str
    success: bool
    attempts: int  # Number of verify_lean calls
    final_code: str | None
    error_type: str | None  # Only if not success


class EvalMetrics(BaseModel):
    """Aggregated metrics across all samples."""

    total_samples: int
    successful: int
    accuracy: float  # successful / total_samples
    average_attempts: float
    error_distribution: dict[str, int]  # error_type -> count


def setup_logging() -> None:
    """Setup logging to logs/fvapps_pydantic_YYYYMMDD_HHMMSS.log.

    Creates logs directory in workspace root if it doesn't exist and configures
    logging to write to both file and console.
    """
    workspace_root = get_workspace_root()
    logs_dir = workspace_root / "logs"
    logs_dir.mkdir(exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = logs_dir / f"fvapps_pydantic_{timestamp}.log"

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(),  # Also print to console
        ],
    )

evals/src/evals/fvapps/pydantic_ai/types.py
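
Because these are ordinary Pydantic models (v2, which pydantic-ai depends on), results are validated on construction and can be serialized for later analysis. A brief sketch:

```python
from evals.fvapps.pydantic_ai.types import AgentResult

result = AgentResult(
    sample_id="demo-0001",
    success=True,
    attempts=2,
    final_code="def double (n : Nat) : Nat := 2 * n",
    error_type=None,
)
print(result.model_dump())  # plain dict, ready to dump as JSON
```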

The load_fvapps_dataset function in dataset.py loads the data from Hugging Face and converts it into a list of FVAPPSSample objects.

dataset.py
"""Dataset loading for FVAPPS."""

from datasets import load_dataset
from evals.fvapps.pydantic_ai.types import FVAPPSSample


def load_fvapps_dataset(split: str = "train") -> list[FVAPPSSample]:
    """Load FVAPPS dataset from quinn-dougherty/fvapps.

    Returns:
        List of FVAPPSSample objects with Lean code and unit tests.
    """
    hf_dataset = load_dataset("quinn-dougherty/fvapps", split=split)

    return [
        FVAPPSSample(
            apps_id=row["apps_id"],  # type: ignore
            apps_question=row["apps_question"],  # type: ignore
            spec=row["spec"],  # type: ignore
            units=row["units"],  # type: ignore
            sorries=row["sorries"],  # type: ignore
            apps_difficulty=row.get("apps_difficulty"),  # type: ignore
            assurance_level=row.get("assurance_level"),  # type: ignore
        )
        for row in hf_dataset  # type: ignore
    ]

evals/src/evals/fvapps/pydantic_ai/dataset.py
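
Loading the dataset is a one-liner; the first call downloads it from Hugging Face, so it may take a moment. For example:

```python
from evals.fvapps.pydantic_ai.dataset import load_fvapps_dataset

samples = load_fvapps_dataset(split="train")
print(len(samples))
print(samples[0].apps_id, samples[0].sorries)
```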

The Main Evaluation Loop (__init__.py)

The run_fvapps_eval function in __init__.py ties everything together. It loads the dataset, iterates through the samples, calls run_agent_on_sample for each one, and then aggregates the results.

__init__.py
"""Pydantic AI implementation of FVAPPS (Lean) evaluation."""

import logging

from evals.fvapps.pydantic_ai.agent import run_agent_on_sample
from evals.fvapps.pydantic_ai.dataset import load_fvapps_dataset
from evals.fvapps.pydantic_ai.types import AgentResult, EvalMetrics, setup_logging


def aggregate_results(results: list[AgentResult]) -> EvalMetrics:
    """Aggregate results into metrics.

    Args:
        results: List of AgentResult objects

    Returns:
        EvalMetrics with accuracy and error distribution
    """
    total = len(results)
    successful = sum(1 for r in results if r.success)
    accuracy = successful / total if total > 0 else 0.0

    total_attempts = sum(r.attempts for r in results)
    average_attempts = total_attempts / total if total > 0 else 0.0

    # Error distribution (only failed samples)
    error_distribution: dict[str, int] = {}
    for r in results:
        if not r.success and r.error_type:
            error_distribution[r.error_type] = (
                error_distribution.get(r.error_type, 0) + 1
            )

    return EvalMetrics(
        total_samples=total,
        successful=successful,
        accuracy=accuracy,
        average_attempts=average_attempts,
        error_distribution=error_distribution,
    )


def run_fvapps_eval(model: str, limit: int | None) -> None:
    """Run FVAPPS evaluation with Pydantic AI.

    This function:
    1. Sets up logging and loads the dataset
    2. Runs the pydantic-ai agent on each sample
    3. Aggregates results and prints summary statistics

    Args:
        model: Model identifier (e.g., "anthropic/claude-sonnet-4-5")
        limit: Max samples to evaluate (None = all samples)
    """
    # Setup logging
    setup_logging()
    logger = logging.getLogger(__name__)

    # Load dataset
    dataset = load_fvapps_dataset()
    if limit:
        dataset = dataset[:limit]

    print(f"Evaluating {len(dataset)} samples with {model}...")
    logger.info(f"Starting evaluation: {len(dataset)} samples, model={model}")

    # Run evaluation
    results = []
    for i, sample in enumerate(dataset, 1):
        print(f"[{i}/{len(dataset)}] Evaluating {sample.apps_id}...")

        result = run_agent_on_sample(sample, model=model)
        results.append(result)

        status = "✓" if result.success else "✗"
        print(f"  {status} {result.attempts} attempts")

    # Aggregate metrics
    metrics = aggregate_results(results)
    logger.info(f"Evaluation complete: {metrics.accuracy:.1%} accuracy")

    # Print summary
    print("\n" + "=" * 60)
    print("EVALUATION SUMMARY")
    print("=" * 60)
    print(f"Total samples:     {metrics.total_samples}")
    print(f"Successful:        {metrics.successful}")
    print(f"Accuracy:          {metrics.accuracy:.1%}")
    print(f"Average attempts:  {metrics.average_attempts:.2f}")

    if metrics.error_distribution:
        print("\nError distribution:")
        for error_type, count in sorted(metrics.error_distribution.items()):
            print(f"  {error_type}: {count}")

evals/src/evals/fvapps/pydantic_ai/__init__.py
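
The module exposes run_fvapps_eval as a plain function, so it is easy to wire into whatever entry point you prefer. A hypothetical command-line wrapper (not part of the code shown above) might look like this:

```python
# cli.py (hypothetical wrapper around run_fvapps_eval)
import argparse

from evals.fvapps.pydantic_ai import run_fvapps_eval

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the FVAPPS pydantic-ai eval")
    parser.add_argument("--model", default="anthropic/claude-sonnet-4-5")
    parser.add_argument("--limit", type=int, default=None, help="Max samples (default: all)")
    args = parser.parse_args()
    run_fvapps_eval(model=args.model, limit=args.limit)
```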

Utilities (utils.py)

Finally, utils.py contains helper functions for extracting code from the model’s output and categorizing Lean errors.

utils.py
"""Utility functions for FVAPPS evaluation."""

import re


def extract_code(output: str) -> str:
    """Extract Lean code from agent output.

    Looks for ```lean or ``` code blocks and returns the content.
    Returns last code block if multiple exist.

    Args:
        output: Agent output text

    Returns:
        Extracted Lean code
    """
    # Find all code blocks (``` or ```lean)
    pattern = r"```(?:lean)?\n(.*?)```"
    matches = re.findall(pattern, output, re.DOTALL)

    if matches:
        return matches[-1].strip()

    # Fallback: return entire output if no code blocks
    return output.strip()


def categorize_error(stderr: str) -> str:
    """Categorize Lean error types from stderr output.

    Args:
        stderr: Error output from Lean compiler

    Returns:
        One of: type_error, name_error, tactic_failure, termination_failure,
        incomplete_proof, syntax_error, unit_test_failure, unknown_error
    """
    stderr_lower = stderr.lower()

    if "type mismatch" in stderr_lower or "expected type" in stderr_lower:
        return "type_error"
    elif "unknown identifier" in stderr_lower or "unknown constant" in stderr_lower:
        return "name_error"
    elif "tactic" in stderr_lower and "failed" in stderr_lower:
        return "tactic_failure"
    elif "termination" in stderr_lower or "structural recursion" in stderr_lower:
        return "termination_failure"
    elif (
        "unsolved goals" in stderr_lower
        or "don't know how to synthesize" in stderr_lower
    ):
        return "incomplete_proof"
    elif "unexpected" in stderr_lower or "expected" in stderr_lower:
        return "syntax_error"
    elif "guard" in stderr_lower or "failed" in stderr_lower:
        return "unit_test_failure"
    else:
        return "unknown_error"

evals/src/evals/fvapps/pydantic_ai/utils.py
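
Both helpers are pure string functions, so they are straightforward to test in isolation. For example:

```python
from evals.fvapps.pydantic_ai.utils import categorize_error, extract_code

output = "Here is my solution:\n```lean\ndef double (n : Nat) : Nat := 2 * n\n```"
print(extract_code(output))  # "def double (n : Nat) : Nat := 2 * n"

print(categorize_error("error: type mismatch at application ..."))  # "type_error"
print(categorize_error("error: unknown identifier 'foo'"))          # "name_error"
```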

This walkthrough shows how you can use pydantic-ai to quickly build a powerful agent for a complex task like formal verification. The library handles the agent loop, leaving you to focus on the core logic of your tools and prompts.