The pydantic-ai Agent¶
Now, let’s dive into the implementation of our Lean agent using pydantic-ai. The code is structured into several modules, each with a specific responsibility.
The Agent (agent.py)¶
The core of our agent lives in agent.py. We use pydantic-ai’s Agent class to create a tool-using agent, and the @agent.tool decorator registers the functions the agent is allowed to call.
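Before the full listing, here is the basic shape of that pattern in isolation. This is a minimal sketch with illustrative names; the exact model-string format and the attribute holding the final text depend on your pydantic-ai version.

```python
# Minimal sketch of the pydantic-ai pattern used below; names are illustrative.
from dataclasses import dataclass

from pydantic_ai import Agent, RunContext


@dataclass
class Deps:
    calls: int = 0  # mutable state shared by tool calls within one run


agent = Agent(
    "anthropic:claude-sonnet-4-5",  # one accepted provider:model string form
    deps_type=Deps,
    system_prompt="Answer by calling the shout tool.",
)


@agent.tool
async def shout(ctx: RunContext[Deps], text: str) -> str:
    """Tools receive the run context (ctx.deps) plus the model-supplied arguments."""
    ctx.deps.calls += 1
    return text.upper()


deps = Deps()
result = agent.run_sync("Please shout 'hello'.", deps=deps)
# Recent releases expose the final text as .output; older ones used .data.
print(result.output if hasattr(result, "output") else result.data, deps.calls)
```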
"""Pydantic-AI agent for FVAPPS evaluation."""
import logging
from evals.fvapps.pydantic_ai.prompt import LEAN_FVAPPS_SYSTEM_PROMPT
from evals.fvapps.pydantic_ai.tools import verify_lean
from evals.fvapps.pydantic_ai.types import AgentResult, FVAPPSSample
from evals.fvapps.pydantic_ai.utils import categorize_error, extract_code
from pydantic_ai import Agent, RunContext
class AgentDeps:
"""Dependencies for agent tools."""
def __init__(self, sample: FVAPPSSample):
"""Initialize agent dependencies.
Args:
sample: FVAPPS sample to evaluate
"""
self.sample = sample
self.attempts = 0
def create_fvapps_agent(model: str) -> Agent[AgentDeps, str]:
"""Create pydantic-ai agent with verify_lean tool.
Args:
model: Model identifier (e.g., "anthropic/claude-sonnet-4-5")
Returns:
Agent configured for FVAPPS task
"""
# Strip "anthropic/" prefix if present (inspect-ai format compatibility)
if model.startswith("anthropic/"):
model = model.replace("anthropic/", "")
agent = Agent(
model,
deps_type=AgentDeps,
system_prompt=LEAN_FVAPPS_SYSTEM_PROMPT,
)
@agent.tool
async def verify_lean_tool(ctx: RunContext[AgentDeps], code: str) -> str:
"""Verify Lean code with lake build.
Args:
ctx: Pydantic-AI run context with agent dependencies
code: Complete Lean program with sorry replaced
Returns:
Success message or error details
"""
ctx.deps.attempts += 1
logger = logging.getLogger(__name__)
logger.info(f"Attempt {ctx.deps.attempts}: Verifying code ({len(code)} chars)")
# Run verification
result = verify_lean(code, ctx.deps.sample.units)
# Return message (pydantic-ai pattern: return string, not raise error)
msg = result.get("message", "")
return msg if isinstance(msg, str) else str(msg)
return agent # type: ignore
def run_agent_on_sample(
sample: FVAPPSSample,
model: str,
) -> AgentResult:
"""Run pydantic-ai agent on single FVAPPS sample.
Args:
sample: FVAPPS sample with Lean code and tests
model: Model identifier
Returns:
AgentResult with success status, attempts, code, and error
"""
logger = logging.getLogger(__name__)
logger.info(f"Starting evaluation for sample {sample.apps_id}")
# Create agent
agent = create_fvapps_agent(model)
# Create dependencies
deps = AgentDeps(sample=sample)
# Construct user prompt
user_prompt = f"""Problem: {sample.apps_question}
Specification (replace all sorry placeholders):
```lean
{sample.spec}
```
Unit tests (must pass):
```lean
{sample.units}
```
Write the complete implementation and proofs, then call verify_lean tool to check your work.
"""
try:
# Run agent (pydantic-ai handles tool iteration automatically in run_sync)
result = agent.run_sync(user_prompt, deps=deps)
# Extract code from result (pydantic-ai uses .data attribute on RunResult)
# Get the output text from the result
output_text = str(result.data) if hasattr(result, "data") else str(result)
final_code = extract_code(output_text)
# Check if verification succeeded (based on success message in output)
success = "✓ Verification succeeded" in output_text
if success:
logger.info(f"Success after {deps.attempts} attempts")
error_type = None
else:
# Run final verification to get error details
verify_result = verify_lean(final_code, sample.units)
stderr = verify_result.get("stderr", "")
error_type = categorize_error(stderr if isinstance(stderr, str) else "")
logger.warning(f"Failed after {deps.attempts} attempts: {error_type}")
return AgentResult(
sample_id=sample.apps_id,
success=success,
attempts=deps.attempts,
final_code=final_code,
error_type=error_type,
)
except Exception as e:
logger.error(f"Error running agent: {e}")
return AgentResult(
sample_id=sample.apps_id,
success=False,
attempts=deps.attempts,
final_code=None,
error_type="agent_error",
)
evals/src/evals/fvapps/pydantic_ai/agent.py
The create_fvapps_agent function builds a pydantic-ai agent and registers a verify_lean_tool that the agent can call. The AgentDeps class carries state across tool calls, in this case the number of verification attempts.
The run_agent_on_sample function then uses this agent to solve a single FVAPPS problem. It constructs a prompt containing the problem description, the Lean specification, and the unit tests, then calls agent.run_sync. pydantic-ai automatically handles the loop of calling the model, executing tools, and feeding tool results back to the model.
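To see what that looks like end to end, here is a hedged smoke test that runs the agent on a hand-written sample. The Lean snippet and model string are illustrative only; real samples come from the dataset loader shown later.

```python
from evals.fvapps.pydantic_ai.agent import run_agent_on_sample
from evals.fvapps.pydantic_ai.types import FVAPPSSample

# A tiny hand-written sample; the fields mirror the FVAPPSSample model below.
sample = FVAPPSSample(
    apps_id="demo-0",
    apps_question="Return twice the input natural number.",
    spec="def double (n : Nat) : Nat := sorry",
    units="#guard double 21 == 42",
    sorries=1,
)

result = run_agent_on_sample(sample, model="anthropic/claude-sonnet-4-5")
print(result.success, result.attempts, result.error_type)
```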
"""Pydantic-AI agent for FVAPPS evaluation."""
import logging
from evals.fvapps.pydantic_ai.prompt import LEAN_FVAPPS_SYSTEM_PROMPT
from evals.fvapps.pydantic_ai.tools import verify_lean
from evals.fvapps.pydantic_ai.types import AgentResult, FVAPPSSample
from evals.fvapps.pydantic_ai.utils import categorize_error, extract_code
from pydantic_ai import Agent, RunContext
class AgentDeps:
"""Dependencies for agent tools."""
def __init__(self, sample: FVAPPSSample):
"""Initialize agent dependencies.
Args:
sample: FVAPPS sample to evaluate
"""
self.sample = sample
self.attempts = 0
def create_fvapps_agent(model: str) -> Agent[AgentDeps, str]:
"""Create pydantic-ai agent with verify_lean tool.
Args:
model: Model identifier (e.g., "anthropic/claude-sonnet-4-5")
Returns:
Agent configured for FVAPPS task
"""
# Strip "anthropic/" prefix if present (inspect-ai format compatibility)
if model.startswith("anthropic/"):
model = model.replace("anthropic/", "")
agent = Agent(
model,
deps_type=AgentDeps,
system_prompt=LEAN_FVAPPS_SYSTEM_PROMPT,
)
@agent.tool
async def verify_lean_tool(ctx: RunContext[AgentDeps], code: str) -> str:
"""Verify Lean code with lake build.
Args:
ctx: Pydantic-AI run context with agent dependencies
code: Complete Lean program with sorry replaced
Returns:
Success message or error details
"""
ctx.deps.attempts += 1
logger = logging.getLogger(__name__)
logger.info(f"Attempt {ctx.deps.attempts}: Verifying code ({len(code)} chars)")
# Run verification
result = verify_lean(code, ctx.deps.sample.units)
# Return message (pydantic-ai pattern: return string, not raise error)
msg = result.get("message", "")
return msg if isinstance(msg, str) else str(msg)
return agent # type: ignore
def run_agent_on_sample(
sample: FVAPPSSample,
model: str,
) -> AgentResult:
"""Run pydantic-ai agent on single FVAPPS sample.
Args:
sample: FVAPPS sample with Lean code and tests
model: Model identifier
Returns:
AgentResult with success status, attempts, code, and error
"""
logger = logging.getLogger(__name__)
logger.info(f"Starting evaluation for sample {sample.apps_id}")
# Create agent
agent = create_fvapps_agent(model)
# Create dependencies
deps = AgentDeps(sample=sample)
# Construct user prompt
user_prompt = f"""Problem: {sample.apps_question}
Specification (replace all sorry placeholders):
```lean
{sample.spec}
```
Unit tests (must pass):
```lean
{sample.units}
```
Write the complete implementation and proofs, then call verify_lean tool to check your work.
"""
try:
# Run agent (pydantic-ai handles tool iteration automatically in run_sync)
result = agent.run_sync(user_prompt, deps=deps)
# Extract code from result (pydantic-ai uses .data attribute on RunResult)
# Get the output text from the result
output_text = str(result.data) if hasattr(result, "data") else str(result)
final_code = extract_code(output_text)
# Check if verification succeeded (based on success message in output)
success = "✓ Verification succeeded" in output_text
if success:
logger.info(f"Success after {deps.attempts} attempts")
error_type = None
else:
# Run final verification to get error details
verify_result = verify_lean(final_code, sample.units)
stderr = verify_result.get("stderr", "")
error_type = categorize_error(stderr if isinstance(stderr, str) else "")
logger.warning(f"Failed after {deps.attempts} attempts: {error_type}")
return AgentResult(
sample_id=sample.apps_id,
success=success,
attempts=deps.attempts,
final_code=final_code,
error_type=error_type,
)
except Exception as e:
logger.error(f"Error running agent: {e}")
return AgentResult(
sample_id=sample.apps_id,
success=False,
attempts=deps.attempts,
final_code=None,
error_type="agent_error",
)
evals/src/evals/fvapps/pydantic_ai/agent.py
The Tool (tools.py)¶
The verify_lean function in tools.py is the heart of our evaluation. It takes a string of Lean code, creates a temporary Lean project, and runs lake build to check for errors.
"""Tools for FVAPPS evaluation - Lean verification."""
import subprocess
import tempfile
from pathlib import Path
def verify_lean(code: str, units: str, timeout: int = 60) -> dict[str, str | bool]:
"""Verify Lean code using lake build.
Creates a temporary Lean project, writes code + units, runs lake build.
Args:
code: Lean code with sorry replaced
units: Unit test code (#guard_msgs, #eval)
timeout: Timeout in seconds (default: 60)
Returns:
dict with keys:
- success: bool
- message: str (for agent feedback)
- stdout: str
- stderr: str
"""
# Check for verification bypass attempts
if "axiom" in code.lower() and "sorry" not in code.lower():
return {
"success": False,
"message": "Invalid code: contains axiom declarations which bypass verification.",
"stdout": "",
"stderr": "axiom declarations not allowed",
}
with tempfile.TemporaryDirectory() as tmpdir:
project_dir = Path(tmpdir) / "fvapps_verify"
project_dir.mkdir()
# Create minimal lakefile.lean
lakefile_content = """import Lake
open Lake DSL
package fvapps_verify
@[default_target]
lean_lib FVAPPSVerify
"""
(project_dir / "lakefile.lean").write_text(lakefile_content.strip())
# Create FVAPPSVerify directory
lib_dir = project_dir / "FVAPPSVerify"
lib_dir.mkdir()
# Write code + unit tests
full_code = f"{code}\n\n{units}"
(lib_dir / "Main.lean").write_text(full_code)
try:
# Run lake build with timeout
result = subprocess.run(
["lake", "build"],
cwd=project_dir,
capture_output=True,
text=True,
timeout=timeout,
)
# Check for success (returncode 0 and no "error:" in stderr)
success = result.returncode == 0 and "error:" not in result.stderr.lower()
if success:
message = "✓ Verification succeeded! All checks passed."
else:
message = f"Verification failed:\n\n{result.stderr}"
return {
"success": success,
"message": message,
"stdout": result.stdout,
"stderr": result.stderr,
}
except subprocess.TimeoutExpired:
return {
"success": False,
"message": f"Verification timed out after {timeout} seconds. The program may be too complex or non-terminating.",
"stdout": "",
"stderr": "timeout",
}
except Exception as e:
return {
"success": False,
"message": f"Error during verification: {str(e)}",
"stdout": "",
"stderr": str(e),
}
evals/src/evals/fvapps/pydantic_ai/tools.py
This function is a good example of how to integrate a command-line tool into an agent. It handles creating the necessary files, running the tool, and parsing the output to determine success or failure.
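You can also exercise verify_lean directly, outside the agent loop. A hedged example follows; it assumes lake and a Lean 4 toolchain are on PATH, and the Lean snippet is made up for illustration.

```python
from evals.fvapps.pydantic_ai.tools import verify_lean

code = """
def double (n : Nat) : Nat := 2 * n

theorem double_eq (n : Nat) : double n = n + n := by
  simp [double, Nat.two_mul]
"""

units = "#guard double 21 == 42"

result = verify_lean(code, units, timeout=120)
print(result["success"])  # True when `lake build` exits cleanly with no errors
print(result["message"])  # the feedback string the agent would see
```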
The Prompt (prompt.py)¶
The system prompt in prompt.py is crucial for guiding the model. It explains the task, the format of the expected output, and how to use the verify_lean tool.
"""System prompt for FVAPPS Lean verification task."""
LEAN_FVAPPS_SYSTEM_PROMPT = """You are an expert in formal verification using Lean 4.
Your task is to write terminating functional programs that satisfy specifications and pass unit tests.
You must replace all `sorry` placeholders with implementations and proofs.
## What You Need to Provide
1. **Function Implementations**: Complete implementations for functions marked with sorry
2. **Proofs**: Proofs that your implementations satisfy the specifications
3. **Termination**: For recursive functions, provide termination_by clauses
## Using the verify_lean Tool
Once you've written your code, use the `verify_lean` tool to check your work.
Pass your complete Lean code to the tool. If verification fails, analyze the error
messages carefully and adjust your code. Continue refining until verification succeeds.
**CRITICAL**: If the verification succeeds, STOP IMMEDIATELY. Do not add any commentary,
celebration, explanation, or additional text after receiving a success message from the tool.
The conversation should end silently upon success.
## Format
You may discuss your reasoning, but ensure somewhere in your output is a triple backtick code block.
### Example Workflow
**Step 1**: Add implementations/proofs and call tool
```lean
def factorial (n : Nat) : Nat :=
match n with
| 0 => 1
| n + 1 => (n + 1) * factorial n
termination_by n
theorem factorial_positive (n : Nat) : factorial n > 0 := by
induction n with
| zero => simp [factorial]
| succ n ih => simp [factorial]; omega
```
**Step 2**: If tool returns "✓ Verification succeeded! All checks passed."
→ STOP. Do not respond further.
### Lean 4 Tactic Examples
```lean
-- Simple proof with simp
theorem add_zero (n : Nat) : n + 0 = n := by
simp
-- Proof with induction
theorem add_comm (n m : Nat) : n + m = m + n := by
induction n with
| zero => simp
| succ n ih => simp [Nat.add_succ, ih]
-- Terminating recursive function
def sum_to (n : Nat) : Nat :=
match n with
| 0 => 0
| n + 1 => n + 1 + sum_to n
termination_by n
-- Proof by cases
theorem even_or_odd (n : Nat) : Even n ∨ Odd n := by
cases n
· left; constructor
· right; constructor
```
Remember: All sorry placeholders must be replaced, all specs must be proven, all unit tests must pass.
"""
evals/src/evals/fvapps/pydantic_ai/prompt.py
Data and Types (types.py and dataset.py)¶
We use Pydantic models in types.py to define the structure of our data. This gives us runtime validation on top of static type checking and makes the code easier to read and maintain.
"""Type definitions and utilities for FVAPPS pydantic-ai implementation."""
import logging
from datetime import datetime
from pathlib import Path
from pydantic import BaseModel
def get_workspace_root() -> Path:
"""Find the workspace root by looking for the root pyproject.toml.
Returns:
Path to the workspace root directory
Raises:
RuntimeError: If workspace root cannot be found
"""
current = Path(__file__).resolve()
# Walk up the directory tree looking for workspace root
for parent in current.parents:
pyproject = parent / "pyproject.toml"
if pyproject.exists():
# Check if this is the root (has [tool.uv.workspace])
content = pyproject.read_text()
if "[tool.uv.workspace]" in content:
return parent
raise RuntimeError(
"Could not find workspace root (pyproject.toml with [tool.uv.workspace])"
)
class FVAPPSSample(BaseModel):
"""Single sample from quinn-dougherty/fvapps dataset."""
apps_id: str
apps_question: str # Natural language description
spec: str # Lean code with sorry placeholders
units: str # Unit tests (#guard_msgs, #eval)
sorries: int # Number of sorry placeholders
apps_difficulty: str | None = None
assurance_level: str | None = None
class AgentResult(BaseModel):
"""Result for a single sample evaluation."""
sample_id: str
success: bool
attempts: int # Number of verify_lean calls
final_code: str | None
error_type: str | None # Only if not success
class EvalMetrics(BaseModel):
"""Aggregated metrics across all samples."""
total_samples: int
successful: int
accuracy: float # successful / total_samples
average_attempts: float
error_distribution: dict[str, int] # error_type -> count
def setup_logging() -> None:
"""Setup logging to logs/fvapps_pydantic_YYYYMMDD_HHMMSS.log.
Creates logs directory in workspace root if it doesn't exist and configures
logging to write to both file and console.
"""
workspace_root = get_workspace_root()
logs_dir = workspace_root / "logs"
logs_dir.mkdir(exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = logs_dir / f"fvapps_pydantic_{timestamp}.log"
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
handlers=[
logging.FileHandler(log_file),
logging.StreamHandler(), # Also print to console
],
)
evals/src/evals/fvapps/pydantic_ai/types.py
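A quick, hedged illustration of what the models buy us in practice: invalid values fail loudly at construction time, and results serialize cleanly to JSON for later analysis. The values below are made up.

```python
from pydantic import ValidationError

from evals.fvapps.pydantic_ai.types import AgentResult

ok = AgentResult(
    sample_id="0001",
    success=True,
    attempts=2,
    final_code="def f (n : Nat) : Nat := n",
    error_type=None,
)
print(ok.model_dump_json())  # ready to write to a results file

try:
    AgentResult(
        sample_id="0002",
        success="maybe",   # not coercible to bool
        attempts="lots",   # not coercible to int
        final_code=None,
        error_type=None,
    )
except ValidationError as exc:
    print(exc)  # reports exactly which fields failed validation
```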
The load_fvapps_dataset function in dataset.py loads the data from Hugging Face and converts it into a list of FVAPPSSample objects.
"""Dataset loading for FVAPPS."""
from datasets import load_dataset
from evals.fvapps.pydantic_ai.types import FVAPPSSample
def load_fvapps_dataset(split: str = "train") -> list[FVAPPSSample]:
"""Load FVAPPS dataset from quinn-dougherty/fvapps.
Returns:
List of FVAPPSSample objects with Lean code and unit tests.
"""
hf_dataset = load_dataset("quinn-dougherty/fvapps", split=split)
return [
FVAPPSSample(
apps_id=row["apps_id"], # type: ignore
apps_question=row["apps_question"], # type: ignore
spec=row["spec"], # type: ignore
units=row["units"], # type: ignore
sorries=row["sorries"], # type: ignore
apps_difficulty=row.get("apps_difficulty"), # type: ignore
assurance_level=row.get("assurance_level"), # type: ignore
)
for row in hf_dataset # type: ignore
]
evals/src/evals/fvapps/pydantic_ai/dataset.py
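Loading is a one-liner; here is a hedged usage sketch (the first call downloads the dataset from Hugging Face):

```python
from evals.fvapps.pydantic_ai.dataset import load_fvapps_dataset

samples = load_fvapps_dataset(split="train")[:3]
for s in samples:
    print(s.apps_id, s.sorries, len(s.units.splitlines()))
```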
The Main Evaluation Loop (__init__.py)¶
The run_fvapps_eval function in __init__.py ties everything together. It loads the dataset, iterates through the samples, calls run_agent_on_sample for each one, and then aggregates the results.
"""Pydantic AI implementation of FVAPPS (Lean) evaluation."""
import logging
from evals.fvapps.pydantic_ai.agent import run_agent_on_sample
from evals.fvapps.pydantic_ai.dataset import load_fvapps_dataset
from evals.fvapps.pydantic_ai.types import AgentResult, EvalMetrics, setup_logging
def aggregate_results(results: list[AgentResult]) -> EvalMetrics:
"""Aggregate results into metrics.
Args:
results: List of AgentResult objects
Returns:
EvalMetrics with accuracy and error distribution
"""
total = len(results)
successful = sum(1 for r in results if r.success)
accuracy = successful / total if total > 0 else 0.0
total_attempts = sum(r.attempts for r in results)
average_attempts = total_attempts / total if total > 0 else 0.0
# Error distribution (only failed samples)
error_distribution: dict[str, int] = {}
for r in results:
if not r.success and r.error_type:
error_distribution[r.error_type] = (
error_distribution.get(r.error_type, 0) + 1
)
return EvalMetrics(
total_samples=total,
successful=successful,
accuracy=accuracy,
average_attempts=average_attempts,
error_distribution=error_distribution,
)
def run_fvapps_eval(model: str, limit: int | None) -> None:
"""Run FVAPPS evaluation with Pydantic AI.
This function:
1. Sets up logging and loads the dataset
2. Runs the pydantic-ai agent on each sample
3. Aggregates results and prints summary statistics
Args:
model: Model identifier (e.g., "anthropic/claude-sonnet-4-5")
limit: Max samples to evaluate (None = all samples)
"""
# Setup logging
setup_logging()
logger = logging.getLogger(__name__)
# Load dataset
dataset = load_fvapps_dataset()
if limit:
dataset = dataset[:limit]
print(f"Evaluating {len(dataset)} samples with {model}...")
logger.info(f"Starting evaluation: {len(dataset)} samples, model={model}")
# Run evaluation
results = []
for i, sample in enumerate(dataset, 1):
print(f"[{i}/{len(dataset)}] Evaluating {sample.apps_id}...")
result = run_agent_on_sample(sample, model=model)
results.append(result)
status = "✓" if result.success else "✗"
print(f" {status} {result.attempts} attempts")
# Aggregate metrics
metrics = aggregate_results(results)
logger.info(f"Evaluation complete: {metrics.accuracy:.1%} accuracy")
# Print summary
print("\n" + "=" * 60)
print("EVALUATION SUMMARY")
print("=" * 60)
print(f"Total samples: {metrics.total_samples}")
print(f"Successful: {metrics.successful}")
print(f"Accuracy: {metrics.accuracy:.1%}")
print(f"Average attempts: {metrics.average_attempts:.2f}")
if metrics.error_distribution:
print("\nError distribution:")
for error_type, count in sorted(metrics.error_distribution.items()):
print(f" {error_type}: {count}")
evals/src/evals/fvapps/pydantic_ai/__init__.py
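One way to expose this as a small command-line entry point is sketched below. The wrapper and its argument names are an assumption for illustration, not necessarily how the repo wires it up.

```python
# Hypothetical CLI wrapper around run_fvapps_eval (illustrative, not from the repo).
import argparse

from evals.fvapps.pydantic_ai import run_fvapps_eval

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the FVAPPS pydantic-ai eval")
    parser.add_argument("--model", default="anthropic/claude-sonnet-4-5")
    parser.add_argument(
        "--limit", type=int, default=None, help="max samples to evaluate (default: all)"
    )
    args = parser.parse_args()
    run_fvapps_eval(model=args.model, limit=args.limit)
```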
Utilities (utils.py)¶
Finally, utils.py contains helper functions for extracting code from the model’s output and categorizing Lean errors.
"""Utility functions for FVAPPS evaluation."""
import re
def extract_code(output: str) -> str:
"""Extract Lean code from agent output.
Looks for ```lean or ``` code blocks and returns the content.
Returns last code block if multiple exist.
Args:
output: Agent output text
Returns:
Extracted Lean code
"""
# Find all code blocks (``` or ```lean)
pattern = r"```(?:lean)?\n(.*?)```"
matches = re.findall(pattern, output, re.DOTALL)
if matches:
return matches[-1].strip()
# Fallback: return entire output if no code blocks
return output.strip()
def categorize_error(stderr: str) -> str:
"""Categorize Lean error types from stderr output.
Args:
stderr: Error output from Lean compiler
Returns:
One of: type_error, name_error, tactic_failure, termination_failure,
incomplete_proof, syntax_error, unit_test_failure, unknown_error
"""
stderr_lower = stderr.lower()
if "type mismatch" in stderr_lower or "expected type" in stderr_lower:
return "type_error"
elif "unknown identifier" in stderr_lower or "unknown constant" in stderr_lower:
return "name_error"
elif "tactic" in stderr_lower and "failed" in stderr_lower:
return "tactic_failure"
elif "termination" in stderr_lower or "structural recursion" in stderr_lower:
return "termination_failure"
elif (
"unsolved goals" in stderr_lower
or "don't know how to synthesize" in stderr_lower
):
return "incomplete_proof"
elif "unexpected" in stderr_lower or "expected" in stderr_lower:
return "syntax_error"
elif "guard" in stderr_lower or "failed" in stderr_lower:
return "unit_test_failure"
else:
return "unknown_error"
evals/src/evals/fvapps/pydantic_ai/utils.py
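Both helpers are easy to sanity-check in isolation; the strings below are made up for illustration.

```python
from evals.fvapps.pydantic_ai.utils import categorize_error, extract_code

fence = "`" * 3  # triple backtick, spelled out to keep this example readable
output = f"Here is my solution:\n{fence}lean\ndef f (n : Nat) : Nat := n\n{fence}\nDone."
print(extract_code(output))      # -> def f (n : Nat) : Nat := n

stderr = "error: unsolved goals\n⊢ f 0 = 0"
print(categorize_error(stderr))  # -> incomplete_proof
```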
This walkthrough shows how you can use pydantic-ai to quickly build a powerful agent for a complex task like formal verification. The library handles the agent loop, leaving you to focus on the core logic of your tools and prompts.