The Manual Tool-Calling Loop - prealpha; Formal Verification Agent Recipes

At the heart of any agent framework is a loop. A loop that calls the model, gets a response, and then decides what to do next. In this section, we’ll dissect the manual tool-calling loop from our plain Python implementation, revealing the mechanics that frameworks like inspect-ai abstract away. ## TODO: audit the prose to make sure its not too LLMy

The core of the loop is surprisingly simple. It’s a for loop that iterates a fixed number of times, calling the Anthropic API on each iteration.

agent.py

"""Manual tool-calling loop for plain DafnyBench implementation.

This module demonstrates what inspect-ai's generate() abstracts away by
implementing the tool-calling loop manually with the Anthropic SDK.

This version uses specialized insertion tools instead of having the agent
regenerate complete files.
"""

import logging
from datetime import datetime

import anthropic
from evals.dafnybench.inspect_ai.utils import categorize_error
from evals.dafnybench.plain.config import get_config, normalize_model_name
from evals.dafnybench.plain.io_util import save_artifact, save_conversation_history
from evals.dafnybench.plain.structures import AgentResult, EvalSample
from evals.dafnybench.plain.tools import (
    TOOLS,
    get_code_state,
    insert_assertion,
    insert_invariant,
    insert_measure,
    insert_postcondition,
    insert_precondition,
    update_code_state,
    verify_dafny,
)


def handle_tool(
    tool_name: str,
    tool_input: dict,
    tool_use_id: str,
    messages: list[dict],
    sample: EvalSample,
    attempts: int,
    success: bool,
    final_code: str | None,
) -> tuple[dict, int, bool, str | None, str | None]:
    """Handle a single tool call and return results.

    Args:
        tool_name: Name of the tool to call
        tool_input: Input parameters for the tool
        tool_use_id: Tool use ID from Anthropic API
        messages: Message history
        sample: Current evaluation sample
        attempts: Current attempt count
        success: Current success status
        final_code: Current final code

    Returns:
        Tuple of (tool_result_dict, attempts, success, final_code, latest_code)
    """
    logger = logging.getLogger(__name__)
    latest_code = None

    match tool_name:
        case "insert_invariant":
            result = insert_invariant(
                messages,
                invariant=tool_input["invariant"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert invariant: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "insert_assertion":
            result = insert_assertion(
                messages,
                assertion=tool_input["assertion"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert assertion: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "insert_precondition":
            result = insert_precondition(
                messages,
                precondition=tool_input["precondition"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert precondition: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "insert_postcondition":
            result = insert_postcondition(
                messages,
                postcondition=tool_input["postcondition"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert postcondition: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "insert_measure":
            result = insert_measure(
                messages,
                measure=tool_input["measure"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert measure: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "verify_dafny":
            attempts += 1
            result = verify_dafny(messages)

            logger.info(
                f"Attempt {attempts}: Verification {'succeeded' if result['success'] else 'failed'}"
            )

            # Save artifact with current code
            code = result.get("code")
            if code and isinstance(code, str):
                save_artifact(
                    sample.test_id,
                    attempts,
                    code,
                    is_final=bool(result.get("success")),
                )

            if result.get("success"):
                # Verification succeeded!
                success = True
                code_value = result.get("code")
                final_code = code_value if isinstance(code_value, str) else None
                logger.info(f"Success after {attempts} attempts")

                msg = result.get("message", "")
                tool_result = {
                    "type": "tool_result",
                    "tool_use_id": tool_use_id,
                    "content": msg if isinstance(msg, str) else str(msg),
                }
            else:
                # Verification failed
                msg = result.get("message", "")
                msg_str = msg if isinstance(msg, str) else str(msg)
                logger.debug(f"Verification failed: {msg_str[:100]}...")
                tool_result = {
                    "type": "tool_result",
                    "tool_use_id": tool_use_id,
                    "content": msg_str,
                    "is_error": True,
                }

    latest_code_result: str | None = (
        latest_code if isinstance(latest_code, str) else None
    )
    return tool_result, attempts, success, final_code, latest_code_result


def run_agent(
    sample: EvalSample, model: str, max_iterations: int | None = None
) -> AgentResult:
    """Run agent on a single sample with manual tool-calling loop.

    This function implements the core tool-calling loop that inspect-ai's
    generate() handles automatically. It demonstrates:
    - Manual message history management
    - Anthropic API integration with tools
    - Tool execution and result handling
    - Iteration control and stopping conditions

    This version uses specialized insertion tools for targeted hint placement
    instead of having the agent regenerate complete Dafny files.

    Args:
        sample: Evaluation sample with code to verify
        model: Model identifier (e.g., "anthropic/claude-sonnet-4-5")
        max_iterations: Maximum number of tool-calling iterations (None = use config)

    Returns:
        AgentResult with success status, attempts, final code, and error type
    """
    # Load configuration
    config = get_config()

    # Use config defaults if not specified
    if max_iterations is None:
        max_iterations = config.evaluation.max_iterations

    # Generate timestamp for logging
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Initialize Anthropic client
    client = anthropic.Anthropic()

    # Strip "anthropic/" prefix from model name if present (inspect-ai format)
    model = normalize_model_name(model)

    # Initialize message history with code state (using template from config)
    initial_state = config.prompt.initial_state_template.format(
        code=sample.hints_removed
    )

    messages = [
        {"role": "user", "content": sample.input},
        {"role": "user", "content": initial_state},
    ]

    # Track metrics
    attempts = 0
    success = False
    final_code = None
    error_type = None

    # Setup logging for this sample
    logger = logging.getLogger(__name__)
    logger.info(f"Starting evaluation for sample {sample.test_id}")

    # Manual iteration loop - this is what generate() does automatically!
    for iteration in range(max_iterations):
        logger.debug(f"Iteration {iteration + 1}/{max_iterations}")

        try:
            # Call Anthropic API
            response = client.messages.create(
                model=model,
                max_tokens=config.evaluation.max_tokens,
                system=config.prompt.system_prompt,
                messages=messages,
                tools=TOOLS,
            )
        except anthropic.APIError as e:
            logger.error(f"API error: {e}")
            break

        # Add assistant response to message history
        assistant_message = {"role": "assistant", "content": response.content}
        messages.append(assistant_message)

        # Check stop reason
        if response.stop_reason == "end_turn":
            # No tool use - agent finished without calling tool
            logger.info("Agent ended turn without tool use")
            break

        if response.stop_reason != "tool_use":
            # Unexpected stop reason
            logger.warning(f"Unexpected stop reason: {response.stop_reason}")
            break

        # Process tool uses
        tool_results = []
        latest_code = None  # Track latest code state from insertions

        for content_block in response.content:
            if content_block.type == "tool_use":
                tool_result, attempts, success, final_code, code_update = handle_tool(
                    tool_name=content_block.name,
                    tool_input=content_block.input,
                    tool_use_id=content_block.id,
                    messages=messages,
                    sample=sample,
                    attempts=attempts,
                    success=success,
                    final_code=final_code,
                )
                tool_results.append(tool_result)
                if code_update is not None:
                    latest_code = code_update

        # Add tool results as user message (BEFORE state update to maintain pairing)
        messages.append({"role": "user", "content": tool_results})

        # Update code state AFTER tool_results to maintain proper Anthropic message pairing
        # Note: This means multiple insertions in one turn are NOT cumulative - agent must
        # call verify_dafny or make multiple turns to see cumulative effects
        if latest_code is not None and isinstance(latest_code, str):
            update_code_state(messages, latest_code)

        # If verification succeeded, make one more API call to let model respond
        if success:
            try:
                response = client.messages.create(
                    model=model,
                    max_tokens=config.evaluation.max_tokens,
                    system=config.prompt.system_prompt,
                    messages=messages,
                    tools=TOOLS,
                )
            except anthropic.APIError as e:
                logger.error(f"API error on final call: {e}")
            break

    # Get final code from state if we didn't get it from success
    if not success:
        final_code = get_code_state(messages)
        if final_code:
            # Run one final verification to get error details
            result = verify_dafny(messages)
            stderr = result.get("stderr", "")
            error_type = categorize_error(stderr if isinstance(stderr, str) else "")
            logger.warning(f"Failed after {attempts} attempts: {error_type}")

    # Save full conversation history to JSON
    save_conversation_history(
        test_id=sample.test_id,
        timestamp=timestamp,
        messages=messages,
        system_prompt=config.prompt.system_prompt,
    )

    return AgentResult(
        sample_id=sample.test_id,
        success=success,
        attempts=attempts,
        final_code=final_code if isinstance(final_code, str) else None,
        error_type=error_type if not success else None,
    )

The core agent loop

The real complexity lies in managing the message history. The Anthropic API is stateless, which means we need to send the entire conversation history with each API call. This history must follow a strict alternating pattern of user and assistant roles.

When a model response includes a tool_use stop reason, we need to execute the requested tools and then append the results to the message history in a new user message. This is the essence of the tool-calling loop.

agent.py

"""Manual tool-calling loop for plain DafnyBench implementation.

This module demonstrates what inspect-ai's generate() abstracts away by
implementing the tool-calling loop manually with the Anthropic SDK.

This version uses specialized insertion tools instead of having the agent
regenerate complete files.
"""

import logging
from datetime import datetime

import anthropic
from evals.dafnybench.inspect_ai.utils import categorize_error
from evals.dafnybench.plain.config import get_config, normalize_model_name
from evals.dafnybench.plain.io_util import save_artifact, save_conversation_history
from evals.dafnybench.plain.structures import AgentResult, EvalSample
from evals.dafnybench.plain.tools import (
    TOOLS,
    get_code_state,
    insert_assertion,
    insert_invariant,
    insert_measure,
    insert_postcondition,
    insert_precondition,
    update_code_state,
    verify_dafny,
)


def handle_tool(
    tool_name: str,
    tool_input: dict,
    tool_use_id: str,
    messages: list[dict],
    sample: EvalSample,
    attempts: int,
    success: bool,
    final_code: str | None,
) -> tuple[dict, int, bool, str | None, str | None]:
    """Handle a single tool call and return results.

    Args:
        tool_name: Name of the tool to call
        tool_input: Input parameters for the tool
        tool_use_id: Tool use ID from Anthropic API
        messages: Message history
        sample: Current evaluation sample
        attempts: Current attempt count
        success: Current success status
        final_code: Current final code

    Returns:
        Tuple of (tool_result_dict, attempts, success, final_code, latest_code)
    """
    logger = logging.getLogger(__name__)
    latest_code = None

    match tool_name:
        case "insert_invariant":
            result = insert_invariant(
                messages,
                invariant=tool_input["invariant"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert invariant: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "insert_assertion":
            result = insert_assertion(
                messages,
                assertion=tool_input["assertion"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert assertion: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "insert_precondition":
            result = insert_precondition(
                messages,
                precondition=tool_input["precondition"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert precondition: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "insert_postcondition":
            result = insert_postcondition(
                messages,
                postcondition=tool_input["postcondition"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert postcondition: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "insert_measure":
            result = insert_measure(
                messages,
                measure=tool_input["measure"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert measure: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "verify_dafny":
            attempts += 1
            result = verify_dafny(messages)

            logger.info(
                f"Attempt {attempts}: Verification {'succeeded' if result['success'] else 'failed'}"
            )

            # Save artifact with current code
            code = result.get("code")
            if code and isinstance(code, str):
                save_artifact(
                    sample.test_id,
                    attempts,
                    code,
                    is_final=bool(result.get("success")),
                )

            if result.get("success"):
                # Verification succeeded!
                success = True
                code_value = result.get("code")
                final_code = code_value if isinstance(code_value, str) else None
                logger.info(f"Success after {attempts} attempts")

                msg = result.get("message", "")
                tool_result = {
                    "type": "tool_result",
                    "tool_use_id": tool_use_id,
                    "content": msg if isinstance(msg, str) else str(msg),
                }
            else:
                # Verification failed
                msg = result.get("message", "")
                msg_str = msg if isinstance(msg, str) else str(msg)
                logger.debug(f"Verification failed: {msg_str[:100]}...")
                tool_result = {
                    "type": "tool_result",
                    "tool_use_id": tool_use_id,
                    "content": msg_str,
                    "is_error": True,
                }

    latest_code_result: str | None = (
        latest_code if isinstance(latest_code, str) else None
    )
    return tool_result, attempts, success, final_code, latest_code_result


def run_agent(
    sample: EvalSample, model: str, max_iterations: int | None = None
) -> AgentResult:
    """Run agent on a single sample with manual tool-calling loop.

    This function implements the core tool-calling loop that inspect-ai's
    generate() handles automatically. It demonstrates:
    - Manual message history management
    - Anthropic API integration with tools
    - Tool execution and result handling
    - Iteration control and stopping conditions

    This version uses specialized insertion tools for targeted hint placement
    instead of having the agent regenerate complete Dafny files.

    Args:
        sample: Evaluation sample with code to verify
        model: Model identifier (e.g., "anthropic/claude-sonnet-4-5")
        max_iterations: Maximum number of tool-calling iterations (None = use config)

    Returns:
        AgentResult with success status, attempts, final code, and error type
    """
    # Load configuration
    config = get_config()

    # Use config defaults if not specified
    if max_iterations is None:
        max_iterations = config.evaluation.max_iterations

    # Generate timestamp for logging
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Initialize Anthropic client
    client = anthropic.Anthropic()

    # Strip "anthropic/" prefix from model name if present (inspect-ai format)
    model = normalize_model_name(model)

    # Initialize message history with code state (using template from config)
    initial_state = config.prompt.initial_state_template.format(
        code=sample.hints_removed
    )

    messages = [
        {"role": "user", "content": sample.input},
        {"role": "user", "content": initial_state},
    ]

    # Track metrics
    attempts = 0
    success = False
    final_code = None
    error_type = None

    # Setup logging for this sample
    logger = logging.getLogger(__name__)
    logger.info(f"Starting evaluation for sample {sample.test_id}")

    # Manual iteration loop - this is what generate() does automatically!
    for iteration in range(max_iterations):
        logger.debug(f"Iteration {iteration + 1}/{max_iterations}")

        try:
            # Call Anthropic API
            response = client.messages.create(
                model=model,
                max_tokens=config.evaluation.max_tokens,
                system=config.prompt.system_prompt,
                messages=messages,
                tools=TOOLS,
            )
        except anthropic.APIError as e:
            logger.error(f"API error: {e}")
            break

        # Add assistant response to message history
        assistant_message = {"role": "assistant", "content": response.content}
        messages.append(assistant_message)

        # Check stop reason
        if response.stop_reason == "end_turn":
            # No tool use - agent finished without calling tool
            logger.info("Agent ended turn without tool use")
            break

        if response.stop_reason != "tool_use":
            # Unexpected stop reason
            logger.warning(f"Unexpected stop reason: {response.stop_reason}")
            break

        # Process tool uses
        tool_results = []
        latest_code = None  # Track latest code state from insertions

        for content_block in response.content:
            if content_block.type == "tool_use":
                tool_result, attempts, success, final_code, code_update = handle_tool(
                    tool_name=content_block.name,
                    tool_input=content_block.input,
                    tool_use_id=content_block.id,
                    messages=messages,
                    sample=sample,
                    attempts=attempts,
                    success=success,
                    final_code=final_code,
                )
                tool_results.append(tool_result)
                if code_update is not None:
                    latest_code = code_update

        # Add tool results as user message (BEFORE state update to maintain pairing)
        messages.append({"role": "user", "content": tool_results})

        # Update code state AFTER tool_results to maintain proper Anthropic message pairing
        # Note: This means multiple insertions in one turn are NOT cumulative - agent must
        # call verify_dafny or make multiple turns to see cumulative effects
        if latest_code is not None and isinstance(latest_code, str):
            update_code_state(messages, latest_code)

        # If verification succeeded, make one more API call to let model respond
        if success:
            try:
                response = client.messages.create(
                    model=model,
                    max_tokens=config.evaluation.max_tokens,
                    system=config.prompt.system_prompt,
                    messages=messages,
                    tools=TOOLS,
                )
            except anthropic.APIError as e:
                logger.error(f"API error on final call: {e}")
            break

    # Get final code from state if we didn't get it from success
    if not success:
        final_code = get_code_state(messages)
        if final_code:
            # Run one final verification to get error details
            result = verify_dafny(messages)
            stderr = result.get("stderr", "")
            error_type = categorize_error(stderr if isinstance(stderr, str) else "")
            logger.warning(f"Failed after {attempts} attempts: {error_type}")

    # Save full conversation history to JSON
    save_conversation_history(
        test_id=sample.test_id,
        timestamp=timestamp,
        messages=messages,
        system_prompt=config.prompt.system_prompt,
    )

    return AgentResult(
        sample_id=sample.test_id,
        success=success,
        attempts=attempts,
        final_code=final_code if isinstance(final_code, str) else None,
        error_type=error_type if not success else None,
    )

Handling tool use

Another important detail is the “one more call” pattern. When the verify_dafny tool succeeds, we don’t just stop. We make one more call to the model, with the success message in the history. This gives the model a chance to respond to the success, which can make the conversation feel more natural.

agent.py

"""Manual tool-calling loop for plain DafnyBench implementation.

This module demonstrates what inspect-ai's generate() abstracts away by
implementing the tool-calling loop manually with the Anthropic SDK.

This version uses specialized insertion tools instead of having the agent
regenerate complete files.
"""

import logging
from datetime import datetime

import anthropic
from evals.dafnybench.inspect_ai.utils import categorize_error
from evals.dafnybench.plain.config import get_config, normalize_model_name
from evals.dafnybench.plain.io_util import save_artifact, save_conversation_history
from evals.dafnybench.plain.structures import AgentResult, EvalSample
from evals.dafnybench.plain.tools import (
    TOOLS,
    get_code_state,
    insert_assertion,
    insert_invariant,
    insert_measure,
    insert_postcondition,
    insert_precondition,
    update_code_state,
    verify_dafny,
)


def handle_tool(
    tool_name: str,
    tool_input: dict,
    tool_use_id: str,
    messages: list[dict],
    sample: EvalSample,
    attempts: int,
    success: bool,
    final_code: str | None,
) -> tuple[dict, int, bool, str | None, str | None]:
    """Handle a single tool call and return results.

    Args:
        tool_name: Name of the tool to call
        tool_input: Input parameters for the tool
        tool_use_id: Tool use ID from Anthropic API
        messages: Message history
        sample: Current evaluation sample
        attempts: Current attempt count
        success: Current success status
        final_code: Current final code

    Returns:
        Tuple of (tool_result_dict, attempts, success, final_code, latest_code)
    """
    logger = logging.getLogger(__name__)
    latest_code = None

    match tool_name:
        case "insert_invariant":
            result = insert_invariant(
                messages,
                invariant=tool_input["invariant"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert invariant: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "insert_assertion":
            result = insert_assertion(
                messages,
                assertion=tool_input["assertion"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert assertion: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "insert_precondition":
            result = insert_precondition(
                messages,
                precondition=tool_input["precondition"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert precondition: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "insert_postcondition":
            result = insert_postcondition(
                messages,
                postcondition=tool_input["postcondition"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert postcondition: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "insert_measure":
            result = insert_measure(
                messages,
                measure=tool_input["measure"],
                line_number=tool_input.get("line_number"),
                context_before=tool_input.get("context_before"),
                context_after=tool_input.get("context_after"),
            )
            logger.info(f"Insert measure: {result['message']}")
            tool_result = {
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result["message"],
                "is_error": not result["success"],
            }
            if result["success"] and result.get("code"):
                latest_code = result["code"]

        case "verify_dafny":
            attempts += 1
            result = verify_dafny(messages)

            logger.info(
                f"Attempt {attempts}: Verification {'succeeded' if result['success'] else 'failed'}"
            )

            # Save artifact with current code
            code = result.get("code")
            if code and isinstance(code, str):
                save_artifact(
                    sample.test_id,
                    attempts,
                    code,
                    is_final=bool(result.get("success")),
                )

            if result.get("success"):
                # Verification succeeded!
                success = True
                code_value = result.get("code")
                final_code = code_value if isinstance(code_value, str) else None
                logger.info(f"Success after {attempts} attempts")

                msg = result.get("message", "")
                tool_result = {
                    "type": "tool_result",
                    "tool_use_id": tool_use_id,
                    "content": msg if isinstance(msg, str) else str(msg),
                }
            else:
                # Verification failed
                msg = result.get("message", "")
                msg_str = msg if isinstance(msg, str) else str(msg)
                logger.debug(f"Verification failed: {msg_str[:100]}...")
                tool_result = {
                    "type": "tool_result",
                    "tool_use_id": tool_use_id,
                    "content": msg_str,
                    "is_error": True,
                }

    latest_code_result: str | None = (
        latest_code if isinstance(latest_code, str) else None
    )
    return tool_result, attempts, success, final_code, latest_code_result


def run_agent(
    sample: EvalSample, model: str, max_iterations: int | None = None
) -> AgentResult:
    """Run agent on a single sample with manual tool-calling loop.

    This function implements the core tool-calling loop that inspect-ai's
    generate() handles automatically. It demonstrates:
    - Manual message history management
    - Anthropic API integration with tools
    - Tool execution and result handling
    - Iteration control and stopping conditions

    This version uses specialized insertion tools for targeted hint placement
    instead of having the agent regenerate complete Dafny files.

    Args:
        sample: Evaluation sample with code to verify
        model: Model identifier (e.g., "anthropic/claude-sonnet-4-5")
        max_iterations: Maximum number of tool-calling iterations (None = use config)

    Returns:
        AgentResult with success status, attempts, final code, and error type
    """
    # Load configuration
    config = get_config()

    # Use config defaults if not specified
    if max_iterations is None:
        max_iterations = config.evaluation.max_iterations

    # Generate timestamp for logging
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Initialize Anthropic client
    client = anthropic.Anthropic()

    # Strip "anthropic/" prefix from model name if present (inspect-ai format)
    model = normalize_model_name(model)

    # Initialize message history with code state (using template from config)
    initial_state = config.prompt.initial_state_template.format(
        code=sample.hints_removed
    )

    messages = [
        {"role": "user", "content": sample.input},
        {"role": "user", "content": initial_state},
    ]

    # Track metrics
    attempts = 0
    success = False
    final_code = None
    error_type = None

    # Setup logging for this sample
    logger = logging.getLogger(__name__)
    logger.info(f"Starting evaluation for sample {sample.test_id}")

    # Manual iteration loop - this is what generate() does automatically!
    for iteration in range(max_iterations):
        logger.debug(f"Iteration {iteration + 1}/{max_iterations}")

        try:
            # Call Anthropic API
            response = client.messages.create(
                model=model,
                max_tokens=config.evaluation.max_tokens,
                system=config.prompt.system_prompt,
                messages=messages,
                tools=TOOLS,
            )
        except anthropic.APIError as e:
            logger.error(f"API error: {e}")
            break

        # Add assistant response to message history
        assistant_message = {"role": "assistant", "content": response.content}
        messages.append(assistant_message)

        # Check stop reason
        if response.stop_reason == "end_turn":
            # No tool use - agent finished without calling tool
            logger.info("Agent ended turn without tool use")
            break

        if response.stop_reason != "tool_use":
            # Unexpected stop reason
            logger.warning(f"Unexpected stop reason: {response.stop_reason}")
            break

        # Process tool uses
        tool_results = []
        latest_code = None  # Track latest code state from insertions

        for content_block in response.content:
            if content_block.type == "tool_use":
                tool_result, attempts, success, final_code, code_update = handle_tool(
                    tool_name=content_block.name,
                    tool_input=content_block.input,
                    tool_use_id=content_block.id,
                    messages=messages,
                    sample=sample,
                    attempts=attempts,
                    success=success,
                    final_code=final_code,
                )
                tool_results.append(tool_result)
                if code_update is not None:
                    latest_code = code_update

        # Add tool results as user message (BEFORE state update to maintain pairing)
        messages.append({"role": "user", "content": tool_results})

        # Update code state AFTER tool_results to maintain proper Anthropic message pairing
        # Note: This means multiple insertions in one turn are NOT cumulative - agent must
        # call verify_dafny or make multiple turns to see cumulative effects
        if latest_code is not None and isinstance(latest_code, str):
            update_code_state(messages, latest_code)

        # If verification succeeded, make one more API call to let model respond
        if success:
            try:
                response = client.messages.create(
                    model=model,
                    max_tokens=config.evaluation.max_tokens,
                    system=config.prompt.system_prompt,
                    messages=messages,
                    tools=TOOLS,
                )
            except anthropic.APIError as e:
                logger.error(f"API error on final call: {e}")
            break

    # Get final code from state if we didn't get it from success
    if not success:
        final_code = get_code_state(messages)
        if final_code:
            # Run one final verification to get error details
            result = verify_dafny(messages)
            stderr = result.get("stderr", "")
            error_type = categorize_error(stderr if isinstance(stderr, str) else "")
            logger.warning(f"Failed after {attempts} attempts: {error_type}")

    # Save full conversation history to JSON
    save_conversation_history(
        test_id=sample.test_id,
        timestamp=timestamp,
        messages=messages,
        system_prompt=config.prompt.system_prompt,
    )

    return AgentResult(
        sample_id=sample.test_id,
        success=success,
        attempts=attempts,
        final_code=final_code if isinstance(final_code, str) else None,
        error_type=error_type if not success else None,
    )

The “one more call” pattern

As you can see, the manual loop gives us a lot of control, but it also means we’re responsible for a lot of bookkeeping. This is the trade-off of working without a framework.

Dafny part 2: plain Python

Introduction: Dispelling Superstitions

Dafny part 2: plain Python

Specialized Insertion Tools