Dual Location Specification - prealpha; Formal Verification Agent Recipes

Once we have specialized insertion tools, we need a way for the agent to specify where it wants to insert the hints. This is a surprisingly tricky problem. ## TODO: audit the prose to make sure its not too LLMy
One approach is to use line numbers. Line numbers are precise, but they’re also brittle. If the code changes, the line numbers might become invalid.
Another approach is to use context matching. The agent can specify a few lines of code that appear before or after the insertion point. This is more flexible, but it can be ambiguous if the same context appears in multiple places.
Our plain Python agent uses a hybrid approach. The insertion tools accept both a line_number and a context_before argument. The agent can provide either one, or both.
tools.py
"""Tool execution for plain DafnyBench implementation with hint insertion tools."""

import subprocess
import tempfile
from pathlib import Path

# Tool definitions (Anthropic API format)
INSERT_INVARIANT_TOOL = {
    "name": "insert_invariant",
    "description": "Insert a loop invariant at specified location",
    "input_schema": {
        "type": "object",
        "properties": {
            "invariant": {
                "type": "string",
                "description": "Invariant expression",
            },
            "line_number": {
                "type": "integer",
                "description": "Line number (1-indexed, optional)",
            },
            "context_before": {
                "type": "string",
                "description": "Line before insertion point (optional)",
            },
            "context_after": {
                "type": "string",
                "description": "Line after insertion point (optional)",
            },
        },
        "required": ["invariant"],
    },
}

INSERT_ASSERTION_TOOL = {
    "name": "insert_assertion",
    "description": "Insert an assertion at specified location",
    "input_schema": {
        "type": "object",
        "properties": {
            "assertion": {
                "type": "string",
                "description": "Assertion expression",
            },
            "line_number": {
                "type": "integer",
                "description": "Line number (1-indexed, optional)",
            },
            "context_before": {
                "type": "string",
                "description": "Line before insertion point (optional)",
            },
            "context_after": {
                "type": "string",
                "description": "Line after insertion point (optional)",
            },
        },
        "required": ["assertion"],
    },
}

INSERT_PRECONDITION_TOOL = {
    "name": "insert_precondition",
    "description": "Insert a function precondition (requires clause) at specified location",
    "input_schema": {
        "type": "object",
        "properties": {
            "precondition": {
                "type": "string",
                "description": "Precondition expression",
            },
            "line_number": {
                "type": "integer",
                "description": "Line number (1-indexed, optional)",
            },
            "context_before": {
                "type": "string",
                "description": "Line before insertion point (optional)",
            },
            "context_after": {
                "type": "string",
                "description": "Line after insertion point (optional)",
            },
        },
        "required": ["precondition"],
    },
}

INSERT_POSTCONDITION_TOOL = {
    "name": "insert_postcondition",
    "description": "Insert a function postcondition (ensures clause) at specified location",
    "input_schema": {
        "type": "object",
        "properties": {
            "postcondition": {
                "type": "string",
                "description": "Postcondition expression",
            },
            "line_number": {
                "type": "integer",
                "description": "Line number (1-indexed, optional)",
            },
            "context_before": {
                "type": "string",
                "description": "Line before insertion point (optional)",
            },
            "context_after": {
                "type": "string",
                "description": "Line after insertion point (optional)",
            },
        },
        "required": ["postcondition"],
    },
}

INSERT_MEASURE_TOOL = {
    "name": "insert_measure",
    "description": "Insert a termination measure (decreases clause) at specified location",
    "input_schema": {
        "type": "object",
        "properties": {
            "measure": {"type": "string", "description": "Decreases expression"},
            "line_number": {
                "type": "integer",
                "description": "Line number (1-indexed, optional)",
            },
            "context_before": {
                "type": "string",
                "description": "Line before insertion point (optional)",
            },
            "context_after": {
                "type": "string",
                "description": "Line after insertion point (optional)",
            },
        },
        "required": ["measure"],
    },
}

VERIFY_DAFNY_TOOL = {
    "name": "verify_dafny",
    "description": "Verify the current code state with all hints inserted so far. "
    "Returns verification results and full rendered code.",
    "input_schema": {
        "type": "object",
        "properties": {},  # No parameters - reads from state
    },
}

TOOLS = [
    INSERT_INVARIANT_TOOL,
    INSERT_ASSERTION_TOOL,
    INSERT_PRECONDITION_TOOL,
    INSERT_POSTCONDITION_TOOL,
    INSERT_MEASURE_TOOL,
    VERIFY_DAFNY_TOOL,
]

# ===== Phase 1: State Management =====


def get_code_state(messages: list[dict]) -> str | None:
    """Extract current code state from message history.

    Searches backwards through messages for most recent code state marker.
    Returns None if no state found (initialization case).

    Args:
        messages: Message history containing code states

    Returns:
        Current code string or None if no state found
    """
    for msg in reversed(messages):
        if msg["role"] == "user":
            # Check for list of tool_result blocks
            if isinstance(msg["content"], list):
                for block in msg["content"]:
                    if block.get("type") == "tool_result":
                        content = block.get("content", "")
                        if "=== CURRENT_CODE_STATE ===" in content:
                            # Extract code between markers
                            start = content.find("```dafny\n")
                            end = content.find("\n```", start)
                            if start != -1 and end != -1:
                                return content[start + 9 : end]
            # Check for string content with state marker
            elif isinstance(msg["content"], str):
                if "=== CURRENT_CODE_STATE ===" in msg["content"]:
                    start = msg["content"].find("```dafny\n")
                    end = msg["content"].find("\n```", start)
                    if start != -1 and end != -1:
                        return msg["content"][start + 9 : end]
    return None


def update_code_state(messages: list[dict], new_code: str) -> None:
    """Update message history with new code state.

    Appends user message containing state marker and code.
    This is called after each insertion to update the tracked state.

    Args:
        messages: Message history to update (modified in place)
        new_code: New code state to store
    """
    from evals.dafnybench.plain.config import get_config

    config = get_config()
    state_message = config.prompt.state_update_template.format(code=new_code)

    messages.append({"role": "user", "content": state_message})


# ===== Phase 2: Insertion Utilities =====


def get_indentation(line: str) -> str:
    """Extract leading whitespace from a line.

    Args:
        line: Line of code

    Returns:
        Leading whitespace string
    """
    return line[: len(line) - len(line.lstrip())]


def find_insertion_point(
    code_lines: list[str],
    line_number: int | None = None,
    context_before: str | None = None,
    context_after: str | None = None,
) -> tuple[int, str] | tuple[None, str]:
    """Find where to insert hint using line number or context matching.

    Returns:
        (line_index, indentation) on success
        (None, error_message) on failure

    Priority:
        1. If line_number provided, use it (0-indexed internally)
        2. If context_before/after provided, search for match
        3. If both provided, try line_number first, fall back to context

    Args:
        code_lines: Lines of code
        line_number: 1-indexed line number (optional)
        context_before: Line before insertion point (optional)
        context_after: Line after insertion point for disambiguation (optional)

    Returns:
        Either (index, indentation) or (None, error_message)
    """
    if line_number is not None:
        # Line-based insertion (convert to 0-indexed)
        idx = line_number - 1
        if 0 <= idx <= len(code_lines):
            # Get indentation from previous line or next line
            if idx > 0:
                indent = get_indentation(code_lines[idx - 1])
            elif idx < len(code_lines):
                indent = get_indentation(code_lines[idx])
            else:
                indent = ""
            return idx, indent
        else:
            return (
                None,
                f"Line number {line_number} out of range (1-{len(code_lines)})",
            )

    if context_before is not None:
        # Context-based search
        matches = []
        for i, line in enumerate(code_lines):
            if context_before.strip() in line:
                # Check context_after if provided
                if context_after is not None:
                    if (
                        i + 1 < len(code_lines)
                        and context_after.strip() in code_lines[i + 1]
                    ):
                        matches.append(i + 1)
                else:
                    matches.append(i + 1)

        if len(matches) == 0:
            return None, f"Context not found: '{context_before}'"
        elif len(matches) > 1:
            return (
                None,
                f"Ambiguous context: '{context_before}' matches {len(matches)} locations. "
                f"Use context_after or line_number to disambiguate.",
            )
        else:
            idx = matches[0]
            indent = get_indentation(code_lines[idx - 1]) if idx > 0 else ""
            return idx, indent

    return None, "Must provide either line_number or context_before"


def format_hint_line(hint_type: str, hint_content: str, indentation: str) -> str:
    """Format a hint with proper indentation and syntax.

    Args:
        hint_type: "invariant", "assert", "requires", "ensures", "decreases"
        hint_content: The actual hint expression
        indentation: Leading whitespace

    Returns:
        Formatted hint line (no trailing newline)
    """
    return f"{indentation}{hint_type} {hint_content}"


# ===== Phase 3: Specialized Insertion Tools =====


def insert_hint(
    messages: list[dict],
    hint_type: str,
    hint_content: str,
    line_number: int | None = None,
    context_before: str | None = None,
    context_after: str | None = None,
) -> dict[str, str | bool]:
    """Generic hint insertion function.

    Returns:
        dict with keys:
            - success (bool): True if insertion succeeded
            - message (str): Success/error message
            - code (str): Full code after insertion (on success)

    Args:
        messages: Message history containing code state
        hint_type: Type of hint to insert
        hint_content: Content of the hint
        line_number: Optional line number (1-indexed)
        context_before: Optional context before insertion point
        context_after: Optional context after insertion point
    """
    # Get current code state
    current_code = get_code_state(messages)
    if current_code is None:
        return {
            "success": False,
            "message": "Error: No code state found. This is a system error.",
            "code": "",
        }

    # Parse into lines
    code_lines = current_code.split("\n")

    # Find insertion point
    result = find_insertion_point(
        code_lines, line_number, context_before, context_after
    )
    if result[0] is None:
        return {
            "success": False,
            "message": result[1],
            "code": current_code,
        }

    idx, indent = result
    assert idx is not None, "idx should not be None after early return check"

    # Format hint
    hint_line = format_hint_line(hint_type, hint_content, indent)

    # Insert at position
    code_lines.insert(idx, hint_line)

    # Reconstruct code
    new_code = "\n".join(code_lines)

    # DON'T update state here - let the agent do it after collecting all tool results
    # This prevents breaking tool_use/tool_result pairing in message history

    return {
        "success": True,
        "message": f"✓ {hint_type.capitalize()} inserted at line {idx + 1}",
        "code": new_code,
    }


def insert_invariant(
    messages: list[dict],
    invariant: str,
    line_number: int | None = None,
    context_before: str | None = None,
    context_after: str | None = None,
) -> dict[str, str | bool]:
    """Insert loop invariant hint.

    Args:
        messages: Message history
        invariant: Invariant expression
        line_number: Optional line number (1-indexed)
        context_before: Optional context before insertion
        context_after: Optional context after insertion

    Returns:
        Result dict with success, message, and code
    """
    return insert_hint(
        messages, "invariant", invariant, line_number, context_before, context_after
    )


def insert_assertion(
    messages: list[dict],
    assertion: str,
    line_number: int | None = None,
    context_before: str | None = None,
    context_after: str | None = None,
) -> dict[str, str | bool]:
    """Insert assertion hint.

    Args:
        messages: Message history
        assertion: Assertion expression
        line_number: Optional line number (1-indexed)
        context_before: Optional context before insertion
        context_after: Optional context after insertion

    Returns:
        Result dict with success, message, and code
    """
    return insert_hint(
        messages, "assert", assertion, line_number, context_before, context_after
    )


def insert_precondition(
    messages: list[dict],
    precondition: str,
    line_number: int | None = None,
    context_before: str | None = None,
    context_after: str | None = None,
) -> dict[str, str | bool]:
    """Insert function precondition (requires).

    Args:
        messages: Message history
        precondition: Precondition expression
        line_number: Optional line number (1-indexed)
        context_before: Optional context before insertion
        context_after: Optional context after insertion

    Returns:
        Result dict with success, message, and code
    """
    return insert_hint(
        messages, "requires", precondition, line_number, context_before, context_after
    )


def insert_postcondition(
    messages: list[dict],
    postcondition: str,
    line_number: int | None = None,
    context_before: str | None = None,
    context_after: str | None = None,
) -> dict[str, str | bool]:
    """Insert function postcondition (ensures).

    Args:
        messages: Message history
        postcondition: Postcondition expression
        line_number: Optional line number (1-indexed)
        context_before: Optional context before insertion
        context_after: Optional context after insertion

    Returns:
        Result dict with success, message, and code
    """
    return insert_hint(
        messages, "ensures", postcondition, line_number, context_before, context_after
    )


def insert_measure(
    messages: list[dict],
    measure: str,
    line_number: int | None = None,
    context_before: str | None = None,
    context_after: str | None = None,
) -> dict[str, str | bool]:
    """Insert termination measure (decreases).

    Args:
        messages: Message history
        measure: Decreases expression
        line_number: Optional line number (1-indexed)
        context_before: Optional context before insertion
        context_after: Optional context after insertion

    Returns:
        Result dict with success, message, and code
    """
    return insert_hint(
        messages, "decreases", measure, line_number, context_before, context_after
    )


# ===== Phase 4: Updated verify_dafny =====


def verify_dafny(messages: list[dict]) -> dict[str, str | bool]:
    """Verify current code state using Dafny compiler.

    Reads code from message history state, runs verification, returns
    results including full rendered code.

    Args:
        messages: Message history containing code state

    Returns:
        dict with keys:
            - success (bool): True if verification succeeded
            - message (str): Success/failure message
            - code (str): Full current code with all hints
            - stdout (str): Standard output from Dafny
            - stderr (str): Standard error from Dafny
    """
    # Get current code state
    code = get_code_state(messages)
    if code is None:
        return {
            "success": False,
            "message": "Error: No code state found. Insert hints first.",
            "code": "",
            "stdout": "",
            "stderr": "",
        }

    # Check for verification bypass attempts (following DafnyBench methodology)
    if "{:verify false}" in code.lower():
        return {
            "success": False,
            "message": "Invalid code: contains {:verify false} which bypasses verification. "
            "You must properly verify the code with correct annotations.",
            "code": code,
            "stdout": "",
            "stderr": "Bypass attempt detected",
        }

    # Write to temporary file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".dfy", delete=False) as tmp:
        tmp.write(code)
        temp_path = tmp.name

    try:
        # Run Dafny verification with timeout
        result = subprocess.run(
            ["dafny", "verify", temp_path],
            capture_output=True,
            text=True,
            timeout=30,
        )

        # Check for successful verification
        success = result.returncode == 0 and "0 errors" in result.stdout

        if success:
            message = "✓ Verification succeeded! All checks passed."
        else:
            error_output = result.stderr if result.stderr else result.stdout
            message = f"Verification failed:\n\n{error_output}"

        return {
            "success": success,
            "message": message,
            "code": code,  # Return full code
            "stdout": result.stdout,
            "stderr": result.stderr,
        }

    except subprocess.TimeoutExpired:
        return {
            "success": False,
            "message": "Verification timed out after 30 seconds. "
            "The program may be too complex or contain infinite loops.",
            "code": code,
            "stdout": "",
            "stderr": "Timeout",
        }
    finally:
        # Cleanup temporary file
        Path(temp_path).unlink(missing_ok=True)
The find_insertion_point function
The find_insertion_point function first checks for a line number. If it’s provided, it uses that. Otherwise, it uses the context to find the insertion point. If the context is ambiguous, it returns an error message to the agent.
This dual location specification makes the tools more robust and user-friendly for the agent. It’s a great example of how to design tool APIs with the agent in mind.
Dafny part 2: plain Python
Specialized Insertion Tools
Dafny part 2: plain Python
State Management via Message History