chore: initialize sandbox and overwrite remote content

2026-03-02 22:32:27 +08:00
commit a64378956a
584 changed files with 93604 additions and 0 deletions
--- a/examples/agent/browser_agent/build_in_helper/_file_download.py
+++ b/examples/agent/browser_agent/build_in_helper/_file_download.py
@@ -0,0 +1,238 @@
+# -*- coding: utf-8 -*-
+"""Standalone file download skill for the browser agent."""
+# flake8: noqa: E501
+# pylint: disable=W0212,W0107,too-many-lines,C0301
+
+from __future__ import annotations
+import os
+import copy
+from typing import Any
+from pydantic import BaseModel
+
+
+from agentscope.memory import InMemoryMemory
+from agentscope.message import Msg, TextBlock
+from agentscope.tool import ToolResponse
+from agentscope.agent import ReActAgent
+
+
+# Directory one level above the folder that contains this module; the prompt
+# templates are looked up relative to it.
+_CURRENT_DIR = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), os.pardir),
+)
+
+# Read the default system prompt of the download helper once, at import time.
+with open(
+    os.path.join(
+        _CURRENT_DIR,
+        "build_in_prompt/browser_agent_file_download_sys_prompt.md",
+    ),
+    mode="r",
+    encoding="utf-8",
+) as f:
+    _FILE_DOWNLOAD_AGENT_SYS_PROMPT = f.read()
+
+
+class EmptyModel(BaseModel):
+    """Empty structured model for default structured output requirement."""
+
+    # Declares no fields on purpose: ``file_download`` passes this class as
+    # ``structured_model`` when it calls the helper agent's ``reply``.
+
+
+class FileDownloadAgent(ReActAgent):
+    """ReAct helper agent dedicated to triggering a file download.
+
+    The helper borrows the model, formatter and toolkit of the browser agent
+    it is created from, keeps its own ``InMemoryMemory`` history and finishes
+    its run through ``file_download_final_response``.
+    """
+
+    finish_function_name: str = "file_download_final_response"
+
+    def __init__(
+        self,
+        browser_agent: Any,
+        sys_prompt: str = _FILE_DOWNLOAD_AGENT_SYS_PROMPT,
+        max_iters: int = 15,
+    ) -> None:
+        """Build the helper from an existing browser agent.
+
+        Args:
+            browser_agent: Agent whose ``model``, ``formatter`` and
+                ``toolkit`` are reused; its ``name`` (when present) prefixes
+                the helper's name.
+            sys_prompt: System prompt of the helper agent.
+            max_iters: Iteration limit forwarded to ``ReActAgent``.
+        """
+        parent_name = getattr(browser_agent, "name", "browser_agent")
+        super().__init__(
+            name=f"{parent_name}_file_download",
+            sys_prompt=sys_prompt,
+            model=browser_agent.model,
+            formatter=browser_agent.formatter,
+            memory=InMemoryMemory(),
+            toolkit=browser_agent.toolkit,
+            max_iters=max_iters,
+        )
+        # NOTE(review): the toolkit handed over above is the browser agent's
+        # own object, so the registration and removals below presumably
+        # affect the browser agent as well - confirm this is intended.
+        self.toolkit.register_tool_function(self.file_download_final_response)
+
+        # Drop the tools that conflict with this helper, when the toolkit
+        # supports removal at all.
+        if hasattr(self.toolkit, "remove_tool_function"):
+            for tool_name in ("browser_pdf_save", "file_download"):
+                try:
+                    self.toolkit.remove_tool_function(tool_name)
+                except Exception:
+                    # Tool may not exist, ignore removal errors
+                    pass
+
+    async def file_download_final_response(
+        self,  # pylint: disable=W0613
+        **kwargs: Any,  # pylint: disable=W0613
+    ) -> ToolResponse:
+        """Summarize the file download outcome."""
+        # The docstring above is left untouched: this method is registered as
+        # a tool, so its wording is presumably exposed to the model.
+        summary_request = Msg(
+            "user",
+            (
+                "Provide a concise summary of the file download attempt.\n"
+                "Highlight these items:\n"
+                "0. The original request\n"
+                "1. The element(s) interacted with and actions taken\n"
+                "2. The download status or any issues encountered\n"
+                "3. Any follow-up recommendations or next steps\n"
+            ),
+            role="user",
+        )
+
+        # Work on a deep copy so the agent's stored history is not modified.
+        history = copy.deepcopy(await self.memory.get_memory())
+        if history:
+            # Keep only the text blocks of the most recent message.
+            history[-1].content = history[-1].get_content_blocks("text")
+
+        conversation = [
+            Msg("system", self.sys_prompt, "system"),
+            *history,
+            summary_request,
+        ]
+        prompt = await self.formatter.format(msgs=conversation)
+
+        response = await self.model(prompt)
+
+        if not self.model.stream:
+            summary_text = response.content[0]["text"]
+        else:
+            # Each chunk overwrites the previous one; the last text wins.
+            summary_text = ""
+            async for chunk in response:
+                summary_text = chunk.content[0]["text"]
+
+        if not summary_text:
+            summary_text = "No summary generated."
+
+        structured_output = {
+            "task_done": True,
+            "subtask_progress_summary": summary_text,
+            "generated_files": {},
+        }
+        summary_block = TextBlock(
+            type="text",
+            text="File download summary generated. " + summary_text,
+        )
+
+        return ToolResponse(
+            content=[summary_block],
+            metadata={
+                "success": True,
+                "structured_output": structured_output,
+            },
+            is_last=True,
+        )
+
+
+def _build_initial_instruction(
+    target_description: str,
+    snapshot_text: str,
+) -> str:
+    """Assemble the first user instruction given to the download helper.
+
+    Args:
+        target_description: Description of the file to download.
+        snapshot_text: Text of the page snapshot taken before the run.
+
+    Returns:
+        The instruction text: task statement, target description, snapshot
+        and closing guidance, separated by blank lines.
+    """
+    sections = [
+        "You must locate and trigger the download for the requested file.",
+        f"Target description provided by the user:\n{target_description}",
+        f"Latest snapshot captured prior to your run:\n{snapshot_text}",
+        "Follow the sys prompt guidance, think step-by-step, and verify that "
+        "the download action succeeded. If the download cannot be completed, "
+        "explain why in the final summary.",
+    ]
+    return "\n\n".join(sections)
+
+
+async def file_download(
+    browser_agent: Any,
+    target_description: str,
+) -> ToolResponse:
+    """
+    Download the target file. The current page should
+    contain download-related element.
+
+    Args:
+        target_description (str): The description of the
+        target file to download.
+
+    Returns:
+        ToolResponse: The textual summary produced by the helper
+        agent together with the metadata of its reply, or an error
+        response with ``metadata={"success": False}`` when the helper
+        agent raises.
+    """
+    # ``browser_agent`` supplies the page snapshot and the model, formatter
+    # and toolkit reused by the helper agent. It is left out of ``Args`` on
+    # purpose - presumably the caller binds it before the function is exposed
+    # as a tool (NOTE(review): confirm against the registration code).
+
+    # A failed snapshot must not abort the download attempt: the helper then
+    # starts from a placeholder describing the failure.
+    try:
+        snapshot_chunks = await browser_agent._get_snapshot_in_text()
+    except Exception as exc:  # pylint: disable=broad-except
+        snapshot_chunks = []
+        snapshot_error = str(exc)
+    else:
+        snapshot_error = ""
+
+    # ``or []`` keeps ``join`` from failing if the snapshot call returns None.
+    snapshot_text = "\n\n---\n\n".join(snapshot_chunks or [])
+    if snapshot_error and not snapshot_text:
+        snapshot_text = f"[Snapshot failed: {snapshot_error}]"
+
+    sub_agent = FileDownloadAgent(browser_agent)
+    instruction = _build_initial_instruction(
+        target_description=target_description,
+        snapshot_text=snapshot_text,
+    )
+
+    init_msg = Msg(
+        name="user",
+        role="user",
+        content=instruction,
+    )
+
+    try:
+        sub_agent_response_msg = await sub_agent.reply(
+            init_msg,
+            structured_model=EmptyModel,
+        )
+
+        reply_content = sub_agent_response_msg.content
+        text_content = ""
+        if isinstance(reply_content, str):
+            # Message content can be a plain string (as ``init_msg`` above
+            # shows); indexing it would yield one character and lose the
+            # summary.
+            text_content = reply_content
+        elif reply_content:
+            first_block = reply_content[0]
+            if isinstance(first_block, dict):
+                text_content = first_block.get("text") or ""
+            else:
+                text_content = getattr(first_block, "text", "") or ""
+
+        if not text_content:
+            text_content = (
+                "File download agent finished without a textual summary."
+            )
+
+        return ToolResponse(
+            metadata=sub_agent_response_msg.metadata,
+            content=[
+                TextBlock(
+                    type="text",
+                    text=text_content,
+                ),
+            ],
+        )
+    except Exception as exc:  # pylint: disable=broad-except
+        # Report the failure as a tool result instead of raising.
+        return ToolResponse(
+            content=[
+                TextBlock(
+                    type="text",
+                    text=f"Tool call Error. Cannot be executed. {exc}",
+                ),
+            ],
+            metadata={"success": False},
+            is_last=True,
+        )
--- a/examples/agent/browser_agent/build_in_helper/_form_filling.py
+++ b/examples/agent/browser_agent/build_in_helper/_form_filling.py
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+"""Standalone form filling skill for the browser agent."""
+# flake8: noqa: E501
+# pylint: disable=W0212,W0107,too-many-lines,C0301
+
+from __future__ import annotations
+import os
+import copy
+from typing import Any
+from pydantic import BaseModel
+
+from agentscope.memory import InMemoryMemory
+from agentscope.message import Msg, TextBlock
+from agentscope.tool import ToolResponse
+from agentscope.agent import ReActAgent
+
+# Directory one level above the folder that contains this module; the prompt
+# templates are looked up relative to it.
+_CURRENT_DIR = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), os.pardir),
+)
+
+# Read the default system prompt of the form helper once, at import time.
+with open(
+    os.path.join(
+        _CURRENT_DIR,
+        "build_in_prompt/browser_agent_form_filling_sys_prompt.md",
+    ),
+    mode="r",
+    encoding="utf-8",
+) as f:
+    _FORM_FILL_AGENT_SYS_PROMPT = f.read()
+
+
+class EmptyModel(BaseModel):
+    """Empty structured model for default structured output requirement."""
+
+    # Declares no fields on purpose: ``form_filling`` passes this class as
+    # ``structured_model`` when it calls the helper agent's ``reply``.
+
+
+class FormFillingAgent(ReActAgent):
+    """ReAct helper agent that fills in a web form for the browser agent.
+
+    The helper borrows the model, formatter and toolkit of the browser agent
+    it is created from, keeps its own ``InMemoryMemory`` history and finishes
+    its run through ``form_filling_final_response``.
+    """
+
+    finish_function_name: str = "form_filling_final_response"
+
+    def __init__(
+        self,
+        browser_agent: Any,
+        sys_prompt: str = _FORM_FILL_AGENT_SYS_PROMPT,
+        max_iters: int = 20,
+    ) -> None:
+        """Build the helper from an existing browser agent.
+
+        Args:
+            browser_agent: Agent whose ``model``, ``formatter`` and
+                ``toolkit`` are reused; its ``name`` (when present) prefixes
+                the helper's name.
+            sys_prompt: System prompt of the helper agent.
+            max_iters: Iteration limit forwarded to ``ReActAgent``.
+        """
+        name = f"{getattr(browser_agent, 'name', 'browser_agent')}_form_fill"
+        super().__init__(
+            name=name,
+            sys_prompt=sys_prompt,
+            model=browser_agent.model,
+            formatter=browser_agent.formatter,
+            memory=InMemoryMemory(),
+            toolkit=browser_agent.toolkit,
+            max_iters=max_iters,
+        )
+        # Register the finish function
+        self.toolkit.register_tool_function(self.form_filling_final_response)
+
+    async def form_filling_final_response(
+        self,  # pylint: disable=W0613
+        **kwargs: Any,  # pylint: disable=W0613
+    ) -> ToolResponse:
+        """Summarize the form filling outcome."""
+        # The docstring above is left untouched: this method is registered as
+        # a tool, so its wording is presumably exposed to the model.
+        hint_msg = Msg(
+            "user",
+            (
+                "Provide a concise summary of the completed form "
+                "filling task.\n"
+                "Highlight these items:\n"
+                "0. The original task/query\n"
+                "1. Which fields were filled/selected and their final values\n"
+                "2. Any important observations or follow-up notes\n"
+                "3. Confirmation that if the task is complete\n\n"
+            ),
+            role="user",
+        )
+
+        # Work on a deep copy so the agent's stored history is not modified.
+        memory_msgs = await self.memory.get_memory()
+        memory_msgs_copy = copy.deepcopy(memory_msgs)
+        if memory_msgs_copy:
+            # Keep only the text blocks of the last message (it may carry a
+            # tool call). The guard avoids an IndexError on empty memory.
+            last_msg = memory_msgs_copy[-1]
+            last_msg.content = last_msg.get_content_blocks("text")
+
+        prompt = await self.formatter.format(
+            msgs=[
+                Msg("system", self.sys_prompt, "system"),
+                *memory_msgs_copy,
+                hint_msg,
+            ],
+        )
+
+        res = await self.model(prompt)
+
+        if self.model.stream:
+            # Each chunk overwrites the previous one; the last text wins.
+            summary_text = ""
+            async for chunk in res:
+                summary_text = chunk.content[0]["text"]
+        else:
+            summary_text = res.content[0]["text"]
+
+        # An empty or missing summary would otherwise break the string
+        # concatenation below or yield a blank report.
+        summary_text = summary_text or "No summary generated."
+
+        structure_response = {
+            "task_done": True,
+            "subtask_progress_summary": summary_text,
+            "generated_files": {},
+        }
+
+        return ToolResponse(
+            content=[
+                TextBlock(
+                    type="text",
+                    text="Form filling summary generated. " + summary_text,
+                ),
+            ],
+            metadata={
+                "success": True,
+                "structured_output": structure_response,
+            },
+            is_last=True,
+        )
+
+
+def _build_initial_instruction(
+    fill_information: str,
+    snapshot_text: str,
+) -> str:
+    """Assemble the first user instruction given to the form helper.
+
+    Args:
+        fill_information: Plain-text fill instructions from the user.
+        snapshot_text: Text of the page snapshot taken before the run.
+
+    Returns:
+        The instruction text: task statement, fill instructions and
+        snapshot, each followed by a blank line.
+    """
+    parts = [
+        "You must complete the web form using the information "
+        "provided below.",
+        f"Fill instructions (plain text from the user):\n{fill_information}",
+        f"Latest snapshot captured prior to your run:\n{snapshot_text}",
+    ]
+    # Every section, including the last one, ends with a blank line.
+    return "".join(f"{part}\n\n" for part in parts)
+
+
+async def form_filling(
+    browser_agent: Any,
+    fill_information: str,
+) -> ToolResponse:
+    """
+    Fill in a web form according to plain-text instructions.
+
+    Args:
+        fill_information (str):
+            Plain-text description of the values that
+            must be entered into the form,
+            including any submission requirements.
+
+    Returns:
+        ToolResponse: Summary of the helper agent execution and status.
+    """
+    # ``browser_agent`` supplies the page snapshot and the model, formatter
+    # and toolkit reused by the helper agent. It is left out of ``Args`` on
+    # purpose - presumably the caller binds it before the function is exposed
+    # as a tool (NOTE(review): confirm against the registration code).
+
+    # A failed snapshot must not abort the task: the helper then starts from
+    # a placeholder describing the failure.
+    try:
+        snapshot_chunks = await browser_agent._get_snapshot_in_text()  # pylint: disable=protected-access
+    except Exception as exc:  # pylint: disable=broad-except
+        snapshot_chunks = []
+        snapshot_error = str(exc)
+    else:
+        snapshot_error = ""
+
+    # ``or []`` keeps ``join`` from failing if the snapshot call returns None.
+    snapshot_text = "\n\n---\n\n".join(snapshot_chunks or [])
+    if snapshot_error and not snapshot_text:
+        snapshot_text = f"[Snapshot failed: {snapshot_error}]"
+
+    sub_agent = FormFillingAgent(browser_agent)
+    instruction = _build_initial_instruction(
+        fill_information=fill_information,
+        snapshot_text=snapshot_text,
+    )
+
+    init_msg = Msg(
+        name="user",
+        role="user",
+        content=instruction,
+    )
+
+    try:
+        sub_agent_response_msg = await sub_agent.reply(
+            init_msg,
+            structured_model=EmptyModel,
+        )
+
+        reply_content = sub_agent_response_msg.content
+        text_content = ""
+        if isinstance(reply_content, str):
+            # Message content can be a plain string (as ``init_msg`` above
+            # shows); indexing it would yield one character and lose the
+            # summary.
+            text_content = reply_content
+        elif reply_content:
+            first_block = reply_content[0]
+            if isinstance(first_block, dict):
+                text_content = first_block.get("text") or ""
+            else:
+                text_content = getattr(first_block, "text", "") or ""
+
+        if not text_content:
+            text_content = (
+                "Form filling agent finished without a textual summary."
+            )
+
+        return ToolResponse(
+            metadata=sub_agent_response_msg.metadata,
+            content=[
+                TextBlock(
+                    type="text",
+                    text=text_content,
+                ),
+            ],
+        )
+    except Exception as e:  # pylint: disable=broad-except
+        # Report the failure as a tool result instead of raising.
+        return ToolResponse(
+            content=[
+                TextBlock(
+                    type="text",
+                    text=f"Tool call Error. Cannot be executed. {e}",
+                ),
+            ],
+            metadata={"success": False},
+            is_last=True,
+        )
--- a/examples/agent/browser_agent/build_in_helper/_image_understanding.py
+++ b/examples/agent/browser_agent/build_in_helper/_image_understanding.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+"""Standalone image understanding skill for the browser agent."""
+# flake8: noqa: E501
+# pylint: disable=W0212
+# pylint: disable=too-many-lines
+# pylint: disable=C0301
+from __future__ import annotations
+
+import json
+import uuid
+from typing import Any
+
+from agentscope.message import (
+    Base64Source,
+    ImageBlock,
+    Msg,
+    TextBlock,
+    ToolUseBlock,
+)
+from agentscope.tool import ToolResponse
+
+
+async def _collect_model_text(model: Any, prompt: Any) -> str:
+    """Call ``model`` with ``prompt`` and return the text of its answer.
+
+    In streaming mode the text of the last chunk is returned; an empty
+    string is returned when the stream yields no chunk at all, so callers
+    never see an unbound result.
+    """
+    res = await model(prompt)
+    if not model.stream:
+        return res.content[0]["text"]
+
+    text = ""
+    async for chunk in res:
+        text = chunk.content[0]["text"]
+    return text
+
+
+async def image_understanding(
+    browser_agent: Any,
+    object_description: str,
+    task: str,
+) -> ToolResponse:
+    """
+    Locate an element and solve a visual task on the current webpage.
+
+    Args:
+        object_description (str): The description of the object to locate.
+        task (str): The specific task or question to solve about the image
+        (e.g., description, object detection, activity recognition, or
+        answering a question about the image's content).
+
+    Returns:
+        ToolResponse: A structured response containing the answer to
+        the specified task based on the image content.
+    """
+    # Flow: (1) ask the model which element/ref matches the description,
+    # (2) screenshot that element through the ``browser_take_screenshot``
+    # tool, (3) ask the model to solve ``task`` using the screenshot and
+    # the page snapshot.
+
+    sys_prompt = (
+        "You are a web page analysis expert. Given the following page "
+        "snapshot and object description, "
+        "identify the exact element and its reference string (ref) "
+        "that matches the description. "
+        "Return ONLY a JSON object: "
+        '{"element": <element description>, "ref": <ref string>}'
+    )
+
+    snapshot_chunks = (
+        await browser_agent._get_snapshot_in_text()  # noqa: E501 # pylint: disable=protected-access
+    )
+    # Only the first snapshot chunk is given to the model.
+    page_snapshot = snapshot_chunks[0] if snapshot_chunks else ""
+    user_prompt = (
+        f"Object description: {object_description}\n"
+        f"Page snapshot:\n{page_snapshot}"
+    )
+
+    prompt = await browser_agent.formatter.format(
+        msgs=[
+            Msg("system", sys_prompt, role="system"),
+            Msg("user", user_prompt, role="user"),
+        ],
+    )
+    model_text = await _collect_model_text(browser_agent.model, prompt)
+
+    try:
+        # Drop a Markdown code fence around the JSON, if the model added one.
+        if "```json" in model_text:
+            model_text = model_text.replace("```json", "").replace(
+                "```",
+                "",
+            )
+        element_info = json.loads(model_text)
+        element = element_info.get("element", "")
+        ref = element_info.get("ref", "")
+    except Exception:  # pylint: disable=broad-except
+        # Covers invalid JSON (including an empty answer) and JSON that is
+        # not an object.
+        return ToolResponse(
+            content=[
+                TextBlock(
+                    type="text",
+                    text="Failed to parse element/ref from model output.",
+                ),
+            ],
+            metadata={"success": False},
+        )
+
+    screenshot_tool_call = ToolUseBlock(
+        id=str(uuid.uuid4()),
+        name="browser_take_screenshot",
+        input={"element": element, "ref": ref},
+        type="tool_use",
+    )
+    screenshot_response = await browser_agent.toolkit.call_tool_function(
+        screenshot_tool_call,
+    )
+    # The image payload is read from the second content block of a chunk;
+    # the last chunk that carries one wins.
+    image_data = None
+    async for chunk in screenshot_response:
+        if (
+            chunk.content
+            and len(chunk.content) > 1
+            and "data" in chunk.content[1]
+        ):
+            image_data = chunk.content[1]["data"]
+
+    sys_prompt_task = (
+        "You are a web automation expert. "
+        "Given the object description, screenshot, and page context, "
+        "solve the following task. Return ONLY the answer as plain text."
+    )
+    content_blocks = [
+        TextBlock(
+            type="text",
+            text=(
+                "Object description: "
+                f"{object_description}\nTask: {task}\n"
+                f"Page snapshot:\n{page_snapshot}"
+            ),
+        ),
+    ]
+
+    # Without a screenshot the task is still attempted from the text alone.
+    if image_data:
+        image_block = ImageBlock(
+            type="image",
+            source=Base64Source(
+                type="base64",
+                media_type="image/png",
+                data=image_data,
+            ),
+        )
+        content_blocks.append(image_block)
+
+    prompt_task = await browser_agent.formatter.format(
+        msgs=[
+            Msg("system", sys_prompt_task, role="system"),
+            Msg("user", content_blocks, role="user"),
+        ],
+    )
+    # The helper returns "" when a streamed response yields no chunk, which
+    # previously left ``answer_text`` unbound at the return below.
+    answer_text = await _collect_model_text(browser_agent.model, prompt_task)
+
+    return ToolResponse(
+        content=[
+            TextBlock(
+                type="text",
+                text=(
+                    f"Screenshot taken for element: {element}\nref: {ref}\n"
+                    f"Task solution: {answer_text}"
+                ),
+            ),
+        ],
+    )
--- a/examples/agent/browser_agent/build_in_helper/_video_understanding.py
+++ b/examples/agent/browser_agent/build_in_helper/_video_understanding.py
@@ -0,0 +1,330 @@
+# -*- coding: utf-8 -*-
+"""Standalone video understanding skill for the browser agent."""
+# flake8: noqa: E501
+# pylint: disable=W0212
+# pylint: disable=too-many-lines
+# pylint: disable=C0301
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+import tempfile
+import uuid
+from base64 import b64encode
+from pathlib import Path
+from typing import Any, List, Optional
+
+from agentscope.message import (
+    Base64Source,
+    ImageBlock,
+    Msg,
+    TextBlock,
+)
+from agentscope.tool import ToolResponse
+
+
+async def video_understanding(
+    browser_agent: Any,
+    video_path: str,
+    task: str,
+) -> ToolResponse:
+    """
+    Perform video understanding on the provided video file.
+
+    Args:
+        video_path (str): The path to the video file to analyze.
+        task (str): The specific task or question to solve about
+        the video (e.g., summary, object detection, activity recognition,
+        or answering a question about the video's content).
+
+    Returns:
+        ToolResponse: A structured response containing the answer
+        to the specified task based on the video content.
+    """
+    # Flow: extract frames and the audio track with ffmpeg, transcribe the
+    # audio, then ask the model to solve ``task`` from frames + transcript.
+    # Frames are mandatory; audio is best effort (see below).
+
+    workdir = _prepare_workdir(browser_agent)
+    try:
+        frames_dir = os.path.join(workdir, "frames")
+        frames = extract_frames(video_path, frames_dir)
+    except Exception as exc:  # pylint: disable=broad-except
+        return _error_response(f"Failed to extract frames: {exc}")
+
+    audio_path = os.path.join(
+        workdir,
+        f"audio_{getattr(browser_agent, 'iter_n', 0)}.wav",
+    )
+
+    # A video without a usable audio track, or an unavailable transcription
+    # backend, must not abort the analysis: the frames alone may be enough.
+    # The failure is told to the model and reported in the result.
+    transcript = ""
+    audio_warning = ""
+    try:
+        extract_audio(video_path, audio_path)
+    except Exception as exc:  # pylint: disable=broad-except
+        audio_warning = f"Failed to extract audio: {exc}"
+    else:
+        try:
+            transcript = audio2text(audio_path)
+        except Exception as exc:  # pylint: disable=broad-except
+            audio_warning = f"Failed to transcribe audio: {exc}"
+    if audio_warning:
+        transcript = f"[Transcript unavailable. {audio_warning}]"
+
+    sys_prompt = (
+        "You are a web video analysis expert. "
+        "Given the following video frames and audio transcript, "
+        "analyze the content and provide a solution to the task. "
+        'Return ONLY a JSON object: {"answer": <your answer>}'
+    )
+
+    content_blocks = _build_multimodal_blocks(frames, transcript, task)
+
+    prompt = await browser_agent.formatter.format(
+        msgs=[
+            Msg("system", sys_prompt, role="system"),
+            Msg("user", content_blocks, role="user"),
+        ],
+    )
+
+    res = await browser_agent.model(prompt)
+    # Stays empty when a streamed response yields no chunk; the JSON parsing
+    # below then reports the failure.
+    model_text = ""
+    if browser_agent.model.stream:
+        async for chunk in res:
+            model_text = chunk.content[0]["text"]
+    else:
+        model_text = res.content[0]["text"]
+
+    try:
+        # Drop a Markdown code fence around the JSON, if the model added one.
+        if "```json" in model_text:
+            model_text = model_text.replace("```json", "").replace(
+                "```",
+                "",
+            )
+        answer_info = json.loads(model_text)
+        answer = answer_info.get("answer", "")
+    except Exception:  # pylint: disable=broad-except
+        return _error_response("Failed to parse answer from model output.")
+
+    result_text = "Video analysis completed.\n" f"Task solution: {answer}"
+    if audio_warning:
+        result_text += f"\nNote: {audio_warning}"
+
+    return ToolResponse(
+        content=[
+            TextBlock(
+                type="text",
+                text=result_text,
+            ),
+        ],
+    )
+
+
+def audio2text(audio_path: str) -> str:
+    """Transcribe a wav file with the DashScope ``Recognition`` API.
+
+    Args:
+        audio_path: Path of the audio file handed to the recognizer.
+
+    Returns:
+        The ``text`` fields of the recognized sentences joined by spaces.
+
+    Raises:
+        RuntimeError: If the ``dashscope`` audio module cannot be imported.
+    """
+
+    # Imported lazily so the module can be loaded without dashscope.
+    try:
+        from dashscope.audio.asr import Recognition, RecognitionCallback
+    except ImportError as exc:
+        raise RuntimeError(
+            "dashscope.audio is required for audio transcription.",
+        ) from exc
+
+    # The wav settings match what ``extract_audio`` writes (16 kHz).
+    recognizer = Recognition(
+        model="paraformer-realtime-v1",
+        format="wav",
+        sample_rate=16000,
+        callback=RecognitionCallback(),
+    )
+    response = recognizer.call(audio_path)
+
+    # NOTE(review): assumes the response is dict-like with
+    # ``output.sentence`` holding a list of dicts - confirm against the
+    # DashScope SDK.
+    recognized = response.get("output", {}).get("sentence", [])
+    texts = [item.get("text", "") for item in recognized]
+    return " ".join(texts)
+
+
+def extract_frames(
+    video_path: str,
+    output_dir: str,
+    max_frames: int = 16,
+) -> List[str]:
+    """Sample up to ``max_frames`` JPEG frames from a video with ffmpeg.
+
+    The sampling rate is ``max_frames / duration`` when the duration can be
+    probed and is positive, otherwise 1 frame per second; in both cases it
+    is clamped to the range 0.1 to 30 fps.
+
+    Args:
+        video_path: Path of the video file to read.
+        output_dir: Directory receiving ``frame_0001.jpg`` and so on; it is
+            created when missing and old ``frame_*.jpg`` files are removed.
+        max_frames: Upper bound on the number of frames written.
+
+    Returns:
+        Sorted paths of the ``frame_*.jpg`` files found in ``output_dir``.
+
+    Raises:
+        ValueError: If ``max_frames`` is not positive.
+        FileNotFoundError: If ``video_path`` does not exist.
+        RuntimeError: If ffmpeg is not installed or no frame was produced.
+        subprocess.CalledProcessError: If ffmpeg exits with an error.
+    """
+
+    if max_frames <= 0:
+        raise ValueError("max_frames must be greater than zero.")
+    if not os.path.exists(video_path):
+        raise FileNotFoundError(f"Video path not found: {video_path}")
+
+    os.makedirs(output_dir, exist_ok=True)
+    target_dir = Path(output_dir)
+
+    # Remove frames left by an earlier run so they are not returned again.
+    for stale_frame in target_dir.glob("frame_*.jpg"):
+        try:
+            stale_frame.unlink()
+        except OSError:
+            # Best-effort cleanup: a file that cannot be removed is either
+            # overwritten by ffmpeg or left in place.
+            pass
+
+    duration = _probe_video_duration(video_path)
+    rate = max_frames / duration if duration and duration > 0 else 1.0
+    rate = min(max(rate, 0.1), 30.0)
+
+    ffmpeg_cmd = [
+        "ffmpeg",
+        "-y",
+        "-i",
+        video_path,
+        "-vf",
+        f"fps={rate:.5f}",
+        "-frames:v",
+        str(max_frames),
+        os.path.join(output_dir, "frame_%04d.jpg"),
+    ]
+
+    try:
+        subprocess.run(
+            ffmpeg_cmd,
+            check=True,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+    except FileNotFoundError as exc:
+        # Raised when the ffmpeg executable itself cannot be found.
+        raise RuntimeError(
+            "ffmpeg is required to extract frames from video.",
+        ) from exc
+
+    extracted = sorted(map(str, target_dir.glob("frame_*.jpg")))
+    if not extracted:
+        raise RuntimeError("No frames could be extracted from the video.")
+
+    return extracted
+
+
+def extract_audio(video_path: str, audio_path: str) -> str:
+    """Extract the audio track with ffmpeg and save it as a wav file.
+
+    The audio is written as 16-bit PCM, 16 kHz, mono.
+
+    Args:
+        video_path: Path of the video file to read.
+        audio_path: Destination wav path; its parent directory is created
+            when missing. A bare file name (no directory part) is written
+            to the current working directory.
+
+    Returns:
+        ``audio_path``, unchanged.
+
+    Raises:
+        FileNotFoundError: If ``video_path`` does not exist.
+        RuntimeError: If ffmpeg is not installed.
+        subprocess.CalledProcessError: If ffmpeg exits with an error.
+    """
+
+    if not os.path.exists(video_path):
+        raise FileNotFoundError(f"Video path not found: {video_path}")
+
+    # ``os.makedirs("")`` raises, so only create a directory when the target
+    # path actually has a directory component.
+    parent_dir = os.path.dirname(audio_path)
+    if parent_dir:
+        os.makedirs(parent_dir, exist_ok=True)
+
+    command = [
+        "ffmpeg",
+        "-y",
+        "-i",
+        video_path,
+        "-vn",
+        "-acodec",
+        "pcm_s16le",
+        "-ar",
+        "16000",
+        "-ac",
+        "1",
+        audio_path,
+    ]
+
+    try:
+        subprocess.run(
+            command,
+            check=True,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+    except FileNotFoundError as exc:
+        # Raised when the ffmpeg executable itself cannot be found.
+        raise RuntimeError(
+            "ffmpeg is required to extract audio from video.",
+        ) from exc
+
+    return audio_path
+
+
+def _probe_video_duration(video_path: str) -> Optional[float]:
+    """Ask ffprobe for the duration of a video, in seconds.
+
+    Args:
+        video_path: Path of the video file to inspect.
+
+    Returns:
+        The duration as a float, or ``None`` when ffprobe is not installed,
+        exits with an error, prints nothing, or prints a non-numeric value.
+    """
+
+    ffprobe_cmd = [
+        "ffprobe",
+        "-v",
+        "error",
+        "-show_entries",
+        "format=duration",
+        "-of",
+        "default=noprint_wrappers=1:nokey=1",
+        video_path,
+    ]
+
+    try:
+        completed = subprocess.run(
+            ffprobe_cmd,
+            check=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
+            text=True,
+        )
+        raw_duration = completed.stdout.strip()
+        return float(raw_duration) if raw_duration else None
+    except (FileNotFoundError, ValueError, subprocess.CalledProcessError):
+        # The caller treats a missing duration as "unknown".
+        return None
+
+
+def _build_multimodal_blocks(
+    frames: List[str],
+    transcript: str,
+    task: str,
+) -> list:
+    """Build the user content sent to the model for a video task.
+
+    Args:
+        frames: Paths of the frame images, in the order they are sent.
+        transcript: Audio transcript text.
+        task: The task or question to solve.
+
+    Returns:
+        One base64 JPEG ``ImageBlock`` per frame, followed by a text block
+        with the transcript and a text block with the task.
+    """
+
+    blocks: list = []
+    for frame_path in frames:
+        encoded = b64encode(Path(frame_path).read_bytes()).decode("ascii")
+        blocks.append(
+            ImageBlock(
+                type="image",
+                source=Base64Source(
+                    type="base64",
+                    media_type="image/jpeg",
+                    data=encoded,
+                ),
+            ),
+        )
+
+    blocks.extend(
+        [
+            TextBlock(
+                type="text",
+                text=f"Audio transcript:\n{transcript}",
+            ),
+            TextBlock(
+                type="text",
+                text=f"The task to be solved is: {task}",
+            ),
+        ],
+    )
+    return blocks
+
+
+def _prepare_workdir(browser_agent: Any) -> str:
+    """Create a fresh directory for the intermediate files of one analysis.
+
+    Args:
+        browser_agent: Object whose ``state_saving_dir`` attribute, when set
+            and non-empty, is used as the base directory; otherwise the
+            system temporary directory is used.
+
+    Returns:
+        Path of the new ``<base>/video_understanding/<random hex>`` folder.
+    """
+
+    root = (
+        getattr(browser_agent, "state_saving_dir", None)
+        or tempfile.gettempdir()
+    )
+    # A random sub-folder keeps the artifacts of separate calls apart.
+    run_dir = os.path.join(root, "video_understanding", uuid.uuid4().hex)
+    os.makedirs(run_dir, exist_ok=True)
+    return run_dir
+
+
+def _error_response(message: str) -> ToolResponse:
+    """Wrap an error message in a failed ``ToolResponse``.
+
+    Args:
+        message: Text placed in the single text block of the response.
+
+    Returns:
+        A ``ToolResponse`` whose metadata is ``{"success": False}``.
+    """
+
+    error_block = TextBlock(type="text", text=message)
+    return ToolResponse(
+        content=[error_block],
+        metadata={"success": False},
+    )