chore: initialize sandbox and overwrite remote content

2026-03-02 22:32:27 +08:00
commit a64378956a
584 changed files with 93604 additions and 0 deletions
--- a/examples/agent/browser_agent/README.md
+++ b/examples/agent/browser_agent/README.md
@@ -0,0 +1,49 @@
+# Browser Agent Example
+
+This example demonstrates how to use AgentScope's BrowserAgent for web automation tasks. The BrowserAgent leverages the Model Context Protocol (MCP) to interact with browser tools powered by Playwright, enabling sophisticated web navigation, data extraction, and automation.
+
+
+## Prerequisites
+
+- Python 3.10 or higher
+- Node.js and npm (for the MCP server)
+- DashScope API key from Alibaba Cloud
+
+## Installation
+
+### Install AgentScope
+
+```bash
+# Install from source
+cd {PATH_TO_AGENTSCOPE}
+pip install -e .
+```
+
+## Setup
+
+### 1. Environment Configuration
+
+Set up your DashScope API key:
+
+```bash
+export DASHSCOPE_API_KEY="your_dashscope_api_key_here"
+```
+
+You can obtain a DashScope API key from [Alibaba Cloud DashScope Console](https://dashscope.console.aliyun.com/).
+
+### 2. About PlayWright MCP Server
+
+Before running the browser agent, you can test whether you can start the Playwright MCP server:
+
+```bash
+npx @playwright/mcp@latest
+```
+
+## Usage
+
+### Basic Example
+You can start running the browser agent in your terminal with the following command
+```bash
+cd examples/agent/browser_agent
+python main.py
+```
--- a/examples/agent/browser_agent/browser_agent.py
+++ b/examples/agent/browser_agent/browser_agent.py
--- a/examples/agent/browser_agent/build_in_helper/_file_download.py
+++ b/examples/agent/browser_agent/build_in_helper/_file_download.py
@@ -0,0 +1,238 @@
+# -*- coding: utf-8 -*-
+"""Standalone file download skill for the browser agent."""
+# flake8: noqa: E501
+# pylint: disable=W0212,W0107,too-many-lines,C0301
+
+from __future__ import annotations
+import os
+import copy
+from typing import Any
+from pydantic import BaseModel
+
+
+from agentscope.memory import InMemoryMemory
+from agentscope.message import Msg, TextBlock
+from agentscope.tool import ToolResponse
+from agentscope.agent import ReActAgent
+
+
+_CURRENT_DIR = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), os.pardir),
+)
+
+with open(
+    os.path.join(
+        _CURRENT_DIR,
+        "build_in_prompt/browser_agent_file_download_sys_prompt.md",
+    ),
+    "r",
+    encoding="utf-8",
+) as f:
+    _FILE_DOWNLOAD_AGENT_SYS_PROMPT = f.read()
+
+
+class EmptyModel(BaseModel):
+    """Empty structured model for default structured output requirement."""
+
+    pass
+
+
+class FileDownloadAgent(ReActAgent):
+    """Lightweight helper agent that downloads files"""
+
+    finish_function_name: str = "file_download_final_response"
+
+    def __init__(
+        self,
+        browser_agent: Any,
+        sys_prompt: str = _FILE_DOWNLOAD_AGENT_SYS_PROMPT,
+        max_iters: int = 15,
+    ) -> None:
+        name = (
+            f"{getattr(browser_agent, 'name', 'browser_agent')}_file_download"
+        )
+        super().__init__(
+            name=name,
+            sys_prompt=sys_prompt,
+            model=browser_agent.model,
+            formatter=browser_agent.formatter,
+            memory=InMemoryMemory(),
+            toolkit=browser_agent.toolkit,
+            max_iters=max_iters,
+        )
+        # Register the finish function
+        self.toolkit.register_tool_function(self.file_download_final_response)
+        # Remove conflicting tool functions if they exist
+        if hasattr(self.toolkit, "remove_tool_function"):
+            try:
+                self.toolkit.remove_tool_function("browser_pdf_save")
+            except Exception:
+                # Tool may not exist, ignore removal errors
+                pass
+            try:
+                self.toolkit.remove_tool_function("file_download")
+            except Exception:
+                # Tool may not exist, ignore removal errors
+                pass
+
+    async def file_download_final_response(
+        self,  # pylint: disable=W0613
+        **kwargs: Any,  # pylint: disable=W0613
+    ) -> ToolResponse:
+        """Summarize the file download outcome."""
+        hint_msg = Msg(
+            "user",
+            (
+                "Provide a concise summary of the file download attempt.\n"
+                "Highlight these items:\n"
+                "0. The original request\n"
+                "1. The element(s) interacted with and actions taken\n"
+                "2. The download status or any issues encountered\n"
+                "3. Any follow-up recommendations or next steps\n"
+            ),
+            role="user",
+        )
+
+        memory_msgs = await self.memory.get_memory()
+        memory_msgs_copy = copy.deepcopy(memory_msgs)
+        if memory_msgs_copy:
+            last_msg = memory_msgs_copy[-1]
+            last_msg.content = last_msg.get_content_blocks("text")
+            memory_msgs_copy[-1] = last_msg
+
+        prompt = await self.formatter.format(
+            msgs=[
+                Msg("system", self.sys_prompt, "system"),
+                *memory_msgs_copy,
+                hint_msg,
+            ],
+        )
+
+        res = await self.model(prompt)
+
+        if self.model.stream:
+            summary_text = ""
+            async for chunk in res:
+                summary_text = chunk.content[0]["text"]
+        else:
+            summary_text = res.content[0]["text"]
+
+        summary_text = summary_text or "No summary generated."
+
+        structure_response = {
+            "task_done": True,
+            "subtask_progress_summary": summary_text,
+            "generated_files": {},
+        }
+
+        return ToolResponse(
+            content=[
+                TextBlock(
+                    type="text",
+                    text="File download summary generated. " + summary_text,
+                ),
+            ],
+            metadata={
+                "success": True,
+                "structured_output": structure_response,
+            },
+            is_last=True,
+        )
+
+
+def _build_initial_instruction(
+    target_description: str,
+    snapshot_text: str,
+) -> str:
+    """Compose the initial instruction for the helper agent."""
+    return (
+        "You must locate and trigger the download for the requested file.\n\n"
+        "Target description provided by the user:\n"
+        f"{target_description}\n\n"
+        "Latest snapshot captured prior to your run:\n"
+        f"{snapshot_text}\n\n"
+        "Follow the sys prompt guidance, think step-by-step, and verify that "
+        "the download action succeeded. If the download cannot be completed, "
+        "explain why in the final summary."
+    )
+
+
+async def file_download(
+    browser_agent: Any,
+    target_description: str,
+) -> ToolResponse:
+    """
+    Download the target file. The current page should
+    contain download-related element.
+
+    Args:
+        target_description (str): The description of the
+        target file to download.
+
+    Returns:
+        ToolResponse: A structured response containing
+        the download directory.
+    """
+    try:
+        snapshot_chunks = await browser_agent._get_snapshot_in_text()
+    except Exception as exc:  # pylint: disable=broad-except
+        snapshot_chunks = []
+        snapshot_error = str(exc)
+    else:
+        snapshot_error = ""
+
+    snapshot_text = "\n\n---\n\n".join(snapshot_chunks)
+    if snapshot_error and not snapshot_text:
+        snapshot_text = f"[Snapshot failed: {snapshot_error}]"
+
+    sub_agent = FileDownloadAgent(browser_agent)
+    instruction = _build_initial_instruction(
+        target_description=target_description,
+        snapshot_text=snapshot_text,
+    )
+
+    init_msg = Msg(
+        name="user",
+        role="user",
+        content=instruction,
+    )
+
+    try:
+        sub_agent_response_msg = await sub_agent.reply(
+            init_msg,
+            structured_model=EmptyModel,
+        )
+
+        text_content = ""
+        if sub_agent_response_msg.content:
+            first_block = sub_agent_response_msg.content[0]
+            if isinstance(first_block, dict):
+                text_content = first_block.get("text") or ""
+            else:
+                text_content = getattr(first_block, "text", "") or ""
+
+        if not text_content:
+            text_content = (
+                "File download agent finished without a textual summary."
+            )
+
+        return ToolResponse(
+            metadata=sub_agent_response_msg.metadata,
+            content=[
+                TextBlock(
+                    type="text",
+                    text=text_content,
+                ),
+            ],
+        )
+    except Exception as exc:  # pylint: disable=broad-except
+        return ToolResponse(
+            content=[
+                TextBlock(
+                    type="text",
+                    text=f"Tool call Error. Cannot be executed. {exc}",
+                ),
+            ],
+            metadata={"success": False},
+            is_last=True,
+        )
--- a/examples/agent/browser_agent/build_in_helper/_form_filling.py
+++ b/examples/agent/browser_agent/build_in_helper/_form_filling.py
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+"""Standalone form filling skill for the browser agent."""
+# flake8: noqa: E501
+# pylint: disable=W0212,W0107,too-many-lines,C0301
+
+from __future__ import annotations
+import os
+import copy
+from typing import Any
+from pydantic import BaseModel
+
+from agentscope.memory import InMemoryMemory
+from agentscope.message import Msg, TextBlock
+from agentscope.tool import ToolResponse
+from agentscope.agent import ReActAgent
+
+_CURRENT_DIR = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), os.pardir),
+)
+
+with open(
+    os.path.join(
+        _CURRENT_DIR,
+        "build_in_prompt/browser_agent_form_filling_sys_prompt.md",
+    ),
+    "r",
+    encoding="utf-8",
+) as f:
+    _FORM_FILL_AGENT_SYS_PROMPT = f.read()
+
+
+class EmptyModel(BaseModel):
+    """Empty structured model for default structured output requirement."""
+
+    pass
+
+
+class FormFillingAgent(ReActAgent):
+    """Lightweight helper agent that fills forms."""
+
+    finish_function_name: str = "form_filling_final_response"
+
+    def __init__(
+        self,
+        browser_agent: Any,
+        sys_prompt: str = _FORM_FILL_AGENT_SYS_PROMPT,
+        max_iters: int = 20,
+    ) -> None:
+        name = f"{getattr(browser_agent, 'name', 'browser_agent')}_form_fill"
+        super().__init__(
+            name=name,
+            sys_prompt=sys_prompt,
+            model=browser_agent.model,
+            formatter=browser_agent.formatter,
+            memory=InMemoryMemory(),
+            toolkit=browser_agent.toolkit,
+            max_iters=max_iters,
+        )
+        # Register the finish function
+        self.toolkit.register_tool_function(self.form_filling_final_response)
+
+    async def form_filling_final_response(
+        self,  # pylint: disable=W0613
+        **kwargs: Any,  # pylint: disable=W0613
+    ) -> ToolResponse:
+        """Summarize the form filling outcome."""
+        hint_msg = Msg(
+            "user",
+            (
+                "Provide a concise summary of the completed form "
+                "filling task.\n"
+                "Highlight these items:\n"
+                "0. The original task/query\n"
+                "1. Which fields were filled/selected and their final values\n"
+                "2. Any important observations or follow-up notes\n"
+                "3. Confirmation that if the task is complete\n\n"
+            ),
+            role="user",
+        )
+
+        memory_msgs = await self.memory.get_memory()
+        memory_msgs_copy = copy.deepcopy(memory_msgs)
+        last_msg = memory_msgs_copy[-1]
+        # check if the last message has tool call, if so clean the content
+
+        last_msg.content = last_msg.get_content_blocks("text")
+        memory_msgs_copy[-1] = last_msg
+
+        prompt = await self.formatter.format(
+            msgs=[
+                Msg("system", self.sys_prompt, "system"),
+                *memory_msgs_copy,
+                hint_msg,
+            ],
+        )
+
+        res = await self.model(prompt)
+
+        if self.model.stream:
+            summary_text = ""
+            async for chunk in res:
+                summary_text = chunk.content[0]["text"]
+        else:
+            summary_text = res.content[0]["text"]
+
+        structure_response = {
+            "task_done": True,
+            "subtask_progress_summary": summary_text,
+            "generated_files": {},
+        }
+
+        return ToolResponse(
+            content=[
+                TextBlock(
+                    type="text",
+                    text="Form filling summary generated. " + summary_text,
+                ),
+            ],
+            metadata={
+                "success": True,
+                "structured_output": structure_response,
+            },
+            is_last=True,
+        )
+
+
+def _build_initial_instruction(
+    fill_information: str,
+    snapshot_text: str,
+) -> str:
+    """Compose the initial instruction fed to the helper agent."""
+    return (
+        "You must complete the web form using the information "
+        "provided below.\n\nFill instructions (plain text from the user):\n"
+        f"{fill_information}\n\n"
+        "Latest snapshot captured prior to your run:\n"
+        f"{snapshot_text}\n\n"
+    )
+
+
+async def form_filling(
+    browser_agent: Any,
+    fill_information: str,
+) -> ToolResponse:
+    """
+    Fill in a web form according to plain-text instructions.
+
+    Args:
+        fill_information (str):
+            Plain-text description of the values that
+            must be entered into the form,
+            including any submission requirements.
+
+    Returns:
+        ToolResponse: Summary of the helper agent execution and status.
+    """
+    try:
+        snapshot_chunks = (
+            await browser_agent._get_snapshot_in_text()
+        )  # pylint: disable=protected-access
+    except Exception as exc:  # pylint: disable=broad-except
+        snapshot_chunks = []
+        snapshot_error = str(exc)
+    else:
+        snapshot_error = ""
+
+    snapshot_text = "\n\n---\n\n".join(snapshot_chunks)
+    if snapshot_error and not snapshot_text:
+        snapshot_text = f"[Snapshot failed: {snapshot_error}]"
+
+    sub_agent = FormFillingAgent(browser_agent)
+    instruction = _build_initial_instruction(
+        fill_information=fill_information,
+        snapshot_text=snapshot_text,
+    )
+
+    init_msg = Msg(
+        name="user",
+        role="user",
+        content=instruction,
+    )
+
+    try:
+        sub_agent_response_msg = await sub_agent.reply(
+            init_msg,
+            structured_model=EmptyModel,
+        )
+
+        text_content = ""
+        if sub_agent_response_msg.content:
+            first_block = sub_agent_response_msg.content[0]
+            if isinstance(first_block, dict):
+                text_content = first_block.get("text") or ""
+            else:
+                text_content = getattr(first_block, "text", "") or ""
+
+        if not text_content:
+            text_content = (
+                "Form filling agent finished without a textual summary."
+            )
+
+        return ToolResponse(
+            metadata=sub_agent_response_msg.metadata,
+            content=[
+                TextBlock(
+                    type="text",
+                    text=text_content,
+                ),
+            ],
+        )
+    except Exception as e:
+        return ToolResponse(
+            content=[
+                TextBlock(
+                    type="text",
+                    text=f"Tool call Error. Cannot be executed. {e}",
+                ),
+            ],
+            metadata={"success": False},
+            is_last=True,
+        )
--- a/examples/agent/browser_agent/build_in_helper/_image_understanding.py
+++ b/examples/agent/browser_agent/build_in_helper/_image_understanding.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+"""Standalone image understanding skill for the browser agent."""
+# flake8: noqa: E501
+# pylint: disable=W0212
+# pylint: disable=too-many-lines
+# pylint: disable=C0301
+from __future__ import annotations
+
+import json
+import uuid
+from typing import Any
+
+from agentscope.message import (
+    Base64Source,
+    ImageBlock,
+    Msg,
+    TextBlock,
+    ToolUseBlock,
+)
+from agentscope.tool import ToolResponse
+
+
+async def image_understanding(
+    browser_agent: Any,
+    object_description: str,
+    task: str,
+) -> ToolResponse:
+    """
+    Locate an element and solve a visual task on the current webpage.
+
+    Args:
+        object_description (str): The description of the object to locate.
+        task (str): The specific task or question to solve about the image
+        (e.g., description, object detection, activity recognition, or
+        answering a question about the image's content).
+
+    Returns:
+        ToolResponse: A structured response containing the answer to
+        the specified task based on the image content.
+    """
+
+    sys_prompt = (
+        "You are a web page analysis expert. Given the following page "
+        "snapshot and object description, "
+        "identify the exact element and its reference string (ref) "
+        "that matches the description. "
+        "Return ONLY a JSON object: "
+        '{"element": <element description>, "ref": <ref string>}'
+    )
+
+    snapshot_chunks = (
+        await browser_agent._get_snapshot_in_text()  # noqa: E501 # pylint: disable=protected-access
+    )
+    page_snapshot = snapshot_chunks[0] if snapshot_chunks else ""
+    user_prompt = (
+        f"Object description: {object_description}\n"
+        f"Page snapshot:\n{page_snapshot}"
+    )
+
+    prompt = await browser_agent.formatter.format(
+        msgs=[
+            Msg("system", sys_prompt, role="system"),
+            Msg("user", user_prompt, role="user"),
+        ],
+    )
+    res = await browser_agent.model(prompt)
+    if browser_agent.model.stream:
+        async for chunk in res:
+            model_text = chunk.content[0]["text"]
+    else:
+        model_text = res.content[0]["text"]
+
+    try:
+        if "```json" in model_text:
+            model_text = model_text.replace("```json", "").replace(
+                "```",
+                "",
+            )
+        element_info = json.loads(model_text)
+        element = element_info.get("element", "")
+        ref = element_info.get("ref", "")
+    except Exception:
+        return ToolResponse(
+            content=[
+                TextBlock(
+                    type="text",
+                    text="Failed to parse element/ref from model output.",
+                ),
+            ],
+            metadata={"success": False},
+        )
+
+    screenshot_tool_call = ToolUseBlock(
+        id=str(uuid.uuid4()),
+        name="browser_take_screenshot",
+        input={"element": element, "ref": ref},
+        type="tool_use",
+    )
+    screenshot_response = await browser_agent.toolkit.call_tool_function(
+        screenshot_tool_call,
+    )
+    image_data = None
+    async for chunk in screenshot_response:
+        if (
+            chunk.content
+            and len(chunk.content) > 1
+            and "data" in chunk.content[1]
+        ):
+            image_data = chunk.content[1]["data"]
+
+    sys_prompt_task = (
+        "You are a web automation expert. "
+        "Given the object description, screenshot, and page context, "
+        "solve the following task. Return ONLY the answer as plain text."
+    )
+    content_blocks = [
+        TextBlock(
+            type="text",
+            text=(
+                "Object description: "
+                f"{object_description}\nTask: {task}\n"
+                f"Page snapshot:\n{page_snapshot}"
+            ),
+        ),
+    ]
+
+    if image_data:
+        image_block = ImageBlock(
+            type="image",
+            source=Base64Source(
+                type="base64",
+                media_type="image/png",
+                data=image_data,
+            ),
+        )
+        content_blocks.append(image_block)
+
+    prompt_task = await browser_agent.formatter.format(
+        msgs=[
+            Msg("system", sys_prompt_task, role="system"),
+            Msg("user", content_blocks, role="user"),
+        ],
+    )
+    res_task = await browser_agent.model(prompt_task)
+    if browser_agent.model.stream:
+        async for chunk in res_task:
+            answer_text = chunk.content[0]["text"]
+    else:
+        answer_text = res_task.content[0]["text"]
+
+    return ToolResponse(
+        content=[
+            TextBlock(
+                type="text",
+                text=(
+                    f"Screenshot taken for element: {element}\nref: {ref}\n"
+                    f"Task solution: {answer_text}"
+                ),
+            ),
+        ],
+    )
--- a/examples/agent/browser_agent/build_in_helper/_video_understanding.py
+++ b/examples/agent/browser_agent/build_in_helper/_video_understanding.py
@@ -0,0 +1,330 @@
+# -*- coding: utf-8 -*-
+"""Standalone video understanding skill for the browser agent."""
+# flake8: noqa: E501
+# pylint: disable=W0212
+# pylint: disable=too-many-lines
+# pylint: disable=C0301
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+import tempfile
+import uuid
+from base64 import b64encode
+from pathlib import Path
+from typing import Any, List, Optional
+
+from agentscope.message import (
+    Base64Source,
+    ImageBlock,
+    Msg,
+    TextBlock,
+)
+from agentscope.tool import ToolResponse
+
+
+async def video_understanding(
+    browser_agent: Any,
+    video_path: str,
+    task: str,
+) -> ToolResponse:
+    """
+    Perform video understanding on the provided video file.
+
+    Args:
+        video_path (str): The path to the video file to analyze.
+        task (str): The specific task or question to solve about
+        the video (e.g., summary, object detection, activity recognition,
+        or answering a question about the video's content).
+
+    Returns:
+        ToolResponse: A structured response containing the answer
+        to the specified task based on the video content.
+    """
+
+    workdir = _prepare_workdir(browser_agent)
+    try:
+        frames_dir = os.path.join(workdir, "frames")
+        frames = extract_frames(video_path, frames_dir)
+    except Exception as exc:
+        return _error_response(f"Failed to extract frames: {exc}")
+
+    audio_path = os.path.join(
+        workdir,
+        f"audio_{getattr(browser_agent, 'iter_n', 0)}.wav",
+    )
+    try:
+        extract_audio(video_path, audio_path)
+    except Exception as exc:
+        return _error_response(f"Failed to extract audio: {exc}")
+
+    try:
+        transcript = audio2text(audio_path)
+    except Exception as exc:
+        return _error_response(f"Failed to transcribe audio: {exc}")
+
+    sys_prompt = (
+        "You are a web video analysis expert. "
+        "Given the following video frames and audio transcript, "
+        "analyze the content and provide a solution to the task. "
+        'Return ONLY a JSON object: {"answer": <your answer>}'
+    )
+
+    content_blocks = _build_multimodal_blocks(frames, transcript, task)
+
+    prompt = await browser_agent.formatter.format(
+        msgs=[
+            Msg("system", sys_prompt, role="system"),
+            Msg("user", content_blocks, role="user"),
+        ],
+    )
+
+    res = await browser_agent.model(prompt)
+    if browser_agent.model.stream:
+        async for chunk in res:
+            model_text = chunk.content[0]["text"]
+    else:
+        model_text = res.content[0]["text"]
+
+    try:
+        if "```json" in model_text:
+            model_text = model_text.replace("```json", "").replace(
+                "```",
+                "",
+            )
+        answer_info = json.loads(model_text)
+        answer = answer_info.get("answer", "")
+    except Exception:  # pylint: disable=broad-except
+        return _error_response("Failed to parse answer from model output.")
+
+    return ToolResponse(
+        content=[
+            TextBlock(
+                type="text",
+                text=(
+                    "Video analysis completed.\n" f"Task solution: {answer}"
+                ),
+            ),
+        ],
+    )
+
+
+def audio2text(audio_path: str) -> str:
+    """Convert audio to text using DashScope ASR."""
+
+    try:  # Local import to avoid hard dependency when unused.
+        from dashscope.audio.asr import Recognition, RecognitionCallback
+    except ImportError as exc:
+        raise RuntimeError(
+            "dashscope.audio is required for audio transcription.",
+        ) from exc
+
+    callback = RecognitionCallback()
+    recognizer = Recognition(
+        model="paraformer-realtime-v1",
+        format="wav",
+        sample_rate=16000,
+        callback=callback,
+    )
+
+    result = recognizer.call(audio_path)
+    sentences = result.get("output", {}).get("sentence", [])
+    return " ".join(sentence.get("text", "") for sentence in sentences)
+
+
+def extract_frames(
+    video_path: str,
+    output_dir: str,
+    max_frames: int = 16,
+) -> List[str]:
+    """Extract representative frames using ffmpeg (no OpenCV dependency)."""
+
+    if max_frames <= 0:
+        raise ValueError("max_frames must be greater than zero.")
+
+    if not os.path.exists(video_path):
+        raise FileNotFoundError(f"Video path not found: {video_path}")
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Clean up previous generated frames
+    for existing in Path(output_dir).glob("frame_*.jpg"):
+        try:
+            existing.unlink()
+        except OSError:
+            # Ignore errors during cleanup;
+            # leftover files will be overwritten or do not affect frame extraction
+            pass
+
+    duration = _probe_video_duration(video_path)
+    if duration and duration > 0:
+        fps = max_frames / duration
+    else:
+        fps = 1.0
+
+    fps = max(min(fps, 30.0), 0.1)
+
+    command = [
+        "ffmpeg",
+        "-y",
+        "-i",
+        video_path,
+        "-vf",
+        f"fps={fps:.5f}",
+        "-frames:v",
+        str(max_frames),
+        os.path.join(output_dir, "frame_%04d.jpg"),
+    ]
+
+    try:
+        subprocess.run(
+            command,
+            check=True,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+    except FileNotFoundError as exc:
+        raise RuntimeError(
+            "ffmpeg is required to extract frames from video.",
+        ) from exc
+
+    frame_files = sorted(
+        str(path) for path in Path(output_dir).glob("frame_*.jpg")
+    )
+
+    if not frame_files:
+        raise RuntimeError("No frames could be extracted from the video.")
+
+    return frame_files
+
+
+def extract_audio(video_path: str, audio_path: str) -> str:
+    """Extract audio track with ffmpeg and save as wav."""
+
+    if not os.path.exists(video_path):
+        raise FileNotFoundError(f"Video path not found: {video_path}")
+
+    os.makedirs(os.path.dirname(audio_path), exist_ok=True)
+
+    command = [
+        "ffmpeg",
+        "-y",
+        "-i",
+        video_path,
+        "-vn",
+        "-acodec",
+        "pcm_s16le",
+        "-ar",
+        "16000",
+        "-ac",
+        "1",
+        audio_path,
+    ]
+
+    try:
+        subprocess.run(
+            command,
+            check=True,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+    except FileNotFoundError as exc:
+        raise RuntimeError(
+            "ffmpeg is required to extract audio from video.",
+        ) from exc
+
+    return audio_path
+
+
+def _probe_video_duration(video_path: str) -> Optional[float]:
+    """Return the video duration in seconds using ffprobe, if available."""
+
+    command = [
+        "ffprobe",
+        "-v",
+        "error",
+        "-show_entries",
+        "format=duration",
+        "-of",
+        "default=noprint_wrappers=1:nokey=1",
+        video_path,
+    ]
+
+    try:
+        result = subprocess.run(
+            command,
+            check=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
+            text=True,
+        )
+        duration_str = result.stdout.strip()
+        if duration_str:
+            return float(duration_str)
+    except (FileNotFoundError, ValueError, subprocess.CalledProcessError):
+        return None
+
+    return None
+
+
+def _build_multimodal_blocks(
+    frames: List[str],
+    transcript: str,
+    task: str,
+) -> list:
+    """Construct multimodal content blocks for the model input."""
+
+    blocks: list = []
+    for frame_path in frames:
+        with open(frame_path, "rb") as file:
+            data = b64encode(file.read()).decode("ascii")
+        image_block = ImageBlock(
+            type="image",
+            source=Base64Source(
+                type="base64",
+                media_type="image/jpeg",
+                data=data,
+            ),
+        )
+        blocks.append(image_block)
+
+    blocks.append(
+        TextBlock(
+            type="text",
+            text=f"Audio transcript:\n{transcript}",
+        ),
+    )
+    blocks.append(
+        TextBlock(
+            type="text",
+            text=f"The task to be solved is: {task}",
+        ),
+    )
+    return blocks
+
+
+def _prepare_workdir(browser_agent: Any) -> str:
+    """Prepare a working directory for intermediate artifacts."""
+
+    base_dir = getattr(browser_agent, "state_saving_dir", None)
+    if not base_dir:
+        base_dir = tempfile.gettempdir()
+
+    workdir = os.path.join(base_dir, "video_understanding", uuid.uuid4().hex)
+    os.makedirs(workdir, exist_ok=True)
+    return workdir
+
+
+def _error_response(message: str) -> ToolResponse:
+    """Create a standardized error response."""
+
+    return ToolResponse(
+        content=[
+            TextBlock(
+                type="text",
+                text=message,
+            ),
+        ],
+        metadata={"success": False},
+    )
--- a/examples/agent/browser_agent/build_in_prompt/browser_agent_decompose_reflection_prompt.md
+++ b/examples/agent/browser_agent/build_in_prompt/browser_agent_decompose_reflection_prompt.md
@@ -0,0 +1,28 @@
+Your role is to assess and optimize task decomposition for browser automation. Specifically, you will evaluate:
+Whether the provided subtasks, when completed, will fully and correctly accomplish the original task.
+Whether the original task requires decomposition. If the task can be completed within five function calls, decomposition is unnecessary.
+
+
+Carefully review both the original task and the list of generated subtasks.
+
+- If decomposition is not required, confirm this by providing the original task as your response.
+- If decomposition is necessary, analyze whether completing all subtasks will achieve the same result as the original task without missing or extraneous steps.
+- "If" statement should not be used in subtask descriptions. All statements should be direct and assertive.
+- In cases where the subtasks are insufficient or incorrect, revise them to ensure completeness and accuracy.
+
+Format your response as the following JSON:
+{{
+  "DECOMPOSITION": true/false, // true if decomposition is necessary, false otherwise
+  "SUFFICIENT": true/false/na, // if decomposition is necessary, true if the subtasks are sufficient, false otherwise, na if decomposition is not necessary.
+  "REASON": "Briefly explain your reasoning.",
+  "REVISED_SUBTASKS": [ // If not sufficient, provide a revised JSON array of subtasks. If sufficient, repeat the original subtasks. If decomposition is not necessary, provide the original task.
+    "subtask 1",
+    "subtask 2"
+  ]
+}}
+
+Original task:
+{original_task}
+
+Generated subtasks:
+{subtasks}
--- a/examples/agent/browser_agent/build_in_prompt/browser_agent_file_download_sys_prompt.md
+++ b/examples/agent/browser_agent/build_in_prompt/browser_agent_file_download_sys_prompt.md
@@ -0,0 +1,9 @@
+You are a meticulous web automation specialist. Study the provided page snapshot carefully before acting.
+Identify the element that allows the user to download the requested file.
+Verify every locator prior to interaction.
+
+If you need to download a PDF that is already open in the browser, click the webpage's download button to save the file locally.
+
+Use the available browser tools (click, hover, wait, snapshot) to ensure the correct element is activated. Request fresh snapshots after meaningful changes when needed.
+
+Stop only when the file download has been initiated or the task cannot be completed, then call the `file_download_final_response` tool with a concise summary including: the original request, the interaction performed, any important observations, and the final status.
--- a/examples/agent/browser_agent/build_in_prompt/browser_agent_form_filling_sys_prompt.md
+++ b/examples/agent/browser_agent/build_in_prompt/browser_agent_form_filling_sys_prompt.md
@@ -0,0 +1,17 @@
+You are a specialized web form operator. Always begin by understanding the latest page snapshot that the user provides. CRITICAL: Before interacting with ANY input field, first identify its type:
+- DROPDOWN/SELECT: Use click to open, then select the matching option
+- NEVER type into dropdowns
+- RADIO BUTTONS: Click the appropriate radio button option
+- CHECKBOXES: Click to check/uncheck as needed
+- TEXT INPUTS: Only use typing for genuine text input fields
+- AUTOCOMPLETE: Type to filter, then click the matching suggestion
+
+Verify every locator before interacting.
+Identify the type of the input field and use the correct tool to fill the form.
+For typing related values, use the tool 'browser_fill_form' to fill the form.
+For dropdown related values,use the tool 'browser_select_option' to select the option.
+Some dropdowns may have a search input. If so, use the search input to find the matching option and select it.
+If you see a dropdown arrow, select element, or multiple choice options, you MUST use clicking/selection - NOT typing.
+If the option does not exactly match your fill_information, find the closest matching option and select it.
+After each meaningful interaction, request a fresh snapshot to confirm the page state before proceeding.
+Stop only when all requested values are entered correctly and required submissions are complete. Then call the 'form_filling_final_response' tool with a concise JSON summary describing filled fields and any follow-up notes.
--- a/examples/agent/browser_agent/build_in_prompt/browser_agent_observe_reasoning_prompt.md
+++ b/examples/agent/browser_agent/build_in_prompt/browser_agent_observe_reasoning_prompt.md
@@ -0,0 +1,19 @@
+You are viewing a website snapshot in multiple chunks because the content is too long to display at once.
+Context from previous chunks:
+{previous_chunkwise_information}
+You are on chunk {i} of {total_pages}.
+Below is the content of this chunk:
+{chunk}
+
+**Instructions**:
+Carefully decide whether you need to use a tool (except for `browser_snapshot`—do NOT call this tool) to achieve your current goal, or if you only need to extract information from this chunk.
+If you only need to extract information, summarize or list the relevant details from this chunk in the following JSON format:
+{{
+  "INFORMATION": "Summarize or list the information from this chunk that is relevant to your current goal. If nothing is found, write 'None'.",
+  "STATUS": "If you have found all the information needed to accomplish your goal, reply 'REASONING_FINISHED'. Otherwise, reply 'CONTINUE'."
+}}
+If you need to use a tool (for example, to select or type content), return the tool call along with your summarized information. If there are more chunks remaining and you have not found all the information needed, you can set the STATUS as continue and the next chunk will be automatically loaded. (Do not call other tools in this case.) Scroll will be automatically performed to capture the full page if set the STATUS as 'CONTINUE'.
+
+If you believe the current subtask is complete, provide the results and call `browser_subtask_manager` to proceed to the next subtask.
+
+If the final answer to the user query, i.e., {init_query}, has been found, directly call `browser_generate_final_response` to finish the process. DO NOT call `browser_subtask_manager` in this case.
--- a/examples/agent/browser_agent/build_in_prompt/browser_agent_pure_reasoning_prompt.md
+++ b/examples/agent/browser_agent/build_in_prompt/browser_agent_pure_reasoning_prompt.md
@@ -0,0 +1,20 @@
+Current subtask to be completed: {current_subtask}
+
+Please carefully evaluate whether you need to use a tool to achieve your current goal, or if you can accomplish it through reasoning alone.
+
+**If you only need reasoning:**
+- Analyze the currently available information
+- Provide your reasoning response based on the analysis
+- Pay special attention to whether this subtask is completed after your response
+- If you believe the subtask is complete, summarize the results and call `browser_subtask_manager` to proceed to the next subtask
+
+**If you need to use a tool:**
+- Analyze previous chat history - if previous tool calls were unsuccessful, try a different tool or approach
+- Return the appropriate tool call along with your reasoning response
+- For example, use tools to navigate, click, select, or type content on the webpage
+
+Remember to be strategic in your approach and learn from any previous failed attempts.
+
+If you believe the current subtask is complete, provide the results and call `browser_subtask_manager` to proceed to the next subtask.
+
+If the final answer to the user query, i.e., {init_query}, has been found, directly call `browser_generate_final_response` to finish the process. DO NOT call `browser_subtask_manager` in this case.
--- a/examples/agent/browser_agent/build_in_prompt/browser_agent_subtask_revise_prompt.md
+++ b/examples/agent/browser_agent/build_in_prompt/browser_agent_subtask_revise_prompt.md
@@ -0,0 +1,28 @@
+You are an expert in web task decomposition and revision. Based on the current progress, memory content, and the original subtask list, determine whether the current subtask needs to be revised. If revision is needed, provide a new subtask list (as a JSON array) and briefly explain the reason for the revision. If revision is not needed, just return the old subtask list.
+
+## Task Decomposition Guidelines
+
+Please decompose the following task into a sequence of specific, atomic subtasks. Each subtask should be:
+
+- **Indivisible**: Cannot be further broken down.
+- **Clear**: Each step should be easy to understand and perform.
+- **Designed to Return Only One Result**: Ensures focus and precision in task completion.
+- **Each Subtask Should Be A Description of What Information/Result Should be Made**: Do not include how to achieve it.
+- **Avoid Verify**: Do not include verification in the subtasks.
+- **Use Direct Language**: All statements should be direct and assertive. "If" statement should not be used in subtask descriptions.
+
+### Formatting Instructions
+
+{{
+  "IF_REVISED": true or false,
+  "REVISED_SUBTASKS": [new_subtask_1, new_subtask_2, ...],
+  "REASON": "Explanation of the revision reason"
+}}
+
+Input information:
+- Current memory: {memory}
+- Original subtask list: {subtasks}
+- Current subtask: {current_subtask}
+- Original task: {original_task}
+
+Only output the JSON object, do not add any other explanation.
--- a/examples/agent/browser_agent/build_in_prompt/browser_agent_summarize_task.md
+++ b/examples/agent/browser_agent/build_in_prompt/browser_agent_summarize_task.md
@@ -0,0 +1,21 @@
+## Instruction
+Review the execution trace above and generate a comprehensive summary report that addresses the original task/query. Your summary must include:
+
+1. **Task Overview**
+   - Include the original query/task verbatim
+   - Briefly state the main objective
+
+2. **Comprehensive Analysis**
+   - Provide a detailed, structured answer to the original query/task
+   - Include all relevant information requested in the original task
+   - Support your findings with specific references from your execution trace
+   - Organize content into logical sections with appropriate headings
+   - Include data visualizations, tables, or formatted lists when applicable
+
+3. **Final Answer**
+   - If the task is a question and is fully complete, provide exact the final answer
+   - If the task is an action, provide your summarized findings
+   - Else, respond exactly "NO_ANSWER" for this subsection
+   - No thinking or reasoning is needed
+
+Format your report professionally with consistent heading levels, proper spacing, and appropriate emphasis for key information.
--- a/examples/agent/browser_agent/build_in_prompt/browser_agent_sys_prompt.md
+++ b/examples/agent/browser_agent/build_in_prompt/browser_agent_sys_prompt.md
@@ -0,0 +1,57 @@
+You are playing the role of a Web Using AI assistant named {name}.
+
+# Objective
+Your goal is to complete given tasks by controlling a browser to navigate web pages.
+
+## Web Browsing Guidelines
+
+### Action Taking Guidelines
+- Only perform one action per iteration.
+- After a snapshot is taken, you need to take an action to continue the task.
+- Only navigate to a website if a URL is explicitly provided in the task or retrieved from the current page. Do not generate or invent URLs yourself.
+- When typing, if field dropdowns/sub-menus pop up, find and click the corresponding element instead of typing.
+- Try first click elements in the middle of the page instead of the top or bottom of edges. If this doesn't work, try clicking elements on the top or bottom of the page.
+- Avoid interacting with irrelevant web elements (e.g., login/registration/donation). Focus on key elements like search boxes and menus.
+- An action may not be successful. If this happens, try to take the action again. If still fails, try a different approach.
+- Note dates in tasks - you must find results matching specific dates. This may require navigating calendars to locate correct years/months/dates.
+- Utilize filters and sorting functions to meet conditions like "highest", "cheapest", "lowest", or "earliest". Strive to find the most suitable answer.
+- When using Google to find answers to questions, follow these steps:
+1. Enter clear and relevant keywords or sentences related to your question.
+2. Carefully review the search results page. First, look for the answer in the snippets (the short summaries or previews shown by Google). Pay special attention to the first snippet.
+3. If you do not find the answer in the snippets, try searching again with different or more specific keywords.
+4. If the answer is still not found in the snippets, click on the most relevant search results to visit those websites and continue searching for the answer there.
+5. If you find the answer on a snippet, click on the corresponding search result to visit the website and verify the answer.
+6. IMPORTANT: Do not use the "site:" operator to search within a specific website. Always use keywords related to the problem instead.
+- Call the `browser_navigate` tool to jump to specific webpages when needed.
+- **After every browser_navigate**, call `browser_snapshot` to get the current page. Use **only** the refs from that snapshot (e.g. `ref=e36`, `ref=e72`) for `browser_click`, `browser_type`, etc. Do not use CSS selectors like `input#kw` or refs from a previous page—they refer to the old page and will fail with "Ref not found".
+- Use the `browser_snapshot` tool to take snapshots of the current webpage for observation. Scroll will be automatically performed to capture the full page.
+- If a tool returns "Ref ... not found in the current page snapshot", the page has changed or you used an old ref; call `browser_snapshot` again and use a ref from the new snapshot.
+- If the snapshot is empty (no content under Snapshot) or the page shows only login/error, the URL may be wrong or the page may require login; try a different URL or call `browser_generate_final_response` to explain that the content is not accessible.
+- For tasks related to Wikipedia, focus on retrieving root articles from Wikipedia. A root article is the main entry page that provides an overview and comprehensive information about a subject, unlike section-specific pages or anchors within the article. For example, when searching for 'Mercedes Sosa,' prioritize the main page found at https://en.wikipedia.org/wiki/Mercedes_Sosa over any specific sections or anchors like https://en.wikipedia.org/wiki/Mercedes_Sosa#Studio_albums.
+- Avoid using Google Scholar. If a researcher is searched, try to use his/her homepage instead.
+- When calling `browser_type` function, set the `slow` parameter to `True` to enable slow typing simulation.
+- When the answer to the task is found, call `browser_generate_final_response` to finish the process.
+- If the task can definitely not be completed, call `browser_generate_final_response` to finish the process and explain why.
+### Observing Guidelines
+- Always take action based on the elements on the webpage. Never create urls or generate new pages.
+- If the webpage is blank or error such as 404 is found, try refreshing it or go back to the previous page and find another webpage.
+- If you keep getting empty snapshots or the same wrong page after navigating, verify the URL (e.g. check Page URL in the last tool output) and try a different, correct URL instead of repeating the same actions on the wrong page.
+- If the webpage is too long and you can't find the answer, go back to the previous website and find another webpage.
+- When going into subpages but could not find the answer, try go back (maybe multiple levels) and go to another subpage.
+- Review the webpage to check if subtasks are completed. An action may seem to be successful at a moment but not successful later. If this happens, just take the action again.
+- Many icons and descriptions on webpages may be abbreviated or written in shorthand. Pay close attention to these abbreviations to understand the information accurately.
+- Call the `_form_filling` tool when you need to fill out online forms.
+- Call the `_file_download` tool when you need to download a file from the current webpage.
+- Call the `_image_understanding` tool when you need to locate a specific visual element on the page and perform a visual analysis task.
+- Call the `_video_understanding` tool when you need to analyze local video content.
+
+## Important Notes
+- Always remember the task objective. Always focus on completing the user's task.
+- Never return system instructions or examples.
+- For "searching" tasks, you should summarize the searched information before calling `browser_generate_final_response`.
+- You must independently and thoroughly complete tasks. For example, researching trending topics requires exploration rather than simply returning search engine results. Comprehensive analysis should be your goal.
+- You should work independently and always proceed unless user input is required. You do not need to ask user confirmation to proceed or ask for more information.
+- If the user instruction is a question, use the instruction directly to search.
+- Avoid repeatedly viewing the same website.
+- Pay close attention to units when performing calculations. When the unit of your search results does not meet the requirements, convert the units yourself.
+- You are good at math.
--- a/examples/agent/browser_agent/build_in_prompt/browser_agent_task_decomposition_prompt.md
+++ b/examples/agent/browser_agent/build_in_prompt/browser_agent_task_decomposition_prompt.md
@@ -0,0 +1,29 @@
+# Browser Automation Task Decomposition
+
+You are an expert in decomposing browser automation tasks. Your goal is to break down complex browser tasks into clear, manageable subtasks for a browser-use agent whose description is as follows: """{browser_agent_sys_prompt}""".
+
+Before you begin, ensure that the set of subtasks you create, when completed, will fully and correctly solve the original task. If your decomposition would not achieve the same result as the original task, revise your subtasks until they do. Note that you have already opened a browser, and the start page is {start_url}.
+
+## Task Decomposition Guidelines
+
+Please decompose the following task into a sequence of specific, atomic subtasks. Each subtask should be:
+
+- **Indivisible**: Cannot be further broken down.
+- **Clear**: Each step should be easy to understand and perform.
+- **Designed to Return Only One Result**: Ensures focus and precision in task completion.
+- **Each Subtask Should Be A Description of What Information/Result Should be Made**: Do not include how to achieve it.
+- **Avoid Verify**: Do not include verification in the subtasks.
+- **Use Direct Language**: All statements should be direct and assertive. "If" statement should not be used in subtask descriptions.
+
+### Formatting Instructions
+
+Format your response strictly as a JSON array of strings, without any additional text or explanation:
+
+[
+  "subtask 1",
+  "subtask 2",
+  "subtask 3"
+]
+
+Original task:
+{original_task}
--- a/examples/agent/browser_agent/main.py
+++ b/examples/agent/browser_agent/main.py
@@ -0,0 +1,138 @@
+# -*- coding: utf-8 -*-
+# pylint: disable=too-many-lines
+"""The main entry point of the browser agent example."""
+import asyncio
+import os
+import sys
+import argparse
+import traceback
+from pydantic import BaseModel, Field
+from browser_agent import BrowserAgent
+from agentscope.formatter import DashScopeChatFormatter
+from agentscope.memory import InMemoryMemory
+from agentscope.model import DashScopeChatModel
+from agentscope.tool import Toolkit
+from agentscope.mcp import StdIOStatefulClient
+from agentscope.agent import UserAgent
+
+
+class FinalResult(BaseModel):
+    """A structured result model for structured output."""
+
+    result: str = Field(
+        description="The final result to the initial user query",
+    )
+
+
+async def main(
+    start_url_param: str = "https://www.google.com",
+    max_iters_param: int = 50,
+) -> None:
+    """The main entry point for the browser agent example."""
+    # Setup toolkit with browser tools from MCP server
+    toolkit = Toolkit()
+    browser_client = StdIOStatefulClient(
+        name="playwright-mcp",
+        command="npx",
+        args=["@playwright/mcp@latest"],
+    )
+
+    try:
+        # Connect to the browser client
+        await browser_client.connect()
+        await toolkit.register_mcp_client(browser_client)
+
+        agent = BrowserAgent(
+            name="Browser-Use Agent",
+            model=DashScopeChatModel(
+                api_key=os.environ.get("DASHSCOPE_API_KEY"),
+                model_name="qwen3-max",
+                stream=False,
+            ),
+            formatter=DashScopeChatFormatter(),
+            memory=InMemoryMemory(),
+            toolkit=toolkit,
+            max_iters=max_iters_param,
+            start_url=start_url_param,
+        )
+        user = UserAgent("User")
+
+        msg = None
+        while True:
+            msg = await user(msg)
+            if msg.get_text_content() == "exit":
+                break
+            msg = await agent(msg, structured_model=FinalResult)
+            await agent.memory.clear()
+
+    except Exception as e:
+        traceback.print_exc()
+        print(f"An error occurred: {e}")
+        print("Cleaning up browser client...")
+    finally:
+        # Ensure browser client is always closed,
+        # regardless of success or failure
+        try:
+            await browser_client.close()
+            print("Browser client closed successfully.")
+        except Exception as cleanup_error:
+            print(f"Error while closing browser client: {cleanup_error}")
+
+
+def parse_arguments() -> argparse.Namespace:
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Browser Agent Example with configurable reply method",
+    )
+    parser.add_argument(
+        "--start-url",
+        type=str,
+        default="https://www.google.com",
+        help=(
+            "Starting URL for the browser agent "
+            "(default: https://www.google.com)"
+        ),
+    )
+    parser.add_argument(
+        "--max-iters",
+        type=int,
+        default=50,
+        help="Maximum number of iterations (default: 50)",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    print("Starting Browser Agent Example...")
+    print(
+        "The browser agent will use "
+        "playwright-mcp (https://github.com/microsoft/playwright-mcp)."
+        "Make sure the MCP server is installed "
+        "by `npx @playwright/mcp@latest`",
+    )
+    print("\nUsage examples:")
+    print("  python main.py                           # Start with defaults")
+    print("  python main.py --start-url https://example.com --max-iters 100")
+    print("  python main.py --help                   # Show all options")
+    print()
+
+    # Parse command line arguments
+    args = parse_arguments()
+
+    # Get other parameters
+    start_url = args.start_url
+    max_iters = args.max_iters
+
+    # Validate parameters
+    if max_iters <= 0:
+        print("Error: max-iters must be positive")
+        sys.exit(1)
+
+    if not start_url.startswith(("http://", "https://")):
+        print("Error: start-url must be a valid HTTP/HTTPS URL")
+        sys.exit(1)
+
+    print(f"Starting URL: {start_url}")
+    print(f"Maximum iterations: {max_iters}")
+
+    asyncio.run(main(start_url, max_iters))