chore: initialize sandbox and overwrite remote content
Some checks failed
Pre-commit / run (ubuntu-latest) (push) Has been cancelled
Deploy Sphinx documentation to Pages / build_en (ubuntu-latest, 3.10) (push) Has been cancelled
Deploy Sphinx documentation to Pages / build_zh (ubuntu-latest, 3.10) (push) Has been cancelled
Python Unittest Coverage / test (macos-15, 3.10) (push) Has been cancelled
Python Unittest Coverage / test (macos-15, 3.11) (push) Has been cancelled
Python Unittest Coverage / test (macos-15, 3.12) (push) Has been cancelled
Python Unittest Coverage / test (ubuntu-latest, 3.10) (push) Has been cancelled
Python Unittest Coverage / test (ubuntu-latest, 3.11) (push) Has been cancelled
Python Unittest Coverage / test (ubuntu-latest, 3.12) (push) Has been cancelled
Python Unittest Coverage / test (windows-latest, 3.10) (push) Has been cancelled
Python Unittest Coverage / test (windows-latest, 3.11) (push) Has been cancelled
Python Unittest Coverage / test (windows-latest, 3.12) (push) Has been cancelled

This commit is contained in:
codex-bot
2026-03-02 22:32:27 +08:00
commit a64378956a
584 changed files with 93604 additions and 0 deletions

View File

@@ -0,0 +1,238 @@
# -*- coding: utf-8 -*-
"""Standalone file download skill for the browser agent."""
# flake8: noqa: E501
# pylint: disable=W0212,W0107,too-many-lines,C0301
from __future__ import annotations
import os
import copy
from typing import Any
from pydantic import BaseModel
from agentscope.memory import InMemoryMemory
from agentscope.message import Msg, TextBlock
from agentscope.tool import ToolResponse
from agentscope.agent import ReActAgent
# Directory one level above the folder that contains this module; the
# bundled prompt files are looked up relative to it.
_CURRENT_DIR = os.path.abspath(
    os.path.join(
        os.path.dirname(__file__),
        os.pardir,
    ),
)

# Read the helper agent's system prompt once, at import time.
with open(
    os.path.join(
        _CURRENT_DIR,
        "build_in_prompt/browser_agent_file_download_sys_prompt.md",
    ),
    mode="r",
    encoding="utf-8",
) as f:
    _FILE_DOWNLOAD_AGENT_SYS_PROMPT = f.read()
class EmptyModel(BaseModel):
    """Field-less structured model.

    Passed as ``structured_model`` when the helper agent is asked to reply,
    for callers that need a structured-output model but have no fields to
    request.
    """
class FileDownloadAgent(ReActAgent):
    """Helper ReAct agent whose only job is to trigger a file download.

    It is built from an existing browser agent and reuses that agent's
    model, formatter and toolkit, but keeps its own in-memory history.
    """

    # Name of the tool the agent calls to finish and produce its summary.
    finish_function_name: str = "file_download_final_response"

    def __init__(
        self,
        browser_agent: Any,
        sys_prompt: str = _FILE_DOWNLOAD_AGENT_SYS_PROMPT,
        max_iters: int = 15,
    ) -> None:
        """Create the helper from ``browser_agent``.

        Args:
            browser_agent (Any): The parent agent; its ``model``,
                ``formatter`` and ``toolkit`` attributes are reused.
            sys_prompt (str): System prompt for the helper agent.
            max_iters (int): Forwarded to ``ReActAgent`` as ``max_iters``.
        """
        parent_name = getattr(browser_agent, "name", "browser_agent")
        super().__init__(
            name=f"{parent_name}_file_download",
            sys_prompt=sys_prompt,
            model=browser_agent.model,
            formatter=browser_agent.formatter,
            memory=InMemoryMemory(),
            toolkit=browser_agent.toolkit,
            max_iters=max_iters,
        )

        # Expose the finish function as a callable tool.
        self.toolkit.register_tool_function(self.file_download_final_response)

        # Drop tools that conflict with this helper, when the toolkit
        # supports removal.
        # NOTE(review): the toolkit object is passed through from the
        # browser agent, so presumably these changes are visible to it as
        # well -- confirm this is intended.
        if hasattr(self.toolkit, "remove_tool_function"):
            for conflicting_tool in ("browser_pdf_save", "file_download"):
                try:
                    self.toolkit.remove_tool_function(conflicting_tool)
                except Exception:  # pylint: disable=broad-except
                    # The tool may not be registered; removal is best-effort.
                    pass

    async def file_download_final_response(
        self,
        **kwargs: Any,  # pylint: disable=W0613
    ) -> ToolResponse:
        """Ask the model for a summary of the download attempt.

        Args:
            **kwargs (Any): Accepted and ignored.

        Returns:
            ToolResponse: A final (``is_last=True``) response whose text and
            ``metadata["structured_output"]`` carry the generated summary.
        """
        summary_request = Msg(
            "user",
            (
                "Provide a concise summary of the file download attempt.\n"
                "Highlight these items:\n"
                "0. The original request\n"
                "1. The element(s) interacted with and actions taken\n"
                "2. The download status or any issues encountered\n"
                "3. Any follow-up recommendations or next steps\n"
            ),
            role="user",
        )

        # Work on a deep copy so the agent's real memory is left untouched.
        history = copy.deepcopy(await self.memory.get_memory())
        if history:
            # Keep only the text blocks of the most recent message.
            newest = history[-1]
            newest.content = newest.get_content_blocks("text")

        conversation = [Msg("system", self.sys_prompt, "system")]
        conversation.extend(history)
        conversation.append(summary_request)
        prompt = await self.formatter.format(msgs=conversation)

        res = await self.model(prompt)
        if self.model.stream:
            # The text of the last streamed chunk is used as the summary.
            summary_text = ""
            async for chunk in res:
                summary_text = chunk.content[0]["text"]
        else:
            summary_text = res.content[0]["text"]
        if not summary_text:
            summary_text = "No summary generated."

        return ToolResponse(
            content=[
                TextBlock(
                    type="text",
                    text="File download summary generated. " + summary_text,
                ),
            ],
            metadata={
                "success": True,
                "structured_output": {
                    "task_done": True,
                    "subtask_progress_summary": summary_text,
                    "generated_files": {},
                },
            },
            is_last=True,
        )
def _build_initial_instruction(
    target_description: str,
    snapshot_text: str,
) -> str:
    """Build the first user instruction given to the download helper agent.

    Args:
        target_description (str): Description of the file to download.
        snapshot_text (str): Text of the page snapshot to embed.

    Returns:
        str: The instruction, made of four sections separated by blank lines.
    """
    sections = [
        "You must locate and trigger the download for the requested file.",
        f"Target description provided by the user:\n{target_description}",
        f"Latest snapshot captured prior to your run:\n{snapshot_text}",
        "Follow the sys prompt guidance, think step-by-step, and verify that "
        "the download action succeeded. If the download cannot be completed, "
        "explain why in the final summary.",
    ]
    return "\n\n".join(sections)
async def file_download(
    browser_agent: Any,
    target_description: str,
) -> ToolResponse:
    """
    Download the target file. The current page should
    contain download-related element.

    Args:
        target_description (str): The description of the
            target file to download.

    Returns:
        ToolResponse: The helper agent's textual summary together with the
        metadata of its reply, or an error response with
        ``metadata={"success": False}`` if running the helper raised.
    """
    # A failed snapshot must not abort the tool: the helper agent still
    # runs, and is told that the snapshot could not be taken.
    try:
        snapshot_chunks = await browser_agent._get_snapshot_in_text()
    except Exception as exc:  # pylint: disable=broad-except
        snapshot_chunks = []
        snapshot_error = str(exc)
    else:
        snapshot_error = ""
    snapshot_text = "\n\n---\n\n".join(snapshot_chunks)
    if snapshot_error and not snapshot_text:
        snapshot_text = f"[Snapshot failed: {snapshot_error}]"

    sub_agent = FileDownloadAgent(browser_agent)
    instruction = _build_initial_instruction(
        target_description=target_description,
        snapshot_text=snapshot_text,
    )
    init_msg = Msg(
        name="user",
        role="user",
        content=instruction,
    )
    try:
        sub_agent_response_msg = await sub_agent.reply(
            init_msg,
            structured_model=EmptyModel,
        )
        reply_content = sub_agent_response_msg.content
        text_content = ""
        if isinstance(reply_content, str):
            # A plain-string reply already is the summary; indexing it
            # would only yield its first character and lose the text.
            text_content = reply_content
        elif reply_content:
            first_block = reply_content[0]
            if isinstance(first_block, dict):
                text_content = first_block.get("text") or ""
            else:
                text_content = getattr(first_block, "text", "") or ""
        if not text_content:
            text_content = (
                "File download agent finished without a textual summary."
            )
        return ToolResponse(
            metadata=sub_agent_response_msg.metadata,
            content=[
                TextBlock(
                    type="text",
                    text=text_content,
                ),
            ],
        )
    except Exception as exc:  # pylint: disable=broad-except
        # Report any helper failure to the caller as a final tool error.
        return ToolResponse(
            content=[
                TextBlock(
                    type="text",
                    text=f"Tool call Error. Cannot be executed. {exc}",
                ),
            ],
            metadata={"success": False},
            is_last=True,
        )

View File

@@ -0,0 +1,221 @@
# -*- coding: utf-8 -*-
"""Standalone form filling skill for the browser agent."""
# flake8: noqa: E501
# pylint: disable=W0212,W0107,too-many-lines,C0301
from __future__ import annotations
import os
import copy
from typing import Any
from pydantic import BaseModel
from agentscope.memory import InMemoryMemory
from agentscope.message import Msg, TextBlock
from agentscope.tool import ToolResponse
from agentscope.agent import ReActAgent
# Directory one level above the folder that contains this module; the
# bundled prompt files are looked up relative to it.
_CURRENT_DIR = os.path.abspath(
    os.path.join(
        os.path.dirname(__file__),
        os.pardir,
    ),
)

# Read the helper agent's system prompt once, at import time.
with open(
    os.path.join(
        _CURRENT_DIR,
        "build_in_prompt/browser_agent_form_filling_sys_prompt.md",
    ),
    mode="r",
    encoding="utf-8",
) as f:
    _FORM_FILL_AGENT_SYS_PROMPT = f.read()
class EmptyModel(BaseModel):
    """Field-less structured model.

    Passed as ``structured_model`` when the helper agent is asked to reply,
    for callers that need a structured-output model but have no fields to
    request.
    """
class FormFillingAgent(ReActAgent):
    """Lightweight helper agent that fills forms.

    It is built from an existing browser agent and reuses that agent's
    model, formatter and toolkit, but keeps its own in-memory history.
    """

    # Name of the tool the agent calls to finish and produce its summary.
    finish_function_name: str = "form_filling_final_response"

    def __init__(
        self,
        browser_agent: Any,
        sys_prompt: str = _FORM_FILL_AGENT_SYS_PROMPT,
        max_iters: int = 20,
    ) -> None:
        """Create the helper from ``browser_agent``.

        Args:
            browser_agent (Any): The parent agent; its ``model``,
                ``formatter`` and ``toolkit`` attributes are reused.
            sys_prompt (str): System prompt for the helper agent.
            max_iters (int): Forwarded to ``ReActAgent`` as ``max_iters``.
        """
        name = f"{getattr(browser_agent, 'name', 'browser_agent')}_form_fill"
        super().__init__(
            name=name,
            sys_prompt=sys_prompt,
            model=browser_agent.model,
            formatter=browser_agent.formatter,
            memory=InMemoryMemory(),
            toolkit=browser_agent.toolkit,
            max_iters=max_iters,
        )
        # Register the finish function
        self.toolkit.register_tool_function(self.form_filling_final_response)

    async def form_filling_final_response(
        self,
        **kwargs: Any,  # pylint: disable=W0613
    ) -> ToolResponse:
        """Summarize the form filling outcome.

        Args:
            **kwargs (Any): Accepted and ignored.

        Returns:
            ToolResponse: A final (``is_last=True``) response whose text and
            ``metadata["structured_output"]`` carry the generated summary.
        """
        hint_msg = Msg(
            "user",
            (
                "Provide a concise summary of the completed form "
                "filling task.\n"
                "Highlight these items:\n"
                "0. The original task/query\n"
                "1. Which fields were filled/selected and their final values\n"
                "2. Any important observations or follow-up notes\n"
                "3. Confirmation that if the task is complete\n\n"
            ),
            role="user",
        )
        memory_msgs = await self.memory.get_memory()
        # Work on a deep copy so the agent's real memory is left untouched.
        memory_msgs_copy = copy.deepcopy(memory_msgs)
        # Guard against an empty history: indexing [-1] would raise
        # IndexError before any summary could be produced.
        if memory_msgs_copy:
            last_msg = memory_msgs_copy[-1]
            # Keep only the text blocks of the most recent message.
            last_msg.content = last_msg.get_content_blocks("text")
            memory_msgs_copy[-1] = last_msg
        prompt = await self.formatter.format(
            msgs=[
                Msg("system", self.sys_prompt, "system"),
                *memory_msgs_copy,
                hint_msg,
            ],
        )
        res = await self.model(prompt)
        if self.model.stream:
            # The text of the last streamed chunk is used as the summary.
            summary_text = ""
            async for chunk in res:
                summary_text = chunk.content[0]["text"]
        else:
            summary_text = res.content[0]["text"]
        # Never return an empty summary (matches the file download helper).
        summary_text = summary_text or "No summary generated."
        structure_response = {
            "task_done": True,
            "subtask_progress_summary": summary_text,
            "generated_files": {},
        }
        return ToolResponse(
            content=[
                TextBlock(
                    type="text",
                    text="Form filling summary generated. " + summary_text,
                ),
            ],
            metadata={
                "success": True,
                "structured_output": structure_response,
            },
            is_last=True,
        )
def _build_initial_instruction(
    fill_information: str,
    snapshot_text: str,
) -> str:
    """Build the first user instruction given to the form filling helper.

    Args:
        fill_information (str): Plain-text fill instructions from the user.
        snapshot_text (str): Text of the page snapshot to embed.

    Returns:
        str: The instruction; each section, including the last one, is
        followed by a blank line.
    """
    sections = (
        "You must complete the web form using the information "
        "provided below.",
        f"Fill instructions (plain text from the user):\n{fill_information}",
        f"Latest snapshot captured prior to your run:\n{snapshot_text}",
    )
    return "\n\n".join(sections) + "\n\n"
async def form_filling(
    browser_agent: Any,
    fill_information: str,
) -> ToolResponse:
    """
    Fill in a web form according to plain-text instructions.

    Args:
        fill_information (str):
            Plain-text description of the values that
            must be entered into the form,
            including any submission requirements.

    Returns:
        ToolResponse: The helper agent's textual summary together with the
        metadata of its reply, or an error response with
        ``metadata={"success": False}`` if running the helper raised.
    """
    # A failed snapshot must not abort the tool: the helper agent still
    # runs, and is told that the snapshot could not be taken.
    try:
        snapshot_chunks = (
            await browser_agent._get_snapshot_in_text()
        )  # pylint: disable=protected-access
    except Exception as exc:  # pylint: disable=broad-except
        snapshot_chunks = []
        snapshot_error = str(exc)
    else:
        snapshot_error = ""
    snapshot_text = "\n\n---\n\n".join(snapshot_chunks)
    if snapshot_error and not snapshot_text:
        snapshot_text = f"[Snapshot failed: {snapshot_error}]"

    sub_agent = FormFillingAgent(browser_agent)
    instruction = _build_initial_instruction(
        fill_information=fill_information,
        snapshot_text=snapshot_text,
    )
    init_msg = Msg(
        name="user",
        role="user",
        content=instruction,
    )
    try:
        sub_agent_response_msg = await sub_agent.reply(
            init_msg,
            structured_model=EmptyModel,
        )
        reply_content = sub_agent_response_msg.content
        text_content = ""
        if isinstance(reply_content, str):
            # A plain-string reply already is the summary; indexing it
            # would only yield its first character and lose the text.
            text_content = reply_content
        elif reply_content:
            first_block = reply_content[0]
            if isinstance(first_block, dict):
                text_content = first_block.get("text") or ""
            else:
                text_content = getattr(first_block, "text", "") or ""
        if not text_content:
            text_content = (
                "Form filling agent finished without a textual summary."
            )
        return ToolResponse(
            metadata=sub_agent_response_msg.metadata,
            content=[
                TextBlock(
                    type="text",
                    text=text_content,
                ),
            ],
        )
    except Exception as e:  # pylint: disable=broad-except
        # Report any helper failure to the caller as a final tool error.
        return ToolResponse(
            content=[
                TextBlock(
                    type="text",
                    text=f"Tool call Error. Cannot be executed. {e}",
                ),
            ],
            metadata={"success": False},
            is_last=True,
        )

View File

@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""Standalone image understanding skill for the browser agent."""
# flake8: noqa: E501
# pylint: disable=W0212
# pylint: disable=too-many-lines
# pylint: disable=C0301
from __future__ import annotations
import json
import uuid
from typing import Any
from agentscope.message import (
Base64Source,
ImageBlock,
Msg,
TextBlock,
ToolUseBlock,
)
from agentscope.tool import ToolResponse
async def image_understanding(
    browser_agent: Any,
    object_description: str,
    task: str,
) -> ToolResponse:
    """
    Locate an element and solve a visual task on the current webpage.

    The function first asks the model to pick the element (and its ref)
    matching ``object_description`` from the page snapshot, then takes a
    screenshot of that element through the ``browser_take_screenshot`` tool
    and finally asks the model to solve ``task`` using the screenshot.

    Args:
        object_description (str): The description of the object to locate.
        task (str): The specific task or question to solve about the image
            (e.g., description, object detection, activity recognition, or
            answering a question about the image's content).

    Returns:
        ToolResponse: A structured response containing the answer to
        the specified task based on the image content, or a response with
        ``metadata={"success": False}`` when the element/ref could not be
        parsed from the model output.
    """
    sys_prompt = (
        "You are a web page analysis expert. Given the following page "
        "snapshot and object description, "
        "identify the exact element and its reference string (ref) "
        "that matches the description. "
        "Return ONLY a JSON object: "
        '{"element": <element description>, "ref": <ref string>}'
    )
    snapshot_chunks = (
        await browser_agent._get_snapshot_in_text()  # noqa: E501 # pylint: disable=protected-access
    )
    # Only the first snapshot chunk is sent to the model.
    page_snapshot = snapshot_chunks[0] if snapshot_chunks else ""
    user_prompt = (
        f"Object description: {object_description}\n"
        f"Page snapshot:\n{page_snapshot}"
    )
    prompt = await browser_agent.formatter.format(
        msgs=[
            Msg("system", sys_prompt, role="system"),
            Msg("user", user_prompt, role="user"),
        ],
    )
    res = await browser_agent.model(prompt)
    # Start from an empty string so an empty stream leads to the regular
    # "failed to parse" response instead of relying on an unbound name.
    model_text = ""
    if browser_agent.model.stream:
        # The text of the last streamed chunk is used.
        async for chunk in res:
            model_text = chunk.content[0]["text"]
    else:
        model_text = res.content[0]["text"]
    try:
        # Strip a Markdown code fence around the JSON, if present.
        if "```json" in model_text:
            model_text = model_text.replace("```json", "").replace(
                "```",
                "",
            )
        element_info = json.loads(model_text)
        element = element_info.get("element", "")
        ref = element_info.get("ref", "")
    except Exception:  # pylint: disable=broad-except
        return ToolResponse(
            content=[
                TextBlock(
                    type="text",
                    text="Failed to parse element/ref from model output.",
                ),
            ],
            metadata={"success": False},
        )

    screenshot_tool_call = ToolUseBlock(
        id=str(uuid.uuid4()),
        name="browser_take_screenshot",
        input={"element": element, "ref": ref},
        type="tool_use",
    )
    screenshot_response = await browser_agent.toolkit.call_tool_function(
        screenshot_tool_call,
    )
    image_data = None
    async for chunk in screenshot_response:
        # NOTE(review): assumes the image payload is the second content
        # block and exposes a "data" key -- confirm against the tool output.
        if (
            chunk.content
            and len(chunk.content) > 1
            and "data" in chunk.content[1]
        ):
            image_data = chunk.content[1]["data"]

    sys_prompt_task = (
        "You are a web automation expert. "
        "Given the object description, screenshot, and page context, "
        "solve the following task. Return ONLY the answer as plain text."
    )
    content_blocks = [
        TextBlock(
            type="text",
            text=(
                "Object description: "
                f"{object_description}\nTask: {task}\n"
                f"Page snapshot:\n{page_snapshot}"
            ),
        ),
    ]
    # Without a screenshot the task is solved from the text context alone.
    if image_data:
        image_block = ImageBlock(
            type="image",
            source=Base64Source(
                type="base64",
                media_type="image/png",
                data=image_data,
            ),
        )
        content_blocks.append(image_block)
    prompt_task = await browser_agent.formatter.format(
        msgs=[
            Msg("system", sys_prompt_task, role="system"),
            Msg("user", content_blocks, role="user"),
        ],
    )
    res_task = await browser_agent.model(prompt_task)
    # Initialised up front: a stream that yields no chunks previously left
    # this name unbound and raised UnboundLocalError below.
    answer_text = ""
    if browser_agent.model.stream:
        async for chunk in res_task:
            answer_text = chunk.content[0]["text"]
    else:
        answer_text = res_task.content[0]["text"]
    return ToolResponse(
        content=[
            TextBlock(
                type="text",
                text=(
                    f"Screenshot taken for element: {element}\nref: {ref}\n"
                    f"Task solution: {answer_text}"
                ),
            ),
        ],
    )

View File

@@ -0,0 +1,330 @@
# -*- coding: utf-8 -*-
"""Standalone video understanding skill for the browser agent."""
# flake8: noqa: E501
# pylint: disable=W0212
# pylint: disable=too-many-lines
# pylint: disable=C0301
from __future__ import annotations
import json
import os
import subprocess
import tempfile
import uuid
from base64 import b64encode
from pathlib import Path
from typing import Any, List, Optional
from agentscope.message import (
Base64Source,
ImageBlock,
Msg,
TextBlock,
)
from agentscope.tool import ToolResponse
async def video_understanding(
    browser_agent: Any,
    video_path: str,
    task: str,
) -> ToolResponse:
    """
    Answer a task about a video file using its frames and audio transcript.

    Frames and audio are extracted into a fresh working directory, the
    audio is transcribed, and both are sent to the browser agent's model.

    Args:
        video_path (str): The path to the video file to analyze.
        task (str): The specific task or question to solve about
            the video (e.g., summary, object detection, activity
            recognition, or answering a question about the video's content).

    Returns:
        ToolResponse: The model's answer, or an error response with
        ``metadata={"success": False}`` when frame extraction, audio
        extraction, transcription or answer parsing fails.
    """
    workdir = _prepare_workdir(browser_agent)

    # Stage 1: frames.
    try:
        frames = extract_frames(video_path, os.path.join(workdir, "frames"))
    except Exception as exc:  # pylint: disable=broad-except
        return _error_response(f"Failed to extract frames: {exc}")

    # Stage 2: audio track, named after the agent's current iteration.
    iteration = getattr(browser_agent, "iter_n", 0)
    audio_path = os.path.join(workdir, f"audio_{iteration}.wav")
    try:
        extract_audio(video_path, audio_path)
    except Exception as exc:  # pylint: disable=broad-except
        return _error_response(f"Failed to extract audio: {exc}")

    # Stage 3: transcript.
    try:
        transcript = audio2text(audio_path)
    except Exception as exc:  # pylint: disable=broad-except
        return _error_response(f"Failed to transcribe audio: {exc}")

    sys_prompt = (
        "You are a web video analysis expert. "
        "Given the following video frames and audio transcript, "
        "analyze the content and provide a solution to the task. "
        'Return ONLY a JSON object: {"answer": <your answer>}'
    )
    messages = [
        Msg("system", sys_prompt, role="system"),
        Msg(
            "user",
            _build_multimodal_blocks(frames, transcript, task),
            role="user",
        ),
    ]
    prompt = await browser_agent.formatter.format(msgs=messages)
    res = await browser_agent.model(prompt)

    # An empty stream leaves this empty, which fails JSON parsing below and
    # yields the same parse-error response as before.
    model_text = ""
    if browser_agent.model.stream:
        # The text of the last streamed chunk is used.
        async for chunk in res:
            model_text = chunk.content[0]["text"]
    else:
        model_text = res.content[0]["text"]

    try:
        # Strip a Markdown code fence around the JSON, if present.
        if "```json" in model_text:
            model_text = model_text.replace("```json", "")
            model_text = model_text.replace("```", "")
        answer = json.loads(model_text).get("answer", "")
    except Exception:  # pylint: disable=broad-except
        return _error_response("Failed to parse answer from model output.")

    result_block = TextBlock(
        type="text",
        text=f"Video analysis completed.\nTask solution: {answer}",
    )
    return ToolResponse(content=[result_block])
def audio2text(audio_path: str) -> str:
    """Transcribe an audio file with DashScope ASR.

    Args:
        audio_path (str): Path of the audio file handed to the recognizer.

    Returns:
        str: The ``text`` of every recognized sentence, joined by spaces.

    Raises:
        RuntimeError: If ``dashscope.audio.asr`` cannot be imported.
    """
    # Imported lazily so dashscope stays an optional dependency.
    try:
        from dashscope.audio.asr import Recognition, RecognitionCallback
    except ImportError as exc:
        raise RuntimeError(
            "dashscope.audio is required for audio transcription.",
        ) from exc

    asr_callback = RecognitionCallback()
    asr = Recognition(
        model="paraformer-realtime-v1",
        format="wav",
        sample_rate=16000,
        callback=asr_callback,
    )
    response = asr.call(audio_path)
    output = response.get("output", {})
    texts = [item.get("text", "") for item in output.get("sentence", [])]
    return " ".join(texts)
def extract_frames(
    video_path: str,
    output_dir: str,
    max_frames: int = 16,
) -> List[str]:
    """Extract representative frames using ffmpeg (no OpenCV dependency).

    Frames are written to ``output_dir`` as ``frame_%04d.jpg``. When the
    video duration can be probed, the sampling rate is chosen so that about
    ``max_frames`` frames are spread over the whole video; otherwise one
    frame per second is taken.

    Args:
        video_path (str): Path of the video file to read.
        output_dir (str): Directory for the frames; created if missing.
        max_frames (int): Upper bound on the number of frames extracted.

    Returns:
        List[str]: Sorted paths of the ``frame_*.jpg`` files in
        ``output_dir``.

    Raises:
        ValueError: If ``max_frames`` is not positive.
        FileNotFoundError: If ``video_path`` does not exist.
        RuntimeError: If ffmpeg is not installed or no frame was produced.
        subprocess.CalledProcessError: If ffmpeg exits with an error.
    """
    if max_frames <= 0:
        raise ValueError("max_frames must be greater than zero.")
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video path not found: {video_path}")
    os.makedirs(output_dir, exist_ok=True)

    # Clean up previous generated frames
    for existing in Path(output_dir).glob("frame_*.jpg"):
        try:
            existing.unlink()
        except OSError:
            # Ignore errors during cleanup;
            # leftover files will be overwritten or do not affect frame extraction
            pass

    duration = _probe_video_duration(video_path)
    if duration and duration > 0:
        fps = max_frames / duration
    else:
        fps = 1.0
    # Cap the rate at 30 fps for very short clips. The lower bound is only a
    # guard against a zero rate: the former 0.1 fps floor, together with the
    # -frames:v limit, restricted sampling to the first max_frames / 0.1
    # seconds of longer videos.
    fps = max(min(fps, 30.0), 1e-6)

    command = [
        "ffmpeg",
        "-y",
        "-i",
        video_path,
        "-vf",
        # Enough decimals to represent the low rates of long videos.
        f"fps={fps:.8f}",
        "-frames:v",
        str(max_frames),
        os.path.join(output_dir, "frame_%04d.jpg"),
    ]
    try:
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except FileNotFoundError as exc:
        # Raised when the ffmpeg executable itself cannot be found.
        raise RuntimeError(
            "ffmpeg is required to extract frames from video.",
        ) from exc

    frame_files = sorted(
        str(path) for path in Path(output_dir).glob("frame_*.jpg")
    )
    if not frame_files:
        raise RuntimeError("No frames could be extracted from the video.")
    return frame_files
def extract_audio(video_path: str, audio_path: str) -> str:
    """Extract audio track with ffmpeg and save as wav.

    The audio is written as 16-bit PCM, 16 kHz, mono.

    Args:
        video_path (str): Path of the video file to read.
        audio_path (str): Destination path of the wav file; its parent
            directory is created if missing.

    Returns:
        str: ``audio_path``.

    Raises:
        FileNotFoundError: If ``video_path`` does not exist.
        RuntimeError: If ffmpeg is not installed.
        subprocess.CalledProcessError: If ffmpeg exits with an error.
    """
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video path not found: {video_path}")

    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the parent when there is one.
    parent_dir = os.path.dirname(audio_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    command = [
        "ffmpeg",
        "-y",
        "-i",
        video_path,
        "-vn",
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        audio_path,
    ]
    try:
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except FileNotFoundError as exc:
        # Raised when the ffmpeg executable itself cannot be found.
        raise RuntimeError(
            "ffmpeg is required to extract audio from video.",
        ) from exc
    return audio_path
def _probe_video_duration(video_path: str) -> Optional[float]:
    """Query ffprobe for the duration of a video, in seconds.

    Args:
        video_path (str): Path of the video file to probe.

    Returns:
        Optional[float]: The parsed duration, or ``None`` when ffprobe is
        missing, exits with an error, prints nothing, or prints a value
        that is not a number.
    """
    probe_cmd = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        video_path,
    ]
    try:
        completed = subprocess.run(
            probe_cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
        )
        raw_duration = completed.stdout.strip()
        return float(raw_duration) if raw_duration else None
    except (FileNotFoundError, ValueError, subprocess.CalledProcessError):
        return None
def _build_multimodal_blocks(
    frames: List[str],
    transcript: str,
    task: str,
) -> list:
    """Assemble the content blocks sent to the model.

    Args:
        frames (List[str]): Paths of the frame images to embed.
        transcript (str): Audio transcript text.
        task (str): The task the model has to solve.

    Returns:
        list: One base64 JPEG image block per frame, followed by a text
        block with the transcript and a text block with the task.
    """

    def _encode_frame(frame_path: str) -> Any:
        # Read the frame and embed it as a base64-encoded JPEG.
        with open(frame_path, "rb") as handle:
            encoded = b64encode(handle.read()).decode("ascii")
        return ImageBlock(
            type="image",
            source=Base64Source(
                type="base64",
                media_type="image/jpeg",
                data=encoded,
            ),
        )

    content: list = [_encode_frame(frame_path) for frame_path in frames]
    content.extend(
        [
            TextBlock(
                type="text",
                text=f"Audio transcript:\n{transcript}",
            ),
            TextBlock(
                type="text",
                text=f"The task to be solved is: {task}",
            ),
        ],
    )
    return content
def _prepare_workdir(browser_agent: Any) -> str:
    """Create and return a unique directory for intermediate artifacts.

    Args:
        browser_agent (Any): Its ``state_saving_dir`` attribute, when set
            and non-empty, is the base directory; otherwise the system
            temporary directory is used.

    Returns:
        str: ``<base>/video_understanding/<random hex>``, already created.
    """
    root = getattr(browser_agent, "state_saving_dir", None) or (
        tempfile.gettempdir()
    )
    run_dir = os.path.join(root, "video_understanding", uuid.uuid4().hex)
    os.makedirs(run_dir, exist_ok=True)
    return run_dir
def _error_response(message: str) -> ToolResponse:
    """Wrap an error message in a failed tool response.

    Args:
        message (str): Text shown to the caller.

    Returns:
        ToolResponse: A single text block holding ``message`` with
        ``metadata={"success": False}``.
    """
    error_block = TextBlock(type="text", text=message)
    return ToolResponse(
        content=[error_block],
        metadata={"success": False},
    )