chore: initialize sandbox and overwrite remote content
Some checks failed
Pre-commit / run (ubuntu-latest) (push) Has been cancelled
Deploy Sphinx documentation to Pages / build_en (ubuntu-latest, 3.10) (push) Has been cancelled
Deploy Sphinx documentation to Pages / build_zh (ubuntu-latest, 3.10) (push) Has been cancelled
Python Unittest Coverage / test (macos-15, 3.10) (push) Has been cancelled
Python Unittest Coverage / test (macos-15, 3.11) (push) Has been cancelled
Python Unittest Coverage / test (macos-15, 3.12) (push) Has been cancelled
Python Unittest Coverage / test (ubuntu-latest, 3.10) (push) Has been cancelled
Python Unittest Coverage / test (ubuntu-latest, 3.11) (push) Has been cancelled
Python Unittest Coverage / test (ubuntu-latest, 3.12) (push) Has been cancelled
Python Unittest Coverage / test (windows-latest, 3.10) (push) Has been cancelled
Python Unittest Coverage / test (windows-latest, 3.11) (push) Has been cancelled
Python Unittest Coverage / test (windows-latest, 3.12) (push) Has been cancelled
Some checks failed
Pre-commit / run (ubuntu-latest) (push) Has been cancelled
Deploy Sphinx documentation to Pages / build_en (ubuntu-latest, 3.10) (push) Has been cancelled
Deploy Sphinx documentation to Pages / build_zh (ubuntu-latest, 3.10) (push) Has been cancelled
Python Unittest Coverage / test (macos-15, 3.10) (push) Has been cancelled
Python Unittest Coverage / test (macos-15, 3.11) (push) Has been cancelled
Python Unittest Coverage / test (macos-15, 3.12) (push) Has been cancelled
Python Unittest Coverage / test (ubuntu-latest, 3.10) (push) Has been cancelled
Python Unittest Coverage / test (ubuntu-latest, 3.11) (push) Has been cancelled
Python Unittest Coverage / test (ubuntu-latest, 3.12) (push) Has been cancelled
Python Unittest Coverage / test (windows-latest, 3.10) (push) Has been cancelled
Python Unittest Coverage / test (windows-latest, 3.11) (push) Has been cancelled
Python Unittest Coverage / test (windows-latest, 3.12) (push) Has been cancelled
This commit is contained in:
238
examples/agent/browser_agent/build_in_helper/_file_download.py
Normal file
238
examples/agent/browser_agent/build_in_helper/_file_download.py
Normal file
@@ -0,0 +1,238 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Standalone file download skill for the browser agent."""
|
||||
# flake8: noqa: E501
|
||||
# pylint: disable=W0212,W0107,too-many-lines,C0301
|
||||
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import copy
|
||||
from typing import Any
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
from agentscope.memory import InMemoryMemory
|
||||
from agentscope.message import Msg, TextBlock
|
||||
from agentscope.tool import ToolResponse
|
||||
from agentscope.agent import ReActAgent
|
||||
|
||||
|
||||
# Directory one level above the folder that contains this module.
_CURRENT_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir)
)

# Read the default system prompt of the file download helper once, at import
# time; the markdown file is resolved relative to ``_CURRENT_DIR``.
with open(
    file=os.path.join(
        _CURRENT_DIR,
        "build_in_prompt/browser_agent_file_download_sys_prompt.md",
    ),
    mode="r",
    encoding="utf-8",
) as f:
    _FILE_DOWNLOAD_AGENT_SYS_PROMPT = f.read()
|
||||
|
||||
|
||||
class EmptyModel(BaseModel):
    """Field-less pydantic model.

    ``file_download`` passes this class as ``structured_model`` when it
    invokes the helper agent, so a structured-output schema is supplied even
    though no fields are requested.
    """
|
||||
|
||||
|
||||
class FileDownloadAgent(ReActAgent):
    """ReAct helper agent dedicated to downloading a file.

    The helper reuses the model, formatter and toolkit of an existing browser
    agent but is given its own ``InMemoryMemory`` instance.
    """

    # Name of the tool that produces the final summary.
    # NOTE(review): presumably read by ``ReActAgent`` to recognise the
    # finishing tool call -- confirm against the base class.
    finish_function_name: str = "file_download_final_response"

    def __init__(
        self,
        browser_agent: Any,
        sys_prompt: str = _FILE_DOWNLOAD_AGENT_SYS_PROMPT,
        max_iters: int = 15,
    ) -> None:
        """Build the helper on top of a parent browser agent.

        Args:
            browser_agent (Any): Agent supplying ``model``, ``formatter`` and
                ``toolkit``. Its ``name`` attribute, when present, prefixes
                the helper's name; otherwise ``"browser_agent"`` is used.
            sys_prompt (str): System prompt of the helper. Defaults to the
                prompt loaded from the bundled markdown file.
            max_iters (int): Forwarded to ``ReActAgent`` as ``max_iters``.
        """
        parent_name = getattr(browser_agent, "name", "browser_agent")
        super().__init__(
            name=f"{parent_name}_file_download",
            sys_prompt=sys_prompt,
            model=browser_agent.model,
            formatter=browser_agent.formatter,
            memory=InMemoryMemory(),
            toolkit=browser_agent.toolkit,
            max_iters=max_iters,
        )

        # Expose the summarising method as a tool of this agent.
        # NOTE(review): the toolkit object is the one handed over by the
        # browser agent, so the registration and removals below presumably
        # affect the parent's toolkit as well -- confirm this is intended.
        self.toolkit.register_tool_function(self.file_download_final_response)

        # Drop tools that would conflict with this helper. Removal is best
        # effort: a tool that is not registered must not abort construction.
        if hasattr(self.toolkit, "remove_tool_function"):
            for conflicting_tool in ("browser_pdf_save", "file_download"):
                try:
                    self.toolkit.remove_tool_function(conflicting_tool)
                except Exception:  # pylint: disable=broad-except
                    pass

    async def file_download_final_response(
        self,  # pylint: disable=W0613
        **kwargs: Any,  # pylint: disable=W0613
    ) -> ToolResponse:
        """Summarize the file download outcome."""
        # Extra user turn asking the model for the closing summary.
        hint_lines = (
            "Provide a concise summary of the file download attempt.",
            "Highlight these items:",
            "0. The original request",
            "1. The element(s) interacted with and actions taken",
            "2. The download status or any issues encountered",
            "3. Any follow-up recommendations or next steps",
        )
        hint_msg = Msg(
            "user",
            "".join(f"{line}\n" for line in hint_lines),
            role="user",
        )

        # Work on a deep copy so the agent's real memory is left untouched.
        history = copy.deepcopy(await self.memory.get_memory())
        if history:
            # Reduce the most recent message to its text blocks only.
            newest = history[-1]
            newest.content = newest.get_content_blocks("text")
            history[-1] = newest

        prompt = await self.formatter.format(
            msgs=[
                Msg("system", self.sys_prompt, "system"),
                *history,
                hint_msg,
            ],
        )

        response = await self.model(prompt)

        if not self.model.stream:
            summary_text = response.content[0]["text"]
        else:
            # Every chunk overwrites the previous value, so the text of the
            # last chunk is the one that is kept.
            summary_text = ""
            async for chunk in response:
                summary_text = chunk.content[0]["text"]

        summary_text = summary_text or "No summary generated."

        return ToolResponse(
            content=[
                TextBlock(
                    type="text",
                    text="File download summary generated. " + summary_text,
                ),
            ],
            metadata={
                "success": True,
                "structured_output": {
                    "task_done": True,
                    "subtask_progress_summary": summary_text,
                    "generated_files": {},
                },
            },
            is_last=True,
        )
|
||||
|
||||
|
||||
def _build_initial_instruction(
    target_description: str,
    snapshot_text: str,
) -> str:
    """Build the first user instruction for the download helper agent.

    Args:
        target_description (str): Description of the file to download,
            inserted verbatim.
        snapshot_text (str): Page snapshot text, inserted verbatim.

    Returns:
        str: Four sections (goal, target description, snapshot, closing
        guidance) separated by blank lines.
    """
    sections = (
        "You must locate and trigger the download for the requested file.",
        f"Target description provided by the user:\n{target_description}",
        f"Latest snapshot captured prior to your run:\n{snapshot_text}",
        "Follow the sys prompt guidance, think step-by-step, and verify that "
        "the download action succeeded. If the download cannot be completed, "
        "explain why in the final summary.",
    )
    return "\n\n".join(sections)
|
||||
|
||||
|
||||
async def file_download(
    browser_agent: Any,
    target_description: str,
) -> ToolResponse:
    """
    Download the target file. The current page should
    contain download-related element.

    Args:
        target_description (str): The description of the
            target file to download.

    Returns:
        ToolResponse: A structured response containing
            the download directory.
    """
    # NOTE(review): the docstring above is kept verbatim because it may be
    # consumed as the tool description. What the code actually returns is the
    # helper agent's text summary together with the helper's metadata.

    # Take a snapshot of the current page first. A snapshot failure is not
    # fatal: the error text is handed to the helper agent instead.
    try:
        snapshot_chunks = await browser_agent._get_snapshot_in_text()
    except Exception as exc:  # pylint: disable=broad-except
        snapshot_chunks = []
        snapshot_error = str(exc)
    else:
        snapshot_error = ""

    snapshot_text = "\n\n---\n\n".join(snapshot_chunks)
    if snapshot_error and not snapshot_text:
        snapshot_text = f"[Snapshot failed: {snapshot_error}]"

    sub_agent = FileDownloadAgent(browser_agent)
    instruction = _build_initial_instruction(
        target_description=target_description,
        snapshot_text=snapshot_text,
    )

    init_msg = Msg(
        name="user",
        role="user",
        content=instruction,
    )

    try:
        sub_agent_response_msg = await sub_agent.reply(
            init_msg,
            structured_model=EmptyModel,
        )

        # Pull the textual summary out of the reply. Content may be a plain
        # string (indexing it would only return its first character) or a
        # sequence of blocks whose first entry is not necessarily text, so
        # handle the string case and otherwise take the first block that
        # carries non-empty text.
        reply_content = sub_agent_response_msg.content
        text_content = ""
        if isinstance(reply_content, str):
            text_content = reply_content
        else:
            for block in reply_content or []:
                if isinstance(block, dict):
                    candidate = block.get("text")
                else:
                    candidate = getattr(block, "text", None)
                if isinstance(candidate, str) and candidate:
                    text_content = candidate
                    break

        if not text_content:
            text_content = (
                "File download agent finished without a textual summary."
            )

        return ToolResponse(
            metadata=sub_agent_response_msg.metadata,
            content=[
                TextBlock(
                    type="text",
                    text=text_content,
                ),
            ],
        )
    except Exception as exc:  # pylint: disable=broad-except
        # Any failure of the helper run is reported as a failed tool call
        # rather than propagated to the caller.
        return ToolResponse(
            content=[
                TextBlock(
                    type="text",
                    text=f"Tool call Error. Cannot be executed. {exc}",
                ),
            ],
            metadata={"success": False},
            is_last=True,
        )
|
||||
221
examples/agent/browser_agent/build_in_helper/_form_filling.py
Normal file
221
examples/agent/browser_agent/build_in_helper/_form_filling.py
Normal file
@@ -0,0 +1,221 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Standalone form filling skill for the browser agent."""
|
||||
# flake8: noqa: E501
|
||||
# pylint: disable=W0212,W0107,too-many-lines,C0301
|
||||
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import copy
|
||||
from typing import Any
|
||||
from pydantic import BaseModel
|
||||
|
||||
from agentscope.memory import InMemoryMemory
|
||||
from agentscope.message import Msg, TextBlock
|
||||
from agentscope.tool import ToolResponse
|
||||
from agentscope.agent import ReActAgent
|
||||
|
||||
# Directory one level above the folder that contains this module.
_CURRENT_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir)
)

# Read the default system prompt of the form filling helper once, at import
# time; the markdown file is resolved relative to ``_CURRENT_DIR``.
with open(
    file=os.path.join(
        _CURRENT_DIR,
        "build_in_prompt/browser_agent_form_filling_sys_prompt.md",
    ),
    mode="r",
    encoding="utf-8",
) as f:
    _FORM_FILL_AGENT_SYS_PROMPT = f.read()
|
||||
|
||||
|
||||
class EmptyModel(BaseModel):
    """Field-less pydantic model.

    ``form_filling`` passes this class as ``structured_model`` when it
    invokes the helper agent, so a structured-output schema is supplied even
    though no fields are requested.
    """
|
||||
|
||||
|
||||
class FormFillingAgent(ReActAgent):
    """Lightweight helper agent that fills forms.

    The helper reuses the model, formatter and toolkit of an existing browser
    agent but is given its own ``InMemoryMemory`` instance.
    """

    # Name of the tool that produces the final summary.
    # NOTE(review): presumably read by ``ReActAgent`` to recognise the
    # finishing tool call -- confirm against the base class.
    finish_function_name: str = "form_filling_final_response"

    def __init__(
        self,
        browser_agent: Any,
        sys_prompt: str = _FORM_FILL_AGENT_SYS_PROMPT,
        max_iters: int = 20,
    ) -> None:
        """Build the helper on top of a parent browser agent.

        Args:
            browser_agent (Any): Agent supplying ``model``, ``formatter`` and
                ``toolkit``. Its ``name`` attribute, when present, prefixes
                the helper's name; otherwise ``"browser_agent"`` is used.
            sys_prompt (str): System prompt of the helper. Defaults to the
                prompt loaded from the bundled markdown file.
            max_iters (int): Forwarded to ``ReActAgent`` as ``max_iters``.
        """
        name = f"{getattr(browser_agent, 'name', 'browser_agent')}_form_fill"
        super().__init__(
            name=name,
            sys_prompt=sys_prompt,
            model=browser_agent.model,
            formatter=browser_agent.formatter,
            memory=InMemoryMemory(),
            toolkit=browser_agent.toolkit,
            max_iters=max_iters,
        )
        # Register the finish function
        self.toolkit.register_tool_function(self.form_filling_final_response)

    async def form_filling_final_response(
        self,  # pylint: disable=W0613
        **kwargs: Any,  # pylint: disable=W0613
    ) -> ToolResponse:
        """Summarize the form filling outcome."""
        # Extra user turn asking the model for the closing summary.
        hint_msg = Msg(
            "user",
            (
                "Provide a concise summary of the completed form "
                "filling task.\n"
                "Highlight these items:\n"
                "0. The original task/query\n"
                "1. Which fields were filled/selected and their final values\n"
                "2. Any important observations or follow-up notes\n"
                "3. Confirmation that if the task is complete\n\n"
            ),
            role="user",
        )

        # Work on a deep copy so the agent's real memory is left untouched.
        memory_msgs = await self.memory.get_memory()
        memory_msgs_copy = copy.deepcopy(memory_msgs)
        # Guard against an empty history: indexing [-1] on an empty list
        # would raise IndexError before any summary could be produced.
        if memory_msgs_copy:
            # Reduce the most recent message to its text blocks only.
            last_msg = memory_msgs_copy[-1]
            last_msg.content = last_msg.get_content_blocks("text")
            memory_msgs_copy[-1] = last_msg

        prompt = await self.formatter.format(
            msgs=[
                Msg("system", self.sys_prompt, "system"),
                *memory_msgs_copy,
                hint_msg,
            ],
        )

        res = await self.model(prompt)

        if self.model.stream:
            # Every chunk overwrites the previous value, so the text of the
            # last chunk is the one that is kept.
            summary_text = ""
            async for chunk in res:
                summary_text = chunk.content[0]["text"]
        else:
            summary_text = res.content[0]["text"]

        # An empty or missing summary would otherwise produce a blank report
        # (or a TypeError in the string concatenation below when it is None).
        summary_text = summary_text or "No summary generated."

        structure_response = {
            "task_done": True,
            "subtask_progress_summary": summary_text,
            "generated_files": {},
        }

        return ToolResponse(
            content=[
                TextBlock(
                    type="text",
                    text="Form filling summary generated. " + summary_text,
                ),
            ],
            metadata={
                "success": True,
                "structured_output": structure_response,
            },
            is_last=True,
        )
|
||||
|
||||
|
||||
def _build_initial_instruction(
    fill_information: str,
    snapshot_text: str,
) -> str:
    """Build the first user instruction for the form filling helper agent.

    Args:
        fill_information (str): Plain-text fill instructions, inserted
            verbatim.
        snapshot_text (str): Page snapshot text, inserted verbatim.

    Returns:
        str: Goal, fill instructions and snapshot, each followed by a blank
        line (the result therefore ends with two newlines).
    """
    goal = (
        "You must complete the web form using the information "
        "provided below."
    )
    fill_section = (
        f"Fill instructions (plain text from the user):\n{fill_information}"
    )
    snapshot_section = (
        f"Latest snapshot captured prior to your run:\n{snapshot_text}"
    )
    # The trailing empty item reproduces the final blank line.
    return "\n\n".join((goal, fill_section, snapshot_section, ""))
|
||||
|
||||
|
||||
async def form_filling(
    browser_agent: Any,
    fill_information: str,
) -> ToolResponse:
    """
    Fill in a web form according to plain-text instructions.

    Args:
        fill_information (str):
            Plain-text description of the values that
            must be entered into the form,
            including any submission requirements.

    Returns:
        ToolResponse: Summary of the helper agent execution and status.
    """
    # Take a snapshot of the current page first. A snapshot failure is not
    # fatal: the error text is handed to the helper agent instead.
    try:
        snapshot_chunks = (
            await browser_agent._get_snapshot_in_text()
        )  # pylint: disable=protected-access
    except Exception as exc:  # pylint: disable=broad-except
        snapshot_chunks = []
        snapshot_error = str(exc)
    else:
        snapshot_error = ""

    snapshot_text = "\n\n---\n\n".join(snapshot_chunks)
    if snapshot_error and not snapshot_text:
        snapshot_text = f"[Snapshot failed: {snapshot_error}]"

    sub_agent = FormFillingAgent(browser_agent)
    instruction = _build_initial_instruction(
        fill_information=fill_information,
        snapshot_text=snapshot_text,
    )

    init_msg = Msg(
        name="user",
        role="user",
        content=instruction,
    )

    try:
        sub_agent_response_msg = await sub_agent.reply(
            init_msg,
            structured_model=EmptyModel,
        )

        # Pull the textual summary out of the reply. Content may be a plain
        # string (indexing it would only return its first character) or a
        # sequence of blocks whose first entry is not necessarily text, so
        # handle the string case and otherwise take the first block that
        # carries non-empty text.
        reply_content = sub_agent_response_msg.content
        text_content = ""
        if isinstance(reply_content, str):
            text_content = reply_content
        else:
            for block in reply_content or []:
                if isinstance(block, dict):
                    candidate = block.get("text")
                else:
                    candidate = getattr(block, "text", None)
                if isinstance(candidate, str) and candidate:
                    text_content = candidate
                    break

        if not text_content:
            text_content = (
                "Form filling agent finished without a textual summary."
            )

        return ToolResponse(
            metadata=sub_agent_response_msg.metadata,
            content=[
                TextBlock(
                    type="text",
                    text=text_content,
                ),
            ],
        )
    except Exception as e:  # pylint: disable=broad-except
        # Any failure of the helper run is reported as a failed tool call
        # rather than propagated to the caller.
        return ToolResponse(
            content=[
                TextBlock(
                    type="text",
                    text=f"Tool call Error. Cannot be executed. {e}",
                ),
            ],
            metadata={"success": False},
            is_last=True,
        )
|
||||
@@ -0,0 +1,161 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Standalone image understanding skill for the browser agent."""
|
||||
# flake8: noqa: E501
|
||||
# pylint: disable=W0212
|
||||
# pylint: disable=too-many-lines
|
||||
# pylint: disable=C0301
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from typing import Any
|
||||
|
||||
from agentscope.message import (
|
||||
Base64Source,
|
||||
ImageBlock,
|
||||
Msg,
|
||||
TextBlock,
|
||||
ToolUseBlock,
|
||||
)
|
||||
from agentscope.tool import ToolResponse
|
||||
|
||||
|
||||
async def image_understanding(
    browser_agent: Any,
    object_description: str,
    task: str,
) -> ToolResponse:
    """
    Locate an element and solve a visual task on the current webpage.

    Args:
        object_description (str): The description of the object to locate.
        task (str): The specific task or question to solve about the image
            (e.g., description, object detection, activity recognition, or
            answering a question about the image's content).

    Returns:
        ToolResponse: A structured response containing the answer to
            the specified task based on the image content.
    """

    # Step 1: ask the model which snapshot element matches the description.
    sys_prompt = (
        "You are a web page analysis expert. Given the following page "
        "snapshot and object description, "
        "identify the exact element and its reference string (ref) "
        "that matches the description. "
        "Return ONLY a JSON object: "
        '{"element": <element description>, "ref": <ref string>}'
    )

    snapshot_chunks = (
        await browser_agent._get_snapshot_in_text()  # noqa: E501  # pylint: disable=protected-access
    )
    # Only the first snapshot chunk is used as page context.
    page_snapshot = snapshot_chunks[0] if snapshot_chunks else ""
    user_prompt = (
        f"Object description: {object_description}\n"
        f"Page snapshot:\n{page_snapshot}"
    )

    prompt = await browser_agent.formatter.format(
        msgs=[
            Msg("system", sys_prompt, role="system"),
            Msg("user", user_prompt, role="user"),
        ],
    )
    res = await browser_agent.model(prompt)
    # Start from an empty string: a stream that yields no chunks then falls
    # through to the parse-failure response below instead of raising
    # UnboundLocalError.
    model_text = ""
    if browser_agent.model.stream:
        # Every chunk overwrites the previous value; the last one is kept.
        async for chunk in res:
            model_text = chunk.content[0]["text"]
    else:
        model_text = res.content[0]["text"]

    try:
        # Strip a markdown code fence if the model wrapped its JSON in one.
        if "```json" in model_text:
            model_text = model_text.replace("```json", "").replace(
                "```",
                "",
            )
        element_info = json.loads(model_text)
        element = element_info.get("element", "")
        ref = element_info.get("ref", "")
    except Exception:  # pylint: disable=broad-except
        # Covers invalid JSON as well as JSON that is not an object.
        return ToolResponse(
            content=[
                TextBlock(
                    type="text",
                    text="Failed to parse element/ref from model output.",
                ),
            ],
            metadata={"success": False},
        )

    # Step 2: screenshot the located element through the browser toolkit.
    screenshot_tool_call = ToolUseBlock(
        id=str(uuid.uuid4()),
        name="browser_take_screenshot",
        input={"element": element, "ref": ref},
        type="tool_use",
    )
    screenshot_response = await browser_agent.toolkit.call_tool_function(
        screenshot_tool_call,
    )
    image_data = None
    async for chunk in screenshot_response:
        # The image payload is read from the second content block, under
        # its "data" key, when such a block is present.
        if (
            chunk.content
            and len(chunk.content) > 1
            and "data" in chunk.content[1]
        ):
            image_data = chunk.content[1]["data"]

    # Step 3: solve the task from the description, snapshot and screenshot.
    sys_prompt_task = (
        "You are a web automation expert. "
        "Given the object description, screenshot, and page context, "
        "solve the following task. Return ONLY the answer as plain text."
    )
    content_blocks = [
        TextBlock(
            type="text",
            text=(
                "Object description: "
                f"{object_description}\nTask: {task}\n"
                f"Page snapshot:\n{page_snapshot}"
            ),
        ),
    ]

    # The screenshot is optional; without it the model only sees the text.
    if image_data:
        image_block = ImageBlock(
            type="image",
            source=Base64Source(
                type="base64",
                media_type="image/png",
                data=image_data,
            ),
        )
        content_blocks.append(image_block)

    prompt_task = await browser_agent.formatter.format(
        msgs=[
            Msg("system", sys_prompt_task, role="system"),
            Msg("user", content_blocks, role="user"),
        ],
    )
    res_task = await browser_agent.model(prompt_task)
    # Same safeguard as above: never leave the answer unbound when the
    # stream produces no chunk.
    answer_text = ""
    if browser_agent.model.stream:
        async for chunk in res_task:
            answer_text = chunk.content[0]["text"]
    else:
        answer_text = res_task.content[0]["text"]

    return ToolResponse(
        content=[
            TextBlock(
                type="text",
                text=(
                    f"Screenshot taken for element: {element}\nref: {ref}\n"
                    f"Task solution: {answer_text}"
                ),
            ),
        ],
    )
|
||||
@@ -0,0 +1,330 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Standalone video understanding skill for the browser agent."""
|
||||
# flake8: noqa: E501
|
||||
# pylint: disable=W0212
|
||||
# pylint: disable=too-many-lines
|
||||
# pylint: disable=C0301
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import uuid
|
||||
from base64 import b64encode
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional
|
||||
|
||||
from agentscope.message import (
|
||||
Base64Source,
|
||||
ImageBlock,
|
||||
Msg,
|
||||
TextBlock,
|
||||
)
|
||||
from agentscope.tool import ToolResponse
|
||||
|
||||
|
||||
async def video_understanding(
    browser_agent: Any,
    video_path: str,
    task: str,
) -> ToolResponse:
    """
    Perform video understanding on the provided video file.

    Args:
        video_path (str): The path to the video file to analyze.
        task (str): The specific task or question to solve about
            the video (e.g., summary, object detection, activity recognition,
            or answering a question about the video's content).

    Returns:
        ToolResponse: A structured response containing the answer
            to the specified task based on the video content.
    """

    # Each preparation step reports its failure as an error response
    # instead of raising to the caller.
    workdir = _prepare_workdir(browser_agent)
    try:
        frames_dir = os.path.join(workdir, "frames")
        frames = extract_frames(video_path, frames_dir)
    except Exception as exc:  # pylint: disable=broad-except
        return _error_response(f"Failed to extract frames: {exc}")

    # The audio file name carries the agent's ``iter_n`` when it has one.
    audio_path = os.path.join(
        workdir,
        f"audio_{getattr(browser_agent, 'iter_n', 0)}.wav",
    )
    try:
        extract_audio(video_path, audio_path)
    except Exception as exc:  # pylint: disable=broad-except
        return _error_response(f"Failed to extract audio: {exc}")

    try:
        transcript = audio2text(audio_path)
    except Exception as exc:  # pylint: disable=broad-except
        return _error_response(f"Failed to transcribe audio: {exc}")

    sys_prompt = (
        "You are a web video analysis expert. "
        "Given the following video frames and audio transcript, "
        "analyze the content and provide a solution to the task. "
        'Return ONLY a JSON object: {"answer": <your answer>}'
    )

    content_blocks = _build_multimodal_blocks(frames, transcript, task)

    prompt = await browser_agent.formatter.format(
        msgs=[
            Msg("system", sys_prompt, role="system"),
            Msg("user", content_blocks, role="user"),
        ],
    )

    res = await browser_agent.model(prompt)
    # Start from an empty string: a stream that yields no chunks then falls
    # through to the parse-failure response below instead of raising
    # UnboundLocalError.
    model_text = ""
    if browser_agent.model.stream:
        # Every chunk overwrites the previous value; the last one is kept.
        async for chunk in res:
            model_text = chunk.content[0]["text"]
    else:
        model_text = res.content[0]["text"]

    try:
        # Strip a markdown code fence if the model wrapped its JSON in one.
        if "```json" in model_text:
            model_text = model_text.replace("```json", "").replace(
                "```",
                "",
            )
        answer_info = json.loads(model_text)
        answer = answer_info.get("answer", "")
    except Exception:  # pylint: disable=broad-except
        # Covers invalid JSON as well as JSON that is not an object.
        return _error_response("Failed to parse answer from model output.")

    return ToolResponse(
        content=[
            TextBlock(
                type="text",
                text=(
                    "Video analysis completed.\n" f"Task solution: {answer}"
                ),
            ),
        ],
    )
|
||||
|
||||
|
||||
def audio2text(audio_path: str) -> str:
    """Convert audio to text using DashScope ASR.

    Args:
        audio_path (str): Path of the audio file handed to the recognizer.
            The recognizer is configured for ``wav`` input at 16000 Hz.

    Returns:
        str: The ``text`` fields of the recognized sentences joined by single
        spaces; an empty string when the result holds no sentences.

    Raises:
        RuntimeError: If ``dashscope.audio.asr`` cannot be imported.
    """

    try:  # Local import to avoid hard dependency when unused.
        from dashscope.audio.asr import Recognition, RecognitionCallback
    except ImportError as exc:
        raise RuntimeError(
            "dashscope.audio is required for audio transcription.",
        ) from exc

    callback = RecognitionCallback()
    recognizer = Recognition(
        model="paraformer-realtime-v1",
        format="wav",
        sample_rate=16000,
        callback=callback,
    )

    result = recognizer.call(audio_path)
    # ``get(key, default)`` only applies the default when the key is absent;
    # an explicit null "output" or "sentence" must not crash the lookup.
    output = result.get("output") or {}
    sentences = output.get("sentence") or []
    # NOTE(review): a single sentence may presumably be returned as one dict
    # rather than a list -- confirm against the DashScope result schema.
    if isinstance(sentences, dict):
        sentences = [sentences]
    return " ".join(sentence.get("text", "") for sentence in sentences)
|
||||
|
||||
|
||||
def extract_frames(
    video_path: str,
    output_dir: str,
    max_frames: int = 16,
) -> List[str]:
    """Sample frames from a video into JPEG files with ffmpeg.

    Args:
        video_path (str): Path of the source video; must exist.
        output_dir (str): Directory receiving ``frame_NNNN.jpg`` files. It is
            created when missing and old ``frame_*.jpg`` files are removed.
        max_frames (int): Upper bound on the number of frames written.

    Returns:
        List[str]: Sorted paths of the frame files found in ``output_dir``.

    Raises:
        ValueError: If ``max_frames`` is not positive.
        FileNotFoundError: If ``video_path`` does not exist.
        RuntimeError: If the ffmpeg executable is missing or no frame file
            was produced.
        subprocess.CalledProcessError: If ffmpeg exits with an error.
    """
    if max_frames <= 0:
        raise ValueError("max_frames must be greater than zero.")
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video path not found: {video_path}")

    os.makedirs(output_dir, exist_ok=True)
    frames_root = Path(output_dir)

    # Best-effort removal of frames left behind by an earlier run.
    for stale_frame in frames_root.glob("frame_*.jpg"):
        try:
            stale_frame.unlink()
        except OSError:
            continue

    # Spread max_frames over the probed duration; use 1 fps when the
    # duration is unknown. The rate is then clamped to [0.1, 30.0].
    duration = _probe_video_duration(video_path)
    sampling_rate = max_frames / duration if duration and duration > 0 else 1.0
    sampling_rate = max(min(sampling_rate, 30.0), 0.1)

    ffmpeg_cmd = ["ffmpeg", "-y", "-i", video_path]
    ffmpeg_cmd += ["-vf", f"fps={sampling_rate:.5f}"]
    ffmpeg_cmd += ["-frames:v", str(max_frames)]
    ffmpeg_cmd.append(os.path.join(output_dir, "frame_%04d.jpg"))

    try:
        subprocess.run(
            ffmpeg_cmd,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except FileNotFoundError as exc:
        # Raised when the ffmpeg executable itself cannot be found.
        raise RuntimeError(
            "ffmpeg is required to extract frames from video.",
        ) from exc

    produced = sorted(map(str, frames_root.glob("frame_*.jpg")))
    if not produced:
        raise RuntimeError("No frames could be extracted from the video.")
    return produced
|
||||
|
||||
|
||||
def extract_audio(video_path: str, audio_path: str) -> str:
    """Extract audio track with ffmpeg and save as wav.

    The ffmpeg command drops the video stream and writes 16-bit PCM, mono,
    at 16000 Hz.

    Args:
        video_path (str): Path of the source video; must exist.
        audio_path (str): Destination path of the wav file. Its parent
            directory is created when missing.

    Returns:
        str: ``audio_path``, unchanged.

    Raises:
        FileNotFoundError: If ``video_path`` does not exist.
        RuntimeError: If the ffmpeg executable is missing.
        subprocess.CalledProcessError: If ffmpeg exits with an error.
    """

    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video path not found: {video_path}")

    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError; only create the parent when there is one.
    parent_dir = os.path.dirname(audio_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    command = [
        "ffmpeg",
        "-y",
        "-i",
        video_path,
        "-vn",
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        audio_path,
    ]

    try:
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except FileNotFoundError as exc:
        # Raised when the ffmpeg executable itself cannot be found.
        raise RuntimeError(
            "ffmpeg is required to extract audio from video.",
        ) from exc

    return audio_path
|
||||
|
||||
|
||||
def _probe_video_duration(video_path: str) -> Optional[float]:
    """Query ffprobe for the duration of a video, in seconds.

    Args:
        video_path (str): Path passed to ffprobe.

    Returns:
        Optional[float]: The parsed duration, or ``None`` when ffprobe is not
        installed, exits with an error, prints nothing, or prints a value
        that is not a float.
    """
    probe_cmd = ["ffprobe", "-v", "error"]
    probe_cmd += ["-show_entries", "format=duration"]
    probe_cmd += ["-of", "default=noprint_wrappers=1:nokey=1"]
    probe_cmd.append(video_path)

    try:
        completed = subprocess.run(
            probe_cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
        )
        raw_duration = completed.stdout.strip()
        return float(raw_duration) if raw_duration else None
    except (FileNotFoundError, ValueError, subprocess.CalledProcessError):
        return None
|
||||
|
||||
|
||||
def _build_multimodal_blocks(
    frames: List[str],
    transcript: str,
    task: str,
) -> list:
    """Assemble the user-message content for the video analysis prompt.

    Args:
        frames (List[str]): Paths of frame images; each file is read and
            embedded as a base64 ``image/jpeg`` block.
        transcript (str): Audio transcript text.
        task (str): The task to be solved.

    Returns:
        list: One image block per frame, in input order, followed by a text
        block with the transcript and a text block with the task.
    """

    def _encode_frame(frame_path: str) -> ImageBlock:
        """Read one frame file and wrap it in a base64 image block."""
        with open(frame_path, "rb") as handle:
            encoded = b64encode(handle.read()).decode("ascii")
        return ImageBlock(
            type="image",
            source=Base64Source(
                type="base64",
                media_type="image/jpeg",
                data=encoded,
            ),
        )

    blocks: list = [_encode_frame(frame_path) for frame_path in frames]
    blocks.extend(
        [
            TextBlock(type="text", text=f"Audio transcript:\n{transcript}"),
            TextBlock(type="text", text=f"The task to be solved is: {task}"),
        ],
    )
    return blocks
|
||||
|
||||
|
||||
def _prepare_workdir(browser_agent: Any) -> str:
    """Create a fresh directory for the artifacts of one analysis run.

    Args:
        browser_agent (Any): Object whose ``state_saving_dir`` attribute, when
            present and truthy, is used as the base directory; otherwise the
            system temporary directory is used.

    Returns:
        str: ``<base>/video_understanding/<random hex>``, created on disk.
    """
    root_dir = (
        getattr(browser_agent, "state_saving_dir", None)
        or tempfile.gettempdir()
    )
    # A random hex component gives every call its own directory.
    run_dir = os.path.join(root_dir, "video_understanding", uuid.uuid4().hex)
    os.makedirs(run_dir, exist_ok=True)
    return run_dir
|
||||
|
||||
|
||||
def _error_response(message: str) -> ToolResponse:
    """Wrap an error message in a failed ``ToolResponse``.

    Args:
        message (str): Text placed in the single text block of the response.

    Returns:
        ToolResponse: Response whose metadata is ``{"success": False}``.
    """
    failure_block = TextBlock(type="text", text=message)
    return ToolResponse(
        content=[failure_block],
        metadata={"success": False},
    )
|
||||
Reference in New Issue
Block a user