fix: harden alicdn image downloads
This commit is contained in:
@@ -2,6 +2,7 @@ import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
@@ -25,6 +26,30 @@ AUTO_PROCESS_CATEGORY = os.getenv("AUTO_PROCESS_CATEGORY", "设计素材")
|
||||
# Root directory for auto-processed output. The AUTO_PROCESS_ROOT environment
# variable overrides the default, which is "<parents[1] of this file>/runtime/auto_processed".
AUTO_PROCESS_ROOT = Path(
    os.environ.get(
        "AUTO_PROCESS_ROOT",
        str(Path(__file__).resolve().parents[1] / "runtime" / "auto_processed"),
    )
)
|
||||
# Referer values tried, in order, when downloading an image.
_DOWNLOAD_REFERERS = (
    "https://www.taobao.com/",
    "https://item.taobao.com/",
    "https://detail.tmall.com/",
)

# Browser-like request headers sent with every download attempt; the
# per-attempt "Referer" header is added on a copy of this mapping.
_DOWNLOAD_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/133.0.0.0 Safari/537.36"
    ),
    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
}

# Maps a lower-cased MIME type (without parameters) to the file suffix used
# when saving the downloaded image. "image/jpg" is a non-standard alias of
# "image/jpeg" and maps to the same suffix.
_CONTENT_TYPE_SUFFIX = {
    "image/jpeg": ".jpg",
    "image/jpg": ".jpg",
    "image/png": ".png",
    "image/webp": ".webp",
    "image/avif": ".avif",
    "image/gif": ".gif",
}
|
||||
|
||||
|
||||
def _safe_name(text: str, fallback: str = "image") -> str:
|
||||
@@ -55,13 +80,56 @@ class AutoImagePipelineService:
|
||||
def __init__(self):
    """Create the service together with the ``CustomerDatabase`` instance it uses."""
    self.customer_db = CustomerDatabase()
|
||||
|
||||
@staticmethod
def _resolve_download_path(dest_path: Path, content_type: str, image_url: str) -> Path:
    """Return ``dest_path`` with a suffix matching the downloaded image type.

    The suffix is chosen from the first source that yields a known type:

    1. the response ``content_type`` (parameters after ``;`` are ignored),
    2. the MIME type guessed from ``image_url``,
    3. the suffix ``dest_path`` already has,
    4. ``".bin"`` as a last resort.

    Args:
        dest_path: Requested destination; only its suffix may be replaced.
        content_type: Raw ``Content-Type`` header value (may be empty/None).
        image_url: URL the image was fetched from (may be empty/None).

    Returns:
        ``dest_path`` with the resolved suffix applied.
    """
    # Keep only the media type, e.g. "image/png; charset=x" -> "image/png".
    mime = str(content_type or "").partition(";")[0].strip().lower()
    extension = _CONTENT_TYPE_SUFFIX.get(mime, "")

    if not extension:
        # Header was missing or not a known image type: fall back to the URL.
        url_mime = mimetypes.guess_type(str(image_url or ""))[0]
        extension = _CONTENT_TYPE_SUFFIX.get(str(url_mime or "").lower(), "")

    if not extension:
        extension = dest_path.suffix or ".bin"

    return dest_path.with_suffix(extension)
|
||||
|
||||
async def _download_image(self, image_url: str, dest_path: Path) -> Path:
    """Download ``image_url`` to disk, retrying with several referers.

    Each referer in ``_DOWNLOAD_REFERERS`` is tried up to three times. After
    a failed attempt (other than the last one for a referer) the coroutine
    sleeps ``attempt`` seconds before retrying. The file suffix is derived
    from the response via ``_resolve_download_path``, so the returned path
    may differ from ``dest_path`` in its suffix.

    Args:
        image_url: URL of the image to fetch.
        dest_path: Desired destination; parent directories are created.

    Returns:
        The path the image bytes were actually written to.

    Raises:
        RuntimeError: If every referer/attempt combination failed. The last
            underlying error is attached as ``__cause__``.
    """
    dest_path.parent.mkdir(parents=True, exist_ok=True)

    # 60s default for each timeout phase, but only 20s to establish a connection.
    timeout = httpx.Timeout(60.0, connect=20.0)
    max_attempts = 3
    last_error: Optional[Exception] = None

    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        for referer in _DOWNLOAD_REFERERS:
            # Headers only vary by referer, so build them once per referer.
            headers = {**_DOWNLOAD_HEADERS, "Referer": referer}
            for attempt in range(1, max_attempts + 1):
                try:
                    response = await client.get(image_url, headers=headers)
                    # These statuses are reported with a dedicated "blocked"
                    # message (presumably anti-hotlink / rate-limit replies).
                    if response.status_code in (403, 420, 429):
                        raise httpx.HTTPStatusError(
                            f"download blocked status={response.status_code}",
                            request=response.request,
                            response=response,
                        )
                    response.raise_for_status()
                    # A successful status with no payload is not a usable
                    # image; treat it as a failed attempt instead of saving
                    # a zero-byte file.
                    if not response.content:
                        raise ValueError(
                            f"empty response body status={response.status_code}"
                        )
                    resolved_path = self._resolve_download_path(
                        dest_path,
                        response.headers.get("content-type", ""),
                        image_url,
                    )
                    resolved_path.write_bytes(response.content)
                    logger.info(
                        f"[AutoImagePipeline] 图片下载成功 status={response.status_code} "
                        f"referer={referer} path={resolved_path}"
                    )
                    return resolved_path
                except Exception as exc:
                    # Retry boundary: remember the failure, log it, and move
                    # on to the next attempt / referer.
                    last_error = exc
                    logger.warning(
                        f"[AutoImagePipeline] 图片下载失败 attempt={attempt}/{max_attempts} "
                        f"referer={referer} url={image_url} err={exc}"
                    )
                    if attempt < max_attempts:
                        # Linear backoff: 1s, then 2s.
                        await asyncio.sleep(attempt)

    raise RuntimeError(f"下载原图失败: {last_error}") from last_error
|
||||
|
||||
@staticmethod
|
||||
def _format_transfer_notice(
|
||||
|
||||
Reference in New Issue
Block a user