fix: harden alicdn image downloads

This commit is contained in:
2026-03-09 10:51:12 +08:00
parent 2ab27eb914
commit bcd162ef22
2 changed files with 147 additions and 5 deletions

View File

@@ -2,6 +2,7 @@ import asyncio
import hashlib
import json
import logging
import mimetypes
import os
import re
from pathlib import Path
@@ -25,6 +26,30 @@ AUTO_PROCESS_CATEGORY = os.getenv("AUTO_PROCESS_CATEGORY", "设计素材")
# Root directory for auto-processed output. The AUTO_PROCESS_ROOT environment
# variable overrides the default, which is "runtime/auto_processed" under the
# parent of the directory containing this file.
AUTO_PROCESS_ROOT = Path(
    os.getenv(
        "AUTO_PROCESS_ROOT",
        str(Path(__file__).resolve().parents[1] / "runtime" / "auto_processed"),
    )
)

# Referer values tried in order when downloading an image.
_DOWNLOAD_REFERERS = (
    "https://www.taobao.com/",
    "https://item.taobao.com/",
    "https://detail.tmall.com/",
)

# Browser-like request headers sent with every image download; the Referer
# header is added per request by the downloader.
_DOWNLOAD_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/133.0.0.0 Safari/537.36"
    ),
    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
}

# Maps a lower-cased image MIME type to the file suffix used when saving it.
_CONTENT_TYPE_SUFFIX = {
    "image/jpeg": ".jpg",
    "image/jpg": ".jpg",
    "image/png": ".png",
    "image/webp": ".webp",
    "image/avif": ".avif",
    "image/gif": ".gif",
}
def _safe_name(text: str, fallback: str = "image") -> str:
@@ -55,13 +80,56 @@ class AutoImagePipelineService:
def __init__(self) -> None:
    """Create the service and instantiate its ``CustomerDatabase`` handle."""
    # CustomerDatabase is defined outside this view; it is constructed with
    # no arguments and kept on the instance for later use.
    self.customer_db = CustomerDatabase()
@staticmethod
def _resolve_download_path(dest_path: Path, content_type: str, image_url: str) -> Path:
    """Return ``dest_path`` with a suffix that matches the downloaded image.

    The suffix is chosen in this order:

    1. the response ``Content-Type`` (parameters such as ``; charset=...``
       are ignored), looked up in ``_CONTENT_TYPE_SUFFIX``;
    2. the MIME type guessed from the URL, looked up in the same table;
    3. the suffix ``dest_path`` already has;
    4. ``".bin"``.

    Args:
        dest_path: Intended destination; only its suffix is replaced.
        content_type: Raw ``Content-Type`` header value (may be empty/None).
        image_url: URL the image was fetched from (may be empty/None).

    Returns:
        ``dest_path`` with the resolved suffix.
    """
    # Function-scope import so this block does not depend on a file-level
    # import that may not exist.
    from urllib.parse import urlsplit

    normalized_type = str(content_type or "").split(";", 1)[0].strip().lower()
    suffix = _CONTENT_TYPE_SUFFIX.get(normalized_type, "")
    if not suffix:
        raw_url = str(image_url or "")
        try:
            # Guess from the path component only: a "?query", "#fragment" or a
            # scheme-less "//host/..." URL would otherwise hide the extension,
            # depending on how the running Python's mimetypes parses URLs.
            url_path = urlsplit(raw_url).path
        except ValueError:
            # Malformed URL (e.g. a broken IPv6 literal): use the raw string.
            url_path = raw_url
        guessed, _ = mimetypes.guess_type(url_path)
        if not guessed:
            # Keep the original behaviour as a fallback (e.g. "data:" URLs,
            # which mimetypes only recognises with the scheme intact).
            guessed, _ = mimetypes.guess_type(raw_url)
        suffix = _CONTENT_TYPE_SUFFIX.get(str(guessed or "").lower(), "")
    return dest_path.with_suffix(suffix or dest_path.suffix or ".bin")
async def _download_image(self, image_url: str, dest_path: Path) -> Path:
    """Download ``image_url`` and write it to disk, retrying per Referer.

    Every entry of ``_DOWNLOAD_REFERERS`` is tried in order, up to three
    attempts each, sending ``_DOWNLOAD_HEADERS`` plus that Referer. Between
    failed attempts for the same Referer the coroutine sleeps ``attempt``
    seconds (1s, then 2s). The file suffix is derived from the response via
    ``_resolve_download_path``, so the returned path may differ from
    ``dest_path``.

    Args:
        image_url: URL of the image to fetch.
        dest_path: Desired destination; its parent directory is created.

    Returns:
        The path the image bytes were actually written to.

    Raises:
        RuntimeError: If every Referer/attempt combination failed. The last
            underlying error is attached as ``__cause__``.
    """
    max_attempts = 3
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    timeout = httpx.Timeout(60.0, connect=20.0)
    last_error: Optional[Exception] = None
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        for referer in _DOWNLOAD_REFERERS:
            # Fresh dict per Referer so the shared header template is never mutated.
            headers = {**_DOWNLOAD_HEADERS, "Referer": referer}
            for attempt in range(1, max_attempts + 1):
                try:
                    response = await client.get(image_url, headers=headers)
                    if response.status_code in (403, 420, 429):
                        # Raised explicitly so the log line says "blocked"
                        # rather than the generic raise_for_status() text.
                        raise httpx.HTTPStatusError(
                            f"download blocked status={response.status_code}",
                            request=response.request,
                            response=response,
                        )
                    response.raise_for_status()
                    resolved_path = self._resolve_download_path(
                        dest_path,
                        response.headers.get("content-type", ""),
                        image_url,
                    )
                    resolved_path.write_bytes(response.content)
                except Exception as e:
                    # Deliberately broad: any network, HTTP or disk failure
                    # counts as a failed attempt and is reported once at the end.
                    last_error = e
                    logger.warning(
                        "[AutoImagePipeline] 图片下载失败 attempt=%s/3 referer=%s url=%s err=%s",
                        attempt,
                        referer,
                        image_url,
                        e,
                    )
                    if attempt < max_attempts:
                        await asyncio.sleep(attempt)
                else:
                    logger.info(
                        "[AutoImagePipeline] 图片下载成功 status=%s referer=%s path=%s",
                        response.status_code,
                        referer,
                        resolved_path,
                    )
                    return resolved_path
    raise RuntimeError(f"下载原图失败: {last_error}") from last_error
@staticmethod
def _format_transfer_notice(