fix: harden alicdn image downloads

This commit is contained in:
2026-03-09 10:51:12 +08:00
parent 2ab27eb914
commit bcd162ef22
2 changed files with 147 additions and 5 deletions

View File

@@ -0,0 +1,74 @@
import asyncio
from pathlib import Path
import httpx
# Sample alicdn image used to exercise the download path.
TEST_URL = "https://img.alicdn.com/imgextra/i1/O1CN01959PmC2MK7jvMhqXF_!!4611686018427385312-0-amp.jpg"

# Downloads land in a scratch folder one level above this script's directory.
OUTPUT_DIR = Path(__file__).resolve().parents[1] / "tmp_alicdn_download"

# Maps a normalized response Content-Type to the file extension to save under.
CONTENT_TYPE_TO_SUFFIX = {
    "image/jpeg": ".jpg",
    "image/jpg": ".jpg",
    "image/png": ".png",
    "image/webp": ".webp",
    "image/avif": ".avif",
}

# Browser-like request headers sent with every download attempt.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/133.0.0.0 Safari/537.36"
    ),
    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "Referer": "https://www.taobao.com/",
}
async def download_once(client: httpx.AsyncClient, url: str, headers: "dict[str, str] | None" = None):
    """Fetch ``url`` once and save the body under ``OUTPUT_DIR``.

    Prints the HTTP status and normalized Content-Type, and for any
    non-200 response also prints the first 300 characters of the body
    to help diagnose blocks. The file is named ``alicdn_test<suffix>``
    where the suffix comes from ``CONTENT_TYPE_TO_SUFFIX`` (``.bin`` for
    unknown types).

    Args:
        client: Open httpx async client used to issue the request.
        url: Image URL to download.
        headers: Request headers to send. Defaults to ``DEFAULT_HEADERS``
            so existing two-argument callers behave as before.

    Raises:
        httpx.HTTPStatusError: If the response status is 4xx/5xx.
    """
    request_headers = DEFAULT_HEADERS if headers is None else headers
    response = await client.get(url, headers=request_headers)
    print(f"HTTP {response.status_code}")

    # Drop any "; charset=..." parameter so the type can be used as a dict key.
    content_type = response.headers.get("content-type", "").split(";", 1)[0].strip().lower()
    print(f"Content-Type: {content_type}")

    if response.status_code != 200:
        print(response.text[:300])
    response.raise_for_status()

    suffix = CONTENT_TYPE_TO_SUFFIX.get(content_type, ".bin")
    output_path = OUTPUT_DIR / f"alicdn_test{suffix}"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_bytes(response.content)
    print(f"Saved to: {output_path}")
    print(f"Size: {output_path.stat().st_size} bytes")


async def main():
    """Try downloading ``TEST_URL`` with each candidate Referer in turn.

    Stops at the first successful download. Between failed attempts it
    waits one second; after the last candidate fails it raises.

    Raises:
        RuntimeError: If every Referer fails; chained to the last error.
    """
    timeout = httpx.Timeout(60.0, connect=20.0)
    referers = [
        "https://www.taobao.com/",
        "https://item.taobao.com/",
        "https://detail.tmall.com/",
    ]
    last_error = None

    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        for idx, referer in enumerate(referers, 1):
            # Build a per-attempt copy instead of mutating the shared
            # module-level DEFAULT_HEADERS dict.
            headers = {**DEFAULT_HEADERS, "Referer": referer}
            print(f"Attempt {idx} with Referer={referer}")
            try:
                await download_once(client, TEST_URL, headers=headers)
            except Exception as e:
                last_error = e
                print(f"Attempt {idx} failed: {type(e).__name__}: {e}")
                # No point waiting when there is no further attempt.
                if idx < len(referers):
                    await asyncio.sleep(1)
            else:
                print("Download success")
                return

    raise RuntimeError(f"All attempts failed: {last_error}") from last_error
# Run the download check only when executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -2,6 +2,7 @@ import asyncio
import hashlib import hashlib
import json import json
import logging import logging
import mimetypes
import os import os
import re import re
from pathlib import Path from pathlib import Path
@@ -25,6 +26,30 @@ AUTO_PROCESS_CATEGORY = os.getenv("AUTO_PROCESS_CATEGORY", "设计素材")
# Root directory for auto-processed output. The AUTO_PROCESS_ROOT environment
# variable overrides the default of <parent of this file's dir>/runtime/auto_processed.
AUTO_PROCESS_ROOT = Path(
    os.getenv(
        "AUTO_PROCESS_ROOT",
        str(Path(__file__).resolve().parents[1] / "runtime" / "auto_processed"),
    )
)
# Referer values tried in order when downloading an image.
_DOWNLOAD_REFERERS = (
    "https://www.taobao.com/",
    "https://item.taobao.com/",
    "https://detail.tmall.com/",
)

# Browser-like base headers for image downloads; the Referer is added per request.
_DOWNLOAD_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/133.0.0.0 Safari/537.36"
    ),
    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
}

# Maps a normalized image MIME type to the file extension used on disk.
_CONTENT_TYPE_SUFFIX = {
    "image/jpeg": ".jpg",
    "image/jpg": ".jpg",
    "image/png": ".png",
    "image/webp": ".webp",
    "image/avif": ".avif",
    "image/gif": ".gif",
}
def _safe_name(text: str, fallback: str = "image") -> str: def _safe_name(text: str, fallback: str = "image") -> str:
@@ -55,13 +80,56 @@ class AutoImagePipelineService:
def __init__(self) -> None:
    """Create the CustomerDatabase handle and store it on ``self.customer_db``."""
    self.customer_db = CustomerDatabase()
@staticmethod
def _resolve_download_path(dest_path: Path, content_type: str, image_url: str) -> Path:
    """Return ``dest_path`` with the extension that matches the downloaded image.

    The extension is taken from the first source that yields one:

    1. the response Content-Type (parameters such as ``; charset=`` ignored),
    2. the MIME type guessed from ``image_url``,
    3. the suffix already present on ``dest_path``,
    4. ``".bin"`` as a last resort.

    Only MIME types listed in ``_CONTENT_TYPE_SUFFIX`` are recognized in
    steps 1 and 2.

    Args:
        dest_path: Intended destination path; its suffix may be replaced.
        content_type: Raw Content-Type header value (may be empty or None).
        image_url: URL the image was fetched from (may be empty or None).

    Returns:
        ``dest_path`` with its suffix replaced by the resolved extension.
    """
    # Function-scope import so the block does not depend on a file-level change.
    from urllib.parse import urlsplit

    normalized_type = str(content_type or "").split(";", 1)[0].strip().lower()
    suffix = _CONTENT_TYPE_SUFFIX.get(normalized_type, "")
    if not suffix:
        raw_url = str(image_url or "")
        guessed, _ = mimetypes.guess_type(raw_url)
        if not guessed:
            # Depending on the Python version, guess_type may treat a trailing
            # "?query" or "#fragment" as part of the extension and find nothing.
            # Retry on the bare URL path so "a.jpg?x=1" still resolves.
            try:
                url_path = urlsplit(raw_url).path
            except ValueError:
                # Malformed URL; keep the "nothing guessed" result.
                url_path = ""
            if url_path:
                guessed, _ = mimetypes.guess_type(url_path)
        suffix = _CONTENT_TYPE_SUFFIX.get(str(guessed or "").lower(), "")
    suffix = suffix or dest_path.suffix or ".bin"
    return dest_path.with_suffix(suffix)
async def _download_image(self, image_url: str, dest_path: Path) -> Path:
    """Download ``image_url`` to disk, rotating Referer headers and retrying.

    Each Referer in ``_DOWNLOAD_REFERERS`` is tried up to three times.
    Any exception during an attempt (HTTP error status, transport error,
    or a failure while writing the file) is logged and retried; the wait
    before the next attempt with the same Referer grows with the attempt
    number (1s, then 2s). The first successful response is written to
    the path returned by ``_resolve_download_path``, which may differ
    from ``dest_path`` in its extension.

    Args:
        image_url: URL of the image to fetch.
        dest_path: Desired destination; parent directories are created.

    Returns:
        The path the image was actually written to.

    Raises:
        RuntimeError: If every attempt fails; chained to the last error.
    """
    max_attempts = 3
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    timeout = httpx.Timeout(60.0, connect=20.0)
    last_error: Optional[Exception] = None

    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        for referer in _DOWNLOAD_REFERERS:
            # Copy the shared base headers so the module-level dict is never mutated.
            headers = {**_DOWNLOAD_HEADERS, "Referer": referer}
            for attempt in range(1, max_attempts + 1):
                try:
                    response = await client.get(image_url, headers=headers)
                    # Surface the typical "blocked" statuses with an explicit message.
                    if response.status_code in (403, 420, 429):
                        raise httpx.HTTPStatusError(
                            f"download blocked status={response.status_code}",
                            request=response.request,
                            response=response,
                        )
                    response.raise_for_status()
                    resolved_path = self._resolve_download_path(
                        dest_path,
                        response.headers.get("content-type", ""),
                        image_url,
                    )
                    resolved_path.write_bytes(response.content)
                    logger.info(
                        f"[AutoImagePipeline] 图片下载成功 status={response.status_code} "
                        f"referer={referer} path={resolved_path}"
                    )
                    return resolved_path
                except Exception as e:
                    last_error = e
                    logger.warning(
                        f"[AutoImagePipeline] 图片下载失败 attempt={attempt}/{max_attempts} "
                        f"referer={referer} url={image_url} err={e}"
                    )
                    # Back off before retrying the same Referer; move straight
                    # on to the next Referer after the final attempt.
                    if attempt < max_attempts:
                        await asyncio.sleep(attempt)

    # Chain the last failure so the original traceback is preserved for callers.
    raise RuntimeError(f"下载原图失败: {last_error}") from last_error
@staticmethod @staticmethod
def _format_transfer_notice( def _format_transfer_notice(