diff --git a/scripts/test_alicdn_download.py b/scripts/test_alicdn_download.py
new file mode 100644
index 0000000..ffbfc44
--- /dev/null
+++ b/scripts/test_alicdn_download.py
@@ -0,0 +1,93 @@
+"""Manual smoke test: download one alicdn image while rotating the Referer.
+
+Run this file directly. The image is written to ``tmp_alicdn_download/``
+in the parent of the directory that contains this script.
+"""
+import asyncio
+from pathlib import Path
+
+import httpx
+
+
+TEST_URL = "https://img.alicdn.com/imgextra/i1/O1CN01959PmC2MK7jvMhqXF_!!4611686018427385312-0-amp.jpg"
+OUTPUT_DIR = Path(__file__).resolve().parents[1] / "tmp_alicdn_download"
+
+CONTENT_TYPE_TO_SUFFIX = {
+    "image/jpeg": ".jpg",
+    "image/jpg": ".jpg",
+    "image/png": ".png",
+    "image/webp": ".webp",
+    "image/avif": ".avif",
+}
+
+DEFAULT_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/133.0.0.0 Safari/537.36"
+    ),
+    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "Cache-Control": "no-cache",
+    "Pragma": "no-cache",
+    "Referer": "https://www.taobao.com/",
+}
+
+
+async def download_once(client: httpx.AsyncClient, url: str, referer: str = ""):
+    """Fetch *url* once and save the response body under ``OUTPUT_DIR``.
+
+    The file suffix comes from the response Content-Type (``.bin`` when the
+    type is not in ``CONTENT_TYPE_TO_SUFFIX``). A non-empty *referer* replaces
+    the default Referer for this request only.
+    """
+    # Copy the defaults so per-attempt values never leak into the module dict.
+    headers = dict(DEFAULT_HEADERS)
+    if referer:
+        headers["Referer"] = referer
+    response = await client.get(url, headers=headers)
+    print(f"HTTP {response.status_code}")
+    content_type = response.headers.get("content-type", "").split(";", 1)[0].strip().lower()
+    print(f"Content-Type: {content_type}")
+    if response.status_code != 200:
+        print(response.text[:300])
+    response.raise_for_status()
+
+    suffix = CONTENT_TYPE_TO_SUFFIX.get(content_type, ".bin")
+    output_path = OUTPUT_DIR / f"alicdn_test{suffix}"
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_bytes(response.content)
+    print(f"Saved to: {output_path}")
+    print(f"Size: {output_path.stat().st_size} bytes")
+
+
+async def main():
+    """Attempt the download once per candidate Referer until one succeeds.
+
+    Raises:
+        RuntimeError: every attempt failed; chained to the last error.
+    """
+    timeout = httpx.Timeout(60.0, connect=20.0)
+    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
+        last_error = None
+        referers = [
+            "https://www.taobao.com/",
+            "https://item.taobao.com/",
+            "https://detail.tmall.com/",
+        ]
+        for idx, referer in enumerate(referers, 1):
+            try:
+                print(f"Attempt {idx} with Referer={referer}")
+                await download_once(client, TEST_URL, referer=referer)
+                print("Download success")
+                return
+            except Exception as e:
+                last_error = e
+                print(f"Attempt {idx} failed: {type(e).__name__}: {e}")
+                await asyncio.sleep(1)
+
+    raise RuntimeError(f"All attempts failed: {last_error}") from last_error
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/services/service_auto_image_pipeline.py b/services/service_auto_image_pipeline.py
index c6895ea..47a31f7 100644
--- a/services/service_auto_image_pipeline.py
+++ b/services/service_auto_image_pipeline.py
@@ -2,6 +2,7 @@ import asyncio
 import hashlib
 import json
 import logging
+import mimetypes
 import os
 import re
 from pathlib import Path
@@ -25,6 +26,33 @@ AUTO_PROCESS_CATEGORY = os.getenv("AUTO_PROCESS_CATEGORY", "设计素材")
 AUTO_PROCESS_ROOT = Path(
     os.getenv("AUTO_PROCESS_ROOT", str(Path(__file__).resolve().parents[1] / "runtime" / "auto_processed"))
 )
+# Referer values tried in order by _download_image.
+_DOWNLOAD_REFERERS = (
+    "https://www.taobao.com/",
+    "https://item.taobao.com/",
+    "https://detail.tmall.com/",
+)
+# Browser-like request headers; the Referer is added per attempt.
+_DOWNLOAD_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/133.0.0.0 Safari/537.36"
+    ),
+    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "Cache-Control": "no-cache",
+    "Pragma": "no-cache",
+}
+# Maps a response Content-Type to the file suffix used when saving.
+_CONTENT_TYPE_SUFFIX = {
+    "image/jpeg": ".jpg",
+    "image/jpg": ".jpg",
+    "image/png": ".png",
+    "image/webp": ".webp",
+    "image/avif": ".avif",
+    "image/gif": ".gif",
+}
 
 
 def _safe_name(text: str, fallback: str = "image") -> str:
@@ -55,13 +83,69 @@ class AutoImagePipelineService:
     def __init__(self):
         self.customer_db = CustomerDatabase()
 
+    @staticmethod
+    def _resolve_download_path(dest_path: Path, content_type: str, image_url: str) -> Path:
+        """Return *dest_path* with a suffix that matches the downloaded image.
+
+        The suffix is taken from *content_type* first, then guessed from
+        *image_url*, then the existing suffix of *dest_path*, else ``.bin``.
+        """
+        normalized_type = str(content_type or "").split(";", 1)[0].strip().lower()
+        suffix = _CONTENT_TYPE_SUFFIX.get(normalized_type, "")
+        if not suffix:
+            guessed, _ = mimetypes.guess_type(str(image_url or ""))
+            suffix = _CONTENT_TYPE_SUFFIX.get(str(guessed or "").lower(), "")
+        suffix = suffix or dest_path.suffix or ".bin"
+        return dest_path.with_suffix(suffix)
+
     async def _download_image(self, image_url: str, dest_path: Path) -> Path:
+        """Download *image_url* and return the path the bytes were written to.
+
+        Each Referer in ``_DOWNLOAD_REFERERS`` is tried up to three times. The
+        returned path may differ from *dest_path* in its suffix.
+
+        Raises:
+            RuntimeError: all attempts failed; chained to the last error.
+        """
         dest_path.parent.mkdir(parents=True, exist_ok=True)
-        async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
-            response = await client.get(image_url)
-            response.raise_for_status()
-            dest_path.write_bytes(response.content)
-        return dest_path
+        timeout = httpx.Timeout(60.0, connect=20.0)
+        last_error: Optional[Exception] = None
+
+        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
+            for referer in _DOWNLOAD_REFERERS:
+                # Headers only depend on the referer, so build them once per referer.
+                headers = {**_DOWNLOAD_HEADERS, "Referer": referer}
+                for attempt in range(1, 4):
+                    try:
+                        response = await client.get(image_url, headers=headers)
+                        if response.status_code in (403, 420, 429):
+                            raise httpx.HTTPStatusError(
+                                f"download blocked status={response.status_code}",
+                                request=response.request,
+                                response=response,
+                            )
+                        response.raise_for_status()
+                        resolved_path = self._resolve_download_path(
+                            dest_path,
+                            response.headers.get("content-type", ""),
+                            image_url,
+                        )
+                        resolved_path.write_bytes(response.content)
+                        logger.info(
+                            f"[AutoImagePipeline] 图片下载成功 status={response.status_code} "
+                            f"referer={referer} path={resolved_path}"
+                        )
+                        return resolved_path
+                    except Exception as e:
+                        last_error = e
+                        logger.warning(
+                            f"[AutoImagePipeline] 图片下载失败 attempt={attempt}/3 "
+                            f"referer={referer} url={image_url} err={e}"
+                        )
+                        if attempt < 3:
+                            await asyncio.sleep(attempt)
+
+        raise RuntimeError(f"下载原图失败: {last_error}") from last_error
 
     @staticmethod
     def _format_transfer_notice(