fix: harden alicdn image downloads
This commit is contained in:
74
scripts/test_alicdn_download.py
Normal file
74
scripts/test_alicdn_download.py
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
import asyncio
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
|
||||||
|
# Sample alicdn image fetched by this script.
TEST_URL = "https://img.alicdn.com/imgextra/i1/O1CN01959PmC2MK7jvMhqXF_!!4611686018427385312-0-amp.jpg"
# Downloads are written to "tmp_alicdn_download" next to this script's parent directory.
OUTPUT_DIR = Path(__file__).resolve().parents[1] / "tmp_alicdn_download"

# Maps a normalized Content-Type (lower-case, parameters stripped) to the
# file suffix used for the saved image. Unknown types fall back to ".bin"
# at the lookup site.
CONTENT_TYPE_TO_SUFFIX = {
    "image/jpeg": ".jpg",
    "image/jpg": ".jpg",
    "image/png": ".png",
    "image/webp": ".webp",
    "image/avif": ".avif",
    # GIF is handled by the pipeline service's suffix table as well; without
    # this entry a GIF response would be saved as ".bin".
    "image/gif": ".gif",
}

# Browser-like request headers sent with every download.
# NOTE: main() overwrites the "Referer" entry in place before each attempt.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/133.0.0.0 Safari/537.36"
    ),
    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "Referer": "https://www.taobao.com/",
}
|
||||||
|
|
||||||
|
|
||||||
|
async def download_once(client: httpx.AsyncClient, url: str):
    """Fetch ``url`` once with DEFAULT_HEADERS and save the body under OUTPUT_DIR.

    Prints the HTTP status and the normalized Content-Type, plus the first
    300 characters of the body when the status is not 200. The file is named
    ``alicdn_test<suffix>``, where the suffix comes from
    CONTENT_TYPE_TO_SUFFIX (".bin" for unknown types).

    Raises whatever ``response.raise_for_status()`` raises for an
    unsuccessful status, before anything is written to disk.
    """
    resp = await client.get(url, headers=DEFAULT_HEADERS)
    status = resp.status_code
    print(f"HTTP {status}")

    # Keep only the media type: drop parameters such as "; charset=...".
    media_type, _, _ = resp.headers.get("content-type", "").partition(";")
    media_type = media_type.strip().lower()
    print(f"Content-Type: {media_type}")

    if status != 200:
        # Show the start of the error body to help diagnose blocked requests.
        print(resp.text[:300])
    resp.raise_for_status()

    extension = CONTENT_TYPE_TO_SUFFIX.get(media_type, ".bin")
    target = OUTPUT_DIR / f"alicdn_test{extension}"
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(resp.content)

    print(f"Saved to: {target}")
    print(f"Size: {target.stat().st_size} bytes")
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Try to download TEST_URL, rotating through several Referer values.

    Each referer gets one attempt. The function returns after the first
    successful download.

    Raises:
        RuntimeError: when every attempt fails; chained to the last
            underlying error so its traceback is preserved.
    """
    timeout = httpx.Timeout(60.0, connect=20.0)
    referers = [
        "https://www.taobao.com/",
        "https://item.taobao.com/",
        "https://detail.tmall.com/",
    ]
    last_error = None

    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        for idx, referer in enumerate(referers, 1):
            # download_once() sends DEFAULT_HEADERS as-is, so the Referer
            # for this attempt has to be switched on the shared dict.
            DEFAULT_HEADERS["Referer"] = referer
            print(f"Attempt {idx} with Referer={referer}")
            try:
                await download_once(client, TEST_URL)
            except Exception as e:  # script boundary: report and try the next referer
                last_error = e
                print(f"Attempt {idx} failed: {type(e).__name__}: {e}")
                # Pause between attempts only; waiting after the final
                # failure would just delay the error.
                if idx < len(referers):
                    await asyncio.sleep(1)
            else:
                print("Download success")
                return

    raise RuntimeError(f"All attempts failed: {last_error}") from last_error


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
@@ -2,6 +2,7 @@ import asyncio
|
|||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -25,6 +26,30 @@ AUTO_PROCESS_CATEGORY = os.getenv("AUTO_PROCESS_CATEGORY", "设计素材")
|
|||||||
AUTO_PROCESS_ROOT = Path(
|
AUTO_PROCESS_ROOT = Path(
|
||||||
os.getenv("AUTO_PROCESS_ROOT", str(Path(__file__).resolve().parents[1] / "runtime" / "auto_processed"))
|
os.getenv("AUTO_PROCESS_ROOT", str(Path(__file__).resolve().parents[1] / "runtime" / "auto_processed"))
|
||||||
)
|
)
|
||||||
|
# Referer values tried, in this order, when downloading an image.
_DOWNLOAD_REFERERS = (
    "https://www.taobao.com/",
    "https://item.taobao.com/",
    "https://detail.tmall.com/",
)

# Desktop-Chrome User-Agent string sent with every download request.
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/133.0.0.0 Safari/537.36"
)

# Base request headers; the Referer is added per request by the downloader.
_DOWNLOAD_HEADERS = {
    "User-Agent": _BROWSER_USER_AGENT,
    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
}

# Normalized Content-Type -> file suffix, grouped by suffix for readability.
_CONTENT_TYPE_SUFFIX = {
    media_type: extension
    for extension, media_types in (
        (".jpg", ("image/jpeg", "image/jpg")),
        (".png", ("image/png",)),
        (".webp", ("image/webp",)),
        (".avif", ("image/avif",)),
        (".gif", ("image/gif",)),
    )
    for media_type in media_types
}
|
||||||
|
|
||||||
|
|
||||||
def _safe_name(text: str, fallback: str = "image") -> str:
|
def _safe_name(text: str, fallback: str = "image") -> str:
|
||||||
@@ -55,13 +80,56 @@ class AutoImagePipelineService:
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.customer_db = CustomerDatabase()
|
self.customer_db = CustomerDatabase()
|
||||||
|
|
||||||
|
@staticmethod
def _resolve_download_path(dest_path: Path, content_type: str, image_url: str) -> Path:
    """Return ``dest_path`` with a suffix matching the downloaded image type.

    The suffix is chosen in this order:
      1. the response Content-Type (parameters stripped, lower-cased),
         looked up in ``_CONTENT_TYPE_SUFFIX``;
      2. the type guessed from the path component of ``image_url``;
      3. the suffix ``dest_path`` already has;
      4. ``".bin"``.

    Args:
        dest_path: Path the caller intended to write to.
        content_type: Raw Content-Type header value; may be empty or None.
        image_url: URL the image was fetched from; may be empty or None.

    Returns:
        ``dest_path`` with its suffix replaced by the resolved one.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    normalized_type = str(content_type or "").split(";", 1)[0].strip().lower()
    suffix = _CONTENT_TYPE_SUFFIX.get(normalized_type, "")
    if not suffix:
        # Guess from the URL path only, so a query string or fragment
        # ("...jpg?x=1") cannot hide the extension regardless of how
        # mimetypes parses URLs.
        try:
            url_path = urlsplit(str(image_url or "")).path
        except ValueError:
            # Malformed URL: skip the URL-based guess.
            url_path = ""
        guessed, _ = mimetypes.guess_type(url_path)
        suffix = _CONTENT_TYPE_SUFFIX.get(str(guessed or "").lower(), "")
    return dest_path.with_suffix(suffix or dest_path.suffix or ".bin")
|
||||||
|
|
||||||
async def _download_image(self, image_url: str, dest_path: Path) -> Path:
    """Download ``image_url`` to disk, rotating Referer headers and retrying.

    Every referer in ``_DOWNLOAD_REFERERS`` is tried up to three times, with
    a sleep of ``attempt`` seconds between attempts for the same referer.
    The file suffix is derived from the response via
    ``_resolve_download_path``, so the returned path may differ from
    ``dest_path``.

    Args:
        image_url: URL of the image to fetch.
        dest_path: Intended output path; its parent directory is created.

    Returns:
        The path the image was actually written to.

    Raises:
        RuntimeError: when all referers and attempts fail; chained to the
            last underlying error.
    """
    max_attempts = 3
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    timeout = httpx.Timeout(60.0, connect=20.0)
    last_error: Optional[Exception] = None

    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        for referer in _DOWNLOAD_REFERERS:
            # Headers only depend on the referer, so build them once per referer.
            headers = dict(_DOWNLOAD_HEADERS)
            headers["Referer"] = referer
            for attempt in range(1, max_attempts + 1):
                try:
                    response = await client.get(image_url, headers=headers)
                    if response.status_code in (403, 420, 429):
                        # Report these statuses with an explicit "blocked" message.
                        raise httpx.HTTPStatusError(
                            f"download blocked status={response.status_code}",
                            request=response.request,
                            response=response,
                        )
                    response.raise_for_status()
                    if not response.content:
                        # A successful status with no bytes is not a usable
                        # image; treat it as a failed attempt so it is retried.
                        raise ValueError("empty response body")
                    resolved_path = self._resolve_download_path(
                        dest_path,
                        response.headers.get("content-type", ""),
                        image_url,
                    )
                    resolved_path.write_bytes(response.content)
                    logger.info(
                        f"[AutoImagePipeline] 图片下载成功 status={response.status_code} "
                        f"referer={referer} path={resolved_path}"
                    )
                    return resolved_path
                except Exception as e:
                    # Any failure (network, HTTP status, disk write) counts
                    # as a failed attempt and is retried.
                    last_error = e
                    logger.warning(
                        f"[AutoImagePipeline] 图片下载失败 attempt={attempt}/{max_attempts} "
                        f"referer={referer} url={image_url} err={e}"
                    )
                    if attempt < max_attempts:
                        # Linear back-off between attempts on the same referer.
                        await asyncio.sleep(attempt)

    raise RuntimeError(f"下载原图失败: {last_error}") from last_error
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _format_transfer_notice(
|
def _format_transfer_notice(
|
||||||
|
|||||||
Reference in New Issue
Block a user