# xiaohongshu-mcp/skills/post-to-xhs/scripts/image_downloader.py

"""
Image downloader for Xiaohongshu publishing.

Downloads images from URLs to a local temp directory for upload,
and cleans up after publishing is complete.
"""

import os
import sys
import tempfile
import shutil
import uuid
from urllib.parse import urlparse, unquote

import requests

# Timeout value (seconds) passed to requests.get() for every download.
DEFAULT_TIMEOUT = 30  # seconds per download
# Prefix handed to tempfile.mkdtemp() when the downloader creates its own directory.
TEMP_DIR_PREFIX = "xhs_images_"


class ImageDownloader:
    """Download images from URLs and manage a temporary directory for them."""

    def __init__(self, temp_dir: str | None = None):
        if temp_dir:
            self.temp_dir = temp_dir
            os.makedirs(self.temp_dir, exist_ok=True)
            self._owns_dir = False
        else:
            self.temp_dir = tempfile.mkdtemp(prefix=TEMP_DIR_PREFIX)
            self._owns_dir = True
        self.downloaded_files: list[str] = []

    def _guess_extension(self, url: str, content_type: str | None) -> str:
        """Guess file extension from URL path or Content-Type header."""
        # Try URL path first
        path = urlparse(url).path
        _, ext = os.path.splitext(unquote(path))
        if ext and ext.lower() in (".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"):
            return ext.lower()

        # Fall back to Content-Type
        ct_map = {
            "image/jpeg": ".jpg",
            "image/png": ".png",
            "image/gif": ".gif",
            "image/webp": ".webp",
            "image/bmp": ".bmp",
        }
        if content_type:
            for mime, ext in ct_map.items():
                if mime in content_type:
                    return ext

        return ".jpg"  # safe default

    def download(self, url: str, referer: str | None = None) -> str:
        """
        Download a single image and return the local file path.

        Args:
            url: Image URL to download
            referer: Optional Referer header. If None, auto-generates from URL domain.

        Raises requests.RequestException on network errors.
        """
        # Build headers with Referer to bypass hotlink protection
        parsed = urlparse(url)
        if referer is None:
            referer = f"{parsed.scheme}://{parsed.netloc}/"

        headers = {
            "Referer": referer,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        }

        resp = requests.get(url, timeout=DEFAULT_TIMEOUT, stream=True, headers=headers)
        resp.raise_for_status()

        ext = self._guess_extension(url, resp.headers.get("Content-Type"))
        filename = f"{uuid.uuid4().hex[:12]}{ext}"
        filepath = os.path.join(self.temp_dir, filename)

        with open(filepath, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)

        self.downloaded_files.append(filepath)
        print(f"[image_downloader] Downloaded: {url}")
        print(f"  -> {filepath} ({os.path.getsize(filepath)} bytes)")
        return filepath

    def download_all(self, urls: list[str]) -> list[str]:
        """
        Download multiple images. Returns list of local file paths.

        Skips URLs that fail to download (logs the error, continues).
        """
        paths = []
        for url in urls:
            try:
                path = self.download(url)
                paths.append(path)
            except Exception as e:
                print(f"[image_downloader] Failed to download {url}: {e}", file=sys.stderr)
        return paths

    def cleanup(self):
        """Remove all downloaded files and the temp directory."""
        if self._owns_dir and os.path.isdir(self.temp_dir):
            shutil.rmtree(self.temp_dir, ignore_errors=True)
            print(f"[image_downloader] Cleaned up temp dir: {self.temp_dir}")
        else:
            for f in self.downloaded_files:
                try:
                    os.remove(f)
                except OSError:
                    pass
            print(f"[image_downloader] Cleaned up {len(self.downloaded_files)} files.")
        self.downloaded_files.clear()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.cleanup()


if __name__ == "__main__":
    # Manual smoke test: every command-line argument is treated as an image URL.
    cli_urls = sys.argv[1:]
    if not cli_urls:
        print("Usage: python image_downloader.py <url1> [url2] ...")
        sys.exit(1)

    # No cleanup() here on purpose, so the results can be inspected afterwards.
    downloader = ImageDownloader()
    saved_paths = downloader.download_all(cli_urls)

    print(f"\nDownloaded {len(saved_paths)} image(s):")
    for local_path in saved_paths:
        print(f"  {local_path}")
    print(f"Temp dir: {downloader.temp_dir}")
    print("Files will remain until manually cleaned up.")