Files
smart-support/backend/app/openapi/fetcher.py
Yaojia Wang a54eb224e0 feat: complete phase 3 -- OpenAPI auto-discovery, SSRF protection, tool generation
- SSRF protection: private IP blocking, DNS rebinding defense, redirect validation
- OpenAPI fetcher with SSRF guard, JSON/YAML auto-detection, 10MB limit
- Structural spec validator (3.0.x/3.1.x)
- Endpoint parser with $ref resolution, auto-generated operation IDs
- Heuristic + LLM endpoint classifier with Protocol interface
- Review API at /api/openapi (import, job status, classification CRUD, approve)
- @tool code generator + Agent YAML generator
- Import orchestrator (fetch -> validate -> parse -> classify pipeline)
- 125 new tests, 322 total passing, 93.23% coverage
2026-03-31 00:10:44 +02:00

94 lines
3.0 KiB
Python

"""OpenAPI spec fetcher with SSRF protection.
Fetches OpenAPI spec documents from remote URLs, validates them against
SSRF policy, and parses JSON or YAML format automatically.
"""
from __future__ import annotations
import json
import yaml
from app.openapi.ssrf import DEFAULT_POLICY, SSRFPolicy
# Upper bound on a fetched spec body; fetch_spec compares the UTF-8
# encoded length of the response text against this value.
_MAX_SIZE_BYTES = 10 * 1024 * 1024 # 10MB
async def fetch_spec(url: str, policy: SSRFPolicy = DEFAULT_POLICY) -> dict:
    """Download an OpenAPI document and return it as a parsed dict.

    The request is made through ``safe_fetch`` with the supplied SSRF
    policy. The response text is size-checked and then parsed as JSON or
    YAML; the format is chosen from the content-type header or the URL
    extension, with a try-JSON-then-YAML fallback when neither gives a hint.

    Args:
        url: Location of the spec document.
        policy: SSRF policy passed through to ``safe_fetch``.

    Returns:
        The parsed top-level mapping of the spec.

    Raises:
        SSRFError: If the URL is blocked by SSRF policy.
        ValueError: If the response is too large or cannot be parsed.
    """
    # Function-scope import, kept local to this function.
    from app.openapi.ssrf import safe_fetch

    resp = await safe_fetch(url, policy=policy)
    # Propagates whatever error the response object raises for a bad status.
    resp.raise_for_status()

    body = resp.text
    # The limit is applied to the UTF-8 encoded size of the decoded text.
    # NOTE(review): this check runs on an already-received body; whether
    # safe_fetch caps the download itself is not visible here.
    body_size = len(body.encode("utf-8"))
    if body_size > _MAX_SIZE_BYTES:
        raise ValueError(
            f"Response too large: {body_size} bytes "
            f"(max {_MAX_SIZE_BYTES} bytes)"
        )

    declared_type = resp.headers.get("content-type", "")
    return _parse_content(body, declared_type, url)
def _parse_content(content: str, content_type: str, url: str) -> dict:
    """Decode *content* as YAML or JSON, choosing the parser from hints.

    A YAML hint (content-type or URL extension) takes precedence over a
    JSON hint. When neither hint is present, JSON is attempted first and
    YAML is used if JSON parsing raises ``ValueError``.

    Raises:
        ValueError: If the selected parser (or both, in the fallback path)
            rejects the content.
    """
    if _is_yaml_format(content_type, url):
        return _parse_yaml(content)

    if not _is_json_format(content_type, url):
        # No usable hint: probe JSON first, then fall back to YAML.
        try:
            return _parse_json(content)
        except ValueError:
            return _parse_yaml(content)

    return _parse_json(content)
def _is_yaml_format(content_type: str, url: str) -> bool:
"""Check if the content is YAML format."""
yaml_types = {"application/x-yaml", "text/yaml", "application/yaml"}
if any(t in content_type for t in yaml_types):
return True
lower_url = url.lower().split("?")[0]
return lower_url.endswith(".yaml") or lower_url.endswith(".yml")
def _is_json_format(content_type: str, url: str) -> bool:
"""Check if the content is JSON format."""
if "application/json" in content_type:
return True
lower_url = url.lower().split("?")[0]
return lower_url.endswith(".json")
def _parse_json(content: str) -> dict:
"""Parse content as JSON, raising ValueError on failure."""
try:
result = json.loads(content)
except json.JSONDecodeError as exc:
raise ValueError(f"Invalid JSON: {exc}") from exc
if not isinstance(result, dict):
raise ValueError(f"Expected a JSON object, got {type(result).__name__}")
return result
def _parse_yaml(content: str) -> dict:
    """Decode *content* as YAML and require a top-level mapping.

    Uses ``yaml.safe_load``, so only plain YAML types are constructed.

    Returns:
        The decoded YAML mapping as a dict.

    Raises:
        ValueError: If the text is not valid YAML, or if the top-level
            value is anything other than a mapping.
    """
    try:
        parsed = yaml.safe_load(content)
    except yaml.YAMLError as err:
        raise ValueError(f"Invalid YAML: {err}") from err

    if isinstance(parsed, dict):
        return parsed

    # Sequences, scalars and empty documents are not a usable spec root.
    raise ValueError(f"Expected a YAML mapping, got {type(parsed).__name__}")