feat: complete phase 3 -- OpenAPI auto-discovery, SSRF protection, tool generation

- SSRF protection: private IP blocking, DNS rebinding defense, redirect validation
- OpenAPI fetcher with SSRF guard, JSON/YAML auto-detection, 10MB limit
- Structural spec validator (3.0.x/3.1.x)
- Endpoint parser with $ref resolution, auto-generated operation IDs
- Heuristic + LLM endpoint classifier with Protocol interface
- Review API at /api/openapi (import, job status, classification CRUD, approve)
- @tool code generator + Agent YAML generator
- Import orchestrator (fetch -> validate -> parse -> classify pipeline)
- 125 new tests, 322 total passing, 93.23% coverage
This commit is contained in:
Yaojia Wang
2026-03-31 00:10:44 +02:00
committed by yaoji
parent 006b4ee5d7
commit a54eb224e0
25 changed files with 2914 additions and 35 deletions

View File

@@ -0,0 +1,93 @@
"""OpenAPI spec fetcher with SSRF protection.
Fetches OpenAPI spec documents from remote URLs, validates them against
SSRF policy, and parses JSON or YAML format automatically.
"""
from __future__ import annotations
import json
import yaml
from app.openapi.ssrf import DEFAULT_POLICY, SSRFPolicy
# Upper bound on an accepted spec body. fetch_spec compares it against the
# byte length of the decoded response text re-encoded as UTF-8.
_MAX_SIZE_BYTES = 10 * 1024 * 1024 # 10MB
async def fetch_spec(url: str, policy: SSRFPolicy = DEFAULT_POLICY) -> dict:
    """Download an OpenAPI document from ``url`` and return it as a dict.

    The request is issued through ``safe_fetch`` with the supplied SSRF
    ``policy``. The body is then size-checked and parsed as JSON or YAML,
    with the format chosen from the content-type header or the URL extension.

    Args:
        url: Location of the spec document.
        policy: SSRF policy forwarded to ``safe_fetch``.

    Returns:
        The parsed spec document.

    Raises:
        SSRFError: If the URL is blocked by SSRF policy.
        ValueError: If the response is too large or cannot be parsed.

    A non-success HTTP status surfaces as whatever ``raise_for_status()``
    raises on the response object.
    """
    # Deferred import: resolved when the function runs, not at module load.
    from app.openapi.ssrf import safe_fetch

    response = await safe_fetch(url, policy=policy)
    response.raise_for_status()

    body = response.text
    # The limit is applied to the UTF-8 size of the decoded text; the check
    # runs after the body has already been read into ``body``.
    body_size = len(body.encode("utf-8"))
    if body_size > _MAX_SIZE_BYTES:
        raise ValueError(
            f"Response too large: {body_size} bytes "
            f"(max {_MAX_SIZE_BYTES} bytes)"
        )

    return _parse_content(body, response.headers.get("content-type", ""), url)
def _parse_content(content: str, content_type: str, url: str) -> dict:
    """Decode ``content`` into a dict, picking the parser from the hints.

    YAML hints (content-type or URL extension) are checked first, then JSON
    hints. When neither matches, JSON is attempted and YAML is used as the
    fallback if JSON parsing raises ValueError.

    Raises:
        ValueError: If the selected parser (or the YAML fallback) rejects
            the content.
    """
    looks_like_yaml = _is_yaml_format(content_type, url)

    if not looks_like_yaml and not _is_json_format(content_type, url):
        # No usable hint: JSON first, then YAML.
        try:
            return _parse_json(content)
        except ValueError:
            return _parse_yaml(content)

    parser = _parse_yaml if looks_like_yaml else _parse_json
    return parser(content)
def _is_yaml_format(content_type: str, url: str) -> bool:
"""Check if the content is YAML format."""
yaml_types = {"application/x-yaml", "text/yaml", "application/yaml"}
if any(t in content_type for t in yaml_types):
return True
lower_url = url.lower().split("?")[0]
return lower_url.endswith(".yaml") or lower_url.endswith(".yml")
def _is_json_format(content_type: str, url: str) -> bool:
"""Check if the content is JSON format."""
if "application/json" in content_type:
return True
lower_url = url.lower().split("?")[0]
return lower_url.endswith(".json")
def _parse_json(content: str) -> dict:
"""Parse content as JSON, raising ValueError on failure."""
try:
result = json.loads(content)
except json.JSONDecodeError as exc:
raise ValueError(f"Invalid JSON: {exc}") from exc
if not isinstance(result, dict):
raise ValueError(f"Expected a JSON object, got {type(result).__name__}")
return result
def _parse_yaml(content: str) -> dict:
    """Parse ``content`` as YAML and return the top-level mapping.

    Uses ``yaml.safe_load``, so only plain YAML data types are constructed.

    Args:
        content: The raw document text.

    Returns:
        The decoded YAML mapping.

    Raises:
        ValueError: If the text is not valid YAML, is nested too deeply for
            the loader, or its top-level value is not a mapping (this
            includes an empty document, which loads as ``None``).
    """
    try:
        result = yaml.safe_load(content)
    except (yaml.YAMLError, RecursionError) as exc:
        # RecursionError is not a YAMLError; convert it so that deeply nested
        # input is reported through the documented ValueError.
        raise ValueError(f"Invalid YAML: {exc}") from exc

    if not isinstance(result, dict):
        raise ValueError(f"Expected a YAML mapping, got {type(result).__name__}")
    return result