Files
smart-support/backend/app/openapi/parser.py
Yaojia Wang a54eb224e0 feat: complete phase 3 -- OpenAPI auto-discovery, SSRF protection, tool generation
- SSRF protection: private IP blocking, DNS rebinding defense, redirect validation
- OpenAPI fetcher with SSRF guard, JSON/YAML auto-detection, 10MB limit
- Structural spec validator (3.0.x/3.1.x)
- Endpoint parser with $ref resolution, auto-generated operation IDs
- Heuristic + LLM endpoint classifier with Protocol interface
- Review API at /api/openapi (import, job status, classification CRUD, approve)
- @tool code generator + Agent YAML generator
- Import orchestrator (fetch -> validate -> parse -> classify pipeline)
- 125 new tests, 322 total passing, 93.23% coverage
2026-03-31 00:10:44 +02:00

153 lines
5.1 KiB
Python

"""OpenAPI spec endpoint parser.
Extracts all endpoint definitions from a parsed OpenAPI spec dict,
resolving $ref references from components.
"""
from __future__ import annotations
import re
from app.openapi.models import EndpointInfo, ParameterInfo
# Path-item keys that are looked up as HTTP operations, in this order.
# NOTE(review): OpenAPI path items may also define `trace`; it is not listed
# here -- confirm that omission is intentional.
_HTTP_METHODS = ("get", "post", "put", "patch", "delete", "head", "options")
def parse_endpoints(spec_dict: dict) -> tuple[EndpointInfo, ...]:
    """Parse all endpoints from a validated OpenAPI spec dict.

    Every path item under ``paths`` is scanned for the HTTP methods listed in
    ``_HTTP_METHODS``.  Parameters declared on the path item itself apply to
    all of its operations, so they are merged into each operation before it
    is parsed; an operation-level parameter overrides an inherited one with
    the same name and location.

    Args:
        spec_dict: The parsed OpenAPI document.

    Returns:
        An immutable tuple of EndpointInfo instances; empty when ``paths`` is
        missing, empty, or not a mapping.
    """
    paths = spec_dict.get("paths", {})
    if not isinstance(paths, dict) or not paths:
        return ()

    endpoints: list[EndpointInfo] = []
    for path, path_item in paths.items():
        if not isinstance(path_item, dict):
            continue
        shared_params = path_item.get("parameters")
        for method in _HTTP_METHODS:
            operation = path_item.get(method)
            # Skip absent methods and malformed (non-mapping) operations
            # rather than failing on them further down the pipeline.
            if not isinstance(operation, dict):
                continue
            if isinstance(shared_params, list) and shared_params:
                # Build a shallow copy so the caller's spec is never mutated.
                operation = {
                    **operation,
                    "parameters": _merge_parameters(
                        shared_params, operation.get("parameters"), spec_dict
                    ),
                }
            endpoints.append(
                _parse_operation(path, method.upper(), operation, spec_dict)
            )
    return tuple(endpoints)


def _merge_parameters(
    path_params: list,
    operation_params: object,
    spec_dict: dict,
) -> list:
    """Return inherited path-level parameters followed by the operation's own."""
    own = operation_params if isinstance(operation_params, list) else []

    def identity(param: dict) -> tuple[str, str]:
        # A parameter is identified by name + location; references are
        # resolved only to learn which parameter they stand for.
        resolved = _resolve_ref(param, spec_dict)
        return (str(resolved.get("name")), str(resolved.get("in")))

    overridden = {identity(param) for param in own if isinstance(param, dict)}
    inherited = [
        param
        for param in path_params
        if isinstance(param, dict) and identity(param) not in overridden
    ]
    return inherited + own
def _parse_operation(
    path: str,
    method: str,
    operation: dict,
    spec_dict: dict,
) -> EndpointInfo:
    """Build the EndpointInfo that describes one operation of a path item.

    A missing or empty ``operationId`` is replaced by an identifier derived
    from the path and method.  Parameters, the request body schema and the
    response schema are extracted by the dedicated helper functions.
    """
    declared_id = operation.get("operationId")
    if not declared_id:
        declared_id = _generate_operation_id(path, method)

    raw_params = operation.get("parameters", [])
    raw_responses = operation.get("responses", {})

    return EndpointInfo(
        path=path,
        method=method,
        operation_id=declared_id,
        summary=operation.get("summary", ""),
        description=operation.get("description", ""),
        parameters=tuple(_parse_parameters(raw_params, spec_dict)),
        request_body_schema=_parse_request_body(
            operation.get("requestBody"), spec_dict
        ),
        response_schema=_parse_response_schema(raw_responses, spec_dict),
    )
def _parse_parameters(
params_list: list,
spec_dict: dict,
) -> list[ParameterInfo]:
"""Parse list of parameter dicts into ParameterInfo instances."""
result: list[ParameterInfo] = []
for param in params_list:
if not isinstance(param, dict):
continue
schema = param.get("schema", {})
schema_type = schema.get("type", "string") if isinstance(schema, dict) else "string"
result.append(
ParameterInfo(
name=param.get("name", ""),
location=param.get("in", "query"),
required=bool(param.get("required", False)),
schema_type=schema_type,
description=param.get("description", ""),
)
)
return result
def _parse_request_body(request_body: dict | None, spec_dict: dict) -> dict | None:
"""Extract schema from requestBody, resolving $ref if present."""
if not isinstance(request_body, dict):
return None
content = request_body.get("content", {})
if not isinstance(content, dict):
return None
# Prefer application/json
for media_type in ("application/json", *content.keys()):
media = content.get(media_type)
if not isinstance(media, dict):
continue
schema = media.get("schema")
if schema:
return _resolve_ref(schema, spec_dict)
return None
def _parse_response_schema(responses: dict, spec_dict: dict) -> dict | None:
"""Extract schema from the first 2xx response."""
if not isinstance(responses, dict):
return None
for status_code in ("200", "201", "202", "204"):
response = responses.get(status_code)
if not isinstance(response, dict):
continue
content = response.get("content", {})
if not isinstance(content, dict):
continue
for media_type in ("application/json", *content.keys()):
media = content.get(media_type)
if not isinstance(media, dict):
continue
schema = media.get("schema")
if schema:
return _resolve_ref(schema, spec_dict)
return None
def _resolve_ref(schema: object, spec_dict: dict) -> dict:
"""Resolve a $ref to its target schema, or return the schema as-is."""
if not isinstance(schema, dict):
return {}
ref = schema.get("$ref")
if not ref:
return schema
# Only handle local refs like #/components/schemas/Foo
if not isinstance(ref, str) or not ref.startswith("#/"):
return schema
parts = ref.lstrip("#/").split("/")
target: object = spec_dict
for part in parts:
if not isinstance(target, dict):
return schema
target = target.get(part)
return target if isinstance(target, dict) else schema
def _generate_operation_id(path: str, method: str) -> str:
"""Generate a snake_case operation_id from path and method."""
# Remove path parameters braces and replace / with _
clean = re.sub(r"\{[^}]+\}", "by_param", path)
clean = re.sub(r"[^a-zA-Z0-9]+", "_", clean).strip("_")
return f"{method.lower()}_{clean}" if clean else method.lower()