fix: address critical security and code review findings in Phase 3

- Wire ImportOrchestrator into review_api start_import via BackgroundTasks
- Sanitize docstrings in generated tool code to prevent code injection
- Add Literal["read", "write"] validation for access_type
- Add regex validation for agent_group
- Validate URL scheme (http/https only) in ImportRequest
- Validate LLM output fields (clamp confidence, validate access_type)
- Use dataclasses.replace instead of manual reconstruction in importer
- Expand SSRF blocked networks (Carrier-Grade NAT, IPv4-mapped IPv6, etc.)
- Make _BLOCKED_NETWORKS immutable tuple
- Use yaml.safe_dump instead of yaml.dump
- Fix _to_snake_case for empty strings and Python keywords
This commit is contained in:
Yaojia Wang
2026-03-31 00:28:28 +02:00
parent a54eb224e0
commit a2f750269d
6 changed files with 128 additions and 28 deletions

View File

@@ -6,6 +6,7 @@ YAML agent configurations from classification results.
from __future__ import annotations
import keyword
import re
import yaml
@@ -24,7 +25,7 @@ def generate_tool_code(classification: ClassificationResult, base_url: str) -> G
func_name = _to_snake_case(ep.operation_id)
params = _collect_params(ep)
sig = _build_signature(params, ep.request_body_schema)
docstring = ep.summary or ep.description or ep.operation_id
docstring = _sanitize_docstring(ep.summary or ep.description or ep.operation_id)
interrupt_comment = _interrupt_comment(classification)
http_call = _build_http_call(ep, base_url, params)
@@ -59,7 +60,7 @@ def generate_agent_yaml(
Groups tools by agent_group, creating one agent entry per group.
"""
if not classifications:
return yaml.dump({"agents": []})
return yaml.safe_dump({"agents": []})
groups: dict[str, dict] = {}
for clf in classifications:
@@ -75,7 +76,7 @@ def generate_agent_yaml(
}
groups[group]["tools"].append(func_name)
return yaml.dump({"agents": list(groups.values())}, sort_keys=False)
return yaml.safe_dump({"agents": list(groups.values())}, sort_keys=False)
# --- Private helpers ---
@@ -149,8 +150,15 @@ def _schema_type_to_python(schema_type: str) -> str:
return mapping.get(schema_type, "str")
def _sanitize_docstring(text: str) -> str:
"""Escape triple-quotes and newlines to prevent docstring injection."""
return text.replace("\\", "\\\\").replace('"""', r"\"\"\"").replace("\n", " ")
def _to_snake_case(name: str) -> str:
"""Convert operationId to a valid snake_case Python identifier."""
# Replace non-alphanumeric with underscore
clean = re.sub(r"[^a-zA-Z0-9]+", "_", name).strip("_")
return clean.lower()
result = clean.lower() or "unnamed_tool"
if keyword.iskeyword(result):
result = f"{result}_tool"
return result