Skip to content
Closed
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/seclab_taskflow_agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from agents.run import DEFAULT_MAX_TURNS
from dotenv import find_dotenv, load_dotenv
from openai import AsyncOpenAI
import httpx

from .capi import get_AI_endpoint, get_AI_token, get_provider

Expand Down Expand Up @@ -182,6 +183,7 @@ def __init__(
base_url=resolved_endpoint,
api_key=resolved_token,
default_headers=provider.extra_headers or None,
timeout=httpx.Timeout(connect=10.0, read=300.0, write=300.0, pool=60.0),
)
Comment on lines 182 to 187
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The client read timeout is set to 300s while the runner’s STREAM_IDLE_TIMEOUT is 1800s. Since these are two independent timeouts affecting streaming behavior, the effective idle-kill behavior can become unclear to maintainers and may not match the intended 30-minute threshold. Consider centralizing/documenting the relationship between these timeouts (or aligning them) so the configured behavior is unambiguous.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are intentionally different — they guard different failure modes at different layers:

  • httpx.Timeout(read=300s) — TCP-level. Catches dead connections where the socket itself stops delivering bytes (CLOSE_WAIT). This is the first line of defense.
  • STREAM_IDLE_TIMEOUT(1800s) — Application-level. Catches hangs where the connection is technically alive but no events arrive (e.g. the async generator is stuck, or the server stops sending SSE frames while keeping the connection open).

The read timeout fires per individual socket read; the idle timeout fires when no complete event has been yielded for 30 minutes. In practice the httpx timeout catches most dead-connection cases and the idle timeout is a backstop for subtler hangs. I'll add a comment in the code clarifying the relationship.

set_tracing_disabled(True)
self.run_hooks = run_hooks or TaskRunHooks()
Expand All @@ -198,6 +200,7 @@ def _ToolsToFinalOutputFunction(
else:
model_impl = OpenAIChatCompletionsModel(model=model, openai_client=client)

self._openai_client = client
self.agent = Agent(
name=name,
instructions=instructions,
Expand All @@ -209,6 +212,11 @@ def _ToolsToFinalOutputFunction(
hooks=agent_hooks or TaskAgentHooks(),
)

async def close(self) -> None:
"""Close the underlying AsyncOpenAI client and its httpx connection pool."""
if self._openai_client is not None:
await self._openai_client.close()

async def run(self, prompt: str, max_turns: int = DEFAULT_MAX_TURNS) -> result.RunResult:
"""Run the agent to completion and return the result."""
return await Runner.run(starting_agent=self.agent, input=prompt, max_turns=max_turns, hooks=self.run_hooks)
Expand Down
108 changes: 105 additions & 3 deletions src/seclab_taskflow_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
import json
import logging
import os
import sys
import threading
import time
import uuid
from typing import Any

Expand Down Expand Up @@ -51,6 +54,49 @@
MAX_API_RETRY = 5 # Maximum number of consecutive API error retries
TASK_RETRY_LIMIT = 3 # Maximum retry attempts for a failed task
TASK_RETRY_BACKOFF = 10 # Initial backoff in seconds between task retries
# Application-level backstop: kill a streaming run if no events yielded for 30 min.
# Complements the TCP-level httpx.Timeout(read=300s) in agent.py which catches
# dead sockets; this catches subtler hangs where the connection stays open but
# the server (or async generator) stops producing events.
STREAM_IDLE_TIMEOUT = 1800

# Watchdog: a non-asyncio thread that force-kills the process if the event
# loop stops making progress. Covers every hang variant (dead connections,
# asyncio cleanup spin, MCP cleanup, etc.) because it runs outside asyncio.
# Override via the WATCHDOG_IDLE_TIMEOUT environment variable (seconds).
WATCHDOG_IDLE_TIMEOUT = int(os.environ.get("WATCHDOG_IDLE_TIMEOUT", "2100")) # 35 min default

# Shared watchdog state: the last-activity timestamp is written by
# watchdog_ping() (from asyncio callbacks) and read by the watchdog thread,
# so both sides take the lock. Stamped once here at import time.
_watchdog_last_activity = time.monotonic()
_watchdog_lock = threading.Lock()


def watchdog_ping() -> None:
    """Call from any coroutine/callback to signal the process is alive.

    Records the current monotonic time under the lock; the watchdog thread
    measures idleness as the elapsed time since this marker was last stamped.
    """
    global _watchdog_last_activity
    with _watchdog_lock:
        _watchdog_last_activity = time.monotonic()


def _watchdog_thread(timeout: int) -> None:
    """Background thread body: force-exit if no activity for *timeout* seconds.

    Periodically compares the shared ``_watchdog_last_activity`` timestamp
    (refreshed via :func:`watchdog_ping`) against *timeout*; on breach it
    flushes stdio and calls ``os._exit(2)`` so a wedged event loop cannot
    block shutdown. Runs until the process dies — never returns normally.
    """
    # Poll at ~timeout/5 so we overshoot the deadline only slightly, but
    # never busier than once a second nor lazier than once a minute.
    check_interval = min(60, max(1, timeout // 5))
    while True:
        time.sleep(check_interval)
        with _watchdog_lock:
            idle = time.monotonic() - _watchdog_last_activity
        if idle > timeout:
            # Lazy %-style args (not an f-string) per logging best practice.
            logging.error(
                "Watchdog: no activity for %.0fs (limit %ss) — "
                "force-exiting to prevent hang",
                idle,
                timeout,
            )
            sys.stderr.flush()
            sys.stdout.flush()
            # os._exit skips atexit/finally handlers on purpose: the event
            # loop (which would run them) may be the thing that is hung.
            os._exit(2)


def start_watchdog(timeout: int = WATCHDOG_IDLE_TIMEOUT) -> None:
    """Start the watchdog thread (idempotent, daemon thread).

    Fixes two latent issues in the naive implementation: (1) repeated calls
    no longer stack extra watchdog threads — the live thread is remembered on
    the function object and reused; (2) the idle clock is reset before the
    thread starts, because ``_watchdog_last_activity`` was stamped at import
    time and a late start could otherwise trip the watchdog immediately.
    """
    existing = getattr(start_watchdog, "_thread", None)
    if existing is not None and existing.is_alive():
        return
    # Reset the idle clock so the watchdog measures from "now", not import.
    watchdog_ping()
    t = threading.Thread(target=_watchdog_thread, args=(timeout,), daemon=True)
    start_watchdog._thread = t
    t.start()
Comment on lines +99 to +107
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

start_watchdog is documented as idempotent, but the current implementation always spawns a new daemon thread on every call. Also, _watchdog_last_activity is initialized at module import time and start_watchdog() does not reset it, so if this module is imported and start_watchdog() is invoked later than timeout seconds, the watchdog can force-exit almost immediately. Consider (1) tracking a module-level started flag/thread to make this truly idempotent, and (2) calling watchdog_ping() (or otherwise resetting the timestamp) inside start_watchdog() before starting the thread.

Copilot uses AI. Check for mistakes.


def _resolve_model_config(
Expand Down Expand Up @@ -367,6 +413,7 @@ async def deploy_task_agents(
server_prompts=server_prompts,
important_guidelines=important_guidelines,
)
agent0 = None
agent0 = TaskAgent(
name=primary_name,
Comment on lines 421 to 428
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agent0 is referenced in the outer finally block, but it’s only initialized to None here after multiple earlier operations in the try block (e.g., building handoff agents / system prompt). If an exception is raised before this line runs, the finally block will raise UnboundLocalError when checking agent0, preventing cleanup. Initialize agent0 to None at the very start of the outer try (or before it) and then assign the TaskAgent once created; also avoids the current redundant double-assignment.

Copilot uses AI. Check for mistakes.
Comment on lines 421 to 428
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agent0 is initialized to None only after several operations that can raise (e.g., building prompts / handoff agent setup). If an exception occurs before line 416 executes, the finally block later will reference an unbound local agent0 when attempting to close it, raising UnboundLocalError and masking the original failure. Define agent0: TaskAgent | None = None before the outer try: (or otherwise ensure it is always bound) before entering code that may throw.

This issue also appears in the following locations of the same file:

  • line 505
  • line 824
  • line 824

Copilot uses AI. Check for mistakes.
instructions=prompt_with_handoff_instructions(system_prompt) if handoffs else system_prompt,
Expand All @@ -389,11 +436,41 @@ async def _run_streamed() -> None:
max_retry = MAX_API_RETRY
rate_limit_backoff = RATE_LIMIT_BACKOFF
while rate_limit_backoff:
result = None
try:
result = agent0.run_streamed(prompt, max_turns=max_turns)
async for event in result.stream_events():
if event.type == "raw_response_event" and isinstance(event.data, ResponseTextDeltaEvent):
await render_model_output(event.data.delta, async_task=async_task, task_id=task_id)
stream = None
try:
stream = result.stream_events()
async_iter = stream.__aiter__()
while True:
try:
event = await asyncio.wait_for(
async_iter.__anext__(),
timeout=STREAM_IDLE_TIMEOUT,
)
except StopAsyncIteration:
break
except asyncio.TimeoutError:
logging.error(
f"Stream idle for {STREAM_IDLE_TIMEOUT}s — "
"connection likely dead, raising APITimeoutError"
)
raise APITimeoutError("Stream idle timeout exceeded")
if event.type == "raw_response_event" and isinstance(event.data, ResponseTextDeltaEvent):
watchdog_ping()
await render_model_output(event.data.delta, async_task=async_task, task_id=task_id)
finally:
if stream is not None:
aclose = getattr(stream, "aclose", None)
if aclose is not None:
await aclose()
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The finally cleanup calls await aclose() directly. Any exception raised during aclose() will override the original streaming error/timeout and can break the retry path. Consider wrapping the aclose() call in a try/except (logging on failure) so cleanup failures don’t mask the root cause.

Suggested change
await aclose()
try:
await aclose()
except Exception:
logging.exception("Failed to close streamed response")

Copilot uses AI. Check for mistakes.
# Cancel the RunResultStreaming background tasks.
# aclose() on the stream_events() async generator throws
# GeneratorExit which skips _cleanup_tasks(), so we must
# cancel explicitly to avoid leaking _run_impl_task.
if result is not None:
result.cancel()
await render_model_output("\n\n", async_task=async_task, task_id=task_id)
return
except APITimeoutError:
Expand Down Expand Up @@ -433,6 +510,11 @@ async def _run_streamed() -> None:
return complete

finally:
# Close the AsyncOpenAI client to release httpx connection pool.
# Dead CLOSE_WAIT sockets in the pool cause kqueue CPU spin if left open.
watchdog_ping()
if agent0 is not None:
await agent0.close()
start_cleanup.set()
cleanup_attempts_left = len(entries)
while cleanup_attempts_left and entries:
Expand All @@ -443,6 +525,14 @@ async def _run_streamed() -> None:
continue
except Exception:
logging.exception("Exception in mcp server cleanup task")
# Cancel the MCP session task if it's still running to prevent
# the asyncio event loop from spinning on a dangling task.
if not mcp_sessions.done():
mcp_sessions.cancel()
try:
await mcp_sessions
except (asyncio.CancelledError, Exception):
pass
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After mcp_sessions.cancel(), the code does await mcp_sessions with no timeout. This can hang indefinitely if the session task suppresses cancellation during cleanup (it catches CancelledError inside its cleanup loop). Use a bounded wait (e.g., asyncio.wait_for(..., timeout=MCP_CLEANUP_TIMEOUT)) and log if cancellation doesn’t complete, so shutdown can’t re-hang in the cleanup path.

Suggested change
await mcp_sessions
except (asyncio.CancelledError, Exception):
pass
await asyncio.wait_for(mcp_sessions, timeout=MCP_CLEANUP_TIMEOUT)
except asyncio.TimeoutError:
logging.warning(
"Timed out waiting for MCP session task cancellation after %s seconds",
MCP_CLEANUP_TIMEOUT,
)
except asyncio.CancelledError:
pass
except Exception:
logging.exception("Exception while waiting for mcp session task cancellation")

Copilot uses AI. Check for mistakes.


async def run_main(
Expand All @@ -465,12 +555,18 @@ async def run_main(
"""
from .session import TaskflowSession

# Start the watchdog thread — if the process hangs for any reason
# (asyncio spin, dead connections, MCP cleanup), this kills it.
start_watchdog()

last_mcp_tool_results: list[str] = []

    async def on_tool_end_hook(context: RunContextWrapper[TContext], agent: Agent[TContext], tool: Tool, result: str) -> None:
        """After each tool call: mark the process alive and record the result."""
        watchdog_ping()
        # Closure over run_main's list; later code reads the accumulated results.
        last_mcp_tool_results.append(result)

    async def on_tool_start_hook(context: RunContextWrapper[TContext], agent: Agent[TContext], tool: Tool) -> None:
        """Before each tool call: mark the process alive and announce the tool."""
        watchdog_ping()
        await render_model_output(f"\n** 🤖🛠️ Tool Call: {tool.name}\n")

async def on_handoff_hook(context: RunContextWrapper[TContext], agent: Agent[TContext], source: Agent[TContext]) -> None:
Expand Down Expand Up @@ -731,3 +827,9 @@ async def _deploy(ra: dict, pp: str) -> bool:
if session is not None and not session.error:
session.mark_finished()
await render_model_output(f"** 🤖✅ Session {session.session_id} completed\n")

# Force-exit to prevent asyncio event loop spin on dangling
# tasks/connections from the responses API path. Flush first.
sys.stdout.flush()
sys.stderr.flush()
os._exit(0 if (session is None or session.finished) else 1)
Loading