@@ -31,6 +31,50 @@ def _truncate_output(output: str) -> str:
3131 return f"{ first } \n ... [{ truncated_len } characters truncated] ... \n { last } "
3232
3333
34+ def format_api_error (e : requests .exceptions .HTTPError ) -> str :
35+ """Extract API error details as a plain multi-line string.
36+
37+ Mirrors :func:`handle_api_error` but returns text instead of printing,
38+ so it can be passed to UI handlers (e.g. ``ui.on_error``) that don't
39+ expose a Rich console — the Rich Live panel and the plain-text UI both
40+ consume errors as plain strings via the ``on_error`` protocol.
41+ """
42+ status = getattr (e .response , "status_code" , None )
43+ try :
44+ payload = e .response .json ()
45+ detail = payload .get ("detail" , payload )
46+ except (ValueError , AttributeError ):
47+ return getattr (e .response , "text" , "" ) or f"HTTP { status } Error"
48+
49+ def _format (detail_obj : Any ) -> list [str ]:
50+ if isinstance (detail_obj , str ):
51+ return [detail_obj ]
52+ if isinstance (detail_obj , dict ):
53+ lines : list [str ] = []
54+ message_keys = ("message" , "error" , "msg" , "detail" )
55+ message = next ((detail_obj .get (key ) for key in message_keys if detail_obj .get (key )), None )
56+ lines .append (message or f"HTTP { status } Error" )
57+ suggestion = detail_obj .get ("suggestion" )
58+ if suggestion :
59+ lines .append (str (suggestion ))
60+ extras = {
61+ k : v
62+ for k , v in detail_obj .items ()
63+ if k not in {"message" , "error" , "msg" , "detail" , "suggestion" } and v not in (None , "" )
64+ }
65+ for key , value in extras .items ():
66+ lines .append (f"{ key } : { value } " )
67+ return lines
68+ if isinstance (detail_obj , list ) and detail_obj :
69+ lines = list (_format (detail_obj [0 ]))
70+ for extra in detail_obj [1 :]:
71+ lines .append (str (extra ))
72+ return lines
73+ return [str (detail_obj ) if detail_obj else f"HTTP { status } Error" ]
74+
75+ return "\n " .join (_format (detail ))
76+
77+
3478def handle_api_error (e : requests .exceptions .HTTPError , console ) -> None :
3579 """Extract and display error messages from API responses in a structured format."""
3680 status = getattr (e .response , "status_code" , None )
@@ -272,11 +316,20 @@ def suggest(
272316 step : int | None = None ,
273317 task_id : str | None = None ,
274318 api_keys : dict [str , str ] | None = None ,
319+ timeout : tuple [int , int ] | int | None = None ,
275320 ) -> dict :
276321 """``POST /runs/{run_id}/suggest`` — submit execution output, get next candidate.
277322
278- If *step* is provided, transport errors (ReadTimeout, 502, ConnectionError)
279- trigger an automatic recovery attempt via ``get_run_status``.
323+ If *step* is provided (legacy flow), transport errors (ReadTimeout, 502,
324+ ConnectionError) trigger recovery via ``get_run_status``. If *task_id* is
325+ provided (queue flow), recovery instead checks ``/execution-tasks/`` and
326+ the run status, so a dropped response doesn't hang the CLI for up to
327+ ``timeout[1]`` seconds waiting on a socket the backend has already replied on.
328+
329+ Args:
330+ timeout: Optional ``(connect, read)`` tuple or int override for the
331+ HTTP request. Defaults to ``(10, 3650)`` to preserve existing
332+ behavior; pass a smaller value to exercise the recovery path.
280333
281334 Raises:
282335 requests.exceptions.HTTPError: On non-recoverable HTTP errors.
@@ -289,8 +342,10 @@ def suggest(
289342 if api_keys :
290343 body ["api_keys" ] = api_keys
291344
345+ request_timeout = timeout if timeout is not None else (10 , 3650 )
346+
292347 try :
293- resp = self ._post (f"/runs/{ run_id } /suggest" , json = body , timeout = ( 10 , 3650 ) )
348+ resp = self ._post (f"/runs/{ run_id } /suggest" , json = body , timeout = request_timeout )
294349 resp .raise_for_status ()
295350 result = resp .json ()
296351 if result .get ("plan" ) is None :
@@ -303,12 +358,21 @@ def suggest(
303358 recovered = self ._recover_suggest (run_id , step )
304359 if recovered is not None :
305360 return recovered
361+ elif task_id is not None :
362+ recovered = self ._recover_queue_suggest (run_id )
363+ if recovered is not None :
364+ return recovered
306365 raise type (exc )(exc ) from exc
307366 except requests .exceptions .HTTPError as exc :
308- if step is not None and getattr (exc .response , "status_code" , None ) == 502 :
367+ status_code = getattr (exc .response , "status_code" , None )
368+ if step is not None and status_code == 502 :
309369 recovered = self ._recover_suggest (run_id , step )
310370 if recovered is not None :
311371 return recovered
372+ elif task_id is not None and status_code in (502 , 503 , 504 ):
373+ recovered = self ._recover_queue_suggest (run_id )
374+ if recovered is not None :
375+ return recovered
312376 raise
313377
314378 def heartbeat (self , run_id : str ) -> bool :
@@ -482,6 +546,64 @@ def log_external_step(
482546 # Internal
483547 # ------------------------------------------------------------------
484548
549+ def _recover_queue_suggest (self , run_id : str ) -> dict | None :
550+ """Try to reconstruct a ``/suggest`` response for queue-mode clients.
551+
552+ Called after a transport error (ReadTimeout / 5xx / ConnectionError) when
553+ a ``task_id`` was supplied. The backend marks the submitted task as
554+ completed early in ``/suggest`` and, if everything succeeds, atomically
555+ creates the next node + revision + execution task before returning.
556+
557+ If we can observe either (a) a ready execution task queued for this run,
558+ or (b) the run transitioning to ``completed``, the submit effectively
559+ landed — we can synthesize a success response and let the main loop
560+ proceed to its next poll/claim iteration. Also recover the previous
561+ step's metric from the run history so the UI's ``on_metric`` still fires
562+ for the step whose response we missed. Otherwise return ``None`` and let
563+ the caller surface the transport error.
564+ """
565+ try :
566+ run_data = self .get_run_status (run_id , include_history = True )
567+ except Exception :
568+ run_data = None
569+
570+ run_status = (run_data or {}).get ("status" )
571+ if run_status in ("terminated" , "error" ):
572+ return None
573+
574+ # Latest node that has an execution output is the one we just evaluated
575+ # — its metric is the ``previous_solution_metric_value`` the caller expects.
576+ previous_metric = None
577+ if run_data is not None :
578+ evaluated_nodes = [
579+ n
580+ for n in (run_data .get ("nodes" ) or [])
581+ if n .get ("execution_output" ) is not None and n .get ("metric_value" ) is not None
582+ ]
583+ if evaluated_nodes :
584+ latest_evaluated = max (evaluated_nodes , key = lambda n : n .get ("step" , 0 ))
585+ previous_metric = latest_evaluated .get ("metric_value" )
586+
587+ def _with_metric (payload : dict ) -> dict :
588+ if previous_metric is not None :
589+ payload ["previous_solution_metric_value" ] = previous_metric
590+ return payload
591+
592+ if run_status == "completed" :
593+ return _with_metric ({"run_id" : run_id , "is_done" : True })
594+
595+ # Verify the next candidate task is queued — that's the signal that the
596+ # submit landed end-to-end (not just that the previous node was updated).
597+ try :
598+ tasks_result = self .get_execution_tasks (run_id )
599+ except Exception :
600+ tasks_result = None
601+
602+ if tasks_result is not None and tasks_result .tasks :
603+ return _with_metric ({"run_id" : run_id , "is_done" : False })
604+
605+ return None
606+
485607 def _recover_suggest (self , run_id : str , step : int ) -> dict | None :
486608 """Try to reconstruct a ``/suggest`` response after a transport error.
487609
0 commit comments