@@ -530,8 +530,6 @@ def extract_rules(pdf_path: Path, standard: str, cache_dir: Path) -> list[Rule]:
530530# transform — no parsing or formatting — and keeps the output readable.
531531
532532_CODE_FORMAT_STEPS = [
533- # Pull "// ..." comments onto their own line.
534- (re .compile (r"\s+//" ), "\n //" ),
535533 # Newline after `;` (but not inside `for( ; ; )` — the next rule catches
536534 # runs of `;` we should leave alone).
537535 (re .compile (r";\s+(?=\S)" ), ";\n " ),
@@ -542,17 +540,44 @@ def extract_rules(pdf_path: Path, standard: str, cache_dir: Path) -> list[Rule]:
542540]
543541
544542
543+ def _indent_by_braces (text : str ) -> str :
544+ """Add 2-space indentation based on brace nesting depth."""
545+ lines = text .splitlines ()
546+ out : list [str ] = []
547+ depth = 0
548+ for line in lines :
549+ stripped = line .strip ()
550+ if not stripped :
551+ out .append ("" )
552+ continue
553+ # Dedent for lines that start with `}`
554+ if stripped .startswith ("}" ):
555+ depth = max (0 , depth - 1 )
556+ out .append (" " * depth + stripped )
557+ # Indent after lines that end with `{`
558+ if stripped .endswith ("{" ):
559+ depth += 1
560+ return "\n " .join (out )
561+
562+
def _format_code_lines(text: str) -> str:
    """Heuristically insert line breaks into a C/C++ code example that
    docling concatenated onto a single line. Deterministic.

    Steps, in order:

    1. Collapse every run of 3+ spaces/tabs to a single space (runs of
       exactly 2 are left untouched) and strip the whole text.
    2. Apply each ``(pattern, replacement)`` pair of ``_CODE_FORMAT_STEPS``.
    3. Strip trailing whitespace per line and leading/trailing blank space
       from the whole result.
    4. Re-indent by brace depth via ``_indent_by_braces``.

    NOTE(review): ``//`` comments are no longer moved onto their own line
    by a dedicated rule, but the ``;`` rule in ``_CODE_FORMAT_STEPS``
    (``;\\s+(?=\\S)``) still appears to break before a comment that follows
    a ``;`` -- confirm whether ``stmt; // comment`` is meant to stay on one
    line.
    """
    # Runs of 3+ blanks are treated as docling layout artefacts; 2-space
    # runs are kept because they may be intentional column alignment.
    s = re.sub(r"[ \t]{3,}", " ", text).strip()
    for pat, repl in _CODE_FORMAT_STEPS:
        s = pat.sub(repl, s)
    # Trim trailing whitespace on each line before re-indenting.
    s = "\n".join(line.rstrip() for line in s.splitlines()).strip()
    # Add indentation based on brace depth.
    return _indent_by_braces(s)
556581
557582
558583# ----------------------------------------------------------------------------
0 commit comments