diff --git a/CHANGES.md b/CHANGES.md index 04bf011..d8151b3 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,9 @@ # Changelog +## Unreleased + +- Fix `sqlparse` dropping statements with an invalid leading token under CrateDB 6.3.2's optional-`statement` grammar; they are now recovered with their exception attached, for both the Python and JavaScript targets (#284) + ## 2026/05/22 v0.0.18 - Fix error matching issue when missing EOF - Set CrateDB version to 6.3.2 diff --git a/cratedb_sqlparse_js/cratedb_sqlparse/AstBuilder.js b/cratedb_sqlparse_js/cratedb_sqlparse/AstBuilder.js index 151b7f1..a62e30b 100644 --- a/cratedb_sqlparse_js/cratedb_sqlparse/AstBuilder.js +++ b/cratedb_sqlparse_js/cratedb_sqlparse/AstBuilder.js @@ -42,6 +42,9 @@ export class AstBuilder extends SqlBaseParserVisitor { */ enrich(stmt) { this.stmt = stmt + if (this.stmt.ctx === null || this.stmt.ctx === undefined) { + return // synthesized statement: no tree to walk + } this.visit(this.stmt.ctx) } diff --git a/cratedb_sqlparse_js/cratedb_sqlparse/parser.js b/cratedb_sqlparse_js/cratedb_sqlparse/parser.js index 9ea50d0..53af2c4 100644 --- a/cratedb_sqlparse_js/cratedb_sqlparse/parser.js +++ b/cratedb_sqlparse_js/cratedb_sqlparse/parser.js @@ -162,10 +162,23 @@ export class Statement { * @member {Metadata} metadata * @member {string} type - The type of query, example: 'SELECT' * @member {string} tree - * @param {object} ctx + * @param {object} ctx - null when the statement is synthesized from a parse error * @param {ParseError} exception */ constructor(ctx, exception) { + this.ctx = ctx || null; + this.exception = exception || null; + this.metadata = new Metadata(); + + if (this.ctx === null) { + // Synthesized from a parse error: no tree, so fall back to the offending fragment. + this.query = this.exception ? this.exception.query : ""; + this.originalQuery = this.exception ? this.exception.query : null; + this.tree = null; + this.type = null; + return; + } + this.query = ctx.parser.getTokenStream().getText( new Interval( ctx.start.tokenIndex, @@ -175,9 +188,6 @@ export class Statement { this.originalQuery = ctx.parser.getTokenStream().getText(); this.tree = ctx.toStringTree(null, ctx.parser); this.type = ctx.start.text; - this.ctx = ctx; - this.exception = exception || null; - this.metadata = new Metadata(); } } @@ -199,6 +209,22 @@ function findSuitableError(statement, errors) { } } +/** + * Text after the first top-level `;`, or null if there is none. Scans tokens, so a `;` inside a + * string or comment is not treated as a separator. + * + * @param {CommonTokenStream} tokenStream + * @returns {string|null} + */ +function queryTailAfterFirstStatement(tokenStream) { + for (const token of tokenStream.tokens) { + if (token.type === SqlBaseLexer.SEMICOLON) { + return token.source[1].strdata.substring(token.stop + 1) + } + } + return null +} + /** * * @param {string} query @@ -254,6 +280,19 @@ export function sqlparse(query, raise_exception = false) { console.error("Could not match errors to queries, too much ambiguity, please report it opening an issue with the query.") } + // Since CrateDB 6.3.2 made the leading `statement` optional, a statement whose first token is + // invalid produces no StatementContext and collapses the parse to []. Rebuild from the + // collected error, then recurse on the tail after the first `;` (GH-284). Fully-collapsed case + // only; a bad non-leading statement still derails its followers (GH-28). + if (!raise_exception && statementsContext.length === 0 && errorListener.errors.length > 0) { + const recovered = [new Statement(null, errorListener.errors[0])] + const tail = queryTailAfterFirstStatement(stream) + if (tail !== null && tail.trim()) { + recovered.push(...sqlparse(tail)) + } + return recovered + } + const stmtEnricher = new AstBuilder() for (const stmt of statements) { diff --git a/cratedb_sqlparse_js/tests/exceptions.test.js b/cratedb_sqlparse_js/tests/exceptions.test.js index 39ffbbb..3820bb7 100644 --- a/cratedb_sqlparse_js/tests/exceptions.test.js +++ b/cratedb_sqlparse_js/tests/exceptions.test.js @@ -68,7 +68,7 @@ test('Exception message is correct', () => { SELECT A, B, C, D FROM tbl1; SELECT D, A FROM tbl1 WHERE;` ) - const expectedMessage = "[line 2:8 mismatched input 'SELEC' expecting {'SELECT', 'DEALLOCATE', 'FETCH', 'END', 'WITH', 'CREATE', 'ALTER', 'KILL', 'CLOSE', 'BEGIN', 'START', 'COMMIT', 'ANALYZE', 'DISCARD', 'EXPLAIN', 'SHOW', 'OPTIMIZE', 'REFRESH', 'RESTORE', 'DROP', 'INSERT', 'VALUES', 'DELETE', 'UPDATE', 'SET', 'RESET', 'COPY', 'GRANT', 'DENY', 'REVOKE', 'DECLARE'}]" + const expectedMessage = "[line 2:8 mismatched input 'SELEC' expecting {, 'SELECT', 'DEALLOCATE', 'FETCH', 'END', 'WITH', 'CREATE', 'ALTER', 'KILL', 'CLOSE', 'BEGIN', 'START', 'COMMIT', 'ANALYZE', 'DISCARD', 'EXPLAIN', 'SHOW', 'OPTIMIZE', 'REFRESH', 'RESTORE', 'DROP', 'INSERT', 'VALUES', 'DELETE', 'UPDATE', 'SET', 'RESET', 'COPY', 'GRANT', 'DENY', 'REVOKE', 'DECLARE', ';'}]" const expectedMessageVerbose = " \n" + " SELEC 1;\n" + " ^^^^^\n" + @@ -88,7 +88,7 @@ test('White or special characters should not avoid exception catching', () => { `SELECT 1\t limit `, `SELECT 1 limit ` ] - for (const stmt in stmts) { + for (const stmt of stmts) { let r = sqlparse(stmt) expect(r[0].exception).toBeDefined(); } @@ -112,7 +112,7 @@ test('Special characters should not avoid exception catching', () => { `SELECT 1\t limit `, `SELECT 1 limit ` ] - for (const stmt in stmts) { + for (const stmt of stmts) { let r = sqlparse(stmt) expect(r[0].exception).not.toBeNull(); } @@ -171,4 +171,30 @@ test('Missing EOF should not block error catching', () => { expect(r[0].exception).toBeNull() expect(r[1].exception).not.toBeNull() } +}) + +test('Invalid leading token is recovered', () => { + // An invalid leading token must still yield a Statement carrying the exception (GH-284). + let r = sqlparse("SELCT 2") + expect(r).length(1) + expect(r[0].exception).not.toBeNull() + expect(typeof r[0].query).toBe("string") // synthesized statement: attribute access must not throw + expect(r[0].type).toBeNull() + + // A bad leading statement must not swallow the valid ones after it. + r = sqlparse("SELCT 1; SELECT 2; SELECT 3 FROM tbl WHERE;") + expect(r).length(3) + expect(r[0].exception).not.toBeNull() + expect(r[1].exception).toBeNull() + expect(r[2].exception).not.toBeNull() +}) + +// GH-28 tripwire: a bad non-leading statement still derails the statements after it. `test.fails` +// flips to a failure when GH-28 is fixed — drop the marker then. +test.fails('Invalid token mid-stream derails following statements (GH-28)', () => { + const r = sqlparse("SELECT 1; SELEC 2; SELECT 3") + expect(r).length(3) + expect(r[0].exception).toBeNull() + expect(r[1].exception).not.toBeNull() + expect(r[2].exception).toBeNull() }) \ No newline at end of file diff --git a/cratedb_sqlparse_py/cratedb_sqlparse/AstBuilder.py b/cratedb_sqlparse_py/cratedb_sqlparse/AstBuilder.py index 4946bf7..3ceb8dc 100644 --- a/cratedb_sqlparse_py/cratedb_sqlparse/AstBuilder.py +++ b/cratedb_sqlparse_py/cratedb_sqlparse/AstBuilder.py @@ -29,6 +29,8 @@ def stmt(self, value): def enrich(self, stmt) -> None: self.stmt = stmt + if self.stmt.ctx is None: # synthesized statement: no tree to walk + return self.visit(self.stmt.ctx) def visitTableName(self, ctx: SqlBaseParser.TableNameContext): diff --git a/cratedb_sqlparse_py/cratedb_sqlparse/parser.py b/cratedb_sqlparse_py/cratedb_sqlparse/parser.py index 6134a1a..e48a538 100644 --- a/cratedb_sqlparse_py/cratedb_sqlparse/parser.py +++ b/cratedb_sqlparse_py/cratedb_sqlparse/parser.py @@ -1,4 +1,5 @@ # ruff: noqa: A005 # Module `parser` shadows a Python standard-library module +import typing as t from typing import List from antlr4 import CommonTokenStream, InputStream, RecognitionException, Token @@ -145,7 +146,8 @@ def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e): class Statement: """ - Represents a CrateDB SQL statement. + Represents a CrateDB SQL statement. `ctx` is None when the statement was synthesized from a + parse error instead of a parse tree (see the recovery path in `sqlparse`). """ def __init__(self, ctx: SqlBaseParser.StatementContext, exception: ParsingException = None): @@ -158,6 +160,8 @@ def tree(self): """ Returns the String representation of the Tree in LISP format. """ + if self.ctx is None: + return None return self.ctx.toStringTree(recog=self.ctx.parser) @property @@ -165,6 +169,8 @@ def original_query(self): """ The original query that was fed into `sqlparse`, including semicolons and comments. """ + if self.ctx is None: + return self.exception.query if self.exception else None return self.ctx.parser.getTokenStream().getText() @property @@ -172,6 +178,8 @@ def query(self) -> str: """ Returns the query, comments and ';' are not included. """ + if self.ctx is None: + return self.exception.query if self.exception else "" return self.ctx.parser.getTokenStream().getText(start=self.ctx.start, stop=self.ctx.stop) @property @@ -179,6 +187,8 @@ def type(self): """ Returns the type of the Statement, i.e.: 'SELECT, 'UPDATE', 'COPY TO'... """ + if self.ctx is None: + return None return self.ctx.start.text def __repr__(self): @@ -199,6 +209,17 @@ def find_suitable_error(statement, errors): errors.pop(errors.index(error)) +def query_tail_after_first_statement(token_stream) -> t.Optional[str]: + """ + Return the text after the first top-level `;`, or None if there is none. Scans tokens, so a + `;` inside a string or comment is not treated as a separator. + """ + for token in token_stream.tokens: + if token.type == SqlBaseLexer.SEMICOLON: + return token.source[1].strdata[token.stop + 1 :] + return None + + def sqlparse(query: str, raise_exception: bool = False) -> List[Statement]: """ Parses a string into SQL `Statement`. @@ -257,6 +278,17 @@ def sqlparse(query: str, raise_exception: bool = False) -> List[Statement]: # triggered and not be an actual error. pass + # Since CrateDB 6.3.2 made the leading `statement` optional, a statement whose first token is + # invalid produces no StatementContext and collapses the parse to []. Rebuild from the + # collected error, then recurse on the tail after the first `;` (GH-284). Fully-collapsed case + # only; a bad non-leading statement still derails its followers (GH-28). + if not raise_exception and not statements_context and error_listener.errors: + recovered = [Statement(None, exception=error_listener.errors[0])] + tail = query_tail_after_first_statement(stream) + if tail is not None and tail.strip(): + recovered.extend(sqlparse(tail)) + return recovered + # We extract the metadata and enrich every Statement's `metadata`. stmt_enricher = AstBuilder() diff --git a/cratedb_sqlparse_py/tests/test_exceptions.py b/cratedb_sqlparse_py/tests/test_exceptions.py index c327f05..67790f9 100644 --- a/cratedb_sqlparse_py/tests/test_exceptions.py +++ b/cratedb_sqlparse_py/tests/test_exceptions.py @@ -9,7 +9,7 @@ def test_exception_message(): SELECT A, B, C, D FROM tbl1; SELECT D, A FROM tbl1 WHERE; """) - expected_message = "InputMismatchException[line 2:9 mismatched input 'SELEC' expecting {'SELECT', 'DEALLOCATE', 'FETCH', 'END', 'WITH', 'CREATE', 'ALTER', 'KILL', 'CLOSE', 'BEGIN', 'START', 'COMMIT', 'ANALYZE', 'DISCARD', 'EXPLAIN', 'SHOW', 'OPTIMIZE', 'REFRESH', 'RESTORE', 'DROP', 'INSERT', 'VALUES', 'DELETE', 'UPDATE', 'SET', 'RESET', 'COPY', 'GRANT', 'DENY', 'REVOKE', 'DECLARE'}]" # noqa + expected_message = "InputMismatchException[line 2:9 mismatched input 'SELEC' expecting {, 'SELECT', 'DEALLOCATE', 'FETCH', 'END', 'WITH', 'CREATE', 'ALTER', 'KILL', 'CLOSE', 'BEGIN', 'START', 'COMMIT', 'ANALYZE', 'DISCARD', 'EXPLAIN', 'SHOW', 'OPTIMIZE', 'REFRESH', 'RESTORE', 'DROP', 'INSERT', 'VALUES', 'DELETE', 'UPDATE', 'SET', 'RESET', 'COPY', 'GRANT', 'DENY', 'REVOKE', 'DECLARE', ';'}]" # noqa expected_message_2 = "\n SELEC 1;\n ^^^^^\n SELECT A, B, C, D FROM tbl1;\n SELECT D, A FROM tbl1 WHERE;\n " # noqa assert r[0].exception.error_message == expected_message assert r[0].exception.original_query_with_error_marked == expected_message_2 @@ -93,12 +93,12 @@ def test_sqlparse_catches_exception(): """ from cratedb_sqlparse import sqlparse - stmts = """ - SELECT 1\n limit, - SELECT 1\r limit, - SELECT 1\t limit, - SELECT 1 limit - """ + stmts = [ + "SELECT 1\n limit", + "SELECT 1\r limit", + "SELECT 1\t limit", + "SELECT 1 limit", + ] for stmt in stmts: assert sqlparse(stmt)[0].exception @@ -183,3 +183,33 @@ def test_sqlparse_match_exception_missing_eof(): assert len(r) == 2 assert not r[0].exception assert r[1].exception + + +def test_sqlparse_recovers_invalid_leading_token(): + """An invalid leading token must still yield a Statement carrying the exception (GH-284).""" + from cratedb_sqlparse import sqlparse + + r = sqlparse("SELCT 2") + assert len(r) == 1 + assert r[0].exception is not None + assert isinstance(r[0].query, str) # synthesized statement: attribute access must not raise + assert r[0].type is None + + # A bad leading statement must not swallow the valid ones after it. + r = sqlparse("SELCT 1; SELECT 2; SELECT 3 FROM tbl WHERE;") + assert len(r) == 3 + assert r[0].exception is not None + assert r[1].exception is None + assert r[2].exception is not None + + +@pytest.mark.xfail(reason="GH-28: a bad non-leading statement still derails the statements after it") +def test_sqlparse_invalid_token_mid_stream_derails_followers(): + """Strict-xfail tripwire for GH-28; drop the marker when it starts passing.""" + from cratedb_sqlparse import sqlparse + + r = sqlparse("SELECT 1; SELEC 2; SELECT 3") + assert len(r) == 3 + assert r[0].exception is None + assert r[1].exception is not None + assert r[2].exception is None