Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

## Unreleased

- Fix `sqlparse` dropping statements with an invalid leading token under CrateDB 6.3.2's optional-`statement` grammar; they are now recovered with their exception attached, for both the Python and JavaScript targets (#284)

## 2026/05/22 v0.0.18
- Fix error matching issue when missing EOF
- Set CrateDB version to 6.3.2
Expand Down
3 changes: 3 additions & 0 deletions cratedb_sqlparse_js/cratedb_sqlparse/AstBuilder.js
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ export class AstBuilder extends SqlBaseParserVisitor {
*/
enrich(stmt) {
this.stmt = stmt
if (this.stmt.ctx === null || this.stmt.ctx === undefined) {
return // synthesized statement: no tree to walk
}
this.visit(this.stmt.ctx)
}

Expand Down
47 changes: 43 additions & 4 deletions cratedb_sqlparse_js/cratedb_sqlparse/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,23 @@ export class Statement {
* @member {Metadata} metadata
* @member {string} type - The type of query, example: 'SELECT'
* @member {string} tree
* @param {object} ctx
* @param {object} ctx - null when the statement is synthesized from a parse error
* @param {ParseError} exception
*/
constructor(ctx, exception) {
this.ctx = ctx || null;
this.exception = exception || null;
this.metadata = new Metadata();

if (this.ctx === null) {
// Synthesized from a parse error: no tree, so fall back to the offending fragment.
this.query = this.exception ? this.exception.query : "";
this.originalQuery = this.exception ? this.exception.query : null;
this.tree = null;
this.type = null;
return;
}

this.query = ctx.parser.getTokenStream().getText(
new Interval(
ctx.start.tokenIndex,
Expand All @@ -175,9 +188,6 @@ export class Statement {
this.originalQuery = ctx.parser.getTokenStream().getText();
this.tree = ctx.toStringTree(null, ctx.parser);
this.type = ctx.start.text;
this.ctx = ctx;
this.exception = exception || null;
this.metadata = new Metadata();
}
}

Expand All @@ -199,6 +209,22 @@ function findSuitableError(statement, errors) {
}
}

/**
* Text after the first top-level `;`, or null if there is none. Scans tokens, so a `;` inside a
* string or comment is not treated as a separator.
*
* @param {CommonTokenStream} tokenStream
* @returns {string|null}
*/
function queryTailAfterFirstStatement(tokenStream) {
for (const token of tokenStream.tokens) {
if (token.type === SqlBaseLexer.SEMICOLON) {
return token.source[1].strdata.substring(token.stop + 1)
}
}
return null
}

/**
*
* @param {string} query
Expand Down Expand Up @@ -254,6 +280,19 @@ export function sqlparse(query, raise_exception = false) {
console.error("Could not match errors to queries, too much ambiguity, please report it opening an issue with the query.")
}

// Since CrateDB 6.3.2 made the leading `statement` optional, a statement whose first token is
// invalid produces no StatementContext and collapses the parse to []. Rebuild from the
// collected error, then recurse on the tail after the first `;` (GH-284). Fully-collapsed case
// only; a bad non-leading statement still derails its followers (GH-28).
if (!raise_exception && statementsContext.length === 0 && errorListener.errors.length > 0) {
const recovered = [new Statement(null, errorListener.errors[0])]
const tail = queryTailAfterFirstStatement(stream)
if (tail !== null && tail.trim()) {
recovered.push(...sqlparse(tail))
}
return recovered
}

const stmtEnricher = new AstBuilder()

for (const stmt of statements) {
Expand Down
32 changes: 29 additions & 3 deletions cratedb_sqlparse_js/tests/exceptions.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ test('Exception message is correct', () => {
SELECT A, B, C, D FROM tbl1;
SELECT D, A FROM tbl1 WHERE;`
)
const expectedMessage = "[line 2:8 mismatched input 'SELEC' expecting {'SELECT', 'DEALLOCATE', 'FETCH', 'END', 'WITH', 'CREATE', 'ALTER', 'KILL', 'CLOSE', 'BEGIN', 'START', 'COMMIT', 'ANALYZE', 'DISCARD', 'EXPLAIN', 'SHOW', 'OPTIMIZE', 'REFRESH', 'RESTORE', 'DROP', 'INSERT', 'VALUES', 'DELETE', 'UPDATE', 'SET', 'RESET', 'COPY', 'GRANT', 'DENY', 'REVOKE', 'DECLARE'}]"
const expectedMessage = "[line 2:8 mismatched input 'SELEC' expecting {<EOF>, 'SELECT', 'DEALLOCATE', 'FETCH', 'END', 'WITH', 'CREATE', 'ALTER', 'KILL', 'CLOSE', 'BEGIN', 'START', 'COMMIT', 'ANALYZE', 'DISCARD', 'EXPLAIN', 'SHOW', 'OPTIMIZE', 'REFRESH', 'RESTORE', 'DROP', 'INSERT', 'VALUES', 'DELETE', 'UPDATE', 'SET', 'RESET', 'COPY', 'GRANT', 'DENY', 'REVOKE', 'DECLARE', ';'}]"
const expectedMessageVerbose = " \n" +
" SELEC 1;\n" +
" ^^^^^\n" +
Expand All @@ -88,7 +88,7 @@ test('White or special characters should not avoid exception catching', () => {
`SELECT 1\t limit `,
`SELECT 1 limit `
]
for (const stmt in stmts) {
for (const stmt of stmts) {
let r = sqlparse(stmt)
expect(r[0].exception).toBeDefined();
}
Expand All @@ -112,7 +112,7 @@ test('Special characters should not avoid exception catching', () => {
`SELECT 1\t limit `,
`SELECT 1 limit `
]
for (const stmt in stmts) {
for (const stmt of stmts) {
let r = sqlparse(stmt)
expect(r[0].exception).not.toBeNull();
}
Expand Down Expand Up @@ -171,4 +171,30 @@ test('Missing EOF should not block error catching', () => {
expect(r[0].exception).toBeNull()
expect(r[1].exception).not.toBeNull()
}
})

test('Invalid leading token is recovered', () => {
// An invalid leading token must still yield a Statement carrying the exception (GH-284).
let r = sqlparse("SELCT 2")
expect(r).length(1)
expect(r[0].exception).not.toBeNull()
expect(typeof r[0].query).toBe("string") // synthesized statement: attribute access must not throw
expect(r[0].type).toBeNull()

// A bad leading statement must not swallow the valid ones after it.
r = sqlparse("SELCT 1; SELECT 2; SELECT 3 FROM tbl WHERE;")
expect(r).length(3)
expect(r[0].exception).not.toBeNull()
expect(r[1].exception).toBeNull()
expect(r[2].exception).not.toBeNull()
})

// GH-28 tripwire: a bad non-leading statement still derails the statements after it. `test.fails`
// flips to a failure when GH-28 is fixed — drop the marker then.
test.fails('Invalid token mid-stream derails following statements (GH-28)', () => {
const r = sqlparse("SELECT 1; SELEC 2; SELECT 3")
expect(r).length(3)
expect(r[0].exception).toBeNull()
expect(r[1].exception).not.toBeNull()
expect(r[2].exception).toBeNull()
})
2 changes: 2 additions & 0 deletions cratedb_sqlparse_py/cratedb_sqlparse/AstBuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ def stmt(self, value):

def enrich(self, stmt) -> None:
self.stmt = stmt
if self.stmt.ctx is None: # synthesized statement: no tree to walk
return
self.visit(self.stmt.ctx)

def visitTableName(self, ctx: SqlBaseParser.TableNameContext):
Expand Down
34 changes: 33 additions & 1 deletion cratedb_sqlparse_py/cratedb_sqlparse/parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# ruff: noqa: A005 # Module `parser` shadows a Python standard-library module
import typing as t
from typing import List

from antlr4 import CommonTokenStream, InputStream, RecognitionException, Token
Expand Down Expand Up @@ -145,7 +146,8 @@ def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):

class Statement:
"""
Represents a CrateDB SQL statement.
Represents a CrateDB SQL statement. `ctx` is None when the statement was synthesized from a
parse error instead of a parse tree (see the recovery path in `sqlparse`).
"""

def __init__(self, ctx: SqlBaseParser.StatementContext, exception: ParsingException = None):
Expand All @@ -158,27 +160,35 @@ def tree(self):
"""
Returns the String representation of the Tree in LISP format.
"""
if self.ctx is None:
return None
return self.ctx.toStringTree(recog=self.ctx.parser)

@property
def original_query(self):
"""
The original query that was fed into `sqlparse`, including semicolons and comments.
"""
if self.ctx is None:
return self.exception.query if self.exception else None
return self.ctx.parser.getTokenStream().getText()

@property
def query(self) -> str:
"""
Returns the query, comments and ';' are not included.
"""
if self.ctx is None:
return self.exception.query if self.exception else ""
return self.ctx.parser.getTokenStream().getText(start=self.ctx.start, stop=self.ctx.stop)

@property
def type(self):
"""
Returns the type of the Statement, i.e.: 'SELECT, 'UPDATE', 'COPY TO'...
"""
if self.ctx is None:
return None
return self.ctx.start.text

def __repr__(self):
Expand All @@ -199,6 +209,17 @@ def find_suitable_error(statement, errors):
errors.pop(errors.index(error))


def query_tail_after_first_statement(token_stream) -> t.Optional[str]:
"""
Return the text after the first top-level `;`, or None if there is none. Scans tokens, so a
`;` inside a string or comment is not treated as a separator.
"""
for token in token_stream.tokens:
if token.type == SqlBaseLexer.SEMICOLON:
return token.source[1].strdata[token.stop + 1 :]
return None


def sqlparse(query: str, raise_exception: bool = False) -> List[Statement]:
"""
Parses a string into SQL `Statement`.
Expand Down Expand Up @@ -257,6 +278,17 @@ def sqlparse(query: str, raise_exception: bool = False) -> List[Statement]:
# triggered and not be an actual error.
pass

# Since CrateDB 6.3.2 made the leading `statement` optional, a statement whose first token is
# invalid produces no StatementContext and collapses the parse to []. Rebuild from the
# collected error, then recurse on the tail after the first `;` (GH-284). Fully-collapsed case
# only; a bad non-leading statement still derails its followers (GH-28).
if not raise_exception and not statements_context and error_listener.errors:
recovered = [Statement(None, exception=error_listener.errors[0])]
tail = query_tail_after_first_statement(stream)
if tail is not None and tail.strip():
recovered.extend(sqlparse(tail))
return recovered

# We extract the metadata and enrich every Statement's `metadata`.
stmt_enricher = AstBuilder()

Expand Down
44 changes: 37 additions & 7 deletions cratedb_sqlparse_py/tests/test_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def test_exception_message():
SELECT A, B, C, D FROM tbl1;
SELECT D, A FROM tbl1 WHERE;
""")
expected_message = "InputMismatchException[line 2:9 mismatched input 'SELEC' expecting {'SELECT', 'DEALLOCATE', 'FETCH', 'END', 'WITH', 'CREATE', 'ALTER', 'KILL', 'CLOSE', 'BEGIN', 'START', 'COMMIT', 'ANALYZE', 'DISCARD', 'EXPLAIN', 'SHOW', 'OPTIMIZE', 'REFRESH', 'RESTORE', 'DROP', 'INSERT', 'VALUES', 'DELETE', 'UPDATE', 'SET', 'RESET', 'COPY', 'GRANT', 'DENY', 'REVOKE', 'DECLARE'}]" # noqa
expected_message = "InputMismatchException[line 2:9 mismatched input 'SELEC' expecting {<EOF>, 'SELECT', 'DEALLOCATE', 'FETCH', 'END', 'WITH', 'CREATE', 'ALTER', 'KILL', 'CLOSE', 'BEGIN', 'START', 'COMMIT', 'ANALYZE', 'DISCARD', 'EXPLAIN', 'SHOW', 'OPTIMIZE', 'REFRESH', 'RESTORE', 'DROP', 'INSERT', 'VALUES', 'DELETE', 'UPDATE', 'SET', 'RESET', 'COPY', 'GRANT', 'DENY', 'REVOKE', 'DECLARE', ';'}]" # noqa
expected_message_2 = "\n SELEC 1;\n ^^^^^\n SELECT A, B, C, D FROM tbl1;\n SELECT D, A FROM tbl1 WHERE;\n " # noqa
assert r[0].exception.error_message == expected_message
assert r[0].exception.original_query_with_error_marked == expected_message_2
Expand Down Expand Up @@ -93,12 +93,12 @@ def test_sqlparse_catches_exception():
"""
from cratedb_sqlparse import sqlparse

stmts = """
SELECT 1\n limit,
SELECT 1\r limit,
SELECT 1\t limit,
SELECT 1 limit
"""
stmts = [
"SELECT 1\n limit",
"SELECT 1\r limit",
"SELECT 1\t limit",
"SELECT 1 limit",
]
for stmt in stmts:
assert sqlparse(stmt)[0].exception

Expand Down Expand Up @@ -183,3 +183,33 @@ def test_sqlparse_match_exception_missing_eof():
assert len(r) == 2
assert not r[0].exception
assert r[1].exception


def test_sqlparse_recovers_invalid_leading_token():
"""An invalid leading token must still yield a Statement carrying the exception (GH-284)."""
from cratedb_sqlparse import sqlparse

r = sqlparse("SELCT 2")
assert len(r) == 1
assert r[0].exception is not None
assert isinstance(r[0].query, str) # synthesized statement: attribute access must not raise
assert r[0].type is None

# A bad leading statement must not swallow the valid ones after it.
r = sqlparse("SELCT 1; SELECT 2; SELECT 3 FROM tbl WHERE;")
assert len(r) == 3
assert r[0].exception is not None
assert r[1].exception is None
assert r[2].exception is not None


@pytest.mark.xfail(reason="GH-28: a bad non-leading statement still derails the statements after it")
def test_sqlparse_invalid_token_mid_stream_derails_followers():
"""Strict-xfail tripwire for GH-28; drop the marker when it starts passing."""
from cratedb_sqlparse import sqlparse

r = sqlparse("SELECT 1; SELEC 2; SELECT 3")
assert len(r) == 3
assert r[0].exception is None
assert r[1].exception is not None
assert r[2].exception is None