Commit 43eaee6
Add Databricks/Spark SQL adapter
- Add DatabricksAdapter with connection URL support (databricks://)
- Add Databricks to symmetric aggregation using the xxhash64 hash function
- Add 11 integration tests (skipped in CI; requires a real Databricks workspace)
- Update SemanticLayer to recognize databricks:// URLs
- Add databricks-sql-connector to optional dependencies
- Tests at parity with other databases: basic metrics, dimensions, joins, filters, ORDER BY, LIMIT, symmetric aggregates, 3-way joins
- No CI job (requires real Databricks credentials)
1 parent 04b0fdf commit 43eaee6

7 files changed: 768 additions & 2 deletions

pyproject.toml

Lines changed: 4 additions & 0 deletions
```diff
@@ -52,6 +52,10 @@ clickhouse = [
     "clickhouse-connect>=0.6.0",
     "pyarrow>=14.0.0",  # For Arrow support
 ]
+databricks = [
+    "databricks-sql-connector>=2.0.0",
+    "pyarrow>=14.0.0",  # For Arrow support
+]

 [build-system]
 requires = ["hatchling"]
```

sidemantic/core/semantic_layer.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -34,6 +34,7 @@ def __init__(
                 - bigquery://project_id/dataset_id
                 - snowflake://user:password@account/database/schema
                 - clickhouse://user:password@host:port/database
+                - databricks://token@server-hostname/http-path
             dialect: SQL dialect for query generation (optional, inferred from adapter)
             auto_register: Set as current layer for auto-registration (default: True)
             use_preaggregations: Enable automatic pre-aggregation routing (default: False)
@@ -76,10 +77,15 @@ def __init__(
 
                 self.adapter = ClickHouseAdapter.from_url(connection)
                 self.dialect = dialect or "clickhouse"
+            elif connection.startswith("databricks://"):
+                from sidemantic.db.databricks import DatabricksAdapter
+
+                self.adapter = DatabricksAdapter.from_url(connection)
+                self.dialect = dialect or "databricks"
             else:
                 raise ValueError(
                     f"Unsupported connection URL: {connection}. "
-                    "Supported: duckdb:///, postgres://, bigquery://, snowflake://, clickhouse://, or BaseDatabaseAdapter instance"
+                    "Supported: duckdb:///, postgres://, bigquery://, snowflake://, clickhouse://, databricks://, or BaseDatabaseAdapter instance"
                 )
         else:
             raise TypeError(f"connection must be a string URL or BaseDatabaseAdapter instance, got {type(connection)}")
```

sidemantic/core/symmetric_aggregate.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -26,7 +26,7 @@ def build_symmetric_aggregate_sql(
         primary_key: The primary key field to use for deduplication
         agg_type: Type of aggregation (sum, avg, count, count_distinct)
         model_alias: Optional table/CTE alias to prefix columns
-        dialect: SQL dialect (duckdb, bigquery, postgres, snowflake, clickhouse)
+        dialect: SQL dialect (duckdb, bigquery, postgres, snowflake, clickhouse, databricks)
 
     Returns:
         SQL expression using symmetric aggregates
@@ -68,6 +68,12 @@ def hash_func(col):
         def hash_func(col):
             return f"halfMD5(CAST({col} AS String))"
 
+        multiplier = "1048576"  # 2^20 as literal
+    elif dialect == "databricks":
+        # Databricks/Spark SQL xxhash64 returns bigint
+        def hash_func(col):
+            return f"xxhash64(CAST({col} AS STRING))"
+
         multiplier = "1048576"  # 2^20 as literal
     else:  # duckdb
```
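For context on what the new branch feeds: symmetric aggregates keep a measure correct under join fan-out by folding each row's primary-key hash into a SUM(DISTINCT ...) and subtracting the hashes back out. The sketch below shows the classic formulation with the Databricks hash function; it is illustrative, not sidemantic's exact generated SQL, and this diff does not show how the library applies the 2^20 multiplier:

```python
# Illustrative only: the classic symmetric-aggregate identity. Each distinct
# primary key contributes its measure once, even when a join duplicates rows.
def hash_func(col: str) -> str:
    # Same hash expression the diff adds for the databricks dialect.
    return f"xxhash64(CAST({col} AS STRING))"

def symmetric_sum(measure: str, primary_key: str) -> str:
    h = hash_func(primary_key)
    # SUM(DISTINCT hash + measure) - SUM(DISTINCT hash) sums the measure once
    # per key (integer measures assumed; real implementations scale decimals,
    # which is where a constant like 2^20 comes in).
    return f"SUM(DISTINCT {h} + {measure}) - SUM(DISTINCT {h})"

print(symmetric_sum("orders.amount", "orders.order_id"))
```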

sidemantic/db/__init__.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -27,4 +27,8 @@ def __getattr__(name):
         from sidemantic.db.clickhouse import ClickHouseAdapter
 
         return ClickHouseAdapter
+    if name == "DatabricksAdapter":
+        from sidemantic.db.databricks import DatabricksAdapter
+
+        return DatabricksAdapter
     raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
```
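The hook being extended here is PEP 562's module-level `__getattr__`, which keeps `import sidemantic.db` cheap: the databricks module is only imported when the attribute is first accessed. A standalone toy version of the pattern (not sidemantic code):

```python
# toy_module.py -- the lazy-attribute pattern in isolation.
def __getattr__(name):
    if name == "Expensive":
        import json  # stand-in for a heavy optional dependency
        return json
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```

`from toy_module import Expensive` then triggers the import on first access rather than at module load, so a missing optional dependency only fails when it is actually used.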

sidemantic/db/databricks.py

Lines changed: 214 additions & 0 deletions
```python
"""Databricks/Spark SQL database adapter."""

from typing import Any
from urllib.parse import parse_qs, unquote, urlparse

from sidemantic.db.base import BaseDatabaseAdapter


class DatabricksResult:
    """Wrapper for Databricks cursor to match DuckDB result API."""

    def __init__(self, cursor):
        """Initialize Databricks result wrapper.

        Args:
            cursor: Databricks cursor object
        """
        self.cursor = cursor
        self._description = cursor.description

    def fetchone(self) -> tuple | None:
        """Fetch one row from the result."""
        return self.cursor.fetchone()

    def fetchall(self) -> list[tuple]:
        """Fetch all remaining rows."""
        return self.cursor.fetchall()

    def fetch_record_batch(self) -> Any:
        """Convert result to PyArrow RecordBatchReader."""
        import pyarrow as pa

        # Databricks cursor may support Arrow format directly
        # For now, convert from standard result
        rows = self.cursor.fetchall()
        if not rows:
            # Empty result
            schema = pa.schema([(desc[0], pa.string()) for desc in self._description])
            return pa.RecordBatchReader.from_batches(schema, [])

        # Build Arrow table from rows
        columns = {desc[0]: [row[i] for row in rows] for i, desc in enumerate(self._description)}
        table = pa.table(columns)
        return pa.RecordBatchReader.from_batches(table.schema, table.to_batches())

    @property
    def description(self):
        """Get column descriptions."""
        return self._description


class DatabricksAdapter(BaseDatabaseAdapter):
    """Databricks/Spark SQL database adapter.

    Example:
        >>> adapter = DatabricksAdapter(
        ...     server_hostname="your-workspace.cloud.databricks.com",
        ...     http_path="/sql/1.0/warehouses/abc123",
        ...     access_token="dapi..."
        ... )
        >>> result = adapter.execute("SELECT * FROM table")
    """

    def __init__(
        self,
        server_hostname: str,
        http_path: str,
        access_token: str | None = None,
        catalog: str | None = None,
        schema: str | None = None,
        **kwargs,
    ):
        """Initialize Databricks adapter.

        Args:
            server_hostname: Databricks workspace hostname
            http_path: SQL warehouse HTTP path
            access_token: Personal access token or service principal token
            catalog: Unity Catalog name (optional)
            schema: Schema/database name (optional)
            **kwargs: Additional arguments passed to databricks.sql.connect
        """
        try:
            from databricks import sql
        except ImportError as e:
            raise ImportError(
                "Databricks support requires databricks-sql-connector. "
                "Install with: pip install sidemantic[databricks] or pip install databricks-sql-connector"
            ) from e

        # Build connection params
        conn_params = {
            "server_hostname": server_hostname,
            "http_path": http_path,
        }

        if access_token:
            conn_params["access_token"] = access_token

        if catalog:
            conn_params["catalog"] = catalog

        if schema:
            conn_params["schema"] = schema

        # Merge with additional kwargs
        conn_params.update(kwargs)

        self.conn = sql.connect(**conn_params)
        self.catalog = catalog
        self.schema = schema

    def execute(self, sql: str) -> DatabricksResult:
        """Execute SQL query."""
        cursor = self.conn.cursor()
        cursor.execute(sql)
        return DatabricksResult(cursor)

    def executemany(self, sql: str, params: list) -> DatabricksResult:
        """Execute SQL with multiple parameter sets."""
        cursor = self.conn.cursor()
        cursor.executemany(sql, params)
        return DatabricksResult(cursor)

    def fetchone(self, result: DatabricksResult) -> tuple | None:
        """Fetch one row from result."""
        return result.fetchone()

    def fetch_record_batch(self, result: DatabricksResult) -> Any:
        """Fetch result as PyArrow RecordBatchReader."""
        return result.fetch_record_batch()

    def get_tables(self) -> list[dict]:
        """List all tables in the catalog/schema."""
        if self.schema:
            sql = f"SHOW TABLES IN {self.schema}"
        elif self.catalog:
            sql = f"SHOW TABLES IN {self.catalog}"
        else:
            sql = "SHOW TABLES"

        result = self.execute(sql)
        rows = result.fetchall()
        return [{"table_name": row[1], "schema": row[0]} for row in rows]

    def get_columns(self, table_name: str, schema: str | None = None) -> list[dict]:
        """Get column information for a table."""
        schema = schema or self.schema
        table_ref = f"{schema}.{table_name}" if schema else table_name

        sql = f"DESCRIBE {table_ref}"
        result = self.execute(sql)
        rows = result.fetchall()
        return [{"column_name": row[0], "data_type": row[1]} for row in rows]

    def close(self) -> None:
        """Close the Databricks connection."""
        self.conn.close()

    @property
    def dialect(self) -> str:
        """Return SQL dialect."""
        return "databricks"

    @property
    def raw_connection(self) -> Any:
        """Return raw Databricks connection."""
        return self.conn

    @classmethod
    def from_url(cls, url: str) -> "DatabricksAdapter":
        """Create adapter from connection URL.

        URL format: databricks://token@server-hostname/http-path?catalog=x&schema=y
        Example: databricks://dapi123@my-workspace.cloud.databricks.com/sql/1.0/warehouses/abc?catalog=main&schema=default

        Args:
            url: Connection URL

        Returns:
            DatabricksAdapter instance
        """
        if not url.startswith("databricks://"):
            raise ValueError(f"Invalid Databricks URL: {url}")

        parsed = urlparse(url)

        # Parse hostname
        server_hostname = parsed.hostname
        if not server_hostname:
            raise ValueError("Databricks URL must include server hostname")

        # Parse path as http_path (everything after hostname)
        http_path = parsed.path or ""

        # Parse token from username (password is ignored)
        access_token = unquote(parsed.username) if parsed.username else None

        # Parse query parameters for catalog and schema
        params = {}
        if parsed.query:
            params = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(parsed.query).items()}

        catalog = params.pop("catalog", None)
        schema = params.pop("schema", None)

        return cls(
            server_hostname=server_hostname,
            http_path=http_path,
            access_token=access_token,
            catalog=catalog,
            schema=schema,
            **params,
        )
```
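End to end, `from_url` plus the result wrapper look like this in use. All identifying values (token, workspace, warehouse id) are placeholders:

```python
# Placeholder connection details; the URL shape is documented in from_url above.
adapter = DatabricksAdapter.from_url(
    "databricks://dapiXXXX@my-workspace.cloud.databricks.com"
    "/sql/1.0/warehouses/abc123?catalog=main&schema=default"
)
try:
    result = adapter.execute("SELECT 1 AS ok")
    print(result.fetchone())         # (1,)
    print(result.description[0][0])  # "ok"
finally:
    adapter.close()
```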
