Skip to content

Commit 1614973

Browse files
committed
Add Snowflake adapter with fakesnow integration tests
- Add SnowflakeAdapter with connection URL support (snowflake://)
- Add Snowflake to symmetric aggregation, using modulo plus a small multiplier to avoid overflow
- Add 11 integration tests using the fakesnow library (10 passing, 1 skipped)
- Update SemanticLayer to recognize snowflake:// URLs
- Add fakesnow to dev dependencies for testing
- Add a Snowflake CI job using fakesnow (no Docker image needed)
- Tests at parity with BigQuery/Postgres: basic metrics, dimensions, joins, filters, ORDER BY, LIMIT, symmetric aggregates, 3-way joins
1 parent 2def7ea commit 1614973

9 files changed

Lines changed: 1030 additions & 28 deletions

File tree

.github/workflows/integration.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,25 @@ jobs:
7979
BIGQUERY_PROJECT: "test-project"
8080
BIGQUERY_DATASET: "test_dataset"
8181
run: uv run pytest -m integration tests/db/test_bigquery_integration.py -v
82+
83+
snowflake-integration:
84+
runs-on: ubuntu-latest
85+
86+
steps:
87+
- uses: actions/checkout@v4
88+
89+
- name: Install uv
90+
uses: astral-sh/setup-uv@v5
91+
with:
92+
enable-cache: true
93+
94+
- name: Set up Python
95+
run: uv python install 3.12
96+
97+
- name: Install dependencies
98+
run: uv sync --extra snowflake --extra dev
99+
100+
- name: Run Snowflake integration tests
101+
env:
102+
SNOWFLAKE_TEST: "1"
103+
run: uv run pytest -m integration tests/db/test_snowflake_integration.py -v

docker-compose.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ services:
2222
- "9050:9050"
2323
command: ["--project=test-project", "--dataset=test_dataset"]
2424

25+
# Snowflake tests use fakesnow (Python library) for mocking, no Docker service needed
26+
2527
test:
2628
build:
2729
context: .
@@ -38,6 +40,7 @@ services:
3840
BIGQUERY_EMULATOR_HOST: "bigquery:9050"
3941
BIGQUERY_PROJECT: "test-project"
4042
BIGQUERY_DATASET: "test_dataset"
43+
SNOWFLAKE_TEST: "1" # fakesnow patches snowflake.connector, no external service needed
4144
command: pytest -m integration -v
4245

4346
volumes:

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ dev = [
3030
"ruff>=0.6.0",
3131
"pandas>=2.2,<3",
3232
"numpy>=1.26,<3",
33+
"fakesnow>=0.9.0", # For Snowflake integration tests
3334
]
3435
serve = [
3536
"riffq>=0.1.0",
@@ -43,6 +44,10 @@ bigquery = [
4344
"google-cloud-bigquery>=3.0.0",
4445
"pyarrow>=14.0.0", # For Arrow support
4546
]
47+
snowflake = [
48+
"snowflake-connector-python>=3.0.0",
49+
"pyarrow>=14.0.0", # For Arrow support
50+
]
4651

4752
[build-system]
4853
requires = ["hatchling"]

sidemantic/core/semantic_layer.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def __init__(
3232
- duckdb:///path/to/db.duckdb
3333
- postgres://user:pass@host:port/dbname
3434
- bigquery://project_id/dataset_id
35+
- snowflake://user:password@account/database/schema
3536
dialect: SQL dialect for query generation (optional, inferred from adapter)
3637
auto_register: Set as current layer for auto-registration (default: True)
3738
use_preaggregations: Enable automatic pre-aggregation routing (default: False)
@@ -64,10 +65,15 @@ def __init__(
6465

6566
self.adapter = BigQueryAdapter.from_url(connection)
6667
self.dialect = dialect or "bigquery"
68+
elif connection.startswith("snowflake://"):
69+
from sidemantic.db.snowflake import SnowflakeAdapter
70+
71+
self.adapter = SnowflakeAdapter.from_url(connection)
72+
self.dialect = dialect or "snowflake"
6773
else:
6874
raise ValueError(
6975
f"Unsupported connection URL: {connection}. "
70-
"Supported: duckdb:///, postgres://, bigquery://, or BaseDatabaseAdapter instance"
76+
"Supported: duckdb:///, postgres://, bigquery://, snowflake://, or BaseDatabaseAdapter instance"
7177
)
7278
else:
7379
raise TypeError(f"connection must be a string URL or BaseDatabaseAdapter instance, got {type(connection)}")

sidemantic/core/symmetric_aggregate.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def build_symmetric_aggregate_sql(
2626
primary_key: The primary key field to use for deduplication
2727
agg_type: Type of aggregation (sum, avg, count, count_distinct)
2828
model_alias: Optional table/CTE alias to prefix columns
29-
dialect: SQL dialect (duckdb, bigquery, postgres)
29+
dialect: SQL dialect (duckdb, bigquery, postgres, snowflake)
3030
3131
Returns:
3232
SQL expression using symmetric aggregates
@@ -56,6 +56,13 @@ def hash_func(col):
5656
return f"hashtext({col}::text)::bigint"
5757

5858
multiplier = "1024" # 2^10 as literal (smaller to avoid overflow)
59+
elif dialect == "snowflake":
60+
# Snowflake HASH returns very large 64-bit integers
61+
# Use modulo to constrain range, then very small multiplier to avoid overflow
62+
def hash_func(col):
63+
return f"(HASH({col}) % 1000000000)" # Modulo to constrain range
64+
65+
multiplier = "100" # Very small multiplier to avoid overflow
5966
else: # duckdb
6067

6168
def hash_func(col):

sidemantic/db/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,12 @@ def __getattr__(name):
1515
from sidemantic.db.postgres import PostgreSQLAdapter
1616

1717
return PostgreSQLAdapter
18+
if name == "BigQueryAdapter":
19+
from sidemantic.db.bigquery import BigQueryAdapter
20+
21+
return BigQueryAdapter
22+
if name == "SnowflakeAdapter":
23+
from sidemantic.db.snowflake import SnowflakeAdapter
24+
25+
return SnowflakeAdapter
1826
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

sidemantic/db/snowflake.py

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
"""Snowflake database adapter."""
2+
3+
from typing import Any
4+
from urllib.parse import parse_qs, unquote, urlparse
5+
6+
from sidemantic.db.base import BaseDatabaseAdapter
7+
8+
9+
class SnowflakeResult:
10+
"""Wrapper for Snowflake cursor to match DuckDB result API."""
11+
12+
def __init__(self, cursor):
13+
"""Initialize Snowflake result wrapper.
14+
15+
Args:
16+
cursor: Snowflake cursor object
17+
"""
18+
self.cursor = cursor
19+
self._description = cursor.description
20+
21+
def fetchone(self) -> tuple | None:
22+
"""Fetch one row from the result."""
23+
return self.cursor.fetchone()
24+
25+
def fetchall(self) -> list[tuple]:
26+
"""Fetch all remaining rows."""
27+
return self.cursor.fetchall()
28+
29+
def fetch_record_batch(self) -> Any:
30+
"""Convert result to PyArrow RecordBatchReader."""
31+
import pyarrow as pa
32+
33+
# Fetch all rows and convert to Arrow
34+
rows = self.cursor.fetchall()
35+
if not rows:
36+
# Empty result
37+
schema = pa.schema([(desc[0], pa.string()) for desc in self._description])
38+
return pa.RecordBatchReader.from_batches(schema, [])
39+
40+
# Build Arrow table from rows
41+
columns = {desc[0]: [row[i] for row in rows] for i, desc in enumerate(self._description)}
42+
table = pa.table(columns)
43+
return pa.RecordBatchReader.from_batches(table.schema, table.to_batches())
44+
45+
@property
46+
def description(self):
47+
"""Get column descriptions."""
48+
return self._description
49+
50+
51+
class SnowflakeAdapter(BaseDatabaseAdapter):
    """Snowflake database adapter.

    Example:
        >>> adapter = SnowflakeAdapter(
        ...     account="myaccount",
        ...     user="myuser",
        ...     password="mypass",
        ...     database="mydb",
        ...     schema="myschema"
        ... )
        >>> result = adapter.execute("SELECT * FROM table")
    """

    def __init__(
        self,
        account: str | None = None,
        user: str | None = None,
        password: str | None = None,
        database: str | None = None,
        schema: str | None = None,
        warehouse: str | None = None,
        role: str | None = None,
        **kwargs,
    ):
        """Initialize Snowflake adapter.

        Args:
            account: Snowflake account identifier
            user: Username
            password: Password
            database: Database name
            schema: Schema name
            warehouse: Warehouse name
            role: Role name
            **kwargs: Additional arguments passed to snowflake.connector.connect
        """
        try:
            import snowflake.connector
        except ImportError as e:
            raise ImportError(
                "Snowflake support requires snowflake-connector-python. "
                "Install with: pip install sidemantic[snowflake] or pip install snowflake-connector-python"
            ) from e

        # Only pass parameters the caller actually set, so the connector can
        # fall back to its own defaults / environment configuration.
        conn_params = {
            key: value
            for key, value in {
                "account": account,
                "user": user,
                "password": password,
                "database": database,
                "schema": schema,
                "warehouse": warehouse,
                "role": role,
            }.items()
            if value
        }

        # Merge with additional kwargs (kwargs win on conflicts).
        conn_params.update(kwargs)

        self.conn = snowflake.connector.connect(**conn_params)
        self.database = database
        self.schema = schema

    @staticmethod
    def _quote_literal(value: str) -> str:
        """Escape a value for safe embedding in a single-quoted SQL literal."""
        return value.replace("'", "''")

    def execute(self, sql: str) -> SnowflakeResult:
        """Execute SQL query."""
        cursor = self.conn.cursor()
        cursor.execute(sql)
        return SnowflakeResult(cursor)

    def executemany(self, sql: str, params: list) -> SnowflakeResult:
        """Execute SQL with multiple parameter sets."""
        cursor = self.conn.cursor()
        cursor.executemany(sql, params)
        return SnowflakeResult(cursor)

    def fetchone(self, result: SnowflakeResult) -> tuple | None:
        """Fetch one row from result."""
        return result.fetchone()

    def fetch_record_batch(self, result: SnowflakeResult) -> Any:
        """Fetch result as PyArrow RecordBatchReader."""
        return result.fetch_record_batch()

    def get_tables(self) -> list[dict]:
        """List all tables in the database/schema.

        Only an explicit schema narrows the query; with or without a
        database set, the query otherwise lists all base tables visible in
        information_schema.
        """
        # Escape the schema name so an embedded quote cannot break the SQL.
        schema_filter = (
            f"AND table_schema = '{self._quote_literal(self.schema)}'" if self.schema else ""
        )
        sql = f"""
            SELECT table_name, table_schema as schema
            FROM information_schema.tables
            WHERE table_type = 'BASE TABLE'
            {schema_filter}
        """

        result = self.execute(sql)
        rows = result.fetchall()
        return [{"table_name": row[0], "schema": row[1]} for row in rows]

    def get_columns(self, table_name: str, schema: str | None = None) -> list[dict]:
        """Get column information for a table.

        Args:
            table_name: Table to describe
            schema: Schema to search; defaults to the adapter's schema
        """
        schema = schema or self.schema
        # Escape identifiers embedded as string literals to keep the SQL valid
        # (and safe) even if names contain quotes.
        schema_filter = f"AND table_schema = '{self._quote_literal(schema)}'" if schema else ""

        sql = f"""
            SELECT column_name, data_type
            FROM information_schema.columns
            WHERE table_name = '{self._quote_literal(table_name)}' {schema_filter}
        """
        result = self.execute(sql)
        rows = result.fetchall()
        return [{"column_name": row[0], "data_type": row[1]} for row in rows]

    def close(self) -> None:
        """Close the Snowflake connection."""
        self.conn.close()

    @property
    def dialect(self) -> str:
        """Return SQL dialect."""
        return "snowflake"

    @property
    def raw_connection(self) -> Any:
        """Return raw Snowflake connection."""
        return self.conn

    @classmethod
    def from_url(cls, url: str) -> "SnowflakeAdapter":
        """Create adapter from connection URL.

        URL format: snowflake://user:password@account/database/schema?warehouse=wh&role=myrole
        Minimal: snowflake://user:password@account

        Args:
            url: Connection URL

        Returns:
            SnowflakeAdapter instance
        """
        if not url.startswith("snowflake://"):
            raise ValueError(f"Invalid Snowflake URL: {url}")

        parsed = urlparse(url)

        # Parse path: /database/schema
        path_parts = [p for p in parsed.path.split("/") if p]
        database = path_parts[0] if len(path_parts) > 0 else None
        schema = path_parts[1] if len(path_parts) > 1 else None

        # Parse query parameters; single-valued params are unwrapped from
        # parse_qs's list form.
        params = {}
        if parsed.query:
            params = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(parsed.query).items()}

        return cls(
            account=parsed.hostname,
            user=unquote(parsed.username) if parsed.username else None,
            password=unquote(parsed.password) if parsed.password else None,
            database=database,
            schema=schema,
            warehouse=params.pop("warehouse", None),
            role=params.pop("role", None),
            **params,
        )

0 commit comments

Comments
 (0)