Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Slim Python 3.10 base keeps the image small; build tools are added below only
# for compiling wheels at install time.
FROM python:3.10-slim

WORKDIR /app

# build-essential is required to compile any C extensions pulled in by pip;
# removing the apt lists afterwards keeps this layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
&& rm -rf /var/lib/apt/lists/*

# NOTE(review): copying the whole context before installing dependencies means
# any source change invalidates the pip layer cache — confirm this is acceptable
# for this image's build frequency.
COPY . .

# Install the project plus Jupyter in one layer.
# NOTE(review): setuptools is pinned twice — presumably the second pin restores
# the <80 constraint in case `pip install .` upgraded setuptools; confirm this
# double install is intentional before simplifying.
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \
pip install --no-cache-dir . && \
pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \
pip install --no-cache-dir jupyter

# Jupyter's default notebook port.
EXPOSE 8888

# --allow-root is needed because the container runs as root by default.
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]


28 changes: 5 additions & 23 deletions src/ydata_profiling/model/correlations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,43 +3,25 @@
"""Correlations between variables."""

import warnings
from typing import Dict, List, Optional, Sized, no_type_check
from typing import Dict, List, Optional, Sized

import numpy as np
import pandas as pd

from ydata_profiling.config import Settings
from ydata_profiling.utils.backend import BaseBackend

try:
from pandas.core.base import DataError
except ImportError:
from pandas.errors import DataError


class CorrelationBackend:
class CorrelationBackend(BaseBackend):
"""Helper class to select and cache the appropriate correlation backend (Pandas or Spark)."""

@no_type_check
def __init__(self, df: Sized):
"""Determine backend once and store it for all correlation computations."""
if isinstance(df, pd.DataFrame):
from ydata_profiling.model.pandas import (
correlations_pandas as correlation_backend, # type: ignore
)
else:
from ydata_profiling.model.spark import (
correlations_spark as correlation_backend, # type: ignore
)

self.backend = correlation_backend

def get_method(self, method_name: str): # noqa: ANN201
"""Retrieve the appropriate correlation method class from the backend."""
if hasattr(self.backend, method_name):
return getattr(self.backend, method_name)
raise AttributeError(
f"Correlation method '{method_name}' is not available in the backend."
)
_pandas_module = "ydata_profiling.model.pandas.correlations_pandas"
_spark_module = "ydata_profiling.model.spark.correlations_spark"


class Correlation:
Expand Down
25 changes: 5 additions & 20 deletions src/ydata_profiling/model/missing.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,17 @@
import importlib
import warnings
from typing import Any, Callable, Dict, Optional, Sized
from typing import Any, Dict, Optional, Sized

import pandas as pd

from ydata_profiling.config import Settings
from ydata_profiling.utils.backend import BaseBackend


class MissingDataBackend:
class MissingDataBackend(BaseBackend):
"""Helper class to select and cache the appropriate missing-data backend (Pandas or Spark)."""

def __init__(self, df: Sized):
"""Determine backend once and store it for all missing-data computations."""
if isinstance(df, pd.DataFrame):
self.backend_module = "ydata_profiling.model.pandas.missing_pandas"
else:
self.backend_module = "ydata_profiling.model.spark.missing_spark"

self.module = importlib.import_module(self.backend_module)

def get_method(self, method_name: str) -> Callable:
"""Retrieve the appropriate missing-data function from the backend module."""
try:
return getattr(self.module, method_name)
except AttributeError as ex:
raise AttributeError(
f"Missing-data function '{method_name}' is not available in {self.backend_module}."
) from ex
_pandas_module = "ydata_profiling.model.pandas.missing_pandas"
_spark_module = "ydata_profiling.model.spark.missing_spark"


class MissingData:
Expand Down
28 changes: 4 additions & 24 deletions src/ydata_profiling/model/pandas/table_pandas.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from collections import Counter

import pandas as pd

from ydata_profiling.config import Settings
from ydata_profiling.model.table import get_table_stats
from ydata_profiling.model.table import compute_common_table_stats, get_table_stats


@get_table_stats.register
Expand All @@ -21,36 +19,18 @@ def pandas_get_table_stats(
A dictionary that contains the table statistics.
"""
n = len(df) if not df.empty else 0
n_var = len(df.columns)

memory_size = df.memory_usage(deep=config.memory_deep).sum()
record_size = float(memory_size) / n if n > 0 else 0

table_stats = {
"n": n,
"n_var": len(df.columns),
"n_var": n_var,
"memory_size": memory_size,
"record_size": record_size,
"n_cells_missing": 0,
"n_vars_with_missing": 0,
"n_vars_all_missing": 0,
}

for series_summary in variable_stats.values():
if "n_missing" in series_summary and series_summary["n_missing"] > 0:
table_stats["n_vars_with_missing"] += 1
table_stats["n_cells_missing"] += series_summary["n_missing"]
if series_summary["n_missing"] == n:
table_stats["n_vars_all_missing"] += 1

table_stats["p_cells_missing"] = (
table_stats["n_cells_missing"] / (table_stats["n"] * table_stats["n_var"])
if table_stats["n"] > 0 and table_stats["n_var"] > 0
else 0
)

# Variable type counts
table_stats.update(
{"types": dict(Counter([v["type"] for v in variable_stats.values()]))}
)
table_stats.update(compute_common_table_stats(n, n_var, variable_stats))

return table_stats
39 changes: 4 additions & 35 deletions src/ydata_profiling/model/spark/table_spark.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from collections import Counter

from pyspark.sql import DataFrame

from ydata_profiling.config import Settings
from ydata_profiling.model.table import get_table_stats
from ydata_profiling.model.table import compute_common_table_stats, get_table_stats


@get_table_stats.register
Expand All @@ -21,38 +19,9 @@ def get_table_stats_spark(
A dictionary that contains the table statistics.
"""
n = df.count()
n_var = len(df.columns)

result = {"n": n, "n_var": len(df.columns)}

table_stats = {
"n_cells_missing": 0,
"n_vars_with_missing": 0,
"n_vars_all_missing": 0,
}

for series_summary in variable_stats.values():
if "n_missing" in series_summary and series_summary["n_missing"] > 0:
table_stats["n_vars_with_missing"] += 1
table_stats["n_cells_missing"] += series_summary["n_missing"]
if series_summary["n_missing"] == n:
table_stats["n_vars_all_missing"] += 1

# without this check we'll get a div by zero error
if result["n"] * result["n_var"] > 0:
table_stats["p_cells_missing"] = (
table_stats["n_cells_missing"] / (result["n"] * result["n_var"])
if result["n"] > 0
else 0
)
else:
table_stats["p_cells_missing"] = 0

result["p_cells_missing"] = table_stats["p_cells_missing"]
result["n_cells_missing"] = table_stats["n_cells_missing"]
result["n_vars_all_missing"] = table_stats["n_vars_all_missing"]
result["n_vars_with_missing"] = table_stats["n_vars_with_missing"]

# Variable type counts
result["types"] = dict(Counter([v["type"] for v in variable_stats.values()]))
result = {"n": n, "n_var": n_var}
result.update(compute_common_table_stats(n, n_var, variable_stats))

return result
2 changes: 1 addition & 1 deletion src/ydata_profiling/model/spark/timeseries_index_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from ydata_profiling.config import Settings


def spark_get_time_index_description_spark(
def get_time_index_description_spark(
config: Settings,
df: DataFrame,
table_stats: dict,
Expand Down
3 changes: 1 addition & 2 deletions src/ydata_profiling/model/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,8 @@ def summarize(
return self.handle(str(dtype), config, series, {"type": str(dtype)})


# Revisit this with the correct support for Spark as well.
class ProfilingSummarizer(BaseSummarizer):
"""A summarizer for Pandas DataFrames."""
"""A summarizer supporting both Pandas and Spark DataFrames."""

def __init__(self, typeset: VisionsTypeset, use_spark: bool = False):
self.use_spark = use_spark and is_pyspark_installed()
Expand Down
37 changes: 37 additions & 0 deletions src/ydata_profiling/model/table.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,47 @@
from collections import Counter
from typing import Any

from multimethod import multimethod

from ydata_profiling.config import Settings


def compute_common_table_stats(
    n: int, n_var: int, variable_stats: dict
) -> dict:
    """Compute common table statistics shared by Pandas and Spark backends.

    Args:
        n: Number of rows in the DataFrame
        n_var: Number of columns (variables)
        variable_stats: Previously calculated statistics on the DataFrame series

    Returns:
        A dictionary with common table statistics: missing-value counts,
        percentages, and per-type variable counts
    """
    n_cells_missing = 0
    n_vars_with_missing = 0
    n_vars_all_missing = 0

    # Tally missing-value statistics; series without an "n_missing" entry
    # (or with zero missing) contribute nothing.
    for summary in variable_stats.values():
        missing = summary.get("n_missing", 0)
        if missing > 0:
            n_vars_with_missing += 1
            n_cells_missing += missing
            # A variable is "all missing" when every one of its n rows is missing.
            if missing == n:
                n_vars_all_missing += 1

    # Guard against division by zero for empty frames.
    total_cells = n * n_var
    p_cells_missing = n_cells_missing / total_cells if total_cells > 0 else 0

    return {
        "n_cells_missing": n_cells_missing,
        "n_vars_with_missing": n_vars_with_missing,
        "n_vars_all_missing": n_vars_all_missing,
        "p_cells_missing": p_cells_missing,
        "types": dict(Counter(v["type"] for v in variable_stats.values())),
    }


# Generic dispatch point: backend modules register concrete implementations
# (pandas/spark) via @get_table_stats.register; calling the bare multimethod
# with an unregistered df type raises NotImplementedError.
@multimethod
def get_table_stats(config: Settings, df: Any, variable_stats: dict) -> dict:
    raise NotImplementedError()
34 changes: 33 additions & 1 deletion src/ydata_profiling/utils/backend.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,41 @@
"""
File with a function to check the backend being used
File with backend utilities and helper functions to check the backend being used
"""
import importlib
from typing import Callable, Optional, Sized, Union

import pandas as pd


def is_pyspark_installed() -> bool:
    """Return True when PySpark is importable, without actually importing it.

    Uses ``find_spec`` so the (heavy) pyspark package is never loaded just
    to answer the question.

    Returns:
        True if a "pyspark" distribution is discoverable on the import path.
    """
    # `importlib.util` is a submodule that a bare `import importlib` does not
    # reliably bind; import it explicitly to avoid a latent AttributeError.
    import importlib.util

    return importlib.util.find_spec("pyspark") is not None


class BaseBackend:
    """Base helper class to select and cache the appropriate backend (Pandas or Spark)."""

    # Subclasses configure the dotted module path for each backend.
    _pandas_module: Optional[str] = None
    _spark_module: Optional[str] = None

    def __init__(self, df: Union[pd.DataFrame, Sized]):
        """Pick and import the backend module once, based on the frame's type."""
        use_pandas = isinstance(df, pd.DataFrame)
        module_path = self._pandas_module if use_pandas else self._spark_module

        if module_path is None:
            raise ValueError("Backend module path not configured")

        self.module_path = module_path
        # Cache the imported module so every get_method call is a cheap getattr.
        self.module = importlib.import_module(module_path)

    def get_method(self, method_name: str) -> Callable:
        """Retrieve the appropriate function from the backend module."""
        try:
            return getattr(self.module, method_name)
        except AttributeError as ex:
            raise AttributeError(
                f"Function '{method_name}' is not available in {self.module_path}."
            ) from ex