Skip to content

Commit d83e1a1

Browse files
Pkcha
authored and committed
feat: initial release
1 parent 3f15815 commit d83e1a1

11 files changed

Lines changed: 115 additions & 115 deletions

File tree

src/ydata_profiling/model/alerts.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
from ydata_profiling.utils.styles import get_alert_styles
1313

1414

15-
def _fmt_percent(value: float, edge_cases: bool = True) -> str:
16-
"""Format a ratio as a percentage (internal copy to avoid circular imports).
15+
def fmt_percent(value: float, edge_cases: bool = True) -> str:
16+
"""Format a ratio as a percentage.
1717
1818
Args:
1919
edge_cases: Check for edge cases?
@@ -209,7 +209,7 @@ def __init__(
209209

210210
def _get_description(self) -> str:
211211
if self.values is not None:
212-
return f"Dataset has {self.values['n_duplicates']} ({_fmt_percent(self.values['p_duplicates'])}) duplicate rows"
212+
return f"Dataset has {self.values['n_duplicates']} ({fmt_percent(self.values['p_duplicates'])}) duplicate rows"
213213
else:
214214
return "Dataset has no duplicated rows"
215215

@@ -231,7 +231,7 @@ def __init__(
231231

232232
def _get_description(self) -> str:
233233
if self.values is not None:
234-
return f"Dataset has {self.values['n_near_dups']} ({_fmt_percent(self.values['p_near_dups'])}) near duplicate rows"
234+
return f"Dataset has {self.values['n_near_dups']} ({fmt_percent(self.values['p_near_dups'])}) near duplicate rows"
235235
else:
236236
return "Dataset has no near duplicated rows"
237237

@@ -272,7 +272,7 @@ def __init__(
272272

273273
def _get_description(self) -> str:
274274
if self.values is not None:
275-
return f"[{self.column_name}] has {self.values['n_distinct']:} ({_fmt_percent(self.values['p_distinct'])}) distinct values"
275+
return f"[{self.column_name}] has {self.values['n_distinct']:} ({fmt_percent(self.values['p_distinct'])}) distinct values"
276276
else:
277277
return f"[{self.column_name}] has a high cardinality"
278278

@@ -294,7 +294,7 @@ def __init__(
294294

295295
def _get_description(self) -> str:
296296
if self.values is not None:
297-
return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {_fmt_percent(self.values['p_fuzzy_vals'])} per category"
297+
return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {fmt_percent(self.values['p_fuzzy_vals'])} per category"
298298
else:
299299
return f"[{self.column_name}] no dirty categories values."
300300

@@ -365,7 +365,7 @@ def __init__(
365365

366366
def _get_description(self) -> str:
367367
if self.values is not None:
368-
return f"[{self.column_name}] has {self.values['n_infinite']} ({_fmt_percent(self.values['p_infinite'])}) infinite values"
368+
return f"[{self.column_name}] has {self.values['n_infinite']} ({fmt_percent(self.values['p_infinite'])}) infinite values"
369369
else:
370370
return f"[{self.column_name}] has infinite values"
371371

@@ -387,7 +387,7 @@ def __init__(
387387

388388
def _get_description(self) -> str:
389389
if self.values is not None:
390-
return f"[{self.column_name}] {self.values['n_missing']} ({_fmt_percent(self.values['p_missing'])}) missing values"
390+
return f"[{self.column_name}] {self.values['n_missing']} ({fmt_percent(self.values['p_missing'])}) missing values"
391391
else:
392392
return f"[{self.column_name}] has missing values"
393393

@@ -541,7 +541,7 @@ def __init__(
541541

542542
def _get_description(self) -> str:
543543
if self.values is not None:
544-
return f"[{self.column_name}] has {self.values['n_zeros']} ({_fmt_percent(self.values['p_zeros'])}) zeros"
544+
return f"[{self.column_name}] has {self.values['n_zeros']} ({fmt_percent(self.values['p_zeros'])}) zeros"
545545
else:
546546
return f"[{self.column_name}] has predominantly zeros"
547547

src/ydata_profiling/model/correlations.py

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3,43 +3,25 @@
33
"""Correlations between variables."""
44

55
import warnings
6-
from typing import Dict, List, Optional, Sized, no_type_check
6+
from typing import Dict, List, Optional, Sized
77

88
import numpy as np
99
import pandas as pd
1010

1111
from ydata_profiling.config import Settings
12+
from ydata_profiling.utils.backend import BaseBackend
1213

1314
try:
1415
from pandas.core.base import DataError
1516
except ImportError:
1617
from pandas.errors import DataError
1718

1819

19-
class CorrelationBackend:
20+
class CorrelationBackend(BaseBackend):
2021
"""Helper class to select and cache the appropriate correlation backend (Pandas or Spark)."""
2122

22-
@no_type_check
23-
def __init__(self, df: Sized):
24-
"""Determine backend once and store it for all correlation computations."""
25-
if isinstance(df, pd.DataFrame):
26-
from ydata_profiling.model.pandas import (
27-
correlations_pandas as correlation_backend, # type: ignore
28-
)
29-
else:
30-
from ydata_profiling.model.spark import (
31-
correlations_spark as correlation_backend, # type: ignore
32-
)
33-
34-
self.backend = correlation_backend
35-
36-
def get_method(self, method_name: str): # noqa: ANN201
37-
"""Retrieve the appropriate correlation method class from the backend."""
38-
if hasattr(self.backend, method_name):
39-
return getattr(self.backend, method_name)
40-
raise AttributeError(
41-
f"Correlation method '{method_name}' is not available in the backend."
42-
)
23+
_pandas_module = "ydata_profiling.model.pandas.correlations_pandas"
24+
_spark_module = "ydata_profiling.model.spark.correlations_spark"
4325

4426

4527
class Correlation:

src/ydata_profiling/model/missing.py

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,17 @@
1-
import importlib
21
import warnings
3-
from typing import Any, Callable, Dict, Optional, Sized
2+
from typing import Any, Dict, Optional, Sized
43

54
import pandas as pd
65

76
from ydata_profiling.config import Settings
7+
from ydata_profiling.utils.backend import BaseBackend
88

99

10-
class MissingDataBackend:
10+
class MissingDataBackend(BaseBackend):
1111
"""Helper class to select and cache the appropriate missing-data backend (Pandas or Spark)."""
1212

13-
def __init__(self, df: Sized):
14-
"""Determine backend once and store it for all missing-data computations."""
15-
if isinstance(df, pd.DataFrame):
16-
self.backend_module = "ydata_profiling.model.pandas.missing_pandas"
17-
else:
18-
self.backend_module = "ydata_profiling.model.spark.missing_spark"
19-
20-
self.module = importlib.import_module(self.backend_module)
21-
22-
def get_method(self, method_name: str) -> Callable:
23-
"""Retrieve the appropriate missing-data function from the backend module."""
24-
try:
25-
return getattr(self.module, method_name)
26-
except AttributeError as ex:
27-
raise AttributeError(
28-
f"Missing-data function '{method_name}' is not available in {self.backend_module}."
29-
) from ex
13+
_pandas_module = "ydata_profiling.model.pandas.missing_pandas"
14+
_spark_module = "ydata_profiling.model.spark.missing_spark"
3015

3116

3217
class MissingData:
Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
1-
from collections import Counter
2-
31
import pandas as pd
42

53
from ydata_profiling.config import Settings
6-
from ydata_profiling.model.table import get_table_stats
4+
from ydata_profiling.model.table import compute_common_table_stats, get_table_stats
75

86

97
@get_table_stats.register
@@ -21,36 +19,18 @@ def pandas_get_table_stats(
2119
A dictionary that contains the table statistics.
2220
"""
2321
n = len(df) if not df.empty else 0
22+
n_var = len(df.columns)
2423

2524
memory_size = df.memory_usage(deep=config.memory_deep).sum()
2625
record_size = float(memory_size) / n if n > 0 else 0
2726

2827
table_stats = {
2928
"n": n,
30-
"n_var": len(df.columns),
29+
"n_var": n_var,
3130
"memory_size": memory_size,
3231
"record_size": record_size,
33-
"n_cells_missing": 0,
34-
"n_vars_with_missing": 0,
35-
"n_vars_all_missing": 0,
3632
}
3733

38-
for series_summary in variable_stats.values():
39-
if "n_missing" in series_summary and series_summary["n_missing"] > 0:
40-
table_stats["n_vars_with_missing"] += 1
41-
table_stats["n_cells_missing"] += series_summary["n_missing"]
42-
if series_summary["n_missing"] == n:
43-
table_stats["n_vars_all_missing"] += 1
44-
45-
table_stats["p_cells_missing"] = (
46-
table_stats["n_cells_missing"] / (table_stats["n"] * table_stats["n_var"])
47-
if table_stats["n"] > 0 and table_stats["n_var"] > 0
48-
else 0
49-
)
50-
51-
# Variable type counts
52-
table_stats.update(
53-
{"types": dict(Counter([v["type"] for v in variable_stats.values()]))}
54-
)
34+
table_stats.update(compute_common_table_stats(n, n_var, variable_stats))
5535

5636
return table_stats
Lines changed: 4 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
1-
from collections import Counter
2-
31
from pyspark.sql import DataFrame
42

53
from ydata_profiling.config import Settings
6-
from ydata_profiling.model.table import get_table_stats
4+
from ydata_profiling.model.table import compute_common_table_stats, get_table_stats
75

86

97
@get_table_stats.register
@@ -21,38 +19,9 @@ def get_table_stats_spark(
2119
A dictionary that contains the table statistics.
2220
"""
2321
n = df.count()
22+
n_var = len(df.columns)
2423

25-
result = {"n": n, "n_var": len(df.columns)}
26-
27-
table_stats = {
28-
"n_cells_missing": 0,
29-
"n_vars_with_missing": 0,
30-
"n_vars_all_missing": 0,
31-
}
32-
33-
for series_summary in variable_stats.values():
34-
if "n_missing" in series_summary and series_summary["n_missing"] > 0:
35-
table_stats["n_vars_with_missing"] += 1
36-
table_stats["n_cells_missing"] += series_summary["n_missing"]
37-
if series_summary["n_missing"] == n:
38-
table_stats["n_vars_all_missing"] += 1
39-
40-
# without this check we'll get a div by zero error
41-
if result["n"] * result["n_var"] > 0:
42-
table_stats["p_cells_missing"] = (
43-
table_stats["n_cells_missing"] / (result["n"] * result["n_var"])
44-
if result["n"] > 0
45-
else 0
46-
)
47-
else:
48-
table_stats["p_cells_missing"] = 0
49-
50-
result["p_cells_missing"] = table_stats["p_cells_missing"]
51-
result["n_cells_missing"] = table_stats["n_cells_missing"]
52-
result["n_vars_all_missing"] = table_stats["n_vars_all_missing"]
53-
result["n_vars_with_missing"] = table_stats["n_vars_with_missing"]
54-
55-
# Variable type counts
56-
result["types"] = dict(Counter([v["type"] for v in variable_stats.values()]))
24+
result = {"n": n, "n_var": n_var}
25+
result.update(compute_common_table_stats(n, n_var, variable_stats))
5726

5827
return result

src/ydata_profiling/model/spark/timeseries_index_spark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from ydata_profiling.config import Settings
55

66

7-
def spark_get_time_index_description_spark(
7+
def get_time_index_description_spark(
88
config: Settings,
99
df: DataFrame,
1010
table_stats: dict,

src/ydata_profiling/model/summarizer.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,8 @@ def summarize(
5050
return self.handle(str(dtype), config, series, {"type": str(dtype)})
5151

5252

53-
# Revisit this with the correct support for Spark as well.
5453
class ProfilingSummarizer(BaseSummarizer):
55-
"""A summarizer for Pandas DataFrames."""
54+
"""A summarizer supporting both Pandas and Spark DataFrames."""
5655

5756
def __init__(self, typeset: VisionsTypeset, use_spark: bool = False):
5857
self.use_spark = use_spark and is_pyspark_installed()

src/ydata_profiling/model/summary_algorithms.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,21 @@
1111
T = TypeVar("T")
1212

1313

14+
def func_nullable_series_contains(fn: Callable) -> Callable:
15+
@functools.wraps(fn)
16+
def inner(
17+
config: Settings, series: pd.Series, state: dict, *args, **kwargs
18+
) -> bool:
19+
if series.hasnans:
20+
series = series.dropna()
21+
if series.empty:
22+
return False
23+
24+
return fn(config, series, state, *args, **kwargs)
25+
26+
return inner
27+
28+
1429
def safe_histogram(
1530
values: np.ndarray,
1631
bins: Union[int, str, np.ndarray] = "auto",

src/ydata_profiling/model/table.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,47 @@
1+
from collections import Counter
12
from typing import Any
23

34
from multimethod import multimethod
45

56
from ydata_profiling.config import Settings
67

78

9+
def compute_common_table_stats(
10+
n: int, n_var: int, variable_stats: dict
11+
) -> dict:
12+
"""Compute common table statistics shared by Pandas and Spark backends.
13+
14+
Args:
15+
n: Number of rows in the DataFrame
16+
n_var: Number of columns (variables)
17+
variable_stats: Previously calculated statistic on the DataFrame series
18+
19+
Returns:
20+
A dictionary with common table statistics: missing values counts, percentages, and type counts
21+
"""
22+
table_stats = {
23+
"n_cells_missing": 0,
24+
"n_vars_with_missing": 0,
25+
"n_vars_all_missing": 0,
26+
}
27+
28+
for series_summary in variable_stats.values():
29+
if "n_missing" in series_summary and series_summary["n_missing"] > 0:
30+
table_stats["n_vars_with_missing"] += 1
31+
table_stats["n_cells_missing"] += series_summary["n_missing"]
32+
if series_summary["n_missing"] == n:
33+
table_stats["n_vars_all_missing"] += 1
34+
35+
total_cells = n * n_var
36+
table_stats["p_cells_missing"] = (
37+
table_stats["n_cells_missing"] / total_cells if total_cells > 0 else 0
38+
)
39+
40+
table_stats["types"] = dict(Counter([v["type"] for v in variable_stats.values()]))
41+
42+
return table_stats
43+
44+
845
@multimethod
946
def get_table_stats(config: Settings, df: Any, variable_stats: dict) -> dict:
1047
raise NotImplementedError()

src/ydata_profiling/report/structure/variables/render_common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ def render_common(config: Settings, summary: dict) -> dict:
1010
n_freq_table_max = config.n_freq_table_max
1111

1212
template_variables = {
13+
# TODO: with nan
1314
"freq_table_rows": freq_table(
1415
freqtable=summary["value_counts_without_nan"],
1516
n=summary["n"],

0 commit comments

Comments (0)