
Commit ae02233

Pkcha authored and committed
feat: initial release
1 parent 307270e · commit ae02233

7 files changed

Lines changed: 100 additions & 74 deletions


src/ydata_profiling/model/handler.py

Lines changed: 27 additions & 35 deletions
@@ -1,47 +1,44 @@
 """
 Auxiliary handler methods for data summary extraction
 """
-from typing import Any, Callable, Dict, List, Sequence, Tuple, Union
+from typing import Any, Callable, Dict, List, Sequence, Tuple

 import networkx as nx
 from visions import VisionsTypeset


-def compose(functions: Sequence[Callable]) -> Callable:
+def compose(functions: Sequence[Callable[..., Any]]) -> Callable[..., Tuple[Any, ...]]:
     """
     Compose a sequence of functions.

-    Each function in the sequence receives the result of the previous function.
-    Functions are expected to accept and return tuples for proper chaining.
-
-    :param functions: sequence of functions that accept and return tuples
-    :return: combined function applying all functions in order
+    Each function in the sequence should accept the arguments passed to the composed
+    function and return either a single value or a tuple of values.
+
+    :param functions: sequence of functions
+    :return: combined function applying all functions in order.
     """

     def composed_function(*args: Any) -> Tuple[Any, ...]:
-        result: Union[Tuple[Any, ...], Any] = args
+        result: Tuple[Any, ...] = args
         for func in functions:
-            if isinstance(result, tuple):
-                result = func(*result)
-            else:
-                result = func(result)
-        if isinstance(result, tuple):
-            return result
-        return (result,)
+            result = func(*result)
+            # Ensure result is always a tuple for consistent unpacking
+            if not isinstance(result, tuple):
+                result = (result,)
+        return result

     return composed_function


 class Handler:
     """A generic handler

-    Allows any custom mapping between data types and functions.
-    Functions are composed based on the type hierarchy defined in the typeset.
+    Allows any custom mapping between data types and functions
     """

     def __init__(
         self,
-        mapping: Dict[str, List[Callable]],
+        mapping: Dict[str, List[Callable[..., Any]]],
         typeset: VisionsTypeset,
         *args: Any,
         **kwargs: Any
@@ -54,33 +51,28 @@ def _complete_dag(self) -> None:
         for from_type, to_type in nx.topological_sort(
             nx.line_graph(self.typeset.base_graph)
         ):
-            from_key = str(from_type)
-            to_key = str(to_type)
-            self.mapping[to_key] = self.mapping.get(from_key, []) + self.mapping.get(
-                to_key, []
+            self.mapping[str(to_type)] = (
+                self.mapping[str(from_type)] + self.mapping[str(to_type)]
             )

-    def handle(self, dtype: str, *args: Any, **kwargs: Any) -> Dict[str, Any]:
+    def handle(self, dtype: str, *args: Any, **kwargs: Any) -> Any:
         """
-        Execute the handler chain for the given data type.
-
-        :param dtype: the data type to handle
-        :param args: arguments to pass to the handler functions
-        :param kwargs: keyword arguments (currently unused but reserved for extensibility)
-        :return: a dictionary containing the summary extracted from the data
+        Execute the handler chain for the given dtype.
+
+        :param dtype: The data type to handle
+        :param args: Arguments to pass to the handler chain
+        :return: The last element of the result tuple from the handler chain
         """
         funcs = self.mapping.get(dtype, [])
         op = compose(funcs)
-        result = op(*args)
-        if result:
-            return result[-1] if isinstance(result[-1], dict) else {}
-        return {}
+        summary = op(*args)[-1]
+        return summary


-def get_render_map() -> Dict[str, Callable]:
+def get_render_map() -> Dict[str, Callable[..., Any]]:
     import ydata_profiling.report.structure.variables as render_algorithms

-    render_map = {
+    render_map: Dict[str, Callable[..., Any]] = {
         "Boolean": render_algorithms.render_boolean,
         "Numeric": render_algorithms.render_real,
         "Complex": render_algorithms.render_complex,

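The reworked compose normalizes every intermediate result to a tuple, so summary functions can take positional arguments and return either a bare value or a tuple. A minimal sketch of that contract (describe and add_mean are hypothetical stand-ins for the summary functions a Handler mapping would supply; only compose comes from this commit):

from typing import Dict, List, Tuple

from ydata_profiling.model.handler import compose


def describe(values: List[int]) -> Tuple[List[int], Dict[str, float]]:
    # Returns a tuple: compose unpacks it into the next function's arguments.
    return values, {"n": float(len(values))}


def add_mean(values: List[int], summary: Dict[str, float]) -> Dict[str, float]:
    # Returns a bare dict: compose re-wraps it into a 1-tuple.
    summary["mean"] = sum(values) / len(values)
    return summary


op = compose([describe, add_mean])
print(op([1, 2, 3]))  # ({'n': 3.0, 'mean': 2.0},)
# Handler.handle then takes the last element of this tuple: op(*args)[-1]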
src/ydata_profiling/model/spark/missing_spark.py

Lines changed: 39 additions & 25 deletions
@@ -13,47 +13,47 @@

 class MissingnoBarSparkPatch:
     """
-    Technical Debt :
-    This is a monkey patching object that allows usage of the library missingno as is for spark dataframes.
-    This is because missingno library's bar function always applies a isnull().sum() on dataframes in the visualisation
-    function, instead of allowing just values counts as an entry point. Thus, in order to calculate the
-    missing values dataframe in spark, we compute it first, then wrap it in this MissingnoBarSparkPatch object which
-    will be unwrapped by missingno and return the pre-computed value counts.
-    The best fix to this currently terrible patch is to submit a PR to missingno to separate preprocessing function
-    (compute value counts from df) and visualisation functions such that we can call the visualisation directly.
-    Unfortunately, the missingno library people have not really responded to our issues on gitlab.
-    See https://github.com/ResidentMario/missingno/issues/119.
-    We could also fork the missingno library and implement some of the code in our database, but that feels
-    like bad practice as well.
+    Adapter class to enable missingno library compatibility with Spark DataFrames.
+
+    The missingno library's visualization functions internally call isnull().sum()
+    on dataframes. For Spark DataFrames, we pre-compute the null counts and wrap
+    them in this adapter to provide the expected interface.
+
+    Note: This is a workaround for missingno's lack of separation between
+    data preprocessing and visualization. See:
+    https://github.com/ResidentMario/missingno/issues/119
     """

     def __init__(
-        self, df: DataFrame, columns: List[str] = None, original_df_size: int = None
+        self,
+        df: DataFrame,
+        columns: Optional[List[str]] = None,
+        original_df_size: Optional[int] = None
     ):
         self.df = df
         self.columns = columns
         self.original_df_size = original_df_size

-    def isnull(self) -> Any:
-        """
-        This patches the .isnull().sum() function called by missingno library
-        """
-        return self  # return self to patch .sum() function
+    def isnull(self) -> "MissingnoBarSparkPatch":
+        """Returns self to enable chained .isnull().sum() calls."""
+        return self

     def sum(self) -> DataFrame:
-        """
-        This patches the .sum() function called by missingno library
-        """
-        return self.df  # return unwrapped dataframe
+        """Returns the pre-computed null counts dataframe."""
+        return self.df

     def __len__(self) -> Optional[int]:
-        """
-        This patches the len(df) function called by missingno library
-        """
+        """Returns the original dataframe size."""
         return self.original_df_size


 def missing_bar(config: Settings, df: DataFrame) -> str:
+    """Generate a missing values bar chart for Spark DataFrame.
+
+    :param config: Report settings
+    :param df: Spark DataFrame
+    :return: HTML string of the bar chart
+    """
     import pyspark.sql.functions as F

     data_nan_counts = (
@@ -70,6 +70,12 @@ def missing_bar(config: Settings, df: DataFrame) -> str:


 def missing_matrix(config: Settings, df: DataFrame) -> str:
+    """Generate a missing values matrix visualization for Spark DataFrame.
+
+    :param config: Report settings
+    :param df: Spark DataFrame
+    :return: HTML string of the matrix visualization
+    """
     df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count())
     return plot_missing_matrix(
         config,
@@ -80,11 +86,19 @@ def missing_matrix(config: Settings, df: DataFrame) -> str:


 def missing_heatmap(config: Settings, df: DataFrame) -> str:
+    """Generate a missing values heatmap for Spark DataFrame.
+
+    :param config: Report settings
+    :param df: Spark DataFrame
+    :return: HTML string of the heatmap
+    """
     df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count())

+    # Remove completely filled or completely empty variables.
     columns = [i for i, n in enumerate(np.var(df.isnull(), axis="rows")) if n > 0]
     df = df.iloc[:, columns]

+    # Create and mask the correlation matrix. Construct the base heatmap.
     corr_mat = df.isnull().corr()
     mask = np.zeros_like(corr_mat)
     mask[np.triu_indices_from(mask)] = True
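The adapter boils missingno's data access down to two calls, df.isnull().sum() and len(df), both answered from pre-computed values. A minimal sketch of that chain (the pandas Series standing in for the collected null counts and the size of 100 are illustrative assumptions; importing the module assumes a PySpark environment):

import pandas as pd

from ydata_profiling.model.spark.missing_spark import MissingnoBarSparkPatch

# Hypothetical null counts, as missing_bar would collect them from Spark.
nan_counts = pd.Series({"age": 3, "city": 0, "income": 7})

patched = MissingnoBarSparkPatch(
    df=nan_counts, columns=list(nan_counts.index), original_df_size=100
)

# The chained calls missingno performs internally resolve to the
# pre-computed values instead of touching a live Spark DataFrame.
assert patched.isnull().sum() is nan_counts
assert len(patched) == 100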

src/ydata_profiling/report/presentation/core/renderable.py

Lines changed: 7 additions & 2 deletions
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict, Optional
+from typing import Any, Callable, Dict, Optional


 class Renderable(ABC):
@@ -38,5 +38,10 @@ def __str__(self) -> str:
         return self.__class__.__name__

     @classmethod
-    def convert_to_class(cls, obj: "Renderable", flavour_func) -> None:
+    def convert_to_class(cls, obj: "Renderable", flavour_func: Callable[["Renderable"], None]) -> None:
+        """Convert the object's class to this class and recursively apply flavour to nested items.
+
+        :param obj: The renderable object to convert
+        :param flavour_func: Function to apply to nested renderable items
+        """
         obj.__class__ = cls
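convert_to_class relies on reassigning __class__ in place, which lets an already-built report tree be re-skinned without reconstructing it. A self-contained sketch of that mechanism (CoreWidget and HTMLWidget are hypothetical names, not part of this commit):

class CoreWidget:
    def __init__(self, name: str) -> None:
        self.name = name

    def render(self) -> str:
        raise NotImplementedError


class HTMLWidget(CoreWidget):
    def render(self) -> str:
        return f"<div>{self.name}</div>"


widget = CoreWidget("summary")
# The same in-place conversion convert_to_class performs: swap the
# class while keeping all of the instance's existing attributes.
widget.__class__ = HTMLWidget
print(widget.render())  # <div>summary</div>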

src/ydata_profiling/report/presentation/flavours/flavours.py

Lines changed: 22 additions & 5 deletions
@@ -1,29 +1,46 @@
 """
 Flavours registry information
 """
-from typing import Callable
+from typing import Callable, Dict, Type

 from ydata_profiling.report.presentation.core import Root
 from ydata_profiling.report.presentation.core.renderable import Renderable

-_FLAVOUR_REGISTRY: dict = {}
+_FLAVOUR_REGISTRY: Dict[str, Dict[Type[Renderable], Type[Renderable]]] = {}


-def register_flavour(name: str, mapping: dict) -> None:
+def register_flavour(name: str, mapping: Dict[Type[Renderable], Type[Renderable]]) -> None:
+    """Register a flavour mapping.
+
+    :param name: The flavour name
+    :param mapping: Dictionary mapping core renderable types to flavour-specific types
+    """
     _FLAVOUR_REGISTRY[name] = mapping


-def get_flavour_mapping(name: str) -> dict:
+def get_flavour_mapping(name: str) -> Dict[Type[Renderable], Type[Renderable]]:
+    """Get a registered flavour mapping.
+
+    :param name: The flavour name
+    :return: The flavour mapping dictionary
+    :raises ValueError: If the flavour is not registered
+    """
     if name not in _FLAVOUR_REGISTRY:
         raise ValueError(f"Flavour '{name}' is not registered.")
     return _FLAVOUR_REGISTRY[name]


 def apply_renderable_mapping(
-    mapping: dict,
+    mapping: Dict[Type[Renderable], Type[Renderable]],
     structure: Renderable,
     flavour_func: Callable[[Renderable], None],
 ) -> None:
+    """Apply flavour mapping to a renderable structure.
+
+    :param mapping: The flavour mapping dictionary
+    :param structure: The renderable structure to transform
+    :param flavour_func: The flavour application function for recursive calls
+    """
     mapping[type(structure)].convert_to_class(structure, flavour_func)
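A sketch of how the now-typed registry might be used (PlainTextRoot is a hypothetical flavour class; a real flavour maps every core renderable type, not just Root):

from ydata_profiling.report.presentation.core import Root
from ydata_profiling.report.presentation.flavours.flavours import (
    get_flavour_mapping,
    register_flavour,
)


class PlainTextRoot(Root):
    """Hypothetical flavour-specific Root subclass."""


register_flavour("plain_text", {Root: PlainTextRoot})

mapping = get_flavour_mapping("plain_text")
assert mapping[Root] is PlainTextRoot

try:
    get_flavour_mapping("markdown")
except ValueError as exc:
    print(exc)  # Flavour 'markdown' is not registered.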
src/ydata_profiling/report/presentation/flavours/html/table.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-from ydata_profiling.report.presentation.core.table import Table
+from ydata_profiling.report.presentation.core import Table
 from ydata_profiling.report.presentation.flavours.html import templates


src/ydata_profiling/report/presentation/flavours/html/templates.py

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 """Contains all templates used for generating the HTML profile report"""
 import shutil
 from pathlib import Path
+from typing import Any

 import jinja2


src/ydata_profiling/report/presentation/frequency_table_utils.py

Lines changed: 3 additions & 6 deletions
@@ -7,8 +7,6 @@
 def _frequency_table(
     freqtable: pd.Series, n: int, max_number_to_print: int
 ) -> List[Dict[str, Any]]:
-    # TODO: replace '' by '(Empty)' ?
-
     if max_number_to_print > n:
         max_number_to_print = n

@@ -26,7 +24,6 @@ def _frequency_table(

     max_freq = max(freqtable.values[0], freq_other, freq_missing)

-    # TODO: Correctly sort missing and other
     # No values
     if max_freq == 0:
         return []
@@ -77,7 +74,7 @@ def freq_table(
     freqtable: Union[pd.Series, List[pd.Series]],
     n: Union[int, List[int]],
     max_number_to_print: int,
-) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
+) -> List[List[Dict[str, Any]]]:
     """Render the rows for a frequency table (value, count).

     Args:
@@ -94,7 +91,7 @@ def freq_table(
             _frequency_table(v, n2, max_number_to_print) for v, n2 in zip(freqtable, n)
         ]
     else:
-        return [_frequency_table(freqtable, n, max_number_to_print)]  # type: ignore
+        return [_frequency_table(freqtable, n, max_number_to_print)]


 def _extreme_obs_table(
@@ -138,4 +135,4 @@ def extreme_obs_table(
         _extreme_obs_table(v, number_to_print, n1) for v, n1 in zip(freqtable, n)
     ]

-    return [_extreme_obs_table(freqtable, number_to_print, n)]  # type: ignore
+    return [_extreme_obs_table(freqtable, number_to_print, n)]
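With the narrowed return type, freq_table always returns a list of row lists, for single-Series and list inputs alike, which is what makes the # type: ignore comments unnecessary. A quick sketch of the resulting shapes (the toy Series is an illustrative assumption):

import pandas as pd

from ydata_profiling.report.presentation.frequency_table_utils import freq_table

counts = pd.Series(["a", "a", "b"]).value_counts()

# A single Series still comes back wrapped in an outer list: one inner row list.
rows = freq_table(freqtable=counts, n=3, max_number_to_print=10)
assert isinstance(rows, list) and isinstance(rows[0], list)

# A list of Series yields one inner row list per Series.
rows_multi = freq_table(freqtable=[counts, counts], n=[3, 3], max_number_to_print=10)
assert len(rows_multi) == 2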

0 commit comments