
Commit ae02233

Pkcha authored and committed
feat: initial release
1 parent 307270e · commit ae02233

7 files changed

Lines changed: 100 additions & 74 deletions


src/ydata_profiling/model/handler.py

Lines changed: 27 additions & 35 deletions
@@ -1,47 +1,44 @@
 """
 Auxiliary handler methods for data summary extraction
 """
-from typing import Any, Callable, Dict, List, Sequence, Tuple, Union
+from typing import Any, Callable, Dict, List, Sequence, Tuple

 import networkx as nx
 from visions import VisionsTypeset


-def compose(functions: Sequence[Callable]) -> Callable:
+def compose(functions: Sequence[Callable[..., Any]]) -> Callable[..., Tuple[Any, ...]]:
     """
     Compose a sequence of functions.

-    Each function in the sequence receives the result of the previous function.
-    Functions are expected to accept and return tuples for proper chaining.
-
-    :param functions: sequence of functions that accept and return tuples
-    :return: combined function applying all functions in order
+    Each function in the sequence should accept the arguments passed to the composed
+    function and return either a single value or a tuple of values.
+
+    :param functions: sequence of functions
+    :return: combined function applying all functions in order.
     """

     def composed_function(*args: Any) -> Tuple[Any, ...]:
-        result: Union[Tuple[Any, ...], Any] = args
+        result: Tuple[Any, ...] = args
         for func in functions:
-            if isinstance(result, tuple):
-                result = func(*result)
-            else:
-                result = func(result)
-        if isinstance(result, tuple):
-            return result
-        return (result,)
+            result = func(*result)
+            # Ensure result is always a tuple for consistent unpacking
+            if not isinstance(result, tuple):
+                result = (result,)
+        return result

     return composed_function


 class Handler:
     """A generic handler

-    Allows any custom mapping between data types and functions.
-    Functions are composed based on the type hierarchy defined in the typeset.
+    Allows any custom mapping between data types and functions
     """

     def __init__(
         self,
-        mapping: Dict[str, List[Callable]],
+        mapping: Dict[str, List[Callable[..., Any]]],
         typeset: VisionsTypeset,
         *args: Any,
         **kwargs: Any
@@ -54,33 +51,28 @@ def _complete_dag(self) -> None:
         for from_type, to_type in nx.topological_sort(
             nx.line_graph(self.typeset.base_graph)
         ):
-            from_key = str(from_type)
-            to_key = str(to_type)
-            self.mapping[to_key] = self.mapping.get(from_key, []) + self.mapping.get(
-                to_key, []
+            self.mapping[str(to_type)] = (
+                self.mapping[str(from_type)] + self.mapping[str(to_type)]
             )

-    def handle(self, dtype: str, *args: Any, **kwargs: Any) -> Dict[str, Any]:
+    def handle(self, dtype: str, *args: Any, **kwargs: Any) -> Any:
         """
-        Execute the handler chain for the given data type.
-
-        :param dtype: the data type to handle
-        :param args: arguments to pass to the handler functions
-        :param kwargs: keyword arguments (currently unused but reserved for extensibility)
-        :return: a dictionary containing the summary extracted from the data
+        Execute the handler chain for the given dtype.
+
+        :param dtype: The data type to handle
+        :param args: Arguments to pass to the handler chain
+        :return: The last element of the result tuple from the handler chain
         """
         funcs = self.mapping.get(dtype, [])
         op = compose(funcs)
-        result = op(*args)
-        if result:
-            return result[-1] if isinstance(result[-1], dict) else {}
-        return {}
+        summary = op(*args)[-1]
+        return summary


-def get_render_map() -> Dict[str, Callable]:
+def get_render_map() -> Dict[str, Callable[..., Any]]:
     import ydata_profiling.report.structure.variables as render_algorithms

-    render_map = {
+    render_map: Dict[str, Callable[..., Any]] = {
         "Boolean": render_algorithms.render_boolean,
         "Numeric": render_algorithms.render_real,
         "Complex": render_algorithms.render_complex,

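The reworked compose normalizes every intermediate result to a tuple, so summary functions can take positional arguments and return either a bare value or a tuple. A minimal sketch of that contract (describe and add_mean are hypothetical stand-ins for the summary functions a Handler mapping would supply; only compose comes from this commit):

from typing import Dict, List, Tuple

from ydata_profiling.model.handler import compose


def describe(values: List[int]) -> Tuple[List[int], Dict[str, float]]:
    # Returns a tuple: compose unpacks it into the next function's arguments.
    return values, {"n": float(len(values))}


def add_mean(values: List[int], summary: Dict[str, float]) -> Dict[str, float]:
    # Returns a bare dict: compose re-wraps it into a 1-tuple.
    summary["mean"] = sum(values) / len(values)
    return summary


op = compose([describe, add_mean])
print(op([1, 2, 3]))  # ({'n': 3.0, 'mean': 2.0},)
# Handler.handle then takes the last element of this tuple: op(*args)[-1]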
src/ydata_profiling/model/spark/missing_spark.py

Lines changed: 39 additions & 25 deletions
@@ -13,47 +13,47 @@

 class MissingnoBarSparkPatch:
     """
-    Technical Debt :
-    This is a monkey patching object that allows usage of the library missingno as is for spark dataframes.
-    This is because missingno library's bar function always applies a isnull().sum() on dataframes in the visualisation
-    function, instead of allowing just values counts as an entry point. Thus, in order to calculate the
-    missing values dataframe in spark, we compute it first, then wrap it in this MissingnoBarSparkPatch object which
-    will be unwrapped by missingno and return the pre-computed value counts.
-    The best fix to this currently terrible patch is to submit a PR to missingno to separate preprocessing function
-    (compute value counts from df) and visualisation functions such that we can call the visualisation directly.
-    Unfortunately, the missingno library people have not really responded to our issues on gitlab.
-    See https://github.com/ResidentMario/missingno/issues/119.
-    We could also fork the missingno library and implement some of the code in our database, but that feels
-    like bad practice as well.
+    Adapter class to enable missingno library compatibility with Spark DataFrames.
+
+    The missingno library's visualization functions internally call isnull().sum()
+    on dataframes. For Spark DataFrames, we pre-compute the null counts and wrap
+    them in this adapter to provide the expected interface.
+
+    Note: This is a workaround for missingno's lack of separation between
+    data preprocessing and visualization. See:
+    https://github.com/ResidentMario/missingno/issues/119
     """

     def __init__(
-        self, df: DataFrame, columns: List[str] = None, original_df_size: int = None
+        self,
+        df: DataFrame,
+        columns: Optional[List[str]] = None,
+        original_df_size: Optional[int] = None
     ):
         self.df = df
         self.columns = columns
         self.original_df_size = original_df_size

-    def isnull(self) -> Any:
-        """
-        This patches the .isnull().sum() function called by missingno library
-        """
-        return self  # return self to patch .sum() function
+    def isnull(self) -> "MissingnoBarSparkPatch":
+        """Returns self to enable chained .isnull().sum() calls."""
+        return self

     def sum(self) -> DataFrame:
-        """
-        This patches the .sum() function called by missingno library
-        """
-        return self.df  # return unwrapped dataframe
+        """Returns the pre-computed null counts dataframe."""
+        return self.df

     def __len__(self) -> Optional[int]:
-        """
-        This patches the len(df) function called by missingno library
-        """
+        """Returns the original dataframe size."""
         return self.original_df_size


 def missing_bar(config: Settings, df: DataFrame) -> str:
+    """Generate a missing values bar chart for Spark DataFrame.
+
+    :param config: Report settings
+    :param df: Spark DataFrame
+    :return: HTML string of the bar chart
+    """
     import pyspark.sql.functions as F

     data_nan_counts = (
@@ -70,6 +70,12 @@ def missing_bar(config: Settings, df: DataFrame) -> str:


 def missing_matrix(config: Settings, df: DataFrame) -> str:
+    """Generate a missing values matrix visualization for Spark DataFrame.
+
+    :param config: Report settings
+    :param df: Spark DataFrame
+    :return: HTML string of the matrix visualization
+    """
     df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count())
     return plot_missing_matrix(
         config,
@@ -80,11 +86,19 @@ def missing_matrix(config: Settings, df: DataFrame) -> str:


 def missing_heatmap(config: Settings, df: DataFrame) -> str:
+    """Generate a missing values heatmap for Spark DataFrame.
+
+    :param config: Report settings
+    :param df: Spark DataFrame
+    :return: HTML string of the heatmap
+    """
     df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count())

+    # Remove completely filled or completely empty variables.
     columns = [i for i, n in enumerate(np.var(df.isnull(), axis="rows")) if n > 0]
     df = df.iloc[:, columns]

+    # Create and mask the correlation matrix. Construct the base heatmap.
     corr_mat = df.isnull().corr()
     mask = np.zeros_like(corr_mat)
     mask[np.triu_indices_from(mask)] = True
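The adapter boils missingno's data access down to two calls, df.isnull().sum() and len(df), both answered from pre-computed values. A minimal sketch of that chain (the pandas Series standing in for the collected null counts and the size of 100 are illustrative assumptions; importing the module assumes a PySpark environment):

import pandas as pd

from ydata_profiling.model.spark.missing_spark import MissingnoBarSparkPatch

# Hypothetical null counts, as missing_bar would collect them from Spark.
nan_counts = pd.Series({"age": 3, "city": 0, "income": 7})

patched = MissingnoBarSparkPatch(
    df=nan_counts, columns=list(nan_counts.index), original_df_size=100
)

# The chained calls missingno performs internally resolve to the
# pre-computed values instead of touching a live Spark DataFrame.
assert patched.isnull().sum() is nan_counts
assert len(patched) == 100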

src/ydata_profiling/report/presentation/core/renderable.py

Lines changed: 7 additions & 2 deletions
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict, Optional
+from typing import Any, Callable, Dict, Optional


 class Renderable(ABC):
@@ -38,5 +38,10 @@ def __str__(self) -> str:
         return self.__class__.__name__

     @classmethod
-    def convert_to_class(cls, obj: "Renderable", flavour_func) -> None:
+    def convert_to_class(cls, obj: "Renderable", flavour_func: Callable[["Renderable"], None]) -> None:
+        """Convert the object's class to this class and recursively apply flavour to nested items.
+
+        :param obj: The renderable object to convert
+        :param flavour_func: Function to apply to nested renderable items
+        """
         obj.__class__ = cls
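convert_to_class relies on reassigning __class__ in place, which lets an already-built report tree be re-skinned without reconstructing it. A self-contained sketch of that mechanism (CoreWidget and HTMLWidget are hypothetical names, not part of this commit):

class CoreWidget:
    def __init__(self, name: str) -> None:
        self.name = name

    def render(self) -> str:
        raise NotImplementedError


class HTMLWidget(CoreWidget):
    def render(self) -> str:
        return f"<div>{self.name}</div>"


widget = CoreWidget("summary")
# The same in-place conversion convert_to_class performs: swap the
# class while keeping all of the instance's existing attributes.
widget.__class__ = HTMLWidget
print(widget.render())  # <div>summary</div>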

src/ydata_profiling/report/presentation/flavours/flavours.py

Lines changed: 22 additions & 5 deletions
@@ -1,29 +1,46 @@
 """
 Flavours registry information
 """
-from typing import Callable
+from typing import Callable, Dict, Type

 from ydata_profiling.report.presentation.core import Root
 from ydata_profiling.report.presentation.core.renderable import Renderable

-_FLAVOUR_REGISTRY: dict = {}
+_FLAVOUR_REGISTRY: Dict[str, Dict[Type[Renderable], Type[Renderable]]] = {}


-def register_flavour(name: str, mapping: dict) -> None:
+def register_flavour(name: str, mapping: Dict[Type[Renderable], Type[Renderable]]) -> None:
+    """Register a flavour mapping.
+
+    :param name: The flavour name
+    :param mapping: Dictionary mapping core renderable types to flavour-specific types
+    """
     _FLAVOUR_REGISTRY[name] = mapping


-def get_flavour_mapping(name: str) -> dict:
+def get_flavour_mapping(name: str) -> Dict[Type[Renderable], Type[Renderable]]:
+    """Get a registered flavour mapping.
+
+    :param name: The flavour name
+    :return: The flavour mapping dictionary
+    :raises ValueError: If the flavour is not registered
+    """
     if name not in _FLAVOUR_REGISTRY:
         raise ValueError(f"Flavour '{name}' is not registered.")
     return _FLAVOUR_REGISTRY[name]


 def apply_renderable_mapping(
-    mapping: dict,
+    mapping: Dict[Type[Renderable], Type[Renderable]],
     structure: Renderable,
     flavour_func: Callable[[Renderable], None],
 ) -> None:
+    """Apply flavour mapping to a renderable structure.
+
+    :param mapping: The flavour mapping dictionary
+    :param structure: The renderable structure to transform
+    :param flavour_func: The flavour application function for recursive calls
+    """
     mapping[type(structure)].convert_to_class(structure, flavour_func)
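A sketch of how the now-typed registry might be used (PlainTextRoot is a hypothetical flavour class; a real flavour maps every core renderable type, not just Root):

from ydata_profiling.report.presentation.core import Root
from ydata_profiling.report.presentation.flavours.flavours import (
    get_flavour_mapping,
    register_flavour,
)


class PlainTextRoot(Root):
    """Hypothetical flavour-specific Root subclass."""


register_flavour("plain_text", {Root: PlainTextRoot})

mapping = get_flavour_mapping("plain_text")
assert mapping[Root] is PlainTextRoot

try:
    get_flavour_mapping("markdown")
except ValueError as exc:
    print(exc)  # Flavour 'markdown' is not registered.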
src/ydata_profiling/report/presentation/flavours/html/table.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-from ydata_profiling.report.presentation.core.table import Table
+from ydata_profiling.report.presentation.core import Table
 from ydata_profiling.report.presentation.flavours.html import templates


src/ydata_profiling/report/presentation/flavours/html/templates.py

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 """Contains all templates used for generating the HTML profile report"""
 import shutil
 from pathlib import Path
+from typing import Any

 import jinja2


src/ydata_profiling/report/presentation/frequency_table_utils.py

Lines changed: 3 additions & 6 deletions
@@ -7,8 +7,6 @@
 def _frequency_table(
     freqtable: pd.Series, n: int, max_number_to_print: int
 ) -> List[Dict[str, Any]]:
-    # TODO: replace '' by '(Empty)' ?
-
     if max_number_to_print > n:
         max_number_to_print = n

@@ -26,7 +24,6 @@ def _frequency_table(

     max_freq = max(freqtable.values[0], freq_other, freq_missing)

-    # TODO: Correctly sort missing and other
     # No values
     if max_freq == 0:
         return []
@@ -77,7 +74,7 @@ def freq_table(
     freqtable: Union[pd.Series, List[pd.Series]],
     n: Union[int, List[int]],
     max_number_to_print: int,
-) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
+) -> List[List[Dict[str, Any]]]:
     """Render the rows for a frequency table (value, count).

     Args:
@@ -94,7 +91,7 @@ def freq_table(
             _frequency_table(v, n2, max_number_to_print) for v, n2 in zip(freqtable, n)
         ]
     else:
-        return [_frequency_table(freqtable, n, max_number_to_print)]  # type: ignore
+        return [_frequency_table(freqtable, n, max_number_to_print)]


 def _extreme_obs_table(
@@ -138,4 +135,4 @@ def extreme_obs_table(
         _extreme_obs_table(v, number_to_print, n1) for v, n1 in zip(freqtable, n)
     ]

-    return [_extreme_obs_table(freqtable, number_to_print, n)]  # type: ignore
+    return [_extreme_obs_table(freqtable, number_to_print, n)]
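With the narrowed return type, freq_table always returns a list of row lists, for single-Series and list inputs alike, which is what makes the # type: ignore comments unnecessary. A quick sketch of the resulting shapes (the toy Series is an illustrative assumption):

import pandas as pd

from ydata_profiling.report.presentation.frequency_table_utils import freq_table

counts = pd.Series(["a", "a", "b"]).value_counts()

# A single Series still comes back wrapped in an outer list: one inner row list.
rows = freq_table(freqtable=counts, n=3, max_number_to_print=10)
assert isinstance(rows, list) and isinstance(rows[0], list)

# A list of Series yields one inner row list per Series.
rows_multi = freq_table(freqtable=[counts, counts], n=[3, 3], max_number_to_print=10)
assert len(rows_multi) == 2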

0 commit comments