Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Slim Python 3.10 base keeps the runtime image small.
FROM python:3.10-slim

WORKDIR /app

# build-essential provides a C toolchain for any native extensions pulled in
# by pip; the apt cache is removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
&& rm -rf /var/lib/apt/lists/*

# Copy the whole project context (package sources plus metadata) into /app.
COPY . .

# NOTE(review): setuptools is pinned twice — once before building the local
# package and once after installing it. Presumably the second pin re-asserts
# the constraint in case `pip install .` pulled a setuptools outside the
# pinned range; confirm this is intentional before simplifying.
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \
pip install --no-cache-dir . && \
pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \
pip install --no-cache-dir jupyter

# Jupyter notebook's default port.
EXPOSE 8888

CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]


171 changes: 90 additions & 81 deletions src/ydata_profiling/model/handler.py
Original file line number Diff line number Diff line change
@@ -1,81 +1,90 @@
"""
Auxiliary handler methods for data summary extraction
"""
from typing import Any, Callable, Dict, List, Sequence

import networkx as nx
from visions import VisionsTypeset


def compose(functions: Sequence[Callable]) -> Callable:
    """
    Compose a sequence of functions into a single left-to-right pipeline.

    Each function receives the previous step's result unpacked as positional
    arguments and may return either a single value or a tuple of values.

    :param functions: sequence of functions
    :return: combined function applying all functions in order; it always
        returns a tuple, so callers can safely index ``result[-1]``.
    """

    def composed_function(*args: Any) -> tuple:
        result: tuple = args  # Start with the input arguments
        for func in functions:
            outcome = func(*result)
            # Normalise every intermediate (and the final) result to a tuple:
            # the original returned a bare scalar when the last function did,
            # which broke the caller's ``result[-1]`` indexing.
            result = outcome if isinstance(outcome, tuple) else (outcome,)
        return result

    return composed_function


class Handler:
    """A generic handler

    Allows any custom mapping between data types and functions
    """

    def __init__(
        self,
        mapping: Dict[str, List[Callable]],
        typeset: VisionsTypeset,
        *args,
        **kwargs
    ):
        # mapping: type name -> list of summary functions for that type
        # typeset: visions typeset whose base graph defines the type hierarchy
        self.mapping = mapping
        self.typeset = typeset
        self._complete_dag()

    def _complete_dag(self) -> None:
        """Prepend each ancestor type's functions to its descendants' pipelines.

        Iterates the typeset's relation edges in topological order; because
        ``self.mapping`` is updated in that same order, a child type's
        pipeline always inherits the already-completed pipeline of its parent.
        """
        for from_type, to_type in nx.topological_sort(
            nx.line_graph(self.typeset.base_graph)
        ):
            self.mapping[str(to_type)] = (
                self.mapping[str(from_type)] + self.mapping[str(to_type)]
            )

    def handle(self, dtype: str, *args, **kwargs) -> dict:
        """Run the chain of summary functions registered for ``dtype``.

        :param dtype: name of the detected data type
        :param args: arguments threaded through the function chain
        :return: the summary produced by the last function in the chain
        """
        funcs = self.mapping.get(dtype, [])
        op = compose(funcs)
        # The composed pipeline yields a tuple; the summary is its last item.
        summary = op(*args)[-1]
        return summary


def get_render_map() -> Dict[str, Callable]:
    """Return the mapping from visions type names to render functions.

    The render module is imported at call time rather than at module import.

    :return: dictionary of type name -> render function
    """
    import ydata_profiling.report.structure.variables as render_algorithms

    return dict(
        Boolean=render_algorithms.render_boolean,
        Numeric=render_algorithms.render_real,
        Complex=render_algorithms.render_complex,
        Text=render_algorithms.render_text,
        DateTime=render_algorithms.render_date,
        Categorical=render_algorithms.render_categorical,
        URL=render_algorithms.render_url,
        Path=render_algorithms.render_path,
        File=render_algorithms.render_file,
        Image=render_algorithms.render_image,
        Unsupported=render_algorithms.render_generic,
        TimeSeries=render_algorithms.render_timeseries,
    )
"""
Auxiliary handler methods for data summary extraction
"""
from typing import Any, Callable, Dict, List, Sequence, Tuple

import networkx as nx
from visions import VisionsTypeset


def compose(functions: Sequence[Callable[..., Any]]) -> Callable[..., Tuple[Any, ...]]:
    """Combine *functions* into a single left-to-right pipeline.

    Every step receives the previous step's result unpacked as positional
    arguments and may return either a single value or a tuple of values.

    :param functions: sequence of functions
    :return: combined function applying all functions in order; its result is
        always a tuple.
    """

    def pipeline(*initial: Any) -> Tuple[Any, ...]:
        current: Tuple[Any, ...] = initial
        for step in functions:
            outcome = step(*current)
            # Keep the running value a tuple so the next step can unpack it.
            current = outcome if isinstance(outcome, tuple) else (outcome,)
        return current

    return pipeline


class Handler:
    """A generic handler.

    Dispatches a data-type name to its chain of summary functions, allowing
    any custom mapping between data types and functions.
    """

    def __init__(
        self,
        mapping: Dict[str, List[Callable[..., Any]]],
        typeset: VisionsTypeset,
        *args: Any,
        **kwargs: Any
    ):
        self.mapping = mapping
        self.typeset = typeset
        self._complete_dag()

    def _complete_dag(self) -> None:
        # Visit the typeset's relation edges in topological order so that each
        # child type's pipeline is prefixed by its parent's already-completed
        # pipeline.
        edge_order = nx.topological_sort(nx.line_graph(self.typeset.base_graph))
        for parent, child in edge_order:
            self.mapping[str(child)] = self.mapping[str(parent)] + self.mapping[str(child)]

    def handle(self, dtype: str, *args: Any, **kwargs: Any) -> Any:
        """Execute the handler chain for the given dtype.

        :param dtype: The data type to handle
        :param args: Arguments to pass to the handler chain
        :return: The last element of the result tuple from the handler chain
        """
        pipeline = compose(self.mapping.get(dtype, []))
        return pipeline(*args)[-1]


def get_render_map() -> Dict[str, Callable[..., Any]]:
    """Map visions type names to their report render functions.

    The render functions are imported at call time rather than at module
    import.

    :return: dictionary from type name to render function
    """
    from ydata_profiling.report.structure.variables import (
        render_boolean,
        render_categorical,
        render_complex,
        render_date,
        render_file,
        render_generic,
        render_image,
        render_path,
        render_real,
        render_text,
        render_timeseries,
        render_url,
    )

    return {
        "Boolean": render_boolean,
        "Numeric": render_real,
        "Complex": render_complex,
        "Text": render_text,
        "DateTime": render_date,
        "Categorical": render_categorical,
        "URL": render_url,
        "Path": render_path,
        "File": render_file,
        "Image": render_image,
        "Unsupported": render_generic,
        "TimeSeries": render_timeseries,
    }
63 changes: 37 additions & 26 deletions src/ydata_profiling/model/spark/missing_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,50 +13,49 @@

class MissingnoBarSparkPatch:
"""
Technical Debt :
This is a monkey patching object that allows usage of the library missingno as is for spark dataframes.
This is because missingno library's bar function always applies a isnull().sum() on dataframes in the visualisation
function, instead of allowing just values counts as an entry point. Thus, in order to calculate the
missing values dataframe in spark, we compute it first, then wrap it in this MissingnoBarSparkPatch object which
will be unwrapped by missingno and return the pre-computed value counts.
The best fix to this currently terrible patch is to submit a PR to missingno to separate preprocessing function
(compute value counts from df) and visualisation functions such that we can call the visualisation directly.
Unfortunately, the missingno library people have not really responded to our issues on gitlab.
See https://github.com/ResidentMario/missingno/issues/119.
We could also fork the missingno library and implement some of the code in our database, but that feels
like bad practice as well.
Adapter class to enable missingno library compatibility with Spark DataFrames.

The missingno library's visualization functions internally call isnull().sum()
on dataframes. For Spark DataFrames, we pre-compute the null counts and wrap
them in this adapter to provide the expected interface.

Note: This is a workaround for missingno's lack of separation between
data preprocessing and visualization. See:
https://github.com/ResidentMario/missingno/issues/119
"""

def __init__(
self, df: DataFrame, columns: List[str] = None, original_df_size: int = None
self,
df: DataFrame,
columns: Optional[List[str]] = None,
original_df_size: Optional[int] = None
):
self.df = df
self.columns = columns
self.original_df_size = original_df_size

def isnull(self) -> Any:
"""
This patches the .isnull().sum() function called by missingno library
"""
return self # return self to patch .sum() function
def isnull(self) -> "MissingnoBarSparkPatch":
"""Returns self to enable chained .isnull().sum() calls."""
return self

def sum(self) -> DataFrame:
"""
This patches the .sum() function called by missingno library
"""
return self.df # return unwrapped dataframe
"""Returns the pre-computed null counts dataframe."""
return self.df

def __len__(self) -> Optional[int]:
"""
This patches the len(df) function called by missingno library
"""
"""Returns the original dataframe size."""
return self.original_df_size


def missing_bar(config: Settings, df: DataFrame) -> str:
"""Generate a missing values bar chart for Spark DataFrame.

:param config: Report settings
:param df: Spark DataFrame
:return: HTML string of the bar chart
"""
import pyspark.sql.functions as F

# FIXME: move to univariate
data_nan_counts = (
df.agg(
*[F.count(F.when(F.isnull(c) | F.isnan(c), c)).alias(c) for c in df.columns]
Expand All @@ -71,6 +70,12 @@ def missing_bar(config: Settings, df: DataFrame) -> str:


def missing_matrix(config: Settings, df: DataFrame) -> str:
"""Generate a missing values matrix visualization for Spark DataFrame.

:param config: Report settings
:param df: Spark DataFrame
:return: HTML string of the matrix visualization
"""
df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count())
return plot_missing_matrix(
config,
Expand All @@ -81,6 +86,12 @@ def missing_matrix(config: Settings, df: DataFrame) -> str:


def missing_heatmap(config: Settings, df: DataFrame) -> str:
"""Generate a missing values heatmap for Spark DataFrame.

:param config: Report settings
:param df: Spark DataFrame
:return: HTML string of the heatmap
"""
df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count())

# Remove completely filled or completely empty variables.
Expand Down
11 changes: 8 additions & 3 deletions src/ydata_profiling/report/presentation/core/renderable.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional
from typing import Any, Callable, Dict, Optional


class Renderable(ABC):
Expand Down Expand Up @@ -34,9 +34,14 @@ def classes(self) -> str:
def render(self) -> Any:
pass

def __str__(self):
def __str__(self) -> str:
return self.__class__.__name__

@classmethod
def convert_to_class(cls, obj: "Renderable", flavour_func) -> None: # noqa: ANN001
def convert_to_class(cls, obj: "Renderable", flavour_func: Callable[["Renderable"], None]) -> None:
"""Convert the object's class to this class and recursively apply flavour to nested items.

:param obj: The renderable object to convert
:param flavour_func: Function to apply to nested renderable items
"""
obj.__class__ = cls
29 changes: 24 additions & 5 deletions src/ydata_profiling/report/presentation/flavours/flavours.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,46 @@
"""
Flavours registry information
"""
from typing import Callable, Dict, Type

from ydata_profiling.report.presentation.core import Root
from ydata_profiling.report.presentation.core.renderable import Renderable

_FLAVOUR_REGISTRY: dict = {}
_FLAVOUR_REGISTRY: Dict[str, Dict[Type[Renderable], Type[Renderable]]] = {}


def register_flavour(name: str, mapping: dict) -> None:
def register_flavour(name: str, mapping: Dict[Type[Renderable], Type[Renderable]]) -> None:
"""Register a flavour mapping.

:param name: The flavour name
:param mapping: Dictionary mapping core renderable types to flavour-specific types
"""
_FLAVOUR_REGISTRY[name] = mapping


def get_flavour_mapping(name: str) -> dict:
def get_flavour_mapping(name: str) -> Dict[Type[Renderable], Type[Renderable]]:
"""Get a registered flavour mapping.

:param name: The flavour name
:return: The flavour mapping dictionary
:raises ValueError: If the flavour is not registered
"""
if name not in _FLAVOUR_REGISTRY:
raise ValueError(f"Flavour '{name}' is not registered.")
return _FLAVOUR_REGISTRY[name]


def apply_renderable_mapping(
mapping: dict,
mapping: Dict[Type[Renderable], Type[Renderable]],
structure: Renderable,
flavour_func, # noqa: ANN001
flavour_func: Callable[[Renderable], None],
) -> None:
"""Apply flavour mapping to a renderable structure.

:param mapping: The flavour mapping dictionary
:param structure: The renderable structure to transform
:param flavour_func: The flavour application function for recursive calls
"""
mapping[type(structure)].convert_to_class(structure, flavour_func)


Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from ydata_profiling.report.presentation.core.table import Table
from ydata_profiling.report.presentation.core import Table
from ydata_profiling.report.presentation.flavours.html import templates


Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Contains all templates used for generating the HTML profile report"""
import shutil
from pathlib import Path
from typing import Any

import jinja2

Expand Down
Loading