Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Image that installs this project with pip and starts a Jupyter Notebook server.
FROM python:3.10-slim

# Every following instruction (and the container's default command) runs from /app.
WORKDIR /app

# Install build-essential without recommended extras, then delete the apt
# package lists in the same layer so they are not kept in the image.
# NOTE(review): presumably needed to compile native extensions during the pip
# installs below - confirm it is still required.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
&& rm -rf /var/lib/apt/lists/*

# Copy the entire build context into /app; the project is installed from here.
COPY . .

# One layer of pip installs, all with pip's cache disabled:
#   1. upgrade pip itself
#   2. install wheel and a setuptools pinned to the 72.x-79.x range
#   3. install the project from the copied sources
#   4. apply the same setuptools pin again
#      NOTE(review): presumably restores the pin in case step 3 changed the
#      installed setuptools version - confirm before removing as redundant.
#   5. install jupyter (no version pin)
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \
pip install --no-cache-dir . && \
pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \
pip install --no-cache-dir jupyter

# Port the notebook server below listens on.
EXPOSE 8888

# Start Jupyter Notebook on all interfaces, port 8888, without opening a browser.
# --allow-root is passed because this file sets no USER instruction.
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]


18 changes: 9 additions & 9 deletions src/ydata_profiling/model/alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from ydata_profiling.utils.styles import get_alert_styles


def fmt_percent(value: float, edge_cases: bool = True) -> str:
"""Format a ratio as a percentage.
def _fmt_percent(value: float, edge_cases: bool = True) -> str:
"""Format a ratio as a percentage (internal copy to avoid circular imports).

Args:
edge_cases: Check for edge cases?
Expand Down Expand Up @@ -209,7 +209,7 @@ def __init__(

def _get_description(self) -> str:
if self.values is not None:
return f"Dataset has {self.values['n_duplicates']} ({fmt_percent(self.values['p_duplicates'])}) duplicate rows"
return f"Dataset has {self.values['n_duplicates']} ({_fmt_percent(self.values['p_duplicates'])}) duplicate rows"
else:
return "Dataset has no duplicated rows"

Expand All @@ -231,7 +231,7 @@ def __init__(

def _get_description(self) -> str:
if self.values is not None:
return f"Dataset has {self.values['n_near_dups']} ({fmt_percent(self.values['p_near_dups'])}) near duplicate rows"
return f"Dataset has {self.values['n_near_dups']} ({_fmt_percent(self.values['p_near_dups'])}) near duplicate rows"
else:
return "Dataset has no near duplicated rows"

Expand Down Expand Up @@ -272,7 +272,7 @@ def __init__(

def _get_description(self) -> str:
if self.values is not None:
return f"[{self.column_name}] has {self.values['n_distinct']:} ({fmt_percent(self.values['p_distinct'])}) distinct values"
return f"[{self.column_name}] has {self.values['n_distinct']:} ({_fmt_percent(self.values['p_distinct'])}) distinct values"
else:
return f"[{self.column_name}] has a high cardinality"

Expand All @@ -294,7 +294,7 @@ def __init__(

def _get_description(self) -> str:
if self.values is not None:
return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {fmt_percent(self.values['p_fuzzy_vals'])} per category"
return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {_fmt_percent(self.values['p_fuzzy_vals'])} per category"
else:
return f"[{self.column_name}] no dirty categories values."

Expand Down Expand Up @@ -365,7 +365,7 @@ def __init__(

def _get_description(self) -> str:
if self.values is not None:
return f"[{self.column_name}] has {self.values['n_infinite']} ({fmt_percent(self.values['p_infinite'])}) infinite values"
return f"[{self.column_name}] has {self.values['n_infinite']} ({_fmt_percent(self.values['p_infinite'])}) infinite values"
else:
return f"[{self.column_name}] has infinite values"

Expand All @@ -387,7 +387,7 @@ def __init__(

def _get_description(self) -> str:
if self.values is not None:
return f"[{self.column_name}] {self.values['n_missing']} ({fmt_percent(self.values['p_missing'])}) missing values"
return f"[{self.column_name}] {self.values['n_missing']} ({_fmt_percent(self.values['p_missing'])}) missing values"
else:
return f"[{self.column_name}] has missing values"

Expand Down Expand Up @@ -541,7 +541,7 @@ def __init__(

def _get_description(self) -> str:
if self.values is not None:
return f"[{self.column_name}] has {self.values['n_zeros']} ({fmt_percent(self.values['p_zeros'])}) zeros"
return f"[{self.column_name}] has {self.values['n_zeros']} ({_fmt_percent(self.values['p_zeros'])}) zeros"
else:
return f"[{self.column_name}] has predominantly zeros"

Expand Down
15 changes: 0 additions & 15 deletions src/ydata_profiling/model/summary_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,6 @@
T = TypeVar("T")


def func_nullable_series_contains(fn: Callable) -> Callable:
    """Decorate a series predicate so it is evaluated on non-missing values only.

    Args:
        fn: Callable invoked as ``fn(config, series, state, *args, **kwargs)``.

    Returns:
        A wrapper with the same call signature. When the incoming series has
        missing values they are dropped before ``fn`` is called; if nothing is
        left after dropping, the wrapper returns ``False`` without calling
        ``fn``. A series without missing values is passed through untouched
        (even when it is empty).
    """

    @functools.wraps(fn)
    def inner(
        config: Settings, series: pd.Series, state: dict, *args, **kwargs
    ) -> bool:
        had_missing = series.hasnans
        cleaned = series.dropna() if had_missing else series

        # Short-circuit only when dropping the missing values emptied the series.
        if had_missing and cleaned.empty:
            return False

        return fn(config, cleaned, state, *args, **kwargs)

    return inner


def safe_histogram(
values: np.ndarray,
bins: Union[int, str, np.ndarray] = "auto",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ def render_common(config: Settings, summary: dict) -> dict:
n_freq_table_max = config.n_freq_table_max

template_variables = {
# TODO: with nan
"freq_table_rows": freq_table(
freqtable=summary["value_counts_without_nan"],
n=summary["n"],
Expand Down