Skip to content

Commit 5241b5b

Browse files
committed
fix: improve hist compute logic
1 parent 34d7ffd commit 5241b5b

1 file changed

Lines changed: 55 additions & 16 deletions

File tree

src/ydata_profiling/model/summary_algorithms.py

Lines changed: 55 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,35 @@ def inner(
2525

2626
return inner
2727

28+
def safe_histogram(
    values: np.ndarray,
    bins: Union[int, str, np.ndarray] = "auto",
    weights: Optional[np.ndarray] = None,
    density: bool = False,
) -> Tuple[np.ndarray, np.ndarray]:
    """Compute a histogram, degrading gracefully when the bins cannot be built.

    Wraps ``np.histogram`` to avoid
    ``ValueError: Too many bins for data range. Cannot create N finite-sized bins.``

    Fallback order when that error is raised:

    1. Retry with ``bins="auto"`` (unweighted data only, because NumPy does
       not support automatic bin estimation for weighted data).
    2. Use a single bin spanning the finite values. If all finite values are
       equal, the bin is widened symmetrically around that value.

    Args:
        values: Data to bin.
        bins: Bin count, bin-selection method name, or explicit bin edges,
            forwarded to ``np.histogram``.
        weights: Optional per-value weights, forwarded to ``np.histogram``.
        density: Forwarded to ``np.histogram``.

    Returns:
        A ``(hist, bin_edges)`` tuple as returned by ``np.histogram``, or two
        empty arrays when the fallback finds no finite values.

    Raises:
        ValueError: Any ``ValueError`` from the first ``np.histogram`` call
            other than the "Too many bins for data range" one is re-raised.
    """
    try:
        return np.histogram(values, bins=bins, weights=weights, density=density)
    except ValueError as exc:
        if "Too many bins for data range" not in str(exc):
            raise

    # First fallback: let NumPy pick the bins. Skipped for weighted data,
    # where np.histogram raises TypeError for string bin estimators and the
    # single-bin fallback below would never be reached.
    if weights is None:
        try:
            return np.histogram(values, bins="auto", density=density)
        except ValueError:
            # Deliberate: automatic selection failed too, so fall through
            # to the single-bin fallback.
            pass

    # Second fallback: one bin covering the finite data.
    values = np.asarray(values)  # boolean-mask indexing needs an ndarray
    finite = values[np.isfinite(values)]
    if finite.size == 0:
        return np.array([]), np.array([])

    vmin = float(np.min(finite))
    vmax = float(np.max(finite))
    if vmin == vmax:
        # Constant data: widen the bin proportionally to the magnitude so
        # the two edges stay distinct even for very large values.
        eps = 0.5 if vmin == 0 else abs(vmin) * 0.5
        bin_edges = np.array([vmin - eps, vmin + eps])
    else:
        bin_edges = np.array([vmin, vmax])
    return np.histogram(values, bins=bin_edges, weights=weights, density=density)
2857

2958
def histogram_compute(
3059
config: Settings,
@@ -38,33 +67,43 @@ def histogram_compute(
3867
return {name: []}
3968

4069
hist_config = config.plot.histogram
41-
bins_arg = "auto" if hist_config.bins == 0 else min(hist_config.bins, n_unique)
4270

43-
def _safe_histogram_bin_edges(values: np.ndarray, bins_param: Union[int, str]) -> np.ndarray:
44-
try:
45-
return np.histogram_bin_edges(values, bins=bins_param)
46-
except ValueError as exc:
47-
if "Too many bins for data range" in str(exc):
48-
# fallback: auto selection
49-
return np.histogram_bin_edges(values, bins="auto")
50-
raise
71+
# Compute data range
72+
finite = finite_values[np.isfinite(finite_values)]
73+
vmin = float(np.min(finite))
74+
vmax = float(np.max(finite))
75+
data_range = vmax - vmin
76+
77+
# Choose the bins based on the observed data values
78+
if data_range == 0:
79+
eps = 0.5 if vmin == 0 else abs(vmin) * 0.1
80+
bins = np.array([vmin - eps, vmin + eps])
81+
else:
82+
requested_bins = hist_config.bins if hist_config.bins > 0 else "auto"
83+
84+
if isinstance(requested_bins, int):
85+
safe_bins = min(requested_bins, n_unique, hist_config.max_bins)
5186

52-
bins = _safe_histogram_bin_edges(finite_values, bins_arg)
87+
safe_bins = max(1, safe_bins)
5388

54-
if len(bins) > hist_config.max_bins:
55-
bins = _safe_histogram_bin_edges(finite_values, hist_config.max_bins)
56-
if weights is not None and len(weights) != len(bins):
57-
weights = None
89+
bins = np.linspace(vmin, vmax, safe_bins + 1)
90+
else:
91+
bins = np.histogram_bin_edges(finite_values, bins="auto")
92+
if len(bins) - 1 > hist_config.max_bins:
93+
bins = np.linspace(vmin, vmax, hist_config.max_bins + 1)
5894

59-
stats[name] = np.histogram(
95+
hist = np.histogram(
6096
finite_values,
6197
bins=bins,
6298
weights=weights,
63-
density=config.plot.histogram.density,
99+
density=hist_config.density,
64100
)
101+
102+
stats[name] = hist
65103
return stats
66104

67105

106+
68107
def chi_square(
69108
values: Optional[np.ndarray] = None, histogram: Optional[np.ndarray] = None
70109
) -> dict:

0 commit comments

Comments
 (0)