Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
strategy:
matrix:
os: [ ubuntu-22.04 ]
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13" ]
python-version: ["3.10", "3.11", "3.12", "3.13" ]
pandas: [ "pandas>1.1" ]
numpy: [ "numpy>=1.21" ]
runs-on: ${{ matrix.os }}
Expand Down Expand Up @@ -65,7 +65,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-22.04]
python-version: ["3.12"]
python-version: ["3.13"]
pandas: [ "pandas>1.1" ]
numpy: [ "numpy>=1.21" ]

Expand Down
35 changes: 17 additions & 18 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ package_name = "ydata-profiling"

[project]
name = "ydata-profiling"
requires-python = ">=3.7,<3.14"
requires-python = ">=3.10,<3.14"
authors = [
{name = "YData Labs Inc", email = "opensource@ydata.ai"}
]
Expand All @@ -32,43 +32,42 @@ classifiers = [
"Topic :: Scientific/Engineering",
"Framework :: IPython",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
]

dependencies = [
"scipy>=1.4.1, <1.16",
"pandas>1.1, <3.0, !=1.4.0",
"scipy>=1.8, <1.17",
"pandas>1.5, <3.0, !=1.4.0",
"matplotlib>=3.5, <=3.10",
"pydantic>=2",
"PyYAML>=5.0.0, <6.1",
"jinja2>=2.11.1, <3.2",
"pydantic>=2, <3",
"PyYAML>=6.0.3, <6.1",
"jinja2>=3.1.6, <3.2",
"visions[type_image_path]>=0.7.5, <0.8.2",
"numpy>=1.16.0,<2.2",
"numpy>=1.22,<2.4",
# Could be optional
# Related to HTML report
"minify-html>=0.15.0",
"filetype>=1.0.0",
# Correlations
"phik>=0.11.1,<0.13",
"phik>=0.12.5, <0.13",
# Examples
"requests>=2.24.0, <3",
"requests>=2.32.0, <3",
# Progress bar
"tqdm>=4.48.2, <5",
"tqdm>=4.66.3, <5",
"seaborn>=0.10.1, <0.14",
"multimethod>=1.4, <2",
# metrics
"statsmodels>=0.13.2, <1",
# type checking
"typeguard>=3, <5",
"imagehash==4.3.1",
"wordcloud>=1.9.3",
"dacite>=1.8",
"numba>=0.56.0, <=0.61",
"typeguard>=4, <5",
"imagehash==4.3.2",
"wordcloud>=1.9.4",
"dacite>=1.9, <2",
"numba>=0.60,<0.63",
]

dynamic = [
Expand Down
3 changes: 0 additions & 3 deletions src/ydata_profiling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from ydata_profiling.controller import pandas_decorator # isort:skip # noqa
from ydata_profiling.profile_report import ProfileReport # isort:skip # noqa
from ydata_profiling.version import __version__ # isort:skip # noqa
from ydata_profiling.utils.information import display_banner

# backend
import ydata_profiling.model.pandas # isort:skip # noqa
Expand All @@ -26,8 +25,6 @@

warnings.simplefilter("ignore", category=NumbaDeprecationWarning)

display_banner()

__all__ = [
"pandas_decorator",
"ProfileReport",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
series_handle_nulls,
series_hashable,
)
from ydata_profiling.utils.information import DisplayInfo


def get_character_counts_vc(vc: pd.Series) -> pd.Series:
Expand Down Expand Up @@ -270,11 +269,6 @@ def pandas_describe_categorical_1d(

if config.vars.cat.dirty_categories: # noqa: SIM102
if not _displayed_catvar_banner:
display_info = DisplayInfo(
title="Identify dirty categories with ydata-sdk",
info_text="This feature is only available for ydata-sdk users. Register to give try it.",
)
display_info.display_message()
_displayed_catvar_banner = True

return config, series, summary
107 changes: 95 additions & 12 deletions src/ydata_profiling/model/summary_algorithms.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import functools
from typing import Any, Callable, Optional, Tuple, TypeVar
from typing import Any, Callable, Optional, Tuple, TypeVar, Union

import numpy as np
import pandas as pd
Expand All @@ -26,6 +26,41 @@ def inner(
return inner


def safe_histogram(
    values: np.ndarray,
    bins: Union[int, str, np.ndarray] = "auto",
    weights: Optional[np.ndarray] = None,
    density: bool = False,
) -> Tuple[np.ndarray, np.ndarray]:
    """Compute ``np.histogram`` with fallbacks for degenerate data ranges.

    Guards against
    ``ValueError: Too many bins for data range. Cannot create N finite-sized bins.``
    by retrying with progressively coarser binning:

    1. the requested ``bins``;
    2. ``bins="auto"`` (unweighted data only);
    3. a single bin spanning the finite values.

    Args:
        values: Input data; coerced to an ndarray so list-like input also
            works in the fallback path.
        bins: Bin count, bin-selection strategy name, or explicit bin edges.
        weights: Optional per-value weights, forwarded to ``np.histogram``.
        density: Forwarded to ``np.histogram``.

    Returns:
        A ``(counts, bin_edges)`` tuple. Both arrays are empty when the
        fallback path finds no finite values.

    Raises:
        ValueError: Any ``ValueError`` from the first attempt other than the
            "Too many bins for data range" failure is re-raised unchanged.
    """
    values = np.asarray(values)

    try:
        return np.histogram(values, bins=bins, weights=weights, density=density)
    except ValueError as exc:
        # Only the "too many bins" failure is recoverable here.
        if "Too many bins for data range" not in str(exc):
            raise

    # NumPy refuses automatic bin estimation for weighted data (it raises
    # TypeError), so the "auto" retry is only attempted without weights.
    if weights is None:
        try:
            return np.histogram(values, bins="auto", density=density)
        except ValueError:
            # "auto" failed as well; fall through to the single-bin fallback.
            pass

    finite = values[np.isfinite(values)]
    if finite.size == 0:
        return np.array([]), np.array([])

    vmin = float(np.min(finite))
    vmax = float(np.max(finite))
    if vmin == vmax:
        # Constant data: widen the single bin so it has a non-zero width.
        eps = 0.5 if vmin == 0 else abs(vmin) * 0.5
        bin_edges = np.array([vmin - eps, vmin + eps])
    else:
        bin_edges = np.array([vmin, vmax])

    return np.histogram(values, bins=bin_edges, weights=weights, density=density)


def histogram_compute(
config: Settings,
finite_values: np.ndarray,
Expand All @@ -36,27 +71,75 @@ def histogram_compute(
stats = {}
if len(finite_values) == 0:
return {name: []}

hist_config = config.plot.histogram
bins_arg = "auto" if hist_config.bins == 0 else min(hist_config.bins, n_unique)
bins = np.histogram_bin_edges(finite_values, bins=bins_arg)
if len(bins) > hist_config.max_bins:
bins = np.histogram_bin_edges(finite_values, bins=hist_config.max_bins)
weights = weights if weights and len(weights) == hist_config.max_bins else None

stats[name] = np.histogram(
finite_values, bins=bins, weights=weights, density=config.plot.histogram.density

# Compute data range
finite = finite_values[np.isfinite(finite_values)]
vmin = float(np.min(finite))
vmax = float(np.max(finite))
data_range = vmax - vmin

# Choose bins based on the observed data values
if data_range == 0:
eps = 0.5 if vmin == 0 else abs(vmin) * 0.1
bins = np.array([vmin - eps, vmin + eps])
else:
requested_bins = hist_config.bins if hist_config.bins > 0 else "auto"

if isinstance(requested_bins, int):
safe_bins = min(requested_bins, n_unique, hist_config.max_bins)

safe_bins = max(1, safe_bins)

bins = np.linspace(vmin, vmax, safe_bins + 1)
else:
bins = np.histogram_bin_edges(finite_values, bins="auto")
if len(bins) - 1 > hist_config.max_bins:
bins = np.linspace(vmin, vmax, hist_config.max_bins + 1)

hist = np.histogram(
finite_values,
bins=bins,
weights=weights,
density=hist_config.density,
)

stats[name] = hist
return stats


def chi_square(
    values: Optional[np.ndarray] = None,
    histogram: Optional[np.ndarray] = None,
) -> dict:
    """Run a chi-square test on a histogram, building it from ``values`` if needed.

    Args:
        values: Raw observations. Only used when ``histogram`` is ``None``.
        histogram: Precomputed bin counts. Takes precedence over ``values``.

    Returns:
        A dict with ``"statistic"`` and ``"pvalue"`` keys. Both are ``0`` when
        there is nothing to test: no input at all, no finite values in the
        fallback path, or an empty / all-zero histogram.
    """
    empty_result = {"statistic": 0, "pvalue": 0}

    # Case 1: histogram not passed -> compute it from the raw values.
    if histogram is None:
        if values is None:
            return empty_result

        # Coerce so the boolean-mask fallback below also works on list input.
        values = np.asarray(values)

        try:
            bins = np.histogram_bin_edges(values, bins="auto")
        except ValueError:
            # Automatic binning failed (e.g. non-finite range or too many bins
            # for the data range): fall back to one bin covering min..max.
            finite = values[np.isfinite(values)]
            if finite.size == 0:
                return empty_result

            vmin = float(finite.min())
            vmax = float(finite.max())
            if vmin == vmax:
                bins = np.array([vmin - 0.5, vmin + 0.5])
            else:
                bins = np.array([vmin, vmax])

        histogram, _ = np.histogram(values, bins=bins)

    # Case 2: histogram exists but is empty. Coerce first so list-like
    # histograms support ``.size`` / ``.sum()``.
    histogram = np.asarray(histogram)
    if histogram.size == 0 or histogram.sum() == 0:
        return empty_result

    return dict(chisquare(histogram)._asdict())


Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
{% if tabs | length > 0 %}
{% if oss %}
<p class="text-body-secondary text-end">Brought to you by <a href="https://ydata.ai/?utm_source=opensource&utm_medium=ydataprofiling&utm_campaign=report">YData</a></p>
{% endif %}
<div class="row item {% if classes %}{{ classes }}{% endif %}" {% if id %} id="{{ id }}"{% endif %}>
<ul class="nav nav-tabs tab-nav" role="tablist">
{% for tab in tabs %}
Expand Down
68 changes: 0 additions & 68 deletions src/ydata_profiling/utils/information.py

This file was deleted.

13 changes: 9 additions & 4 deletions tests/unit/test_console.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
import os

import pytest
import requests

from ydata_profiling.controller import console
from ydata_profiling.utils.paths import get_config

NASA_URL = "https://data.nasa.gov/docs/legacy/meteorite_landings/Meteorite_Landings.csv"


@pytest.fixture
def console_data(get_data_file):
    """Provide the meteorites dataset, skipping dependent tests if it cannot be fetched.

    Requests ``meteorites.csv`` from ``NASA_URL`` through the ``get_data_file``
    fixture and returns whatever that fixture returns. Any failure results in
    a skip rather than an error.
    # NOTE(review): presumably get_data_file downloads and caches the file and
    # returns its local path -- confirm against the fixture definition.
    """
    try:
        return get_data_file("meteorites.csv", NASA_URL)
    except requests.RequestException as e:
        pytest.skip(f"Skipping console tests: NASA dataset unavailable ({e})")
    except Exception as e:
        # Deliberately broad: an unavailable dataset, whatever the cause,
        # should skip the console tests instead of failing them.
        pytest.skip(f"Skipping console tests: cannot fetch meteorites.csv ({e})")


@pytest.mark.skipif(os.name == "nt", reason="multiprocessing+pytest broken on Windows")
Expand Down