Skip to content

Commit d83e1a1

Browse files
Pkcha
authored and committed
feat: initial release
1 parent 3f15815 commit d83e1a1

11 files changed

Lines changed: 115 additions & 115 deletions

File tree

src/ydata_profiling/model/alerts.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
from ydata_profiling.utils.styles import get_alert_styles
1313

1414

15-
def _fmt_percent(value: float, edge_cases: bool = True) -> str:
16-
"""Format a ratio as a percentage (internal copy to avoid circular imports).
15+
def fmt_percent(value: float, edge_cases: bool = True) -> str:
16+
"""Format a ratio as a percentage.
1717
1818
Args:
1919
edge_cases: Check for edge cases?
@@ -209,7 +209,7 @@ def __init__(
209209

210210
def _get_description(self) -> str:
211211
if self.values is not None:
212-
return f"Dataset has {self.values['n_duplicates']} ({_fmt_percent(self.values['p_duplicates'])}) duplicate rows"
212+
return f"Dataset has {self.values['n_duplicates']} ({fmt_percent(self.values['p_duplicates'])}) duplicate rows"
213213
else:
214214
return "Dataset has no duplicated rows"
215215

@@ -231,7 +231,7 @@ def __init__(
231231

232232
def _get_description(self) -> str:
233233
if self.values is not None:
234-
return f"Dataset has {self.values['n_near_dups']} ({_fmt_percent(self.values['p_near_dups'])}) near duplicate rows"
234+
return f"Dataset has {self.values['n_near_dups']} ({fmt_percent(self.values['p_near_dups'])}) near duplicate rows"
235235
else:
236236
return "Dataset has no near duplicated rows"
237237

@@ -272,7 +272,7 @@ def __init__(
272272

273273
def _get_description(self) -> str:
274274
if self.values is not None:
275-
return f"[{self.column_name}] has {self.values['n_distinct']:} ({_fmt_percent(self.values['p_distinct'])}) distinct values"
275+
return f"[{self.column_name}] has {self.values['n_distinct']:} ({fmt_percent(self.values['p_distinct'])}) distinct values"
276276
else:
277277
return f"[{self.column_name}] has a high cardinality"
278278

@@ -294,7 +294,7 @@ def __init__(
294294

295295
def _get_description(self) -> str:
296296
if self.values is not None:
297-
return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {_fmt_percent(self.values['p_fuzzy_vals'])} per category"
297+
return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {fmt_percent(self.values['p_fuzzy_vals'])} per category"
298298
else:
299299
return f"[{self.column_name}] no dirty categories values."
300300

@@ -365,7 +365,7 @@ def __init__(
365365

366366
def _get_description(self) -> str:
367367
if self.values is not None:
368-
return f"[{self.column_name}] has {self.values['n_infinite']} ({_fmt_percent(self.values['p_infinite'])}) infinite values"
368+
return f"[{self.column_name}] has {self.values['n_infinite']} ({fmt_percent(self.values['p_infinite'])}) infinite values"
369369
else:
370370
return f"[{self.column_name}] has infinite values"
371371

@@ -387,7 +387,7 @@ def __init__(
387387

388388
def _get_description(self) -> str:
389389
if self.values is not None:
390-
return f"[{self.column_name}] {self.values['n_missing']} ({_fmt_percent(self.values['p_missing'])}) missing values"
390+
return f"[{self.column_name}] {self.values['n_missing']} ({fmt_percent(self.values['p_missing'])}) missing values"
391391
else:
392392
return f"[{self.column_name}] has missing values"
393393

@@ -541,7 +541,7 @@ def __init__(
541541

542542
def _get_description(self) -> str:
543543
if self.values is not None:
544-
return f"[{self.column_name}] has {self.values['n_zeros']} ({_fmt_percent(self.values['p_zeros'])}) zeros"
544+
return f"[{self.column_name}] has {self.values['n_zeros']} ({fmt_percent(self.values['p_zeros'])}) zeros"
545545
else:
546546
return f"[{self.column_name}] has predominantly zeros"
547547

src/ydata_profiling/model/correlations.py

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3,43 +3,25 @@
33
"""Correlations between variables."""
44

55
import warnings
6-
from typing import Dict, List, Optional, Sized, no_type_check
6+
from typing import Dict, List, Optional, Sized
77

88
import numpy as np
99
import pandas as pd
1010

1111
from ydata_profiling.config import Settings
12+
from ydata_profiling.utils.backend import BaseBackend
1213

1314
try:
1415
from pandas.core.base import DataError
1516
except ImportError:
1617
from pandas.errors import DataError
1718

1819

19-
class CorrelationBackend:
20+
class CorrelationBackend(BaseBackend):
2021
"""Helper class to select and cache the appropriate correlation backend (Pandas or Spark)."""
2122

22-
@no_type_check
23-
def __init__(self, df: Sized):
24-
"""Determine backend once and store it for all correlation computations."""
25-
if isinstance(df, pd.DataFrame):
26-
from ydata_profiling.model.pandas import (
27-
correlations_pandas as correlation_backend, # type: ignore
28-
)
29-
else:
30-
from ydata_profiling.model.spark import (
31-
correlations_spark as correlation_backend, # type: ignore
32-
)
33-
34-
self.backend = correlation_backend
35-
36-
def get_method(self, method_name: str): # noqa: ANN201
37-
"""Retrieve the appropriate correlation method class from the backend."""
38-
if hasattr(self.backend, method_name):
39-
return getattr(self.backend, method_name)
40-
raise AttributeError(
41-
f"Correlation method '{method_name}' is not available in the backend."
42-
)
23+
_pandas_module = "ydata_profiling.model.pandas.correlations_pandas"
24+
_spark_module = "ydata_profiling.model.spark.correlations_spark"
4325

4426

4527
class Correlation:

src/ydata_profiling/model/missing.py

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,17 @@
1-
import importlib
21
import warnings
3-
from typing import Any, Callable, Dict, Optional, Sized
2+
from typing import Any, Dict, Optional, Sized
43

54
import pandas as pd
65

76
from ydata_profiling.config import Settings
7+
from ydata_profiling.utils.backend import BaseBackend
88

99

10-
class MissingDataBackend:
10+
class MissingDataBackend(BaseBackend):
1111
"""Helper class to select and cache the appropriate missing-data backend (Pandas or Spark)."""
1212

13-
def __init__(self, df: Sized):
14-
"""Determine backend once and store it for all missing-data computations."""
15-
if isinstance(df, pd.DataFrame):
16-
self.backend_module = "ydata_profiling.model.pandas.missing_pandas"
17-
else:
18-
self.backend_module = "ydata_profiling.model.spark.missing_spark"
19-
20-
self.module = importlib.import_module(self.backend_module)
21-
22-
def get_method(self, method_name: str) -> Callable:
23-
"""Retrieve the appropriate missing-data function from the backend module."""
24-
try:
25-
return getattr(self.module, method_name)
26-
except AttributeError as ex:
27-
raise AttributeError(
28-
f"Missing-data function '{method_name}' is not available in {self.backend_module}."
29-
) from ex
13+
_pandas_module = "ydata_profiling.model.pandas.missing_pandas"
14+
_spark_module = "ydata_profiling.model.spark.missing_spark"
3015

3116

3217
class MissingData:
Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
1-
from collections import Counter
2-
31
import pandas as pd
42

53
from ydata_profiling.config import Settings
6-
from ydata_profiling.model.table import get_table_stats
4+
from ydata_profiling.model.table import compute_common_table_stats, get_table_stats
75

86

97
@get_table_stats.register
@@ -21,36 +19,18 @@ def pandas_get_table_stats(
2119
A dictionary that contains the table statistics.
2220
"""
2321
n = len(df) if not df.empty else 0
22+
n_var = len(df.columns)
2423

2524
memory_size = df.memory_usage(deep=config.memory_deep).sum()
2625
record_size = float(memory_size) / n if n > 0 else 0
2726

2827
table_stats = {
2928
"n": n,
30-
"n_var": len(df.columns),
29+
"n_var": n_var,
3130
"memory_size": memory_size,
3231
"record_size": record_size,
33-
"n_cells_missing": 0,
34-
"n_vars_with_missing": 0,
35-
"n_vars_all_missing": 0,
3632
}
3733

38-
for series_summary in variable_stats.values():
39-
if "n_missing" in series_summary and series_summary["n_missing"] > 0:
40-
table_stats["n_vars_with_missing"] += 1
41-
table_stats["n_cells_missing"] += series_summary["n_missing"]
42-
if series_summary["n_missing"] == n:
43-
table_stats["n_vars_all_missing"] += 1
44-
45-
table_stats["p_cells_missing"] = (
46-
table_stats["n_cells_missing"] / (table_stats["n"] * table_stats["n_var"])
47-
if table_stats["n"] > 0 and table_stats["n_var"] > 0
48-
else 0
49-
)
50-
51-
# Variable type counts
52-
table_stats.update(
53-
{"types": dict(Counter([v["type"] for v in variable_stats.values()]))}
54-
)
34+
table_stats.update(compute_common_table_stats(n, n_var, variable_stats))
5535

5636
return table_stats
Lines changed: 4 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
1-
from collections import Counter
2-
31
from pyspark.sql import DataFrame
42

53
from ydata_profiling.config import Settings
6-
from ydata_profiling.model.table import get_table_stats
4+
from ydata_profiling.model.table import compute_common_table_stats, get_table_stats
75

86

97
@get_table_stats.register
@@ -21,38 +19,9 @@ def get_table_stats_spark(
2119
A dictionary that contains the table statistics.
2220
"""
2321
n = df.count()
22+
n_var = len(df.columns)
2423

25-
result = {"n": n, "n_var": len(df.columns)}
26-
27-
table_stats = {
28-
"n_cells_missing": 0,
29-
"n_vars_with_missing": 0,
30-
"n_vars_all_missing": 0,
31-
}
32-
33-
for series_summary in variable_stats.values():
34-
if "n_missing" in series_summary and series_summary["n_missing"] > 0:
35-
table_stats["n_vars_with_missing"] += 1
36-
table_stats["n_cells_missing"] += series_summary["n_missing"]
37-
if series_summary["n_missing"] == n:
38-
table_stats["n_vars_all_missing"] += 1
39-
40-
# without this check we'll get a div by zero error
41-
if result["n"] * result["n_var"] > 0:
42-
table_stats["p_cells_missing"] = (
43-
table_stats["n_cells_missing"] / (result["n"] * result["n_var"])
44-
if result["n"] > 0
45-
else 0
46-
)
47-
else:
48-
table_stats["p_cells_missing"] = 0
49-
50-
result["p_cells_missing"] = table_stats["p_cells_missing"]
51-
result["n_cells_missing"] = table_stats["n_cells_missing"]
52-
result["n_vars_all_missing"] = table_stats["n_vars_all_missing"]
53-
result["n_vars_with_missing"] = table_stats["n_vars_with_missing"]
54-
55-
# Variable type counts
56-
result["types"] = dict(Counter([v["type"] for v in variable_stats.values()]))
24+
result = {"n": n, "n_var": n_var}
25+
result.update(compute_common_table_stats(n, n_var, variable_stats))
5726

5827
return result

src/ydata_profiling/model/spark/timeseries_index_spark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from ydata_profiling.config import Settings
55

66

7-
def spark_get_time_index_description_spark(
7+
def get_time_index_description_spark(
88
config: Settings,
99
df: DataFrame,
1010
table_stats: dict,

src/ydata_profiling/model/summarizer.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,8 @@ def summarize(
5050
return self.handle(str(dtype), config, series, {"type": str(dtype)})
5151

5252

53-
# Revisit this with the correct support for Spark as well.
5453
class ProfilingSummarizer(BaseSummarizer):
55-
"""A summarizer for Pandas DataFrames."""
54+
"""A summarizer supporting both Pandas and Spark DataFrames."""
5655

5756
def __init__(self, typeset: VisionsTypeset, use_spark: bool = False):
5857
self.use_spark = use_spark and is_pyspark_installed()

src/ydata_profiling/model/summary_algorithms.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,21 @@
1111
T = TypeVar("T")
1212

1313

14+
def func_nullable_series_contains(fn: Callable) -> Callable:
15+
@functools.wraps(fn)
16+
def inner(
17+
config: Settings, series: pd.Series, state: dict, *args, **kwargs
18+
) -> bool:
19+
if series.hasnans:
20+
series = series.dropna()
21+
if series.empty:
22+
return False
23+
24+
return fn(config, series, state, *args, **kwargs)
25+
26+
return inner
27+
28+
1429
def safe_histogram(
1530
values: np.ndarray,
1631
bins: Union[int, str, np.ndarray] = "auto",

src/ydata_profiling/model/table.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,47 @@
1+
from collections import Counter
12
from typing import Any
23

34
from multimethod import multimethod
45

56
from ydata_profiling.config import Settings
67

78

9+
def compute_common_table_stats(
10+
n: int, n_var: int, variable_stats: dict
11+
) -> dict:
12+
"""Compute common table statistics shared by Pandas and Spark backends.
13+
14+
Args:
15+
n: Number of rows in the DataFrame
16+
n_var: Number of columns (variables)
17+
variable_stats: Previously calculated statistic on the DataFrame series
18+
19+
Returns:
20+
A dictionary with common table statistics: missing values counts, percentages, and type counts
21+
"""
22+
table_stats = {
23+
"n_cells_missing": 0,
24+
"n_vars_with_missing": 0,
25+
"n_vars_all_missing": 0,
26+
}
27+
28+
for series_summary in variable_stats.values():
29+
if "n_missing" in series_summary and series_summary["n_missing"] > 0:
30+
table_stats["n_vars_with_missing"] += 1
31+
table_stats["n_cells_missing"] += series_summary["n_missing"]
32+
if series_summary["n_missing"] == n:
33+
table_stats["n_vars_all_missing"] += 1
34+
35+
total_cells = n * n_var
36+
table_stats["p_cells_missing"] = (
37+
table_stats["n_cells_missing"] / total_cells if total_cells > 0 else 0
38+
)
39+
40+
table_stats["types"] = dict(Counter([v["type"] for v in variable_stats.values()]))
41+
42+
return table_stats
43+
44+
845
@multimethod
946
def get_table_stats(config: Settings, df: Any, variable_stats: dict) -> dict:
1047
raise NotImplementedError()

src/ydata_profiling/report/structure/variables/render_common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ def render_common(config: Settings, summary: dict) -> dict:
1010
n_freq_table_max = config.n_freq_table_max
1111

1212
template_variables = {
13+
# TODO: with nan
1314
"freq_table_rows": freq_table(
1415
freqtable=summary["value_counts_without_nan"],
1516
n=summary["n"],

0 commit comments

Comments (0)