Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Slim Python 3.10 base keeps the image small; build tools are added below only
# for compiling wheels at install time.
FROM python:3.10-slim

WORKDIR /app

# build-essential is required to compile any C extensions pulled in by pip;
# removing the apt lists afterwards keeps this layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
&& rm -rf /var/lib/apt/lists/*

# NOTE(review): copying the whole context before installing dependencies means
# any source change invalidates the pip layer cache — confirm this is acceptable
# for this image's build frequency.
COPY . .

# Install the project plus Jupyter in one layer.
# NOTE(review): setuptools is pinned twice — presumably the second pin restores
# the <80 constraint in case `pip install .` upgraded setuptools; confirm this
# double install is intentional before simplifying.
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \
pip install --no-cache-dir . && \
pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \
pip install --no-cache-dir jupyter

# Jupyter's default notebook port.
EXPOSE 8888

# --allow-root is needed because the container runs as root by default.
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]


28 changes: 5 additions & 23 deletions src/ydata_profiling/model/correlations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,43 +3,25 @@
"""Correlations between variables."""

import warnings
from typing import Dict, List, Optional, Sized, no_type_check
from typing import Dict, List, Optional, Sized

import numpy as np
import pandas as pd

from ydata_profiling.config import Settings
from ydata_profiling.utils.backend import BaseBackend

try:
from pandas.core.base import DataError
except ImportError:
from pandas.errors import DataError


class CorrelationBackend:
class CorrelationBackend(BaseBackend):
"""Helper class to select and cache the appropriate correlation backend (Pandas or Spark)."""

@no_type_check
def __init__(self, df: Sized):
"""Determine backend once and store it for all correlation computations."""
if isinstance(df, pd.DataFrame):
from ydata_profiling.model.pandas import (
correlations_pandas as correlation_backend, # type: ignore
)
else:
from ydata_profiling.model.spark import (
correlations_spark as correlation_backend, # type: ignore
)

self.backend = correlation_backend

def get_method(self, method_name: str): # noqa: ANN201
"""Retrieve the appropriate correlation method class from the backend."""
if hasattr(self.backend, method_name):
return getattr(self.backend, method_name)
raise AttributeError(
f"Correlation method '{method_name}' is not available in the backend."
)
_pandas_module = "ydata_profiling.model.pandas.correlations_pandas"
_spark_module = "ydata_profiling.model.spark.correlations_spark"


class Correlation:
Expand Down
25 changes: 5 additions & 20 deletions src/ydata_profiling/model/missing.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,17 @@
import importlib
import warnings
from typing import Any, Callable, Dict, Optional, Sized
from typing import Any, Dict, Optional, Sized

import pandas as pd

from ydata_profiling.config import Settings
from ydata_profiling.utils.backend import BaseBackend


class MissingDataBackend:
class MissingDataBackend(BaseBackend):
"""Helper class to select and cache the appropriate missing-data backend (Pandas or Spark)."""

def __init__(self, df: Sized):
"""Determine backend once and store it for all missing-data computations."""
if isinstance(df, pd.DataFrame):
self.backend_module = "ydata_profiling.model.pandas.missing_pandas"
else:
self.backend_module = "ydata_profiling.model.spark.missing_spark"

self.module = importlib.import_module(self.backend_module)

def get_method(self, method_name: str) -> Callable:
"""Retrieve the appropriate missing-data function from the backend module."""
try:
return getattr(self.module, method_name)
except AttributeError as ex:
raise AttributeError(
f"Missing-data function '{method_name}' is not available in {self.backend_module}."
) from ex
_pandas_module = "ydata_profiling.model.pandas.missing_pandas"
_spark_module = "ydata_profiling.model.spark.missing_spark"


class MissingData:
Expand Down
28 changes: 4 additions & 24 deletions src/ydata_profiling/model/pandas/table_pandas.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from collections import Counter

import pandas as pd

from ydata_profiling.config import Settings
from ydata_profiling.model.table import get_table_stats
from ydata_profiling.model.table import compute_common_table_stats, get_table_stats


@get_table_stats.register
Expand All @@ -21,36 +19,18 @@ def pandas_get_table_stats(
A dictionary that contains the table statistics.
"""
n = len(df) if not df.empty else 0
n_var = len(df.columns)

memory_size = df.memory_usage(deep=config.memory_deep).sum()
record_size = float(memory_size) / n if n > 0 else 0

table_stats = {
"n": n,
"n_var": len(df.columns),
"n_var": n_var,
"memory_size": memory_size,
"record_size": record_size,
"n_cells_missing": 0,
"n_vars_with_missing": 0,
"n_vars_all_missing": 0,
}

for series_summary in variable_stats.values():
if "n_missing" in series_summary and series_summary["n_missing"] > 0:
table_stats["n_vars_with_missing"] += 1
table_stats["n_cells_missing"] += series_summary["n_missing"]
if series_summary["n_missing"] == n:
table_stats["n_vars_all_missing"] += 1

table_stats["p_cells_missing"] = (
table_stats["n_cells_missing"] / (table_stats["n"] * table_stats["n_var"])
if table_stats["n"] > 0 and table_stats["n_var"] > 0
else 0
)

# Variable type counts
table_stats.update(
{"types": dict(Counter([v["type"] for v in variable_stats.values()]))}
)
table_stats.update(compute_common_table_stats(n, n_var, variable_stats))

return table_stats
39 changes: 4 additions & 35 deletions src/ydata_profiling/model/spark/table_spark.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from collections import Counter

from pyspark.sql import DataFrame

from ydata_profiling.config import Settings
from ydata_profiling.model.table import get_table_stats
from ydata_profiling.model.table import compute_common_table_stats, get_table_stats


@get_table_stats.register
Expand All @@ -21,38 +19,9 @@ def get_table_stats_spark(
A dictionary that contains the table statistics.
"""
n = df.count()
n_var = len(df.columns)

result = {"n": n, "n_var": len(df.columns)}

table_stats = {
"n_cells_missing": 0,
"n_vars_with_missing": 0,
"n_vars_all_missing": 0,
}

for series_summary in variable_stats.values():
if "n_missing" in series_summary and series_summary["n_missing"] > 0:
table_stats["n_vars_with_missing"] += 1
table_stats["n_cells_missing"] += series_summary["n_missing"]
if series_summary["n_missing"] == n:
table_stats["n_vars_all_missing"] += 1

# without this check we'll get a div by zero error
if result["n"] * result["n_var"] > 0:
table_stats["p_cells_missing"] = (
table_stats["n_cells_missing"] / (result["n"] * result["n_var"])
if result["n"] > 0
else 0
)
else:
table_stats["p_cells_missing"] = 0

result["p_cells_missing"] = table_stats["p_cells_missing"]
result["n_cells_missing"] = table_stats["n_cells_missing"]
result["n_vars_all_missing"] = table_stats["n_vars_all_missing"]
result["n_vars_with_missing"] = table_stats["n_vars_with_missing"]

# Variable type counts
result["types"] = dict(Counter([v["type"] for v in variable_stats.values()]))
result = {"n": n, "n_var": n_var}
result.update(compute_common_table_stats(n, n_var, variable_stats))

return result
2 changes: 1 addition & 1 deletion src/ydata_profiling/model/spark/timeseries_index_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from ydata_profiling.config import Settings


def spark_get_time_index_description_spark(
def get_time_index_description_spark(
config: Settings,
df: DataFrame,
table_stats: dict,
Expand Down
3 changes: 1 addition & 2 deletions src/ydata_profiling/model/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,8 @@ def summarize(
return self.handle(str(dtype), config, series, {"type": str(dtype)})


# Revisit this with the correct support for Spark as well.
class ProfilingSummarizer(BaseSummarizer):
"""A summarizer for Pandas DataFrames."""
"""A summarizer supporting both Pandas and Spark DataFrames."""

def __init__(self, typeset: VisionsTypeset, use_spark: bool = False):
self.use_spark = use_spark and is_pyspark_installed()
Expand Down
37 changes: 37 additions & 0 deletions src/ydata_profiling/model/table.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,47 @@
from collections import Counter
from typing import Any

from multimethod import multimethod

from ydata_profiling.config import Settings


def compute_common_table_stats(
    n: int, n_var: int, variable_stats: dict
) -> dict:
    """Compute common table statistics shared by Pandas and Spark backends.

    Args:
        n: Number of rows in the DataFrame
        n_var: Number of columns (variables)
        variable_stats: Previously calculated statistics on the DataFrame series

    Returns:
        A dictionary with common table statistics: missing-value counts,
        percentages, and per-type variable counts
    """
    n_cells_missing = 0
    n_vars_with_missing = 0
    n_vars_all_missing = 0

    # Tally missing-value statistics; series without an "n_missing" entry
    # (or with zero missing) contribute nothing.
    for summary in variable_stats.values():
        missing = summary.get("n_missing", 0)
        if missing > 0:
            n_vars_with_missing += 1
            n_cells_missing += missing
            # A variable is "all missing" when every one of its n rows is missing.
            if missing == n:
                n_vars_all_missing += 1

    # Guard against division by zero for empty frames.
    total_cells = n * n_var
    p_cells_missing = n_cells_missing / total_cells if total_cells > 0 else 0

    return {
        "n_cells_missing": n_cells_missing,
        "n_vars_with_missing": n_vars_with_missing,
        "n_vars_all_missing": n_vars_all_missing,
        "p_cells_missing": p_cells_missing,
        "types": dict(Counter(v["type"] for v in variable_stats.values())),
    }


# Generic dispatch point: backend modules register concrete implementations
# (pandas/spark) via @get_table_stats.register; calling the bare multimethod
# with an unregistered df type raises NotImplementedError.
@multimethod
def get_table_stats(config: Settings, df: Any, variable_stats: dict) -> dict:
    raise NotImplementedError()
34 changes: 33 additions & 1 deletion src/ydata_profiling/utils/backend.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,41 @@
"""
File with a function to check the backend being used
File with backend utilities and helper functions to check the backend being used
"""
import importlib
from typing import Callable, Optional, Sized, Union

import pandas as pd


def is_pyspark_installed() -> bool:
    """Return True when PySpark is importable, without actually importing it.

    Uses ``find_spec`` so the (heavy) pyspark package is never loaded just
    to answer the question.

    Returns:
        True if a "pyspark" distribution is discoverable on the import path.
    """
    # `importlib.util` is a submodule that a bare `import importlib` does not
    # reliably bind; import it explicitly to avoid a latent AttributeError.
    import importlib.util

    return importlib.util.find_spec("pyspark") is not None


class BaseBackend:
    """Base helper class to select and cache the appropriate backend (Pandas or Spark)."""

    # Subclasses configure the dotted module path for each backend.
    _pandas_module: Optional[str] = None
    _spark_module: Optional[str] = None

    def __init__(self, df: Union[pd.DataFrame, Sized]):
        """Pick and import the backend module once, based on the frame's type."""
        use_pandas = isinstance(df, pd.DataFrame)
        module_path = self._pandas_module if use_pandas else self._spark_module

        if module_path is None:
            raise ValueError("Backend module path not configured")

        self.module_path = module_path
        # Cache the imported module so every get_method call is a cheap getattr.
        self.module = importlib.import_module(module_path)

    def get_method(self, method_name: str) -> Callable:
        """Retrieve the appropriate function from the backend module."""
        try:
            return getattr(self.module, method_name)
        except AttributeError as ex:
            raise AttributeError(
                f"Function '{method_name}' is not available in {self.module_path}."
            ) from ex