Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Image that installs this package and serves it through a Jupyter Notebook.
FROM python:3.10-slim

WORKDIR /app

# build-essential provides compilers for dependencies that ship only as
# sdists with C extensions; the apt list cache is removed to keep the
# image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy the whole build context (package source) into the image.
COPY . .

# NOTE(review): setuptools is pinned twice — the second pin re-applies the
# constraint in case `pip install .` upgraded/replaced setuptools while
# resolving dependencies. Confirm that is the intent before collapsing the
# two install lines.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \
    pip install --no-cache-dir . && \
    pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \
    pip install --no-cache-dir jupyter

# Default Jupyter HTTP port.
EXPOSE 8888

# --ip=0.0.0.0 makes the server reachable from outside the container;
# --allow-root is required because the container runs as root.
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]


142 changes: 66 additions & 76 deletions src/ydata_profiling/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,7 @@
import yaml
from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr


def _merge_dictionaries(dict1: dict, dict2: dict) -> dict:
"""
Recursive merge dictionaries.

:param dict1: Base dictionary to merge.
:param dict2: Dictionary to merge on top of base dictionary.
:return: Merged dictionary
"""
for key, val in dict1.items():
if isinstance(val, dict):
dict2_node = dict2.setdefault(key, {})
_merge_dictionaries(val, dict2_node)
else:
if key not in dict2:
dict2[key] = val

return dict2
from ydata_profiling.utils.common import update


class Dataset(BaseModel):
Expand Down Expand Up @@ -355,60 +338,7 @@ class Config:
html: Html = Html()
notebook: Notebook = Notebook()

def update(self, updates: dict) -> "Settings":
update = _merge_dictionaries(self.dict(), updates)
return self.parse_obj(self.copy(update=update))

@staticmethod
def from_file(config_file: Union[Path, str]) -> "Settings":
"""Create a Settings object from a yaml file.

Args:
config_file: yaml file path
Returns:
Settings
"""
with open(config_file) as f:
data = yaml.safe_load(f)

return Settings.parse_obj(data)


class SparkSettings(Settings):
"""
Setting class with the standard report configuration for Spark DataFrames
All the supported analysis are set to true
"""

vars: Univariate = Univariate()

vars.num.low_categorical_threshold = 0

infer_dtypes: bool = False

correlations: Dict[str, Correlation] = {
"spearman": Correlation(key="spearman", calculate=True),
"pearson": Correlation(key="pearson", calculate=True),
}

correlation_table: bool = True

interactions: Interactions = Interactions()
interactions.continuous = False

missing_diagrams: Dict[str, bool] = {
"bar": False,
"matrix": False,
"dendrogram": False,
"heatmap": False,
}
samples: Samples = Samples()
samples.tail = 0
samples.random = 0


class Config:
arg_groups: Dict[str, Any] = {
_arg_groups: Dict[str, Any] = {
"sensitive": {
"samples": None,
"duplicates": None,
Expand Down Expand Up @@ -475,8 +405,8 @@ class Config:

@staticmethod
def get_arg_groups(key: str) -> dict:
kwargs = Config.arg_groups[key]
shorthand_args, _ = Config.shorthands(kwargs, split=False)
kwargs = Settings._arg_groups[key]
shorthand_args, _ = Settings.shorthands(kwargs, split=False)
return shorthand_args

@staticmethod
Expand All @@ -485,12 +415,72 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]:
if not split:
shorthand_args = kwargs
for key, value in list(kwargs.items()):
if value is None and key in Config._shorthands:
shorthand_args[key] = Config._shorthands[key]
if value is None and key in Settings._shorthands:
shorthand_args[key] = Settings._shorthands[key]
if split:
del kwargs[key]

if split:
return shorthand_args, kwargs
else:
return shorthand_args, {}

def update(self, updates: dict) -> "Settings":
    """Return a new ``Settings`` with *updates* deep-merged on top.

    ``update`` on the second line is the module-level helper imported from
    ``ydata_profiling.utils.common`` (not this method): it merges *updates*
    into a copy of the current settings dict.

    :param updates: Partial settings dictionary to apply.
    :return: A new, re-validated ``Settings`` instance.
    """
    merged = update(self.dict().copy(), updates)
    # pydantic v1: copy(update=...) applies the merged values without
    # validation; parse_obj then re-validates the resulting model.
    return self.parse_obj(self.copy(update=merged))

@staticmethod
def from_file(config_file: Union[Path, str]) -> "Settings":
    """Create a Settings object from a yaml file.

    Args:
        config_file: yaml file path
    Returns:
        Settings
    """
    with open(config_file) as settings_file:
        raw_settings = yaml.safe_load(settings_file)
    return Settings.parse_obj(raw_settings)


class SparkSettings(Settings):
    """
    Setting class with the standard report configuration for Spark DataFrames
    All the supported analysis are set to true
    """

    # Variable-level analysis settings.
    vars: Univariate = Univariate()

    # NOTE(review): this runs at class-definition time and mutates the
    # Univariate instance created just above; it disables the "treat small
    # numeric cardinality as categorical" heuristic. Confirm pydantic v1
    # keeps this mutated instance as the field default.
    vars.num.low_categorical_threshold = 0

    # Spark backend does not infer dtypes.
    infer_dtypes: bool = False

    # Only Spearman and Pearson correlations are computed on Spark.
    correlations: Dict[str, Correlation] = {
        "spearman": Correlation(key="spearman", calculate=True),
        "pearson": Correlation(key="pearson", calculate=True),
    }

    correlation_table: bool = True

    # Continuous-variable interaction plots are disabled for Spark
    # (same class-body mutation pattern as `vars` above).
    interactions: Interactions = Interactions()
    interactions.continuous = False

    # None of the missing-value diagrams are supported on Spark.
    missing_diagrams: Dict[str, bool] = {
        "bar": False,
        "matrix": False,
        "dendrogram": False,
        "heatmap": False,
    }
    # Only head samples are shown: tail and random sampling are turned off.
    samples: Samples = Samples()
    samples.tail = 0
    samples.random = 0


class Config(Settings):
    """
    Deprecated: Use Settings instead.
    Backward compatibility alias for Settings class.
    """

    # NOTE(review): kept only so external code importing ``Config`` keeps
    # working; it adds no behavior over Settings.
    pass
20 changes: 2 additions & 18 deletions src/ydata_profiling/model/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,22 +60,6 @@ def handle(self, dtype: str, *args, **kwargs) -> dict:
return summary


def get_render_map() -> Dict[str, Callable]:
import ydata_profiling.report.structure.variables as render_algorithms
from ydata_profiling.report.structure import get_render_map

render_map = {
"Boolean": render_algorithms.render_boolean,
"Numeric": render_algorithms.render_real,
"Complex": render_algorithms.render_complex,
"Text": render_algorithms.render_text,
"DateTime": render_algorithms.render_date,
"Categorical": render_algorithms.render_categorical,
"URL": render_algorithms.render_url,
"Path": render_algorithms.render_path,
"File": render_algorithms.render_file,
"Image": render_algorithms.render_image,
"Unsupported": render_algorithms.render_generic,
"TimeSeries": render_algorithms.render_timeseries,
}

return render_map
__all__ = ["compose", "Handler", "get_render_map"]
31 changes: 15 additions & 16 deletions src/ydata_profiling/model/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,8 @@ def summarize(
return self.handle(str(dtype), config, series, {"type": str(dtype)})


# Revisit this with the correct support for Spark as well.
class ProfilingSummarizer(BaseSummarizer):
"""A summarizer for Pandas DataFrames."""
"""A summarizer supporting both Pandas and Spark DataFrames."""

def __init__(self, typeset: VisionsTypeset, use_spark: bool = False):
self.use_spark = use_spark and is_pyspark_installed()
Expand All @@ -65,7 +64,15 @@ def summary_map(self) -> Dict[str, List[Callable]]:
return self._summary_map

def _create_summary_map(self) -> Dict[str, List[Callable]]:
"""Creates the summary map for Pandas summarization."""
"""Creates the summary map based on the backend."""
common_map = {
"URL": [describe_url_1d],
"Path": [describe_path_1d],
"File": [describe_file_1d],
"Image": [describe_image_1d],
"TimeSeries": [describe_timeseries_1d],
}

if self.use_spark:
from ydata_profiling.model.spark import (
describe_boolean_1d_spark,
Expand All @@ -78,7 +85,7 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]:
describe_text_1d_spark,
)

summary_map = {
base_map = {
"Unsupported": [
describe_counts_spark,
describe_generic_spark,
Expand All @@ -89,14 +96,9 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]:
"Text": [describe_text_1d_spark],
"Categorical": [describe_categorical_1d_spark],
"Boolean": [describe_boolean_1d_spark],
"URL": [describe_url_1d],
"Path": [describe_path_1d],
"File": [describe_file_1d],
"Image": [describe_image_1d],
"TimeSeries": [describe_timeseries_1d],
}
else:
summary_map = {
base_map = {
"Unsupported": [
pandas_describe_counts,
pandas_describe_generic,
Expand All @@ -107,13 +109,10 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]:
"Text": [pandas_describe_text_1d],
"Categorical": [pandas_describe_categorical_1d],
"Boolean": [pandas_describe_boolean_1d],
"URL": [pandas_describe_url_1d],
"Path": [pandas_describe_path_1d],
"File": [pandas_describe_file_1d],
"Image": [pandas_describe_image_1d],
"TimeSeries": [pandas_describe_timeseries_1d],
}
return summary_map

base_map.update(common_map)
return base_map


def format_summary(summary: Union[BaseDescription, dict]) -> dict:
Expand Down
6 changes: 3 additions & 3 deletions src/ydata_profiling/profile_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from typeguard import typechecked
from visions import VisionsTypeset

from ydata_profiling.config import Config, Settings, SparkSettings
from ydata_profiling.config import Settings, SparkSettings
from ydata_profiling.expectations_report import ExpectationsReport
from ydata_profiling.model import BaseDescription
from ydata_profiling.model.alerts import AlertType
Expand Down Expand Up @@ -132,11 +132,11 @@ def __init__(
cfg = Settings()
for condition, key in groups:
if condition:
cfg = cfg.update(Config.get_arg_groups(key))
cfg = cfg.update(Settings.get_arg_groups(key))
report_config = report_config.update(cfg.dict(exclude_defaults=True))

if len(kwargs) > 0:
shorthands, kwargs = Config.shorthands(kwargs)
shorthands, kwargs = Settings.shorthands(kwargs)
report_config = report_config.update(
Settings().update(shorthands).dict(exclude_defaults=True)
)
Expand Down
22 changes: 22 additions & 0 deletions src/ydata_profiling/report/structure/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,23 @@
"""Data structure for the report"""
from typing import Callable, Dict


def get_render_map() -> Dict[str, Callable]:
    """Map each supported variable-type name to its render function.

    The import is performed inside the function to avoid a circular import
    between this package and the variable renderers at module load time.
    """
    import ydata_profiling.report.structure.variables as render_algorithms

    return {
        "Boolean": render_algorithms.render_boolean,
        "Numeric": render_algorithms.render_real,
        "Complex": render_algorithms.render_complex,
        "Text": render_algorithms.render_text,
        "DateTime": render_algorithms.render_date,
        "Categorical": render_algorithms.render_categorical,
        "URL": render_algorithms.render_url,
        "Path": render_algorithms.render_path,
        "File": render_algorithms.render_file,
        "Image": render_algorithms.render_image,
        "Unsupported": render_algorithms.render_generic,
        "TimeSeries": render_algorithms.render_timeseries,
    }
2 changes: 1 addition & 1 deletion src/ydata_profiling/report/structure/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from ydata_profiling.config import Settings
from ydata_profiling.model import BaseDescription
from ydata_profiling.model.alerts import AlertType
from ydata_profiling.model.handler import get_render_map
from ydata_profiling.report.structure import get_render_map
from ydata_profiling.report.presentation.core import (
HTML,
Collapse,
Expand Down