fg-data-profiling/src/ydata_profiling/model/summarizer.py at d83e1a17d23c4d1bdee001b326fdaac1a4707548 · Data-Centric-AI-Community/fg-data-profiling · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# mypy: ignore-errors

from dataclasses import asdict
from typing import Any, Callable, Dict, List, Type, Union

import numpy as np
import pandas as pd
from visions import VisionsBaseType, VisionsTypeset

from ydata_profiling.config import Settings
from ydata_profiling.model import BaseDescription
from ydata_profiling.model.handler import Handler
from ydata_profiling.model.pandas import (
    pandas_describe_boolean_1d,
    pandas_describe_categorical_1d,
    pandas_describe_counts,
    pandas_describe_date_1d,
    pandas_describe_file_1d,
    pandas_describe_generic,
    pandas_describe_image_1d,
    pandas_describe_numeric_1d,
    pandas_describe_path_1d,
    pandas_describe_text_1d,
    pandas_describe_timeseries_1d,
    pandas_describe_url_1d,
)
from ydata_profiling.model.pandas.describe_supported_pandas import (
    pandas_describe_supported,
)
from ydata_profiling.model.summary_algorithms import (  # Check what is this method used for
    describe_file_1d,
    describe_image_1d,
    describe_path_1d,
    describe_timeseries_1d,
    describe_url_1d,
)
from ydata_profiling.utils.backend import is_pyspark_installed


class BaseSummarizer(Handler):
    """Minimal summarizer built on top of ``Handler``.

    Subclass it (or instantiate it with a custom mapping) to plug in
    custom summarization routines.
    """

    def summarize(
        self, config: Settings, series: pd.Series, dtype: Type[VisionsBaseType]
    ) -> dict:
        """Produce the summary of a single series.

        Args:
            config: profiling settings, passed through to ``handle``.
            series: the series to summarize.
            dtype: the type detected for the series; its string form selects
                the handlers and seeds the ``"type"`` entry of the summary.

        Returns:
            Whatever ``self.handle`` returns for the given type name.
        """
        type_name = str(dtype)
        # The summary starts out containing only the type name.
        seed_summary = {"type": type_name}
        return self.handle(type_name, config, series, seed_summary)


class ProfilingSummarizer(BaseSummarizer):
    """Summarizer that can work with either Pandas or Spark DataFrames."""

    def __init__(self, typeset: VisionsTypeset, use_spark: bool = False):
        # Spark is only honoured when it was requested AND pyspark is available.
        self.use_spark = use_spark and is_pyspark_installed()
        # The map must exist before the parent initializer receives it.
        self._summary_map = self._create_summary_map()
        super().__init__(self._summary_map, typeset)

    @property
    def summary_map(self) -> Dict[str, List[Callable]]:
        """The type-name -> describe-functions map built at construction.

        The very same dict object that was handed to the parent initializer
        is returned, so callers may adjust it after initialization.
        """
        return self._summary_map

    def _create_summary_map(self) -> Dict[str, List[Callable]]:
        """Build the type-name -> describe-functions map for the active backend.

        When ``self.use_spark`` is falsy the Pandas describe functions are
        used for every type. Otherwise the Spark implementations are used
        where one is imported below, and the generic ``summary_algorithms``
        functions cover URL, Path, File, Image and TimeSeries.
        """
        if not self.use_spark:
            return {
                "Unsupported": [
                    pandas_describe_counts,
                    pandas_describe_generic,
                    pandas_describe_supported,
                ],
                "Numeric": [pandas_describe_numeric_1d],
                "DateTime": [pandas_describe_date_1d],
                "Text": [pandas_describe_text_1d],
                "Categorical": [pandas_describe_categorical_1d],
                "Boolean": [pandas_describe_boolean_1d],
                "URL": [pandas_describe_url_1d],
                "Path": [pandas_describe_path_1d],
                "File": [pandas_describe_file_1d],
                "Image": [pandas_describe_image_1d],
                "TimeSeries": [pandas_describe_timeseries_1d],
            }

        # Imported lazily so the module can be loaded without pyspark.
        from ydata_profiling.model.spark import (
            describe_boolean_1d_spark,
            describe_categorical_1d_spark,
            describe_counts_spark,
            describe_date_1d_spark,
            describe_generic_spark,
            describe_numeric_1d_spark,
            describe_supported_spark,
            describe_text_1d_spark,
        )

        return {
            "Unsupported": [
                describe_counts_spark,
                describe_generic_spark,
                describe_supported_spark,
            ],
            "Numeric": [describe_numeric_1d_spark],
            "DateTime": [describe_date_1d_spark],
            "Text": [describe_text_1d_spark],
            "Categorical": [describe_categorical_1d_spark],
            "Boolean": [describe_boolean_1d_spark],
            "URL": [describe_url_1d],
            "Path": [describe_path_1d],
            "File": [describe_file_1d],
            "Image": [describe_image_1d],
            "TimeSeries": [describe_timeseries_1d],
        }


def format_summary(summary: Union[BaseDescription, dict]) -> dict:
    """Convert a summary into plain containers ready for JSON export.

    Args:
        summary (Union[BaseDescription, dict]): summary to export; a
            ``BaseDescription`` is first turned into a dict with ``asdict``.

    Returns:
        dict: the summary where, recursively through nested dicts, every
            ``pd.Series`` has become a dict and every 2-tuple of numpy
            arrays has become ``{"counts": [...], "bin_edges": [...]}``.
            Any other value is passed through unchanged.
    """

    def _is_array_pair(value: Any) -> bool:
        # Exactly two elements, both numpy arrays.
        return (
            isinstance(value, tuple)
            and len(value) == 2
            and all(isinstance(part, np.ndarray) for part in value)
        )

    def _convert(value: Any) -> Any:
        if isinstance(value, dict):
            return {key: _convert(item) for key, item in value.items()}
        if isinstance(value, pd.Series):
            # Recurse so that values held by the series are converted too.
            return _convert(value.to_dict())
        if _is_array_pair(value):
            counts, bin_edges = value
            return {"counts": counts.tolist(), "bin_edges": bin_edges.tolist()}
        return value

    if isinstance(summary, BaseDescription):
        summary = asdict(summary)

    return {key: _convert(item) for key, item in summary.items()}


def _redact_column(column: Dict[str, Any]) -> Dict[str, Any]:
    def redact_key(data: Dict[str, Any]) -> Dict[str, Any]:
        return {f"REDACTED_{i}": v for i, (_, v) in enumerate(data.items())}

    def redact_value(data: Dict[str, Any]) -> Dict[str, Any]:
        return {k: f"REDACTED_{i}" for i, (k, _) in enumerate(data.items())}

    keys_to_redact = [
        "block_alias_char_counts",
        "block_alias_values",
        "category_alias_char_counts",
        "category_alias_values",
        "character_counts",
        "script_char_counts",
        "value_counts_index_sorted",
        "value_counts_without_nan",
        "word_counts",
    ]

    values_to_redact = ["first_rows"]

    for field in keys_to_redact:
        if field not in column:
            continue
        is_dict = (isinstance(v, dict) for v in column[field].values())
        if any(is_dict):
            column[field] = {k: redact_key(v) for k, v in column[field].items()}
        else:
            column[field] = redact_key(column[field])

    for field in values_to_redact:
        if field not in column:
            continue
        is_dict = (isinstance(v, dict) for v in column[field].values())
        if any(is_dict):
            column[field] = {k: redact_value(v) for k, v in column[field].items()}
        else:
            column[field] = redact_value(column[field])

    return column


def redact_summary(summary: dict, config: Settings) -> dict:
    """Redact the per-variable entries of a summary before JSON export.

    Args:
        summary (dict): summary to redact; the entries under
            ``summary["variables"]`` are modified in place.
        config (Settings): settings; ``config.vars.cat.redact`` enables
            redaction of ``"Categorical"`` variables and
            ``config.vars.text.redact`` enables it for ``"Text"`` variables.

    Returns:
        dict: the same ``summary`` object, with matching variables redacted
    """
    for column in summary["variables"].values():
        redact_categorical = (
            config.vars.cat.redact and column["type"] == "Categorical"
        )
        if redact_categorical or (
            config.vars.text.redact and column["type"] == "Text"
        ):
            # _redact_column rewrites the entry in place, so its return value
            # does not need to be stored.
            _redact_column(column)
    return summary