-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Expand file tree
/
Copy pathsummarizer.py
More file actions
206 lines (174 loc) · 6.95 KB
/
summarizer.py
File metadata and controls
206 lines (174 loc) · 6.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# mypy: ignore-errors
from dataclasses import asdict
from typing import Any, Callable, Dict, List, Type, Union
import numpy as np
import pandas as pd
from visions import VisionsBaseType, VisionsTypeset
from ydata_profiling.config import Settings
from ydata_profiling.model import BaseDescription
from ydata_profiling.model.handler import Handler
from ydata_profiling.model.pandas import (
pandas_describe_boolean_1d,
pandas_describe_categorical_1d,
pandas_describe_counts,
pandas_describe_date_1d,
pandas_describe_file_1d,
pandas_describe_generic,
pandas_describe_image_1d,
pandas_describe_numeric_1d,
pandas_describe_path_1d,
pandas_describe_text_1d,
pandas_describe_timeseries_1d,
pandas_describe_url_1d,
)
from ydata_profiling.model.pandas.describe_supported_pandas import (
pandas_describe_supported,
)
from ydata_profiling.model.summary_algorithms import ( # Check what is this method used for
describe_file_1d,
describe_image_1d,
describe_path_1d,
describe_timeseries_1d,
describe_url_1d,
)
from ydata_profiling.utils.backend import is_pyspark_installed
class BaseSummarizer(Handler):
    """A base summarizer.

    Can be used to define custom summarizations.
    """

    def summarize(
        self, config: Settings, series: pd.Series, dtype: Type[VisionsBaseType]
    ) -> dict:
        """Generate the summary for a given series.

        The string form of ``dtype`` is used both as the key passed to
        ``handle`` and as the initial ``"type"`` entry of the summary.
        """
        type_name = str(dtype)
        initial_summary = {"type": type_name}
        return self.handle(type_name, config, series, initial_summary)
class ProfilingSummarizer(BaseSummarizer):
    """Summarizer that works with either Pandas or Spark DataFrames."""

    def __init__(self, typeset: VisionsTypeset, use_spark: bool = False):
        # Spark is only used when it is requested AND pyspark is installed.
        self.use_spark = use_spark and is_pyspark_installed()
        # Keep a reference to the map so it stays reachable via `summary_map`.
        self._summary_map = self._create_summary_map()
        super().__init__(self._summary_map, typeset)

    @property
    def summary_map(self) -> Dict[str, List[Callable]]:
        """Return the summary map built at construction time.

        This is the same dict object that was handed to the parent
        initializer, so callers can inspect or modify it.
        """
        return self._summary_map

    def _create_summary_map(self) -> Dict[str, List[Callable]]:
        """Build the type-name -> describe-functions map for the active backend.

        The Pandas map is returned unless ``self.use_spark`` is truthy, in
        which case the Spark describe functions are imported lazily and used
        for the types that have a Spark variant.
        """
        if not self.use_spark:
            return {
                "Unsupported": [
                    pandas_describe_counts,
                    pandas_describe_generic,
                    pandas_describe_supported,
                ],
                "Numeric": [pandas_describe_numeric_1d],
                "DateTime": [pandas_describe_date_1d],
                "Text": [pandas_describe_text_1d],
                "Categorical": [pandas_describe_categorical_1d],
                "Boolean": [pandas_describe_boolean_1d],
                "URL": [pandas_describe_url_1d],
                "Path": [pandas_describe_path_1d],
                "File": [pandas_describe_file_1d],
                "Image": [pandas_describe_image_1d],
                "TimeSeries": [pandas_describe_timeseries_1d],
            }

        # Imported here so the Spark model package is only loaded on demand.
        from ydata_profiling.model.spark import (
            describe_boolean_1d_spark,
            describe_categorical_1d_spark,
            describe_counts_spark,
            describe_date_1d_spark,
            describe_generic_spark,
            describe_numeric_1d_spark,
            describe_supported_spark,
            describe_text_1d_spark,
        )

        return {
            "Unsupported": [
                describe_counts_spark,
                describe_generic_spark,
                describe_supported_spark,
            ],
            "Numeric": [describe_numeric_1d_spark],
            "DateTime": [describe_date_1d_spark],
            "Text": [describe_text_1d_spark],
            "Categorical": [describe_categorical_1d_spark],
            "Boolean": [describe_boolean_1d_spark],
            # These types have no "_spark" import above; they use the
            # describe_* functions from summary_algorithms instead.
            "URL": [describe_url_1d],
            "Path": [describe_path_1d],
            "File": [describe_file_1d],
            "Image": [describe_image_1d],
            "TimeSeries": [describe_timeseries_1d],
        }
def format_summary(summary: Union[BaseDescription, dict]) -> dict:
    """Prepare summary for export to json file.

    Nested dicts are walked recursively, ``pd.Series`` values are turned
    into dicts, and 2-tuples of NumPy arrays become
    ``{"counts": [...], "bin_edges": [...]}``. Any other value is passed
    through untouched.

    Args:
        summary (Union[BaseDescription, dict]): summary to export

    Returns:
        dict: summary as dict
    """

    def is_array_pair(value: Any) -> bool:
        # A tuple holding exactly two NumPy arrays.
        if not isinstance(value, tuple) or len(value) != 2:
            return False
        return all(isinstance(item, np.ndarray) for item in value)

    def fmt(v: Any) -> Any:
        if isinstance(v, dict):
            return {key: fmt(item) for key, item in v.items()}
        if isinstance(v, pd.Series):
            # Convert to a dict first so nested values get formatted too.
            return fmt(v.to_dict())
        if is_array_pair(v):
            counts, bin_edges = v
            return {"counts": counts.tolist(), "bin_edges": bin_edges.tolist()}
        return v

    as_mapping = asdict(summary) if isinstance(summary, BaseDescription) else summary
    return {key: fmt(value) for key, value in as_mapping.items()}
def _redact_column(column: Dict[str, Any]) -> Dict[str, Any]:
def redact_key(data: Dict[str, Any]) -> Dict[str, Any]:
return {f"REDACTED_{i}": v for i, (_, v) in enumerate(data.items())}
def redact_value(data: Dict[str, Any]) -> Dict[str, Any]:
return {k: f"REDACTED_{i}" for i, (k, _) in enumerate(data.items())}
keys_to_redact = [
"block_alias_char_counts",
"block_alias_values",
"category_alias_char_counts",
"category_alias_values",
"character_counts",
"script_char_counts",
"value_counts_index_sorted",
"value_counts_without_nan",
"word_counts",
]
values_to_redact = ["first_rows"]
for field in keys_to_redact:
if field not in column:
continue
is_dict = (isinstance(v, dict) for v in column[field].values())
if any(is_dict):
column[field] = {k: redact_key(v) for k, v in column[field].items()}
else:
column[field] = redact_key(column[field])
for field in values_to_redact:
if field not in column:
continue
is_dict = (isinstance(v, dict) for v in column[field].values())
if any(is_dict):
column[field] = {k: redact_value(v) for k, v in column[field].items()}
else:
column[field] = redact_value(column[field])
return column
def redact_summary(summary: dict, config: Settings) -> dict:
    """Redact summary to export to json file.

    Columns whose ``"type"`` is ``"Categorical"`` are redacted when
    ``config.vars.cat.redact`` is set, and columns whose ``"type"`` is
    ``"Text"`` are redacted when ``config.vars.text.redact`` is set.
    Redaction happens in place on the per-column dicts.

    Args:
        summary (dict): summary to redact; ``summary["variables"]`` holds the
            per-column dicts
        config (Settings): report settings providing the ``redact`` flags

    Returns:
        dict: redacted summary (the same object that was passed in)
    """
    # Only the per-column dicts are needed, so iterate the values directly.
    for col in summary["variables"].values():
        if (config.vars.cat.redact and col["type"] == "Categorical") or (
            config.vars.text.redact and col["type"] == "Text"
        ):
            # _redact_column mutates `col` in place; rebinding the loop
            # variable to its return value would have no effect.
            _redact_column(col)
    return summary