-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Expand file tree
/
Copy pathmissing.py
More file actions
131 lines (103 loc) · 3.93 KB
/
missing.py
File metadata and controls
131 lines (103 loc) · 3.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import warnings
from typing import Any, Dict, Optional, Sized
import pandas as pd
from ydata_profiling.config import Settings
from ydata_profiling.utils.backend import BaseBackend
class MissingDataBackend(BaseBackend):
    """Backend selector for the missing-data diagrams (Pandas or Spark).

    Only the two module paths are declared here; the selection and caching
    behaviour is inherited from ``BaseBackend``.
    """

    # Dotted import paths of the per-engine implementations.
    # NOTE(review): presumably read by BaseBackend to import the matching
    # module for the given DataFrame type - confirm against BaseBackend.
    _pandas_module: str = "ydata_profiling.model.pandas.missing_pandas"
    _spark_module: str = "ydata_profiling.model.spark.missing_spark"
class MissingData:
    """Base class for missing-value diagrams resolved through a backend.

    Subclasses set ``_method_name`` to the name of the backend function that
    produces their diagram.
    """

    # Name of the function looked up on the backend; empty on the base class.
    _method_name: str = ""

    def compute(
        self, config: Settings, df: Sized, backend: MissingDataBackend
    ) -> Optional[Sized]:
        """Compute the missing-data diagram using the given backend.

        Args:
            config: report Settings object
            df: the data on which to calculate the missing values
            backend: backend wrapper used to look up the implementation
                named by ``_method_name``

        Returns:
            Whatever the backend function returns when called with
            ``(config, df)``.

        Raises:
            NotImplementedError: if looking up ``_method_name`` on the
                backend raises ``AttributeError``.
        """
        # Only the lookup is guarded: an AttributeError raised inside the
        # backend function itself must propagate unchanged.
        try:
            method = backend.get_method(self._method_name)
        except AttributeError as ex:
            raise NotImplementedError(
                f"Missing-data method {self._method_name!r} is not available "
                "for this backend"
            ) from ex
        return method(config, df)
class MissingBar(MissingData):
    """Missing-values diagram resolved on the backend as ``missing_bar``."""

    _method_name = "missing_bar"
class MissingMatrix(MissingData):
    """Missing-values diagram resolved on the backend as ``missing_matrix``."""

    _method_name = "missing_matrix"
class MissingHeatmap(MissingData):
    """Missing-values diagram resolved on the backend as ``missing_heatmap``."""

    _method_name = "missing_heatmap"
def get_missing_active(config: Settings, table_stats: dict) -> Dict[str, Any]:
    """Select the missing-value diagrams that should be produced.

    Args:
        config: report Settings object
        table_stats: The overall statistics for the DataFrame.

    Returns:
        The active diagrams keyed by their ``config.missing_diagrams`` key
        (``bar``, ``matrix``, ``heatmap``); each value holds the diagram's
        ``min_missing``, ``name``, ``caption`` and ``function``.
    """
    candidates = {
        "bar": {
            "min_missing": 0,
            "name": "Count",
            "caption": "A simple visualization of nullity by column.",
            "function": MissingBar(),
        },
        "matrix": {
            "min_missing": 0,
            "name": "Matrix",
            "caption": "Nullity matrix is a data-dense display which lets you quickly visually pick out patterns in data completion.",
            "function": MissingMatrix(),
        },
        "heatmap": {
            "min_missing": 2,
            "name": "Heatmap",
            "caption": "The correlation heatmap measures nullity correlation: how strongly the presence or absence of one variable affects the presence of another.",
            "function": MissingHeatmap(),
        },
    }

    def _is_active(key: str, spec: Dict[str, Any]) -> bool:
        # Checks run in this order so that table_stats is only read for
        # diagrams that are switched on in the config.
        if not config.missing_diagrams[key]:
            return False
        threshold = spec["min_missing"]
        if not (table_stats["n_vars_with_missing"] >= threshold):
            return False
        if key != "heatmap":
            return True
        # The heatmap additionally discounts columns that are entirely
        # missing before comparing against its threshold.
        partially_missing = (
            table_stats["n_vars_with_missing"] - table_stats["n_vars_all_missing"]
        )
        return partially_missing >= threshold

    active: Dict[str, Any] = {}
    for key, spec in candidates.items():
        if _is_active(key, spec):
            active[key] = spec
    return active
def get_missing_diagram(
    config: Settings, df: pd.DataFrame, settings: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
    """Render a single missing-values diagram.

    Args:
        config: report Settings object
        df: The DataFrame on which to calculate the missing values.
        settings: missing diagram name, caption and function

    Returns:
        A dictionary with the diagram's ``name`` and ``caption`` and the
        computed result under ``matrix``. ``None`` when ``settings`` carries
        no function or when the computation raised a ``ValueError`` (a
        warning is emitted in that case).
    """
    missing_func = settings.get("function")
    if missing_func is None:
        return None  # No function defined, skip execution

    # Build the backend only once we know there is something to compute.
    backend = MissingDataBackend(df)

    try:
        result = missing_func.compute(config, df, backend)
    except ValueError as e:
        # The `missing_diagrams` config is keyed by "bar"/"matrix"/"heatmap",
        # not by the display name ("Count"/"Matrix"/"Heatmap"). Recover the
        # key from the diagram's backend method name ("missing_<key>") so the
        # hint below actually disables the diagram; fall back to the display
        # name when the method name does not follow that pattern.
        prefix = "missing_"
        method_name = getattr(missing_func, "_method_name", "")
        if method_name.startswith(prefix) and len(method_name) > len(prefix):
            config_key = method_name[len(prefix):]
        else:
            config_key = settings["name"]

        warnings.warn(
            f"There was an attempt to generate the {settings['name']} missing values diagrams, but this failed.\n"
            "To hide this warning, disable the calculation\n"
            f'(using `df.profile_report(missing_diagrams={{"{config_key}": False}})`)\n'
            "If this is problematic for your use case, please report this as an issue:\n"
            "https://github.com/ydataai/ydata-profiling/issues\n"
            f"(include the error message: '{e}')"
        )
        return None

    return {
        "name": settings["name"],
        "caption": settings["caption"],
        "matrix": result,
    }