-
Notifications
You must be signed in to change notification settings - Fork 186
Expand file tree
/
Copy pathtext_data.py
More file actions
148 lines (123 loc) · 5.29 KB
/
text_data.py
File metadata and controls
148 lines (123 loc) · 5.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""Contains class for saving and loading text files."""
from io import StringIO
from typing import Dict, List, Optional, Union, cast
from . import data_utils
from .base_data import BaseData
class TextData(BaseData):
    """
    TextData class to save and load text files.

    The data is exposed through the ``"text"`` data format, which returns the
    text as a list of chunks of at most ``samples_per_line`` characters each.
    """

    data_type: str = "text"

    def __init__(
        self,
        input_file_path: Optional[str] = None,
        data: Optional[List[str]] = None,
        options: Optional[Dict] = None,
    ) -> None:
        """
        Initialize Data class for loading datasets of type TEXT.

        Can be specified by passing in memory data or via a file path.
        Options pertaining to TEXT may also be specified using the options
        dict parameter.

        Possible Options::

            options = dict(
                data_format= type: str, choices: "text"
                samples_per_line= type: int
            )

        data_format: user selected format in which to return data
        can only be of specified types
        samples_per_line: chunks by which to read in the specified dataset

        :param input_file_path: path to the file being loaded or None
        :type input_file_path: str
        :param data: data being loaded into the class instead of an input
            file; despite the annotation, only a ``str`` passes the check below
        :type data: str
        :param options: options pertaining to the data type
        :type options: dict
        :raises ValueError: if ``data`` is given and is not a ``str``
        :return: None
        """
        # NOTE(review): the signature advertises List[str] while this check
        # only admits str -- confirm which one callers are meant to pass.
        if data is not None and not isinstance(data, str):
            raise ValueError("Input data type is not string.")

        options = self._check_and_return_options(options)
        super().__init__(input_file_path, data, options)

        # 'Private' properties
        #  _data_formats: dict containing data_formats (key) and function
        #      calls (values) which take self._data and convert it
        #      into the desired data_format for output.
        #  _selected_data_format: user selected format in which to return data
        #      can only be of types in _data_formats
        #  _samples_per_line: maximum number of characters placed in each
        #      chunk returned by the "text" data format
        self._data_formats["text"] = self._get_data_as_text
        self._selected_data_format: str = options.get("data_format", "text")
        self._samples_per_line: int = options.get("samples_per_line", int(5e9))

        # In-memory data is stored right away; file loading is not triggered
        # from here.
        if data is not None:
            self._load_data(data)

    @property
    def samples_per_line(self) -> int:
        """Return samples per line."""
        return self._samples_per_line

    @property
    def is_structured(self) -> bool:
        """Determine compatibility with StructuredProfiler."""
        return False

    def _load_data(self, data: Optional[List[str]] = None) -> None:
        """
        Store ``data`` on the instance, or read it from the input file.

        :param data: in-memory data to keep as-is; when None, the file at
            ``self.input_file_path`` is read using ``self.file_encoding``
        :return: None
        """
        if data is not None:
            self._data = data
        else:
            self._data = data_utils.read_text_as_list_of_strs(
                cast(str, self.input_file_path), self.file_encoding
            )

    def _get_data_as_text(self, data: Union[str, List[str]]) -> List[str]:
        """
        Return data as a list of text chunks.

        A list of str is first concatenated into one string. The string is
        then cut into consecutive pieces of at most ``samples_per_line``
        characters.

        :param data: text, or list of text pieces, to convert
        :type data: Union[str, List[str]]
        :raises ValueError: if ``data`` is non-empty and is neither a str nor
            a list starting with a str, or if ``samples_per_line`` is not
            positive
        :return: list of text chunks; empty when there is no text
        :rtype: List[str]
        """
        if isinstance(data, list) and data and isinstance(data[0], str):
            text = "".join(data)
        elif isinstance(data, str):
            text = data
        elif data:
            raise ValueError(
                "Data is not in a str or list of str format and cannot be "
                "converted."
            )
        else:
            # Empty / missing non-str data (e.g. [] or None): nothing to chunk.
            return []

        # Never ask for a chunk longer than the text itself, but keep the
        # size at least 1 so that empty text does not produce a zero step.
        chunk_size = min(max(len(text), 1), self.samples_per_line)
        if chunk_size < 1:
            raise ValueError("samples_per_line must be a positive integer.")

        return [
            text[start : start + chunk_size]
            for start in range(0, len(text), chunk_size)
        ]

    def tokenize(self) -> None:
        """
        Tokenize data.

        :raises NotImplementedError: always; tokenizing is not available for
            text data
        """
        raise NotImplementedError("Tokenizing does not currently exist for text data.")

    @classmethod
    def is_match(cls, file_path: str, options: Optional[Dict] = None) -> bool:
        """
        Return True if all are text files.

        Every input is accepted as text. When ``options`` has no
        ``"encoding"`` entry and ``file_path`` is not a ``StringIO``, the
        detected file encoding is stored under ``options["encoding"]``.

        :param file_path: path to the file to be examined
        :type file_path: str
        :param options: text file read options; updated in place with the
            detected encoding when one is not already present
        :type options: dict
        :return: is file a text file or not
        :rtype: bool
        """
        if options is None:
            options = {}

        # If the user passes options, update them in place with the detected
        # encoding; rebinding the name would drop their other entries and
        # never reach the caller's dict.
        if "encoding" not in options and not isinstance(file_path, StringIO):
            options["encoding"] = data_utils.detect_file_encoding(file_path)
        return True

    def reload(
        self,
        input_file_path: Optional[str] = None,
        data: Optional[List[str]] = None,
        options: Optional[Dict] = None,
    ) -> None:
        """
        Reload the data class with a new dataset.

        This erases all existing data/options and replaces it with the input
        data/options.

        :param input_file_path: path to the file being loaded or None
        :type input_file_path: str
        :param data: data being loaded into the class instead of an input file
        :type data: multiple types
        :param options: options pertaining to the data type
        :type options: dict
        :return: None
        """
        super().reload(input_file_path, data, options)
        # Re-run this class's own initializer (named explicitly rather than
        # dispatched through type(self)) with the path kept on the instance.
        TextData.__init__(self, self.input_file_path, data, options)