-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdata_processor.py
More file actions
270 lines (216 loc) · 9.05 KB
/
data_processor.py
File metadata and controls
270 lines (216 loc) · 9.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
"""
Data processor module for string and list operations.
Demonstrates Copilot's ability to work with data structures.
"""
import ast
import re
class DataProcessor:
    """Handles various data processing operations on strings, lists and dicts.

    The instance keeps a running ``processed_count``. Only ``reverse_string``,
    ``remove_duplicates`` and ``chunk_list`` increment it; the remaining
    methods leave the counter untouched.
    """

    def __init__(self):
        """Initialize the data processor with a zeroed operation counter."""
        self.processed_count = 0

    def reverse_string(self, text: str) -> str:
        """Return ``text`` reversed and count the operation."""
        self.processed_count += 1
        return text[::-1]

    # TODO: Create a method that checks if a string is a palindrome
    # Should ignore case and spaces

    # TODO: Create a method that counts the number of vowels in a string
    # Should handle both uppercase and lowercase

    def remove_duplicates(self, items: list) -> list:
        """Remove duplicate items from a list while preserving order.

        Items must be hashable because a set tracks what has been seen.
        The first occurrence of each item is the one that is kept.
        """
        self.processed_count += 1
        seen = set()
        result = []
        for item in items:
            if item not in seen:
                seen.add(item)
                result.append(item)
        return result

    # TODO: Create a method called 'find_common_elements' that finds common elements between two lists

    # TODO: Create a method that sorts a list of dictionaries by a specified key

    def chunk_list(self, items: list, chunk_size: int) -> list:
        """Split a list into consecutive chunks of at most ``chunk_size`` items.

        Raises:
            ValueError: If ``chunk_size`` is zero or negative. The operation
                counter is not incremented in that case.
        """
        if chunk_size <= 0:
            raise ValueError("Chunk size must be positive")
        self.processed_count += 1
        return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]

    # TODO: Create a method that flattens a nested list structure
    # Example: [[1, 2], [3, [4, 5]]] -> [1, 2, 3, 4, 5]

    def get_processed_count(self) -> int:
        """Return the number of operations processed."""
        return self.processed_count

    def count_words(self, text: str) -> int:
        """Count whitespace-separated words in a text.

        Empty or whitespace-only text yields 0, and runs of consecutive
        whitespace count as a single separator.
        """
        # split() with no argument drops empty fields, unlike split(' '),
        # which reported 1 for "" and counted every extra space as a word.
        return len(text.split())

    def find_duplicates_slow(self, items: list) -> list:
        """Find the items that occur more than once in a list.

        Each duplicated item is reported once, ordered by its first
        occurrence in ``items``. Despite the historical name, hashable
        input is handled in a single counting pass.

        Args:
            items (list): Items to inspect.

        Returns:
            list: The duplicated items, without repetition.
        """
        counts = {}
        try:
            for item in items:
                counts[item] = counts.get(item, 0) + 1
        except TypeError:
            # Unhashable items (e.g. nested lists) cannot be dict keys, so
            # fall back to pairwise equality comparison.
            duplicates = []
            for i, item in enumerate(items):
                if item in duplicates:
                    continue
                if any(i != j and item == other for j, other in enumerate(items)):
                    duplicates.append(item)
            return duplicates
        # Dict insertion order is the order of first occurrence.
        return [item for item, count in counts.items() if count > 1]

    def process_data(self, data: list) -> list:
        """
        Process data by doubling each element.

        Args:
            data (list): Iterable of values to process; each is doubled
                with ``x * 2``.

        Returns:
            list: New list with each element doubled

        Raises:
            TypeError: If data is not iterable or an element does not
                support multiplication

        Example:
            >>> processor = DataProcessor()
            >>> processor.process_data([1, 2, 3])
            [2, 4, 6]
        """
        try:
            elements = iter(data)
        except TypeError:
            raise TypeError("Data must be iterable") from None
        result = []
        for x in elements:
            try:
                result.append(x * 2)
            except TypeError as err:
                raise TypeError("All elements must support multiplication") from err
        return result

    def calculate_expression(self, expression: str) -> float:
        """
        Calculate a mathematical expression from string.

        Only supports basic arithmetic operations (+, -, *, /, parentheses).
        The expression is parsed into an AST and evaluated node by node
        instead of being passed to eval, so no arbitrary code can run.

        Args:
            expression (str): Mathematical expression to evaluate

        Returns:
            float: Result of the calculation (an int when the expression
                involves only integers and no division)

        Raises:
            ValueError: If expression contains invalid characters, invalid
                syntax, an unsupported operation, or a division by zero
        """
        # Remove spaces
        expression = expression.replace(' ', '')
        # Validate expression contains only safe characters
        if not re.match(r'^[0-9+\-*/().\s]+$', expression):
            raise ValueError(
                "Expression contains invalid characters. "
                "Only numbers and +, -, *, /, () are allowed."
            )
        try:
            tree = ast.parse(expression, mode='eval')
            return self._eval_expr(tree.body)
        except ValueError:
            # Already carries a precise message from _eval_expr.
            raise
        except Exception as e:
            # SyntaxError and any other parser/evaluation failure are
            # reported uniformly as ValueError, keeping the original cause.
            raise ValueError(f"Invalid expression: {e}") from e

    def _eval_expr(self, node):
        """
        Safely evaluate an AST node containing only arithmetic operations.

        Args:
            node: AST node to evaluate (ast.Constant, ast.BinOp or ast.UnaryOp)

        Returns:
            The numeric result of evaluating the node

        Raises:
            ValueError: If node contains unsupported operations, constants
                or expression types, or divides by zero
        """
        if isinstance(node, ast.Constant):  # numeric literal (Python 3.8+)
            value = node.value
            # Only plain numbers are arithmetic operands; this rejects
            # literals such as ``...`` that pass the character whitelist.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(f"Unsupported constant: {value!r}")
            return value
        if isinstance(node, ast.BinOp):  # <left> <operator> <right>
            left = self._eval_expr(node.left)
            right = self._eval_expr(node.right)
            if isinstance(node.op, ast.Add):
                return left + right
            if isinstance(node.op, ast.Sub):
                return left - right
            if isinstance(node.op, ast.Mult):
                return left * right
            if isinstance(node.op, ast.Div):
                if right == 0:
                    raise ValueError("Division by zero")
                return left / right
            raise ValueError(f"Unsupported operation: {type(node.op).__name__}")
        if isinstance(node, ast.UnaryOp):  # <operator> <operand> e.g., -1
            operand = self._eval_expr(node.operand)
            if isinstance(node.op, ast.USub):
                return -operand
            if isinstance(node.op, ast.UAdd):
                return +operand
            raise ValueError(f"Unsupported unary operation: {type(node.op).__name__}")
        raise ValueError(f"Unsupported expression type: {type(node).__name__}")

    def get_last_n_items(self, items: list, n: int) -> list:
        """Get the last n items from a list.

        Args:
            items (list): Sequence to take items from.
            n (int): Number of trailing items wanted; must not be negative.

        Returns:
            list: The last ``n`` items. All items are returned when ``n``
                exceeds the length, and an empty sequence when ``n`` is 0.

        Raises:
            ValueError: If ``n`` is negative.
        """
        if n < 0:
            raise ValueError("n must be non-negative")
        if n == 0:
            # items[-0:] is items[0:], i.e. everything, so handle 0 explicitly.
            return items[:0]
        return items[-n:]

    def merge_dictionaries(self, dict1: dict, dict2: dict) -> dict:
        """
        Merge two dictionaries, with dict2 values overwriting dict1 on key conflicts.

        Creates a new dictionary containing all keys from both input dictionaries.
        If a key exists in both dictionaries, the value from dict2 takes precedence.
        Neither input is modified.

        Args:
            dict1 (dict): First dictionary to merge
            dict2 (dict): Second dictionary to merge (takes precedence on conflicts)

        Returns:
            dict: New dictionary containing merged key-value pairs

        Example:
            >>> processor = DataProcessor()
            >>> d1 = {'a': 1, 'b': 2}
            >>> d2 = {'b': 3, 'c': 4}
            >>> processor.merge_dictionaries(d1, d2)
            {'a': 1, 'b': 3, 'c': 4}
        """
        result = dict1.copy()
        result.update(dict2)
        return result
def to_title_case(text: str) -> str:
    """
    Return ``text`` in title case.

    Delegates to ``str.title()``: the first letter of every word is
    upper-cased and the remaining letters are lower-cased. Note that
    ``str.title()`` starts a new word after any non-letter character, so
    an apostrophe also triggers capitalization ("they're" -> "They'Re").

    Args:
        text (str): The string to convert to title case

    Returns:
        str: The string converted to title case

    Example:
        >>> to_title_case("hello world")
        'Hello World'
        >>> to_title_case("HELLO world")
        'Hello World'
    """
    titled = text.title()
    return titled
def find_most_frequent(items: list):
    """
    Find the most frequent element in a list.

    If there's a tie, returns the element that appears first in the input.
    Elements must be hashable. One-shot iterables (iterators, generators)
    are supported because the input is traversed only once.

    Args:
        items (list): Items to analyze

    Returns:
        The most frequent element in the list

    Raises:
        ValueError: If the list is empty

    Example:
        >>> find_most_frequent([1, 2, 2, 3, 3, 3])
        3
        >>> find_most_frequent(['a', 'b', 'a', 'c'])
        'a'
    """
    if not items:
        raise ValueError("Cannot find most frequent element in empty list")

    frequency = {}
    for item in items:
        frequency[item] = frequency.get(item, 0) + 1

    # An exhausted or empty iterator is truthy, so it passes the guard
    # above; detect it here instead of returning a bogus result.
    if not frequency:
        raise ValueError("Cannot find most frequent element in empty list")

    # Dict keys are in first-occurrence order and max() returns the first
    # maximal key, so ties resolve to the earliest element.
    return max(frequency, key=frequency.get)