Scrapegraph-ai/scrapegraphai/utils/code_error_correction.py at main · ScrapeGraphAI/Scrapegraph-ai · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
"""
This module contains the functions for code generation to correct different types of errors.

Functions:
- syntax_focused_code_generation: Generates corrected code based on syntax error analysis.
- execution_focused_code_generation: Generates corrected code based on execution error analysis.
- validation_focused_code_generation: Generates corrected code based on
validation error analysis, considering JSON schema.
- semantic_focused_code_generation: Generates corrected code based on semantic error analysis,
comparing generated and reference results.
"""

import json
from functools import lru_cache
from typing import Any, Dict

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field

from ..prompts import (
    TEMPLATE_EXECUTION_CODE_GENERATION,
    TEMPLATE_SEMANTIC_CODE_GENERATION,
    TEMPLATE_SYNTAX_CODE_GENERATION,
    TEMPLATE_VALIDATION_CODE_GENERATION,
)


class CodeGenerationError(Exception):
    """Root exception type for failures raised by the code-correction helpers."""


class InvalidCorrectionStateError(CodeGenerationError):
    """Signals that the inputs to a correction routine (state or analysis) are unusable."""


class CorrectionState(BaseModel):
    """Validated view of the state consumed by the code-correction routines.

    Only ``generated_code`` is declared here; subclasses add the extra fields
    that specific correction strategies need.
    """

    class Config:
        # Keys beyond the declared fields are accepted instead of rejected.
        extra = "allow"

    generated_code: str = Field(
        ...,
        description="The original generated code to correct",
    )


class ValidationCorrectionState(CorrectionState):
    """Correction state that additionally requires a ``json_schema`` mapping."""

    json_schema: Dict[str, Any] = Field(
        ...,
        description="JSON schema for validation",
    )


class SemanticCorrectionState(CorrectionState):
    """Correction state carrying both the produced and the expected results."""

    # Both fields are typed ``Any``: no particular shape is enforced here.
    execution_result: Any = Field(
        ...,
        description="Result of code execution",
    )
    reference_answer: Any = Field(
        ...,
        description="Reference answer for comparison",
    )


@lru_cache(maxsize=32)
def get_optimal_correction_template(error_type: str) -> str:
    """
    Look up the prompt template used to correct a given kind of error.

    Lookups are memoised through ``lru_cache``.

    Args:
        error_type (str): One of "syntax", "execution", "validation" or
            "semantic". Any other value falls back to the syntax template.

    Returns:
        str: The prompt template text.
    """
    templates_by_error = dict(
        syntax=TEMPLATE_SYNTAX_CODE_GENERATION,
        execution=TEMPLATE_EXECUTION_CODE_GENERATION,
        validation=TEMPLATE_VALIDATION_CODE_GENERATION,
        semantic=TEMPLATE_SEMANTIC_CODE_GENERATION,
    )
    try:
        return templates_by_error[error_type]
    except KeyError:
        # Unknown error types deliberately share the syntax template.
        return TEMPLATE_SYNTAX_CODE_GENERATION


def syntax_focused_code_generation(
    state: Dict[str, Any], analysis: str, llm_model
) -> str:
    """
    Generates corrected code based on syntax error analysis.

    Args:
        state (dict): Contains the 'generated_code'. A missing key is treated
            as an empty string.
        analysis (str): The analysis of the syntax errors.
        llm_model: The language model used for generating the corrected code.

    Returns:
        str: The corrected code.

    Raises:
        InvalidCorrectionStateError: If analysis is empty or not a string, or
            if a KeyError is raised while building or running the chain.
        CodeGenerationError: If any other error occurs during generation.

    Example:
        >>> state = {
        ...     'generated_code': 'print("Hello World"'
        ... }
        >>> analysis = "Missing closing parenthesis in print statement"
        >>> corrected_code = syntax_focused_code_generation(
        ...     state, analysis, mock_llm
        ... )  # doctest: +SKIP
    """
    try:
        # Validate state using Pydantic model
        validated_state = CorrectionState(
            generated_code=state.get("generated_code", "")
        )

        if not analysis or not isinstance(analysis, str):
            raise InvalidCorrectionStateError("Analysis must be a non-empty string")

        # Create prompt template and chain
        prompt = PromptTemplate(
            template=get_optimal_correction_template("syntax"),
            input_variables=["analysis", "generated_code"],
        )
        chain = prompt | llm_model | StrOutputParser()

        # Execute chain with validated state
        return chain.invoke(
            {"analysis": analysis, "generated_code": validated_state.generated_code}
        )

    except CodeGenerationError:
        # Errors already expressed in this module's hierarchy (notably
        # InvalidCorrectionStateError) must reach the caller unchanged;
        # otherwise the generic handler below would re-wrap them and hide
        # the documented exception type.
        raise
    except KeyError as e:
        raise InvalidCorrectionStateError(
            f"Missing required key in state dictionary: {e}"
        ) from e
    except Exception as e:
        raise CodeGenerationError(
            f"Syntax code generation failed: {str(e)}"
        ) from e


def execution_focused_code_generation(
    state: Dict[str, Any], analysis: str, llm_model
) -> str:
    """
    Generates corrected code based on execution error analysis.

    Args:
        state (dict): Contains the 'generated_code'. A missing key is treated
            as an empty string.
        analysis (str): The analysis of the execution errors.
        llm_model: The language model used for generating the corrected code.

    Returns:
        str: The corrected code.

    Raises:
        InvalidCorrectionStateError: If analysis is empty or not a string, or
            if a KeyError is raised while building or running the chain.
        CodeGenerationError: If any other error occurs during generation.

    Example:
        >>> state = {
        ...     'generated_code': 'print(x)'
        ... }
        >>> analysis = "Variable 'x' is not defined before use"
        >>> corrected_code = execution_focused_code_generation(
        ...     state, analysis, mock_llm
        ... )  # doctest: +SKIP
    """
    try:
        # Validate state using Pydantic model
        validated_state = CorrectionState(
            generated_code=state.get("generated_code", "")
        )

        if not analysis or not isinstance(analysis, str):
            raise InvalidCorrectionStateError("Analysis must be a non-empty string")

        # Create prompt template and chain
        prompt = PromptTemplate(
            template=get_optimal_correction_template("execution"),
            input_variables=["analysis", "generated_code"],
        )
        chain = prompt | llm_model | StrOutputParser()

        # Execute chain with validated state
        return chain.invoke(
            {"analysis": analysis, "generated_code": validated_state.generated_code}
        )

    except CodeGenerationError:
        # Errors already expressed in this module's hierarchy (notably
        # InvalidCorrectionStateError) must reach the caller unchanged;
        # otherwise the generic handler below would re-wrap them and hide
        # the documented exception type.
        raise
    except KeyError as e:
        raise InvalidCorrectionStateError(
            f"Missing required key in state dictionary: {e}"
        ) from e
    except Exception as e:
        raise CodeGenerationError(
            f"Execution code generation failed: {str(e)}"
        ) from e


def validation_focused_code_generation(
    state: Dict[str, Any], analysis: str, llm_model
) -> str:
    """
    Generates corrected code based on validation error analysis.

    Args:
        state (dict): Contains the 'generated_code' and 'json_schema'. A
            missing 'generated_code' is treated as an empty string and a
            missing 'json_schema' as an empty dict.
        analysis (str): The analysis of the validation errors.
        llm_model: The language model used for generating the corrected code.

    Returns:
        str: The corrected code.

    Raises:
        InvalidCorrectionStateError: If analysis is empty or not a string, or
            if a KeyError is raised while building or running the chain.
        CodeGenerationError: If any other error occurs during generation.

    Example:
        >>> state = {
        ...     'generated_code': 'return {"name": "John"}',
        ...     'json_schema': {'required': ['name', 'age']}
        ... }
        >>> analysis = "The output JSON is missing the required 'age' field"
        >>> corrected_code = validation_focused_code_generation(
        ...     state, analysis, mock_llm
        ... )  # doctest: +SKIP
    """
    try:
        # Validate state using Pydantic model
        validated_state = ValidationCorrectionState(
            generated_code=state.get("generated_code", ""),
            json_schema=state.get("json_schema", {}),
        )

        if not analysis or not isinstance(analysis, str):
            raise InvalidCorrectionStateError("Analysis must be a non-empty string")

        # Create prompt template and chain
        prompt = PromptTemplate(
            template=get_optimal_correction_template("validation"),
            input_variables=["analysis", "generated_code", "json_schema"],
        )
        chain = prompt | llm_model | StrOutputParser()

        # Execute chain with validated state
        return chain.invoke(
            {
                "analysis": analysis,
                "generated_code": validated_state.generated_code,
                "json_schema": validated_state.json_schema,
            }
        )

    except CodeGenerationError:
        # Errors already expressed in this module's hierarchy (notably
        # InvalidCorrectionStateError) must reach the caller unchanged;
        # otherwise the generic handler below would re-wrap them and hide
        # the documented exception type.
        raise
    except KeyError as e:
        raise InvalidCorrectionStateError(
            f"Missing required key in state dictionary: {e}"
        ) from e
    except Exception as e:
        raise CodeGenerationError(
            f"Validation code generation failed: {str(e)}"
        ) from e


def semantic_focused_code_generation(
    state: Dict[str, Any], analysis: str, llm_model
) -> str:
    """
    Generates corrected code based on semantic error analysis.

    Args:
        state (dict): Contains the 'generated_code', 'execution_result', and
            'reference_answer'. A missing 'generated_code' is treated as an
            empty string; missing results are treated as empty dicts.
        analysis (str): The analysis of the semantic differences.
        llm_model: The language model used for generating the corrected code.

    Returns:
        str: The corrected code.

    Raises:
        InvalidCorrectionStateError: If analysis is empty or not a string, or
            if a KeyError is raised while building or running the chain.
        CodeGenerationError: If any other error occurs during generation,
            including results that cannot be serialised with ``json.dumps``.

    Example:
        >>> state = {
        ...     'generated_code': 'def add(a, b): return a + b',
        ...     'execution_result': {'result': 3},
        ...     'reference_answer': {'result': 3, 'documentation': 'Adds two numbers'}
        ... }
        >>> analysis = "The code is missing documentation"
        >>> corrected_code = semantic_focused_code_generation(
        ...     state, analysis, mock_llm
        ... )  # doctest: +SKIP
    """
    try:
        # Validate state using Pydantic model
        validated_state = SemanticCorrectionState(
            generated_code=state.get("generated_code", ""),
            execution_result=state.get("execution_result", {}),
            reference_answer=state.get("reference_answer", {}),
        )

        if not analysis or not isinstance(analysis, str):
            raise InvalidCorrectionStateError("Analysis must be a non-empty string")

        # Create prompt template and chain
        prompt = PromptTemplate(
            template=get_optimal_correction_template("semantic"),
            input_variables=[
                "analysis",
                "generated_code",
                "generated_result",
                "reference_result",
            ],
        )
        chain = prompt | llm_model | StrOutputParser()

        # Execute chain with validated state; both results are rendered as
        # indented JSON text before being substituted into the prompt.
        return chain.invoke(
            {
                "analysis": analysis,
                "generated_code": validated_state.generated_code,
                "generated_result": json.dumps(
                    validated_state.execution_result, indent=2
                ),
                "reference_result": json.dumps(
                    validated_state.reference_answer, indent=2
                ),
            }
        )

    except CodeGenerationError:
        # Errors already expressed in this module's hierarchy (notably
        # InvalidCorrectionStateError) must reach the caller unchanged;
        # otherwise the generic handler below would re-wrap them and hide
        # the documented exception type.
        raise
    except KeyError as e:
        raise InvalidCorrectionStateError(
            f"Missing required key in state dictionary: {e}"
        ) from e
    except Exception as e:
        raise CodeGenerationError(
            f"Semantic code generation failed: {str(e)}"
        ) from e