Skip to content

Commit 8b7fa17

Browse files
authored
PR: fix: docstring duplication
### Fixed
- **Python Docstring Cleaning** (fixes #11)
- **C-Style Comment Handling**
2 parents 78737c6 + e65c91d commit 8b7fa17

5 files changed

Lines changed: 257 additions & 79 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1616

1717
## [NextRelease]
1818

19-
- **Something great**: for sure
19+
### Fixed
20+
21+
- **Python Docstring Cleaning**: Improved the `remove_agent_docstring` function to better handle Python docstrings by preserving manual content while removing the auto-generated table of contents. The function now correctly identifies and removes only the auto-generated content while maintaining the structure of existing manual docstrings. (fixes #11)
22+
- **C-Style Comment Handling**: Enhanced the docstring removal logic for C-style languages (Kotlin, Java, Go, etc.) to be more flexible with comment formatting variations, ensuring proper detection and removal of auto-generated content across different comment styles.
2023

2124
## [1.3.2]
2225

agent_docstrings/languages/common.py

Lines changed: 29 additions & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,15 @@
11
"""
22
--- AUTO-GENERATED DOCSTRING ---
3-
Table of content is automatically generated by Agent Docstrings v1.3.0
3+
Table of content is automatically generated by Agent Docstrings v1.3.2
44
55
Classes/Functions:
6-
- SignatureInfo (line 20):
7-
- ClassInfo (line 26):
8-
- CommentStyle (line 34):
9-
- remove_agent_docstring(text: str, language: str) -> str (line 57)
6+
- SignatureInfo (line 17):
7+
- ClassInfo (line 21):
8+
- CommentStyle (line 27):
9+
- remove_agent_docstring(text: str, language: str) -> str (line 46)
1010
--- END AUTO-GENERATED DOCSTRING ---
1111
"""
1212
from __future__ import annotations
13-
1413
import re
1514
from typing import List, Tuple, Dict, NamedTuple
1615

@@ -22,23 +21,20 @@ class SignatureInfo(NamedTuple):
2221
signature: str
2322
line: int
2423

25-
2624
class ClassInfo(NamedTuple):
2725
"""Stores information about a parsed class, including its methods."""
2826
name: str
2927
line: int
3028
methods: List[SignatureInfo]
3129
inner_classes: List["ClassInfo"]
3230

33-
3431
class CommentStyle(NamedTuple):
3532
"""Stores language-specific comment formatting information."""
3633
start: str
3734
end: str
3835
prefix: str
3936
indent: str
4037

41-
4238
COMMENT_STYLES: Dict[str, CommentStyle] = {
4339
"python": CommentStyle('"""', '"""', " ", " "),
4440
"kotlin": CommentStyle('/**', ' */', ' * ', " "),
@@ -53,63 +49,43 @@ class CommentStyle(NamedTuple):
5349
"delphi": CommentStyle('(*', '*)', ' * ', " "),
5450
}
5551

56-
5752
def remove_agent_docstring(text: str, language: str) -> str:
58-
"""Remove a previously generated docstring from *text*.
59-
60-
The search uses language-specific comment patterns to find a block
61-
containing DOCSTRING_START_MARKER and DOCSTRING_END_MARKER at the
62-
beginning of the file, and removes it.
63-
64-
Args:
65-
text (str): Full contents of the source file.
66-
language (str): Canonical language name (e.g. ``"python"``) used
67-
to pick the correct comment delimiters from
68-
:data:`COMMENT_STYLES`.
69-
70-
Returns:
71-
str: *text* without the agent docstring block. If no such
72-
docstring is detected, *text* is returned unchanged.
73-
"""
53+
"""Remove a previously generated docstring from *text*."""
7454
style = COMMENT_STYLES[language]
75-
76-
# ! Create a more flexible pattern that can match various formats
7755
start_marker_escaped = re.escape(DOCSTRING_START_MARKER)
7856
end_marker_escaped = re.escape(DOCSTRING_END_MARKER)
79-
8057
if language == "python":
81-
# * Python uses triple quotes - check for new format first
82-
pattern = re.compile(
83-
rf'^\s*"""\s*{start_marker_escaped}.*?{end_marker_escaped}\s*"""\s*\n?',
84-
re.DOTALL
85-
)
86-
match = pattern.search(text)
87-
if match:
88-
return text[match.end():]
89-
90-
# * Also check for old format (without proper markers)
91-
old_format_pattern = re.compile(
92-
rf'^\s*"""\s*Classes/Functions:.*?"""\s*\n?',
93-
re.DOTALL
94-
)
95-
match = old_format_pattern.search(text)
96-
if match:
97-
return text[match.end():]
58+
def replacer(match):
59+
docstring_content = match.group(0)
60+
auto_content_pattern = re.compile(
61+
rf'\s*{start_marker_escaped}[\s\S]*?{end_marker_escaped}\s*?\n?',
62+
re.DOTALL
63+
)
64+
cleaned_docstring = auto_content_pattern.sub('', docstring_content)
65+
temp_cleaned = cleaned_docstring.replace('"""', '').replace("'''", '').strip()
66+
if not temp_cleaned:
67+
return '' # Remove empty docstring
68+
# Ensure single newline padding for non-empty manual comments
69+
return f'"""\n{temp_cleaned}\n"""'
70+
docstring_pattern = re.compile(r'^\s*("""[\s\S]*?"""|'r"'''[\s\S]*?''')")
71+
# Iteratively clean the text
72+
cleaned_text = docstring_pattern.sub(replacer, text)
73+
cleaned_text = docstring_pattern.sub(replacer, cleaned_text) # Run again to handle adjacent blocks
74+
# Collapse whitespace and return
75+
return cleaned_text.strip()
9876
else:
99-
# * For C-style comments, be more flexible with the format
100-
# * Handle both compact (/**---...---*/) and expanded formats
77+
# For C-style comments, be more flexible with the format
78+
# Handle both compact (/**---...---*/) and expanded formats
10179
start_escaped = re.escape(style.start.rstrip()) # Remove trailing spaces
102-
103-
# * Handle different possible endings (with or without space before *)
80+
# Handle different possible endings (with or without space before *)
10481
end_patterns = [
10582
re.escape(style.end), # Original format with space
10683
re.escape(style.end.strip()), # Without space
10784
]
108-
109-
# * Try each possible end pattern
85+
# Try each possible end pattern
11086
for end_pattern in end_patterns:
11187
pattern = re.compile(
112-
rf'^\s*{start_escaped}.*?{start_marker_escaped}.*?{end_marker_escaped}.*?{end_pattern}\s*\n?',
88+
rf'^\s*{start_escaped}[\s\S]*?{start_marker_escaped}[\s\S]*?{end_marker_escaped}[\s\S]*?{end_pattern}\s*\n?',
11389
re.DOTALL
11490
)
11591
match = pattern.search(text)

tests/test_common.py

Lines changed: 19 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -2,34 +2,31 @@
22

33
"""
44
--- AUTO-GENERATED DOCSTRING ---
5-
Table of content is automatically generated by Agent Docstrings v1.3.1
5+
Table of content is automatically generated by Agent Docstrings v1.3.2
66
77
Classes/Functions:
8-
- TestDataClasses (line 40):
9-
- test_signature_info_creation() -> None (line 43)
10-
- test_class_info_creation() -> None (line 49)
11-
- test_comment_style_creation() -> None (line 68)
12-
- TestCommentStyles (line 77):
13-
- test_all_supported_languages_have_styles() -> None (line 80)
14-
- test_comment_style_values(language: str, expected_start: str, expected_end: str, expected_prefix: str, expected_indent: str) -> None (line 97)
15-
- TestHeaderStripping (line 113):
16-
- test_strip_python_header() -> None (line 116)
17-
- test_strip_block_comment_header() -> None (line 141)
18-
- test_strip_c_style_comment_header() -> None (line 159)
19-
- test_no_header_to_strip() -> None (line 177)
20-
- test_preserve_shebang_when_stripping() -> None (line 186)
21-
- test_strip_header_with_various_whitespace() -> None (line 199)
22-
- test_strip_only_first_matching_header() -> None (line 207)
23-
- test_strip_header_edge_cases() -> None (line 223)
24-
- test_header_not_at_start() -> None (line 235)
25-
- test_invalid_language_patterns(language: str) -> None (line 249)
8+
- TestDataClasses (line 37):
9+
- test_signature_info_creation() -> None (line 39)
10+
- test_class_info_creation() -> None (line 44)
11+
- test_comment_style_creation() -> None (line 60)
12+
- TestCommentStyles (line 67):
13+
- test_all_supported_languages_have_styles() -> None (line 69)
14+
- test_comment_style_values(language: str, expected_start: str, expected_end: str, expected_prefix: str, expected_indent: str) -> None (line 85)
15+
- TestHeaderStripping (line 99):
16+
- test_strip_python_header() -> None (line 101)
17+
- test_strip_block_comment_header() -> None (line 121)
18+
- test_strip_c_style_comment_header() -> None (line 136)
19+
- test_no_header_to_strip() -> None (line 151)
20+
- test_preserve_shebang_when_stripping() -> None (line 158)
21+
- test_strip_header_with_various_whitespace() -> None (line 169)
22+
- test_strip_only_first_matching_header() -> None (line 175)
23+
- test_strip_header_edge_cases() -> None (line 189)
24+
- test_header_not_at_start() -> None (line 198)
25+
- test_invalid_language_patterns(language: str) -> None (line 209)
2626
--- END AUTO-GENERATED DOCSTRING ---
2727
Tests for agent_docstrings.languages.common module.
2828
"""
29-
30-
3129
import pytest
32-
3330
from agent_docstrings.languages.common import (
3431
COMMENT_STYLES,
3532
ClassInfo,
@@ -40,7 +37,6 @@
4037
DOCSTRING_END_MARKER,
4138
)
4239

43-
4440
class TestDataClasses:
4541
"""Tests for data classes used in parsing."""
4642

@@ -77,7 +73,6 @@ def test_comment_style_creation(self) -> None:
7773
assert style.prefix == " * "
7874
assert style.indent == " "
7975

80-
8176
class TestCommentStyles:
8277
"""Tests for comment style definitions."""
8378

@@ -113,7 +108,6 @@ def test_comment_style_values(
113108
assert style.prefix == expected_prefix
114109
assert style.indent == expected_indent
115110

116-
117111
class TestHeaderStripping:
118112
"""Tests for remove_agent_docstring function."""
119113

Lines changed: 170 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,170 @@
1+
"""
2+
--- AUTO-GENERATED DOCSTRING ---
3+
Table of content is automatically generated by Agent Docstrings v1.3.2
4+
5+
Classes/Functions:
6+
- test_no_docstring_duplication_on_repeated_runs(source_processor) -> None (line 15)
7+
- test_manual_docstring_preservation_with_auto_generation(source_processor) -> None (line 56)
8+
- test_existing_auto_docstring_replacement(source_processor) -> None (line 100)
9+
- test_multiple_auto_docstring_removal(source_processor) -> None (line 135)
10+
--- END AUTO-GENERATED DOCSTRING ---
11+
"""
12+
import pytest
13+
import re
14+
from textwrap import dedent
15+
16+
def test_no_docstring_duplication_on_repeated_runs(source_processor) -> None:
17+
"""
18+
Test that running the docstring generator multiple times on the same file
19+
does not create duplicate auto-generated docstrings.
20+
This test simulates the scenario where a file with manual docstring
21+
gets processed multiple times, ensuring no double docstrings are created.
22+
"""
23+
# * Initial file with manual docstring
24+
initial_content = dedent('''
25+
"""
26+
Human comments
27+
This is a manual docstring that should be preserved.
28+
"""
29+
def test_function():
30+
"""This is a function docstring."""
31+
return "test"
32+
class TestClass:
33+
def method(self):
34+
return "method"
35+
''').strip()
36+
# * First run - should generate auto docstring and merge with manual
37+
result_content_1, lines_1, _ = source_processor("test_duplication.py", initial_content)
38+
# * Verify that auto-generated docstring was added
39+
assert "--- AUTO-GENERATED DOCSTRING ---" in result_content_1
40+
assert "Human comments" in result_content_1 # Manual content preserved
41+
assert "test_function()" in result_content_1 # Auto-generated content added
42+
# * Count auto-generated docstring markers
43+
auto_markers_1 = result_content_1.count("--- AUTO-GENERATED DOCSTRING ---")
44+
assert auto_markers_1 == 1, f"Expected 1 auto docstring marker, found {auto_markers_1}"
45+
# * Second run - should not create duplicate auto docstrings
46+
result_content_2, lines_2, _ = source_processor("test_duplication.py", result_content_1)
47+
# * Verify no duplication occurred
48+
auto_markers_2 = result_content_2.count("--- AUTO-GENERATED DOCSTRING ---")
49+
assert auto_markers_2 == 1, f"Expected 1 auto docstring marker after second run, found {auto_markers_2}"
50+
# * Verify manual content is still preserved
51+
assert "Human comments" in result_content_2
52+
assert "This is a manual docstring that should be preserved." in result_content_2
53+
# * Verify auto-generated content is still present
54+
assert "test_function()" in result_content_2
55+
assert "TestClass" in result_content_2
56+
assert "method()" in result_content_2
57+
58+
def test_manual_docstring_preservation_with_auto_generation(source_processor) -> None:
59+
"""
60+
Test that manual docstrings are properly preserved when auto-generating
61+
docstrings, and that the structure is correct.
62+
"""
63+
# * File with manual docstring only
64+
initial_content = dedent('''
65+
"""
66+
This is a manual module docstring.
67+
It should be preserved and merged with auto-generated content.
68+
"""
69+
def function_one():
70+
pass
71+
def function_two():
72+
pass
73+
''').strip()
74+
result_content, lines, _ = source_processor("test_manual_preservation.py", initial_content)
75+
# * Verify structure: manual content should come after auto-generated content
76+
lines_list = result_content.split('\n')
77+
# * Find the docstring boundaries
78+
docstring_start = None
79+
docstring_end = None
80+
manual_content_found = False
81+
for i, line in enumerate(lines_list):
82+
if line.strip() == '"""' and docstring_start is None:
83+
docstring_start = i
84+
elif line.strip() == '"""' and docstring_start is not None:
85+
docstring_end = i
86+
break
87+
assert docstring_start is not None, "Docstring start not found"
88+
assert docstring_end is not None, "Docstring end not found"
89+
# * Extract docstring content
90+
docstring_content = lines_list[docstring_start:docstring_end + 1]
91+
docstring_text = '\n'.join(docstring_content)
92+
# * Verify auto-generated content is first
93+
assert "--- AUTO-GENERATED DOCSTRING ---" in docstring_text
94+
assert "function_one()" in docstring_text
95+
assert "function_two()" in docstring_text
96+
# * Verify manual content is preserved
97+
assert "This is a manual module docstring." in docstring_text
98+
assert "It should be preserved and merged with auto-generated content." in docstring_text
99+
# * Verify only one docstring block exists
100+
docstring_blocks = result_content.count('"""')
101+
assert docstring_blocks == 2, f"Expected 2 triple quotes (start and end), found {docstring_blocks}"
102+
103+
def test_existing_auto_docstring_replacement(source_processor) -> None:
104+
"""
105+
Test that existing auto-generated docstrings are properly replaced
106+
when the file is processed again.
107+
"""
108+
# * File with existing auto-generated docstring
109+
initial_content = dedent('''
110+
"""
111+
--- AUTO-GENERATED DOCSTRING ---
112+
Table of content is automatically generated by Agent Docstrings v1.3.1
113+
Classes/Functions:
114+
- old_function() (line 8)
115+
--- END AUTO-GENERATED DOCSTRING ---
116+
"""
117+
def old_function():
118+
pass
119+
def new_function():
120+
pass
121+
''').strip()
122+
result_content, lines, _ = source_processor("test_replacement.py", initial_content)
123+
# * Find the docstring in the result
124+
docstring_match = re.search(r'"""[\s\S]*?"""', result_content)
125+
assert docstring_match, "Could not find docstring in processed file"
126+
docstring_text = docstring_match.group(0)
127+
# * Verify new content is in the docstring
128+
assert "old_function()" in docstring_text
129+
assert "new_function()" in docstring_text
130+
# * Verify only one auto-generated docstring exists in the whole file
131+
auto_markers = result_content.count("--- AUTO-GENERATED DOCSTRING ---")
132+
assert auto_markers == 1, f"Expected 1 auto docstring marker, found {auto_markers}"
133+
# * Verify the version is updated in the docstring
134+
assert "Agent Docstrings v1.3.2" in docstring_text
135+
# * Verify that old_function is mentioned only once *within the docstring*
136+
assert docstring_text.count("old_function()") == 1, "Function should appear only once in docstring"
137+
assert docstring_text.count("new_function()") == 1, "Function should appear only once in docstring"
138+
139+
def test_multiple_auto_docstring_removal(source_processor) -> None:
140+
"""
141+
Test that multiple auto-generated docstrings are properly removed
142+
and replaced with a single one.
143+
"""
144+
# * File with multiple auto-generated docstrings (simulating a bug)
145+
initial_content = dedent('''
146+
"""
147+
--- AUTO-GENERATED DOCSTRING ---
148+
Table of content is automatically generated by Agent Docstrings v1.3.1
149+
--- END AUTO-GENERATED DOCSTRING ---
150+
"""
151+
"""
152+
--- AUTO-GENERATED DOCSTRING ---
153+
Table of content is automatically generated by Agent Docstrings v1.3.2
154+
--- END AUTO-GENERATED DOCSTRING ---
155+
Human comments
156+
"""
157+
def test_function():
158+
return "test"
159+
''').strip()
160+
result_content, lines, _ = source_processor("test_multiple_removal.py", initial_content)
161+
# * Verify only one auto-generated docstring exists
162+
auto_markers = result_content.count("--- AUTO-GENERATED DOCSTRING ---")
163+
assert auto_markers == 1, f"Expected 1 auto docstring marker, found {auto_markers}"
164+
# * Verify manual content is preserved
165+
assert "Human comments" in result_content
166+
# * Verify function is documented
167+
assert "test_function()" in result_content
168+
# * Verify that there is only one docstring block in the final output
169+
docstring_blocks = re.findall(r'"""[\s\S]*?"""', result_content)
170+
assert len(docstring_blocks) == 1, f"Expected 1 docstring block, found {len(docstring_blocks)}"

0 commit comments

Comments
 (0)