Skip to content
This repository was archived by the owner on Mar 26, 2026. It is now read-only.

Commit a4c808f

Browse files
authored
fix: remove license headers in markdown files (#273)
* fix: remove license headers * test: update unit test for new markdown feature * test: update goldens * fix: remove debug print statement
1 parent 78e2a4a commit a4c808f

8 files changed

Lines changed: 578 additions & 0 deletions

File tree

docfx_yaml/markdown_utils.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,33 @@ def _extract_header_from_markdown(mdfile: Iterable[str]) -> str:
145145
return ""
146146

147147

148+
def _remove_license(mdfile_path: str) -> None:
149+
"""Removes any licenses in markdown files."""
150+
151+
comment_tag_begin = "<!--"
152+
comment_tag_end = "-->"
153+
154+
with open(mdfile_path) as mdfile:
155+
file_content = mdfile.read()
156+
157+
# Find the first occurrence of comment tags.
158+
begin_index = file_content.find(comment_tag_begin)
159+
end_index = file_content.find(comment_tag_end)
160+
161+
# Check whether the HTML comment is a license - they should be at the top of
162+
# the file, and if any content prior to the license is visible other than
163+
# whitespace we assume it's not a license comment.
164+
if (pre_comment := file_content[:begin_index]) and not pre_comment.isspace():
165+
return
166+
167+
# Strip the license.
168+
file_content = file_content[end_index + len(comment_tag_end):]
169+
170+
# Reset file position to the beginning to write
171+
with open(mdfile_path, 'w') as mdfile:
172+
mdfile.write(file_content)
173+
174+
148175
def _highlight_md_codeblocks(mdfile_path: str) -> None:
149176
"""Adds syntax highlighting to code blocks for a given markdown file."""
150177
fence = '```'
@@ -285,6 +312,8 @@ def move_markdown_pages(
285312
if mdfile.is_file() and mdfile.name.lower() not in files_to_ignore:
286313
mdfile_name = ""
287314

315+
_remove_license(mdfile)
316+
288317
# Extract the header name for TOC.
289318
with open(mdfile) as mdfile_iterator:
290319
name = _extract_header_from_markdown(mdfile_iterator)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Test header for a simple markdown file.
2+
3+
##Content header
4+
This is a simple line followed by an h2 header.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Test header for a simple markdown file.
2+
3+
<!-- This is a simple inline HTML comment.
4+
5+
This comment should not be stripped.
6+
7+
-->
8+
9+
##Content header
10+
This is a simple line followed by an h2 header.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Test header for a simple markdown file.
2+
3+
<!-- This is a simple inline HTML comment.
4+
5+
This comment should not be stripped.
6+
7+
-->
8+
9+
##Content header
10+
This is a simple line followed by an h2 header.

tests/test_markdown.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,5 +278,33 @@ def test_remove_unused_pages_with_exception(self):
278278
pytest.fail('Should not have thrown an exception.')
279279

280280

281+
# Pairs of (input fixture, expected golden output) for license removal.
test_markdown_filenames = [
    [
        "tests/markdown_example_header.md",
        "tests/markdown_example_header_want.md",
    ],
    [
        "tests/markdown_example_header_with_comments.md",
        "tests/markdown_example_header_with_comments_want.md",
    ],
]
@parameterized.expand(test_markdown_filenames)
def test_remove_license(self, base_filename, want_filename):
    """Checks that licenses are correctly removed from markdown files."""
    # Local import keeps this fix self-contained; hoist to module level if
    # `os` is already imported there.
    import os

    # Copy the fixture to a temp file so the checked-in data is never
    # modified. Close it before `_remove_license` reopens it by name
    # (required on Windows, where an open file cannot be reopened).
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as test_file:
        with open(base_filename) as base_file:
            test_file.write(base_file.read())

    try:
        markdown_utils._remove_license(test_file.name)

        with open(test_file.name) as got_file:
            with open(want_filename) as mdfile_want:
                self.assertEqual(got_file.read(), mdfile_want.read())
    finally:
        # delete=False means we are responsible for cleanup; the original
        # version leaked one temp file per test run.
        os.remove(test_file.name)
307+
308+
281309
if __name__ == '__main__':
282310
unittest.main()

tests/testdata/goldens/handwritten/toc.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
name: Overview
44
- href: changelog.md
55
name: Changelog
6+
- href: upgrading.md
7+
name: 3.0.0 Migration Guide
68
- items:
79
- href: blobs.md
810
name: Blobs / Objects
Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
# 3.0.0 Migration Guide
2+
3+
## New Required Dependencies
4+
5+
Some of the previously optional dependencies are now *required* in `3.x` versions of the
6+
library, namely
7+
[google-cloud-bigquery-storage](https://pypi.org/project/google-cloud-bigquery-storage/)
8+
(minimum version `2.0.0`) and [pyarrow](https://pypi.org/project/pyarrow/) (minimum
9+
version `3.0.0`).
10+
11+
The behavior of some of the package “extras” has thus also changed:
12+
13+
14+
* The `pandas` extra now requires the [db-types](https://pypi.org/project/db-dtypes/)
15+
package.
16+
17+
18+
* The `bqstorage` extra has been preserved for compatibility reasons, but it is now a
19+
no-op and should be omitted when installing the BigQuery client library.
20+
21+
**Before:**
22+
23+
```default
24+
$ pip install google-cloud-bigquery[bqstorage]
25+
```
26+
27+
**After:**
28+
29+
```default
30+
$ pip install google-cloud-bigquery
31+
```
32+
33+
34+
* The `bignumeric_type` extra has been removed, as `BIGNUMERIC` type is now
35+
automatically supported. That extra should thus not be used.
36+
37+
**Before:**
38+
39+
```default
40+
$ pip install google-cloud-bigquery[bignumeric_type]
41+
```
42+
43+
**After:**
44+
45+
```default
46+
$ pip install google-cloud-bigquery
47+
```
48+
49+
## Type Annotations
50+
51+
The library is now type-annotated and declares itself as such. If you use a static
52+
type checker such as `mypy`, you might start getting errors in places where
53+
`google-cloud-bigquery` package is used.
54+
55+
It is recommended to update your code and/or type annotations to fix these errors, but
56+
if this is not feasible in the short term, you can temporarily ignore type annotations
57+
in `google-cloud-bigquery`, for example by using a special `# type: ignore` comment:
58+
59+
```py
60+
from google.cloud import bigquery # type: ignore
61+
```
62+
63+
But again, this is only recommended as a possible short-term workaround if immediately
64+
fixing the type check errors in your project is not feasible.
65+
66+
## Re-organized Types
67+
68+
The auto-generated parts of the library has been removed, and proto-based types formerly
69+
found in `google.cloud.bigquery_v2` have been replaced by the new implementation (but
70+
see the [section](#legacy-types) below).
71+
72+
For example, the standard SQL data types should now be imported from a new location:
73+
74+
**Before:**
75+
76+
```py
77+
from google.cloud.bigquery_v2 import StandardSqlDataType
78+
from google.cloud.bigquery_v2.types import StandardSqlField
79+
from google.cloud.bigquery_v2.types.standard_sql import StandardSqlStructType
80+
```
81+
82+
**After:**
83+
84+
```py
85+
from google.cloud.bigquery import StandardSqlDataType
86+
from google.cloud.bigquery.standard_sql import StandardSqlField
87+
from google.cloud.bigquery.standard_sql import StandardSqlStructType
88+
```
89+
90+
The `TypeKind` enum defining all possible SQL types for schema fields has been renamed
91+
and is not nested anymore under `StandardSqlDataType`:
92+
93+
**Before:**
94+
95+
```py
96+
from google.cloud.bigquery_v2 import StandardSqlDataType
97+
98+
if field_type == StandardSqlDataType.TypeKind.STRING:
99+
...
100+
```
101+
102+
**After:**
103+
104+
```py
105+
106+
from google.cloud.bigquery import StandardSqlTypeNames
107+
108+
if field_type == StandardSqlTypeNames.STRING:
109+
...
110+
```
111+
112+
## Issuing queries with `Client.create_job` preserves destination table
113+
114+
The `Client.create_job` method no longer removes the destination table from a
115+
query job’s configuration. Destination table for the query can thus be
116+
explicitly defined by the user.
117+
118+
## Changes to data types when reading a pandas DataFrame
119+
120+
The default dtypes returned by the `to_dataframe` method have changed.
121+
122+
123+
* Now, the BigQuery `BOOLEAN` data type maps to the pandas `boolean` dtype.
124+
Previously, this mapped to the pandas `bool` dtype when the column did not
125+
contain `NULL` values and the pandas `object` dtype when `NULL` values are
126+
present.
127+
128+
129+
* Now, the BigQuery `INT64` data type maps to the pandas `Int64` dtype.
130+
Previously, this mapped to the pandas `int64` dtype when the column did not
131+
contain `NULL` values and the pandas `float64` dtype when `NULL` values are
132+
present.
133+
134+
135+
* Now, the BigQuery `DATE` data type maps to the pandas `dbdate` dtype, which
136+
is provided by the
137+
[db-dtypes](https://googleapis.dev/python/db-dtypes/latest/index.html)
138+
package. If any date value is outside of the range of
139+
[pandas.Timestamp.min](https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.min.html)
140+
(1677-09-22) and
141+
[pandas.Timestamp.max](https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.max.html)
142+
(2262-04-11), the data type maps to the pandas `object` dtype. The
143+
`date_as_object` parameter has been removed.
144+
145+
146+
* Now, the BigQuery `TIME` data type maps to the pandas `dbtime` dtype, which
147+
is provided by the
148+
[db-dtypes](https://googleapis.dev/python/db-dtypes/latest/index.html)
149+
package.
150+
151+
## Changes to data types loading a pandas DataFrame
152+
153+
In the absence of schema information, pandas columns with naive
154+
`datetime64[ns]` values, i.e. without timezone information, are recognized and
155+
loaded using the `DATETIME` type. On the other hand, for columns with
156+
timezone-aware `datetime64[ns, UTC]` values, the `TIMESTAMP` type is continued
157+
to be used.
158+
159+
## Changes to `Model`, `Client.get_model`, `Client.update_model`, and `Client.list_models`
160+
161+
The types of several `Model` properties have been changed.
162+
163+
164+
* `Model.feature_columns` now returns a sequence of `google.cloud.bigquery.standard_sql.StandardSqlField`.
165+
166+
167+
* `Model.label_columns` now returns a sequence of `google.cloud.bigquery.standard_sql.StandardSqlField`.
168+
169+
170+
* `Model.model_type` now returns a string.
171+
172+
173+
* `Model.training_runs` now returns a sequence of dictionaries, as received from the [BigQuery REST API](https://cloud.google.com/bigquery/docs/reference/rest/v2/models#Model.FIELDS.training_runs).
174+
175+
<a name="legacy-protobuf-types"></a>
176+
177+
## Legacy Protocol Buffers Types
178+
179+
For compatibility reasons, the legacy proto-based types still exist as static code
180+
and can be imported:
181+
182+
```py
183+
from google.cloud.bigquery_v2 import Model # a subclass of proto.Message
184+
```
185+
186+
Mind, however, that importing them will issue a warning, because aside from
187+
being importable, these types **are not maintained anymore**. They may differ
188+
both from the types in `google.cloud.bigquery`, and from the types supported on
189+
the backend.
190+
191+
### Maintaining compatibility with `google-cloud-bigquery` version 2.0
192+
193+
If you maintain a library or system that needs to support both
194+
`google-cloud-bigquery` version 2.x and 3.x, it is recommended that you detect
195+
when version 2.x is in use and convert properties that use the legacy protocol
196+
buffer types, such as `Model.training_runs`, into the types used in 3.x.
197+
198+
Call the [`to_dict`
199+
method](https://proto-plus-python.readthedocs.io/en/latest/reference/message.html#proto.message.Message.to_dict)
200+
on the protocol buffers objects to get a JSON-compatible dictionary.
201+
202+
```py
203+
from google.cloud.bigquery_v2 import Model
204+
205+
training_run: Model.TrainingRun = ...
206+
training_run_dict = training_run.to_dict()
207+
```
208+
209+
# 2.0.0 Migration Guide
210+
211+
The 2.0 release of the `google-cloud-bigquery` client drops support for Python
212+
versions below 3.6. The client surface itself has not changed, but the 1.x series
213+
will not be receiving any more feature updates or bug fixes. You are thus
214+
encouraged to upgrade to the 2.x series.
215+
216+
If you experience issues or have questions, please file an
217+
[issue](https://github.com/googleapis/python-bigquery/issues).
218+
219+
## Supported Python Versions
220+
221+
> **WARNING**: Breaking change
222+
223+
The 2.0.0 release requires Python 3.6+.
224+
225+
## Supported BigQuery Storage Clients
226+
227+
The 2.0.0 release requires BigQuery Storage `>= 2.0.0`, which dropped support
228+
for `v1beta1` and `v1beta2` versions of the BigQuery Storage API. If you want to
229+
use a BigQuery Storage client, it must be the one supporting the `v1` API version.
230+
231+
## Changed GAPIC Enums Path
232+
233+
> **WARNING**: Breaking change
234+
235+
Generated GAPIC enum types have been moved under `types`. Import paths need to be
236+
adjusted.
237+
238+
**Before:**
239+
240+
```py
241+
from google.cloud.bigquery_v2.gapic import enums
242+
243+
distance_type = enums.Model.DistanceType.COSINE
244+
```
245+
246+
**After:**
247+
248+
```py
249+
from google.cloud.bigquery_v2 import types
250+
251+
distance_type = types.Model.DistanceType.COSINE
252+
```

0 commit comments

Comments
 (0)