Skip to content
This repository was archived by the owner on Mar 26, 2026. It is now read-only.

Commit a4c808f

Browse files
authored
fix: remove license headers in markdown files (#273)
* fix: remove license headers * test: update unit test for new markdown feature * test: update goldens * fix: remove debug print statement
1 parent 78e2a4a commit a4c808f

8 files changed

Lines changed: 578 additions & 0 deletions

File tree

docfx_yaml/markdown_utils.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,33 @@ def _extract_header_from_markdown(mdfile: Iterable[str]) -> str:
145145
return ""
146146

147147

148+
def _remove_license(mdfile_path: str) -> None:
149+
"""Removes any licenses in markdown files."""
150+
151+
comment_tag_begin = "<!--"
152+
comment_tag_end = "-->"
153+
154+
with open(mdfile_path) as mdfile:
155+
file_content = mdfile.read()
156+
157+
# Find the first occurrence of comment tags.
158+
begin_index = file_content.find(comment_tag_begin)
159+
end_index = file_content.find(comment_tag_end)
160+
161+
# Check whether the HTML comment is a license - they should be at the top of
162+
# the file, and if any content prior to the license is visible other than
163+
# whitespace we assume it's not a license comment.
164+
if (pre_comment := file_content[:begin_index]) and not pre_comment.isspace():
165+
return
166+
167+
# Strip the license.
168+
file_content = file_content[end_index + len(comment_tag_end):]
169+
170+
# Reset file position to the beginning to write
171+
with open(mdfile_path, 'w') as mdfile:
172+
mdfile.write(file_content)
173+
174+
148175
def _highlight_md_codeblocks(mdfile_path: str) -> None:
149176
"""Adds syntax highlighting to code blocks for a given markdown file."""
150177
fence = '```'
@@ -285,6 +312,8 @@ def move_markdown_pages(
285312
if mdfile.is_file() and mdfile.name.lower() not in files_to_ignore:
286313
mdfile_name = ""
287314

315+
_remove_license(mdfile)
316+
288317
# Extract the header name for TOC.
289318
with open(mdfile) as mdfile_iterator:
290319
name = _extract_header_from_markdown(mdfile_iterator)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Test header for a simple markdown file.
2+
3+
##Content header
4+
This is a simple line followed by an h2 header.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Test header for a simple markdown file.
2+
3+
<!-- This is a simple inline HTML comment.
4+
5+
This comment should not be stripped.
6+
7+
-->
8+
9+
##Content header
10+
This is a simple line followed by an h2 header.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Test header for a simple markdown file.
2+
3+
<!-- This is a simple inline HTML comment.
4+
5+
This comment should not be stripped.
6+
7+
-->
8+
9+
##Content header
10+
This is a simple line followed by an h2 header.

tests/test_markdown.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,5 +278,33 @@ def test_remove_unused_pages_with_exception(self):
278278
pytest.fail('Should not have thrown an exception.')
279279

280280

281+
# Pairs of (input fixture, expected golden output) for license removal.
test_markdown_filenames = [
    [
        "tests/markdown_example_header.md",
        "tests/markdown_example_header_want.md",
    ],
    [
        "tests/markdown_example_header_with_comments.md",
        "tests/markdown_example_header_with_comments_want.md",
    ],
]
@parameterized.expand(test_markdown_filenames)
def test_remove_license(self, base_filename, want_filename):
    """Checks that licenses are correctly removed from markdown files."""
    # Local import keeps this fix self-contained; hoist to module level if
    # `os` is already imported there.
    import os

    # Copy the fixture to a temp file so the checked-in data is never
    # modified. Close it before `_remove_license` reopens it by name
    # (required on Windows, where an open file cannot be reopened).
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as test_file:
        with open(base_filename) as base_file:
            test_file.write(base_file.read())

    try:
        markdown_utils._remove_license(test_file.name)

        with open(test_file.name) as got_file:
            with open(want_filename) as mdfile_want:
                self.assertEqual(got_file.read(), mdfile_want.read())
    finally:
        # delete=False means we are responsible for cleanup; the original
        # version leaked one temp file per test run.
        os.remove(test_file.name)
307+
308+
281309
if __name__ == '__main__':
282310
unittest.main()

tests/testdata/goldens/handwritten/toc.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
name: Overview
44
- href: changelog.md
55
name: Changelog
6+
- href: upgrading.md
7+
name: 3.0.0 Migration Guide
68
- items:
79
- href: blobs.md
810
name: Blobs / Objects
Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
# 3.0.0 Migration Guide
2+
3+
## New Required Dependencies
4+
5+
Some of the previously optional dependencies are now *required* in `3.x` versions of the
6+
library, namely
7+
[google-cloud-bigquery-storage](https://pypi.org/project/google-cloud-bigquery-storage/)
8+
(minimum version `2.0.0`) and [pyarrow](https://pypi.org/project/pyarrow/) (minimum
9+
version `3.0.0`).
10+
11+
The behavior of some of the package “extras” has thus also changed:
12+
13+
14+
* The `pandas` extra now requires the [db-types](https://pypi.org/project/db-dtypes/)
15+
package.
16+
17+
18+
* The `bqstorage` extra has been preserved for compatibility reasons, but it is now a
19+
no-op and should be omitted when installing the BigQuery client library.
20+
21+
**Before:**
22+
23+
```default
24+
$ pip install google-cloud-bigquery[bqstorage]
25+
```
26+
27+
**After:**
28+
29+
```default
30+
$ pip install google-cloud-bigquery
31+
```
32+
33+
34+
* The `bignumeric_type` extra has been removed, as `BIGNUMERIC` type is now
35+
automatically supported. That extra should thus not be used.
36+
37+
**Before:**
38+
39+
```default
40+
$ pip install google-cloud-bigquery[bignumeric_type]
41+
```
42+
43+
**After:**
44+
45+
```default
46+
$ pip install google-cloud-bigquery
47+
```
48+
49+
## Type Annotations
50+
51+
The library is now type-annotated and declares itself as such. If you use a static
52+
type checker such as `mypy`, you might start getting errors in places where
53+
`google-cloud-bigquery` package is used.
54+
55+
It is recommended to update your code and/or type annotations to fix these errors, but
56+
if this is not feasible in the short term, you can temporarily ignore type annotations
57+
in `google-cloud-bigquery`, for example by using a special `# type: ignore` comment:
58+
59+
```py
60+
from google.cloud import bigquery # type: ignore
61+
```
62+
63+
But again, this is only recommended as a possible short-term workaround if immediately
64+
fixing the type check errors in your project is not feasible.
65+
66+
## Re-organized Types
67+
68+
The auto-generated parts of the library has been removed, and proto-based types formerly
69+
found in `google.cloud.bigquery_v2` have been replaced by the new implementation (but
70+
see the [section](#legacy-types) below).
71+
72+
For example, the standard SQL data types should now be imported from a new location:
73+
74+
**Before:**
75+
76+
```py
77+
from google.cloud.bigquery_v2 import StandardSqlDataType
78+
from google.cloud.bigquery_v2.types import StandardSqlField
79+
from google.cloud.bigquery_v2.types.standard_sql import StandardSqlStructType
80+
```
81+
82+
**After:**
83+
84+
```py
85+
from google.cloud.bigquery import StandardSqlDataType
86+
from google.cloud.bigquery.standard_sql import StandardSqlField
87+
from google.cloud.bigquery.standard_sql import StandardSqlStructType
88+
```
89+
90+
The `TypeKind` enum defining all possible SQL types for schema fields has been renamed
91+
and is not nested anymore under `StandardSqlDataType`:
92+
93+
**Before:**
94+
95+
```py
96+
from google.cloud.bigquery_v2 import StandardSqlDataType
97+
98+
if field_type == StandardSqlDataType.TypeKind.STRING:
99+
...
100+
```
101+
102+
**After:**
103+
104+
```py
105+
106+
from google.cloud.bigquery import StandardSqlTypeNames
107+
108+
if field_type == StandardSqlTypeNames.STRING:
109+
...
110+
```
111+
112+
## Issuing queries with `Client.create_job` preserves destination table
113+
114+
The `Client.create_job` method no longer removes the destination table from a
115+
query job’s configuration. Destination table for the query can thus be
116+
explicitly defined by the user.
117+
118+
## Changes to data types when reading a pandas DataFrame
119+
120+
The default dtypes returned by the `to_dataframe` method have changed.
121+
122+
123+
* Now, the BigQuery `BOOLEAN` data type maps to the pandas `boolean` dtype.
124+
Previously, this mapped to the pandas `bool` dtype when the column did not
125+
contain `NULL` values and the pandas `object` dtype when `NULL` values are
126+
present.
127+
128+
129+
* Now, the BigQuery `INT64` data type maps to the pandas `Int64` dtype.
130+
Previously, this mapped to the pandas `int64` dtype when the column did not
131+
contain `NULL` values and the pandas `float64` dtype when `NULL` values are
132+
present.
133+
134+
135+
* Now, the BigQuery `DATE` data type maps to the pandas `dbdate` dtype, which
136+
is provided by the
137+
[db-dtypes](https://googleapis.dev/python/db-dtypes/latest/index.html)
138+
package. If any date value is outside of the range of
139+
[pandas.Timestamp.min](https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.min.html)
140+
(1677-09-22) and
141+
[pandas.Timestamp.max](https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.max.html)
142+
(2262-04-11), the data type maps to the pandas `object` dtype. The
143+
`date_as_object` parameter has been removed.
144+
145+
146+
* Now, the BigQuery `TIME` data type maps to the pandas `dbtime` dtype, which
147+
is provided by the
148+
[db-dtypes](https://googleapis.dev/python/db-dtypes/latest/index.html)
149+
package.
150+
151+
## Changes to data types loading a pandas DataFrame
152+
153+
In the absence of schema information, pandas columns with naive
154+
`datetime64[ns]` values, i.e. without timezone information, are recognized and
155+
loaded using the `DATETIME` type. On the other hand, for columns with
156+
timezone-aware `datetime64[ns, UTC]` values, the `TIMESTAMP` type is continued
157+
to be used.
158+
159+
## Changes to `Model`, `Client.get_model`, `Client.update_model`, and `Client.list_models`
160+
161+
The types of several `Model` properties have been changed.
162+
163+
164+
* `Model.feature_columns` now returns a sequence of `google.cloud.bigquery.standard_sql.StandardSqlField`.
165+
166+
167+
* `Model.label_columns` now returns a sequence of `google.cloud.bigquery.standard_sql.StandardSqlField`.
168+
169+
170+
* `Model.model_type` now returns a string.
171+
172+
173+
* `Model.training_runs` now returns a sequence of dictionaries, as received from the [BigQuery REST API](https://cloud.google.com/bigquery/docs/reference/rest/v2/models#Model.FIELDS.training_runs).
174+
175+
<a name="legacy-protobuf-types"></a>
176+
177+
## Legacy Protocol Buffers Types
178+
179+
For compatibility reasons, the legacy proto-based types still exist as static code
180+
and can be imported:
181+
182+
```py
183+
from google.cloud.bigquery_v2 import Model # a subclass of proto.Message
184+
```
185+
186+
Mind, however, that importing them will issue a warning, because aside from
187+
being importable, these types **are not maintained anymore**. They may differ
188+
both from the types in `google.cloud.bigquery`, and from the types supported on
189+
the backend.
190+
191+
### Maintaining compatibility with `google-cloud-bigquery` version 2.0
192+
193+
If you maintain a library or system that needs to support both
194+
`google-cloud-bigquery` version 2.x and 3.x, it is recommended that you detect
195+
when version 2.x is in use and convert properties that use the legacy protocol
196+
buffer types, such as `Model.training_runs`, into the types used in 3.x.
197+
198+
Call the [`to_dict`
199+
method](https://proto-plus-python.readthedocs.io/en/latest/reference/message.html#proto.message.Message.to_dict)
200+
on the protocol buffers objects to get a JSON-compatible dictionary.
201+
202+
```py
203+
from google.cloud.bigquery_v2 import Model
204+
205+
training_run: Model.TrainingRun = ...
206+
training_run_dict = training_run.to_dict()
207+
```
208+
209+
# 2.0.0 Migration Guide
210+
211+
The 2.0 release of the `google-cloud-bigquery` client drops support for Python
212+
versions below 3.6. The client surface itself has not changed, but the 1.x series
213+
will not be receiving any more feature updates or bug fixes. You are thus
214+
encouraged to upgrade to the 2.x series.
215+
216+
If you experience issues or have questions, please file an
217+
[issue](https://github.com/googleapis/python-bigquery/issues).
218+
219+
## Supported Python Versions
220+
221+
> **WARNING**: Breaking change
222+
223+
The 2.0.0 release requires Python 3.6+.
224+
225+
## Supported BigQuery Storage Clients
226+
227+
The 2.0.0 release requires BigQuery Storage `>= 2.0.0`, which dropped support
228+
for `v1beta1` and `v1beta2` versions of the BigQuery Storage API. If you want to
229+
use a BigQuery Storage client, it must be the one supporting the `v1` API version.
230+
231+
## Changed GAPIC Enums Path
232+
233+
> **WARNING**: Breaking change
234+
235+
Generated GAPIC enum types have been moved under `types`. Import paths need to be
236+
adjusted.
237+
238+
**Before:**
239+
240+
```py
241+
from google.cloud.bigquery_v2.gapic import enums
242+
243+
distance_type = enums.Model.DistanceType.COSINE
244+
```
245+
246+
**After:**
247+
248+
```py
249+
from google.cloud.bigquery_v2 import types
250+
251+
distance_type = types.Model.DistanceType.COSINE
252+
```

0 commit comments

Comments
 (0)