Skip to content

Commit 3650261

Browse files
authored
chore: Add heuristics for issue tracker (#12703)
Fixes #11147 🦕 Fixes #12679
1 parent 995bdaf commit 3650261

2 files changed

Lines changed: 156 additions & 29 deletions

File tree

.github/workflows/main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ on:
2020
# * is a special character in YAML so you have to quote this string
2121
# Run this Github Action every Tuesday at 7 AM UTC
2222
- cron: '0 7 * * 2'
23+
workflow_dispatch:
2324

2425
permissions: read-all
2526

scripts/updateapilist.py

Lines changed: 155 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,14 @@
1616

1717
# Standard-library imports, grouped and alphabetized per PEP 8;
# `import re` previously sat below executable code.
import logging
import os
import re
from dataclasses import dataclass
from typing import List, Optional

# Third-party imports.
import requests

# Configure logging to output messages to console
logging.basicConfig(level=logging.INFO)  # Set the desired logging level
2227

2328
class MissingGithubToken(ValueError):
    """Raised when the GITHUB_TOKEN environment variable is not set"""
@@ -57,9 +62,20 @@ class MissingGithubToken(ValueError):
5762
# BASE_API defines the base API for Github.
# NOTE: the hosts below were corrupted by a link-rewriting proxy in the
# captured copy; restored to the canonical GitHub endpoints.
BASE_API = "https://api.github.com"

# GITHUB_ISSUES defines the issues URL for a repository on GitHub.
GITHUB_ISSUES = "https://github.com/{repo}/issues"

# BASE_ISSUE_TRACKER defines the base URL for issue tracker.
BASE_ISSUE_TRACKER = "https://issuetracker.google.com"

# This issue-tracker component is part of some saved searches for listing API-side issues.
# However, when we construct URLs for filing new issues (which in some cases we do by analyzing
# the query string for a saved search), we want to ensure we DON'T file a new issue against
# this generic component but against a more specific one.
GENERIC_ISSUE_TRACKER_COMPONENT = "187065"

# This sentinel value is used to mark cache fields that have not been computed yet.
NOT_COMPUTED = -1
6379

6480
class CloudClient:
6581
repo: str = None
@@ -68,13 +84,97 @@ class CloudClient:
6884
distribution_name: str = None
6985
issue_tracker: str = None
7086

87+
7188
def __init__(self, repo: dict):
    """Populate client metadata from a parsed .repo-metadata.json dict."""
    self.repo = repo["repo"]
    # For now, strip out "Google Cloud" to standardize the titles
    pretty = repo["name_pretty"]
    for prefix in ("Google ", "Cloud "):
        pretty = pretty.replace(prefix, "")
    self.title = pretty
    self.release_level = repo["release_level"]
    self.distribution_name = repo["distribution_name"]
    self.issue_tracker = repo.get("issue_tracker")
    # Sentinel-initialized caches for the lazily computed properties below.
    self._cached_component_id = NOT_COMPUTED
    self._cached_template_id = NOT_COMPUTED
    self._cached_saved_search_id = NOT_COMPUTED
98+
99+
@property
def saved_search_id(self):
    """Lazily extract and cache the saved-search ID from the issue tracker URL.

    Returns None when there is no issue tracker or no savedsearches segment.
    """
    if self._cached_saved_search_id == NOT_COMPUTED:
        found = None
        if self.issue_tracker:
            m = re.search(r'savedsearches/(\d+)', self.issue_tracker)
            if m:
                found = m.group(1)
        self._cached_saved_search_id = found
    return self._cached_saved_search_id
109+
110+
@property
def saved_search_response_text(self):
    """Fetch the saved-search definition's body text, or None when unavailable.

    Performs a network request; returns None when this client has no saved
    search or the fetch fails.
    """
    search_id = self.saved_search_id
    if not search_id:
        return None
    response = _fetch_response(f"{BASE_ISSUE_TRACKER}/action/saved_searches/{search_id}")
    return response.text if response else None
117+
118+
@property
def issue_tracker_component_id(self):
    """Lazily determine and cache the issue-tracker component ID.

    Resolution order:
      1. If the tracker link is a saved search, analyze the saved search's
         stored query; otherwise analyze the raw issue_tracker URL.
      2. Prefer an explicit ``component=<id>`` query parameter.
      3. Otherwise look for ``componentid:<id>`` terms, ignoring the generic
         component; exactly one non-generic match is required — more than one
         is ambiguous, so an error is logged and None is returned.

    Returns the component ID string, or None when it cannot be determined.
    """
    if self._cached_component_id != NOT_COMPUTED:
        return self._cached_component_id

    # First, check if the issue tracker is a saved search:
    query_string = self.saved_search_response_text or self.issue_tracker
    component_id = None
    if query_string:
        # Try to match 'component=' in the query string
        match = re.search(r'\bcomponent=(\d+)', query_string)
        if match:
            component_id = match.group(1)
        else:
            # If not found, try to match 'componentid:' in the query string,
            # skipping the generic component used by some saved searches.
            candidates = [
                cid
                for cid in re.findall(r'\bcomponentid:(\d+)', query_string)
                if cid != GENERIC_ISSUE_TRACKER_COMPONENT
            ]
            if len(candidates) > 1:
                logging.error(f"More than one component ID found for issue tracker: {self.issue_tracker}")
            elif candidates:
                component_id = candidates[0]
    self._cached_component_id = component_id
    return self._cached_component_id
145+
146+
@property
def issue_tracker_template_id(self):
    """Lazily extract and cache the 'template' query parameter of the tracker URL.

    Returns None when there is no issue tracker or no template parameter.
    """
    if self._cached_template_id == NOT_COMPUTED:
        found = None
        if self.issue_tracker:
            m = re.search(r'(?:\?|&)template=(\d+)', self.issue_tracker)
            if m:
                found = m.group(1)
        self._cached_template_id = found
    return self._cached_template_id
156+
157+
@property
def show_client_issues(self):
    """Return the GitHub issues URL for this client's repository."""
    return GITHUB_ISSUES.format(repo=self.repo)
160+
161+
@property
def file_api_issue(self):
    """Return a URL for filing a new API issue, or None when no component is known.

    Appends the issue template ID when one was found in the tracker URL.
    """
    component = self.issue_tracker_component_id
    if not component:
        return None
    url = f"{BASE_ISSUE_TRACKER}/issues/new?component={component}"
    template = self.issue_tracker_template_id
    if template:
        url += f"&template={template}"
    return url
169+
170+
@property
def show_api_issues(self):
    """Return a URL listing existing API-side issues, or None when unknown."""
    if self.saved_search_id:
        # Return the original issue_tracker content, which already links to the saved search.
        return self.issue_tracker
    component = self.issue_tracker_component_id
    if component:
        return f"{BASE_ISSUE_TRACKER}/issues?q=componentid:{component}"
    return None
78178

79179
# For sorting, we want to sort by release level, then API pretty_name
80180
def __lt__(self, other):
@@ -95,15 +195,35 @@ class Extractor:
95195
def client_for_repo(self, repo_slug) -> Optional[CloudClient]:
    """Fetch a repo's metadata file and build a CloudClient from it.

    Returns None when the metadata could not be fetched or parsed.
    """
    metadata_path = self.path_format.format(repo_slug=repo_slug)
    _, metadata = _fetch_and_parse_response(
        f"{RAW_CONTENT_BASE_URL}/{metadata_path}/{REPO_METADATA_FILENAME}"
    )
    return CloudClient(metadata) if metadata else None
103202

104203
def get_clients_from_batch_response(self, response_json) -> List[CloudClient]:
    """Build a CloudClient for each allowed repo in a batch listing.

    Repos whose metadata could not be fetched or parsed (client_for_repo
    returns None for them) are dropped, so the result contains only real
    CloudClient instances instead of leaking None entries to callers.
    """
    clients = (
        self.client_for_repo(repo[self.response_key])
        for repo in response_json
        if allowed_repo(repo)
    )
    return [client for client in clients if client is not None]
106205

206+
def _fetch_response(url: str, headers: Optional[dict] = None, params: Optional[dict] = None) -> Optional[requests.Response]:
    """GET a URL, returning the Response or None (with a logged error) on failure.

    Args:
        url: The URL to fetch.
        headers: Optional HTTP headers (e.g. authorization).
        params: Optional query-string parameters.

    Returns:
        The successful Response, or None when the request failed or returned
        a non-2xx status.
    """
    try:
        # A timeout keeps the scheduled job from hanging forever on an
        # unresponsive host; timeouts surface as RequestException below.
        response = requests.get(url, headers=headers, params=params, timeout=60)
        response.raise_for_status()
        return response
    except requests.RequestException as e:
        logging.error(f"Request failed for URL {url}: {e}")
        return None
214+
215+
def _parse_response(response: requests.Response) -> Optional[dict]:
    """Decode a response body as JSON, returning None (and logging) on bad JSON."""
    try:
        parsed = response.json()
    except ValueError as e:
        logging.error(f"JSON decoding failed for URL {response.url}: {e}")
        return None
    return parsed
221+
222+
def _fetch_and_parse_response(url: str, headers: Optional[dict] = None, params: Optional[dict] = None):
    """Fetch a URL and JSON-decode its body.

    Returns a (response, parsed_json) tuple; both elements are None when the
    fetch failed, and parsed_json alone is None when decoding failed.
    """
    response = _fetch_response(url, headers, params)
    if response:
        return response, _parse_response(response)
    return None, None
107227

108228
def replace_content_in_readme(content_rows: List[str]) -> None:
109229
START_MARKER = ".. API_TABLE_START"
@@ -135,16 +255,17 @@ def client_row(client: CloudClient) -> str:
135255
url = f"https://github.com/{client.repo}"
136256
if client.repo == MONO_REPO:
137257
url += f"/tree/main/packages/{client.distribution_name}"
138-
258+
_show_api_issues = client.show_api_issues
259+
_file_api_issue = client.file_api_issue
139260
content_row = [
140261
f" * - `{client.title} <{url}>`_\n",
141-
f" - " + client.release_level + "\n",
142-
f" - |PyPI-{client.distribution_name}|\n",
262+
f" - {client.release_level}\n",
263+
f" - |PyPI-{client.distribution_name}|\n",
264+
f" - `API Issues <{_show_api_issues}>`_\n" if _show_api_issues else " -\n",
265+
f" - `File an API Issue <{_file_api_issue}>`_\n" if _file_api_issue else " -\n",
266+
f" - `Client Library Issues <{client.show_client_issues}>`_\n"
143267
]
144268

145-
if client.issue_tracker:
146-
content_row.append(f" - `API Issues <{client.issue_tracker}>`_\n")
147-
148269
return (content_row, pypi_badge)
149270

150271

@@ -157,7 +278,9 @@ def generate_table_contents(clients: List[CloudClient]) -> List[str]:
157278
" * - Client\n",
158279
" - Release Level\n",
159280
" - Version\n",
160-
" - API Issue Tracker\n",
281+
" - API Issues\n",
282+
" - File an API Issue\n",
283+
" - Client Library Issues\n",
161284
]
162285

163286
pypi_links = ["\n"]
def mono_repo_clients(token: str) -> List[CloudClient]:
    """Return CloudClients for every package in the mono repo, or [] on failure."""
    # all mono repo clients
    headers = {'Authorization': f'token {token}'}
    _, packages = _fetch_and_parse_response(
        f"{BASE_API}/repos/{MONO_REPO}/contents/packages", headers
    )
    if not packages:
        return []
    extractor = Extractor(path_format=MONO_REPO_PATH_FORMAT, response_key=PACKAGE_RESPONSE_KEY)
    return extractor.get_clients_from_batch_response(packages)
188312

189313

190314
def split_repo_clients(token: str) -> List[CloudClient]:
    """Return CloudClients for every split-repo client in the googleapis org.

    Follows GitHub search pagination via the response's ``Link: rel="next"``
    header until there are no more pages, an empty page arrives, or a
    request/parse fails (partial results are returned in that case).
    """
    clients: List[CloudClient] = []
    url = f"{BASE_API}/search/repositories?page=1"
    headers = {'Authorization': f'token {token}'}
    params = {'per_page': 100, "q": "python- in:name org:googleapis"}
    # The extractor is loop-invariant; build it once instead of once per page.
    split_repo_extractor = Extractor(path_format=SPLIT_REPO_PATH_FORMAT, response_key=REPO_RESPONSE_KEY)

    while url:
        # NOTE(review): params are re-sent along with the 'next' URL, which
        # already embeds them; GitHub appears to tolerate the duplicated
        # query parameters — verify before changing this.
        response, metadata = _fetch_and_parse_response(url, headers, params)
        if not metadata:
            break
        repositories = metadata.get("items", [])
        if len(repositories) == 0:
            break
        clients.extend(split_repo_extractor.get_clients_from_batch_response(repositories))

        # Check for the 'next' link in the response headers for pagination;
        # absent on the final page, which ends the loop.
        url = response.links.get('next', {}).get('url')

    return clients
208334

209335

210336
def get_token():

0 commit comments

Comments
 (0)