Skip to content

Commit bdf96f6

Browse files
committed
added retry mechanism when blocked while using proxy
1 parent 53cd2b0 commit bdf96f6

4 files changed

Lines changed: 90 additions & 10 deletions

File tree

youtube_transcript_api/_api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,9 @@ def __init__(
4848
http_client.cookies = _load_cookie_jar(cookie_path)
4949
if proxy_config is not None:
5050
http_client.proxies = proxy_config.to_requests_dict()
51-
if proxy_config.prevent_keeping_connections_alive():
51+
if proxy_config.prevent_keeping_connections_alive:
5252
http_client.headers.update({"Connection": "close"})
53-
self._fetcher = TranscriptListFetcher(http_client)
53+
self._fetcher = TranscriptListFetcher(http_client, proxy_config=proxy_config)
5454

5555
def fetch(
5656
self,

youtube_transcript_api/_transcripts.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@
44
from itertools import chain
55

66
from html import unescape
7-
from typing import List, Dict, Iterator, Iterable, Pattern
7+
from typing import List, Dict, Iterator, Iterable, Pattern, Optional
88

99
from defusedxml import ElementTree
1010

1111
import re
1212

1313
from requests import HTTPError, Session, Response
1414

15+
from .proxies import ProxyConfig
1516
from ._errors import (
1617
VideoUnavailable,
1718
YouTubeRequestFailed,
@@ -339,16 +340,32 @@ def _get_language_description(self, transcript_strings: Iterable[str]) -> str:
339340

340341

341342
class TranscriptListFetcher:
342-
def __init__(self, http_client: Session):
343+
def __init__(
    self,
    http_client: Session,
    proxy_config: Optional[ProxyConfig] = None,
):
    """
    Store the collaborators used to fetch transcript lists.

    :param http_client: the session used for all HTTP requests
    :param proxy_config: the proxy configuration in use, or None when no proxy
        is configured. It defaults to None so that callers which construct the
        fetcher with only an HTTP client keep working; `_fetch_captions_json`
        treats None as "do not retry when blocked".
    """
    self._http_client = http_client
    self._proxy_config = proxy_config
344346

345347
def fetch(self, video_id: str) -> TranscriptList:
    """
    Build the transcript list for a single video.

    :param video_id: the ID of the video to look up
    :return: the result of `TranscriptList.build` for the captions JSON that
        `_fetch_captions_json` returned for this video
    """
    captions_json = self._fetch_captions_json(video_id)
    return TranscriptList.build(self._http_client, video_id, captions_json)
351353

354+
def _fetch_captions_json(self, video_id: str, try_number: int = 0) -> Dict:
    """
    Fetch the video page and extract its captions JSON, trying again while the
    request is reported as blocked.

    The loop replaces the earlier recursive retry: recursing from inside the
    `except` block chained every blocked attempt onto the next one as implicit
    exception context and added a stack frame per attempt. The number of
    attempts is unchanged.

    :param video_id: the ID of the video to fetch
    :param try_number: zero-based index of the first attempt made by this call
    :return: whatever `_extract_captions_json` returns for the fetched HTML
    :raises RequestBlocked: if the last permitted attempt is still blocked
    """
    while True:
        try:
            return self._extract_captions_json(
                self._fetch_video_html(video_id), video_id
            )
        except RequestBlocked:
            # Without a proxy config there is nothing to rotate, so never retry.
            max_tries = (
                0
                if self._proxy_config is None
                else self._proxy_config.retries_when_blocked
            )
            try_number += 1
            # `retries_when_blocked` caps the total number of attempts: a value
            # of 5 results in 5 requests, 0 or 1 in a single request.
            if try_number >= max_tries:
                raise
368+
352369
def _extract_captions_json(self, html: str, video_id: str) -> Dict:
353370
splitted_html = html.split("var ytInitialPlayerResponse = ")
354371

youtube_transcript_api/proxies.py

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def to_requests_dict(self) -> RequestsProxyConfigDict:
3232
"""
3333
pass
3434

35+
@property
3536
def prevent_keeping_connections_alive(self) -> bool:
3637
"""
3738
If you are using rotating proxies, it can be useful to prevent the HTTP
@@ -40,6 +41,16 @@ def prevent_keeping_connections_alive(self) -> bool:
4041
"""
4142
return False
4243

44+
@property
def retries_when_blocked(self) -> int:
    """
    How many times a request should be retried after it has been blocked.

    Retrying pays off with rotating residential proxies that draw from a large
    IP pool: each retry triggers an IP rotation, and the next IP may not be
    blocked. This base implementation opts out of retrying.
    """
    no_retries = 0
    return no_retries
53+
4354

4455
class GenericProxyConfig(ProxyConfig):
4556
"""
@@ -83,8 +94,9 @@ class WebshareProxyConfig(GenericProxyConfig):
8394
most reliable way to work around being blocked by YouTube.
8495
8596
If you don't have a Webshare account yet, you will have to create one
86-
at https://www.webshare.io/?referral_code=w0xno53eb50g and purchase a residential
87-
proxy package that suits your workload, to be able to use this proxy config.
97+
at https://www.webshare.io/?referral_code=w0xno53eb50g and purchase a "Residential"
98+
proxy package that suits your workload, to be able to use this proxy config (make
99+
sure NOT to purchase "Proxy Server" or "Static Residential"!).
88100
89101
Once you have created an account you only need the "Proxy Username" and
90102
"Proxy Password" that you can find in your Webshare settings
@@ -105,24 +117,33 @@ def __init__(
105117
self,
106118
proxy_username: str,
107119
proxy_password: str,
120+
retries_when_blocked: int = 10,
108121
domain_name: str = DEFAULT_DOMAIN_NAME,
109122
proxy_port: int = DEFAULT_PORT,
110123
):
111124
"""
112125
Once you have created a Webshare account at
113-
https://www.webshare.io/?referral_code=w0xno53eb50g and purchased a residential
114-
proxy package, this config class allows you to easily use it, by defaulting to
115-
the most reliable proxy settings (rotating residential proxies).
126+
https://www.webshare.io/?referral_code=w0xno53eb50g and purchased a
127+
"Residential" package (make sure NOT to purchase "Proxy Server" or
128+
"Static Residential"!), this config class allows you to easily use it,
129+
by defaulting to the most reliable proxy settings (rotating residential
130+
proxies).
116131
117132
:param proxy_username: "Proxy Username" found at
118133
https://dashboard.webshare.io/proxy/settings
119134
:param proxy_password: "Proxy Password" found at
120135
https://dashboard.webshare.io/proxy/settings
136+
:param retries_when_blocked: Define how many times we should retry if a request
137+
is blocked. When using rotating residential proxies with a large IP pool it
138+
makes sense to retry a couple of times when a blocked IP is encountered,
139+
since a retry will trigger an IP rotation and the next IP might not be
140+
blocked. Defaults to 10.
121141
"""
122142
self.proxy_username = proxy_username
123143
self.proxy_password = proxy_password
124144
self.domain_name = domain_name
125145
self.proxy_port = proxy_port
146+
self._retries_when_blocked = retries_when_blocked
126147

127148
@property
128149
def url(self) -> str:
@@ -139,5 +160,10 @@ def http_url(self) -> str:
139160
def https_url(self) -> str:
140161
return self.url
141162

163+
@property
def prevent_keeping_connections_alive(self) -> bool:
    """
    Always True for this proxy config.

    The API client checks this flag and, when it is set, adds a
    "Connection: close" header to its HTTP client.
    """
    close_connections = True
    return close_connections
166+
167+
@property
def retries_when_blocked(self) -> int:
    """
    Return the retry setting stored on this instance as
    `_retries_when_blocked`.
    """
    configured = self._retries_when_blocked
    return configured

youtube_transcript_api/test/test_api.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,43 @@ def test_fetch__with_proxy_prevent_alive_connections(self, to_requests_dict):
341341
request = httpretty.last_request()
342342
self.assertEqual(request.headers.get("Connection"), "close")
343343

344+
@patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict")
def test_fetch__with_proxy_retry_when_blocked(self, to_requests_dict):
    """
    Fetching through a default WebshareProxyConfig keeps going after blocked
    responses instead of raising.
    """
    blocked_registrations = 3
    for _ in range(blocked_registrations):
        httpretty.register_uri(
            httpretty.GET,
            "https://www.youtube.com/watch",
            body=load_asset("youtube_request_blocked.html.static"),
        )

    api = YouTubeTranscriptApi(
        proxy_config=WebshareProxyConfig(
            proxy_username="username",
            proxy_password="password",
        )
    )
    api.fetch("Njp5uhTorCo")

    # NOTE(review): the two extra requests are presumably the unblocked watch
    # page and the transcript download; confirm against the test fixtures.
    recorded = httpretty.latest_requests()
    self.assertEqual(len(recorded), blocked_registrations + 2)
360+
361+
@patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict")
def test_fetch__with_proxy_reraise_when_blocked(self, to_requests_dict):
    """
    When every response is blocked, RequestBlocked is raised and the number
    of recorded requests equals the configured `retries_when_blocked`.
    """
    max_tries = 5
    for _ in range(max_tries):
        httpretty.register_uri(
            httpretty.GET,
            "https://www.youtube.com/watch",
            body=load_asset("youtube_request_blocked.html.static"),
        )

    webshare_config = WebshareProxyConfig(
        proxy_username="username",
        proxy_password="password",
        retries_when_blocked=max_tries,
    )

    with self.assertRaises(RequestBlocked):
        YouTubeTranscriptApi(proxy_config=webshare_config).fetch("Njp5uhTorCo")

    recorded = httpretty.latest_requests()
    self.assertEqual(len(recorded), max_tries)
380+
344381
def test_fetch__with_cookies(self):
345382
cookie_path = get_asset_path("example_cookies.txt")
346383
transcript = YouTubeTranscriptApi(cookie_path=cookie_path).fetch("GJLlxj_dtq8")

0 commit comments

Comments
 (0)