-
-
Notifications
You must be signed in to change notification settings - Fork 29
Expand file tree
/
Copy pathutil.py
More file actions
524 lines (456 loc) · 19.4 KB
/
util.py
File metadata and controls
524 lines (456 loc) · 19.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
#! python3 # noqa: E265
# ############################################################################
# ########## Libraries #############
# ##################################
# standard library
import logging
import ssl
from datetime import date, datetime
from email.utils import formatdate
from mimetypes import guess_type
from pathlib import Path
from typing import Iterable, Tuple
from urllib import request
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode, urlparse, urlunparse
# 3rd party
import markdown
from git import GitCommandError, GitCommandNotFound, InvalidGitRepositoryError, Repo
from mkdocs.config.config_options import Config
from mkdocs.structure.pages import Page
from mkdocs.utils import get_build_timestamp
# package
from mkdocs_rss_plugin import __about__
from mkdocs_rss_plugin.git_manager.ci import CiHandler
# ############################################################################
# ########## Globals #############
# ################################
# HTTP headers attached to the requests performed to probe remote images.
REMOTE_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    # identify the plugin (title/version) to the remote server
    "User-Agent": "{}/{}".format(
        __about__.__title__,
        __about__.__version__,
    ),
}
# ############################################################################
# ########## Globals ###############
# ##################################
# Module-level logger, named under the "mkdocs" logger namespace.
logger = logging.getLogger("mkdocs.mkdocs_rss_plugin")
# ############################################################################
# ########## Classes #############
# ################################
class Util:
    """Helpers used by the RSS plugin: dates lookup (page meta or git log), page \
    metadata extraction (authors, categories, description, image) and feed items \
    formatting."""

    def __init__(self, path: str = "."):
        """Class hosting the plugin logic.

        :param str path: path to the git repository to use. Defaults to: "." - optional
        """
        # always define the attribute, even when the repository can't be opened
        self.repo = None
        try:
            git_repo = Repo(path, search_parent_directories=True)
            self.repo = git_repo.git
            self.git_is_valid = 1
        except InvalidGitRepositoryError as err:
            logging.warning(
                "[rss-plugin] Path is not a valid git directory. " " Trace: %s" % err
            )
            self.git_is_valid = 0
        except Exception as err:
            logging.warning("[rss-plugin] Git issue: %s" % err)
            self.git_is_valid = 0
        else:
            # Checks if user is running builds on CI and raise appropriate warnings.
            # Done only when the repository has been opened: the handler needs the
            # git object, which does not exist if one of the handlers above ran.
            CiHandler(self.repo).raise_ci_warnings()

    def build_url(self, base_url: str, path: str, args_dict: dict = None) -> str:
        """Build URL using base URL, cumulating existing and passed path, \
        then adding URL arguments.

        :param base_url: base URL with existing path to use
        :type base_url: str
        :param path: URL path to cumulate with existing
        :type path: str
        :param args_dict: URL arguments to add, defaults to None
        :type args_dict: dict, optional

        :return: complete and valid URL
        :rtype: str
        """
        # Returns a list in the structure of urlparse.ParseResult:
        # index 2 is the path, index 4 is the query string
        url_parts = list(urlparse(base_url))
        url_parts[2] += path
        if args_dict:
            url_parts[4] = urlencode(args_dict)
        return urlunparse(url_parts)

    def get_file_dates(
        self,
        in_page: "Page",
        source_date_creation: str = "git",
        source_date_update: str = "git",
        meta_datetime_format: str = "%Y-%m-%d %H:%M",
    ) -> Tuple[int, int]:
        """Extract creation and update dates from page metadata (yaml frontmatter) or \
        git log for given file.

        :param in_page: input page to work with
        :type in_page: Page
        :param source_date_creation: which source to use (git or meta tag) for creation \
        date, defaults to "git"
        :type source_date_creation: str, optional
        :param source_date_update: which source to use (git or meta tag) for update \
        date, defaults to "git"
        :type source_date_update: str, optional
        :param meta_datetime_format: datetime string format, defaults to "%Y-%m-%d %H:%M"
        :type meta_datetime_format: str, optional

        :return: tuple of timestamps (creation date, last commit date). If one of \
        them can't be retrieved, the build timestamp is used for both.
        :rtype: Tuple[int, int]
        """
        # empty vars
        dt_created = dt_updated = None

        # if enabled, try to retrieve dates from page metadata.
        # get_date_from_meta returns an error message (str) on failure.
        if source_date_creation != "git" and in_page.meta.get(source_date_creation):
            dt_created = self.get_date_from_meta(
                date_metatag_value=in_page.meta.get(source_date_creation),
                meta_datetime_format=meta_datetime_format,
            )
            if isinstance(dt_created, str):
                logger.error(dt_created)
                dt_created = None

        if source_date_update != "git" and in_page.meta.get(source_date_update):
            dt_updated = self.get_date_from_meta(
                date_metatag_value=in_page.meta.get(source_date_update),
                meta_datetime_format=meta_datetime_format,
            )
            if isinstance(dt_updated, str):
                logger.error(dt_updated)
                dt_updated = None

        # explore git log
        if self.git_is_valid:
            try:
                # only if dates have not been retrieved from page meta
                if not dt_created:
                    dt_created = self.repo.log(
                        in_page.file.abs_src_path,
                        n=1,
                        date="short",
                        format="%at",
                        diff_filter="AR",
                    )
                if not dt_updated:
                    dt_updated = self.repo.log(
                        in_page.file.abs_src_path,
                        n=1,
                        date="short",
                        format="%at",
                    )
            except GitCommandError as err:
                logging.warning(
                    "[rss-plugin] Unable to read git logs of '%s'. Is git log readable?"
                    " Falling back to build date. "
                    " Trace: %s" % (in_page.file.abs_src_path, err)
                )
            except GitCommandNotFound as err:
                logging.error(
                    "[rss-plugin] Unable to perform command 'git log'. Is git installed? "
                    " Falling back to build date. "
                    " Trace: %s" % err
                )
                # no need to try again for the next pages
                self.git_is_valid = 0

        # return results
        if all([dt_created, dt_updated]):
            return (
                int(dt_created),
                int(dt_updated),
            )

        logging.warning(
            "[rss-plugin] Dates could not be retrieved for page: %s."
            % in_page.file.abs_src_path
        )
        # same fallback value for both dates
        build_timestamp = get_build_timestamp()
        return (build_timestamp, build_timestamp)

    def get_authors_from_meta(self, in_page: "Page") -> Tuple[str] or None:
        """Returns authors from page meta. It handles 'author' and 'authors' for keys, \
        str and iterable as values types.

        :param in_page: page to look into
        :type in_page: Page

        :return: tuple of authors names, None if no key is set or if the value \
        type is not handled
        :rtype: Tuple[str] or None
        """
        # 'author' takes precedence over 'authors'; only the first key found is used
        for meta_key in ("author", "authors"):
            if meta_key not in in_page.meta:
                continue

            meta_value = in_page.meta.get(meta_key)
            if isinstance(meta_value, str):
                return (meta_value,)
            if isinstance(meta_value, (list, tuple)):
                return tuple(meta_value)

            # arguments are passed to the logger instead of being %-formatted with
            # a single value, which raised a TypeError (3 placeholders here)
            logging.warning(
                "[rss-plugin] Type of %s value in page.meta (%s) is not valid. "
                "It should be str, list or tuple, not: %s.",
                meta_key,
                in_page.file.abs_src_path,
                type(meta_value),
            )
            return None

        return None

    def get_categories_from_meta(
        self, in_page: "Page", categories_labels: Iterable
    ) -> tuple:
        """Returns category from page meta.

        :param in_page: input page to parse
        :type in_page: Page
        :param categories_labels: meta tags to look into
        :type categories_labels: Iterable

        :return: sorted list of found categories, None if no label is passed
        :rtype: list or None
        """
        if not categories_labels:
            return None

        output_categories = []
        for category_label in categories_labels:
            if category_label not in in_page.meta:
                continue
            meta_value = in_page.meta.get(category_label)
            if isinstance(meta_value, (list, tuple)):
                output_categories.extend(meta_value)
            elif isinstance(meta_value, str):
                output_categories.append(meta_value)
            # other value types are ignored

        return sorted(output_categories)

    def get_date_from_meta(
        self, date_metatag_value: str, meta_datetime_format: str
    ) -> float:
        """Get date from page.meta handling str with associated datetime format and \
        date already transformed by MkDocs.

        :param date_metatag_value: value of page.meta.{tag_for_date}
        :type date_metatag_value: str, date or datetime
        :param meta_datetime_format: expected format of datetime
        :type meta_datetime_format: str

        :return: datetime as timestamp. On failure, an error message (str) is \
        returned instead: callers have to check the type of the result.
        :rtype: float
        """
        out_date = None
        try:
            if isinstance(date_metatag_value, str):
                out_date = datetime.strptime(date_metatag_value, meta_datetime_format)
            elif isinstance(date_metatag_value, datetime):
                # datetime is a subclass of date, so it has to be checked first:
                # datetime.combine() would discard its time components
                out_date = date_metatag_value
            elif isinstance(date_metatag_value, date):
                out_date = datetime.combine(date_metatag_value, datetime.min.time())
            else:
                return "[rss-plugin] Incompatible date type."
        except ValueError as err:
            return "[rss-plugin] Incompatible date found. Trace: {}".format(err)
        except Exception as err:
            return "[rss-plugin] Unable to retrieve creation date. Trace: {}".format(
                err
            )
        return out_date.timestamp()

    def get_description_or_abstract(
        self, in_page: "Page", chars_count: int = 160
    ) -> str:
        """Returns description from page meta. If it doesn't exist, use the \
        {chars_count} first characters from page content (in markdown).

        :param Page in_page: page to look at
        :param int chars_count: if page.meta.description is not set, number of chars \
        of the content to use. A negative value means unlimited: the whole page \
        content (HTML) is returned. Defaults to: 160 - optional

        :return: page description to use
        :rtype: str
        """
        description = in_page.meta.get("description")
        # a negative count means unlimited
        unlimited = chars_count < 0

        # If the abstract chars is not unlimited and the description exists,
        # return the description.
        if description and not unlimited:
            return description

        # If chars count is unlimited, use the html content
        if unlimited and in_page.content:
            return in_page.content

        # Use markdown
        if in_page.markdown:
            # text which fits (including the exact length) is rendered untouched
            if unlimited or len(in_page.markdown) <= chars_count:
                return markdown.markdown(in_page.markdown, output_format="html5")
            # keep room for the ellipsis, without ever using a negative slice end
            # which would cut from the end of the text
            excerpt = in_page.markdown[: max(chars_count - 3, 0)]
            return markdown.markdown(
                "{}...".format(excerpt),
                output_format="html5",
            )

        # Unlimited chars_count but no content is found, then return the description.
        return description if description else ""

    def get_image(self, in_page: "Page", base_url: str) -> tuple:
        """Get image from page meta and returns properties.

        :param in_page: page to parse
        :type in_page: Page
        :param base_url: website URL to resolve absolute URLs for images referenced with local path.
        :type base_url: str

        :return: (image url, mime type, image length), None if the page has no image
        :rtype: tuple
        """
        # 'image' takes precedence over 'illustration'
        img_url = in_page.meta.get("image") or in_page.meta.get("illustration")
        if not img_url:
            return None

        # guess mimetype
        mime_type = guess_type(url=img_url, strict=False)[0]

        # if path, resolve absolute url
        if not img_url.startswith("http"):
            img_length = self.get_local_image_length(
                page_path=in_page.file.abs_src_path, path_to_append=img_url
            )
            img_url = self.build_url(base_url=base_url, path=img_url)
        else:
            img_length = self.get_remote_image_length(image_url=img_url)

        # return final tuple
        return (img_url, mime_type, img_length)

    def get_local_image_length(self, page_path: str, path_to_append: str) -> int:
        """Get the size of an image referenced with a path relative to the folder \
        of the page.

        :param page_path: path to the page source file
        :type page_path: str
        :param path_to_append: image path, relative to the folder containing the page
        :type path_to_append: str

        :return: image size in bytes, None if the file does not exist
        :rtype: int
        """
        image_path = Path(page_path).parent / Path(path_to_append)
        if not image_path.is_file():
            return None
        return image_path.stat().st_size

    def get_remote_image_length(
        self,
        image_url: str,
        http_method: str = "HEAD",
        attempt: int = 0,
        ssl_context: ssl.SSLContext = None,
    ) -> int:
        """Retrieve length for remote images (starting with 'http' \
        in meta.image or meta.illustration). \
        It tries to perform a HEAD request and get the length from the headers. \
        If it fails, it tries again with a GET and disabling SSL verification.

        :param image_url: remote image URL
        :type image_url: str
        :param http_method: HTTP method used to perform request, defaults to "HEAD"
        :type http_method: str, optional
        :param attempt: request tries counter, defaults to 0
        :type attempt: int, optional
        :param ssl_context: SSL context, defaults to None
        :type ssl_context: ssl.SSLContext, optional

        :return: image length in bytes, None if the image is not reachable or if \
        the server does not return a usable Content-Length header
        :rtype: int
        """
        # prepare request
        req = request.Request(
            image_url,
            method=http_method,
            headers=REMOTE_REQUEST_HEADERS,
        )
        attempt += 1

        # first, try HEAD request to avoid downloading the image
        try:
            # context manager: the connection is released once headers are read
            with request.urlopen(url=req, context=ssl_context) as remote_img:
                img_length = remote_img.getheader("content-length")
        except (HTTPError, URLError) as err:
            logging.warning(
                "[rss-plugin] Remote image could not been reached: {}. "
                "Trying again with GET and disabling SSL verification. Attempt: {}. "
                "Trace: {}".format(image_url, attempt, err)
            )
            if attempt < 2:
                # NOTE: certificate verification is disabled on purpose for the
                # retry. Only the response headers are read, not the body.
                return self.get_remote_image_length(
                    image_url,
                    http_method="GET",
                    attempt=attempt,
                    ssl_context=ssl._create_unverified_context(),
                )
            logging.error(
                "[rss-plugin] Remote image is not reachable: {} after {} attempts. "
                " Trace: {}".format(
                    image_url,
                    attempt,
                    err,
                )
            )
            return None

        # the header is optional: int(None) would raise a TypeError
        if img_length is None:
            return None
        try:
            return int(img_length)
        except ValueError:
            logging.warning(
                "[rss-plugin] Invalid Content-Length header for remote image: %s",
                image_url,
            )
            return None

    @staticmethod
    def get_site_url(mkdocs_config: "Config") -> str or None:
        """Extract site URL from MkDocs configuration and enforce the behavior to ensure \
        returning a str with length > 0 or None. If exists, it adds an ending slash.

        :param mkdocs_config: configuration object
        :type mkdocs_config: Config

        :return: site url
        :rtype: str or None
        """
        # this method exists because the following line returns an empty string
        # instead of None (because the key always exists)
        defined_site_url = mkdocs_config.get("site_url", None)

        # None (in case of mkdocs's behavior change) or empty string
        if not defined_site_url:
            return None

        # handle trailing slash
        if defined_site_url.endswith("/"):
            return defined_site_url
        return defined_site_url + "/"

    @staticmethod
    def guess_locale(mkdocs_config: "Config") -> str or None:
        """Extract language code from MkDocs or Theme configuration.

        :param mkdocs_config: configuration object
        :type mkdocs_config: Config

        :return: language code
        :rtype: str or None
        """
        # MkDocs locale settings - might be added in future mkdocs versions
        # see: https://github.com/timvink/mkdocs-git-revision-date-localized-plugin/issues/24
        if mkdocs_config.get("locale"):
            return mkdocs_config.get("locale")

        # Some themes implement a locale or a language setting
        if "theme" not in mkdocs_config:
            return None
        theme = mkdocs_config.get("theme")
        for theme_setting in ("locale", "language"):
            if theme_setting in theme:
                return theme._vars.get(theme_setting)
        return None

    @staticmethod
    def filter_pages(pages: list, attribute: str, length: int) -> list:
        """Filter and return pages into a friendly RSS structure.

        :param pages: pages to filter
        :type pages: list
        :param attribute: page attribute as filter variable
        :type attribute: str
        :param length: max number of pages to return
        :type length: int

        :return: list of filtered pages
        :rtype: list
        """
        # highest attribute values first, truncated to the requested length
        selected_pages = sorted(
            pages, key=lambda page: getattr(page, attribute), reverse=True
        )[:length]

        return [
            {
                "authors": page.authors,
                "categories": page.categories,
                "comments_url": page.url_comments,
                "description": page.description,
                "guid": page.guid,
                "image": page.image,
                "link": page.url_full,
                "pubDate": formatdate(getattr(page, attribute)),
                "title": page.title,
            }
            for page in selected_pages
        ]