Merge pull request #1647 from auouymous/build-mac-on-github
[gpodder.git] / src / gpodder / youtube.py
blobb6bbac79c5fe2ac9a98cdb159269f69b0bdceaaf
# -*- coding: utf-8 -*-
#
# gPodder - A media aggregator and podcast client
# Copyright (c) 2005-2018 The gPodder Team
#
# gPodder is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# gPodder is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# gpodder.youtube - YouTube and related magic
# Justin Forest <justin.forest@gmail.com> 2008-10-13
#
23 import io
24 import json
25 import logging
26 import re
27 import urllib
28 import xml.etree.ElementTree
29 from functools import lru_cache
30 from html.parser import HTMLParser
31 from urllib.parse import parse_qs
33 import gpodder
34 from gpodder import registry, util
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Short alias for gPodder's gettext function; used in this module to wrap
# user-facing strings.
_ = gpodder.gettext
# http://en.wikipedia.org/wiki/YouTube#Quality_and_formats
# https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py#L447

# adaptive audio formats
#   140  MP4   128k
#   251  WebM  160k
#   250  WebM  70k
#   249  WebM  50k

# Fallback chains of format ids, best candidate first.  Integer entries are
# plain itags; string entries such as '136+140' are "video+audio" pairs.
#
# NOTE: the statement order below is significant.  ``a = [...] + b`` copies the
# *current* contents of ``b`` into a new list, whereas ``a += b`` extends ``a``
# in place, so every list that is later stored in ``formats`` sees the
# extension.  Reordering these lines changes the resulting chains.

# formats and fallbacks of same quality: WebM -> MP4 -> FLV
flv_240 = [5]
flv_270 = [6]
flv_360 = [34]
flv_480 = [35]
mp4_144 = ['160+140']
mp4_240 = ['133+140'] + flv_240
mp4_360 = [18, '134+140'] + flv_360
mp4_480 = ['135+140'] + flv_480
mp4_720 = [22, '136+140']
mp4_1080 = [37, '137+140']
mp4_1440 = ['264+140']
mp4_2160 = ['266+140']
mp4_3072 = [38]
mp4_4320 = ['138+140']
webm_144 = ['278+250'] + mp4_144
webm_240 = ['242+250'] + mp4_240
webm_360 = [43, '243+251'] + mp4_360
webm_480 = [44, '244+251'] + mp4_480
webm_720 = [45, '247+251'] + mp4_720
webm_1080 = [46, '248+251'] + mp4_1080
webm_1440 = ['271+251'] + mp4_1440
webm_2160 = ['313+251'] + mp4_2160
webm_4320 = ['272+251'] + mp4_4320
# fallbacks to lower quality (in-place extension, see NOTE above)
webm_240 += webm_144
webm_360 += flv_270 + webm_240
webm_480 += webm_360
webm_720 += webm_480
webm_1080 += webm_720
webm_1440 += webm_1080
webm_2160 += webm_1440
webm_4320 += mp4_3072 + webm_2160
mp4_240 += mp4_144
mp4_360 += flv_270 + mp4_240
mp4_480 += mp4_360
mp4_720 += mp4_480
mp4_1080 += mp4_720
mp4_1440 += mp4_1080
mp4_2160 += mp4_1440
mp4_3072 += mp4_2160
mp4_4320 += mp4_3072
flv_270 += flv_240
flv_360 += flv_270
flv_480 += flv_360

# format id, (preferred ids, path(?), description) # video bitrate, audio bitrate
formats = [
    # WebM VP8, VP9 or VP9 HFR video, Vorbis or Opus audio
    # Fallback to MP4 or FLV
    (272, (webm_4320, '272/7680x4320/99/0/0', 'WebM 4320p 8K (7680x4320) youtube-dl')),  # N/A, 160 kbps
    (313, (webm_2160, '313/3840x2160/99/0/0', 'WebM 2160p 4K (3840x2160) youtube-dl')),  # N/A, 160 kbps
    (271, (webm_1440, '271/2560x1440/99/0/0', 'WebM 1440p (2560x1440) youtube-dl')),  # N/A, 160 kbps
    (46, (webm_1080, '46/1920x1080/99/0/0', 'WebM 1080p (1920x1080) youtube-dl')),  # N/A, 192 kbps
    (45, (webm_720, '45/1280x720/99/0/0', 'WebM 720p (1280x720) youtube-dl')),  # 2.0 Mbps, 192 kbps
    (44, (webm_480, '44/854x480/99/0/0', 'WebM 480p (854x480) youtube-dl')),  # 1.0 Mbps, 128 kbps
    (43, (webm_360, '43/640x360/99/0/0', 'WebM 360p (640x360)')),  # 0.5 Mbps, 128 kbps
    (242, (webm_240, '242/426x240/99/0/0', 'WebM 240p (426x240) youtube-dl')),  # N/A, 70 kbps
    (278, (webm_144, '278/256x144/99/0/0', 'WebM 144p (256x144) youtube-dl')),  # N/A, 70 kbps

    # MP4 H.264 video, AAC audio
    # Fallback to FLV
    (138, (mp4_4320, '138/7680x4320/9/0/115', 'MP4 4320p 8K (7680x4320) youtube-dl')),  # N/A, 128 kbps
    (38, (mp4_3072, '38/4096x3072/9/0/115', 'MP4 3072p 4K (4096x3072)')),  # 5.0 - 3.5 Mbps, 192 kbps
    (266, (mp4_2160, '266/3840x2160/9/0/115', 'MP4 2160p 4K (3840x2160) youtube-dl')),  # N/A, 128 kbps
    (264, (mp4_1440, '264/2560x1440/9/0/115', 'MP4 1440p (2560x1440) youtube-dl')),  # N/A, 128 kbps
    (37, (mp4_1080, '37/1920x1080/9/0/115', 'MP4 1080p (1920x1080) youtube-dl')),  # 4.3 - 3.0 Mbps, 192 kbps
    (22, (mp4_720, '22/1280x720/9/0/115', 'MP4 720p (1280x720)')),  # 2.9 - 2.0 Mbps, 192 kbps
    (135, (mp4_480, '135/854x480/9/0/115', 'MP4 480p (854x480) youtube-dl')),  # N/A, 128 kbps
    (18, (mp4_360, '18/640x360/9/0/115', 'MP4 360p (640x360)')),  # 0.5 Mbps, 96 kbps
    (133, (mp4_240, '133/426x240/9/0/115', 'MP4 240p (426x240) youtube-dl')),  # N/A, 128 kbps
    (160, (mp4_144, '160/256x144/9/0/115', 'MP4 144p (256x144) youtube-dl')),  # N/A, 128 kbps

    # FLV H.264 video, AAC audio
    # Fallback to FLV 6 or 5
    (35, (flv_480, '35/854x480/9/0/115', 'FLV 480p (854x480)')),  # 1 - 0.80 Mbps, 128 kbps
    (34, (flv_360, '34/640x360/9/0/115', 'FLV 360p (640x360)')),  # 0.50 Mbps, 128 kbps

    # FLV Sorenson H.263 video, MP3 audio
    (6, (flv_270, '6/480x270/7/0/0', 'FLV 270p (480x270)')),  # 0.80 Mbps, 64 kbps
    (5, (flv_240, '5/320x240/7/0/0', 'FLV 240p (320x240)')),  # 0.25 Mbps, 64 kbps
]
# Lookup by format id -> (preferred ids, path, description).
formats_dict = dict(formats)
# streaming formats and fallbacks to lower quality
# Each chain is built with ``+``, so it is an independent copy that lists the
# requested HLS format id first and every lower quality after it.
hls_144 = [91]
hls_240 = [92] + hls_144
hls_360 = [93] + hls_240
hls_480 = [94] + hls_360
hls_720 = [95] + hls_480
hls_1080 = [96] + hls_720
# format id, (preferred ids, path(?), description) # video bitrate, audio bitrate
hls_formats = [
    (96, (hls_1080, '9/1920x1080/9/0/115', 'MP4 1080p (1920x1080)')),  # N/A, 256 kbps
    (95, (hls_720, '9/1280x720/9/0/115', 'MP4 720p (1280x720)')),  # N/A, 256 kbps
    (94, (hls_480, '9/854x480/9/0/115', 'MP4 480p (854x480)')),  # N/A, 128 kbps
    (93, (hls_360, '9/640x360/9/0/115', 'MP4 360p (640x360)')),  # N/A, 128 kbps
    (92, (hls_240, '9/426x240/9/0/115', 'MP4 240p (426x240)')),  # N/A, 48 kbps
    (91, (hls_144, '9/256x144/9/0/115', 'MP4 144p (256x144)')),  # N/A, 48 kbps
]
# Lookup by HLS format id -> (preferred ids, path, description).
hls_formats_dict = dict(hls_formats)
# URL of the YouTube video feed endpoint (not referenced elsewhere in the
# visible part of this module).
CHANNEL_VIDEOS_XML = 'https://www.youtube.com/feeds/videos.xml'
# Watch page URL prefix; callers append the video id to it.
WATCH_ENDPOINT = 'https://www.youtube.com/watch?bpctr=9999999999&has_verified=1&v='

# The page may contain "};" sequences inside the initial player response.
# Use a greedy match with script end tag, and fallback to a non-greedy match without.
# Group 1 of either pattern is the JSON object text assigned to
# ytInitialPlayerResponse.
INITIAL_PLAYER_RESPONSE_RE1 = r'ytInitialPlayerResponse\s*=\s*({.+})\s*;\s*</script'
INITIAL_PLAYER_RESPONSE_RE2 = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
def get_ipr(page):
    """Locate the ``ytInitialPlayerResponse`` assignment in a watch page.

    The greedy pattern is tried first and the non-greedy one only if it finds
    nothing.

    @param page: HTML text of the page
    @return: the ``re`` match object (group 1 is the JSON text), or None
    """
    patterns = (INITIAL_PLAYER_RESPONSE_RE1, INITIAL_PLAYER_RESPONSE_RE2)
    # Lazy generator: the second search only runs if the first one misses.
    attempts = (re.search(pattern, page) for pattern in patterns)
    return next((found for found in attempts if found is not None), None)
class YouTubeError(Exception):
    """Raised by this module when a YouTube page or stream cannot be resolved."""
def get_fmt_ids(youtube_config, allow_partial):
    """Build the ordered list of format ids to try for a download.

    @param youtube_config: object with the attributes ``preferred_fmt_id``,
        ``preferred_fmt_ids``, ``preferred_hls_fmt_id`` and
        ``preferred_hls_fmt_ids``; an id of 0 selects the matching custom list
    @param allow_partial: when true, the HLS format ids are appended
    @return: regular format ids followed by the HLS format ids
    """
    hls_fmt_ids = []
    if allow_partial:
        if youtube_config.preferred_hls_fmt_id == 0:
            hls_fmt_ids = youtube_config.preferred_hls_fmt_ids or []
        else:
            hls_entry = hls_formats_dict.get(youtube_config.preferred_hls_fmt_id)
            if hls_entry is not None:
                hls_fmt_ids, _path, _description = hls_entry

    if youtube_config.preferred_fmt_id == 0:
        custom_ids = youtube_config.preferred_fmt_ids
        if custom_ids:
            return custom_ids + hls_fmt_ids
        return hls_fmt_ids

    entry = formats_dict.get(youtube_config.preferred_fmt_id)
    if entry is None:
        # Unknown format id: only the HLS candidates remain.
        return hls_fmt_ids
    fmt_ids, _path, _description = entry
    return fmt_ids + hls_fmt_ids
@registry.download_url.register
def youtube_real_download_url(config, episode, allow_partial):
    """Download-URL resolver registered with ``registry.download_url``.

    Stores the duration on ``episode.total_time`` when one is reported (the
    value is divided by 1000, i.e. treated as milliseconds).

    @param config: configuration object with a ``youtube`` section, or a falsy
        value to use the default format preference
    @param episode: object with a ``url`` attribute
    @param allow_partial: passed through to the format selection and lookup
    @return: the resolved URL, or None if it equals ``episode.url``
    """
    fmt_ids = None
    if config:
        fmt_ids = get_fmt_ids(config.youtube, allow_partial)

    real_url, duration_ms = get_real_download_url(episode.url, allow_partial, fmt_ids)
    if duration_ms is not None:
        episode.total_time = int(int(duration_ms) / 1000)

    if real_url == episode.url:
        return None
    return real_url
def youtube_get_old_endpoint(vid):
    """Fetch video information from the ``get_video_info`` endpoint.

    @param vid: YouTube video id
    @return: tuple ``(response text, None)``
    @raise YouTubeError: if the HTTP response is not OK
    """
    # TODO: changing 'detailpage' to 'embedded' allows age-restricted content
    info_url = 'https://www.youtube.com/get_video_info?html5=1&c=TVHTML5&cver=6.20180913&el=detailpage&video_id=' + vid
    response = util.urlopen(info_url)
    if response.ok:
        return response.text, None
    raise YouTubeError('Youtube "%s": %d %s' % (info_url, response.status_code, response.reason))
def youtube_get_new_endpoint(vid):
    """Fetch the watch page and extract the initial player response JSON.

    If the first page has no player response, the GDPR consent URL is derived
    from that page and fetched instead.

    @param vid: YouTube video id
    @return: tuple ``(None, player response JSON text)``
    @raise YouTubeError: on a failed HTTP response or if no player response
        can be found
    """
    watch_url = WATCH_ENDPOINT + vid
    response = util.urlopen(watch_url)
    if not response.ok:
        raise YouTubeError('Youtube "%s": %d %s' % (watch_url, response.status_code, response.reason))

    found = get_ipr(response.text)
    if found is not None:
        return None, found.group(1)

    # No player response on the first page: retry through the consent URL.
    try:
        consent_url = get_gdpr_consent_url(response.text)
    except YouTubeError as e:
        raise YouTubeError('Youtube "%s": No ytInitialPlayerResponse found and %s' % (watch_url, str(e)))

    response = util.urlopen(consent_url)
    if not response.ok:
        raise YouTubeError('Youtube "%s": %d %s' % (consent_url, response.status_code, response.reason))

    found = get_ipr(response.text)
    if found is None:
        raise YouTubeError('Youtube "%s": No ytInitialPlayerResponse found' % consent_url)

    return None, found.group(1)
def get_total_time(episode):
    """Return the length of the episode's video in seconds.

    This is a best-effort lookup: any failure (unknown URL, network error,
    missing consent form, unexpected page layout) yields 0 instead of an
    exception.

    @param episode: object with a ``url`` attribute
    @return: video length in seconds, or 0 if it cannot be determined
    """
    try:
        vid = get_youtube_id(episode.url)
        if vid is None:
            return 0

        # Same watch-page / GDPR-consent flow as the download path; it raises
        # YouTubeError on failure, which is turned into 0 below.
        _old_page, new_page = youtube_get_new_endpoint(vid)

        player_response = json.loads(new_page)
        return int(player_response['videoDetails']['lengthSeconds'])  # 0 if live
    except Exception:
        # Not a bare ``except:`` so that KeyboardInterrupt and SystemExit
        # still propagate.
        logger.debug('Could not determine total time for %s',
                     getattr(episode, 'url', None), exc_info=True)
        return 0
def get_real_download_url(url, allow_partial, preferred_fmt_ids=None):
    """Resolve a YouTube URL to the media URL of the best preferred format.

    @param url: episode URL; if no video id can be extracted from it, the URL
        is returned unchanged
    @param allow_partial: when true, a running live stream is resolved through
        its HLS manifest instead of being rejected
    @param preferred_fmt_ids: format ids to try in order; falsy selects the
        MP4 720p fallback chain.  Non-numeric entries are skipped.
    @return: tuple ``(url, duration)`` where duration is the format's
        ``approxDurationMs`` value or None
    @raise YouTubeError: if the video cannot be played, uses DRM, or offers
        none of the preferred formats
    """
    if not preferred_fmt_ids:
        preferred_fmt_ids, _, _ = formats_dict[22]  # MP4 720p

    duration = None

    vid = get_youtube_id(url)
    if vid is not None:
        try:
            old_page, new_page = youtube_get_new_endpoint(vid)
        except YouTubeError as e:
            logger.info(str(e))
            old_page, new_page = youtube_get_old_endpoint(vid)

        def find_urls(old_page, new_page):
            """Yield ``(itag, [url, duration])`` for every available format."""
            # streamingData is preferable to url_encoded_fmt_stream_map
            # streamingData.formats are the same as url_encoded_fmt_stream_map
            # streamingData.adaptiveFormats are audio-only and video-only formats

            x = parse_qs(old_page) if old_page else json.loads(new_page)
            player_response = json.loads(x['player_response'][0]) if old_page and 'player_response' in x else x
            error_message = None

            if 'reason' in x:
                # TODO: unknown if this is valid for new_page
                error_message = util.remove_html_tags(x['reason'][0])
            elif 'playabilityStatus' in player_response:
                playabilityStatus = player_response['playabilityStatus']

                if 'reason' in playabilityStatus:
                    error_message = util.remove_html_tags(playabilityStatus['reason'])
                elif 'liveStreamability' in playabilityStatus \
                        and not playabilityStatus['liveStreamability'].get('liveStreamabilityRenderer', {}).get('displayEndscreen', False):
                    # playabilityStatus.liveStreamability -- video is or was a live stream
                    # playabilityStatus.liveStreamability.liveStreamabilityRenderer.displayEndscreen -- video has ended if present

                    if allow_partial and 'streamingData' in player_response and 'hlsManifestUrl' in player_response['streamingData']:
                        r = util.urlopen(player_response['streamingData']['hlsManifestUrl'])
                        if not r.ok:
                            raise YouTubeError('HLS Manifest: %d %s' % (r.status_code, r.reason))

                        itag_re = re.compile(r'/itag/([0-9]+)/')
                        for line in r.text.splitlines():
                            stream_url = line.strip()
                            # Blank lines are legal in a playlist and '#'
                            # lines are tags/comments; neither is a stream
                            # URL (indexing line[0] used to crash on blanks).
                            if not stream_url or stream_url.startswith('#'):
                                continue
                            itag = itag_re.search(stream_url)
                            if itag is None:
                                # URL without an itag cannot be mapped to a format id
                                continue
                            yield int(itag.group(1)), [stream_url, None]
                        return

                    error_message = 'live stream'
                elif 'streamingData' in player_response:
                    if 'formats' in player_response['streamingData']:
                        for f in player_response['streamingData']['formats']:
                            if 'url' in f:  # DRM videos store url inside a signatureCipher key
                                yield int(f['itag']), [f['url'], f.get('approxDurationMs')]
                    if 'adaptiveFormats' in player_response['streamingData']:
                        for f in player_response['streamingData']['adaptiveFormats']:
                            if 'url' in f:  # DRM videos store url inside a signatureCipher key
                                yield int(f['itag']), [f['url'], f.get('approxDurationMs')]
                    return

            if error_message is not None:
                raise YouTubeError(('Cannot stream video: %s' if allow_partial else 'Cannot download video: %s') % error_message)

            if old_page:
                r4 = re.search(r'url_encoded_fmt_stream_map=([^&]+)', old_page)
                if r4 is not None:
                    fmt_url_map = urllib.parse.unquote(r4.group(1))
                    for fmt_url_encoded in fmt_url_map.split(','):
                        video_info = parse_qs(fmt_url_encoded)
                        yield int(video_info['itag'][0]), [video_info['url'][0], None]

        fmt_id_url_map = sorted(find_urls(old_page, new_page), reverse=True)

        if not fmt_id_url_map:
            # Distinguish "nothing offered" from "only ciphered (DRM) URLs offered".
            drm = re.search(r'(%22(cipher|signatureCipher)%22%3A|"signatureCipher":)', old_page or new_page)
            if drm is not None:
                raise YouTubeError('Unsupported DRM content')
            raise YouTubeError('No formats found')

        formats_available = {fmt_id for fmt_id, url in fmt_id_url_map}
        fmt_id_url_map = dict(fmt_id_url_map)

        for fmt_id in preferred_fmt_ids:
            if not re.search(r'^[0-9]+$', str(fmt_id)):
                # skip non-integer formats 'best', '136+140' or twitch '720p'
                continue
            fmt_id = int(fmt_id)
            if fmt_id in formats_available:
                fmt = formats_dict.get(fmt_id) or hls_formats_dict.get(fmt_id)
                if fmt is not None:
                    _, _, description = fmt
                else:
                    description = 'Unknown'

                logger.info('Found YouTube format: %s (fmt_id=%d)',
                            description, fmt_id)
                url, duration = fmt_id_url_map[fmt_id]
                break
        else:
            raise YouTubeError('No preferred formats found')

    return url, duration
@lru_cache(1)
def get_youtube_id(url):
    """Extract the video id from a YouTube URL.

    Recognizes ``/watch?v=ID``, ``/v/ID?...`` and ``/v/ID.swf`` URLs.  If none
    of them match, the feed/channel patterns are consulted and the matched
    channel or user name is returned instead.

    @param url: URL to inspect
    @return: the extracted id, or None if nothing matches
    """
    video_patterns = (
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/watch\?v=([^&]*)',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/v/(.*)[?]',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/v/(.*)\.swf',
    )
    for pattern in video_patterns:
        found = re.match(pattern, url, re.IGNORECASE)
        if found is not None:
            return found.group(1)

    return for_each_feed_pattern(lambda url, channel: channel, url, None)
def is_video_link(url):
    """Return True if ``get_youtube_id`` can extract an id from ``url``."""
    extracted_id = get_youtube_id(url)
    return extracted_id is not None
def is_youtube_guid(guid):
    """Return True if ``guid`` starts with the YouTube video tag prefix."""
    youtube_tag_prefix = 'tag:youtube.com,2008:video:'
    return guid.startswith(youtube_tag_prefix)
def for_each_feed_pattern(func, url, fallback_result):
    """
    Try to find the username for all possible YouTube feed/webpage URLs
    Will call func(url, channel) for each match, and if func() returns
    a result other than None, returns this. If no match is found or
    func() returns None, return fallback_result.

    @param func: callable taking (url, channel)
    @param url: URL to match against the known feed/channel patterns
    @param fallback_result: value returned when no call produces a result
    """
    # Patterns are matched case-insensitively from the start of the URL;
    # group 1 is the user or channel name.  Literal '?' and '.' are escaped:
    # an unescaped '?' after "profile" made the 'e' optional instead of
    # matching the query separator, so that pattern could never match.
    CHANNEL_MATCH_PATTERNS = [
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/user/([a-z0-9]+)',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/profile\?user=([a-z0-9]+)',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/rss/user/([a-z0-9]+)/videos\.rss',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/channel/([-_a-z0-9]+)',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/feeds/videos\.xml\?user=([a-z0-9]+)',
        r'http[s]?://(?:[a-z]+\.)?youtube\.com/feeds/videos\.xml\?channel_id=([-_a-z0-9]+)',
        r'http[s]?://gdata\.youtube\.com/feeds/users/([^/]+)/uploads',
        r'http[s]?://gdata\.youtube\.com/feeds/base/users/([^/]+)/uploads',
    ]

    for pattern in CHANNEL_MATCH_PATTERNS:
        m = re.match(pattern, url, re.IGNORECASE)
        if m is not None:
            result = func(url, m.group(1))
            if result is not None:
                return result

    return fallback_result
def get_real_channel_url(url):
    """Map a recognized YouTube user/channel URL to its gdata uploads feed URL.

    @param url: URL to resolve
    @return: the uploads feed URL, or ``url`` unchanged if no pattern matches
    """
    def build_uploads_feed(matched_url, channel):
        feed_url = 'https://gdata.youtube.com/feeds/users/{0}/uploads'.format(channel)
        logger.debug('YouTube link resolved: %s => %s', matched_url, feed_url)
        return feed_url

    return for_each_feed_pattern(build_uploads_feed, url, url)
@lru_cache(1)
def get_channel_id_url(url, feed_data=None):
    """Resolve a YouTube URL to its ``https://www.youtube.com/channel/<id>`` URL.

    The channel id is taken from the URL itself when it contains
    ``channel_id=``; otherwise it is searched for in ``feed_data`` or, if that
    is None, in the page fetched from ``url``.

    @param url: YouTube feed or page URL
    @param feed_data: optional already-fetched response (``text`` and
        ``content`` attributes are read); must be hashable because of the
        ``lru_cache`` decorator
    @return: the channel URL
    @raise Exception: if the channel id cannot be determined
    """
    # NOTE(review): the source's indentation was lost; the final raise is
    # placed at function level, so non-YouTube URLs raise as well -- confirm.
    if 'youtube.com' in url:
        # URL may contain channel ID, avoid a network request
        url_match = re.search(r'channel_id=([^"]+)', url)
        if url_match:
            # old versions of gpodder allowed newlines and whitespace in feed URLs, strip here to avoid a 404
            return 'https://www.youtube.com/channel/{}'.format(url_match.group(1).strip())

        try:
            if feed_data is not None:
                response = feed_data
            else:
                response = util.urlopen(url, cookies={'SOCS': 'CAI'})
                if not response.ok:
                    raise YouTubeError('Youtube "%s": %d %s' % (url, response.status_code, response.reason))

            # video page may contain corrupt HTML/XML, search for tag to avoid exception
            page_match = re.search(r'(channel_id=([^"]+)">|"channelId":"([^"]+)")', response.text)
            if page_match:
                channel_id = page_match.group(2) or page_match.group(3)
            else:
                feed_tree = xml.etree.ElementTree.parse(io.BytesIO(response.content))
                channel_id = feed_tree.find("{http://www.youtube.com/xml/schemas/2015}channelId").text
                if channel_id is None:
                    # check entries if feed has an empty channelId
                    entry_match = re.search(r'<yt:channelId>([^<]+)</yt:channelId>', response.text)
                    if entry_match:
                        channel_id = entry_match.group(1)

            if channel_id is None:
                # Caught by the handler below, which logs and falls through.
                raise Exception('Could not retrieve YouTube channel ID for URL %s.' % url)

            # feeds no longer contain the required "UC" prefix on channel ID
            if len(channel_id) == 22:
                channel_id = "UC" + channel_id
            return 'https://www.youtube.com/channel/{}'.format(channel_id)

        except Exception:
            logger.warning('Could not retrieve YouTube channel ID for URL %s.' % url, exc_info=True)

    raise Exception('Could not retrieve YouTube channel ID for URL %s.' % url)
def get_cover(url, feed_data=None):
    """Find the cover art URL of a YouTube channel.

    @param url: feed or channel URL; anything not containing 'youtube.com'
        is ignored
    @param feed_data: optional response passed on to ``get_channel_id_url``
    @return: the first image URL found on the channel page, or None if the
        URL is not a YouTube URL, nothing is found, or the lookup fails
    """
    if 'youtube.com' not in url:
        return None

    class YouTubeHTMLCoverParser(HTMLParser):
        """This custom html parser searches for the youtube channel thumbnail/avatar"""

        def __init__(self):
            super().__init__()
            self.url = []

        def handle_starttag(self, tag, attributes):
            attrs = dict(attributes)

            # Look for 900x900px image first.
            if tag == 'link' and attrs.get('rel') == 'image_src':
                self.url.append(attrs['href'])

            # Fallback to image that may only be 100x100px.
            elif tag == 'img' and attrs.get('class') == "channel-header-profile-image":
                self.url.append(attrs['src'])

    try:
        channel_url = get_channel_id_url(url, feed_data)
        response = util.urlopen(channel_url)
        if not response.ok:
            raise YouTubeError('Youtube "%s": %d %s' % (url, response.status_code, response.reason))
        cover_parser = YouTubeHTMLCoverParser()
        cover_parser.feed(util.response_text(response))
        if cover_parser.url:
            logger.debug('Youtube cover art for {} is: {}'.format(url, cover_parser.url))
            return cover_parser.url[0]

    except Exception:
        # Best effort: a missing cover is logged, not raised.
        logger.warning('Could not retrieve cover art', exc_info=True)

    return None
def get_gdpr_consent_url(html_data):
    """
    Creates the URL for automatically accepting GDPR consents
    EU GDPR redirects to a form that needs to be posted to be redirected to a get request
    with the form data as input to the youtube video URL. This extracts that form data from
    the GDPR form and builds up the URL the posted form results.

    @param html_data: HTML text of the page that may contain the consent form
    @return: the consent URL with the form's hidden fields as query parameters
    @raise YouTubeError: if parsing fails or no consent form is found
    """
    class ConsentHTML(HTMLParser):
        def __init__(self):
            super().__init__()
            self.url = ''
            self.consentForm = False

        def handle_starttag(self, tag, attributes):
            attribute_dict = dict(attributes)
            # .get() is used throughout: unrelated forms without an "action"
            # and inputs without "type"/"name" must not abort the whole parse.
            if tag == 'form' and attribute_dict.get('action') == 'https://consent.youtube.com/s':
                self.consentForm = True
                self.url = 'https://consent.google.com/s?'
            # Get GDPR form elements
            if self.consentForm and tag == 'input' and attribute_dict.get('type') == 'hidden':
                name = attribute_dict.get('name')
                if name:
                    # A valueless attribute is reported as None; send it empty.
                    value = attribute_dict.get('value') or ''
                    self.url += '&' + name + '=' + urllib.parse.quote_plus(value)

        def handle_endtag(self, tag):
            if tag == 'form':
                self.consentForm = False

    try:
        parser = ConsentHTML()
        parser.feed(html_data)
    except Exception:
        raise YouTubeError('Could not retrieve GDPR accepted consent URL')

    if parser.url:
        logger.debug('YouTube GDPR accept consent URL is: %s', parser.url)
        return parser.url
    else:
        logger.debug('YouTube GDPR accepted consent URL could not be resolved.')
        raise YouTubeError('No acceptable GDPR consent URL')
def get_channel_desc(url, feed_data=None):
    """Find the description of a YouTube channel.

    @param url: feed or channel URL; anything not containing 'youtube.com'
        is ignored
    @param feed_data: optional response passed on to ``get_channel_id_url``
    @return: the content of the page's ``<meta name="description">`` tag, a
        translated placeholder if it is empty, or None if the URL is not a
        YouTube URL or the lookup fails
    """
    if 'youtube.com' not in url:
        return None

    class YouTubeHTMLDesc(HTMLParser):
        """This custom html parser searches for the YouTube channel description."""

        def __init__(self):
            super().__init__()
            self.description = ''

        def handle_starttag(self, tag, attributes):
            attrs = dict(attributes)

            # Get YouTube channel description.
            if tag == 'meta' and attrs.get('name') == "description":
                self.description = attrs['content']

    try:
        channel_url = get_channel_id_url(url, feed_data)
        response = util.urlopen(channel_url)
        if not response.ok:
            raise YouTubeError('Youtube "%s": %d %s' % (url, response.status_code, response.reason))
        desc_parser = YouTubeHTMLDesc()
        desc_parser.feed(util.response_text(response))
        if not desc_parser.description:
            logger.debug('YouTube description for %s is not provided.', url)
            return _('No description available')
        logger.debug('YouTube description for %s is: %s', url, desc_parser.description)
        return desc_parser.description

    except Exception:
        # Best effort: a missing description is logged, not raised.
        logger.warning('Could not retrieve YouTube channel description for %s.' % url, exc_info=True)

    return None
def parse_youtube_url(url):
    """
    Youtube Channel Links are parsed into youtube feed links
    >>> parse_youtube_url("https://www.youtube.com/channel/CHANNEL_ID")
    'https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID'

    Youtube User Links are parsed into youtube feed links
    >>> parse_youtube_url("https://www.youtube.com/user/USERNAME")
    'https://www.youtube.com/feeds/videos.xml?user=USERNAME'

    Youtube Playlist Links are parsed into youtube feed links
    >>> parse_youtube_url("https://www.youtube.com/playlist?list=PLAYLIST_ID")
    'https://www.youtube.com/feeds/videos.xml?playlist_id=PLAYLIST_ID'

    >>> parse_youtube_url(None) is None
    True

    @param url: the path to the channel, user or playlist
    @return: the feed url if successful or the given url if not
    """
    if url is None:
        return url
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
    logger.debug("Analyzing URL: {}".format(" ".join([scheme, netloc, path, query, fragment])))

    def playlist_params(query_string):
        # Only parameters literally named "list" count; a substring test would
        # also pick up e.g. "mylist=..." and slice the wrong characters off.
        return [param for param in query_string.split('&') if param.startswith('list=')]

    if 'youtube.com' in netloc:
        # Already a feed URL: nothing to do.
        if path == '/feeds/videos.xml' and re.search(r'^(user|channel|playlist)_id=.*', query):
            return url

        if '/user/' in path or '/channel/' in path or playlist_params(query):
            logger.debug("Valid Youtube URL detected. Parsing...")

            if path.startswith('/user/'):
                user_id = path.split('/')[2]
                query = 'user={user_id}'.format(user_id=user_id)

            if path.startswith('/channel/'):
                channel_id = path.split('/')[2]
                query = 'channel_id={channel_id}'.format(channel_id=channel_id)

            # Checked against the possibly rewritten query, so a user or
            # channel path takes precedence over a "list" parameter.
            playlists = playlist_params(query)
            if playlists:
                playlist_id = playlists[0][len('list='):]
                query = 'playlist_id={playlist_id}'.format(playlist_id=playlist_id)

            path = '/feeds/videos.xml'

            new_url = urllib.parse.urlunsplit((scheme, netloc, path, query, fragment))
            logger.debug("New Youtube URL: {}".format(new_url))
            return new_url

        # look for channel URL in page
        logger.debug("Unknown Youtube URL, trying to extract channel ID...")
        new_url = get_channel_id_url(url)
        if new_url:
            logger.debug("New Youtube URL: {}".format(new_url))
            return parse_youtube_url(new_url)

    logger.debug("Not a valid Youtube URL: {}".format(url))
    return url