src/gpodder/youtube.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # gPodder - A media aggregator and podcast client
   4 # Copyright (c) 2005-2018 The gPodder Team
   5 #
   6 # gPodder is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 3 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # gPodder is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 #
  19 #  gpodder.youtube - YouTube and related magic
  20 #  Justin Forest <justin.forest@gmail.com> 2008-10-13
  21 #
  22
  23 import io
  24 import json
  25 import logging
  26 import re
  27 import urllib
  28 import xml.etree.ElementTree
  29 from functools import lru_cache
  30 from html.parser import HTMLParser
  31 from urllib.parse import parse_qs
  32
  33 import gpodder
  34 from gpodder import registry, util
  35
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Translation helper used for user-visible strings in this module.
_ = gpodder.gettext
  40
  41
  42 # http://en.wikipedia.org/wiki/YouTube#Quality_and_formats
  43 # https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py#L447
  44
  45 # adaptive audio formats
  46 #   140  MP4   128k
  47 #   251  WebM  160k
  48 #   250  WebM  70k
  49 #   249  WebM  50k
  50
# formats and fallbacks of same quality: WebM -> MP4 -> FLV
#
# Every name below is a preference list of format ids for one container and
# resolution. Integer entries are single format ids; string entries such as
# '136+140' join a video id with one of the adaptive audio ids listed above
# (get_real_download_url() in this module skips the non-integer entries).
#
# NOTE: "a = [...] + b" copies the contents b has at that moment, while
# "a += b" extends the existing list in place, so the order of the
# statements below is significant.
flv_240 = [5]
flv_270 = [6]
flv_360 = [34]
flv_480 = [35]
mp4_144 = ['160+140']
mp4_240 = ['133+140'] + flv_240
mp4_360 = [18, '134+140'] + flv_360
mp4_480 = ['135+140'] + flv_480
mp4_720 = [22, '136+140']
mp4_1080 = [37, '137+140']
mp4_1440 = ['264+140']
mp4_2160 = ['266+140']
mp4_3072 = [38]
mp4_4320 = ['138+140']
webm_144 = ['278+250'] + mp4_144
webm_240 = ['242+250'] + mp4_240
webm_360 = [43, '243+251'] + mp4_360
webm_480 = [44, '244+251'] + mp4_480
webm_720 = [45, '247+251'] + mp4_720
webm_1080 = [46, '248+251'] + mp4_1080
webm_1440 = ['271+251'] + mp4_1440
webm_2160 = ['313+251'] + mp4_2160
webm_4320 = ['272+251'] + mp4_4320
# fallbacks to lower quality
# Each list is extended with the next lower resolution of the same family,
# which at that point already carries its own fallbacks.
webm_240 += webm_144
webm_360 += flv_270 + webm_240
webm_480 += webm_360
webm_720 += webm_480
webm_1080 += webm_720
webm_1440 += webm_1080
webm_2160 += webm_1440
# mp4_3072 is still just [38] here; its own fallbacks are appended further down.
webm_4320 += mp4_3072 + webm_2160
mp4_240 += mp4_144
mp4_360 += flv_270 + mp4_240
mp4_480 += mp4_360
mp4_720 += mp4_480
mp4_1080 += mp4_720
mp4_1440 += mp4_1080
mp4_2160 += mp4_1440
mp4_3072 += mp4_2160
mp4_4320 += mp4_3072
# The FLV lists are extended last, after the WebM/MP4 lists above have
# already copied the FLV entries they need.
flv_270 += flv_240
flv_360 += flv_270
flv_480 += flv_360
# format id, (preferred ids, path(?), description) # video bitrate, audio bitrate
#
# Ordered list of the selectable formats. "preferred ids" is one of the
# fallback lists defined above; the trailing comment on each row gives the
# video and audio bitrate.
# NOTE(review): the meaning of the second tuple element ("path") is not
# visible in this module -- confirm before relying on it.
formats = [
    # WebM VP8, VP9 or VP9 HFR video, Vorbis or Opus audio
    # Fallback to MP4 or FLV
    (272, (webm_4320, '272/7680x4320/99/0/0', 'WebM 4320p 8K (7680x4320) youtube-dl')),  # N/A,      160 kbps
    (313, (webm_2160, '313/3840x2160/99/0/0', 'WebM 2160p 4K (3840x2160) youtube-dl')),  # N/A,      160 kbps
    (271, (webm_1440, '271/2560x1440/99/0/0', 'WebM 1440p (2560x1440) youtube-dl')),     # N/A,      160 kbps
    (46, (webm_1080, '46/1920x1080/99/0/0', 'WebM 1080p (1920x1080) youtube-dl')),       # N/A,      192 kbps
    (45, (webm_720, '45/1280x720/99/0/0', 'WebM 720p (1280x720) youtube-dl')),           # 2.0 Mbps, 192 kbps
    (44, (webm_480, '44/854x480/99/0/0', 'WebM 480p (854x480) youtube-dl')),             # 1.0 Mbps, 128 kbps
    (43, (webm_360, '43/640x360/99/0/0', 'WebM 360p (640x360)')),                        # 0.5 Mbps, 128 kbps
    (242, (webm_240, '242/426x240/99/0/0', 'WebM 240p (426x240) youtube-dl')),           # N/A,       70 kbps
    (278, (webm_144, '278/256x144/99/0/0', 'WebM 144p (256x144) youtube-dl')),           # N/A,       70 kbps

    # MP4 H.264 video, AAC audio
    # Fallback to FLV
    (138, (mp4_4320, '138/7680x4320/9/0/115', 'MP4 4320p 8K (7680x4320) youtube-dl')),  # N/A,       128 kbps
    (38, (mp4_3072, '38/4096x3072/9/0/115', 'MP4 3072p 4K (4096x3072)')),               # 5.0 - 3.5 Mbps, 192 kbps
    (266, (mp4_2160, '266/3840x2160/9/0/115', 'MP4 2160p 4K (3840x2160) youtube-dl')),  # N/A,       128 kbps
    (264, (mp4_1440, '264/2560x1440/9/0/115', 'MP4 1440p (2560x1440) youtube-dl')),     # N/A,       128 kbps
    (37, (mp4_1080, '37/1920x1080/9/0/115', 'MP4 1080p (1920x1080) youtube-dl')),       # 4.3 - 3.0 Mbps, 192 kbps
    (22, (mp4_720, '22/1280x720/9/0/115', 'MP4 720p (1280x720)')),                      # 2.9 - 2.0 Mbps, 192 kbps
    (135, (mp4_480, '135/854x480/9/0/115', 'MP4 480p (854x480) youtube-dl')),           # N/A,       128 kbps
    (18, (mp4_360, '18/640x360/9/0/115', 'MP4 360p (640x360)')),                        # 0.5 Mbps,   96 kbps
    (133, (mp4_240, '133/426x240/9/0/115', 'MP4 240p (426x240) youtube-dl')),           # N/A,       128 kbps
    (160, (mp4_144, '160/256x144/9/0/115', 'MP4 144p (256x144) youtube-dl')),           # N/A,       128 kbps

    # FLV H.264 video, AAC audio
    # Fallback to FLV 6 or 5
    (35, (flv_480, '35/854x480/9/0/115', 'FLV 480p (854x480)')),  # 1 - 0.80 Mbps, 128 kbps
    (34, (flv_360, '34/640x360/9/0/115', 'FLV 360p (640x360)')),  # 0.50 Mbps, 128 kbps

    # FLV Sorenson H.263 video, MP3 audio
    (6, (flv_270, '6/480x270/7/0/0', 'FLV 270p (480x270)')),  # 0.80 Mbps,  64 kbps
    (5, (flv_240, '5/320x240/7/0/0', 'FLV 240p (320x240)')),  # 0.25 Mbps,  64 kbps
]
# Lookup table: format id -> (preferred ids, path, description).
formats_dict = dict(formats)
 133
 134 # streaming formats and fallbacks to lower quality
 135 hls_144 = [91]
 136 hls_240 = [92] + hls_144
 137 hls_360 = [93] + hls_240
 138 hls_480 = [94] + hls_360
 139 hls_720 = [95] + hls_480
 140 hls_1080 = [96] + hls_720
 141 hls_formats = [
 142     (96, (hls_1080, '9/1920x1080/9/0/115', 'MP4 1080p (1920x1080)')),   # N/A,       256 kbps
 143     (95, (hls_720, '9/1280x720/9/0/115', 'MP4 720p (1280x720)')),       # N/A,       256 kbps
 144     (94, (hls_480, '9/854x480/9/0/115', 'MP4 480p (854x480)')),         # N/A,       128 kbps
 145     (93, (hls_360, '9/640x360/9/0/115', 'MP4 360p (640x360)')),         # N/A,       128 kbps
 146     (92, (hls_240, '9/426x240/9/0/115', 'MP4 240p (426x240)')),         # N/A,        48 kbps
 147     (91, (hls_144, '9/256x144/9/0/115', 'MP4 144p (256x144)')),         # N/A,        48 kbps
 148 ]
 149 hls_formats_dict = dict(hls_formats)
 150
# Base URL of the YouTube Atom feeds.
CHANNEL_VIDEOS_XML = 'https://www.youtube.com/feeds/videos.xml'
# Watch page URL; the video ID is appended directly after "v=".
# NOTE(review): bpctr/has_verified presumably skip the content-warning
# interstitial -- confirm.
WATCH_ENDPOINT = 'https://www.youtube.com/watch?bpctr=9999999999&has_verified=1&v='

# The page may contain "};" sequences inside the initial player response.
# Use a greedy match with script end tag, and fallback to a non-greedy match without.
# In both patterns group 1 captures the JSON object text.
INITIAL_PLAYER_RESPONSE_RE1 = r'ytInitialPlayerResponse\s*=\s*({.+})\s*;\s*</script'
INITIAL_PLAYER_RESPONSE_RE2 = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
 158
 159
 160 def get_ipr(page):
 161     for regex in (INITIAL_PLAYER_RESPONSE_RE1, INITIAL_PLAYER_RESPONSE_RE2):
 162         ipr = re.search(regex, page)
 163         if ipr is not None:
 164             return ipr
 165     return None
 166
 167
 168 class YouTubeError(Exception):
 169     pass
 170
 171
 172 def get_fmt_ids(youtube_config, allow_partial):
 173     if allow_partial:
 174         if youtube_config.preferred_hls_fmt_id == 0:
 175             hls_fmt_ids = (youtube_config.preferred_hls_fmt_ids if youtube_config.preferred_hls_fmt_ids else [])
 176         else:
 177             fmt = hls_formats_dict.get(youtube_config.preferred_hls_fmt_id)
 178             if fmt is None:
 179                 hls_fmt_ids = []
 180             else:
 181                 hls_fmt_ids, path, description = fmt
 182     else:
 183         hls_fmt_ids = []
 184
 185     if youtube_config.preferred_fmt_id == 0:
 186         return (youtube_config.preferred_fmt_ids + hls_fmt_ids if youtube_config.preferred_fmt_ids else hls_fmt_ids)
 187
 188     fmt = formats_dict.get(youtube_config.preferred_fmt_id)
 189     if fmt is None:
 190         return hls_fmt_ids
 191     fmt_ids, path, description = fmt
 192     return fmt_ids + hls_fmt_ids
 193
 194
 195 @registry.download_url.register
 196 def youtube_real_download_url(config, episode, allow_partial):
 197     fmt_ids = get_fmt_ids(config.youtube, allow_partial) if config else None
 198     res, duration = get_real_download_url(episode.url, allow_partial, fmt_ids)
 199     if duration is not None:
 200         episode.total_time = int(int(duration) / 1000)
 201     return None if res == episode.url else res
 202
 203
 204 def youtube_get_old_endpoint(vid):
 205     # TODO: changing 'detailpage' to 'embedded' allows age-restricted content
 206     url = 'https://www.youtube.com/get_video_info?html5=1&c=TVHTML5&cver=6.20180913&el=detailpage&video_id=' + vid
 207     r = util.urlopen(url)
 208     if not r.ok:
 209         raise YouTubeError('Youtube "%s": %d %s' % (url, r.status_code, r.reason))
 210     else:
 211         return r.text, None
 212
 213
 214 def youtube_get_new_endpoint(vid):
 215     url = WATCH_ENDPOINT + vid
 216     r = util.urlopen(url)
 217     if not r.ok:
 218         raise YouTubeError('Youtube "%s": %d %s' % (url, r.status_code, r.reason))
 219
 220     ipr = get_ipr(r.text)
 221     if ipr is None:
 222         try:
 223             url = get_gdpr_consent_url(r.text)
 224         except YouTubeError as e:
 225             raise YouTubeError('Youtube "%s": No ytInitialPlayerResponse found and %s' % (url, str(e)))
 226         r = util.urlopen(url)
 227         if not r.ok:
 228             raise YouTubeError('Youtube "%s": %d %s' % (url, r.status_code, r.reason))
 229
 230         ipr = get_ipr(r.text)
 231         if ipr is None:
 232             raise YouTubeError('Youtube "%s": No ytInitialPlayerResponse found' % url)
 233
 234     return None, ipr.group(1)
 235
 236
 237 def get_total_time(episode):
 238     try:
 239         vid = get_youtube_id(episode.url)
 240         if vid is None:
 241             return 0
 242
 243         url = WATCH_ENDPOINT + vid
 244         r = util.urlopen(url)
 245         if not r.ok:
 246             return 0
 247
 248         ipr = get_ipr(r.text)
 249         if ipr is None:
 250             url = get_gdpr_consent_url(r.text)
 251             r = util.urlopen(url)
 252             if not r.ok:
 253                 return 0
 254
 255             ipr = get_ipr(r.text)
 256             if ipr is None:
 257                 return 0
 258
 259         player_response = json.loads(ipr.group(1))
 260         return int(player_response['videoDetails']['lengthSeconds'])  # 0 if live
 261     except:
 262         return 0
 263
 264
 265 def get_real_download_url(url, allow_partial, preferred_fmt_ids=None):
 266     if not preferred_fmt_ids:
 267         preferred_fmt_ids, _, _ = formats_dict[22]  # MP4 720p
 268
 269     duration = None
 270
 271     vid = get_youtube_id(url)
 272     if vid is not None:
 273         try:
 274             old_page, new_page = youtube_get_new_endpoint(vid)
 275         except YouTubeError as e:
 276             logger.info(str(e))
 277             old_page, new_page = youtube_get_old_endpoint(vid)
 278
 279         def find_urls(old_page, new_page):
 280             # streamingData is preferable to url_encoded_fmt_stream_map
 281             # streamingData.formats are the same as url_encoded_fmt_stream_map
 282             # streamingData.adaptiveFormats are audio-only and video-only formats
 283
 284             x = parse_qs(old_page) if old_page else json.loads(new_page)
 285             player_response = json.loads(x['player_response'][0]) if old_page and 'player_response' in x else x
 286             error_message = None
 287
 288             if 'reason' in x:
 289                 # TODO: unknown if this is valid for new_page
 290                 error_message = util.remove_html_tags(x['reason'][0])
 291             elif 'playabilityStatus' in player_response:
 292                 playabilityStatus = player_response['playabilityStatus']
 293
 294                 if 'reason' in playabilityStatus:
 295                     error_message = util.remove_html_tags(playabilityStatus['reason'])
 296                 elif 'liveStreamability' in playabilityStatus \
 297                         and not playabilityStatus['liveStreamability'].get('liveStreamabilityRenderer', {}).get('displayEndscreen', False):
 298                     # playabilityStatus.liveStreamability -- video is or was a live stream
 299                     # playabilityStatus.liveStreamability.liveStreamabilityRenderer.displayEndscreen -- video has ended if present
 300
 301                     if allow_partial and 'streamingData' in player_response and 'hlsManifestUrl' in player_response['streamingData']:
 302                         r = util.urlopen(player_response['streamingData']['hlsManifestUrl'])
 303                         if not r.ok:
 304                             raise YouTubeError('HLS Manifest: %d %s' % (r.status_code, r.reason))
 305                         manifest = r.text.splitlines()
 306
 307                         urls = [line for line in manifest if line[0] != '#']
 308                         itag_re = re.compile(r'/itag/([0-9]+)/')
 309                         for url in urls:
 310                             itag = itag_re.search(url).group(1)
 311                             yield int(itag), [url, None]
 312                         return
 313
 314                     error_message = 'live stream'
 315                 elif 'streamingData' in player_response:
 316                     if 'formats' in player_response['streamingData']:
 317                         for f in player_response['streamingData']['formats']:
 318                             if 'url' in f:  # DRM videos store url inside a signatureCipher key
 319                                 yield int(f['itag']), [f['url'], f.get('approxDurationMs')]
 320                     if 'adaptiveFormats' in player_response['streamingData']:
 321                         for f in player_response['streamingData']['adaptiveFormats']:
 322                             if 'url' in f:  # DRM videos store url inside a signatureCipher key
 323                                 yield int(f['itag']), [f['url'], f.get('approxDurationMs')]
 324                     return
 325
 326             if error_message is not None:
 327                 raise YouTubeError(('Cannot stream video: %s' if allow_partial else 'Cannot download video: %s') % error_message)
 328
 329             if old_page:
 330                 r4 = re.search(r'url_encoded_fmt_stream_map=([^&]+)', old_page)
 331                 if r4 is not None:
 332                     fmt_url_map = urllib.parse.unquote(r4.group(1))
 333                     for fmt_url_encoded in fmt_url_map.split(','):
 334                         video_info = parse_qs(fmt_url_encoded)
 335                         yield int(video_info['itag'][0]), [video_info['url'][0], None]
 336
 337         fmt_id_url_map = sorted(find_urls(old_page, new_page), reverse=True)
 338
 339         if not fmt_id_url_map:
 340             drm = re.search(r'(%22(cipher|signatureCipher)%22%3A|"signatureCipher":)', old_page or new_page)
 341             if drm is not None:
 342                 raise YouTubeError('Unsupported DRM content')
 343             raise YouTubeError('No formats found')
 344
 345         formats_available = {fmt_id for fmt_id, url in fmt_id_url_map}
 346         fmt_id_url_map = dict(fmt_id_url_map)
 347
 348         for fmt_id in preferred_fmt_ids:
 349             if not re.search(r'^[0-9]+$', str(fmt_id)):
 350                 # skip non-integer formats 'best', '136+140' or twitch '720p'
 351                 continue
 352             fmt_id = int(fmt_id)
 353             if fmt_id in formats_available:
 354                 fmt = formats_dict.get(fmt_id) or hls_formats_dict.get(fmt_id)
 355                 if fmt is not None:
 356                     _, _, description = fmt
 357                 else:
 358                     description = 'Unknown'
 359
 360                 logger.info('Found YouTube format: %s (fmt_id=%d)',
 361                         description, fmt_id)
 362                 url, duration = fmt_id_url_map[fmt_id]
 363                 break
 364         else:
 365             raise YouTubeError('No preferred formats found')
 366
 367     return url, duration
 368
 369
 370 @lru_cache(1)
 371 def get_youtube_id(url):
 372     r = re.compile(r'http[s]?://(?:[a-z]+\.)?youtube\.com/watch\?v=([^&]*)', re.IGNORECASE).match(url)
 373     if r is not None:
 374         return r.group(1)
 375
 376     r = re.compile(r'http[s]?://(?:[a-z]+\.)?youtube\.com/v/(.*)[?]', re.IGNORECASE).match(url)
 377     if r is not None:
 378         return r.group(1)
 379
 380     r = re.compile(r'http[s]?://(?:[a-z]+\.)?youtube\.com/v/(.*)\.swf', re.IGNORECASE).match(url)
 381     if r is not None:
 382         return r.group(1)
 383
 384     return for_each_feed_pattern(lambda url, channel: channel, url, None)
 385
 386
 387 def is_video_link(url):
 388     return (get_youtube_id(url) is not None)
 389
 390
 391 def is_youtube_guid(guid):
 392     return guid.startswith('tag:youtube.com,2008:video:')
 393
 394
 395 def for_each_feed_pattern(func, url, fallback_result):
 396     """
 397     Try to find the username for all possible YouTube feed/webpage URLs
 398     Will call func(url, channel) for each match, and if func() returns
 399     a result other than None, returns this. If no match is found or
 400     func() returns None, return fallback_result.
 401     """
 402     CHANNEL_MATCH_PATTERNS = [
 403         r'http[s]?://(?:[a-z]+\.)?youtube\.com/user/([a-z0-9]+)',
 404         r'http[s]?://(?:[a-z]+\.)?youtube\.com/profile?user=([a-z0-9]+)',
 405         r'http[s]?://(?:[a-z]+\.)?youtube\.com/rss/user/([a-z0-9]+)/videos\.rss',
 406         r'http[s]?://(?:[a-z]+\.)?youtube\.com/channel/([-_a-z0-9]+)',
 407         r'http[s]?://(?:[a-z]+\.)?youtube\.com/feeds/videos.xml\?user=([a-z0-9]+)',
 408         r'http[s]?://(?:[a-z]+\.)?youtube\.com/feeds/videos.xml\?channel_id=([-_a-z0-9]+)',
 409         r'http[s]?://gdata.youtube.com/feeds/users/([^/]+)/uploads',
 410         r'http[s]?://gdata.youtube.com/feeds/base/users/([^/]+)/uploads',
 411     ]
 412
 413     for pattern in CHANNEL_MATCH_PATTERNS:
 414         m = re.match(pattern, url, re.IGNORECASE)
 415         if m is not None:
 416             result = func(url, m.group(1))
 417             if result is not None:
 418                 return result
 419
 420     return fallback_result
 421
 422
 423 def get_real_channel_url(url):
 424     def return_user_feed(url, channel):
 425         result = 'https://gdata.youtube.com/feeds/users/{0}/uploads'.format(channel)
 426         logger.debug('YouTube link resolved: %s => %s', url, result)
 427         return result
 428
 429     return for_each_feed_pattern(return_user_feed, url, url)
 430
 431
 432 @lru_cache(1)
 433 def get_channel_id_url(url, feed_data=None):
 434     if 'youtube.com' in url:
 435         # URL may contain channel ID, avoid a network request
 436         m = re.search(r'channel_id=([^"]+)', url)
 437         if m:
 438             # old versions of gpodder allowed newlines and whitespace in feed URLs, strip here to avoid a 404
 439             channel_id = m.group(1).strip()
 440             channel_url = 'https://www.youtube.com/channel/{}'.format(channel_id)
 441             return channel_url
 442
 443         try:
 444             if feed_data is None:
 445                 r = util.urlopen(url, cookies={'SOCS': 'CAI'})
 446                 if not r.ok:
 447                     raise YouTubeError('Youtube "%s": %d %s' % (url, r.status_code, r.reason))
 448             else:
 449                 r = feed_data
 450             # video page may contain corrupt HTML/XML, search for tag to avoid exception
 451             m = re.search(r'(channel_id=([^"]+)">|"channelId":"([^"]+)")', r.text)
 452             if m:
 453                 channel_id = m.group(2) or m.group(3)
 454             else:
 455                 raw_xml_data = io.BytesIO(r.content)
 456                 xml_data = xml.etree.ElementTree.parse(raw_xml_data)
 457                 channel_id = xml_data.find("{http://www.youtube.com/xml/schemas/2015}channelId").text
 458                 if channel_id is None:
 459                     # check entries if feed has an empty channelId
 460                     m = re.search(r'<yt:channelId>([^<]+)</yt:channelId>', r.text)
 461                     if m:
 462                         channel_id = m.group(1)
 463                     if channel_id is None:
 464                         raise Exception('Could not retrieve YouTube channel ID for URL %s.' % url)
 465
 466                 # feeds no longer contain the required "UC" prefix on channel ID
 467                 if len(channel_id) == 22:
 468                     channel_id = "UC" + channel_id
 469             channel_url = 'https://www.youtube.com/channel/{}'.format(channel_id)
 470             return channel_url
 471
 472         except Exception:
 473             logger.warning('Could not retrieve YouTube channel ID for URL %s.' % url, exc_info=True)
 474
 475     raise Exception('Could not retrieve YouTube channel ID for URL %s.' % url)
 476
 477
 478 def get_cover(url, feed_data=None):
 479     if 'youtube.com' in url:
 480
 481         class YouTubeHTMLCoverParser(HTMLParser):
 482             """This custom html parser searches for the youtube channel thumbnail/avatar"""
 483             def __init__(self):
 484                 super().__init__()
 485                 self.url = []
 486
 487             def handle_starttag(self, tag, attributes):
 488                 attribute_dict = {attribute[0]: attribute[1] for attribute in attributes}
 489
 490                 # Look for 900x900px image first.
 491                 if tag == 'link' \
 492                         and 'rel' in attribute_dict \
 493                         and attribute_dict['rel'] == 'image_src':
 494                     self.url.append(attribute_dict['href'])
 495
 496                 # Fallback to image that may only be 100x100px.
 497                 elif tag == 'img' \
 498                         and 'class' in attribute_dict \
 499                         and attribute_dict['class'] == "channel-header-profile-image":
 500                     self.url.append(attribute_dict['src'])
 501
 502         try:
 503             channel_url = get_channel_id_url(url, feed_data)
 504             r = util.urlopen(channel_url)
 505             if not r.ok:
 506                 raise YouTubeError('Youtube "%s": %d %s' % (url, r.status_code, r.reason))
 507             html_data = util.response_text(r)
 508             parser = YouTubeHTMLCoverParser()
 509             parser.feed(html_data)
 510             if parser.url:
 511                 logger.debug('Youtube cover art for {} is: {}'.format(url, parser.url))
 512                 return parser.url[0]
 513
 514         except Exception:
 515             logger.warning('Could not retrieve cover art', exc_info=True)
 516
 517
 518 def get_gdpr_consent_url(html_data):
 519     """
 520     Creates the URL for automatically accepting GDPR consents
 521     EU GDPR redirects to a form that needs to be posted to be redirected to a get request
 522     with the form data as input to the youtube video URL. This extracts that form data from
 523     the GDPR form and builds up the URL the posted form results.
 524     """
 525     class ConsentHTML(HTMLParser):
 526         def __init__(self):
 527             super().__init__()
 528             self.url = ''
 529             self.consentForm = False
 530
 531         def handle_starttag(self, tag, attributes):
 532             attribute_dict = {attribute[0]: attribute[1] for attribute in attributes}
 533             if tag == 'form' and attribute_dict['action'] == 'https://consent.youtube.com/s':
 534                 self.consentForm = True
 535                 self.url = 'https://consent.google.com/s?'
 536             # Get GDPR form elements
 537             if self.consentForm and tag == 'input' and attribute_dict['type'] == 'hidden':
 538                 self.url += '&' + attribute_dict['name'] + '=' + urllib.parse.quote_plus(attribute_dict['value'])
 539
 540         def handle_endtag(self, tag):
 541             if tag == 'form':
 542                 self.consentForm = False
 543
 544     try:
 545         parser = ConsentHTML()
 546         parser.feed(html_data)
 547     except Exception:
 548         raise YouTubeError('Could not retrieve GDPR accepted consent URL')
 549
 550     if parser.url:
 551         logger.debug('YouTube GDPR accept consent URL is: %s', parser.url)
 552         return parser.url
 553     else:
 554         logger.debug('YouTube GDPR accepted consent URL could not be resolved.')
 555         raise YouTubeError('No acceptable GDPR consent URL')
 556
 557
 558 def get_channel_desc(url, feed_data=None):
 559     if 'youtube.com' in url:
 560
 561         class YouTubeHTMLDesc(HTMLParser):
 562             """This custom html parser searches for the YouTube channel description."""
 563             def __init__(self):
 564                 super().__init__()
 565                 self.description = ''
 566
 567             def handle_starttag(self, tag, attributes):
 568                 attribute_dict = {attribute[0]: attribute[1] for attribute in attributes}
 569
 570                 # Get YouTube channel description.
 571                 if tag == 'meta' \
 572                         and 'name' in attribute_dict \
 573                         and attribute_dict['name'] == "description":
 574                     self.description = attribute_dict['content']
 575
 576         try:
 577             channel_url = get_channel_id_url(url, feed_data)
 578             r = util.urlopen(channel_url)
 579             if not r.ok:
 580                 raise YouTubeError('Youtube "%s": %d %s' % (url, r.status_code, r.reason))
 581             html_data = util.response_text(r)
 582             parser = YouTubeHTMLDesc()
 583             parser.feed(html_data)
 584             if parser.description:
 585                 logger.debug('YouTube description for %s is: %s', url, parser.description)
 586                 return parser.description
 587             else:
 588                 logger.debug('YouTube description for %s is not provided.', url)
 589                 return _('No description available')
 590
 591         except Exception:
 592             logger.warning('Could not retrieve YouTube channel description for %s.' % url, exc_info=True)
 593
 594
 595 def parse_youtube_url(url):
 596     """
 597     Youtube Channel Links are parsed into youtube feed links
 598     >>> parse_youtube_url("https://www.youtube.com/channel/CHANNEL_ID")
 599     'https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID'
 600
 601     Youtube User Links are parsed into youtube feed links
 602     >>> parse_youtube_url("https://www.youtube.com/user/USERNAME")
 603     'https://www.youtube.com/feeds/videos.xml?user=USERNAME'
 604
 605     Youtube Playlist Links are parsed into youtube feed links
 606     >>> parse_youtube_url("https://www.youtube.com/playlist?list=PLAYLIST_ID")
 607     'https://www.youtube.com/feeds/videos.xml?playlist_id=PLAYLIST_ID'
 608
 609     >>> parse_youtube_url(None)
 610     None
 611
 612     @param url: the path to the channel, user or playlist
 613     @return: the feed url if successful or the given url if not
 614     """
 615     if url is None:
 616         return url
 617     scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
 618     logger.debug("Analyzing URL: {}".format(" ".join([scheme, netloc, path, query, fragment])))
 619
 620     if 'youtube.com' in netloc:
 621         if path == '/feeds/videos.xml' and re.search(r'^(user|channel|playlist)_id=.*', query):
 622             return url
 623
 624         if '/user/' in path or '/channel/' in path or 'list=' in query:
 625             logger.debug("Valid Youtube URL detected. Parsing...")
 626
 627             if path.startswith('/user/'):
 628                 user_id = path.split('/')[2]
 629                 query = 'user={user_id}'.format(user_id=user_id)
 630
 631             if path.startswith('/channel/'):
 632                 channel_id = path.split('/')[2]
 633                 query = 'channel_id={channel_id}'.format(channel_id=channel_id)
 634
 635             if 'list=' in query:
 636                 playlist_query = [query_value for query_value in query.split("&") if 'list=' in query_value][0]
 637                 playlist_id = playlist_query[5:]
 638                 query = 'playlist_id={playlist_id}'.format(playlist_id=playlist_id)
 639
 640             path = '/feeds/videos.xml'
 641
 642             new_url = urllib.parse.urlunsplit((scheme, netloc, path, query, fragment))
 643             logger.debug("New Youtube URL: {}".format(new_url))
 644             return new_url
 645
 646         # look for channel URL in page
 647         logger.debug("Unknown Youtube URL, trying to extract channel ID...")
 648         new_url = get_channel_id_url(url)
 649         if new_url:
 650             logger.debug("New Youtube URL: {}".format(new_url))
 651             return parse_youtube_url(new_url)
 652
 653     logger.debug("Not a valid Youtube URL: {}".format(url))
 654     return url