From 0b48f8c21f90f26e78f06c15e144d85683981b85 Mon Sep 17 00:00:00 2001 From: John Foerch Date: Mon, 5 Sep 2011 13:15:58 -0400 Subject: [PATCH] youtube.js: rewrite scraper for changes to youtube youtube_scrape_function, make_call_each, make_call_each_until_success, youtube_scrape_standard_flv, youtube_scrape_hq_mp4, youtube_scrape_720p_mp4, youtube_scrape_1080p_mp4: removed The fancy scraping function surrounding the user variable youtube_scrape_function has been removed, because it really doesn't accommodate the way youtube scraping needs to work now. --- modules/page-modes/youtube.js | 343 ++++++++++++++++++------------------------ 1 file changed, 145 insertions(+), 198 deletions(-) rewrite modules/page-modes/youtube.js (67%) diff --git a/modules/page-modes/youtube.js b/modules/page-modes/youtube.js dissimilarity index 67% index c52b7fc..67d1081 100644 --- a/modules/page-modes/youtube.js +++ b/modules/page-modes/youtube.js @@ -1,198 +1,145 @@ -/** - * (C) Copyright 2008 Jeremy Maitin-Shepard - * (C) Copyright 2009-2010 John J. Foerch - * - * Use, modification, and distribution are subject to the terms specified in the - * COPYING file. -**/ - -in_module(null); - -require("content-buffer.js"); -require("media.js"); - -let media_youtube_content_key_regexp = /&t=([^&]+)/; -let media_youtube_content_title_regexp = //; - - -/* - * Youtube Format Scrapers - * - * Given a push function, the id of the video, the `t' key, and the - * text of the document (to scrape further data if necessary), the format - * scrapers' job is to call the push function with four args: url, file - * extension, mime type, and description. - * - * Scrapers should return true on success and false on failure. - */ -function youtube_scrape_standard_flv (push, id, t, text) { - push('http://youtube.com/get_video?video_id='+id+'&t='+t+'&asv=3', - 'flv', 'video/x-flv', 'standard flv'); - return true; -} - -function youtube_scrape_hq_mp4 (push, id, t, text) { - if (/"fmt_map": ""/.test(text)) - return false; - push('http://youtube.com/get_video?video_id='+id+'&t='+t+'&fmt=18'+'&asv=3', - 'mp4', 'video/mp4', 'hq mp4'); - return true; -} - -function youtube_scrape_720p_mp4 (push, id, t, text) { - if (!(/'IS_HD_AVAILABLE': true/.test(text))) - return false; - push('http://youtube.com/get_video?video_id='+id+'&t='+t+'&fmt=22'+'&asv=3', - 'mp4', 'video/mp4', '720p mp4'); - return true; -} - -function youtube_scrape_1080p_mp4 (push, id, t, text) { - if (!(/"fmt_map": "37/.test(text))) - return false; - push('http://youtube.com/get_video?video_id='+id+'&t='+t+'&fmt=37'+'&asv=3', - 'mp4', 'video/mp4', '1080p mp4'); - return true; -} - - -/* - * Scraper Composition - */ - -/** - * make_call_each takes any number of functions as arguments and returns a - * closure which, when called, calls each of those functions in order, and - * finally returns true if any of the functions returned true. - */ -function make_call_each () { - var fns = Array.prototype.slice.call(arguments, 0); - return function () { - var args = Array.prototype.slice.call(arguments, 0); - var found = false; - fns.map(function (fn) { - if (fn.apply(this, args)) - found = true; - }); - return found; - }; -} - -/** - * make_call_each_until_success takes any number of functions as arguments - * and returns a closure which, when called, calls each of those functions - * in order until one returns true. - */ -function make_call_each_until_success () { - var fns = Array.prototype.slice.call(arguments, 0); - return function () { - var args = Array.prototype.slice.call(arguments, 0); - for each (var fn in fns) { - if (fn.apply(this, args)) - return true; - } - return false; - }; -} - -define_variable('youtube_scrape_function', - make_call_each(youtube_scrape_1080p_mp4, - youtube_scrape_720p_mp4, - youtube_scrape_hq_mp4, - youtube_scrape_standard_flv), - "This function is called as the last step of scraping a youtube page, "+ - "after the basic information needed to build the media url has been "+ - "extracted. Youtube_scape_function is called with four arguments: a "+ - "`push' function, the id, the t key, and the text of the page. Its "+ - "job is to call the push function for each media url desired with "+ - "the arguments url, file extension, mime type, and description. It "+ - "should return true if it called the push function at least once, and "+ - "otherwise false."); - - -function youtube_scrape_text (results, frame, id, text) { - var title_match = media_youtube_content_title_regexp.exec(text); - if (!title_match) - return null; - var title = decodeURIComponent(title_match[1]); - var res = media_youtube_content_key_regexp.exec(text); - if (!res) - return null; - function push (url, extension, mime_type, description) { - results.push(load_spec({ - uri: url, - title: title, - filename_extension: extension, - source_frame: frame, - mime_type: mime_type, - description: description - })); - } - youtube_scrape_function(push, id, res[1], text); -} - -function media_scrape_youtube (buffer, results) { - try { - var uri = buffer.current_uri.spec; - var result = media_youtube_uri_test_regexp.exec(uri); - if (!result) - return; - let text = buffer.document.documentElement.innerHTML; - let id = result[1]; - youtube_scrape_text(results, buffer.top_frame, id, text); - } catch (e if !(e instanceof interactive_error)) {} -} - -define_page_mode("youtube_mode", - $display_name = "YouTube", - $enable = function (buffer) { - media_setup_local_object_classes(buffer); - }); - -function media_scrape_embedded_youtube (buffer, results) { - const embedded_youtube_regexp = /^http:\/\/[a-zA-Z0-9\-.]+\.youtube\.com\/v\/(.*)$/; - for (let frame in frame_iterator(buffer.top_frame, buffer.focused_frame)) { - // Look for embedded YouTube videos - let obj_els = frame.document.getElementsByTagName("object"); - for (let i = 0; i < obj_els.length; ++i) { - let obj_el = obj_els[i]; - let param_els = obj_el.getElementsByTagName("param"); - inner: - for (let j = 0; j < param_els.length; ++j) { - let param_el = param_els[j]; - let match; - if (param_el.getAttribute("name").toLowerCase() == "movie" && - (match = embedded_youtube_regexp.exec(param_el.getAttribute("value"))) != null) { - try { - let id = match[1]; - let lspec = load_spec({uri: "http://youtube.com/watch?v=" + id}); - var result = - yield buffer.window.minibuffer.wait_for( - "Requesting " + lspec.uri + "...", - send_http_request(lspec)); - let text = result.responseText; - if (text != null && text.length > 0) - youtube_scrape_text(results, frame, id, text); - } catch (e if (e instanceof abort)) { - // Still allow other media scrapers to try even if the user aborted an http request, - // but don't continue looking for embedded youtube videos. - return; - } catch (e) { - // Some other error here means there was some problem with the request. - // We'll just ignore it. - } - break inner; - } - } - } - } -} - - -let media_youtube_uri_test_regexp = build_url_regex($domain = /(?:[a-z]+\.)?youtube/, - $path = /watch\?v=([A-Za-z0-9\-_]+)/); -media_scrapers.unshift([/.*/, media_scrape_embedded_youtube]); -media_scrapers.unshift([media_youtube_uri_test_regexp, media_scrape_youtube]); -auto_mode_list.push([media_youtube_uri_test_regexp, youtube_mode]); - -provide("youtube"); +/** + * (C) Copyright 2008 Jeremy Maitin-Shepard + * (C) Copyright 2009-2011 John J. Foerch + * + * Use, modification, and distribution are subject to the terms specified in the + * COPYING file. +**/ + +in_module(null); + +require("content-buffer.js"); +require("media.js"); + +var youtube_t_regexp = /"t": "([^"]+)"/; +var youtube_title_regexp = //; + +function regexp_exec (regexp, string, group) { + var res = regexp.exec(string); + if (! res) + return null; + return res[group]; +} + +function youtube_parse_video_info (info) { + var sp = info.split("&"); + var res = {}; + for each (var kv in sp) { + let [k, v] = kv.split("="); + res[k] = decodeURIComponent(v); + } + if (! res.url_encoded_fmt_stream_map) { + dumpln(dump_obj(res)); + return []; + } + var url_encoded_fmt_stream_map = + res.url_encoded_fmt_stream_map.split(","); + var data = []; + for each (var chunk in url_encoded_fmt_stream_map) { + var url = "", itag = ""; + var d = {}; + for each (kv in chunk.split("&")) { + let [k, v] = kv.split("="); + d[k] = decodeURIComponent(v); + } + data.push(d); + } + return data; +} + +function youtube_get_video_info (url, id, t) { + for each (el in ["profilepage", "detailpage"]) { + var video_info_url = + "http://www.youtube.com/get_video_info?&video_id="+ + encodeURIComponent(id)+"&el="+el+"&ps=default&eurl="+ + encodeURIComponent(url)+"&hl=en_US&t="+encodeURIComponent(t); + var res = yield send_http_request({uri: video_info_url}); + if (res) { + var info = youtube_parse_video_info(res.responseText); + yield co_return(info); + } + } +} + +function youtube_scrape_text (results, frame, url, id, text) { + var title = decodeURIComponent(regexp_exec(youtube_title_regexp, text, 1) + || "video"+Date.now()); + var t = regexp_exec(youtube_t_regexp, text, 1); + if (! t) + yield co_return(); + var info = yield youtube_get_video_info(url, id, t); + for each (var d in info) { + var extension = mime_service.getPrimaryExtension(d.type, null); + results.push(load_spec({ + uri: d.url, + title: title, + filename_extension: extension, + source_frame: frame, + mime_type: d.type, + description: d.quality + " " + extension + })); + } +} + +function youtube_scrape_buffer (buffer, results) { + var url = buffer.current_uri.spec; + var id = regexp_exec(youtube_uri_regexp, url, 1); + if (! id) + yield co_return(); + var text = buffer.document.documentElement.innerHTML; + yield youtube_scrape_text(results, buffer.top_frame, url, id, text); +} + +define_page_mode("youtube_mode", + $display_name = "YouTube", + $enable = function (buffer) { + media_setup_local_object_classes(buffer); + }); + +function youtube_scrape_embedded (buffer, results) { + const embedded_youtube_regexp = /^http:\/\/[a-zA-Z0-9\-.]+\.youtube\.com\/v\/([^?]*).*$/; + for (let frame in frame_iterator(buffer.top_frame, buffer.focused_frame)) { + // Look for embedded YouTube videos + let obj_els = frame.document.getElementsByTagName("object"); + for (let i = 0; i < obj_els.length; ++i) { + let obj_el = obj_els[i]; + let param_els = obj_el.getElementsByTagName("param"); + inner: + for (let j = 0; j < param_els.length; ++j) { + let param_el = param_els[j]; + let match; + if (param_el.getAttribute("name").toLowerCase() == "movie" && + (match = embedded_youtube_regexp.exec(param_el.getAttribute("value"))) != null) { + try { + let id = match[1]; + let lspec = load_spec({uri: "http://youtube.com/watch?v=" + id}); + var result = + yield buffer.window.minibuffer.wait_for( + "Requesting " + lspec.uri + "...", + send_http_request(lspec)); + let text = result.responseText; + if (text != null && text.length > 0) + yield youtube_scrape_text(results, frame, lspec.uri, id, text); + } catch (e if (e instanceof abort)) { + // Still allow other media scrapers to try even if the user aborted an http request, + // but don't continue looking for embedded youtube videos. + yield co_return(); + } catch (e) { + // Some other error here means there was some problem with the request. + // We'll just ignore it. + } + break inner; + } + } + } + } +} + + +let youtube_uri_regexp = build_url_regex($domain = /(?:[a-z]+\.)?youtube/, + $path = /watch\?v=([A-Za-z0-9\-_]+)/); +media_scrapers.unshift([/.*/, youtube_scrape_embedded]); +media_scrapers.unshift([youtube_uri_regexp, youtube_scrape_buffer]); +auto_mode_list.push([youtube_uri_regexp, youtube_mode]); + +provide("youtube"); -- 2.11.4.GIT