From 2e8259b044fda2a6424b71eb8368cafa2fa6d86e Mon Sep 17 00:00:00 2001 From: Lars Magne Ingebrigtsen Date: Mon, 3 Nov 2014 01:01:20 +0100 Subject: [PATCH] Add a new, somewhat experimental "readability" command to eww * net/eww.el (eww-readable): New command and keystroke. * net/shr.el (shr-retransform-dom): New function. --- etc/NEWS | 6 ++++++ lisp/ChangeLog | 4 ++++ lisp/net/eww.el | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ lisp/net/shr.el | 20 ++++++++++++++++++++ 4 files changed, 86 insertions(+) diff --git a/etc/NEWS b/etc/NEWS index a07cb5f2949..d88e8b3f335 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -133,6 +133,12 @@ result of the calculation into the current buffer. *** New minor mode global-eldoc-mode *** eldoc-documentation-function now defaults to nil +** eww + +*** A new command `R' (`eww-readable') will try do identify the main +textual parts of a web page and display only that, leaving menus and +the like off the page. + ** Message mode *** text/html messages that contain inline image parts will be diff --git a/lisp/ChangeLog b/lisp/ChangeLog index eb374375198..b6e32f285ce 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog @@ -1,5 +1,9 @@ 2014-11-02 Lars Magne Ingebrigtsen + * net/eww.el (eww-readable): New command and keystroke. + + * net/shr.el (shr-retransform-dom): New function. + * net/eww.el (eww-display-html): Set `eww-current-source' in the correct buffer. (eww-view-source): Use it. diff --git a/lisp/net/eww.el b/lisp/net/eww.el index e4acd69ef4d..579f0878bbd 100644 --- a/lisp/net/eww.el +++ b/lisp/net/eww.el @@ -402,6 +402,7 @@ word(s) will be searched for via `eww-search-prefix'." (setq-local eww-contents-url nil)) (defun eww-view-source () + "View the HTML source code of the current page." (interactive) (let ((buf (get-buffer-create "*eww-source*")) (source eww-current-source)) @@ -413,6 +414,60 @@ word(s) will be searched for via `eww-search-prefix'." (html-mode))) (view-buffer buf))) +(defun eww-readable () + "View the main \"readable\" parts of the current web page. +This command uses heuristics to find the parts of the web page that +contains the main textual portion, leaving out navigation menus and +the like." + (interactive) + (let* ((source eww-current-source) + (dom (shr-transform-dom + (with-temp-buffer + (insert source) + (libxml-parse-html-region (point-min) (point-max)))))) + (eww-score-readability dom) + (eww-display-html 'utf-8 nil (shr-retransform-dom + (eww-highest-readability dom))) + (setq eww-current-source source))) + +(defun eww-score-readability (node) + (let ((score -1)) + (cond + ((memq (car node) '(script head)) + (setq score -2)) + ((eq (car node) 'meta) + (setq score -1)) + ((eq (car node) 'a) + (setq score (- (length (split-string + (or (cdr (assoc 'text (cdr node))) "")))))) + (t + (dolist (elem (cdr node)) + (cond + ((eq (car elem) 'text) + (setq score (+ score (length (split-string (cdr elem)))))) + ((consp (cdr elem)) + (setq score (+ score + (or (cdr (assoc :eww-readability-score (cdr elem))) + (eww-score-readability elem))))))))) + ;; Cache the score of the node to avoid recomputing all the time. + (setcdr node (cons (cons :eww-readability-score score) (cdr node))) + score)) + +(defun eww-highest-readability (node) + (let ((result node) + highest) + (dolist (elem (cdr node)) + (when (and (consp (cdr elem)) + (> (or (cdr (assoc + :eww-readability-score + (setq highest + (eww-highest-readability elem)))) + most-negative-fixnum) + (or (cdr (assoc :eww-readability-score (cdr result))) + most-negative-fixnum))) + (setq result highest))) + result)) + (defvar eww-mode-map (let ((map (make-sparse-keymap))) (suppress-keymap map) @@ -435,6 +490,7 @@ word(s) will be searched for via `eww-search-prefix'." (define-key map "w" 'eww-copy-page-url) (define-key map "C" 'url-cookie-list) (define-key map "v" 'eww-view-source) + (define-key map "R" 'eww-readable) (define-key map "H" 'eww-list-histories) (define-key map "b" 'eww-add-bookmark) diff --git a/lisp/net/shr.el b/lisp/net/shr.el index 878728c9319..59326de64dd 100644 --- a/lisp/net/shr.el +++ b/lisp/net/shr.el @@ -370,6 +370,26 @@ size, and full-buffer size." (push (shr-transform-dom sub) result))) (nreverse result))) +(defun shr-retransform-dom (dom) + "Transform the shr DOM back into the libxml DOM." + (let ((tag (car dom)) + (attributes nil) + (text nil) + (sub-nodes nil)) + (dolist (elem (cdr dom)) + (cond + ((eq (car elem) 'text) + (setq text (cdr elem))) + ((not (consp (cdr elem))) + (push (cons (intern (substring (symbol-name (car elem)) 1) obarray) + (cdr elem)) + attributes)) + (t + (push (shr-retransform-dom elem) sub-nodes)))) + (append (list tag (nreverse attributes)) + (nreverse sub-nodes) + (and text (list text))))) + (defsubst shr-generic (cont) (dolist (sub cont) (cond -- 2.11.4.GIT